From 3947bbea9da6937f6ee5b2639fc5bc05b679369c Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 9 Dec 2024 19:31:02 +0200 Subject: [PATCH 1/5] copy builder to new folder called builder2 --- arrow-array/src/builder2/boolean_builder.rs | 349 +++++++ arrow-array/src/builder2/buffer_builder.rs | 225 +++++ .../src/builder2/fixed_size_binary_builder.rs | 255 +++++ .../src/builder2/fixed_size_list_builder.rs | 492 ++++++++++ .../src/builder2/generic_byte_run_builder.rs | 517 +++++++++++ .../src/builder2/generic_bytes_builder.rs | 596 ++++++++++++ .../generic_bytes_dictionary_builder.rs | 667 +++++++++++++ .../builder2/generic_bytes_view_builder.rs | 733 +++++++++++++++ .../src/builder2/generic_list_builder.rs | 877 ++++++++++++++++++ arrow-array/src/builder2/map_builder.rs | 380 ++++++++ arrow-array/src/builder2/mod.rs | 325 +++++++ arrow-array/src/builder2/null_builder.rs | 182 ++++ arrow-array/src/builder2/primitive_builder.rs | 618 ++++++++++++ .../builder2/primitive_dictionary_builder.rs | 446 +++++++++ .../src/builder2/primitive_run_builder.rs | 313 +++++++ arrow-array/src/builder2/struct_builder.rs | 872 +++++++++++++++++ arrow-array/src/builder2/union_builder.rs | 313 +++++++ arrow-array/src/lib.rs | 1 + 18 files changed, 8161 insertions(+) create mode 100644 arrow-array/src/builder2/boolean_builder.rs create mode 100644 arrow-array/src/builder2/buffer_builder.rs create mode 100644 arrow-array/src/builder2/fixed_size_binary_builder.rs create mode 100644 arrow-array/src/builder2/fixed_size_list_builder.rs create mode 100644 arrow-array/src/builder2/generic_byte_run_builder.rs create mode 100644 arrow-array/src/builder2/generic_bytes_builder.rs create mode 100644 arrow-array/src/builder2/generic_bytes_dictionary_builder.rs create mode 100644 arrow-array/src/builder2/generic_bytes_view_builder.rs create mode 100644 arrow-array/src/builder2/generic_list_builder.rs create mode 100644 
arrow-array/src/builder2/map_builder.rs create mode 100644 arrow-array/src/builder2/mod.rs create mode 100644 arrow-array/src/builder2/null_builder.rs create mode 100644 arrow-array/src/builder2/primitive_builder.rs create mode 100644 arrow-array/src/builder2/primitive_dictionary_builder.rs create mode 100644 arrow-array/src/builder2/primitive_run_builder.rs create mode 100644 arrow-array/src/builder2/struct_builder.rs create mode 100644 arrow-array/src/builder2/union_builder.rs diff --git a/arrow-array/src/builder2/boolean_builder.rs b/arrow-array/src/builder2/boolean_builder.rs new file mode 100644 index 000000000000..cdc00d03f26a --- /dev/null +++ b/arrow-array/src/builder2/boolean_builder.rs @@ -0,0 +1,349 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::builder2::{ArrayBuilder, BooleanBufferBuilder}; +use crate::{ArrayRef, BooleanArray}; +use arrow_buffer::Buffer; +use arrow_buffer::NullBufferBuilder; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; +use std::any::Any; +use std::sync::Arc; + +/// Builder for [`BooleanArray`] +/// +/// # Example +/// +/// Create a `BooleanArray` from a `BooleanBuilder` +/// +/// ``` +/// +/// # use arrow_array::{Array, BooleanArray, builder::BooleanBuilder}; +/// +/// let mut b = BooleanBuilder::new(); +/// b.append_value(true); +/// b.append_null(); +/// b.append_value(false); +/// b.append_value(true); +/// let arr = b.finish(); +/// +/// assert_eq!(4, arr.len()); +/// assert_eq!(1, arr.null_count()); +/// assert_eq!(true, arr.value(0)); +/// assert!(arr.is_valid(0)); +/// assert!(!arr.is_null(0)); +/// assert!(!arr.is_valid(1)); +/// assert!(arr.is_null(1)); +/// assert_eq!(false, arr.value(2)); +/// assert!(arr.is_valid(2)); +/// assert!(!arr.is_null(2)); +/// assert_eq!(true, arr.value(3)); +/// assert!(arr.is_valid(3)); +/// assert!(!arr.is_null(3)); +/// ``` +#[derive(Debug)] +pub struct BooleanBuilder { + values_builder: BooleanBufferBuilder, + null_buffer_builder: NullBufferBuilder, +} + +impl Default for BooleanBuilder { + fn default() -> Self { + Self::new() + } +} + +impl BooleanBuilder { + /// Creates a new boolean builder + pub fn new() -> Self { + Self::with_capacity(1024) + } + + /// Creates a new boolean builder with space for `capacity` elements without re-allocating + pub fn with_capacity(capacity: usize) -> Self { + Self { + values_builder: BooleanBufferBuilder::new(capacity), + null_buffer_builder: NullBufferBuilder::new(capacity), + } + } + + /// Returns the capacity of this builder measured in slots of type `T` + pub fn capacity(&self) -> usize { + self.values_builder.capacity() + } + + /// Appends a value of type `T` into the builder + #[inline] + pub fn append_value(&mut self, v: bool) { + self.values_builder.append(v); + 
self.null_buffer_builder.append_non_null(); + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_null(&mut self) { + self.null_buffer_builder.append_null(); + self.values_builder.advance(1); + } + + /// Appends `n` `null`s into the builder. + #[inline] + pub fn append_nulls(&mut self, n: usize) { + self.null_buffer_builder.append_n_nulls(n); + self.values_builder.advance(n); + } + + /// Appends an `Option` into the builder + #[inline] + pub fn append_option(&mut self, v: Option) { + match v { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + + /// Appends a slice of type `T` into the builder + #[inline] + pub fn append_slice(&mut self, v: &[bool]) { + self.values_builder.append_slice(v); + self.null_buffer_builder.append_n_non_nulls(v.len()); + } + + /// Appends n `additional` bits of value `v` into the buffer + #[inline] + pub fn append_n(&mut self, additional: usize, v: bool) { + self.values_builder.append_n(additional, v); + self.null_buffer_builder.append_n_non_nulls(additional); + } + + /// Appends values from a slice of type `T` and a validity boolean slice. + /// + /// Returns an error if the slices are of different lengths + #[inline] + pub fn append_values(&mut self, values: &[bool], is_valid: &[bool]) -> Result<(), ArrowError> { + if values.len() != is_valid.len() { + Err(ArrowError::InvalidArgumentError( + "Value and validity lengths must be equal".to_string(), + )) + } else { + self.null_buffer_builder.append_slice(is_valid); + self.values_builder.append_slice(values); + Ok(()) + } + } + + /// Builds the [BooleanArray] and reset this builder. 
+ pub fn finish(&mut self) -> BooleanArray { + let len = self.len(); + let null_bit_buffer = self.null_buffer_builder.finish(); + let builder = ArrayData::builder(DataType::Boolean) + .len(len) + .add_buffer(self.values_builder.finish().into_inner()) + .nulls(null_bit_buffer); + + let array_data = unsafe { builder.build_unchecked() }; + BooleanArray::from(array_data) + } + + /// Builds the [BooleanArray] without resetting the builder. + pub fn finish_cloned(&self) -> BooleanArray { + let len = self.len(); + let nulls = self.null_buffer_builder.finish_cloned(); + let value_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); + let builder = ArrayData::builder(DataType::Boolean) + .len(len) + .add_buffer(value_buffer) + .nulls(nulls); + + let array_data = unsafe { builder.build_unchecked() }; + BooleanArray::from(array_data) + } + + /// Returns the current values buffer as a slice + /// + /// Boolean values are bit-packed into bytes. To extract the i-th boolean + /// from the bytes, you can use `arrow_buffer::bit_util::get_bit()`. + pub fn values_slice(&self) -> &[u8] { + self.values_builder.as_slice() + } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } +} + +impl ArrayBuilder for BooleanBuilder { + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.values_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. 
+ fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl Extend> for BooleanBuilder { + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + self.append_option(v) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Array; + + #[test] + fn test_boolean_array_builder() { + // 00000010 01001000 + let buf = Buffer::from([72_u8, 2_u8]); + let mut builder = BooleanArray::builder(10); + for i in 0..10 { + if i == 3 || i == 6 || i == 9 { + builder.append_value(true); + } else { + builder.append_value(false); + } + } + + let arr = builder.finish(); + assert_eq!(&buf, arr.values().inner()); + assert_eq!(10, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..10 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {i}") + } + } + + #[test] + fn test_boolean_array_builder_append_slice() { + let arr1 = BooleanArray::from(vec![Some(true), Some(false), None, None, Some(false)]); + + let mut builder = BooleanArray::builder(0); + builder.append_slice(&[true, false]); + builder.append_null(); + builder.append_null(); + builder.append_value(false); + let arr2 = builder.finish(); + + assert_eq!(arr1, arr2); + } + + #[test] + fn test_boolean_array_builder_append_slice_large() { + let arr1 = BooleanArray::from(vec![true; 513]); + + let mut builder = BooleanArray::builder(512); + builder.append_slice(&[true; 513]); + let arr2 = builder.finish(); + + assert_eq!(arr1, arr2); + } + + #[test] + fn test_boolean_array_builder_no_null() { + let mut builder = BooleanArray::builder(0); + builder.append_option(Some(true)); + builder.append_value(false); + builder.append_slice(&[true, false, true]); + builder + .append_values(&[false, false, true], &[true, true, true]) + .unwrap(); + + let array = builder.finish(); + assert_eq!(0, array.null_count()); + assert!(array.nulls().is_none()); + } + + #[test] + fn 
test_boolean_array_builder_finish_cloned() { + let mut builder = BooleanArray::builder(16); + builder.append_option(Some(true)); + builder.append_value(false); + builder.append_slice(&[true, false, true]); + let mut array = builder.finish_cloned(); + assert_eq!(3, array.true_count()); + assert_eq!(2, array.false_count()); + + builder + .append_values(&[false, false, true], &[true, true, true]) + .unwrap(); + + array = builder.finish(); + assert_eq!(4, array.true_count()); + assert_eq!(4, array.false_count()); + + assert_eq!(0, array.null_count()); + assert!(array.nulls().is_none()); + } + + #[test] + fn test_extend() { + let mut builder = BooleanBuilder::new(); + builder.extend([false, false, true, false, false].into_iter().map(Some)); + builder.extend([true, true, false].into_iter().map(Some)); + let array = builder.finish(); + let values = array.iter().map(|x| x.unwrap()).collect::>(); + assert_eq!( + &values, + &[false, false, true, false, false, true, true, false] + ) + } + + #[test] + fn test_boolean_array_builder_append_n() { + let mut builder = BooleanBuilder::new(); + builder.append_n(3, true); + builder.append_n(2, false); + let array = builder.finish(); + assert_eq!(3, array.true_count()); + assert_eq!(2, array.false_count()); + assert_eq!(0, array.null_count()); + + let values = array.iter().map(|x| x.unwrap()).collect::>(); + assert_eq!(&values, &[true, true, true, false, false]) + } +} diff --git a/arrow-array/src/builder2/buffer_builder.rs b/arrow-array/src/builder2/buffer_builder.rs new file mode 100644 index 000000000000..ab67669febb8 --- /dev/null +++ b/arrow-array/src/builder2/buffer_builder.rs @@ -0,0 +1,225 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub use arrow_buffer::BufferBuilder; +use half::f16; + +use crate::types::*; + +/// Buffer builder for signed 8-bit integer type. +pub type Int8BufferBuilder = BufferBuilder; +/// Buffer builder for signed 16-bit integer type. +pub type Int16BufferBuilder = BufferBuilder; +/// Buffer builder for signed 32-bit integer type. +pub type Int32BufferBuilder = BufferBuilder; +/// Buffer builder for signed 64-bit integer type. +pub type Int64BufferBuilder = BufferBuilder; +/// Buffer builder for unsigned 8-bit integer type. +pub type UInt8BufferBuilder = BufferBuilder; +/// Buffer builder for unsigned 16-bit integer type. +pub type UInt16BufferBuilder = BufferBuilder; +/// Buffer builder for unsigned 32-bit integer type. +pub type UInt32BufferBuilder = BufferBuilder; +/// Buffer builder for unsigned 64-bit integer type. +pub type UInt64BufferBuilder = BufferBuilder; +/// Buffer builder for 16-bit floating point type. +pub type Float16BufferBuilder = BufferBuilder; +/// Buffer builder for 32-bit floating point type. +pub type Float32BufferBuilder = BufferBuilder; +/// Buffer builder for 64-bit floating point type. +pub type Float64BufferBuilder = BufferBuilder; + +/// Buffer builder for 128-bit decimal type. +pub type Decimal128BufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for 256-bit decimal type.
+pub type Decimal256BufferBuilder = BufferBuilder<::Native>; + +/// Buffer builder for timestamp type of second unit. +pub type TimestampSecondBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for timestamp type of millisecond unit. +pub type TimestampMillisecondBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for timestamp type of microsecond unit. +pub type TimestampMicrosecondBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for timestamp type of nanosecond unit. +pub type TimestampNanosecondBufferBuilder = + BufferBuilder<::Native>; + +/// Buffer builder for 32-bit date type. +pub type Date32BufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for 64-bit date type. +pub type Date64BufferBuilder = BufferBuilder<::Native>; + +/// Buffer builder for 32-bit elapsed time since midnight of second unit. +pub type Time32SecondBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for 32-bit elapsed time since midnight of millisecond unit. +pub type Time32MillisecondBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for 64-bit elapsed time since midnight of microsecond unit. +pub type Time64MicrosecondBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for 64-bit elapsed time since midnight of nanosecond unit. +pub type Time64NanosecondBufferBuilder = + BufferBuilder<::Native>; + +/// Buffer builder for “calendar” interval in months. +pub type IntervalYearMonthBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for “calendar” interval in days and milliseconds. +pub type IntervalDayTimeBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for “calendar” interval in months, days, and nanoseconds. +pub type IntervalMonthDayNanoBufferBuilder = + BufferBuilder<::Native>; + +/// Buffer builder for elapsed time of second unit. +pub type DurationSecondBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for elapsed time of milliseconds unit.
+pub type DurationMillisecondBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for elapsed time of microseconds unit. +pub type DurationMicrosecondBufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for elapsed time of nanoseconds unit. +pub type DurationNanosecondBufferBuilder = + BufferBuilder<::Native>; + +#[cfg(test)] +mod tests { + use crate::builder::{ArrayBuilder, Int32BufferBuilder, Int8Builder, UInt8BufferBuilder}; + use crate::Array; + + #[test] + fn test_builder_i32_empty() { + let mut b = Int32BufferBuilder::new(5); + assert_eq!(0, b.len()); + assert_eq!(16, b.capacity()); + let a = b.finish(); + assert_eq!(0, a.len()); + } + + #[test] + fn test_builder_i32_alloc_zero_bytes() { + let mut b = Int32BufferBuilder::new(0); + b.append(123); + let a = b.finish(); + assert_eq!(4, a.len()); + } + + #[test] + fn test_builder_i32() { + let mut b = Int32BufferBuilder::new(5); + for i in 0..5 { + b.append(i); + } + assert_eq!(16, b.capacity()); + let a = b.finish(); + assert_eq!(20, a.len()); + } + + #[test] + fn test_builder_i32_grow_buffer() { + let mut b = Int32BufferBuilder::new(2); + assert_eq!(16, b.capacity()); + for i in 0..20 { + b.append(i); + } + assert_eq!(32, b.capacity()); + let a = b.finish(); + assert_eq!(80, a.len()); + } + + #[test] + fn test_builder_finish() { + let mut b = Int32BufferBuilder::new(5); + assert_eq!(16, b.capacity()); + for i in 0..10 { + b.append(i); + } + let mut a = b.finish(); + assert_eq!(40, a.len()); + assert_eq!(0, b.len()); + assert_eq!(0, b.capacity()); + + // Try build another buffer after cleaning up.
+ for i in 0..20 { + b.append(i) + } + assert_eq!(32, b.capacity()); + a = b.finish(); + assert_eq!(80, a.len()); + } + + #[test] + fn test_reserve() { + let mut b = UInt8BufferBuilder::new(2); + assert_eq!(64, b.capacity()); + b.reserve(64); + assert_eq!(64, b.capacity()); + b.reserve(65); + assert_eq!(128, b.capacity()); + + let mut b = Int32BufferBuilder::new(2); + assert_eq!(16, b.capacity()); + b.reserve(16); + assert_eq!(16, b.capacity()); + b.reserve(17); + assert_eq!(32, b.capacity()); + } + + #[test] + fn test_append_slice() { + let mut b = UInt8BufferBuilder::new(0); + b.append_slice(b"Hello, "); + b.append_slice(b"World!"); + let buffer = b.finish(); + assert_eq!(13, buffer.len()); + + let mut b = Int32BufferBuilder::new(0); + b.append_slice(&[32, 54]); + let buffer = b.finish(); + assert_eq!(8, buffer.len()); + } + + #[test] + fn test_append_values() { + let mut a = Int8Builder::new(); + a.append_value(1); + a.append_null(); + a.append_value(-2); + assert_eq!(a.len(), 3); + + // append values + let values = &[1, 2, 3, 4]; + let is_valid = &[true, true, false, true]; + a.append_values(values, is_valid); + + assert_eq!(a.len(), 7); + let array = a.finish(); + assert_eq!(array.value(0), 1); + assert!(array.is_null(1)); + assert_eq!(array.value(2), -2); + assert_eq!(array.value(3), 1); + assert_eq!(array.value(4), 2); + assert!(array.is_null(5)); + assert_eq!(array.value(6), 4); + } +} diff --git a/arrow-array/src/builder2/fixed_size_binary_builder.rs b/arrow-array/src/builder2/fixed_size_binary_builder.rs new file mode 100644 index 000000000000..ae466bd6b4c8 --- /dev/null +++ b/arrow-array/src/builder2/fixed_size_binary_builder.rs @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::builder2::{ArrayBuilder, UInt8BufferBuilder}; +use crate::{ArrayRef, FixedSizeBinaryArray}; +use arrow_buffer::Buffer; +use arrow_buffer::NullBufferBuilder; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; +use std::any::Any; +use std::sync::Arc; + +/// Builder for [`FixedSizeBinaryArray`] +/// ``` +/// # use arrow_array::builder::FixedSizeBinaryBuilder; +/// # use arrow_array::Array; +/// # +/// let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); +/// // [b"hello", null, b"arrow"] +/// builder.append_value(b"hello").unwrap(); +/// builder.append_null(); +/// builder.append_value(b"arrow").unwrap(); +/// +/// let array = builder.finish(); +/// assert_eq!(array.value(0), b"hello"); +/// assert!(array.is_null(1)); +/// assert_eq!(array.value(2), b"arrow"); +/// ``` +#[derive(Debug)] +pub struct FixedSizeBinaryBuilder { + values_builder: UInt8BufferBuilder, + null_buffer_builder: NullBufferBuilder, + value_length: i32, +} + +impl FixedSizeBinaryBuilder { + /// Creates a new [`FixedSizeBinaryBuilder`] + pub fn new(byte_width: i32) -> Self { + Self::with_capacity(1024, byte_width) + } + + /// Creates a new [`FixedSizeBinaryBuilder`], `capacity` is the number of byte slices + /// that can be appended without reallocating + pub fn with_capacity(capacity: usize, byte_width: i32) -> Self { + assert!( + byte_width >= 0, + "value length 
({byte_width}) of the array must >= 0" + ); + Self { + values_builder: UInt8BufferBuilder::new(capacity * byte_width as usize), + null_buffer_builder: NullBufferBuilder::new(capacity), + value_length: byte_width, + } + } + + /// Appends a byte slice into the builder. + /// + /// Automatically update the null buffer to delimit the slice appended in as a + /// distinct value element. + #[inline] + pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<(), ArrowError> { + if self.value_length != value.as_ref().len() as i32 { + Err(ArrowError::InvalidArgumentError( + "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths" + .to_string(), + )) + } else { + self.values_builder.append_slice(value.as_ref()); + self.null_buffer_builder.append_non_null(); + Ok(()) + } + } + + /// Append a null value to the array. + #[inline] + pub fn append_null(&mut self) { + self.values_builder + .append_slice(&vec![0u8; self.value_length as usize][..]); + self.null_buffer_builder.append_null(); + } + + /// Builds the [`FixedSizeBinaryArray`] and reset this builder. + pub fn finish(&mut self) -> FixedSizeBinaryArray { + let array_length = self.len(); + let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) + .add_buffer(self.values_builder.finish()) + .nulls(self.null_buffer_builder.finish()) + .len(array_length); + let array_data = unsafe { array_data_builder.build_unchecked() }; + FixedSizeBinaryArray::from(array_data) + } + + /// Builds the [`FixedSizeBinaryArray`] without resetting the builder. 
+ pub fn finish_cloned(&self) -> FixedSizeBinaryArray { + let array_length = self.len(); + let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); + let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) + .add_buffer(values_buffer) + .nulls(self.null_buffer_builder.finish_cloned()) + .len(array_length); + let array_data = unsafe { array_data_builder.build_unchecked() }; + FixedSizeBinaryArray::from(array_data) + } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } +} + +impl ArrayBuilder for FixedSizeBinaryBuilder { + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. 
+ fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::Array; + + #[test] + fn test_fixed_size_binary_builder() { + let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); + + // [b"hello", null, "arrow"] + builder.append_value(b"hello").unwrap(); + builder.append_null(); + builder.append_value(b"arrow").unwrap(); + let array: FixedSizeBinaryArray = builder.finish(); + + assert_eq!(&DataType::FixedSizeBinary(5), array.data_type()); + assert_eq!(3, array.len()); + assert_eq!(1, array.null_count()); + assert_eq!(10, array.value_offset(2)); + assert_eq!(5, array.value_length()); + } + + #[test] + fn test_fixed_size_binary_builder_finish_cloned() { + let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); + + // [b"hello", null, "arrow"] + builder.append_value(b"hello").unwrap(); + builder.append_null(); + builder.append_value(b"arrow").unwrap(); + let mut array: FixedSizeBinaryArray = builder.finish_cloned(); + + assert_eq!(&DataType::FixedSizeBinary(5), array.data_type()); + assert_eq!(3, array.len()); + assert_eq!(1, array.null_count()); + assert_eq!(10, array.value_offset(2)); + assert_eq!(5, array.value_length()); + + // [b"finis", null, "clone"] + builder.append_value(b"finis").unwrap(); + builder.append_null(); + builder.append_value(b"clone").unwrap(); + + array = builder.finish(); + + assert_eq!(&DataType::FixedSizeBinary(5), array.data_type()); + assert_eq!(6, array.len()); + assert_eq!(2, array.null_count()); + assert_eq!(25, array.value_offset(5)); + assert_eq!(5, array.value_length()); + } + + #[test] + fn test_fixed_size_binary_builder_with_zero_value_length() { + let mut builder = FixedSizeBinaryBuilder::new(0); + + builder.append_value(b"").unwrap(); + builder.append_null(); + builder.append_value(b"").unwrap(); + assert!(!builder.is_empty()); + + let array: FixedSizeBinaryArray = builder.finish(); + assert_eq!(&DataType::FixedSizeBinary(0), 
array.data_type()); + assert_eq!(3, array.len()); + assert_eq!(1, array.null_count()); + assert_eq!(0, array.value_offset(2)); + assert_eq!(0, array.value_length()); + assert_eq!(b"", array.value(0)); + assert_eq!(b"", array.value(2)); + } + + #[test] + #[should_panic( + expected = "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths" + )] + fn test_fixed_size_binary_builder_with_inconsistent_value_length() { + let mut builder = FixedSizeBinaryBuilder::with_capacity(1, 4); + builder.append_value(b"hello").unwrap(); + } + #[test] + fn test_fixed_size_binary_builder_empty() { + let mut builder = FixedSizeBinaryBuilder::new(5); + assert!(builder.is_empty()); + + let fixed_size_binary_array = builder.finish(); + assert_eq!( + &DataType::FixedSizeBinary(5), + fixed_size_binary_array.data_type() + ); + assert_eq!(0, fixed_size_binary_array.len()); + } + + #[test] + #[should_panic(expected = "value length (-1) of the array must >= 0")] + fn test_fixed_size_binary_builder_invalid_value_length() { + let _ = FixedSizeBinaryBuilder::with_capacity(15, -1); + } +} diff --git a/arrow-array/src/builder2/fixed_size_list_builder.rs b/arrow-array/src/builder2/fixed_size_list_builder.rs new file mode 100644 index 000000000000..c5c7c6449592 --- /dev/null +++ b/arrow-array/src/builder2/fixed_size_list_builder.rs @@ -0,0 +1,492 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::builder2::ArrayBuilder;
+use crate::{ArrayRef, FixedSizeListArray};
+use arrow_buffer::NullBufferBuilder;
+use arrow_schema::{Field, FieldRef};
+use std::any::Any;
+use std::sync::Arc;
+
+/// Builder for [`FixedSizeListArray`]
+///
+/// Child values are pushed through [`Self::values`]. After every
+/// `value_length` child values, call [`Self::append`] to close the slot and
+/// record whether it is valid or null. A null slot still occupies
+/// `value_length` entries in the child builder, otherwise [`Self::finish`]
+/// panics on the length check.
+///
+/// # Example
+///
+/// ```
+/// use arrow_array::{builder::{Int32Builder, FixedSizeListBuilder}, Array, Int32Array};
+/// let values_builder = Int32Builder::new();
+/// let mut builder = FixedSizeListBuilder::new(values_builder, 3);
+///
+/// // [[0, 1, 2], null, [3, null, 5], [6, 7, null]]
+/// builder.values().append_value(0);
+/// builder.values().append_value(1);
+/// builder.values().append_value(2);
+/// builder.append(true);
+/// builder.values().append_null();
+/// builder.values().append_null();
+/// builder.values().append_null();
+/// builder.append(false);
+/// builder.values().append_value(3);
+/// builder.values().append_null();
+/// builder.values().append_value(5);
+/// builder.append(true);
+/// builder.values().append_value(6);
+/// builder.values().append_value(7);
+/// builder.values().append_null();
+/// builder.append(true);
+/// let list_array = builder.finish();
+/// assert_eq!(
+///     *list_array.value(0),
+///     Int32Array::from(vec![Some(0), Some(1), Some(2)])
+/// );
+/// assert!(list_array.is_null(1));
+/// assert_eq!(
+///     *list_array.value(2),
+///     Int32Array::from(vec![Some(3), None, Some(5)])
+/// );
+/// assert_eq!(
+///     *list_array.value(3),
+///     Int32Array::from(vec![Some(6), Some(7), None])
+/// )
+/// ```
+#[derive(Debug)]
+pub struct FixedSizeListBuilder<T: ArrayBuilder> {
+    /// Validity of each list slot; one entry per call to [`Self::append`].
+    null_buffer_builder: NullBufferBuilder,
+    /// Builder for the flattened child values.
+    values_builder: T,
+    /// Number of child values in every list slot.
+    list_len: i32,
+    /// Optional override for the child field handed to [`FixedSizeListArray::new`].
+    field: Option<FieldRef>,
+}
+
+impl<T: ArrayBuilder> FixedSizeListBuilder<T> {
+    /// Creates a new [`FixedSizeListBuilder`] from a given values array builder
+    /// `value_length` is the number of values within each array
+    ///
+    /// The slot capacity is derived from the values already held by
+    /// `values_builder` (`len / value_length`); it is `0` when `value_length`
+    /// is `0`.
+    pub fn new(values_builder: T, value_length: i32) -> Self {
+        let capacity = values_builder
+            .len()
+            .checked_div(value_length as _)
+            .unwrap_or_default();
+
+        Self::with_capacity(values_builder, value_length, capacity)
+    }
+
+    /// Creates a new [`FixedSizeListBuilder`] from a given values array builder
+    /// `value_length` is the number of values within each array
+    /// `capacity` is the number of items to pre-allocate space for in this builder
+    pub fn with_capacity(values_builder: T, value_length: i32, capacity: usize) -> Self {
+        Self {
+            null_buffer_builder: NullBufferBuilder::new(capacity),
+            values_builder,
+            list_len: value_length,
+            field: None,
+        }
+    }
+
+    /// Override the field passed to [`FixedSizeListArray::new`]
+    ///
+    /// By default, a nullable field is created with the name `item`
+    ///
+    /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the
+    /// field's data type does not match that of `T`
+    pub fn with_field(self, field: impl Into<FieldRef>) -> Self {
+        Self {
+            field: Some(field.into()),
+            ..self
+        }
+    }
+}
+
+impl<T: ArrayBuilder> ArrayBuilder for FixedSizeListBuilder<T>
+where
+    T: 'static,
+{
+    /// Returns the builder as a non-mutable `Any` reference.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Returns the builder as a mutable `Any` reference.
+    fn as_any_mut(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    /// Returns the boxed builder as a box of `Any`.
+    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
+        self
+    }
+
+    /// Returns the number of array slots in the builder
+    fn len(&self) -> usize {
+        self.null_buffer_builder.len()
+    }
+
+    /// Builds the array and reset this builder.
+    fn finish(&mut self) -> ArrayRef {
+        Arc::new(self.finish())
+    }
+
+    /// Builds the array without resetting the builder.
+    fn finish_cloned(&self) -> ArrayRef {
+        Arc::new(self.finish_cloned())
+    }
+}
+
+impl<T: ArrayBuilder> FixedSizeListBuilder<T>
+where
+    T: 'static,
+{
+    /// Returns the child array builder as a mutable reference.
+    ///
+    /// This mutable reference can be used to append values into the child array builder,
+    /// but you must call [`append`](#method.append) to delimit each distinct list value.
+    pub fn values(&mut self) -> &mut T {
+        &mut self.values_builder
+    }
+
+    /// Returns the length of the list
+    pub fn value_length(&self) -> i32 {
+        self.list_len
+    }
+
+    /// Finish the current fixed-length list array slot
+    ///
+    /// `is_valid` is recorded as the validity of the slot. The child builder
+    /// is not touched, so the caller is responsible for having appended
+    /// `value_length` child values for this slot.
+    #[inline]
+    pub fn append(&mut self, is_valid: bool) {
+        self.null_buffer_builder.append(is_valid);
+    }
+
+    /// Builds the [`FixedSizeListBuilder`] and reset this builder.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the child builder does not hold exactly
+    /// `len * value_length` values.
+    pub fn finish(&mut self) -> FixedSizeListArray {
+        // Read the slot count first: finishing the null buffer resets it.
+        let len = self.len();
+        let values = self.values_builder.finish();
+        let nulls = self.null_buffer_builder.finish();
+
+        self.build_array(len, values, nulls)
+    }
+
+    /// Builds the [`FixedSizeListBuilder`] without resetting the builder.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the child builder does not hold exactly
+    /// `len * value_length` values.
+    pub fn finish_cloned(&self) -> FixedSizeListArray {
+        let len = self.len();
+        let values = self.values_builder.finish_cloned();
+        let nulls = self.null_buffer_builder.finish_cloned();
+
+        self.build_array(len, values, nulls)
+    }
+
+    /// Checks the child length against `len` slots and assembles the array.
+    ///
+    /// Shared by [`Self::finish`] and [`Self::finish_cloned`] so both apply
+    /// the same validation and default field.
+    fn build_array(
+        &self,
+        len: usize,
+        values: ArrayRef,
+        nulls: Option<arrow_buffer::NullBuffer>,
+    ) -> FixedSizeListArray {
+        assert_eq!(
+            values.len(), len * self.list_len as usize,
+            "Length of the child array ({}) must be the multiple of the value length ({}) and the array length ({}).",
+            values.len(),
+            self.list_len,
+            len,
+        );
+
+        // Fall back to a nullable list field typed after the child values.
+        let field = self
+            .field
+            .clone()
+            .unwrap_or_else(|| Arc::new(Field::new_list_field(values.data_type().clone(), true)));
+
+        FixedSizeListArray::new(field, self.list_len, values, nulls)
+    }
+
+    /// Returns the current null buffer as a slice
+    pub fn validity_slice(&self) -> Option<&[u8]> {
+        self.null_buffer_builder.as_slice()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow_schema::DataType;
+
+    use crate::builder::Int32Builder;
+    use crate::Array;
+    use crate::Int32Array;
+
+    /// Builds four slots of length 3. The third slot is null when
+    /// `include_null_element` is set, and the fourth slot carries a null
+    /// child value when `include_null_in_values` is set.
+    fn make_list_builder(
+        include_null_element: bool,
+        include_null_in_values: bool,
+    ) -> FixedSizeListBuilder<Int32Builder> {
+        let values_builder = Int32Builder::new();
+        let mut builder = FixedSizeListBuilder::new(values_builder, 3);
+
+        builder.values().append_value(0);
+        builder.values().append_value(1);
+        builder.values().append_value(2);
+        builder.append(true);
+
+        builder.values().append_value(2);
+        builder.values().append_value(3);
+        builder.values().append_value(4);
+        builder.append(true);
+
+        if include_null_element {
+            builder.values().append_null();
+            builder.values().append_null();
+            builder.values().append_null();
+            builder.append(false);
+        } else {
+            builder.values().append_value(2);
+            builder.values().append_value(3);
+            builder.values().append_value(4);
+            builder.append(true);
+        }
+
+        if include_null_in_values {
+            builder.values().append_value(3);
+            builder.values().append_null();
+            builder.values().append_value(5);
+            builder.append(true);
+        } else {
+            builder.values().append_value(3);
+            builder.values().append_value(4);
+            builder.values().append_value(5);
+            builder.append(true);
+        }
+
+        builder
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_builder() {
+        let mut builder = make_list_builder(true, true);
+
+        let list_array = builder.finish();
+
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(4, list_array.len());
+        assert_eq!(1, list_array.null_count());
+        assert_eq!(6, list_array.value_offset(2));
+        assert_eq!(3, list_array.value_length());
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_builder_with_field() {
+        let builder = make_list_builder(false, false);
+        let mut builder = builder.with_field(Field::new("list_element", DataType::Int32, false));
+        let list_array = builder.finish();
+
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(4, list_array.len());
+        assert_eq!(0, list_array.null_count());
+        assert_eq!(6, list_array.value_offset(2));
+        assert_eq!(3, list_array.value_length());
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_builder_with_field_and_null() {
+        let builder = make_list_builder(true, false);
+        let mut builder = builder.with_field(Field::new("list_element", DataType::Int32, false));
+        let list_array = builder.finish();
+
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(4, list_array.len());
+        assert_eq!(1, list_array.null_count());
+        assert_eq!(6, list_array.value_offset(2));
+        assert_eq!(3, list_array.value_length());
+    }
+
+    #[test]
+    #[should_panic(expected = "Found unmasked nulls for non-nullable FixedSizeListArray")]
+    fn test_fixed_size_list_array_builder_with_field_null_panic() {
+        let builder = make_list_builder(true, true);
+        let mut builder = builder.with_field(Field::new("list_item", DataType::Int32, false));
+
+        builder.finish();
+    }
+
+    #[test]
+    #[should_panic(expected = "FixedSizeListArray expected data type Int64 got Int32")]
+    fn test_fixed_size_list_array_builder_with_field_type_panic() {
+        let values_builder = Int32Builder::new();
+        let builder = FixedSizeListBuilder::new(values_builder, 3);
+        let mut builder = builder.with_field(Field::new("list_item", DataType::Int64, true));
+
+        // [[0, 1, 2], null, [3, 4, 5]]
+        builder.values().append_value(0);
+        builder.values().append_value(1);
+        builder.values().append_value(2);
+        builder.append(true);
+        builder.values().append_null();
+        builder.values().append_null();
+        builder.values().append_null();
+        builder.append(false);
+        builder.values().append_value(3);
+        builder.values().append_value(4);
+        builder.values().append_value(5);
+        builder.append(true);
+
+        builder.finish();
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_builder_cloned_with_field() {
+        let builder = make_list_builder(true, true);
+        let builder = builder.with_field(Field::new("list_element", DataType::Int32, true));
+
+        let list_array = builder.finish_cloned();
+
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(4, list_array.len());
+        assert_eq!(1, list_array.null_count());
+        assert_eq!(6, list_array.value_offset(2));
+        assert_eq!(3, list_array.value_length());
+    }
+
+    #[test]
+    #[should_panic(expected = "Found unmasked nulls for non-nullable FixedSizeListArray")]
+    fn test_fixed_size_list_array_builder_cloned_with_field_null_panic() {
+        let builder = make_list_builder(true, true);
+        let builder = builder.with_field(Field::new("list_item", DataType::Int32, false));
+
+        builder.finish_cloned();
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_builder_cloned_with_field_and_null() {
+        let builder = make_list_builder(true, false);
+        let builder = builder.with_field(Field::new("list_element", DataType::Int32, false));
+        // Exercise the non-resetting path; the `finish()` twin is covered by
+        // `test_fixed_size_list_array_builder_with_field_and_null`.
+        let list_array = builder.finish_cloned();
+
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(4, list_array.len());
+        assert_eq!(1, list_array.null_count());
+        assert_eq!(6, list_array.value_offset(2));
+        assert_eq!(3, list_array.value_length());
+    }
+
+    #[test]
+    #[should_panic(expected = "FixedSizeListArray expected data type Int64 got Int32")]
+    fn test_fixed_size_list_array_builder_cloned_with_field_type_panic() {
+        let builder = make_list_builder(false, false);
+        let builder = builder.with_field(Field::new("list_item", DataType::Int64, true));
+
+        builder.finish_cloned();
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_builder_finish_cloned() {
+        let mut builder = make_list_builder(true, true);
+
+        let mut list_array = builder.finish_cloned();
+
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(4, list_array.len());
+        assert_eq!(1, list_array.null_count());
+        assert_eq!(3, list_array.value_length());
+
+        builder.values().append_value(6);
+        builder.values().append_value(7);
+        builder.values().append_null();
+        builder.append(true);
+        builder.values().append_null();
+        builder.values().append_null();
+        builder.values().append_null();
+        builder.append(false);
+        list_array = builder.finish();
+
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(6, list_array.len());
+        assert_eq!(2, list_array.null_count());
+        assert_eq!(6, list_array.value_offset(2));
+        assert_eq!(3, list_array.value_length());
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_builder_with_field_empty() {
+        let values_builder = Int32Array::builder(0);
+        let mut builder = FixedSizeListBuilder::new(values_builder, 3).with_field(Field::new(
+            "list_item",
+            DataType::Int32,
+            false,
+        ));
+        assert!(builder.is_empty());
+        let arr = builder.finish();
+        assert_eq!(0, arr.len());
+        assert_eq!(0, builder.len());
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_builder_cloned_with_field_empty() {
+        let values_builder = Int32Array::builder(0);
+        let builder = FixedSizeListBuilder::new(values_builder, 3).with_field(Field::new(
+            "list_item",
+            DataType::Int32,
+            false,
+        ));
+        assert!(builder.is_empty());
+        let arr = builder.finish_cloned();
+        assert_eq!(0, arr.len());
+        assert_eq!(0, builder.len());
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_builder_empty() {
+        let values_builder = Int32Array::builder(5);
+        let mut builder = FixedSizeListBuilder::new(values_builder, 3);
+        assert!(builder.is_empty());
+        let arr = builder.finish();
+        assert_eq!(0, arr.len());
+        assert_eq!(0, builder.len());
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_builder_finish() {
+        let values_builder = Int32Array::builder(5);
+        let mut builder = FixedSizeListBuilder::new(values_builder, 3);
+
+        builder.values().append_slice(&[1, 2, 3]);
+        builder.append(true);
+        builder.values().append_slice(&[4, 5, 6]);
+        builder.append(true);
+
+        let mut arr = builder.finish();
+        assert_eq!(2, arr.len());
+        assert_eq!(0, builder.len());
+
+        builder.values().append_slice(&[7, 8, 9]);
+        builder.append(true);
+        arr = builder.finish();
+        assert_eq!(1, arr.len());
+        assert_eq!(0, builder.len());
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Length of the child array (10) must be the multiple of the value length (3) and the array length (3)."
+    )]
+    fn test_fixed_size_list_array_builder_fail() {
+        let values_builder = Int32Array::builder(5);
+        let mut builder = FixedSizeListBuilder::new(values_builder, 3);
+
+        builder.values().append_slice(&[1, 2, 3]);
+        builder.append(true);
+        builder.values().append_slice(&[4, 5, 6]);
+        builder.append(true);
+        builder.values().append_slice(&[7, 8, 9, 10]);
+        builder.append(true);
+
+        builder.finish();
+    }
+}
diff --git a/arrow-array/src/builder2/generic_byte_run_builder.rs b/arrow-array/src/builder2/generic_byte_run_builder.rs
new file mode 100644
index 000000000000..0bf5658b297e
--- /dev/null
+++ b/arrow-array/src/builder2/generic_byte_run_builder.rs
@@ -0,0 +1,517 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::types::bytes::ByteArrayNativeType;
+use std::{any::Any, sync::Arc};
+
+use crate::{
+    types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type},
+    ArrayRef, ArrowPrimitiveType, RunArray,
+};
+
+use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
+
+use arrow_buffer::ArrowNativeType;
+
+/// Builder for [`RunArray`] of [`GenericByteArray`](crate::array::GenericByteArray)
+///
+/// Consecutive equal values (and consecutive nulls) are merged into a single
+/// run. The pending run is only written to the child builders when a
+/// different value arrives or when the array is finished.
+///
+/// # Example:
+///
+/// ```
+///
+/// # use arrow_array::builder::GenericByteRunBuilder;
+/// # use arrow_array::{GenericByteArray, BinaryArray};
+/// # use arrow_array::types::{BinaryType, Int16Type};
+/// # use arrow_array::{Array, Int16Array};
+/// # use arrow_array::cast::AsArray;
+///
+/// let mut builder =
+/// GenericByteRunBuilder::<Int16Type, BinaryType>::new();
+/// builder.extend([Some(b"abc"), Some(b"abc"), None, Some(b"def")].into_iter());
+/// builder.append_value(b"def");
+/// builder.append_null();
+/// let array = builder.finish();
+///
+/// assert_eq!(array.run_ends().values(), &[2, 3, 5, 6]);
+///
+/// let av = array.values();
+///
+/// assert!(!av.is_null(0));
+/// assert!(av.is_null(1));
+/// assert!(!av.is_null(2));
+/// assert!(av.is_null(3));
+///
+/// // Values are polymorphic and so require a downcast.
+/// let ava: &BinaryArray = av.as_binary();
+///
+/// assert_eq!(ava.value(0), b"abc");
+/// assert_eq!(ava.value(2), b"def");
+/// ```
+#[derive(Debug)]
+pub struct GenericByteRunBuilder<R, V>
+where
+    R: ArrowPrimitiveType,
+    V: ByteArrayType,
+{
+    /// Run ends written so far (one entry per completed run).
+    run_ends_builder: PrimitiveBuilder<R>,
+    /// Values written so far (one entry per completed run).
+    values_builder: GenericByteBuilder<V>,
+    /// Bytes of the pending run's value; only meaningful when
+    /// `has_current_value` is true.
+    current_value: Vec<u8>,
+    /// Whether the pending run holds a value (`true`) or null (`false`).
+    has_current_value: bool,
+    /// Logical length appended so far, i.e. the end of the pending run.
+    current_run_end_index: usize,
+    /// Logical length at the time the last run was written out.
+    prev_run_end_index: usize,
+}
+
+impl<R, V> Default for GenericByteRunBuilder<R, V>
+where
+    R: ArrowPrimitiveType,
+    V: ByteArrayType,
+{
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<R, V> GenericByteRunBuilder<R, V>
+where
+    R: ArrowPrimitiveType,
+    V: ByteArrayType,
+{
+    /// Creates a new `GenericByteRunBuilder`
+    pub fn new() -> Self {
+        Self {
+            run_ends_builder: PrimitiveBuilder::new(),
+            values_builder: GenericByteBuilder::<V>::new(),
+            current_value: Vec::new(),
+            has_current_value: false,
+            current_run_end_index: 0,
+            prev_run_end_index: 0,
+        }
+    }
+
+    /// Creates a new `GenericByteRunBuilder` with the provided capacity
+    ///
+    /// `capacity`: the expected number of run-end encoded values.
+    /// `data_capacity`: the expected number of bytes of run end encoded values
+    pub fn with_capacity(capacity: usize, data_capacity: usize) -> Self {
+        Self {
+            run_ends_builder: PrimitiveBuilder::with_capacity(capacity),
+            values_builder: GenericByteBuilder::<V>::with_capacity(capacity, data_capacity),
+            current_value: Vec::new(),
+            has_current_value: false,
+            current_run_end_index: 0,
+            prev_run_end_index: 0,
+        }
+    }
+}
+
+impl<R, V> ArrayBuilder for GenericByteRunBuilder<R, V>
+where
+    R: RunEndIndexType,
+    V: ByteArrayType,
+{
+    /// Returns the builder as a non-mutable `Any` reference.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Returns the builder as a mutable `Any` reference.
+    fn as_any_mut(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    /// Returns the boxed builder as a box of `Any`.
+    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
+        self
+    }
+
+    /// Returns the length of logical array encoded by
+    /// the eventual runs array.
+    fn len(&self) -> usize {
+        self.current_run_end_index
+    }
+
+    /// Builds the array and reset this builder.
+    fn finish(&mut self) -> ArrayRef {
+        Arc::new(self.finish())
+    }
+
+    /// Builds the array without resetting the builder.
+    fn finish_cloned(&self) -> ArrayRef {
+        Arc::new(self.finish_cloned())
+    }
+}
+
+impl<R, V> GenericByteRunBuilder<R, V>
+where
+    R: RunEndIndexType,
+    V: ByteArrayType,
+{
+    /// Appends optional value to the logical array encoded by the RunArray.
+    pub fn append_option(&mut self, input_value: Option<impl AsRef<V::Native>>) {
+        match input_value {
+            Some(value) => self.append_value(value),
+            None => self.append_null(),
+        }
+    }
+
+    /// Appends value to the logical array encoded by the RunArray.
+    ///
+    /// A value equal to the pending run's value extends that run; any other
+    /// value closes the pending run and starts a new one.
+    pub fn append_value(&mut self, input_value: impl AsRef<V::Native>) {
+        let value: &[u8] = input_value.as_ref().as_ref();
+        if !self.has_current_value {
+            // Pending run (if any) is a null run: close it and start a value run.
+            self.append_run_end();
+            self.current_value.extend_from_slice(value);
+            self.has_current_value = true;
+        } else if self.current_value.as_slice() != value {
+            self.append_run_end();
+            self.current_value.clear();
+            self.current_value.extend_from_slice(value);
+        }
+        self.current_run_end_index += 1;
+    }
+
+    /// Appends null to the logical array encoded by the RunArray.
+    ///
+    /// Consecutive nulls are merged into a single null run.
+    pub fn append_null(&mut self) {
+        if self.has_current_value {
+            self.append_run_end();
+            self.current_value.clear();
+            self.has_current_value = false;
+        }
+        self.current_run_end_index += 1;
+    }
+
+    /// Creates the RunArray and resets the builder.
+    ///
+    /// # Panics
+    ///
+    /// Panics if RunArray cannot be built.
+    pub fn finish(&mut self) -> RunArray<R> {
+        // write the last run end to the array.
+        self.append_run_end();
+
+        // reset the pending-run state and the run end indexes to zero.
+        self.current_value.clear();
+        self.has_current_value = false;
+        self.current_run_end_index = 0;
+        self.prev_run_end_index = 0;
+
+        // build the run encoded array by adding run_ends and values array as its children.
+        let run_ends_array = self.run_ends_builder.finish();
+        let values_array = self.values_builder.finish();
+        RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap()
+    }
+
+    /// Creates the RunArray without resetting the builder.
+    ///
+    /// # Panics
+    ///
+    /// Panics if RunArray cannot be built.
+    pub fn finish_cloned(&self) -> RunArray<R> {
+        let mut run_ends_array = self.run_ends_builder.finish_cloned();
+        let mut values_array = self.values_builder.finish_cloned();
+
+        // Add current run if one exists
+        if self.prev_run_end_index != self.current_run_end_index {
+            let mut run_end_builder = run_ends_array.into_builder().unwrap();
+            let mut values_builder = values_array.into_builder().unwrap();
+            self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder);
+            run_ends_array = run_end_builder.finish();
+            values_array = values_builder.finish();
+        }
+
+        RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap()
+    }
+
+    // Appends the current run to the array.
+    fn append_run_end(&mut self) {
+        // empty array or the function called without appending any value.
+        if self.prev_run_end_index == self.current_run_end_index {
+            return;
+        }
+        let run_end = self.run_end_index_as_native();
+        let value = self
+            .has_current_value
+            .then_some(self.current_value.as_slice());
+        // SAFETY: `current_value` is only ever filled in `append_value` from the
+        // bytes of a `V::Native`, so it can be turned back into one unchecked.
+        unsafe {
+            Self::write_run(
+                &mut self.run_ends_builder,
+                &mut self.values_builder,
+                run_end,
+                value,
+            );
+        }
+        self.prev_run_end_index = self.current_run_end_index;
+    }
+
+    // Similar to `append_run_end` but on custom builders.
+    // Used in `finish_cloned` which is not supposed to mutate `self`.
+    fn append_run_end_with_builders(
+        &self,
+        run_ends_builder: &mut PrimitiveBuilder<R>,
+        values_builder: &mut GenericByteBuilder<V>,
+    ) {
+        let run_end = self.run_end_index_as_native();
+        let value = self
+            .has_current_value
+            .then_some(self.current_value.as_slice());
+        // SAFETY: `current_value` is only ever filled in `append_value` from the
+        // bytes of a `V::Native`, so it can be turned back into one unchecked.
+        unsafe { Self::write_run(run_ends_builder, values_builder, run_end, value) }
+    }
+
+    /// Writes one run (`run_end` plus its value, or null for `None`) to the
+    /// given builders.
+    ///
+    /// # Safety
+    ///
+    /// When `value` is `Some`, the bytes must have been obtained from a
+    /// `V::Native`, because they are converted back without validation.
+    unsafe fn write_run(
+        run_ends_builder: &mut PrimitiveBuilder<R>,
+        values_builder: &mut GenericByteBuilder<V>,
+        run_end: R::Native,
+        value: Option<&[u8]>,
+    ) {
+        run_ends_builder.append_value(run_end);
+        match value {
+            Some(bytes) => {
+                // SAFETY: guaranteed by this function's contract.
+                let native = unsafe { V::Native::from_bytes_unchecked(bytes) };
+                values_builder.append_value(native);
+            }
+            None => values_builder.append_null(),
+        }
+    }
+
+    /// Converts the current logical length to the run-end native type.
+    ///
+    /// Panics when the length does not fit into `R::Native`.
+    fn run_end_index_as_native(&self) -> R::Native {
+        R::Native::from_usize(self.current_run_end_index).unwrap_or_else(|| {
+            panic!(
+                "Cannot convert the value {} from `usize` to native form of arrow datatype {}",
+                self.current_run_end_index,
+                R::DATA_TYPE
+            )
+        })
+    }
+}
+
+impl<R, V, S> Extend<Option<S>> for GenericByteRunBuilder<R, V>
+where
+    R: RunEndIndexType,
+    V: ByteArrayType,
+    S: AsRef<V::Native>,
+{
+    fn extend<T: IntoIterator<Item = Option<S>>>(&mut self, iter: T) {
+        for elem in iter {
+            self.append_option(elem);
+        }
+    }
+}
+
+/// Builder for [`RunArray`] of [`StringArray`](crate::array::StringArray)
+///
+/// ```
+/// // Create a run-end encoded array with run-end indexes data type as `i16`.
+/// // The encoded values are Strings.
+///
+/// # use arrow_array::builder::StringRunBuilder;
+/// # use arrow_array::{Int16Array, StringArray};
+/// # use arrow_array::types::Int16Type;
+/// # use arrow_array::cast::AsArray;
+/// #
+/// let mut builder = StringRunBuilder::<Int16Type>::new();
+///
+/// // The builder builds the run array value by value
+/// builder.append_value("abc");
+/// builder.append_null();
+/// builder.extend([Some("def"), Some("def"), Some("abc")]);
+/// let array = builder.finish();
+///
+/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
+///
+/// // Values are polymorphic and so require a downcast.
+/// let av = array.values();
+/// let ava: &StringArray = av.as_string::<i32>();
+///
+/// assert_eq!(ava.value(0), "abc");
+/// assert!(av.is_null(1));
+/// assert_eq!(ava.value(2), "def");
+/// assert_eq!(ava.value(3), "abc");
+///
+/// ```
+pub type StringRunBuilder<K> = GenericByteRunBuilder<K, Utf8Type>;
+
+/// Builder for [`RunArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
+pub type LargeStringRunBuilder<K> = GenericByteRunBuilder<K, LargeUtf8Type>;
+
+/// Builder for [`RunArray`] of [`BinaryArray`](crate::array::BinaryArray)
+///
+/// ```
+/// // Create a run-end encoded array with run-end indexes data type as `i16`.
+/// // The encoded data is binary values.
+///
+/// # use arrow_array::builder::BinaryRunBuilder;
+/// # use arrow_array::{BinaryArray, Int16Array};
+/// # use arrow_array::cast::AsArray;
+/// # use arrow_array::types::Int16Type;
+///
+/// let mut builder = BinaryRunBuilder::<Int16Type>::new();
+///
+/// // The builder builds the run array value by value
+/// builder.append_value(b"abc");
+/// builder.append_null();
+/// builder.extend([Some(b"def"), Some(b"def"), Some(b"abc")]);
+/// let array = builder.finish();
+///
+/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
+///
+/// // Values are polymorphic and so require a downcast.
+/// let av = array.values();
+/// let ava: &BinaryArray = av.as_binary();
+///
+/// assert_eq!(ava.value(0), b"abc");
+/// assert!(av.is_null(1));
+/// assert_eq!(ava.value(2), b"def");
+/// assert_eq!(ava.value(3), b"abc");
+///
+/// ```
+pub type BinaryRunBuilder<K> = GenericByteRunBuilder<K, BinaryType>;
+
+/// Builder for [`RunArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
+pub type LargeBinaryRunBuilder<K> = GenericByteRunBuilder<K, LargeBinaryType>;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use crate::array::Array;
+    use crate::cast::AsArray;
+    use crate::types::{Int16Type, Int32Type};
+    use crate::GenericByteArray;
+    use crate::Int16RunArray;
+
+    fn test_bytes_run_builder<T>(values: Vec<&T::Native>)
+    where
+        T: ByteArrayType,
+        <T as ByteArrayType>::Native: PartialEq,
+        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
+    {
+        let mut builder = GenericByteRunBuilder::<Int16Type, T>::new();
+        builder.append_value(values[0]);
+        builder.append_value(values[0]);
+        builder.append_value(values[0]);
+        builder.append_null();
+        builder.append_null();
+        builder.append_value(values[1]);
+        builder.append_value(values[1]);
+        builder.append_value(values[2]);
+        builder.append_value(values[2]);
+        builder.append_value(values[2]);
+        builder.append_value(values[2]);
+        let array = builder.finish();
+
+        assert_eq!(array.len(), 11);
+        assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 2);
+
+        assert_eq!(array.run_ends().values(), &[3, 5, 7, 11]);
+
+        // Values are polymorphic and so require a downcast.
+        let av = array.values();
+        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
+
+        assert_eq!(*ava.value(0), *values[0]);
+        assert!(ava.is_null(1));
+        assert_eq!(*ava.value(2), *values[1]);
+        assert_eq!(*ava.value(3), *values[2]);
+    }
+
+    #[test]
+    fn test_string_run_builder() {
+        test_bytes_run_builder::<Utf8Type>(vec!["abc", "def", "ghi"]);
+    }
+
+    #[test]
+    fn test_string_run_builder_with_empty_strings() {
+        test_bytes_run_builder::<Utf8Type>(vec!["abc", "", "ghi"]);
+    }
+
+    #[test]
+    fn test_binary_run_builder() {
+        test_bytes_run_builder::<BinaryType>(vec![b"abc", b"def", b"ghi"]);
+    }
+
+    fn test_bytes_run_builder_finish_cloned<T>(values: Vec<&T::Native>)
+    where
+        T: ByteArrayType,
+        <T as ByteArrayType>::Native: PartialEq,
+        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
+    {
+        let mut builder = GenericByteRunBuilder::<Int16Type, T>::new();
+
+        builder.append_value(values[0]);
+        builder.append_null();
+        builder.append_value(values[1]);
+        builder.append_value(values[1]);
+        builder.append_value(values[0]);
+        let mut array: Int16RunArray = builder.finish_cloned();
+
+        assert_eq!(array.len(), 5);
+        assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 1);
+
+        assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
+
+        // Values are polymorphic and so require a downcast.
+        let av = array.values();
+        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
+
+        assert_eq!(ava.value(0), values[0]);
+        assert!(ava.is_null(1));
+        assert_eq!(ava.value(2), values[1]);
+        assert_eq!(ava.value(3), values[0]);
+
+        // Append last value before `finish_cloned` (`value[0]`) again and ensure it has only
+        // one entry in final output.
+        builder.append_value(values[0]);
+        builder.append_value(values[0]);
+        builder.append_value(values[1]);
+        array = builder.finish();
+
+        assert_eq!(array.len(), 8);
+        assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 1);
+
+        assert_eq!(array.run_ends().values(), &[1, 2, 4, 7, 8]);
+
+        // Values are polymorphic and so require a downcast.
+        let av2 = array.values();
+        let ava2: &GenericByteArray<T> =
+            av2.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
+
+        assert_eq!(ava2.value(0), values[0]);
+        assert!(ava2.is_null(1));
+        assert_eq!(ava2.value(2), values[1]);
+        // The value appended before and after `finish_cloned` has only one entry.
+        assert_eq!(ava2.value(3), values[0]);
+        assert_eq!(ava2.value(4), values[1]);
+    }
+
+    #[test]
+    fn test_string_run_builder_finish_cloned() {
+        test_bytes_run_builder_finish_cloned::<Utf8Type>(vec!["abc", "def", "ghi"]);
+    }
+
+    #[test]
+    fn test_binary_run_builder_finish_cloned() {
+        test_bytes_run_builder_finish_cloned::<BinaryType>(vec![b"abc", b"def", b"ghi"]);
+    }
+
+    #[test]
+    fn test_extend() {
+        let mut builder = StringRunBuilder::<Int32Type>::new();
+        builder.extend(["a", "a", "a", "", "", "b", "b"].into_iter().map(Some));
+        builder.extend(["b", "cupcakes", "cupcakes"].into_iter().map(Some));
+        let array = builder.finish();
+
+        assert_eq!(array.len(), 10);
+        assert_eq!(array.run_ends().values(), &[3, 5, 8, 10]);
+
+        let str_array = array.values().as_string::<i32>();
+        assert_eq!(str_array.value(0), "a");
+        assert_eq!(str_array.value(1), "");
+        assert_eq!(str_array.value(2), "b");
+        assert_eq!(str_array.value(3), "cupcakes");
+    }
+}
diff --git a/arrow-array/src/builder2/generic_bytes_builder.rs b/arrow-array/src/builder2/generic_bytes_builder.rs
new file mode 100644
index 000000000000..6b6c1a842350
--- /dev/null
+++ b/arrow-array/src/builder2/generic_bytes_builder.rs
@@ -0,0 +1,596 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::builder2::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
+use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
+use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait};
+use arrow_buffer::NullBufferBuilder;
+use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
+use arrow_data::ArrayDataBuilder;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Builder for [`GenericByteArray`]
+///
+/// For building strings, see docs on [`GenericStringBuilder`].
+/// For building binary, see docs on [`GenericBinaryBuilder`].
+pub struct GenericByteBuilder<T: ByteArrayType> {
+    /// Concatenated bytes of all appended values.
+    value_builder: UInt8BufferBuilder,
+    /// End offset of every slot, preceded by the initial `0`.
+    offsets_builder: BufferBuilder<T::Offset>,
+    /// Validity of every slot.
+    null_buffer_builder: NullBufferBuilder,
+}
+
+impl<T: ByteArrayType> GenericByteBuilder<T> {
+    /// Creates a new [`GenericByteBuilder`].
+    pub fn new() -> Self {
+        Self::with_capacity(1024, 1024)
+    }
+
+    /// Creates a new [`GenericByteBuilder`].
+    ///
+    /// - `item_capacity` is the number of items to pre-allocate.
+    ///   The size of the preallocated buffer of offsets is the number of items plus one.
+    /// - `data_capacity` is the total number of bytes of data to pre-allocate
+    ///   (for all items, not per item).
+    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+        let mut offsets_builder = BufferBuilder::<T::Offset>::new(item_capacity + 1);
+        // The offsets buffer always starts with the offset of the first value.
+        offsets_builder.append(T::Offset::from_usize(0).unwrap());
+        Self {
+            value_builder: UInt8BufferBuilder::new(data_capacity),
+            offsets_builder,
+            null_buffer_builder: NullBufferBuilder::new(item_capacity),
+        }
+    }
+
+    /// Creates a new [`GenericByteBuilder`] from buffers.
+    ///
+    /// # Safety
+    ///
+    /// This doesn't verify buffer contents as it assumes the buffers are from
+    /// existing and valid [`GenericByteArray`].
+    pub unsafe fn new_from_buffer(
+        offsets_buffer: MutableBuffer,
+        value_buffer: MutableBuffer,
+        null_buffer: Option<MutableBuffer>,
+    ) -> Self {
+        let offsets_builder = BufferBuilder::<T::Offset>::new_from_buffer(offsets_buffer);
+        let value_builder = BufferBuilder::<u8>::new_from_buffer(value_buffer);
+
+        // There is one more offset than there are slots.
+        let len = offsets_builder.len() - 1;
+        let null_buffer_builder = null_buffer
+            .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, len))
+            .unwrap_or_else(|| NullBufferBuilder::new_with_len(len));
+
+        Self {
+            offsets_builder,
+            value_builder,
+            null_buffer_builder,
+        }
+    }
+
+    /// Returns the offset that ends the current slot, i.e. the number of
+    /// value bytes written so far.
+    ///
+    /// Panics if that length does not fit into `T::Offset`.
+    #[inline]
+    fn next_offset(&self) -> T::Offset {
+        T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow")
+    }
+
+    /// Appends a value into the builder.
+    ///
+    /// See the [GenericStringBuilder] documentation for examples of
+    /// incrementally building string values with multiple `write!` calls.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the resulting length of [`Self::values_slice`] would exceed
+    /// `T::Offset::MAX` bytes.
+    ///
+    /// For example, this can happen with [`StringArray`] or [`BinaryArray`]
+    /// where the total length of all values exceeds 2GB
+    ///
+    /// [`StringArray`]: crate::StringArray
+    /// [`BinaryArray`]: crate::BinaryArray
+    #[inline]
+    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
+        self.value_builder.append_slice(value.as_ref().as_ref());
+        self.null_buffer_builder.append(true);
+        self.offsets_builder.append(self.next_offset());
+    }
+
+    /// Append an `Option` value into the builder.
+    ///
+    /// - A `None` value will append a null value.
+    /// - A `Some` value will append the value.
+    ///
+    /// See [`Self::append_value`] for more panic information.
+    #[inline]
+    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
+        match value {
+            None => self.append_null(),
+            Some(v) => self.append_value(v),
+        };
+    }
+
+    /// Append a null value into the builder.
+    ///
+    /// The slot's end offset is the current value length, so any bytes
+    /// written since the previous slot end up inside the null slot.
+    #[inline]
+    pub fn append_null(&mut self) {
+        self.null_buffer_builder.append(false);
+        self.offsets_builder.append(self.next_offset());
+    }
+
+    /// Builds the [`GenericByteArray`] and reset this builder.
+    pub fn finish(&mut self) -> GenericByteArray<T> {
+        let array_type = T::DATA_TYPE;
+        let array_builder = ArrayDataBuilder::new(array_type)
+            .len(self.len())
+            .add_buffer(self.offsets_builder.finish())
+            .add_buffer(self.value_builder.finish())
+            .nulls(self.null_buffer_builder.finish());
+
+        // Re-seed the offsets with the start offset so the builder can be
+        // reused after the buffers above have been taken.
+        self.offsets_builder.append(self.next_offset());
+        // SAFETY: the buffers were filled by this builder's own append
+        // methods, or supplied under the `new_from_buffer` safety contract.
+        let array_data = unsafe { array_builder.build_unchecked() };
+        GenericByteArray::from(array_data)
+    }
+
+    /// Builds the [`GenericByteArray`] without resetting the builder.
+    pub fn finish_cloned(&self) -> GenericByteArray<T> {
+        let array_type = T::DATA_TYPE;
+        let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice());
+        let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice());
+        let array_builder = ArrayDataBuilder::new(array_type)
+            .len(self.len())
+            .add_buffer(offset_buffer)
+            .add_buffer(value_buffer)
+            .nulls(self.null_buffer_builder.finish_cloned());
+
+        // SAFETY: the buffers are copies of what this builder's own append
+        // methods wrote, or of what was supplied under the `new_from_buffer`
+        // safety contract.
+        let array_data = unsafe { array_builder.build_unchecked() };
+        GenericByteArray::from(array_data)
+    }
+
+    /// Returns the current values buffer as a slice
+    pub fn values_slice(&self) -> &[u8] {
+        self.value_builder.as_slice()
+    }
+
+    /// Returns the current offsets buffer as a slice
+    pub fn offsets_slice(&self) -> &[T::Offset] {
+        self.offsets_builder.as_slice()
+    }
+
+    /// Returns the current null buffer as a slice
+    pub fn validity_slice(&self) -> Option<&[u8]> {
+        self.null_buffer_builder.as_slice()
+    }
+
+    /// Returns the current null buffer as a mutable slice
+    pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
+        self.null_buffer_builder.as_slice_mut()
+    }
+}
+
+impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Write the concrete builder name first, then the fields under an
+        // empty struct name.
+        write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?;
+        f.debug_struct("")
+            .field("value_builder", &self.value_builder)
+            .field("offsets_builder", &self.offsets_builder)
+            .field("null_buffer_builder", &self.null_buffer_builder)
+            .finish()
+    }
+}
+
+impl<T: ByteArrayType> Default for GenericByteBuilder<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> {
+    /// Returns the number of binary slots in the builder
+    fn len(&self) -> usize {
+        self.null_buffer_builder.len()
+    }
+
+    /// Builds the array and reset this builder.
+    fn finish(&mut self) -> ArrayRef {
+        Arc::new(self.finish())
+    }
+
+    /// Builds the array without resetting the builder.
+    fn finish_cloned(&self) -> ArrayRef {
+        Arc::new(self.finish_cloned())
+    }
+
+    /// Returns the builder as a non-mutable `Any` reference.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Returns the builder as a mutable `Any` reference.
+    fn as_any_mut(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    /// Returns the boxed builder as a box of `Any`.
+    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
+        self
+    }
+}
+
+impl<T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>> for GenericByteBuilder<T> {
+    #[inline]
+    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
+        for v in iter {
+            self.append_option(v)
+        }
+    }
+}
+
+/// Array builder for [`GenericStringArray`][crate::GenericStringArray]
+///
+/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with
+/// [`GenericByteBuilder::append_null`].
+///
+/// This builder also implements [`std::fmt::Write`] with any written data
+/// included in the next appended value. This allows using [`std::fmt::Display`]
+/// with standard Rust idioms like `write!` and `writeln!` to write data
+/// directly to the builder without intermediate allocations.
+///
+/// # Example writing strings with `append_value`
+/// ```
+/// # use arrow_array::builder::GenericStringBuilder;
+/// let mut builder = GenericStringBuilder::<i32>::new();
+///
+/// // Write one string value
+/// builder.append_value("foobarbaz");
+///
+/// // Write a second string
+/// builder.append_value("v2");
+///
+/// let array = builder.finish();
+/// assert_eq!(array.value(0), "foobarbaz");
+/// assert_eq!(array.value(1), "v2");
+/// ```
+///
+/// # Example incrementally writing strings with `std::fmt::Write`
+///
+/// ```
+/// # use std::fmt::Write;
+/// # use arrow_array::builder::GenericStringBuilder;
+/// let mut builder = GenericStringBuilder::<i32>::new();
+///
+/// // Write data in multiple `write!` calls
+/// write!(builder, "foo").unwrap();
+/// write!(builder, "bar").unwrap();
+/// // The next call to append_value finishes the current string
+/// // including all previously written strings.
+/// builder.append_value("baz");
+///
+/// // Write second value with a single write call
+/// write!(builder, "v2").unwrap();
+/// // finish the value by calling append_value with an empty string
+/// builder.append_value("");
+///
+/// let array = builder.finish();
+/// assert_eq!(array.value(0), "foobarbaz");
+/// assert_eq!(array.value(1), "v2");
+/// ```
+pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>;
+
+impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
+    /// Appends `s` to the value bytes without closing the slot; the bytes
+    /// become part of the next appended slot.
+    fn write_str(&mut self, s: &str) -> std::fmt::Result {
+        self.value_builder.append_slice(s.as_bytes());
+        Ok(())
+    }
+}
+
+/// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
+///
+/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with
+/// [`GenericByteBuilder::append_null`].
+///
+/// # Example
+/// ```
+/// # use arrow_array::builder::GenericBinaryBuilder;
+/// let mut builder = GenericBinaryBuilder::<i32>::new();
+///
+/// // Write data
+/// builder.append_value("foo");
+///
+/// // Write second value
+/// builder.append_value(&[0,1,2]);
+///
+/// let array = builder.finish();
+/// // binary values
+/// assert_eq!(array.value(0), b"foo");
+/// assert_eq!(array.value(1), b"\x00\x01\x02");
+/// ```
+///
+/// # Example incrementally writing bytes with `write_bytes`
+///
+/// ```
+/// # use std::io::Write;
+/// # use arrow_array::builder::GenericBinaryBuilder;
+/// let mut builder = GenericBinaryBuilder::<i32>::new();
+///
+/// // Write data in multiple `write_bytes` calls
+/// write!(builder, "foo").unwrap();
+/// write!(builder, "bar").unwrap();
+/// // The next call to append_value finishes the current string
+/// // including all previously written strings.
+/// builder.append_value("baz"); +/// +/// // Write second value with a single write call +/// write!(builder, "v2").unwrap(); +/// // finish the value by calling append_value with an empty string +/// builder.append_value(""); +/// +/// let array = builder.finish(); +/// assert_eq!(array.value(0), "foobarbaz".as_bytes()); +/// assert_eq!(array.value(1), "v2".as_bytes()); +/// ``` +pub type GenericBinaryBuilder = GenericByteBuilder>; + +impl std::io::Write for GenericBinaryBuilder { + fn write(&mut self, bs: &[u8]) -> std::io::Result { + self.value_builder.append_slice(bs); + Ok(bs.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::array::Array; + use crate::GenericStringArray; + use std::fmt::Write as _; + use std::io::Write as _; + + fn _test_generic_binary_builder() { + let mut builder = GenericBinaryBuilder::::new(); + + builder.append_value(b"hello"); + builder.append_value(b""); + builder.append_null(); + builder.append_value(b"rust"); + + let array = builder.finish(); + + assert_eq!(4, array.len()); + assert_eq!(1, array.null_count()); + assert_eq!(b"hello", array.value(0)); + assert_eq!([] as [u8; 0], array.value(1)); + assert!(array.is_null(2)); + assert_eq!(b"rust", array.value(3)); + assert_eq!(O::from_usize(5).unwrap(), array.value_offsets()[2]); + assert_eq!(O::from_usize(4).unwrap(), array.value_length(3)); + } + + #[test] + fn test_binary_builder() { + _test_generic_binary_builder::() + } + + #[test] + fn test_large_binary_builder() { + _test_generic_binary_builder::() + } + + fn _test_generic_binary_builder_all_nulls() { + let mut builder = GenericBinaryBuilder::::new(); + builder.append_null(); + builder.append_null(); + builder.append_null(); + assert_eq!(3, builder.len()); + assert!(!builder.is_empty()); + + let array = builder.finish(); + assert_eq!(3, array.null_count()); + assert_eq!(3, array.len()); + assert!(array.is_null(0)); + assert!(array.is_null(1)); + 
assert!(array.is_null(2)); + } + + #[test] + fn test_binary_builder_all_nulls() { + _test_generic_binary_builder_all_nulls::() + } + + #[test] + fn test_large_binary_builder_all_nulls() { + _test_generic_binary_builder_all_nulls::() + } + + fn _test_generic_binary_builder_reset() { + let mut builder = GenericBinaryBuilder::::new(); + + builder.append_value(b"hello"); + builder.append_value(b""); + builder.append_null(); + builder.append_value(b"rust"); + builder.finish(); + + assert!(builder.is_empty()); + + builder.append_value(b"parquet"); + builder.append_null(); + builder.append_value(b"arrow"); + builder.append_value(b""); + let array = builder.finish(); + + assert_eq!(4, array.len()); + assert_eq!(1, array.null_count()); + assert_eq!(b"parquet", array.value(0)); + assert!(array.is_null(1)); + assert_eq!(b"arrow", array.value(2)); + assert_eq!(b"", array.value(1)); + assert_eq!(O::zero(), array.value_offsets()[0]); + assert_eq!(O::from_usize(7).unwrap(), array.value_offsets()[2]); + assert_eq!(O::from_usize(5).unwrap(), array.value_length(2)); + } + + #[test] + fn test_binary_builder_reset() { + _test_generic_binary_builder_reset::() + } + + #[test] + fn test_large_binary_builder_reset() { + _test_generic_binary_builder_reset::() + } + + fn _test_generic_string_array_builder() { + let mut builder = GenericStringBuilder::::new(); + let owned = "arrow".to_owned(); + + builder.append_value("hello"); + builder.append_value(""); + builder.append_value(&owned); + builder.append_null(); + builder.append_option(Some("rust")); + builder.append_option(None::<&str>); + builder.append_option(None::); + assert_eq!(7, builder.len()); + + assert_eq!( + GenericStringArray::::from(vec![ + Some("hello"), + Some(""), + Some("arrow"), + None, + Some("rust"), + None, + None + ]), + builder.finish() + ); + } + + #[test] + fn test_string_array_builder() { + _test_generic_string_array_builder::() + } + + #[test] + fn test_large_string_array_builder() { + 
_test_generic_string_array_builder::() + } + + fn _test_generic_string_array_builder_finish() { + let mut builder = GenericStringBuilder::::with_capacity(3, 11); + + builder.append_value("hello"); + builder.append_value("rust"); + builder.append_null(); + + builder.finish(); + assert!(builder.is_empty()); + assert_eq!(&[O::zero()], builder.offsets_slice()); + + builder.append_value("arrow"); + builder.append_value("parquet"); + let arr = builder.finish(); + // array should not have null buffer because there is not `null` value. + assert!(arr.nulls().is_none()); + assert_eq!(GenericStringArray::::from(vec!["arrow", "parquet"]), arr,) + } + + #[test] + fn test_string_array_builder_finish() { + _test_generic_string_array_builder_finish::() + } + + #[test] + fn test_large_string_array_builder_finish() { + _test_generic_string_array_builder_finish::() + } + + fn _test_generic_string_array_builder_finish_cloned() { + let mut builder = GenericStringBuilder::::with_capacity(3, 11); + + builder.append_value("hello"); + builder.append_value("rust"); + builder.append_null(); + + let mut arr = builder.finish_cloned(); + assert!(!builder.is_empty()); + assert_eq!(3, arr.len()); + + builder.append_value("arrow"); + builder.append_value("parquet"); + arr = builder.finish(); + + assert!(arr.nulls().is_some()); + assert_eq!(&[O::zero()], builder.offsets_slice()); + assert_eq!(5, arr.len()); + } + + #[test] + fn test_string_array_builder_finish_cloned() { + _test_generic_string_array_builder_finish_cloned::() + } + + #[test] + fn test_large_string_array_builder_finish_cloned() { + _test_generic_string_array_builder_finish_cloned::() + } + + #[test] + fn test_extend() { + let mut builder = GenericStringBuilder::::new(); + builder.extend(["a", "b", "c", "", "a", "b", "c"].into_iter().map(Some)); + builder.extend(["d", "cupcakes", "hello"].into_iter().map(Some)); + let array = builder.finish(); + assert_eq!(array.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]); + 
assert_eq!(array.value_data(), b"abcabcdcupcakeshello"); + } + + #[test] + fn test_write_str() { + let mut builder = GenericStringBuilder::::new(); + write!(builder, "foo").unwrap(); + builder.append_value(""); + writeln!(builder, "bar").unwrap(); + builder.append_value(""); + write!(builder, "fiz").unwrap(); + write!(builder, "buz").unwrap(); + builder.append_value(""); + let a = builder.finish(); + let r: Vec<_> = a.iter().flatten().collect(); + assert_eq!(r, &["foo", "bar\n", "fizbuz"]) + } + + #[test] + fn test_write_bytes() { + let mut builder = GenericBinaryBuilder::::new(); + write!(builder, "foo").unwrap(); + builder.append_value(""); + writeln!(builder, "bar").unwrap(); + builder.append_value(""); + write!(builder, "fiz").unwrap(); + write!(builder, "buz").unwrap(); + builder.append_value(""); + let a = builder.finish(); + let r: Vec<_> = a.iter().flatten().collect(); + assert_eq!( + r, + &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()] + ) + } +} diff --git a/arrow-array/src/builder2/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder2/generic_bytes_dictionary_builder.rs new file mode 100644 index 000000000000..3a1469177488 --- /dev/null +++ b/arrow-array/src/builder2/generic_bytes_dictionary_builder.rs @@ -0,0 +1,667 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::builder2::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; +use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType}; +use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray}; +use arrow_buffer::ArrowNativeType; +use arrow_schema::{ArrowError, DataType}; +use hashbrown::HashTable; +use std::any::Any; +use std::sync::Arc; + +/// Builder for [`DictionaryArray`] of [`GenericByteArray`] +/// +/// For example to map a set of byte indices to String values. Note that +/// the use of a `HashMap` here will not scale to very large arrays or +/// result in an ordered dictionary. +#[derive(Debug)] +pub struct GenericByteDictionaryBuilder +where + K: ArrowDictionaryKeyType, + T: ByteArrayType, +{ + state: ahash::RandomState, + dedup: HashTable, + + keys_builder: PrimitiveBuilder, + values_builder: GenericByteBuilder, +} + +impl Default for GenericByteDictionaryBuilder +where + K: ArrowDictionaryKeyType, + T: ByteArrayType, +{ + fn default() -> Self { + Self::new() + } +} + +impl GenericByteDictionaryBuilder +where + K: ArrowDictionaryKeyType, + T: ByteArrayType, +{ + /// Creates a new `GenericByteDictionaryBuilder` + pub fn new() -> Self { + let keys_builder = PrimitiveBuilder::new(); + let values_builder = GenericByteBuilder::::new(); + Self { + state: Default::default(), + dedup: HashTable::with_capacity(keys_builder.capacity()), + keys_builder, + values_builder, + } + } + + /// Creates a new `GenericByteDictionaryBuilder` with the provided capacities + /// + /// `keys_capacity`: the number of keys, i.e. length of array to build + /// `value_capacity`: the number of distinct dictionary values, i.e. 
size of dictionary + /// `data_capacity`: the total number of bytes of all distinct bytes in the dictionary + pub fn with_capacity( + keys_capacity: usize, + value_capacity: usize, + data_capacity: usize, + ) -> Self { + Self { + state: Default::default(), + dedup: Default::default(), + keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), + values_builder: GenericByteBuilder::::with_capacity(value_capacity, data_capacity), + } + } + + /// Creates a new `GenericByteDictionaryBuilder` from a keys capacity and a dictionary + /// which is initialized with the given values. + /// The indices of those dictionary values are used as keys. + /// + /// # Example + /// + /// ``` + /// # use arrow_array::builder::StringDictionaryBuilder; + /// # use arrow_array::{Int16Array, StringArray}; + /// + /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]); + /// + /// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap(); + /// builder.append("def").unwrap(); + /// builder.append_null(); + /// builder.append("abc").unwrap(); + /// + /// let dictionary_array = builder.finish(); + /// + /// let keys = dictionary_array.keys(); + /// + /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)])); + /// ``` + pub fn new_with_dictionary( + keys_capacity: usize, + dictionary_values: &GenericByteArray, + ) -> Result { + let state = ahash::RandomState::default(); + let dict_len = dictionary_values.len(); + + let mut dedup = HashTable::with_capacity(dict_len); + + let values_len = dictionary_values.value_data().len(); + let mut values_builder = GenericByteBuilder::::with_capacity(dict_len, values_len); + + K::Native::from_usize(dictionary_values.len()) + .ok_or(ArrowError::DictionaryKeyOverflowError)?; + + for (idx, maybe_value) in dictionary_values.iter().enumerate() { + match maybe_value { + Some(value) => { + let value_bytes: &[u8] = value.as_ref(); + let hash = state.hash_one(value_bytes); + + dedup + 
.entry( + hash, + |idx: &usize| value_bytes == get_bytes(&values_builder, *idx), + |idx: &usize| state.hash_one(get_bytes(&values_builder, *idx)), + ) + .or_insert(idx); + + values_builder.append_value(value); + } + None => values_builder.append_null(), + } + } + + Ok(Self { + state, + dedup, + keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), + values_builder, + }) + } +} + +impl ArrayBuilder for GenericByteDictionaryBuilder +where + K: ArrowDictionaryKeyType, + T: ByteArrayType, +{ + /// Returns the builder as an non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.keys_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl GenericByteDictionaryBuilder +where + K: ArrowDictionaryKeyType, + T: ByteArrayType, +{ + fn get_or_insert_key(&mut self, value: impl AsRef) -> Result { + let value_native: &T::Native = value.as_ref(); + let value_bytes: &[u8] = value_native.as_ref(); + + let state = &self.state; + let storage = &mut self.values_builder; + let hash = state.hash_one(value_bytes); + + let idx = *self + .dedup + .entry( + hash, + |idx| value_bytes == get_bytes(storage, *idx), + |idx| state.hash_one(get_bytes(storage, *idx)), + ) + .or_insert_with(|| { + let idx = storage.len(); + storage.append_value(value); + idx + }) + .get(); + + let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?; + + Ok(key) + } + + /// Append a value to the array. 
Return an existing index + /// if already present in the values array or a new index if the + /// value is appended to the values array. + /// + /// Returns an error if the new index would overflow the key type. + pub fn append(&mut self, value: impl AsRef) -> Result { + let key = self.get_or_insert_key(value)?; + self.keys_builder.append_value(key); + Ok(key) + } + + /// Append a value multiple times to the array. + /// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups. + /// + /// Returns an error if the new index would overflow the key type. + pub fn append_n( + &mut self, + value: impl AsRef, + count: usize, + ) -> Result { + let key = self.get_or_insert_key(value)?; + self.keys_builder.append_value_n(key, count); + Ok(key) + } + + /// Infallibly append a value to this builder + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + pub fn append_value(&mut self, value: impl AsRef) { + self.append(value).expect("dictionary key overflow"); + } + + /// Infallibly append a value to this builder repeatedly `count` times. + /// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups. 
+ /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + pub fn append_values(&mut self, value: impl AsRef, count: usize) { + self.append_n(value, count) + .expect("dictionary key overflow"); + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_null(&mut self) { + self.keys_builder.append_null() + } + + /// Infallibly append `n` null slots into the builder + #[inline] + pub fn append_nulls(&mut self, n: usize) { + self.keys_builder.append_nulls(n) + } + + /// Append an `Option` value into the builder + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + #[inline] + pub fn append_option(&mut self, value: Option>) { + match value { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + + /// Append an `Option` value into the builder repeatedly `count` times. + /// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups. + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + pub fn append_options(&mut self, value: Option>, count: usize) { + match value { + None => self.keys_builder.append_nulls(count), + Some(v) => self.append_values(v, count), + }; + } + + /// Builds the `DictionaryArray` and reset this builder. + pub fn finish(&mut self) -> DictionaryArray { + self.dedup.clear(); + let values = self.values_builder.finish(); + let keys = self.keys_builder.finish(); + + let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + } + + /// Builds the `DictionaryArray` without resetting the builder. 
+ pub fn finish_cloned(&self) -> DictionaryArray { + let values = self.values_builder.finish_cloned(); + let keys = self.keys_builder.finish_cloned(); + + let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.keys_builder.validity_slice() + } +} + +impl> Extend> + for GenericByteDictionaryBuilder +{ + #[inline] + fn extend>>(&mut self, iter: I) { + for v in iter { + self.append_option(v) + } + } +} + +fn get_bytes(values: &GenericByteBuilder, idx: usize) -> &[u8] { + let offsets = values.offsets_slice(); + let values = values.values_slice(); + + let end_offset = offsets[idx + 1].as_usize(); + let start_offset = offsets[idx].as_usize(); + + &values[start_offset..end_offset] +} + +/// Builder for [`DictionaryArray`] of [`StringArray`](crate::array::StringArray) +/// +/// ``` +/// // Create a dictionary array indexed by bytes whose values are Strings. +/// // It can thus hold up to 256 distinct string values. +/// +/// # use arrow_array::builder::StringDictionaryBuilder; +/// # use arrow_array::{Int8Array, StringArray}; +/// # use arrow_array::types::Int8Type; +/// +/// let mut builder = StringDictionaryBuilder::::new(); +/// +/// // The builder builds the dictionary value by value +/// builder.append("abc").unwrap(); +/// builder.append_null(); +/// builder.append_n("def", 2).unwrap(); // appends "def" twice with a single lookup +/// builder.append("abc").unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.keys(), +/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. 
+/// let av = array.values(); +/// let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); +/// +/// assert_eq!(ava.value(0), "abc"); +/// assert_eq!(ava.value(1), "def"); +/// +/// ``` +pub type StringDictionaryBuilder = GenericByteDictionaryBuilder>; + +/// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray) +pub type LargeStringDictionaryBuilder = GenericByteDictionaryBuilder>; + +/// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray) +/// +/// ``` +/// // Create a dictionary array indexed by bytes whose values are binary. +/// // It can thus hold up to 256 distinct binary values. +/// +/// # use arrow_array::builder::BinaryDictionaryBuilder; +/// # use arrow_array::{BinaryArray, Int8Array}; +/// # use arrow_array::types::Int8Type; +/// +/// let mut builder = BinaryDictionaryBuilder::::new(); +/// +/// // The builder builds the dictionary value by value +/// builder.append(b"abc").unwrap(); +/// builder.append_null(); +/// builder.append(b"def").unwrap(); +/// builder.append(b"def").unwrap(); +/// builder.append(b"abc").unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.keys(), +/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. 
+/// let av = array.values(); +/// let ava: &BinaryArray = av.as_any().downcast_ref::().unwrap(); +/// +/// assert_eq!(ava.value(0), b"abc"); +/// assert_eq!(ava.value(1), b"def"); +/// +/// ``` +pub type BinaryDictionaryBuilder = GenericByteDictionaryBuilder>; + +/// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray) +pub type LargeBinaryDictionaryBuilder = GenericByteDictionaryBuilder>; + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::Int8Array; + use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type}; + use crate::{BinaryArray, StringArray}; + + fn test_bytes_dictionary_builder(values: Vec<&T::Native>) + where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.append(values[0]).unwrap(); + builder.append_null(); + builder.append(values[1]).unwrap(); + builder.append(values[1]).unwrap(); + builder.append(values[0]).unwrap(); + let array = builder.finish(); + + assert_eq!( + array.keys(), + &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) + ); + + // Values are polymorphic and so require a downcast. 
+ let av = array.values(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); + + assert_eq!(*ava.value(0), *values[0]); + assert_eq!(*ava.value(1), *values[1]); + } + + #[test] + fn test_string_dictionary_builder() { + test_bytes_dictionary_builder::>(vec!["abc", "def"]); + } + + #[test] + fn test_binary_dictionary_builder() { + test_bytes_dictionary_builder::>(vec![b"abc", b"def"]); + } + + fn test_bytes_dictionary_builder_finish_cloned(values: Vec<&T::Native>) + where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { + let mut builder = GenericByteDictionaryBuilder::::new(); + + builder.append(values[0]).unwrap(); + builder.append_null(); + builder.append(values[1]).unwrap(); + builder.append(values[1]).unwrap(); + builder.append(values[0]).unwrap(); + let mut array = builder.finish_cloned(); + + assert_eq!( + array.keys(), + &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) + ); + + // Values are polymorphic and so require a downcast. + let av = array.values(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); + + assert_eq!(ava.value(0), values[0]); + assert_eq!(ava.value(1), values[1]); + + builder.append(values[0]).unwrap(); + builder.append(values[2]).unwrap(); + builder.append(values[1]).unwrap(); + + array = builder.finish(); + + assert_eq!( + array.keys(), + &Int8Array::from(vec![ + Some(0), + None, + Some(1), + Some(1), + Some(0), + Some(0), + Some(2), + Some(1) + ]) + ); + + // Values are polymorphic and so require a downcast. 
+ let av2 = array.values(); + let ava2: &GenericByteArray = + av2.as_any().downcast_ref::>().unwrap(); + + assert_eq!(ava2.value(0), values[0]); + assert_eq!(ava2.value(1), values[1]); + assert_eq!(ava2.value(2), values[2]); + } + + #[test] + fn test_string_dictionary_builder_finish_cloned() { + test_bytes_dictionary_builder_finish_cloned::>(vec![ + "abc", "def", "ghi", + ]); + } + + #[test] + fn test_binary_dictionary_builder_finish_cloned() { + test_bytes_dictionary_builder_finish_cloned::>(vec![ + b"abc", b"def", b"ghi", + ]); + } + + fn test_bytes_dictionary_builder_with_existing_dictionary( + dictionary: GenericByteArray, + values: Vec<&T::Native>, + ) where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { + let mut builder = + GenericByteDictionaryBuilder::::new_with_dictionary(6, &dictionary) + .unwrap(); + builder.append(values[0]).unwrap(); + builder.append_null(); + builder.append(values[1]).unwrap(); + builder.append(values[1]).unwrap(); + builder.append(values[0]).unwrap(); + builder.append(values[2]).unwrap(); + let array = builder.finish(); + + assert_eq!( + array.keys(), + &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)]) + ); + + // Values are polymorphic and so require a downcast. 
+ let av = array.values(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); + + assert!(!ava.is_valid(0)); + assert_eq!(ava.value(1), values[1]); + assert_eq!(ava.value(2), values[0]); + assert_eq!(ava.value(3), values[2]); + } + + #[test] + fn test_string_dictionary_builder_with_existing_dictionary() { + test_bytes_dictionary_builder_with_existing_dictionary::>( + StringArray::from(vec![None, Some("def"), Some("abc")]), + vec!["abc", "def", "ghi"], + ); + } + + #[test] + fn test_binary_dictionary_builder_with_existing_dictionary() { + let values: Vec> = vec![None, Some(b"def"), Some(b"abc")]; + test_bytes_dictionary_builder_with_existing_dictionary::>( + BinaryArray::from(values), + vec![b"abc", b"def", b"ghi"], + ); + } + + fn test_bytes_dictionary_builder_with_reserved_null_value( + dictionary: GenericByteArray, + values: Vec<&T::Native>, + ) where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { + let mut builder = + GenericByteDictionaryBuilder::::new_with_dictionary(4, &dictionary) + .unwrap(); + builder.append(values[0]).unwrap(); + builder.append_null(); + builder.append(values[1]).unwrap(); + builder.append(values[0]).unwrap(); + let array = builder.finish(); + + assert!(array.is_null(1)); + assert!(!array.is_valid(1)); + + let keys = array.keys(); + + assert_eq!(keys.value(0), 1); + assert!(keys.is_null(1)); + // zero initialization is currently guaranteed by Buffer allocation and resizing + assert_eq!(keys.value(1), 0); + assert_eq!(keys.value(2), 2); + assert_eq!(keys.value(3), 1); + } + + #[test] + fn test_string_dictionary_builder_with_reserved_null_value() { + let v: Vec> = vec![None]; + test_bytes_dictionary_builder_with_reserved_null_value::>( + StringArray::from(v), + vec!["abc", "def"], + ); + } + + #[test] + fn test_binary_dictionary_builder_with_reserved_null_value() { + let values: Vec> = vec![None]; + test_bytes_dictionary_builder_with_reserved_null_value::>( + BinaryArray::from(values), + 
vec![b"abc", b"def"], + ); + } + + #[test] + fn test_extend() { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some)); + builder.extend(["c", "d", "a"].into_iter().map(Some)); + let dict = builder.finish(); + assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]); + assert_eq!(dict.values().len(), 4); + } +} diff --git a/arrow-array/src/builder2/generic_bytes_view_builder.rs b/arrow-array/src/builder2/generic_bytes_view_builder.rs new file mode 100644 index 000000000000..6bb1cf41438e --- /dev/null +++ b/arrow-array/src/builder2/generic_bytes_view_builder.rs @@ -0,0 +1,733 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::marker::PhantomData; +use std::sync::Arc; + +use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; +use arrow_data::ByteView; +use arrow_schema::ArrowError; +use hashbrown::hash_table::Entry; +use hashbrown::HashTable; + +use crate::builder2::ArrayBuilder; +use crate::types::bytes::ByteArrayNativeType; +use crate::types::{BinaryViewType, ByteViewType, StringViewType}; +use crate::{ArrayRef, GenericByteViewArray}; + +const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB +const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB + +enum BlockSizeGrowthStrategy { + Fixed { size: u32 }, + Exponential { current_size: u32 }, +} + +impl BlockSizeGrowthStrategy { + fn next_size(&mut self) -> u32 { + match self { + Self::Fixed { size } => *size, + Self::Exponential { current_size } => { + if *current_size < MAX_BLOCK_SIZE { + // we have fixed start/end block sizes, so we can't overflow + *current_size = current_size.saturating_mul(2); + *current_size + } else { + MAX_BLOCK_SIZE + } + } + } + } +} + +/// A builder for [`GenericByteViewArray`] +/// +/// A [`GenericByteViewArray`] consists of a list of data blocks containing string data, +/// and a list of views into those buffers. +/// +/// See examples on [`StringViewBuilder`] and [`BinaryViewBuilder`] +/// +/// This builder can be used in two ways +/// +/// # Append Values +/// +/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable +/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`] +/// writes values larger than 12 bytes to the current in-progress block, with values smaller +/// than 12 bytes inlined into the views. 
If a value is appended that will not fit in the +/// in-progress block, it will be closed, and a new block of sufficient size allocated +/// +/// # Append Views +/// +/// Some use-cases may wish to reuse an existing allocation containing string data, for example, +/// when parsing data from a parquet data page. In such a case entire blocks can be appended +/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended +/// using [`GenericByteViewBuilder::try_append_view`] +pub struct GenericByteViewBuilder { + views_builder: BufferBuilder, + null_buffer_builder: NullBufferBuilder, + completed: Vec, + in_progress: Vec, + block_size: BlockSizeGrowthStrategy, + /// Some if deduplicating strings + /// map ` -> ` + string_tracker: Option<(HashTable, ahash::RandomState)>, + phantom: PhantomData, +} + +impl GenericByteViewBuilder { + /// Creates a new [`GenericByteViewBuilder`]. + pub fn new() -> Self { + Self::with_capacity(1024) + } + + /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values. + pub fn with_capacity(capacity: usize) -> Self { + Self { + views_builder: BufferBuilder::new(capacity), + null_buffer_builder: NullBufferBuilder::new(capacity), + completed: vec![], + in_progress: vec![], + block_size: BlockSizeGrowthStrategy::Exponential { + current_size: STARTING_BLOCK_SIZE, + }, + string_tracker: None, + phantom: Default::default(), + } + } + + /// Set a fixed buffer size for variable length strings + /// + /// The block size is the size of the buffer used to store values greater + /// than 12 bytes. The builder allocates new buffers when the current + /// buffer is full. + /// + /// By default the builder balances buffer size and buffer count by + /// growing buffer size exponentially from 8KB up to 2MB. The + /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB. + /// + /// If this method is used, any new buffers allocated are + /// exactly this size. 
This can be useful for advanced users + /// that want to control the memory usage and buffer count. + /// + /// See for more details on the implications. + pub fn with_fixed_block_size(self, block_size: u32) -> Self { + debug_assert!(block_size > 0, "Block size must be greater than 0"); + Self { + block_size: BlockSizeGrowthStrategy::Fixed { size: block_size }, + ..self + } + } + + /// Override the size of buffers to allocate for holding string data + /// Use `with_fixed_block_size` instead. + #[deprecated(since = "53.0.0", note = "Use `with_fixed_block_size` instead")] + pub fn with_block_size(self, block_size: u32) -> Self { + self.with_fixed_block_size(block_size) + } + + /// Deduplicate strings while building the array + /// + /// This will potentially decrease the memory usage if the array have repeated strings + /// It will also increase the time to build the array as it needs to hash the strings + pub fn with_deduplicate_strings(self) -> Self { + Self { + string_tracker: Some(( + HashTable::with_capacity(self.views_builder.capacity()), + Default::default(), + )), + ..self + } + } + + /// Append a new data block returning the new block offset + /// + /// Note: this will first flush any in-progress block + /// + /// This allows appending views from blocks added using [`Self::append_block`]. 
See + /// [`Self::append_value`] for appending individual values + /// + /// ``` + /// # use arrow_array::builder::StringViewBuilder; + /// let mut builder = StringViewBuilder::new(); + /// + /// let block = builder.append_block(b"helloworldbingobongo".into()); + /// + /// builder.try_append_view(block, 0, 5).unwrap(); + /// builder.try_append_view(block, 5, 5).unwrap(); + /// builder.try_append_view(block, 10, 5).unwrap(); + /// builder.try_append_view(block, 15, 5).unwrap(); + /// builder.try_append_view(block, 0, 15).unwrap(); + /// let array = builder.finish(); + /// + /// let actual: Vec<_> = array.iter().flatten().collect(); + /// let expected = &["hello", "world", "bingo", "bongo", "helloworldbingo"]; + /// assert_eq!(actual, expected); + /// ``` + pub fn append_block(&mut self, buffer: Buffer) -> u32 { + assert!(buffer.len() < u32::MAX as usize); + + self.flush_in_progress(); + let offset = self.completed.len(); + self.push_completed(buffer); + offset as u32 + } + + /// Append a view of the given `block`, `offset` and `length` + /// + /// # Safety + /// (1) The block must have been added using [`Self::append_block`] + /// (2) The range `offset..offset+length` must be within the bounds of the block + /// (3) The data in the block must be valid of type `T` + pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) { + let b = self.completed.get_unchecked(block as usize); + let start = offset as usize; + let end = start.saturating_add(len as usize); + let b = b.get_unchecked(start..end); + + let view = make_view(b, block, offset); + self.views_builder.append(view); + self.null_buffer_builder.append_non_null(); + } + + /// Try to append a view of the given `block`, `offset` and `length` + /// + /// See [`Self::append_block`] + pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) -> Result<(), ArrowError> { + let b = self.completed.get(block as usize).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!("No block 
found with index {block}")) + })?; + let start = offset as usize; + let end = start.saturating_add(len as usize); + + let b = b.get(start..end).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Range {start}..{end} out of bounds for block of length {}", + b.len() + )) + })?; + + if T::Native::from_bytes_checked(b).is_none() { + return Err(ArrowError::InvalidArgumentError( + "Invalid view data".to_string(), + )); + } + + unsafe { + self.append_view_unchecked(block, offset, len); + } + Ok(()) + } + + /// Flushes the in progress block if any + #[inline] + fn flush_in_progress(&mut self) { + if !self.in_progress.is_empty() { + let f = Buffer::from_vec(std::mem::take(&mut self.in_progress)); + self.push_completed(f) + } + } + + /// Append a block to `self.completed`, checking for overflow + #[inline] + fn push_completed(&mut self, block: Buffer) { + assert!(block.len() < u32::MAX as usize, "Block too large"); + assert!(self.completed.len() < u32::MAX as usize, "Too many blocks"); + self.completed.push(block); + } + + /// Returns the value at the given index + /// Useful if we want to know what value has been inserted to the builder + /// The index has to be smaller than `self.len()`, otherwise it will panic + pub fn get_value(&self, index: usize) -> &[u8] { + let view = self.views_builder.as_slice().get(index).unwrap(); + let len = *view as u32; + if len <= 12 { + // # Safety + // The view is valid from the builder + unsafe { GenericByteViewArray::::inline_value(view, len as usize) } + } else { + let view = ByteView::from(*view); + if view.buffer_index < self.completed.len() as u32 { + let block = &self.completed[view.buffer_index as usize]; + &block[view.offset as usize..view.offset as usize + view.length as usize] + } else { + &self.in_progress[view.offset as usize..view.offset as usize + view.length as usize] + } + } + } + + /// Appends a value into the builder + /// + /// # Panics + /// + /// Panics if + /// - String buffer count exceeds `u32::MAX` + 
/// - String length exceeds `u32::MAX` + #[inline] + pub fn append_value(&mut self, value: impl AsRef) { + let v: &[u8] = value.as_ref().as_ref(); + let length: u32 = v.len().try_into().unwrap(); + if length <= 12 { + let mut view_buffer = [0; 16]; + view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); + view_buffer[4..4 + v.len()].copy_from_slice(v); + self.views_builder.append(u128::from_le_bytes(view_buffer)); + self.null_buffer_builder.append_non_null(); + return; + } + + // Deduplication if: + // (1) deduplication is enabled. + // (2) len > 12 + if let Some((mut ht, hasher)) = self.string_tracker.take() { + let hash_val = hasher.hash_one(v); + let hasher_fn = |v: &_| hasher.hash_one(v); + + let entry = ht.entry( + hash_val, + |idx| { + let stored_value = self.get_value(*idx); + v == stored_value + }, + hasher_fn, + ); + match entry { + Entry::Occupied(occupied) => { + // If the string already exists, we will directly use the view + let idx = occupied.get(); + self.views_builder + .append(self.views_builder.as_slice()[*idx]); + self.null_buffer_builder.append_non_null(); + self.string_tracker = Some((ht, hasher)); + return; + } + Entry::Vacant(vacant) => { + // o.w. 
we insert the (string hash -> view index) + // the idx is current length of views_builder, as we are inserting a new view + vacant.insert(self.views_builder.len()); + } + } + self.string_tracker = Some((ht, hasher)); + } + + let required_cap = self.in_progress.len() + v.len(); + if self.in_progress.capacity() < required_cap { + self.flush_in_progress(); + let to_reserve = v.len().max(self.block_size.next_size() as usize); + self.in_progress.reserve(to_reserve); + }; + let offset = self.in_progress.len() as u32; + self.in_progress.extend_from_slice(v); + + let view = ByteView { + length, + prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), + buffer_index: self.completed.len() as u32, + offset, + }; + self.views_builder.append(view.into()); + self.null_buffer_builder.append_non_null(); + } + + /// Append an `Option` value into the builder + #[inline] + pub fn append_option(&mut self, value: Option>) { + match value { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + + /// Append a null value into the builder + #[inline] + pub fn append_null(&mut self) { + self.null_buffer_builder.append_null(); + self.views_builder.append(0); + } + + /// Builds the [`GenericByteViewArray`] and reset this builder + pub fn finish(&mut self) -> GenericByteViewArray { + self.flush_in_progress(); + let completed = std::mem::take(&mut self.completed); + let len = self.views_builder.len(); + let views = ScalarBuffer::new(self.views_builder.finish(), 0, len); + let nulls = self.null_buffer_builder.finish(); + if let Some((ref mut ht, _)) = self.string_tracker.as_mut() { + ht.clear(); + } + // SAFETY: valid by construction + unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } + } + + /// Builds the [`GenericByteViewArray`] without resetting the builder + pub fn finish_cloned(&self) -> GenericByteViewArray { + let mut completed = self.completed.clone(); + if !self.in_progress.is_empty() { + 
completed.push(Buffer::from_slice_ref(&self.in_progress)); + } + let len = self.views_builder.len(); + let views = Buffer::from_slice_ref(self.views_builder.as_slice()); + let views = ScalarBuffer::new(views, 0, len); + let nulls = self.null_buffer_builder.finish_cloned(); + // SAFETY: valid by construction + unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } + } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } + + /// Return the allocated size of this builder in bytes, useful for memory accounting. + pub fn allocated_size(&self) -> usize { + let views = self.views_builder.capacity() * std::mem::size_of::(); + let null = self.null_buffer_builder.allocated_size(); + let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::(); + let in_progress = self.in_progress.capacity(); + let tracker = match &self.string_tracker { + Some((ht, _)) => ht.capacity() * std::mem::size_of::(), + None => 0, + }; + buffer_size + in_progress + tracker + views + null + } +} + +impl Default for GenericByteViewBuilder { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Debug for GenericByteViewBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}ViewBuilder", T::PREFIX)?; + f.debug_struct("") + .field("views_builder", &self.views_builder) + .field("in_progress", &self.in_progress) + .field("completed", &self.completed) + .field("null_buffer_builder", &self.null_buffer_builder) + .finish() + } +} + +impl ArrayBuilder for GenericByteViewBuilder { + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn into_box_any(self: Box) -> Box { + self + } +} + 
+impl> Extend> + for GenericByteViewBuilder +{ + #[inline] + fn extend>>(&mut self, iter: I) { + for v in iter { + self.append_option(v) + } + } +} + +/// Array builder for [`StringViewArray`][crate::StringViewArray] +/// +/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with +/// [`GenericByteViewBuilder::append_null`] as normal. +/// +/// # Example +/// ``` +/// # use arrow_array::builder::StringViewBuilder; +/// # use arrow_array::StringViewArray; +/// let mut builder = StringViewBuilder::new(); +/// builder.append_value("hello"); +/// builder.append_null(); +/// builder.append_value("world"); +/// let array = builder.finish(); +/// +/// let expected = vec![Some("hello"), None, Some("world")]; +/// let actual: Vec<_> = array.iter().collect(); +/// assert_eq!(expected, actual); +/// ``` +pub type StringViewBuilder = GenericByteViewBuilder; + +/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] +/// +/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with +/// [`GenericByteViewBuilder::append_null`] as normal. 
+/// +/// # Example +/// ``` +/// # use arrow_array::builder::BinaryViewBuilder; +/// use arrow_array::BinaryViewArray; +/// let mut builder = BinaryViewBuilder::new(); +/// builder.append_value("hello"); +/// builder.append_null(); +/// builder.append_value("world"); +/// let array = builder.finish(); +/// +/// let expected: Vec> = vec![Some(b"hello"), None, Some(b"world")]; +/// let actual: Vec<_> = array.iter().collect(); +/// assert_eq!(expected, actual); +/// ``` +/// +pub type BinaryViewBuilder = GenericByteViewBuilder; + +/// Creates a view from a fixed length input (the compiler can generate +/// specialized code for this) +fn make_inlined_view(data: &[u8]) -> u128 { + let mut view_buffer = [0; 16]; + view_buffer[0..4].copy_from_slice(&(LEN as u32).to_le_bytes()); + view_buffer[4..4 + LEN].copy_from_slice(&data[..LEN]); + u128::from_le_bytes(view_buffer) +} + +/// Create a view based on the given data, block id and offset. +/// +/// Note that the code below is carefully examined with x86_64 assembly code: +/// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined), +/// which slows down things. +#[inline(never)] +pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 { + let len = data.len(); + + // Generate specialized code for each potential small string length + // to improve performance + match len { + 0 => make_inlined_view::<0>(data), + 1 => make_inlined_view::<1>(data), + 2 => make_inlined_view::<2>(data), + 3 => make_inlined_view::<3>(data), + 4 => make_inlined_view::<4>(data), + 5 => make_inlined_view::<5>(data), + 6 => make_inlined_view::<6>(data), + 7 => make_inlined_view::<7>(data), + 8 => make_inlined_view::<8>(data), + 9 => make_inlined_view::<9>(data), + 10 => make_inlined_view::<10>(data), + 11 => make_inlined_view::<11>(data), + 12 => make_inlined_view::<12>(data), + // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead. 
+ _ => { + let view = ByteView { + length: len as u32, + prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()), + buffer_index: block_id, + offset, + }; + view.as_u128() + } + } +} + +#[cfg(test)] +mod tests { + use core::str; + + use super::*; + use crate::Array; + + #[test] + fn test_string_view_deduplicate() { + let value_1 = "long string to test string view"; + let value_2 = "not so similar string but long"; + + let mut builder = StringViewBuilder::new() + .with_deduplicate_strings() + .with_fixed_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers + + let values = vec![ + Some(value_1), + Some(value_2), + Some("short"), + Some(value_1), + None, + Some(value_2), + Some(value_1), + ]; + builder.extend(values.clone()); + + let array = builder.finish_cloned(); + array.to_data().validate_full().unwrap(); + assert_eq!(array.data_buffers().len(), 1); // without duplication we would need 3 buffers. + let actual: Vec<_> = array.iter().collect(); + assert_eq!(actual, values); + + let view0 = array.views().first().unwrap(); + let view3 = array.views().get(3).unwrap(); + let view6 = array.views().get(6).unwrap(); + + assert_eq!(view0, view3); + assert_eq!(view0, view6); + + assert_eq!(array.views().get(1), array.views().get(5)); + } + + #[test] + fn test_string_view_deduplicate_after_finish() { + let mut builder = StringViewBuilder::new().with_deduplicate_strings(); + + let value_1 = "long string to test string view"; + let value_2 = "not so similar string but long"; + builder.append_value(value_1); + let _array = builder.finish(); + builder.append_value(value_2); + let _array = builder.finish(); + builder.append_value(value_1); + let _array = builder.finish(); + } + + #[test] + fn test_string_view() { + let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81"); + let b2 = Buffer::from(b"cupcakes"); + let b3 = Buffer::from(b"Many strings are here contained of great length and verbosity"); + + let mut v = StringViewBuilder::new(); + 
assert_eq!(v.append_block(b1), 0); + + v.append_value("This is a very long string that exceeds the inline length"); + v.append_value("This is another very long string that exceeds the inline length"); + + assert_eq!(v.append_block(b2), 2); + assert_eq!(v.append_block(b3), 3); + + // Test short strings + v.try_append_view(0, 0, 5).unwrap(); // world + v.try_append_view(0, 6, 7).unwrap(); // bananas + v.try_append_view(2, 3, 5).unwrap(); // cake + v.try_append_view(2, 0, 3).unwrap(); // cup + v.try_append_view(2, 0, 8).unwrap(); // cupcakes + v.try_append_view(0, 13, 4).unwrap(); // 😁 + v.try_append_view(0, 13, 0).unwrap(); // + + // Test longer strings + v.try_append_view(3, 0, 16).unwrap(); // Many strings are + v.try_append_view(1, 0, 19).unwrap(); // This is a very long + v.try_append_view(3, 13, 27).unwrap(); // here contained of great length + + v.append_value("I do so like long strings"); + + let array = v.finish_cloned(); + array.to_data().validate_full().unwrap(); + assert_eq!(array.data_buffers().len(), 5); + let actual: Vec<_> = array.iter().flatten().collect(); + assert_eq!( + actual, + &[ + "This is a very long string that exceeds the inline length", + "This is another very long string that exceeds the inline length", + "world", + "bananas", + "cakes", + "cup", + "cupcakes", + "😁", + "", + "Many strings are", + "This is a very long", + "are here contained of great", + "I do so like long strings" + ] + ); + + let err = v.try_append_view(0, u32::MAX, 1).unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Range 4294967295..4294967296 out of bounds for block of length 17"); + + let err = v.try_append_view(0, 1, u32::MAX).unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: Range 1..4294967296 out of bounds for block of length 17" + ); + + let err = v.try_append_view(0, 13, 2).unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Invalid view data"); + + let err = v.try_append_view(0, 40, 0).unwrap_err(); + 
assert_eq!( + err.to_string(), + "Invalid argument error: Range 40..40 out of bounds for block of length 17" + ); + + let err = v.try_append_view(5, 0, 0).unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: No block found with index 5" + ); + } + + #[test] + fn test_string_view_with_block_size_growth() { + let mut exp_builder = StringViewBuilder::new(); + let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE); + + let long_string = str::from_utf8(&[b'a'; STARTING_BLOCK_SIZE as usize]).unwrap(); + + for i in 0..9 { + // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M + for _ in 0..(2_u32.pow(i)) { + exp_builder.append_value(long_string); + fixed_builder.append_value(long_string); + } + exp_builder.flush_in_progress(); + fixed_builder.flush_in_progress(); + + // Every step only add one buffer, but the buffer size is much larger + assert_eq!(exp_builder.completed.len(), i as usize + 1); + assert_eq!( + exp_builder.completed[i as usize].len(), + STARTING_BLOCK_SIZE as usize * 2_usize.pow(i) + ); + + // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1 + assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1); + + // Every buffer is fixed size + assert!(fixed_builder + .completed + .iter() + .all(|b| b.len() == STARTING_BLOCK_SIZE as usize)); + } + + // Add one more value, and the buffer stop growing. + exp_builder.append_value(long_string); + exp_builder.flush_in_progress(); + assert_eq!( + exp_builder.completed.last().unwrap().capacity(), + MAX_BLOCK_SIZE as usize + ); + } +} diff --git a/arrow-array/src/builder2/generic_list_builder.rs b/arrow-array/src/builder2/generic_list_builder.rs new file mode 100644 index 000000000000..d110f61e2623 --- /dev/null +++ b/arrow-array/src/builder2/generic_list_builder.rs @@ -0,0 +1,877 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::builder2::{ArrayBuilder, BufferBuilder}; +use crate::{Array, ArrayRef, GenericListArray, OffsetSizeTrait}; +use arrow_buffer::NullBufferBuilder; +use arrow_buffer::{Buffer, OffsetBuffer}; +use arrow_schema::{Field, FieldRef}; +use std::any::Any; +use std::sync::Arc; + +/// Builder for [`GenericListArray`] +/// +/// Use [`ListBuilder`] to build [`ListArray`]s and [`LargeListBuilder`] to build [`LargeListArray`]s. 
+/// +/// # Example +/// +/// Here is code that constructs a ListArray with the contents: +/// `[[A,B,C], [], NULL, [D], [NULL, F]]` +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{builder::ListBuilder, builder::StringBuilder, ArrayRef, StringArray, Array}; +/// # +/// let values_builder = StringBuilder::new(); +/// let mut builder = ListBuilder::new(values_builder); +/// +/// // [A, B, C] +/// builder.values().append_value("A"); +/// builder.values().append_value("B"); +/// builder.values().append_value("C"); +/// builder.append(true); +/// +/// // [ ] (empty list) +/// builder.append(true); +/// +/// // Null +/// builder.append(false); +/// +/// // [D] +/// builder.values().append_value("D"); +/// builder.append(true); +/// +/// // [NULL, F] +/// builder.values().append_null(); +/// builder.values().append_value("F"); +/// builder.append(true); +/// +/// // Build the array +/// let array = builder.finish(); +/// +/// // Values is a string array +/// // "A", "B" "C", "?", "D", NULL, "F" +/// assert_eq!( +/// array.values().as_ref(), +/// &StringArray::from(vec![ +/// Some("A"), Some("B"), Some("C"), +/// Some("D"), None, Some("F") +/// ]) +/// ); +/// +/// // Offsets are indexes into the values array +/// assert_eq!( +/// array.value_offsets(), +/// &[0, 3, 3, 3, 4, 6] +/// ); +/// ``` +/// +/// [`ListBuilder`]: crate::builder::ListBuilder +/// [`ListArray`]: crate::array::ListArray +/// [`LargeListBuilder`]: crate::builder::LargeListBuilder +/// [`LargeListArray`]: crate::array::LargeListArray +#[derive(Debug)] +pub struct GenericListBuilder { + offsets_builder: BufferBuilder, + null_buffer_builder: NullBufferBuilder, + values_builder: T, + field: Option, +} + +impl Default for GenericListBuilder { + fn default() -> Self { + Self::new(T::default()) + } +} + +impl GenericListBuilder { + /// Creates a new [`GenericListBuilder`] from a given values array builder + pub fn new(values_builder: T) -> Self { + let capacity = values_builder.len(); + 
Self::with_capacity(values_builder, capacity) + } + + /// Creates a new [`GenericListBuilder`] from a given values array builder + /// `capacity` is the number of items to pre-allocate space for in this builder + pub fn with_capacity(values_builder: T, capacity: usize) -> Self { + let mut offsets_builder = BufferBuilder::::new(capacity + 1); + offsets_builder.append(OffsetSize::zero()); + Self { + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, + field: None, + } + } + + /// Override the field passed to [`GenericListArray::new`] + /// + /// By default a nullable field is created with the name `item` + /// + /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the + /// field's data type does not match that of `T` + pub fn with_field(self, field: impl Into) -> Self { + Self { + field: Some(field.into()), + ..self + } + } +} + +impl ArrayBuilder + for GenericListBuilder +where + T: 'static, +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl GenericListBuilder +where + T: 'static, +{ + /// Returns the child array builder as a mutable reference. + /// + /// This mutable reference can be used to append values into the child array builder, + /// but you must call [`append`](#method.append) to delimit each distinct list value. 
+ pub fn values(&mut self) -> &mut T { + &mut self.values_builder + } + + /// Returns the child array builder as an immutable reference + pub fn values_ref(&self) -> &T { + &self.values_builder + } + + /// Finish the current variable-length list array slot + /// + /// # Panics + /// + /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` + #[inline] + pub fn append(&mut self, is_valid: bool) { + self.offsets_builder.append(self.next_offset()); + self.null_buffer_builder.append(is_valid); + } + + /// Returns the next offset + /// + /// # Panics + /// + /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` + #[inline] + fn next_offset(&self) -> OffsetSize { + OffsetSize::from_usize(self.values_builder.len()).unwrap() + } + + /// Append a value to this [`GenericListBuilder`] + /// + /// ``` + /// # use arrow_array::builder::{Int32Builder, ListBuilder}; + /// # use arrow_array::cast::AsArray; + /// # use arrow_array::{Array, Int32Array}; + /// # use arrow_array::types::Int32Type; + /// let mut builder = ListBuilder::new(Int32Builder::new()); + /// + /// builder.append_value([Some(1), Some(2), Some(3)]); + /// builder.append_value([]); + /// builder.append_value([None]); + /// + /// let array = builder.finish(); + /// assert_eq!(array.len(), 3); + /// + /// assert_eq!(array.value_offsets(), &[0, 3, 3, 4]); + /// let values = array.values().as_primitive::(); + /// assert_eq!(values, &Int32Array::from(vec![Some(1), Some(2), Some(3), None])); + /// ``` + /// + /// This is an alternative API to appending directly to [`Self::values`] and + /// delimiting the result with [`Self::append`] + /// + /// ``` + /// # use arrow_array::builder::{Int32Builder, ListBuilder}; + /// # use arrow_array::cast::AsArray; + /// # use arrow_array::{Array, Int32Array}; + /// # use arrow_array::types::Int32Type; + /// let mut builder = ListBuilder::new(Int32Builder::new()); + /// + /// builder.values().append_value(1); + /// builder.values().append_value(2); + 
/// builder.values().append_value(3); + /// builder.append(true); + /// builder.append(true); + /// builder.values().append_null(); + /// builder.append(true); + /// + /// let array = builder.finish(); + /// assert_eq!(array.len(), 3); + /// + /// assert_eq!(array.value_offsets(), &[0, 3, 3, 4]); + /// let values = array.values().as_primitive::(); + /// assert_eq!(values, &Int32Array::from(vec![Some(1), Some(2), Some(3), None])); + /// ``` + #[inline] + pub fn append_value(&mut self, i: I) + where + T: Extend>, + I: IntoIterator>, + { + self.extend(std::iter::once(Some(i))) + } + + pub fn append_list(&mut self, list: GenericListArray) { + self.values_builder.extend(list.values().iter()); + self.append(true); + } + + /// Append a null to this [`GenericListBuilder`] + /// + /// See [`Self::append_value`] for an example use. + #[inline] + pub fn append_null(&mut self) { + self.offsets_builder.append(self.next_offset()); + self.null_buffer_builder.append_null(); + } + + /// Appends an optional value into this [`GenericListBuilder`] + /// + /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`] + #[inline] + pub fn append_option(&mut self, i: Option) + where + T: Extend>, + I: IntoIterator>, + { + match i { + Some(i) => self.append_value(i), + None => self.append_null(), + } + } + + /// Builds the [`GenericListArray`] and reset this builder. 
+ pub fn finish(&mut self) -> GenericListArray { + let values = self.values_builder.finish(); + let nulls = self.null_buffer_builder.finish(); + + let offsets = self.offsets_builder.finish(); + // Safety: Safe by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + self.offsets_builder.append(OffsetSize::zero()); + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new_list_field(values.data_type().clone(), true)), + }; + + GenericListArray::new(field, offsets, values, nulls) + } + + /// Builds the [`GenericListArray`] without resetting the builder. + pub fn finish_cloned(&self) -> GenericListArray { + let values = self.values_builder.finish_cloned(); + let nulls = self.null_buffer_builder.finish_cloned(); + + let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + // Safety: safe by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new_list_field(values.data_type().clone(), true)), + }; + + GenericListArray::new(field, offsets, values, nulls) + } + + /// Returns the current offsets buffer as a slice + pub fn offsets_slice(&self) -> &[OffsetSize] { + self.offsets_builder.as_slice() + } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } +} + +impl Extend> for GenericListBuilder +where + O: OffsetSizeTrait, + B: ArrayBuilder + Extend, + V: IntoIterator, +{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + match v { + Some(elements) => { + self.values_builder.extend(elements); + self.append(true); + } + None => self.append(false), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::builder::{make_builder, Int32Builder, Int8Builder, ListBuilder}; + use crate::cast::AsArray; + use crate::types::Int32Type; + use crate::{Int32Array, 
ListArray}; + use arrow_schema::DataType; + + fn _test_generic_list_array_builder() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListBuilder::::new(values_builder); + + // [[0, 1, 2], [3, 4, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true); + builder.values().append_value(3); + builder.values().append_value(4); + builder.values().append_value(5); + builder.append(true); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true); + let list_array = builder.finish(); + + let list_values = list_array.values().as_primitive::(); + assert_eq!(list_values.values(), &[0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(list_array.value_offsets(), [0, 3, 6, 8].map(O::usize_as)); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(O::from_usize(6).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(2).unwrap(), list_array.value_length(2)); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_list_array_builder() { + _test_generic_list_array_builder::() + } + + #[test] + fn test_large_list_array_builder() { + _test_generic_list_array_builder::() + } + + fn _test_generic_list_array_builder_nulls() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListBuilder::::new(values_builder); + + // [[0, 1, 2], null, [3, null, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true); + builder.append(false); + builder.values().append_value(3); + builder.values().append_null(); + builder.values().append_value(5); + builder.append(true); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true); + + let 
list_array = builder.finish(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(4, list_array.len()); + assert_eq!(1, list_array.null_count()); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_length(2)); + } + + #[test] + fn test_list_array_builder_nulls() { + _test_generic_list_array_builder_nulls::() + } + + #[test] + fn test_large_list_array_builder_nulls() { + _test_generic_list_array_builder_nulls::() + } + + #[test] + fn test_list_array_builder_finish() { + let values_builder = Int32Array::builder(5); + let mut builder = ListBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert!(builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_array_builder_finish_cloned() { + let values_builder = Int32Array::builder(5); + let mut builder = ListBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + + let mut arr = builder.finish_cloned(); + assert_eq!(2, arr.len()); + assert!(!builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true); + arr = builder.finish(); + assert_eq!(3, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_list_array_builder() { + let primitive_builder = Int32Builder::with_capacity(10); + let values_builder = ListBuilder::new(primitive_builder); + let mut builder = ListBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder.values().values().append_value(1); + 
builder.values().values().append_value(2); + builder.values().append(true); + builder.values().values().append_value(3); + builder.values().values().append_value(4); + builder.values().append(true); + builder.append(true); + + builder.values().values().append_value(5); + builder.values().values().append_value(6); + builder.values().values().append_value(7); + builder.values().append(true); + builder.values().append(false); + builder.values().values().append_value(8); + builder.values().append(true); + builder.append(true); + + builder.append(false); + + builder.values().values().append_value(9); + builder.values().values().append_value(10); + builder.values().append(true); + builder.append(true); + + let l1 = builder.finish(); + + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6]); + let l2 = l1.values().as_list::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10]); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_extend() { + let mut builder = ListBuilder::new(Int32Builder::new()); + builder.extend([ + Some(vec![Some(1), Some(2), Some(7), None]), + Some(vec![]), + Some(vec![Some(4), Some(5)]), + None, + ]); + + let array = builder.finish(); + assert_eq!(array.value_offsets(), [0, 4, 4, 6, 6]); + assert_eq!(array.null_count(), 1); + assert_eq!(array.logical_null_count(), 1); + assert!(array.is_null(3)); + let elements = array.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]); + assert_eq!(elements.null_count(), 1); + assert_eq!(elements.logical_null_count(), 1); + assert!(elements.is_null(3)); + } + + #[test] + fn test_boxed_primitive_array_builder() { + let values_builder = make_builder(&DataType::Int32, 5); + let mut builder = ListBuilder::new(values_builder); + 
+ builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[1, 2, 3]); + builder.append(true); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[4, 5, 6]); + builder.append(true); + + let arr = builder.finish(); + assert_eq!(2, arr.len()); + + let elements = arr.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 3, 4, 5, 6]); + } + + #[test] + fn test_boxed_list_list_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_generic_list_array_builder::(values_builder); + } + + #[test] + fn test_boxed_large_list_large_list_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_generic_list_array_builder::(values_builder); + } + + fn test_boxed_generic_list_generic_list_array_builder( + values_builder: Box, + ) { + let mut builder: GenericListBuilder> = + GenericListBuilder::>::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(1); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") + .append(true); + builder + .values() + .as_any_mut() + 
.downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .values() + .as_any_mut() + .downcast_mut::<Int32Builder>() + .expect("should be an Int32Builder") + .append_value(3); + builder + .values() + .as_any_mut() + .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .values() + .as_any_mut() + .downcast_mut::<Int32Builder>() + .expect("should be an Int32Builder") + .append_value(4); + builder + .values() + .as_any_mut() + .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .append(true); + builder.append(true); + + builder + .values() + .as_any_mut() + .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .values() + .as_any_mut() + .downcast_mut::<Int32Builder>() + .expect("should be an Int32Builder") + .append_value(5); + builder + .values() + .as_any_mut() + .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .values() + .as_any_mut() + .downcast_mut::<Int32Builder>() + .expect("should be an Int32Builder") + .append_value(6); + builder + .values() + .as_any_mut() + .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .values() + .as_any_mut() + .downcast_mut::<Int32Builder>() + .expect("should be an Int32Builder") + .append_value(7); + builder + .values() + .as_any_mut() + .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .append(true); + builder + .values() + .as_any_mut() + .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .append(false); + builder + .values() + .as_any_mut() + .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .values() + .as_any_mut() + .downcast_mut::<Int32Builder>() + .expect("should be an Int32Builder") + .append_value(8); + builder + .values() + .as_any_mut() + .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .append(true); + builder.append(true); + + builder.append(false); + + builder + .values() + .as_any_mut() + .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>() + .expect("should be an (Large)ListBuilder") + .values() + .as_any_mut() + .downcast_mut::<Int32Builder>() + .expect("should be an Int32Builder") +
.append_value(9); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(10); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") + .append(true); + builder.append(true); + + let l1 = builder.finish(); + + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6].map(O::usize_as)); + let l2 = l1.values().as_list::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10].map(O::usize_as)); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_with_field() { + let field = Arc::new(Field::new("bar", DataType::Int32, false)); + let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), Some(2), Some(3)]); + builder.append_null(); // This is fine as nullability refers to nullability of values + builder.append_value([Some(4)]); + let array = builder.finish(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::List(field.clone())); + + builder.append_value([Some(4), Some(5)]); + let array = builder.finish(); + assert_eq!(array.data_type(), &DataType::List(field)); + assert_eq!(array.len(), 1); + } + + #[test] + #[should_panic(expected = "Non-nullable field of ListArray \\\"item\\\" cannot contain nulls")] + fn test_checks_nullability() { + let field = Arc::new(Field::new_list_field(DataType::Int32, false)); + let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), None]); + builder.finish(); + } + + #[test] + #[should_panic(expected = "ListArray expected data type Int64 got 
Int32")] + fn test_checks_data_type() { + let field = Arc::new(Field::new_list_field(DataType::Int64, false)); + let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1)]); + builder.finish(); + } + + #[test] + fn should_run() { + let from: Arc = create_test_list(); + let mut to = ListBuilder::new( + ListBuilder::new( + Int32Builder::new() + ) + ); + let indices: &[usize] = &[0, 1, 2]; + let data_type = DataType::List( + Arc::new(Field::new( + "item", + DataType::List( + Arc::new(Field::new( + "item", + DataType::Int32, + false + )) + ), + false + )) + ); + + for &i in indices { + if from.is_valid(i) { + let inner_list = from.value(i).as_any().downcast_ref::>().unwrap(); + // to.append_value(inner_list); + } else { + to.append_null(); + } + } + } + + fn create_test_list() -> Arc { + let primitive_builder = Int32Builder::with_capacity(10); + let values_builder = ListBuilder::new(primitive_builder); + let mut builder = ListBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true); + builder.values().values().append_value(3); + builder.values().values().append_value(4); + builder.values().append(true); + builder.append(true); + + builder.values().values().append_value(5); + builder.values().values().append_value(6); + builder.values().values().append_value(7); + builder.values().append(true); + builder.values().append(false); + builder.values().values().append_value(8); + builder.values().append(true); + builder.append(true); + + builder.append(false); + + builder.values().values().append_value(9); + builder.values().values().append_value(10); + builder.values().append(true); + builder.append(true); + + Arc::new(builder.finish()) + } +} diff --git a/arrow-array/src/builder2/map_builder.rs b/arrow-array/src/builder2/map_builder.rs new file mode 100644 
index 000000000000..1e88a34cc4dc --- /dev/null +++ b/arrow-array/src/builder2/map_builder.rs @@ -0,0 +1,380 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::builder2::{ArrayBuilder, BufferBuilder}; +use crate::{Array, ArrayRef, MapArray, StructArray}; +use arrow_buffer::Buffer; +use arrow_buffer::{NullBuffer, NullBufferBuilder}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, Field, FieldRef}; +use std::any::Any; +use std::sync::Arc; + +/// Builder for [`MapArray`] +/// +/// ``` +/// # use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; +/// # use arrow_array::{Int32Array, StringArray}; +/// +/// let string_builder = StringBuilder::new(); +/// let int_builder = Int32Builder::with_capacity(4); +/// +/// // Construct `[{"joe": 1}, {"blogs": 2, "foo": 4}, {}, null]` +/// let mut builder = MapBuilder::new(None, string_builder, int_builder); +/// +/// builder.keys().append_value("joe"); +/// builder.values().append_value(1); +/// builder.append(true).unwrap(); +/// +/// builder.keys().append_value("blogs"); +/// builder.values().append_value(2); +/// builder.keys().append_value("foo"); +/// builder.values().append_value(4); +/// builder.append(true).unwrap(); +/// 
builder.append(true).unwrap(); +/// builder.append(false).unwrap(); +/// +/// let array = builder.finish(); +/// assert_eq!(array.value_offsets(), &[0, 1, 3, 3, 3]); +/// assert_eq!(array.values().as_ref(), &Int32Array::from(vec![1, 2, 4])); +/// assert_eq!(array.keys().as_ref(), &StringArray::from(vec!["joe", "blogs", "foo"])); +/// +/// ``` +#[derive(Debug)] +pub struct MapBuilder<K: ArrayBuilder, V: ArrayBuilder> { + offsets_builder: BufferBuilder<i32>, + null_buffer_builder: NullBufferBuilder, + field_names: MapFieldNames, + key_builder: K, + value_builder: V, + value_field: Option<FieldRef>, +} + +/// The [`Field`] names for a [`MapArray`] +#[derive(Debug, Clone)] +pub struct MapFieldNames { + /// [`Field`] name for map entries + pub entry: String, + /// [`Field`] name for map key + pub key: String, + /// [`Field`] name for map value + pub value: String, +} + +impl Default for MapFieldNames { + fn default() -> Self { + Self { + entry: "entries".to_string(), + key: "keys".to_string(), + value: "values".to_string(), + } + } +} + +impl<K: ArrayBuilder, V: ArrayBuilder> MapBuilder<K, V> { + /// Creates a new `MapBuilder` + pub fn new(field_names: Option<MapFieldNames>, key_builder: K, value_builder: V) -> Self { + let capacity = key_builder.len(); + Self::with_capacity(field_names, key_builder, value_builder, capacity) + } + + /// Creates a new `MapBuilder` with capacity + pub fn with_capacity( + field_names: Option<MapFieldNames>, + key_builder: K, + value_builder: V, + capacity: usize, + ) -> Self { + let mut offsets_builder = BufferBuilder::<i32>::new(capacity + 1); + offsets_builder.append(0); + Self { + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + field_names: field_names.unwrap_or_default(), + key_builder, + value_builder, + value_field: None, + } + } + + /// Override the values [`Field`] used to build the [`MapArray`] + /// + /// By default a nullable field is created with the name `values` + /// + /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the + /// field's data type does not match that of `V` + pub fn with_values_field(self, field:
impl Into<FieldRef>) -> Self { + Self { + value_field: Some(field.into()), + ..self + } + } + + /// Returns the key array builder of the map + pub fn keys(&mut self) -> &mut K { + &mut self.key_builder + } + + /// Returns the value array builder of the map + pub fn values(&mut self) -> &mut V { + &mut self.value_builder + } + + /// Returns both the key and value array builders of the map + pub fn entries(&mut self) -> (&mut K, &mut V) { + (&mut self.key_builder, &mut self.value_builder) + } + + /// Finish the current map array slot + /// + /// Returns an error if the key and values builders are in an inconsistent state. + #[inline] + pub fn append(&mut self, is_valid: bool) -> Result<(), ArrowError> { + if self.key_builder.len() != self.value_builder.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot append to a map builder when its keys and values have unequal lengths of {} and {}", + self.key_builder.len(), + self.value_builder.len() + ))); + } + self.offsets_builder.append(self.key_builder.len() as i32); + self.null_buffer_builder.append(is_valid); + Ok(()) + } + + /// Builds the [`MapArray`] and resets this builder. + pub fn finish(&mut self) -> MapArray { + let len = self.len(); + // Build the keys + let keys_arr = self.key_builder.finish(); + let values_arr = self.value_builder.finish(); + let offset_buffer = self.offsets_builder.finish(); + self.offsets_builder.append(0); + let null_bit_buffer = self.null_buffer_builder.finish(); + + self.finish_helper(keys_arr, values_arr, offset_buffer, null_bit_buffer, len) + } + + /// Builds the [`MapArray`] without resetting the builder.
+ pub fn finish_cloned(&self) -> MapArray { + let len = self.len(); + // Build the keys + let keys_arr = self.key_builder.finish_cloned(); + let values_arr = self.value_builder.finish_cloned(); + let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + let nulls = self.null_buffer_builder.finish_cloned(); + self.finish_helper(keys_arr, values_arr, offset_buffer, nulls, len) + } + + fn finish_helper( + &self, + keys_arr: Arc<dyn Array>, + values_arr: Arc<dyn Array>, + offset_buffer: Buffer, + nulls: Option<NullBuffer>, + len: usize, + ) -> MapArray { + assert!( + keys_arr.null_count() == 0, + "Keys array must have no null values, found {} null value(s)", + keys_arr.null_count() + ); + + let keys_field = Arc::new(Field::new( + self.field_names.key.as_str(), + keys_arr.data_type().clone(), + false, // always non-nullable + )); + let values_field = match &self.value_field { + Some(f) => f.clone(), + None => Arc::new(Field::new( + self.field_names.value.as_str(), + values_arr.data_type().clone(), + true, + )), + }; + + let struct_array = + StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]); + + let map_field = Arc::new(Field::new( + self.field_names.entry.as_str(), + struct_array.data_type().clone(), + false, // always non-nullable + )); + let array_data = ArrayData::builder(DataType::Map(map_field, false)) // TODO: support sorted keys + .len(len) + .add_buffer(offset_buffer) + .add_child_data(struct_array.into_data()) + .nulls(nulls); + // NOTE(review): unchecked build trusts offsets from `Self::append`, whose `as i32` cast can wrap + let array_data = unsafe { array_data.build_unchecked() }; + + MapArray::from(array_data) + } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } +} + +impl<K: ArrayBuilder, V: ArrayBuilder> ArrayBuilder for MapBuilder<K, V> { + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder.
+ fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn into_box_any(self: Box<Self>) -> Box<dyn Any> { + self + } +} + +#[cfg(test)] +mod tests { + use crate::builder2::{make_builder, Int32Builder, StringBuilder}; + use crate::{Int32Array, StringArray}; + + use super::*; + + #[test] + #[should_panic(expected = "Keys array must have no null values, found 1 null value(s)")] + fn test_map_builder_with_null_keys_panics() { + let mut builder = MapBuilder::new(None, StringBuilder::new(), Int32Builder::new()); + builder.keys().append_null(); + builder.values().append_value(42); + builder.append(true).unwrap(); + + builder.finish(); + } + + #[test] + fn test_boxed_map_builder() { + let keys_builder = make_builder(&DataType::Utf8, 5); + let values_builder = make_builder(&DataType::Int32, 5); + + let mut builder = MapBuilder::new(None, keys_builder, values_builder); + builder + .keys() + .as_any_mut() + .downcast_mut::<StringBuilder>() + .expect("should be an StringBuilder") + .append_value("1"); + builder + .values() + .as_any_mut() + .downcast_mut::<Int32Builder>() + .expect("should be an Int32Builder") + .append_value(42); + builder.append(true).unwrap(); + + let map_array = builder.finish(); + + assert_eq!( + map_array + .keys() + .as_any() + .downcast_ref::<StringArray>() + .expect("should be an StringArray") + .value(0), + "1" + ); + assert_eq!( + map_array + .values() + .as_any() + .downcast_ref::<Int32Array>() + .expect("should be an Int32Array") + .value(0), + 42 + ); + } + + #[test] + fn test_with_values_field() { + let value_field = Arc::new(Field::new("bars", DataType::Int32, false)); + let mut builder = MapBuilder::new(None, Int32Builder::new(), Int32Builder::new()) + .with_values_field(value_field.clone()); + builder.keys().append_value(1); + builder.values().append_value(2); + builder.append(true).unwrap(); + builder.append(false).unwrap(); // This is fine as nullability refers to nullability of
values + builder.keys().append_value(3); + builder.values().append_value(4); + builder.append(true).unwrap(); + let map = builder.finish(); + + assert_eq!(map.len(), 3); + assert_eq!( + map.data_type(), + &DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct( + vec![ + Arc::new(Field::new("keys", DataType::Int32, false)), + value_field.clone() + ] + .into() + ), + false, + )), + false + ) + ); + + builder.keys().append_value(5); + builder.values().append_value(6); + builder.append(true).unwrap(); + let map = builder.finish(); + + assert_eq!(map.len(), 1); + assert_eq!( + map.data_type(), + &DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct( + vec![ + Arc::new(Field::new("keys", DataType::Int32, false)), + value_field + ] + .into() + ), + false, + )), + false + ) + ); + } +} diff --git a/arrow-array/src/builder2/mod.rs b/arrow-array/src/builder2/mod.rs new file mode 100644 index 000000000000..89a96280eb87 --- /dev/null +++ b/arrow-array/src/builder2/mod.rs @@ -0,0 +1,325 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines push-based APIs for constructing arrays +//! +//! # Basic Usage +//! +//! Builders can be used to build simple, non-nested arrays +//! +//! ``` +//! 
# use arrow_array::builder::Int32Builder; +//! # use arrow_array::PrimitiveArray; +//! let mut a = Int32Builder::new(); +//! a.append_value(1); +//! a.append_null(); +//! a.append_value(2); +//! let a = a.finish(); +//! +//! assert_eq!(a, PrimitiveArray::from(vec![Some(1), None, Some(2)])); +//! ``` +//! +//! ``` +//! # use arrow_array::builder::StringBuilder; +//! # use arrow_array::{Array, StringArray}; +//! let mut a = StringBuilder::new(); +//! a.append_value("foo"); +//! a.append_value("bar"); +//! a.append_null(); +//! let a = a.finish(); +//! +//! assert_eq!(a, StringArray::from_iter([Some("foo"), Some("bar"), None])); +//! ``` +//! +//! # Nested Usage +//! +//! Builders can also be used to build more complex nested arrays, such as lists +//! +//! ``` +//! # use arrow_array::builder::{Int32Builder, ListBuilder}; +//! # use arrow_array::ListArray; +//! # use arrow_array::types::Int32Type; +//! let mut a = ListBuilder::new(Int32Builder::new()); +//! // [1, 2] +//! a.values().append_value(1); +//! a.values().append_value(2); +//! a.append(true); +//! // null +//! a.append(false); +//! // [] +//! a.append(true); +//! // [3, null] +//! a.values().append_value(3); +//! a.values().append_null(); +//! a.append(true); +//! +//! // [[1, 2], null, [], [3, null]] +//! let a = a.finish(); +//! +//! assert_eq!(a, ListArray::from_iter_primitive::<Int32Type, _, _>([ +//! Some(vec![Some(1), Some(2)]), +//! None, +//! Some(vec![]), +//! Some(vec![Some(3), None])] +//! )) +//! ``` +//! +//! # Custom Builders +//! +//! It is common to have a collection of statically defined Rust types that +//! you want to convert to Arrow arrays. +//! +//! An example of doing so is below +//! +//! ``` +//! # use std::any::Any; +//! # use arrow_array::builder::{ArrayBuilder, Int32Builder, ListBuilder, StringBuilder}; +//! # use arrow_array::{ArrayRef, RecordBatch, StructArray}; +//! # use arrow_schema::{DataType, Field}; +//! # use std::sync::Arc; +//! /// A custom row representation +//! struct MyRow { +//!
i32: i32, +//! optional_i32: Option<i32>, +//! string: Option<String>, +//! i32_list: Option<Vec<Option<i32>>>, +//! } +//! +//! /// Converts `Vec<MyRow>` into `StructArray` +//! #[derive(Debug, Default)] +//! struct MyRowBuilder { +//! i32: Int32Builder, +//! string: StringBuilder, +//! i32_list: ListBuilder<Int32Builder>, +//! } +//! +//! impl MyRowBuilder { +//! fn append(&mut self, row: &MyRow) { +//! self.i32.append_value(row.i32); +//! self.string.append_option(row.string.as_ref()); +//! self.i32_list.append_option(row.i32_list.as_ref().map(|x| x.iter().copied())); +//! } +//! +//! /// Note: returns StructArray to allow nesting within another array if desired +//! fn finish(&mut self) -> StructArray { +//! let i32 = Arc::new(self.i32.finish()) as ArrayRef; +//! let i32_field = Arc::new(Field::new("i32", DataType::Int32, false)); +//! +//! let string = Arc::new(self.string.finish()) as ArrayRef; +//! let string_field = Arc::new(Field::new("string", DataType::Utf8, true)); +//! +//! let i32_list = Arc::new(self.i32_list.finish()) as ArrayRef; +//! let value_field = Arc::new(Field::new_list_field(DataType::Int32, true)); +//! let i32_list_field = Arc::new(Field::new("i32_list", DataType::List(value_field), true)); +//! +//! StructArray::from(vec![ +//! (i32_field, i32), +//! (string_field, string), +//! (i32_list_field, i32_list), +//! ]) +//! } +//! } +//! +//! impl<'a> Extend<&'a MyRow> for MyRowBuilder { +//! fn extend<T: IntoIterator<Item = &'a MyRow>>(&mut self, iter: T) { +//! iter.into_iter().for_each(|row| self.append(row)); +//! } +//! } +//! +//! /// Converts a slice of [`MyRow`] to a [`RecordBatch`] +//! fn rows_to_batch(rows: &[MyRow]) -> RecordBatch { +//! let mut builder = MyRowBuilder::default(); +//! builder.extend(rows); +//! RecordBatch::from(&builder.finish()) +//! } +//!
``` + +pub use arrow_buffer::BooleanBufferBuilder; + +mod boolean_builder; +pub use boolean_builder::*; +mod buffer_builder; +pub use buffer_builder::*; +mod fixed_size_binary_builder; +pub use fixed_size_binary_builder::*; +mod fixed_size_list_builder; +pub use fixed_size_list_builder::*; +mod generic_bytes_builder; +pub use generic_bytes_builder::*; +mod generic_list_builder; +pub use generic_list_builder::*; +mod map_builder; +pub use map_builder::*; +mod null_builder; +pub use null_builder::*; +mod primitive_builder; +pub use primitive_builder::*; +mod primitive_dictionary_builder; +pub use primitive_dictionary_builder::*; +mod primitive_run_builder; +pub use primitive_run_builder::*; +mod struct_builder; +pub use struct_builder::*; +mod generic_bytes_dictionary_builder; +pub use generic_bytes_dictionary_builder::*; +mod generic_byte_run_builder; +pub use generic_byte_run_builder::*; +mod generic_bytes_view_builder; +pub use generic_bytes_view_builder::*; +mod union_builder; + +pub use union_builder::*; + +use crate::ArrayRef; +use std::any::Any; + +/// Trait for dealing with different array builders at runtime +/// +/// # Example +/// +/// ``` +/// // Create +/// # use arrow_array::{ArrayRef, StringArray}; +/// # use arrow_array::builder::{ArrayBuilder, Float64Builder, Int64Builder, StringBuilder}; +/// +/// let mut data_builders: Vec> = vec![ +/// Box::new(Float64Builder::new()), +/// Box::new(Int64Builder::new()), +/// Box::new(StringBuilder::new()), +/// ]; +/// +/// // Fill +/// data_builders[0] +/// .as_any_mut() +/// .downcast_mut::() +/// .unwrap() +/// .append_value(3.14); +/// data_builders[1] +/// .as_any_mut() +/// .downcast_mut::() +/// .unwrap() +/// .append_value(-1); +/// data_builders[2] +/// .as_any_mut() +/// .downcast_mut::() +/// .unwrap() +/// .append_value("🍎"); +/// +/// // Finish +/// let array_refs: Vec = data_builders +/// .iter_mut() +/// .map(|builder| builder.finish()) +/// .collect(); +/// assert_eq!(array_refs[0].len(), 1); +/// 
assert_eq!(array_refs[1].is_null(0), false); +/// assert_eq!( +/// array_refs[2] +/// .as_any() +/// .downcast_ref::() +/// .unwrap() +/// .value(0), +/// "🍎" +/// ); +/// ``` +pub trait ArrayBuilder: Any + Send + Sync { + /// Returns the number of array slots in the builder + fn len(&self) -> usize; + + /// Returns whether number of array slots is zero + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Builds the array + fn finish(&mut self) -> ArrayRef; + + /// Builds the array without resetting the underlying builder. + fn finish_cloned(&self) -> ArrayRef; + + /// Returns the builder as a non-mutable `Any` reference. + /// + /// This is most useful when one wants to call non-mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_ref` to get a reference on the specific builder. + fn as_any(&self) -> &dyn Any; + + /// Returns the builder as a mutable `Any` reference. + /// + /// This is most useful when one wants to call mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_mut` to get a reference on the specific builder. + fn as_any_mut(&mut self) -> &mut dyn Any; + + /// Returns the boxed builder as a box of `Any`. 
+ fn into_box_any(self: Box) -> Box; +} + +impl ArrayBuilder for Box { + fn len(&self) -> usize { + (**self).len() + } + + fn is_empty(&self) -> bool { + (**self).is_empty() + } + + fn finish(&mut self) -> ArrayRef { + (**self).finish() + } + + fn finish_cloned(&self) -> ArrayRef { + (**self).finish_cloned() + } + + fn as_any(&self) -> &dyn Any { + (**self).as_any() + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + (**self).as_any_mut() + } + + fn into_box_any(self: Box) -> Box { + self + } +} + +/// Builder for [`ListArray`](crate::array::ListArray) +pub type ListBuilder = GenericListBuilder; + +/// Builder for [`LargeListArray`](crate::array::LargeListArray) +pub type LargeListBuilder = GenericListBuilder; + +/// Builder for [`BinaryArray`](crate::array::BinaryArray) +/// +/// See examples on [`GenericBinaryBuilder`] +pub type BinaryBuilder = GenericBinaryBuilder; + +/// Builder for [`LargeBinaryArray`](crate::array::LargeBinaryArray) +/// +/// See examples on [`GenericBinaryBuilder`] +pub type LargeBinaryBuilder = GenericBinaryBuilder; + +/// Builder for [`StringArray`](crate::array::StringArray) +/// +/// See examples on [`GenericStringBuilder`] +pub type StringBuilder = GenericStringBuilder; + +/// Builder for [`LargeStringArray`](crate::array::LargeStringArray) +/// +/// See examples on [`GenericStringBuilder`] +pub type LargeStringBuilder = GenericStringBuilder; diff --git a/arrow-array/src/builder2/null_builder.rs b/arrow-array/src/builder2/null_builder.rs new file mode 100644 index 000000000000..10e6d38274a5 --- /dev/null +++ b/arrow-array/src/builder2/null_builder.rs @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::builder2::ArrayBuilder; +use crate::{ArrayRef, NullArray}; +use arrow_data::ArrayData; +use arrow_schema::DataType; +use std::any::Any; +use std::sync::Arc; + +/// Builder for [`NullArray`] +/// +/// # Example +/// +/// Create a `NullArray` from a `NullBuilder` +/// +/// ``` +/// +/// # use arrow_array::{Array, NullArray, builder::NullBuilder}; +/// +/// let mut b = NullBuilder::new(); +/// b.append_empty_value(); +/// b.append_null(); +/// b.append_nulls(3); +/// b.append_empty_values(3); +/// let arr = b.finish(); +/// +/// assert_eq!(8, arr.len()); +/// assert_eq!(0, arr.null_count()); +/// ``` +#[derive(Debug)] +pub struct NullBuilder { + len: usize, +} + +impl Default for NullBuilder { + fn default() -> Self { + Self::new() + } +} + +impl NullBuilder { + /// Creates a new null builder + pub fn new() -> Self { + Self { len: 0 } + } + + /// Creates a new null builder with space for `capacity` elements without re-allocating + #[deprecated = "there is no actual notion of capacity in the NullBuilder, so emulating it makes little sense"] + pub fn with_capacity(_capacity: usize) -> Self { + Self::new() + } + + /// Returns the capacity of this builder measured in slots of type `T` + #[deprecated = "there is no actual notion of capacity in the NullBuilder, so emulating it makes little sense"] + pub fn capacity(&self) -> usize { + self.len + } + + /// Appends a null slot into 
the builder + #[inline] + pub fn append_null(&mut self) { + self.len += 1; + } + + /// Appends `n` `null`s into the builder. + #[inline] + pub fn append_nulls(&mut self, n: usize) { + self.len += n; + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_empty_value(&mut self) { + self.append_null(); + } + + /// Appends `n` `null`s into the builder. + #[inline] + pub fn append_empty_values(&mut self, n: usize) { + self.append_nulls(n); + } + + /// Builds the [NullArray] and resets this builder, leaving it with a length of zero. + pub fn finish(&mut self) -> NullArray { + let len = std::mem::take(&mut self.len); + let builder = ArrayData::new_null(&DataType::Null, len).into_builder(); + + let array_data = unsafe { builder.build_unchecked() }; + NullArray::from(array_data) + } + + /// Builds the [NullArray] without resetting the builder. + pub fn finish_cloned(&self) -> NullArray { + let len = self.len(); + let builder = ArrayData::new_null(&DataType::Null, len).into_builder(); + + let array_data = unsafe { builder.build_unchecked() }; + NullArray::from(array_data) + } +} + +impl ArrayBuilder for NullBuilder { + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.len + } + + /// Builds the array and resets this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder.
+ fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Array; + + #[test] + fn test_null_array_builder() { + let mut builder = NullArray::builder(10); + builder.append_null(); + builder.append_nulls(4); + builder.append_empty_value(); + builder.append_empty_values(4); + + let arr = builder.finish(); + assert_eq!(10, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + assert!(arr.is_nullable()); + } + + #[test] + fn test_null_array_builder_finish_cloned() { + let mut builder = NullArray::builder(16); + builder.append_null(); + builder.append_empty_value(); + builder.append_empty_values(3); + let mut array = builder.finish_cloned(); + assert_eq!(5, array.len()); + + builder.append_empty_values(5); + array = builder.finish(); + assert_eq!(10, array.len()); + } +} diff --git a/arrow-array/src/builder2/primitive_builder.rs b/arrow-array/src/builder2/primitive_builder.rs new file mode 100644 index 000000000000..93cfb6695f50 --- /dev/null +++ b/arrow-array/src/builder2/primitive_builder.rs @@ -0,0 +1,618 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::builder2::{ArrayBuilder, BufferBuilder}; +use crate::types::*; +use crate::{ArrayRef, PrimitiveArray}; +use arrow_buffer::NullBufferBuilder; +use arrow_buffer::{Buffer, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; +use std::any::Any; +use std::sync::Arc; + +/// A signed 8-bit integer array builder. +pub type Int8Builder = PrimitiveBuilder; +/// A signed 16-bit integer array builder. +pub type Int16Builder = PrimitiveBuilder; +/// A signed 32-bit integer array builder. +pub type Int32Builder = PrimitiveBuilder; +/// A signed 64-bit integer array builder. +pub type Int64Builder = PrimitiveBuilder; +/// An unsigned 8-bit integer array builder. +pub type UInt8Builder = PrimitiveBuilder; +/// An unsigned 16-bit integer array builder. +pub type UInt16Builder = PrimitiveBuilder; +/// An unsigned 32-bit integer array builder. +pub type UInt32Builder = PrimitiveBuilder; +/// An unsigned 64-bit integer array builder. +pub type UInt64Builder = PrimitiveBuilder; +/// A 16-bit floating point array builder. +pub type Float16Builder = PrimitiveBuilder; +/// A 32-bit floating point array builder. +pub type Float32Builder = PrimitiveBuilder; +/// A 64-bit floating point array builder. +pub type Float64Builder = PrimitiveBuilder; + +/// A timestamp second array builder. +pub type TimestampSecondBuilder = PrimitiveBuilder; +/// A timestamp millisecond array builder. +pub type TimestampMillisecondBuilder = PrimitiveBuilder; +/// A timestamp microsecond array builder. +pub type TimestampMicrosecondBuilder = PrimitiveBuilder; +/// A timestamp nanosecond array builder. +pub type TimestampNanosecondBuilder = PrimitiveBuilder; + +/// A 32-bit date array builder. +pub type Date32Builder = PrimitiveBuilder; +/// A 64-bit date array builder. +pub type Date64Builder = PrimitiveBuilder; + +/// A 32-bit elapsed time in seconds array builder.
+pub type Time32SecondBuilder = PrimitiveBuilder; +/// A 32-bit elapsed time in milliseconds array builder. +pub type Time32MillisecondBuilder = PrimitiveBuilder; +/// A 64-bit elapsed time in microseconds array builder. +pub type Time64MicrosecondBuilder = PrimitiveBuilder; +/// A 64-bit elapsed time in nanoseconds array builder. +pub type Time64NanosecondBuilder = PrimitiveBuilder; + +/// A “calendar” interval in months array builder. +pub type IntervalYearMonthBuilder = PrimitiveBuilder; +/// A “calendar” interval in days and milliseconds array builder. +pub type IntervalDayTimeBuilder = PrimitiveBuilder; +/// A “calendar” interval in months, days, and nanoseconds array builder. +pub type IntervalMonthDayNanoBuilder = PrimitiveBuilder; + +/// An elapsed time in seconds array builder. +pub type DurationSecondBuilder = PrimitiveBuilder; +/// An elapsed time in milliseconds array builder. +pub type DurationMillisecondBuilder = PrimitiveBuilder; +/// An elapsed time in microseconds array builder. +pub type DurationMicrosecondBuilder = PrimitiveBuilder; +/// An elapsed time in nanoseconds array builder. +pub type DurationNanosecondBuilder = PrimitiveBuilder; + +/// A decimal 128 array builder +pub type Decimal128Builder = PrimitiveBuilder; +/// A decimal 256 array builder +pub type Decimal256Builder = PrimitiveBuilder; + +/// Builder for [`PrimitiveArray`] +#[derive(Debug)] +pub struct PrimitiveBuilder { + values_builder: BufferBuilder, + null_buffer_builder: NullBufferBuilder, + data_type: DataType, +} + +impl ArrayBuilder for PrimitiveBuilder { + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`.
+ fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.values_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl Default for PrimitiveBuilder { + fn default() -> Self { + Self::new() + } +} + +impl PrimitiveBuilder { + /// Creates a new primitive array builder + pub fn new() -> Self { + Self::with_capacity(1024) + } + + /// Creates a new primitive array builder with capacity no of items + pub fn with_capacity(capacity: usize) -> Self { + Self { + values_builder: BufferBuilder::::new(capacity), + null_buffer_builder: NullBufferBuilder::new(capacity), + data_type: T::DATA_TYPE, + } + } + + /// Creates a new primitive array builder from buffers + pub fn new_from_buffer( + values_buffer: MutableBuffer, + null_buffer: Option, + ) -> Self { + let values_builder = BufferBuilder::::new_from_buffer(values_buffer); + + let null_buffer_builder = null_buffer + .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, values_builder.len())) + .unwrap_or_else(|| NullBufferBuilder::new_with_len(values_builder.len())); + + Self { + values_builder, + null_buffer_builder, + data_type: T::DATA_TYPE, + } + } + + /// By default [`PrimitiveBuilder`] uses [`ArrowPrimitiveType::DATA_TYPE`] as the + /// data type of the generated array. 
+ /// + /// This method allows overriding the data type, to allow specifying timezones + /// for [`DataType::Timestamp`] or precision and scale for [`DataType::Decimal128`] and [`DataType::Decimal256`] + /// + /// # Panics + /// + /// This method panics if `data_type` is not [PrimitiveArray::is_compatible] + pub fn with_data_type(self, data_type: DataType) -> Self { + assert!( + PrimitiveArray::::is_compatible(&data_type), + "incompatible data type for builder, expected {} got {}", + T::DATA_TYPE, + data_type + ); + Self { data_type, ..self } + } + + /// Returns the capacity of this builder measured in slots of type `T` + pub fn capacity(&self) -> usize { + self.values_builder.capacity() + } + + /// Appends a value of type `T` into the builder + #[inline] + pub fn append_value(&mut self, v: T::Native) { + self.null_buffer_builder.append_non_null(); + self.values_builder.append(v); + } + + /// Appends a value of type `T` into the builder `n` times + #[inline] + pub fn append_value_n(&mut self, v: T::Native, n: usize) { + self.null_buffer_builder.append_n_non_nulls(n); + self.values_builder.append_n(n, v); + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_null(&mut self) { + self.null_buffer_builder.append_null(); + self.values_builder.advance(1); + } + + /// Appends `n` no. 
of null's into the builder + #[inline] + pub fn append_nulls(&mut self, n: usize) { + self.null_buffer_builder.append_n_nulls(n); + self.values_builder.advance(n); + } + + /// Appends an `Option` into the builder + #[inline] + pub fn append_option(&mut self, v: Option) { + match v { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + + /// Appends a slice of type `T` into the builder + #[inline] + pub fn append_slice(&mut self, v: &[T::Native]) { + self.null_buffer_builder.append_n_non_nulls(v.len()); + self.values_builder.append_slice(v); + } + + /// Appends values from a slice of type `T` and a validity boolean slice + /// + /// # Panics + /// + /// Panics if `values` and `is_valid` have different lengths + #[inline] + pub fn append_values(&mut self, values: &[T::Native], is_valid: &[bool]) { + assert_eq!( + values.len(), + is_valid.len(), + "Value and validity lengths must be equal" + ); + self.null_buffer_builder.append_slice(is_valid); + self.values_builder.append_slice(values); + } + + /// Appends values from a trusted length iterator. + /// + /// # Safety + /// This requires the iterator be a trusted length. This could instead require + /// the iterator implement `TrustedLen` once that is stabilized. + #[inline] + pub unsafe fn append_trusted_len_iter(&mut self, iter: impl IntoIterator) { + let iter = iter.into_iter(); + let len = iter + .size_hint() + .1 + .expect("append_trusted_len_iter requires an upper bound"); + + self.null_buffer_builder.append_n_non_nulls(len); + self.values_builder.append_trusted_len_iter(iter); + } + + /// Builds the [`PrimitiveArray`] and reset this builder. 
+ pub fn finish(&mut self) -> PrimitiveArray { + let len = self.len(); + let nulls = self.null_buffer_builder.finish(); + let builder = ArrayData::builder(self.data_type.clone()) + .len(len) + .add_buffer(self.values_builder.finish()) + .nulls(nulls); + + let array_data = unsafe { builder.build_unchecked() }; + PrimitiveArray::::from(array_data) + } + + /// Builds the [`PrimitiveArray`] without resetting the builder. + pub fn finish_cloned(&self) -> PrimitiveArray { + let len = self.len(); + let nulls = self.null_buffer_builder.finish_cloned(); + let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); + let builder = ArrayData::builder(self.data_type.clone()) + .len(len) + .add_buffer(values_buffer) + .nulls(nulls); + + let array_data = unsafe { builder.build_unchecked() }; + PrimitiveArray::::from(array_data) + } + + /// Returns the current values buffer as a slice + pub fn values_slice(&self) -> &[T::Native] { + self.values_builder.as_slice() + } + + /// Returns the current values buffer as a mutable slice + pub fn values_slice_mut(&mut self) -> &mut [T::Native] { + self.values_builder.as_slice_mut() + } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } + + /// Returns the current null buffer as a mutable slice + pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> { + self.null_buffer_builder.as_slice_mut() + } + + /// Returns the current values buffer and null buffer as a slice + pub fn slices_mut(&mut self) -> (&mut [T::Native], Option<&mut [u8]>) { + ( + self.values_builder.as_slice_mut(), + self.null_buffer_builder.as_slice_mut(), + ) + } +} + +impl PrimitiveBuilder

{ + /// Sets the precision and scale + pub fn with_precision_and_scale(self, precision: u8, scale: i8) -> Result { + validate_decimal_precision_and_scale::

(precision, scale)?; + Ok(Self { + data_type: P::TYPE_CONSTRUCTOR(precision, scale), + ..self + }) + } +} + +impl PrimitiveBuilder

{ + /// Sets the timezone + pub fn with_timezone(self, timezone: impl Into>) -> Self { + self.with_timezone_opt(Some(timezone.into())) + } + + /// Sets an optional timezone + pub fn with_timezone_opt>>(self, timezone: Option) -> Self { + Self { + data_type: DataType::Timestamp(P::UNIT, timezone.map(Into::into)), + ..self + } + } +} + +impl Extend> for PrimitiveBuilder

{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + self.append_option(v) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_schema::TimeUnit; + + use crate::array::Array; + use crate::array::BooleanArray; + use crate::array::Date32Array; + use crate::array::Int32Array; + use crate::array::TimestampSecondArray; + + #[test] + fn test_primitive_array_builder_i32() { + let mut builder = Int32Array::builder(5); + for i in 0..5 { + builder.append_value(i); + } + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..5 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i as i32, arr.value(i)); + } + } + + #[test] + fn test_primitive_array_builder_i32_append_iter() { + let mut builder = Int32Array::builder(5); + unsafe { builder.append_trusted_len_iter(0..5) }; + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..5 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i as i32, arr.value(i)); + } + } + + #[test] + fn test_primitive_array_builder_i32_append_nulls() { + let mut builder = Int32Array::builder(5); + builder.append_nulls(5); + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(5, arr.null_count()); + for i in 0..5 { + assert!(arr.is_null(i)); + assert!(!arr.is_valid(i)); + } + } + + #[test] + fn test_primitive_array_builder_date32() { + let mut builder = Date32Array::builder(5); + for i in 0..5 { + builder.append_value(i); + } + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..5 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i as i32, arr.value(i)); + } + } + + #[test] + fn test_primitive_array_builder_timestamp_second() { + let mut builder = 
TimestampSecondArray::builder(5); + for i in 0..5 { + builder.append_value(i); + } + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..5 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i as i64, arr.value(i)); + } + } + + #[test] + fn test_primitive_array_builder_bool() { + // 00000010 01001000 + let buf = Buffer::from([72_u8, 2_u8]); + let mut builder = BooleanArray::builder(10); + for i in 0..10 { + if i == 3 || i == 6 || i == 9 { + builder.append_value(true); + } else { + builder.append_value(false); + } + } + + let arr = builder.finish(); + assert_eq!(&buf, arr.values().inner()); + assert_eq!(10, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + for i in 0..10 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {i}") + } + } + + #[test] + fn test_primitive_array_builder_append_option() { + let arr1 = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); + + let mut builder = Int32Array::builder(5); + builder.append_option(Some(0)); + builder.append_option(None); + builder.append_option(Some(2)); + builder.append_option(None); + builder.append_option(Some(4)); + let arr2 = builder.finish(); + + assert_eq!(arr1.len(), arr2.len()); + assert_eq!(arr1.offset(), arr2.offset()); + assert_eq!(arr1.null_count(), arr2.null_count()); + for i in 0..5 { + assert_eq!(arr1.is_null(i), arr2.is_null(i)); + assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); + if arr1.is_valid(i) { + assert_eq!(arr1.value(i), arr2.value(i)); + } + } + } + + #[test] + fn test_primitive_array_builder_append_null() { + let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); + + let mut builder = Int32Array::builder(5); + builder.append_value(0); + builder.append_value(2); + builder.append_null(); + builder.append_null(); + builder.append_value(4); + let arr2 = 
builder.finish(); + + assert_eq!(arr1.len(), arr2.len()); + assert_eq!(arr1.offset(), arr2.offset()); + assert_eq!(arr1.null_count(), arr2.null_count()); + for i in 0..5 { + assert_eq!(arr1.is_null(i), arr2.is_null(i)); + assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); + if arr1.is_valid(i) { + assert_eq!(arr1.value(i), arr2.value(i)); + } + } + } + + #[test] + fn test_primitive_array_builder_append_slice() { + let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); + + let mut builder = Int32Array::builder(5); + builder.append_slice(&[0, 2]); + builder.append_null(); + builder.append_null(); + builder.append_value(4); + let arr2 = builder.finish(); + + assert_eq!(arr1.len(), arr2.len()); + assert_eq!(arr1.offset(), arr2.offset()); + assert_eq!(arr1.null_count(), arr2.null_count()); + for i in 0..5 { + assert_eq!(arr1.is_null(i), arr2.is_null(i)); + assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); + if arr1.is_valid(i) { + assert_eq!(arr1.value(i), arr2.value(i)); + } + } + } + + #[test] + fn test_primitive_array_builder_finish() { + let mut builder = Int32Builder::new(); + builder.append_slice(&[2, 4, 6, 8]); + let mut arr = builder.finish(); + assert_eq!(4, arr.len()); + assert_eq!(0, builder.len()); + + builder.append_slice(&[1, 3, 5, 7, 9]); + arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, builder.len()); + } + + #[test] + fn test_primitive_array_builder_finish_cloned() { + let mut builder = Int32Builder::new(); + builder.append_value(23); + builder.append_value(45); + let result = builder.finish_cloned(); + assert_eq!(result, Int32Array::from(vec![23, 45])); + builder.append_value(56); + assert_eq!(builder.finish_cloned(), Int32Array::from(vec![23, 45, 56])); + + builder.append_slice(&[2, 4, 6, 8]); + let mut arr = builder.finish(); + assert_eq!(7, arr.len()); + assert_eq!(arr, Int32Array::from(vec![23, 45, 56, 2, 4, 6, 8])); + assert_eq!(0, builder.len()); + + builder.append_slice(&[1, 3, 5, 7, 9]); + arr = 
builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, builder.len()); + } + + #[test] + fn test_primitive_array_builder_with_data_type() { + let mut builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); + builder.append_value(1); + let array = builder.finish(); + assert_eq!(array.precision(), 1); + assert_eq!(array.scale(), 2); + + let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())); + let mut builder = TimestampNanosecondBuilder::new().with_data_type(data_type.clone()); + builder.append_value(1); + let array = builder.finish(); + assert_eq!(array.data_type(), &data_type); + } + + #[test] + #[should_panic(expected = "incompatible data type for builder, expected Int32 got Int64")] + fn test_invalid_with_data_type() { + Int32Builder::new().with_data_type(DataType::Int64); + } + + #[test] + fn test_extend() { + let mut builder = PrimitiveBuilder::::new(); + builder.extend([1, 2, 3, 5, 2, 4, 4].into_iter().map(Some)); + builder.extend([2, 4, 6, 2].into_iter().map(Some)); + let array = builder.finish(); + assert_eq!(array.values(), &[1, 2, 3, 5, 2, 4, 4, 2, 4, 6, 2]); + } +} diff --git a/arrow-array/src/builder2/primitive_dictionary_builder.rs b/arrow-array/src/builder2/primitive_dictionary_builder.rs new file mode 100644 index 000000000000..91d60003e187 --- /dev/null +++ b/arrow-array/src/builder2/primitive_dictionary_builder.rs @@ -0,0 +1,446 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::builder2::{ArrayBuilder, PrimitiveBuilder}; +use crate::types::ArrowDictionaryKeyType; +use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray}; +use arrow_buffer::{ArrowNativeType, ToByteSlice}; +use arrow_schema::{ArrowError, DataType}; +use std::any::Any; +use std::collections::HashMap; +use std::sync::Arc; + +/// Wraps a type implementing `ToByteSlice` implementing `Hash` and `Eq` for it +/// +/// This is necessary to handle types such as f32, which don't natively implement these +#[derive(Debug)] +struct Value(T); + +impl std::hash::Hash for Value { + fn hash(&self, state: &mut H) { + self.0.to_byte_slice().hash(state) + } +} + +impl PartialEq for Value { + fn eq(&self, other: &Self) -> bool { + self.0.to_byte_slice().eq(other.0.to_byte_slice()) + } +} + +impl Eq for Value {} + +/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray) +/// +/// # Example: +/// +/// ``` +/// +/// # use arrow_array::builder::PrimitiveDictionaryBuilder; +/// # use arrow_array::types::{UInt32Type, UInt8Type}; +/// # use arrow_array::{Array, UInt32Array, UInt8Array}; +/// +/// let mut builder = PrimitiveDictionaryBuilder::::new(); +/// builder.append(12345678).unwrap(); +/// builder.append_null(); +/// builder.append(22345678).unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.keys(), +/// &UInt8Array::from(vec![Some(0), None, Some(1)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. 
+/// let av = array.values(); +/// let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); +/// let avs: &[u32] = ava.values(); +/// +/// assert!(!array.is_null(0)); +/// assert!(array.is_null(1)); +/// assert!(!array.is_null(2)); +/// +/// assert_eq!(avs, &[12345678, 22345678]); +/// ``` +#[derive(Debug)] +pub struct PrimitiveDictionaryBuilder +where + K: ArrowPrimitiveType, + V: ArrowPrimitiveType, +{ + keys_builder: PrimitiveBuilder, + values_builder: PrimitiveBuilder, + map: HashMap, usize>, +} + +impl Default for PrimitiveDictionaryBuilder +where + K: ArrowPrimitiveType, + V: ArrowPrimitiveType, +{ + fn default() -> Self { + Self::new() + } +} + +impl PrimitiveDictionaryBuilder +where + K: ArrowPrimitiveType, + V: ArrowPrimitiveType, +{ + /// Creates a new `PrimitiveDictionaryBuilder`. + pub fn new() -> Self { + Self { + keys_builder: PrimitiveBuilder::new(), + values_builder: PrimitiveBuilder::new(), + map: HashMap::new(), + } + } + + /// Creates a new `PrimitiveDictionaryBuilder` from the provided keys and values builders. + /// + /// # Panics + /// + /// This method panics if `keys_builder` or `values_builder` is not empty. + pub fn new_from_empty_builders( + keys_builder: PrimitiveBuilder, + values_builder: PrimitiveBuilder, + ) -> Self { + assert!( + keys_builder.is_empty() && values_builder.is_empty(), + "keys and values builders must be empty" + ); + Self { + keys_builder, + values_builder, + map: HashMap::new(), + } + } + + /// Creates a new `PrimitiveDictionaryBuilder` from existing `PrimitiveBuilder`s of keys and values. + /// + /// # Safety + /// + /// caller must ensure that the passed in builders are valid for DictionaryArray. 
+ pub unsafe fn new_from_builders( + keys_builder: PrimitiveBuilder, + values_builder: PrimitiveBuilder, + ) -> Self { + let keys = keys_builder.values_slice(); + let values = values_builder.values_slice(); + let mut map = HashMap::with_capacity(values.len()); + + keys.iter().zip(values.iter()).for_each(|(key, value)| { + map.insert(Value(*value), K::Native::to_usize(*key).unwrap()); + }); + + Self { + keys_builder, + values_builder, + map, + } + } + + /// Creates a new `PrimitiveDictionaryBuilder` with the provided capacities + /// + /// `keys_capacity`: the number of keys, i.e. length of array to build + /// `values_capacity`: the number of distinct dictionary values, i.e. size of dictionary + pub fn with_capacity(keys_capacity: usize, values_capacity: usize) -> Self { + Self { + keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), + values_builder: PrimitiveBuilder::with_capacity(values_capacity), + map: HashMap::with_capacity(values_capacity), + } + } +} + +impl ArrayBuilder for PrimitiveDictionaryBuilder +where + K: ArrowDictionaryKeyType, + V: ArrowPrimitiveType, +{ + /// Returns the builder as an non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.keys_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. 
+ fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl PrimitiveDictionaryBuilder +where + K: ArrowDictionaryKeyType, + V: ArrowPrimitiveType, +{ + #[inline] + fn get_or_insert_key(&mut self, value: V::Native) -> Result { + match self.map.get(&Value(value)) { + Some(&key) => { + Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?) + } + None => { + let key = self.values_builder.len(); + self.values_builder.append_value(value); + self.map.insert(Value(value), key); + Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?) + } + } + } + + /// Append a primitive value to the array. Return an existing index + /// if already present in the values array or a new index if the + /// value is appended to the values array. + #[inline] + pub fn append(&mut self, value: V::Native) -> Result { + let key = self.get_or_insert_key(value)?; + self.keys_builder.append_value(key); + Ok(key) + } + + /// Append a value multiple times to the array. + /// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups. + /// + /// Returns an error if the new index would overflow the key type. + pub fn append_n(&mut self, value: V::Native, count: usize) -> Result { + let key = self.get_or_insert_key(value)?; + self.keys_builder.append_value_n(key, count); + Ok(key) + } + + /// Infallibly append a value to this builder + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + #[inline] + pub fn append_value(&mut self, value: V::Native) { + self.append(value).expect("dictionary key overflow"); + } + + /// Infallibly append a value to this builder repeatedly `count` times. + /// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups. 
+ /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + pub fn append_values(&mut self, value: V::Native, count: usize) { + self.append_n(value, count) + .expect("dictionary key overflow"); + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_null(&mut self) { + self.keys_builder.append_null() + } + + /// Append `n` null slots into the builder + #[inline] + pub fn append_nulls(&mut self, n: usize) { + self.keys_builder.append_nulls(n) + } + + /// Append an `Option` value into the builder + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + #[inline] + pub fn append_option(&mut self, value: Option) { + match value { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + + /// Append an `Option` value into the builder repeatedly `count` times. + /// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups. + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + pub fn append_options(&mut self, value: Option, count: usize) { + match value { + None => self.keys_builder.append_nulls(count), + Some(v) => self.append_values(v, count), + }; + } + + /// Builds the `DictionaryArray` and reset this builder. + pub fn finish(&mut self) -> DictionaryArray { + self.map.clear(); + let values = self.values_builder.finish(); + let keys = self.keys_builder.finish(); + + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone())); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + } + + /// Builds the `DictionaryArray` without resetting the builder. 
+ pub fn finish_cloned(&self) -> DictionaryArray { + let values = self.values_builder.finish_cloned(); + let keys = self.keys_builder.finish_cloned(); + + let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE)); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + } + + /// Returns the current dictionary values buffer as a slice + pub fn values_slice(&self) -> &[V::Native] { + self.values_builder.values_slice() + } + + /// Returns the current dictionary values buffer as a mutable slice + pub fn values_slice_mut(&mut self) -> &mut [V::Native] { + self.values_builder.values_slice_mut() + } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.keys_builder.validity_slice() + } +} + +impl Extend> + for PrimitiveDictionaryBuilder +{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + self.append_option(v) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::UInt32Array; + use crate::array::UInt8Array; + use crate::builder::Decimal128Builder; + use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; + + #[test] + fn test_primitive_dictionary_builder() { + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); + builder.append(12345678).unwrap(); + builder.append_null(); + builder.append(22345678).unwrap(); + let array = builder.finish(); + + assert_eq!( + array.keys(), + &UInt8Array::from(vec![Some(0), None, Some(1)]) + ); + + // Values are polymorphic and so require a downcast. 
+ let av = array.values(); + let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); + let avs: &[u32] = ava.values(); + + assert!(!array.is_null(0)); + assert!(array.is_null(1)); + assert!(!array.is_null(2)); + + assert_eq!(avs, &[12345678, 22345678]); + } + + #[test] + fn test_extend() { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some)); + builder.extend([4, 5, 1, 3, 1].into_iter().map(Some)); + let dict = builder.finish(); + assert_eq!( + dict.keys().values(), + &[0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 0, 2, 0] + ); + assert_eq!(dict.values().len(), 5); + } + + #[test] + #[should_panic(expected = "DictionaryKeyOverflowError")] + fn test_primitive_dictionary_overflow() { + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(257, 257); + // 256 unique keys. + for i in 0..256 { + builder.append(i + 1000).unwrap(); + } + // Special error if the key overflows (256th entry) + builder.append(1257).unwrap(); + } + + #[test] + fn test_primitive_dictionary_with_builders() { + let keys_builder = PrimitiveBuilder::::new(); + let values_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); + let mut builder = + PrimitiveDictionaryBuilder::::new_from_empty_builders( + keys_builder, + values_builder, + ); + let dict_array = builder.finish(); + assert_eq!(dict_array.value_type(), DataType::Decimal128(1, 2)); + assert_eq!( + dict_array.data_type(), + &DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Decimal128(1, 2)), + ) + ); + } +} diff --git a/arrow-array/src/builder2/primitive_run_builder.rs b/arrow-array/src/builder2/primitive_run_builder.rs new file mode 100644 index 000000000000..1db9c91e081d --- /dev/null +++ b/arrow-array/src/builder2/primitive_run_builder.rs @@ -0,0 +1,313 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{any::Any, sync::Arc}; + +use crate::{types::RunEndIndexType, ArrayRef, ArrowPrimitiveType, RunArray}; + +use super::{ArrayBuilder, PrimitiveBuilder}; + +use arrow_buffer::ArrowNativeType; + +/// Builder for [`RunArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray) +/// +/// # Example: +/// +/// ``` +/// +/// # use arrow_array::builder::PrimitiveRunBuilder; +/// # use arrow_array::cast::AsArray; +/// # use arrow_array::types::{UInt32Type, Int16Type}; +/// # use arrow_array::{Array, UInt32Array, Int16Array}; +/// +/// let mut builder = +/// PrimitiveRunBuilder::::new(); +/// builder.append_value(1234); +/// builder.append_value(1234); +/// builder.append_value(1234); +/// builder.append_null(); +/// builder.append_value(5678); +/// builder.append_value(5678); +/// let array = builder.finish(); +/// +/// assert_eq!(array.run_ends().values(), &[3, 4, 6]); +/// +/// let av = array.values(); +/// +/// assert!(!av.is_null(0)); +/// assert!(av.is_null(1)); +/// assert!(!av.is_null(2)); +/// +/// // Values are polymorphic and so require a downcast. 
+/// let ava: &UInt32Array = av.as_primitive::(); +/// +/// assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); +/// ``` +#[derive(Debug)] +pub struct PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + run_ends_builder: PrimitiveBuilder, + values_builder: PrimitiveBuilder, + current_value: Option, + current_run_end_index: usize, + prev_run_end_index: usize, +} + +impl Default for PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + fn default() -> Self { + Self::new() + } +} + +impl PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + /// Creates a new `PrimitiveRunBuilder` + pub fn new() -> Self { + Self { + run_ends_builder: PrimitiveBuilder::new(), + values_builder: PrimitiveBuilder::new(), + current_value: None, + current_run_end_index: 0, + prev_run_end_index: 0, + } + } + + /// Creates a new `PrimitiveRunBuilder` with the provided capacity + /// + /// `capacity`: the expected number of run-end encoded values. + pub fn with_capacity(capacity: usize) -> Self { + Self { + run_ends_builder: PrimitiveBuilder::with_capacity(capacity), + values_builder: PrimitiveBuilder::with_capacity(capacity), + current_value: None, + current_run_end_index: 0, + prev_run_end_index: 0, + } + } +} + +impl ArrayBuilder for PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the length of logical array encoded by + /// the eventual runs array. + fn len(&self) -> usize { + self.current_run_end_index + } + + /// Builds the array and reset this builder. 
+ fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + /// Appends optional value to the logical array encoded by the RunArray. + pub fn append_option(&mut self, value: Option) { + if self.current_run_end_index == 0 { + self.current_run_end_index = 1; + self.current_value = value; + return; + } + if self.current_value != value { + self.append_run_end(); + self.current_value = value; + } + + self.current_run_end_index += 1; + } + + /// Appends value to the logical array encoded by the run-ends array. + pub fn append_value(&mut self, value: V::Native) { + self.append_option(Some(value)) + } + + /// Appends null to the logical array encoded by the run-ends array. + pub fn append_null(&mut self) { + self.append_option(None) + } + + /// Creates the RunArray and resets the builder. + /// Panics if RunArray cannot be built. + pub fn finish(&mut self) -> RunArray { + // write the last run end to the array. + self.append_run_end(); + + // reset the run index to zero. + self.current_value = None; + self.current_run_end_index = 0; + + // build the run encoded array by adding run_ends and values array as its children. + let run_ends_array = self.run_ends_builder.finish(); + let values_array = self.values_builder.finish(); + RunArray::::try_new(&run_ends_array, &values_array).unwrap() + } + + /// Creates the RunArray and without resetting the builder. + /// Panics if RunArray cannot be built. 
+ pub fn finish_cloned(&self) -> RunArray { + let mut run_ends_array = self.run_ends_builder.finish_cloned(); + let mut values_array = self.values_builder.finish_cloned(); + + // Add current run if one exists + if self.prev_run_end_index != self.current_run_end_index { + let mut run_end_builder = run_ends_array.into_builder().unwrap(); + let mut values_builder = values_array.into_builder().unwrap(); + self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder); + run_ends_array = run_end_builder.finish(); + values_array = values_builder.finish(); + } + + RunArray::try_new(&run_ends_array, &values_array).unwrap() + } + + // Appends the current run to the array. + fn append_run_end(&mut self) { + // empty array or the function called without appending any value. + if self.prev_run_end_index == self.current_run_end_index { + return; + } + let run_end_index = self.run_end_index_as_native(); + self.run_ends_builder.append_value(run_end_index); + self.values_builder.append_option(self.current_value); + self.prev_run_end_index = self.current_run_end_index; + } + + // Similar to `append_run_end` but on custom builders. + // Used in `finish_cloned` which is not suppose to mutate `self`. 
+ fn append_run_end_with_builders( + &self, + run_ends_builder: &mut PrimitiveBuilder, + values_builder: &mut PrimitiveBuilder, + ) { + let run_end_index = self.run_end_index_as_native(); + run_ends_builder.append_value(run_end_index); + values_builder.append_option(self.current_value); + } + + fn run_end_index_as_native(&self) -> R::Native { + R::Native::from_usize(self.current_run_end_index) + .unwrap_or_else(|| panic!( + "Cannot convert `current_run_end_index` {} from `usize` to native form of arrow datatype {}", + self.current_run_end_index, + R::DATA_TYPE + )) + } +} + +impl Extend> for PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + fn extend>>(&mut self, iter: T) { + for elem in iter { + self.append_option(elem); + } + } +} + +#[cfg(test)] +mod tests { + use crate::builder::PrimitiveRunBuilder; + use crate::cast::AsArray; + use crate::types::{Int16Type, UInt32Type}; + use crate::{Array, UInt32Array}; + + #[test] + fn test_primitive_ree_array_builder() { + let mut builder = PrimitiveRunBuilder::::new(); + builder.append_value(1234); + builder.append_value(1234); + builder.append_value(1234); + builder.append_null(); + builder.append_value(5678); + builder.append_value(5678); + + let array = builder.finish(); + + assert_eq!(array.null_count(), 0); + assert_eq!(array.logical_null_count(), 1); + assert_eq!(array.len(), 6); + + assert_eq!(array.run_ends().values(), &[3, 4, 6]); + + let av = array.values(); + + assert!(!av.is_null(0)); + assert!(av.is_null(1)); + assert!(!av.is_null(2)); + + // Values are polymorphic and so require a downcast. 
+ let ava: &UInt32Array = av.as_primitive::(); + + assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); + } + + #[test] + fn test_extend() { + let mut builder = PrimitiveRunBuilder::::new(); + builder.extend([1, 2, 2, 5, 5, 4, 4].into_iter().map(Some)); + builder.extend([4, 4, 6, 2].into_iter().map(Some)); + let array = builder.finish(); + + assert_eq!(array.len(), 11); + assert_eq!(array.null_count(), 0); + assert_eq!(array.logical_null_count(), 0); + assert_eq!(array.run_ends().values(), &[1, 3, 5, 9, 10, 11]); + assert_eq!( + array.values().as_primitive::().values(), + &[1, 2, 5, 4, 6, 2] + ); + } +} diff --git a/arrow-array/src/builder2/struct_builder.rs b/arrow-array/src/builder2/struct_builder.rs new file mode 100644 index 000000000000..b765a6d2e179 --- /dev/null +++ b/arrow-array/src/builder2/struct_builder.rs @@ -0,0 +1,872 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::StructArray; +use crate::{ + builder2::*, + types::{Int16Type, Int32Type, Int64Type, Int8Type}, +}; +use arrow_buffer::NullBufferBuilder; +use arrow_schema::{DataType, Fields, IntervalUnit, SchemaBuilder, TimeUnit}; +use std::sync::Arc; + +/// Builder for [`StructArray`] +/// +/// Note that callers should make sure that methods of all the child field builders are +/// properly called to maintain the consistency of the data structure. +/// +/// +/// Handling arrays with complex layouts, such as `List>>`, in Rust can be challenging due to its strong typing system. +/// To construct a collection builder ([`ListBuilder`], [`LargeListBuilder`], or [`MapBuilder`]) using [`make_builder`], multiple calls are required. This complexity arises from the recursive approach utilized by [`StructBuilder::from_fields`]. +/// +/// Initially, [`StructBuilder::from_fields`] invokes [`make_builder`], which returns a `Box`. To obtain the specific collection builder, one must first use [`StructBuilder::field_builder`] to get a `Collection<[Box]>`. Subsequently, the `values()` result from this operation can be downcast to the desired builder type. +/// +/// For example, when working with [`ListBuilder`], you would first call [`StructBuilder::field_builder::>>`] and then downcast the [`Box`] to the specific [`StructBuilder`] you need. 
+/// +/// For a practical example see the code below: +/// +/// ```rust +/// use arrow_array::builder::{ArrayBuilder, ListBuilder, StringBuilder, StructBuilder}; +/// use arrow_schema::{DataType, Field, Fields}; +/// use std::sync::Arc; +/// +/// // This is an example column that has a List>> layout +/// let mut example_col = ListBuilder::new(StructBuilder::from_fields( +/// vec![Field::new( +/// "value_list", +/// DataType::List(Arc::new(Field::new_list_field( +/// DataType::Struct(Fields::from(vec![ +/// Field::new("key", DataType::Utf8, true), +/// Field::new("value", DataType::Utf8, true), +/// ])), //In this example we are trying to get to this builder and insert key/value pairs +/// true, +/// ))), +/// true, +/// )], +/// 0, +/// )); +/// +/// // We can obtain the StructBuilder without issues, because example_col was created with StructBuilder +/// let col_struct_builder: &mut StructBuilder = example_col.values(); +/// +/// // We can't obtain the ListBuilder with the expected generic types, because under the hood +/// // the StructBuilder was returned as a Box and passed as such to the ListBuilder constructor +/// +/// // This panics in runtime, even though we know that the builder is a ListBuilder. +/// // let sb = col_struct_builder +/// // .field_builder::>(0) +/// // .as_mut() +/// // .unwrap(); +/// +/// //To keep in line with Rust's strong typing, we fetch a ListBuilder> from the column StructBuilder first... +/// let mut list_builder_option = +/// col_struct_builder.field_builder::>>(0); +/// +/// let list_builder = list_builder_option.as_mut().unwrap(); +/// +/// // ... 
and then downcast the key/value pair values to a StructBuilder +/// let struct_builder = list_builder +/// .values() +/// .as_any_mut() +/// .downcast_mut::() +/// .unwrap(); +/// +/// // We can now append values to the StructBuilder +/// let key_builder = struct_builder.field_builder::(0).unwrap(); +/// key_builder.append_value("my key"); +/// +/// let value_builder = struct_builder.field_builder::(1).unwrap(); +/// value_builder.append_value("my value"); +/// +/// struct_builder.append(true); +/// list_builder.append(true); +/// col_struct_builder.append(true); +/// example_col.append(true); +/// +/// let array = example_col.finish(); +/// +/// println!("My array: {:?}", array); +/// ``` +/// +pub struct StructBuilder { + fields: Fields, + field_builders: Vec>, + null_buffer_builder: NullBufferBuilder, +} + +impl std::fmt::Debug for StructBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("StructBuilder") + .field("fields", &self.fields) + .field("bitmap_builder", &self.null_buffer_builder) + .field("len", &self.len()) + .finish() + } +} + +impl ArrayBuilder for StructBuilder { + /// Returns the number of array slots in the builder. + /// + /// Note that this always return the first child field builder's length, and it is + /// the caller's responsibility to maintain the consistency that all the child field + /// builder should have the equal number of elements. + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + /// Builds the array. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } + + /// Returns the builder as a non-mutable `Any` reference. + /// + /// This is most useful when one wants to call non-mutable APIs on a specific builder + /// type. 
In this case, one can first cast this into a `Any`, and then use + /// `downcast_ref` to get a reference on the specific builder. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + /// + /// This is most useful when one wants to call mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_mut` to get a reference on the specific builder. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } +} + +/// Returns a builder with capacity for `capacity` elements of datatype +/// `DataType`. +/// +/// This function is useful to construct arrays from an arbitrary vectors with +/// known/expected schema. +/// +/// See comments on [StructBuilder] for retrieving collection builders built by +/// make_builder. +pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { + use crate::builder::*; + match datatype { + DataType::Null => Box::new(NullBuilder::new()), + DataType::Boolean => Box::new(BooleanBuilder::with_capacity(capacity)), + DataType::Int8 => Box::new(Int8Builder::with_capacity(capacity)), + DataType::Int16 => Box::new(Int16Builder::with_capacity(capacity)), + DataType::Int32 => Box::new(Int32Builder::with_capacity(capacity)), + DataType::Int64 => Box::new(Int64Builder::with_capacity(capacity)), + DataType::UInt8 => Box::new(UInt8Builder::with_capacity(capacity)), + DataType::UInt16 => Box::new(UInt16Builder::with_capacity(capacity)), + DataType::UInt32 => Box::new(UInt32Builder::with_capacity(capacity)), + DataType::UInt64 => Box::new(UInt64Builder::with_capacity(capacity)), + DataType::Float16 => Box::new(Float16Builder::with_capacity(capacity)), + DataType::Float32 => Box::new(Float32Builder::with_capacity(capacity)), + DataType::Float64 => Box::new(Float64Builder::with_capacity(capacity)), + DataType::Binary => 
Box::new(BinaryBuilder::with_capacity(capacity, 1024)), + DataType::LargeBinary => Box::new(LargeBinaryBuilder::with_capacity(capacity, 1024)), + DataType::FixedSizeBinary(len) => { + Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len)) + } + DataType::Decimal128(p, s) => Box::new( + Decimal128Builder::with_capacity(capacity).with_data_type(DataType::Decimal128(*p, *s)), + ), + DataType::Decimal256(p, s) => Box::new( + Decimal256Builder::with_capacity(capacity).with_data_type(DataType::Decimal256(*p, *s)), + ), + DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, 1024)), + DataType::LargeUtf8 => Box::new(LargeStringBuilder::with_capacity(capacity, 1024)), + DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)), + DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)), + DataType::Time32(TimeUnit::Second) => { + Box::new(Time32SecondBuilder::with_capacity(capacity)) + } + DataType::Time32(TimeUnit::Millisecond) => { + Box::new(Time32MillisecondBuilder::with_capacity(capacity)) + } + DataType::Time64(TimeUnit::Microsecond) => { + Box::new(Time64MicrosecondBuilder::with_capacity(capacity)) + } + DataType::Time64(TimeUnit::Nanosecond) => { + Box::new(Time64NanosecondBuilder::with_capacity(capacity)) + } + DataType::Timestamp(TimeUnit::Second, tz) => Box::new( + TimestampSecondBuilder::with_capacity(capacity) + .with_data_type(DataType::Timestamp(TimeUnit::Second, tz.clone())), + ), + DataType::Timestamp(TimeUnit::Millisecond, tz) => Box::new( + TimestampMillisecondBuilder::with_capacity(capacity) + .with_data_type(DataType::Timestamp(TimeUnit::Millisecond, tz.clone())), + ), + DataType::Timestamp(TimeUnit::Microsecond, tz) => Box::new( + TimestampMicrosecondBuilder::with_capacity(capacity) + .with_data_type(DataType::Timestamp(TimeUnit::Microsecond, tz.clone())), + ), + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Box::new( + TimestampNanosecondBuilder::with_capacity(capacity) + 
.with_data_type(DataType::Timestamp(TimeUnit::Nanosecond, tz.clone())), + ), + DataType::Interval(IntervalUnit::YearMonth) => { + Box::new(IntervalYearMonthBuilder::with_capacity(capacity)) + } + DataType::Interval(IntervalUnit::DayTime) => { + Box::new(IntervalDayTimeBuilder::with_capacity(capacity)) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + Box::new(IntervalMonthDayNanoBuilder::with_capacity(capacity)) + } + DataType::Duration(TimeUnit::Second) => { + Box::new(DurationSecondBuilder::with_capacity(capacity)) + } + DataType::Duration(TimeUnit::Millisecond) => { + Box::new(DurationMillisecondBuilder::with_capacity(capacity)) + } + DataType::Duration(TimeUnit::Microsecond) => { + Box::new(DurationMicrosecondBuilder::with_capacity(capacity)) + } + DataType::Duration(TimeUnit::Nanosecond) => { + Box::new(DurationNanosecondBuilder::with_capacity(capacity)) + } + DataType::List(field) => { + let builder = make_builder(field.data_type(), capacity); + Box::new(ListBuilder::with_capacity(builder, capacity).with_field(field.clone())) + } + DataType::LargeList(field) => { + let builder = make_builder(field.data_type(), capacity); + Box::new(LargeListBuilder::with_capacity(builder, capacity).with_field(field.clone())) + } + DataType::FixedSizeList(field, size) => { + let size = *size; + let values_builder_capacity = { + let size: usize = size.try_into().unwrap(); + capacity * size + }; + let builder = make_builder(field.data_type(), values_builder_capacity); + Box::new( + FixedSizeListBuilder::with_capacity(builder, size, capacity) + .with_field(field.clone()), + ) + } + DataType::Map(field, _) => match field.data_type() { + DataType::Struct(fields) => { + let map_field_names = MapFieldNames { + key: fields[0].name().clone(), + value: fields[1].name().clone(), + entry: field.name().clone(), + }; + let key_builder = make_builder(fields[0].data_type(), capacity); + let value_builder = make_builder(fields[1].data_type(), capacity); + Box::new( + 
MapBuilder::with_capacity( + Some(map_field_names), + key_builder, + value_builder, + capacity, + ) + .with_values_field(fields[1].clone()), + ) + } + t => panic!("The field of Map data type {t:?} should has a child Struct field"), + }, + DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)), + t @ DataType::Dictionary(key_type, value_type) => { + macro_rules! dict_builder { + ($key_type:ty) => { + match &**value_type { + DataType::Utf8 => { + let dict_builder: StringDictionaryBuilder<$key_type> = + StringDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + DataType::LargeUtf8 => { + let dict_builder: LargeStringDictionaryBuilder<$key_type> = + LargeStringDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + DataType::Binary => { + let dict_builder: BinaryDictionaryBuilder<$key_type> = + BinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + DataType::LargeBinary => { + let dict_builder: LargeBinaryDictionaryBuilder<$key_type> = + LargeBinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + t => panic!("Dictionary value type {t:?} is not currently supported"), + } + }; + } + match &**key_type { + DataType::Int8 => dict_builder!(Int8Type), + DataType::Int16 => dict_builder!(Int16Type), + DataType::Int32 => dict_builder!(Int32Type), + DataType::Int64 => dict_builder!(Int64Type), + _ => { + panic!("Data type {t:?} with key type {key_type:?} is not currently supported") + } + } + } + t => panic!("Data type {t:?} is not currently supported"), + } +} + +impl StructBuilder { + /// Creates a new `StructBuilder` + pub fn new(fields: impl Into, field_builders: Vec>) -> Self { + Self { + field_builders, + fields: fields.into(), + null_buffer_builder: NullBufferBuilder::new(0), + } + } + + /// Creates a new `StructBuilder` from [`Fields`] and `capacity` + pub fn from_fields(fields: impl Into, capacity: 
usize) -> Self { + let fields = fields.into(); + let mut builders = Vec::with_capacity(fields.len()); + for field in &fields { + builders.push(make_builder(field.data_type(), capacity)); + } + Self::new(fields, builders) + } + + /// Returns a mutable reference to the child field builder at index `i`. + /// Result will be `None` if the input type `T` provided doesn't match the actual + /// field builder's type. + pub fn field_builder(&mut self, i: usize) -> Option<&mut T> { + self.field_builders[i].as_any_mut().downcast_mut::() + } + + /// Returns the number of fields for the struct this builder is building. + pub fn num_fields(&self) -> usize { + self.field_builders.len() + } + + /// Appends an element (either null or non-null) to the struct. The actual elements + /// should be appended for each child sub-array in a consistent way. + #[inline] + pub fn append(&mut self, is_valid: bool) { + self.null_buffer_builder.append(is_valid); + } + + /// Appends a null element to the struct. + #[inline] + pub fn append_null(&mut self) { + self.append(false) + } + + /// Builds the `StructArray` and reset this builder. + pub fn finish(&mut self) -> StructArray { + self.validate_content(); + if self.fields.is_empty() { + return StructArray::new_empty_fields(self.len(), self.null_buffer_builder.finish()); + } + + let arrays = self.field_builders.iter_mut().map(|f| f.finish()).collect(); + let nulls = self.null_buffer_builder.finish(); + StructArray::new(self.fields.clone(), arrays, nulls) + } + + /// Builds the `StructArray` without resetting the builder. 
+ pub fn finish_cloned(&self) -> StructArray { + self.validate_content(); + + if self.fields.is_empty() { + return StructArray::new_empty_fields( + self.len(), + self.null_buffer_builder.finish_cloned(), + ); + } + + let arrays = self + .field_builders + .iter() + .map(|f| f.finish_cloned()) + .collect(); + + let nulls = self.null_buffer_builder.finish_cloned(); + + StructArray::new(self.fields.clone(), arrays, nulls) + } + + /// Constructs and validates contents in the builder to ensure that + /// - fields and field_builders are of equal length + /// - the number of items in individual field_builders are equal to self.len() + fn validate_content(&self) { + if self.fields.len() != self.field_builders.len() { + panic!("Number of fields is not equal to the number of field_builders."); + } + self.field_builders.iter().enumerate().for_each(|(idx, x)| { + if x.len() != self.len() { + let builder = SchemaBuilder::from(&self.fields); + let schema = builder.finish(); + + panic!("{}", format!( + "StructBuilder ({:?}) and field_builder with index {} ({:?}) are of unequal lengths: ({} != {}).", + schema, + idx, + self.fields[idx].data_type(), + self.len(), + x.len() + )); + } + }); + } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } +} + +#[cfg(test)] +mod tests { + use std::any::type_name; + + use super::*; + use arrow_buffer::Buffer; + use arrow_data::ArrayData; + use arrow_schema::Field; + + use crate::{array::Array, types::ArrowDictionaryKeyType}; + + #[test] + fn test_struct_array_builder() { + let string_builder = StringBuilder::new(); + let int_builder = Int32Builder::new(); + + let fields = vec![ + Field::new("f1", DataType::Utf8, true), + Field::new("f2", DataType::Int32, true), + ]; + let field_builders = vec![ + Box::new(string_builder) as Box, + Box::new(int_builder) as Box, + ]; + + let mut builder = StructBuilder::new(fields, field_builders); + assert_eq!(2, 
builder.num_fields()); + + let string_builder = builder + .field_builder::(0) + .expect("builder at field 0 should be string builder"); + string_builder.append_value("joe"); + string_builder.append_null(); + string_builder.append_null(); + string_builder.append_value("mark"); + + let int_builder = builder + .field_builder::(1) + .expect("builder at field 1 should be int builder"); + int_builder.append_value(1); + int_builder.append_value(2); + int_builder.append_null(); + int_builder.append_value(4); + + builder.append(true); + builder.append(true); + builder.append_null(); + builder.append(true); + + let struct_data = builder.finish().into_data(); + + assert_eq!(4, struct_data.len()); + assert_eq!(1, struct_data.null_count()); + assert_eq!(&[11_u8], struct_data.nulls().unwrap().validity()); + + let expected_string_data = ArrayData::builder(DataType::Utf8) + .len(4) + .null_bit_buffer(Some(Buffer::from(&[9_u8]))) + .add_buffer(Buffer::from_slice_ref([0, 3, 3, 3, 7])) + .add_buffer(Buffer::from_slice_ref(b"joemark")) + .build() + .unwrap(); + + let expected_int_data = ArrayData::builder(DataType::Int32) + .len(4) + .null_bit_buffer(Some(Buffer::from_slice_ref([11_u8]))) + .add_buffer(Buffer::from_slice_ref([1, 2, 0, 4])) + .build() + .unwrap(); + + assert_eq!(expected_string_data, struct_data.child_data()[0]); + assert_eq!(expected_int_data, struct_data.child_data()[1]); + } + + #[test] + fn test_struct_array_builder_finish() { + let int_builder = Int32Builder::new(); + let bool_builder = BooleanBuilder::new(); + + let fields = vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Boolean, false), + ]; + let field_builders = vec![ + Box::new(int_builder) as Box, + Box::new(bool_builder) as Box, + ]; + + let mut builder = StructBuilder::new(fields, field_builders); + builder + .field_builder::(0) + .unwrap() + .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + builder + .field_builder::(1) + .unwrap() + .append_slice(&[ + false, true, false, 
true, false, true, false, true, false, true, + ]); + + // Append slot values - all are valid. + for _ in 0..10 { + builder.append(true); + } + + assert_eq!(10, builder.len()); + + let arr = builder.finish(); + + assert_eq!(10, arr.len()); + assert_eq!(0, builder.len()); + + builder + .field_builder::(0) + .unwrap() + .append_slice(&[1, 3, 5, 7, 9]); + builder + .field_builder::(1) + .unwrap() + .append_slice(&[false, true, false, true, false]); + + // Append slot values - all are valid. + for _ in 0..5 { + builder.append(true); + } + + assert_eq!(5, builder.len()); + + let arr = builder.finish(); + + assert_eq!(5, arr.len()); + assert_eq!(0, builder.len()); + } + + #[test] + fn test_build_fixed_size_list() { + const LIST_LENGTH: i32 = 4; + let fixed_size_list_dtype = + DataType::new_fixed_size_list(DataType::Int32, LIST_LENGTH, false); + let mut builder = make_builder(&fixed_size_list_dtype, 10); + let builder = builder + .as_any_mut() + .downcast_mut::>>(); + match builder { + Some(builder) => { + assert_eq!(builder.value_length(), LIST_LENGTH); + assert!(builder + .values() + .as_any_mut() + .downcast_mut::() + .is_some()); + } + None => panic!("expected FixedSizeListBuilder, got a different builder type"), + } + } + + #[test] + fn test_struct_array_builder_finish_cloned() { + let int_builder = Int32Builder::new(); + let bool_builder = BooleanBuilder::new(); + + let fields = vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Boolean, false), + ]; + let field_builders = vec![ + Box::new(int_builder) as Box, + Box::new(bool_builder) as Box, + ]; + + let mut builder = StructBuilder::new(fields, field_builders); + builder + .field_builder::(0) + .unwrap() + .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + builder + .field_builder::(1) + .unwrap() + .append_slice(&[ + false, true, false, true, false, true, false, true, false, true, + ]); + + // Append slot values - all are valid. 
+ for _ in 0..10 { + builder.append(true); + } + + assert_eq!(10, builder.len()); + + let mut arr = builder.finish_cloned(); + + assert_eq!(10, arr.len()); + assert_eq!(10, builder.len()); + + builder + .field_builder::(0) + .unwrap() + .append_slice(&[1, 3, 5, 7, 9]); + builder + .field_builder::(1) + .unwrap() + .append_slice(&[false, true, false, true, false]); + + // Append slot values - all are valid. + for _ in 0..5 { + builder.append(true); + } + + assert_eq!(15, builder.len()); + + arr = builder.finish(); + + assert_eq!(15, arr.len()); + assert_eq!(0, builder.len()); + } + + #[test] + fn test_struct_array_builder_from_schema() { + let mut fields = vec![ + Field::new("f1", DataType::Float32, false), + Field::new("f2", DataType::Utf8, false), + ]; + let sub_fields = vec![ + Field::new("g1", DataType::Int32, false), + Field::new("g2", DataType::Boolean, false), + ]; + let struct_type = DataType::Struct(sub_fields.into()); + fields.push(Field::new("f3", struct_type, false)); + + let mut builder = StructBuilder::from_fields(fields, 5); + assert_eq!(3, builder.num_fields()); + assert!(builder.field_builder::(0).is_some()); + assert!(builder.field_builder::(1).is_some()); + assert!(builder.field_builder::(2).is_some()); + } + + #[test] + fn test_datatype_properties() { + let fields = Fields::from(vec![ + Field::new("f1", DataType::Decimal128(1, 2), false), + Field::new( + "f2", + DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())), + false, + ), + ]); + let mut builder = StructBuilder::from_fields(fields.clone(), 1); + builder + .field_builder::(0) + .unwrap() + .append_value(1); + builder + .field_builder::(1) + .unwrap() + .append_value(1); + builder.append(true); + let array = builder.finish(); + + assert_eq!(array.data_type(), &DataType::Struct(fields.clone())); + assert_eq!(array.column(0).data_type(), fields[0].data_type()); + assert_eq!(array.column(1).data_type(), fields[1].data_type()); + } + + #[test] + fn 
test_struct_array_builder_from_dictionary_type_int8_key() { + test_struct_array_builder_from_dictionary_type_inner::(DataType::Int8); + } + + #[test] + fn test_struct_array_builder_from_dictionary_type_int16_key() { + test_struct_array_builder_from_dictionary_type_inner::(DataType::Int16); + } + + #[test] + fn test_struct_array_builder_from_dictionary_type_int32_key() { + test_struct_array_builder_from_dictionary_type_inner::(DataType::Int32); + } + + #[test] + fn test_struct_array_builder_from_dictionary_type_int64_key() { + test_struct_array_builder_from_dictionary_type_inner::(DataType::Int64); + } + + fn test_struct_array_builder_from_dictionary_type_inner( + key_type: DataType, + ) { + let dict_field = Field::new( + "f1", + DataType::Dictionary(Box::new(key_type), Box::new(DataType::Utf8)), + false, + ); + let fields = vec![dict_field.clone()]; + let expected_dtype = DataType::Struct(fields.into()); + let cloned_dict_field = dict_field.clone(); + let expected_child_dtype = dict_field.data_type(); + let mut struct_builder = StructBuilder::from_fields(vec![cloned_dict_field], 5); + let Some(dict_builder) = struct_builder.field_builder::>(0) + else { + panic!( + "Builder should be StringDictionaryBuilder<{}>", + type_name::() + ) + }; + dict_builder.append_value("dict string"); + struct_builder.append(true); + let array = struct_builder.finish(); + + assert_eq!(array.data_type(), &expected_dtype); + assert_eq!(array.column(0).data_type(), expected_child_dtype); + assert_eq!(array.column(0).len(), 1); + } + + #[test] + #[should_panic( + expected = "Data type Dictionary(UInt64, Utf8) with key type UInt64 is not currently supported" + )] + fn test_struct_array_builder_from_schema_unsupported_type() { + let fields = vec![ + Field::new("f1", DataType::UInt64, false), + Field::new( + "f2", + DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), + false, + ), + ]; + + let _ = StructBuilder::from_fields(fields, 5); + } + + #[test] + 
#[should_panic(expected = "Dictionary value type Int32 is not currently supported")] + fn test_struct_array_builder_from_dict_with_unsupported_value_type() { + let fields = vec![Field::new( + "f1", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Int32)), + false, + )]; + + let _ = StructBuilder::from_fields(fields, 5); + } + + #[test] + fn test_struct_array_builder_field_builder_type_mismatch() { + let int_builder = Int32Builder::with_capacity(10); + + let fields = vec![Field::new("f1", DataType::Int32, false)]; + let field_builders = vec![Box::new(int_builder) as Box]; + + let mut builder = StructBuilder::new(fields, field_builders); + assert!(builder.field_builder::(0).is_none()); + } + + #[test] + #[should_panic( + expected = "StructBuilder (Schema { fields: [Field { name: \"f1\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"f2\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }) and field_builder with index 1 (Boolean) are of unequal lengths: (2 != 1)." 
+ )] + fn test_struct_array_builder_unequal_field_builders_lengths() { + let mut int_builder = Int32Builder::with_capacity(10); + let mut bool_builder = BooleanBuilder::new(); + + int_builder.append_value(1); + int_builder.append_value(2); + bool_builder.append_value(true); + + let fields = vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Boolean, false), + ]; + let field_builders = vec![ + Box::new(int_builder) as Box, + Box::new(bool_builder) as Box, + ]; + + let mut builder = StructBuilder::new(fields, field_builders); + builder.append(true); + builder.append(true); + builder.finish(); + } + + #[test] + #[should_panic(expected = "Number of fields is not equal to the number of field_builders.")] + fn test_struct_array_builder_unequal_field_field_builders() { + let int_builder = Int32Builder::with_capacity(10); + + let fields = vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Boolean, false), + ]; + let field_builders = vec![Box::new(int_builder) as Box]; + + let mut builder = StructBuilder::new(fields, field_builders); + builder.finish(); + } + + #[test] + #[should_panic( + expected = "Incorrect datatype for StructArray field \\\"timestamp\\\", expected Timestamp(Nanosecond, Some(\\\"UTC\\\")) got Timestamp(Nanosecond, None)" + )] + fn test_struct_array_mismatch_builder() { + let fields = vec![Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_owned().into())), + false, + )]; + + let field_builders: Vec> = + vec![Box::new(TimestampNanosecondBuilder::new())]; + + let mut sa = StructBuilder::new(fields, field_builders); + sa.finish(); + } + + #[test] + fn test_empty() { + let mut builder = StructBuilder::new(Fields::empty(), vec![]); + builder.append(true); + builder.append(false); + + let a1 = builder.finish_cloned(); + let a2 = builder.finish(); + assert_eq!(a1, a2); + assert_eq!(a1.len(), 2); + assert_eq!(a1.null_count(), 1); + assert!(a1.is_valid(0)); + 
assert!(a1.is_null(1)); + } +} diff --git a/arrow-array/src/builder2/union_builder.rs b/arrow-array/src/builder2/union_builder.rs new file mode 100644 index 000000000000..0ddc38000899 --- /dev/null +++ b/arrow-array/src/builder2/union_builder.rs @@ -0,0 +1,313 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::builder2::buffer_builder::{Int32BufferBuilder, Int8BufferBuilder}; +use crate::builder2::BufferBuilder; +use crate::{make_array, ArrowPrimitiveType, UnionArray}; +use arrow_buffer::NullBufferBuilder; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_data::ArrayDataBuilder; +use arrow_schema::{ArrowError, DataType, Field}; +use std::any::Any; +use std::collections::BTreeMap; +use std::sync::Arc; + +/// `FieldData` is a helper struct to track the state of the fields in the `UnionBuilder`. 
+#[derive(Debug)] +struct FieldData { + /// The type id for this field + type_id: i8, + /// The Arrow data type represented in the `values_buffer`, which is untyped + data_type: DataType, + /// A buffer containing the values for this field in raw bytes + values_buffer: Box, + /// The number of array slots represented by the buffer + slots: usize, + /// A builder for the null bitmap + null_buffer_builder: NullBufferBuilder, +} + +/// A type-erased [`BufferBuilder`] used by [`FieldData`] +trait FieldDataValues: std::fmt::Debug { + fn as_mut_any(&mut self) -> &mut dyn Any; + + fn append_null(&mut self); + + fn finish(&mut self) -> Buffer; +} + +impl FieldDataValues for BufferBuilder { + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn append_null(&mut self) { + self.advance(1) + } + + fn finish(&mut self) -> Buffer { + self.finish() + } +} + +impl FieldData { + /// Creates a new `FieldData`. + fn new(type_id: i8, data_type: DataType, capacity: usize) -> Self { + Self { + type_id, + data_type, + slots: 0, + values_buffer: Box::new(BufferBuilder::::new(capacity)), + null_buffer_builder: NullBufferBuilder::new(capacity), + } + } + + /// Appends a single value to this `FieldData`'s `values_buffer`. + fn append_value(&mut self, v: T::Native) { + self.values_buffer + .as_mut_any() + .downcast_mut::>() + .expect("Tried to append unexpected type") + .append(v); + + self.null_buffer_builder.append(true); + self.slots += 1; + } + + /// Appends a null to this `FieldData`. 
+ fn append_null(&mut self) { + self.values_buffer.append_null(); + self.null_buffer_builder.append(false); + self.slots += 1; + } +} + +/// Builder for [`UnionArray`] +/// +/// Example: **Dense Memory Layout** +/// +/// ``` +/// # use arrow_array::builder::UnionBuilder; +/// # use arrow_array::types::{Float64Type, Int32Type}; +/// +/// let mut builder = UnionBuilder::new_dense(); +/// builder.append::("a", 1).unwrap(); +/// builder.append::("b", 3.0).unwrap(); +/// builder.append::("a", 4).unwrap(); +/// let union = builder.build().unwrap(); +/// +/// assert_eq!(union.type_id(0), 0); +/// assert_eq!(union.type_id(1), 1); +/// assert_eq!(union.type_id(2), 0); +/// +/// assert_eq!(union.value_offset(0), 0); +/// assert_eq!(union.value_offset(1), 0); +/// assert_eq!(union.value_offset(2), 1); +/// ``` +/// +/// Example: **Sparse Memory Layout** +/// ``` +/// # use arrow_array::builder::UnionBuilder; +/// # use arrow_array::types::{Float64Type, Int32Type}; +/// +/// let mut builder = UnionBuilder::new_sparse(); +/// builder.append::("a", 1).unwrap(); +/// builder.append::("b", 3.0).unwrap(); +/// builder.append::("a", 4).unwrap(); +/// let union = builder.build().unwrap(); +/// +/// assert_eq!(union.type_id(0), 0); +/// assert_eq!(union.type_id(1), 1); +/// assert_eq!(union.type_id(2), 0); +/// +/// assert_eq!(union.value_offset(0), 0); +/// assert_eq!(union.value_offset(1), 1); +/// assert_eq!(union.value_offset(2), 2); +/// ``` +#[derive(Debug)] +pub struct UnionBuilder { + /// The current number of slots in the array + len: usize, + /// Maps field names to `FieldData` instances which track the builders for that field + fields: BTreeMap, + /// Builder to keep track of type ids + type_id_builder: Int8BufferBuilder, + /// Builder to keep track of offsets (`None` for sparse unions) + value_offset_builder: Option, + initial_capacity: usize, +} + +impl UnionBuilder { + /// Creates a new dense array builder. 
+ pub fn new_dense() -> Self { + Self::with_capacity_dense(1024) + } + + /// Creates a new sparse array builder. + pub fn new_sparse() -> Self { + Self::with_capacity_sparse(1024) + } + + /// Creates a new dense array builder with capacity. + pub fn with_capacity_dense(capacity: usize) -> Self { + Self { + len: 0, + fields: Default::default(), + type_id_builder: Int8BufferBuilder::new(capacity), + value_offset_builder: Some(Int32BufferBuilder::new(capacity)), + initial_capacity: capacity, + } + } + + /// Creates a new sparse array builder with capacity. + pub fn with_capacity_sparse(capacity: usize) -> Self { + Self { + len: 0, + fields: Default::default(), + type_id_builder: Int8BufferBuilder::new(capacity), + value_offset_builder: None, + initial_capacity: capacity, + } + } + + /// Appends a null to this builder, encoding the null in the array + /// of the `type_name` child / field. + /// + /// Since `UnionArray` encodes nulls as an entry in its children + /// (it doesn't have a validity bitmap itself), and where the null + /// is part of the final array, appending a NULL requires + /// specifying which field (child) to use. + #[inline] + pub fn append_null( + &mut self, + type_name: &str, + ) -> Result<(), ArrowError> { + self.append_option::(type_name, None) + } + + /// Appends a value to this builder. 
+ #[inline] + pub fn append( + &mut self, + type_name: &str, + v: T::Native, + ) -> Result<(), ArrowError> { + self.append_option::(type_name, Some(v)) + } + + fn append_option( + &mut self, + type_name: &str, + v: Option, + ) -> Result<(), ArrowError> { + let type_name = type_name.to_string(); + + let mut field_data = match self.fields.remove(&type_name) { + Some(data) => { + if data.data_type != T::DATA_TYPE { + return Err(ArrowError::InvalidArgumentError(format!( + "Attempt to write col \"{}\" with type {} doesn't match existing type {}", + type_name, + T::DATA_TYPE, + data.data_type + ))); + } + data + } + None => match self.value_offset_builder { + Some(_) => FieldData::new::( + self.fields.len() as i8, + T::DATA_TYPE, + self.initial_capacity, + ), + // In the case of a sparse union, we should pass the maximum of the currently length and the capacity. + None => { + let mut fd = FieldData::new::( + self.fields.len() as i8, + T::DATA_TYPE, + self.len.max(self.initial_capacity), + ); + for _ in 0..self.len { + fd.append_null(); + } + fd + } + }, + }; + self.type_id_builder.append(field_data.type_id); + + match &mut self.value_offset_builder { + // Dense Union + Some(offset_builder) => { + offset_builder.append(field_data.slots as i32); + } + // Sparse Union + None => { + for (_, fd) in self.fields.iter_mut() { + // Append to all bar the FieldData currently being appended to + fd.append_null(); + } + } + } + + match v { + Some(v) => field_data.append_value::(v), + None => field_data.append_null(), + } + + self.fields.insert(type_name, field_data); + self.len += 1; + Ok(()) + } + + /// Builds this builder creating a new `UnionArray`. 
+ pub fn build(self) -> Result { + let mut children = Vec::with_capacity(self.fields.len()); + let union_fields = self + .fields + .into_iter() + .map( + |( + name, + FieldData { + type_id, + data_type, + mut values_buffer, + slots, + mut null_buffer_builder, + }, + )| { + let array_ref = make_array(unsafe { + ArrayDataBuilder::new(data_type.clone()) + .add_buffer(values_buffer.finish()) + .len(slots) + .nulls(null_buffer_builder.finish()) + .build_unchecked() + }); + children.push(array_ref); + (type_id, Arc::new(Field::new(name, data_type, false))) + }, + ) + .collect(); + UnionArray::try_new( + union_fields, + self.type_id_builder.into(), + self.value_offset_builder.map(Into::into), + children, + ) + } +} diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 0fc9d30ab6e3..3e8ac2302c48 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -254,6 +254,7 @@ pub mod temporal_conversions; pub mod timezone; mod trusted_len; pub mod types; +pub mod builder2; #[cfg(test)] mod tests { From d0e5c32f0db6e461fe25da421c3a12cf8310fc29 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 9 Dec 2024 19:31:46 +0200 Subject: [PATCH 2/5] add initial SpecificArrayBuilder --- arrow-array/src/builder2/mod.rs | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/arrow-array/src/builder2/mod.rs b/arrow-array/src/builder2/mod.rs index 89a96280eb87..fd5ef99a5776 100644 --- a/arrow-array/src/builder2/mod.rs +++ b/arrow-array/src/builder2/mod.rs @@ -184,7 +184,7 @@ mod union_builder; pub use union_builder::*; -use crate::ArrayRef; +use crate::{Array, ArrayAccessor, ArrayRef}; use std::any::Any; /// Trait for dealing with different array builders at runtime @@ -235,7 +235,9 @@ use std::any::Any; /// "🍎" /// ); /// ``` -pub trait ArrayBuilder: Any + Send + Sync { +pub trait SpecificArrayBuilder: Any + Send + Sync { + type Output: Array + ArrayAccessor; + /// Returns the number of array 
slots in the builder fn len(&self) -> usize; @@ -245,10 +247,10 @@ pub trait ArrayBuilder: Any + Send + Sync { } /// Builds the array - fn finish(&mut self) -> ArrayRef; + fn finish(&mut self) -> Self::Output; /// Builds the array without resetting the underlying builder. - fn finish_cloned(&self) -> ArrayRef; + fn finish_cloned(&self) -> Self::Output; /// Returns the builder as a non-mutable `Any` reference. /// @@ -266,9 +268,14 @@ pub trait ArrayBuilder: Any + Send + Sync { /// Returns the boxed builder as a box of `Any`. fn into_box_any(self: Box) -> Box; + + // Append a value to the builder + fn append_value(&mut self, value: ::Item); } -impl ArrayBuilder for Box { +impl SpecificArrayBuilder for Box> { + type Output = T; + fn len(&self) -> usize { (**self).len() } @@ -296,6 +303,10 @@ impl ArrayBuilder for Box { fn into_box_any(self: Box) -> Box { self } + + fn append_value(&mut self, value: ::Item) { + (**self).append_value(value) + } } /// Builder for [`ListArray`](crate::array::ListArray) From c6e88cdc3df44da1fb47915704d659c2c6ec8f60 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 9 Dec 2024 23:05:15 +0200 Subject: [PATCH 3/5] tmp --- arrow-array/src/builder2/boolean_builder.rs | 35 +- arrow-array/src/builder2/buffer_builder.rs | 225 ----- .../src/builder2/fixed_size_binary_builder.rs | 25 +- .../src/builder2/fixed_size_list_builder.rs | 50 +- .../src/builder2/generic_byte_run_builder.rs | 517 ----------- .../src/builder2/generic_bytes_builder.rs | 21 +- .../generic_bytes_dictionary_builder.rs | 667 -------------- .../builder2/generic_bytes_view_builder.rs | 20 +- .../src/builder2/generic_list_builder.rs | 219 +---- arrow-array/src/builder2/map_builder.rs | 380 -------- arrow-array/src/builder2/mod.rs | 135 +-- arrow-array/src/builder2/null_builder.rs | 182 ---- arrow-array/src/builder2/primitive_builder.rs | 29 +- .../builder2/primitive_dictionary_builder.rs | 446 --------- 
.../src/builder2/primitive_run_builder.rs | 313 ------- arrow-array/src/builder2/struct_builder.rs | 872 ------------------ arrow-array/src/builder2/union_builder.rs | 313 ------- 17 files changed, 233 insertions(+), 4216 deletions(-) delete mode 100644 arrow-array/src/builder2/buffer_builder.rs delete mode 100644 arrow-array/src/builder2/generic_byte_run_builder.rs delete mode 100644 arrow-array/src/builder2/generic_bytes_dictionary_builder.rs delete mode 100644 arrow-array/src/builder2/map_builder.rs delete mode 100644 arrow-array/src/builder2/null_builder.rs delete mode 100644 arrow-array/src/builder2/primitive_dictionary_builder.rs delete mode 100644 arrow-array/src/builder2/primitive_run_builder.rs delete mode 100644 arrow-array/src/builder2/struct_builder.rs delete mode 100644 arrow-array/src/builder2/union_builder.rs diff --git a/arrow-array/src/builder2/boolean_builder.rs b/arrow-array/src/builder2/boolean_builder.rs index cdc00d03f26a..52ec4661d797 100644 --- a/arrow-array/src/builder2/boolean_builder.rs +++ b/arrow-array/src/builder2/boolean_builder.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::builder2::{ArrayBuilder, BooleanBufferBuilder}; -use crate::{ArrayRef, BooleanArray}; +use crate::builder2::{SpecificArrayBuilder, BooleanBufferBuilder}; +use crate::{ArrayAccessor, ArrayRef, BooleanArray}; use arrow_buffer::Buffer; use arrow_buffer::NullBufferBuilder; use arrow_data::ArrayData; @@ -187,37 +187,6 @@ impl BooleanBuilder { } } -impl ArrayBuilder for BooleanBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. 
- fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.values_builder.len() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. - fn finish_cloned(&self) -> ArrayRef { - Arc::new(self.finish_cloned()) - } -} impl Extend> for BooleanBuilder { #[inline] diff --git a/arrow-array/src/builder2/buffer_builder.rs b/arrow-array/src/builder2/buffer_builder.rs deleted file mode 100644 index ab67669febb8..000000000000 --- a/arrow-array/src/builder2/buffer_builder.rs +++ /dev/null @@ -1,225 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -pub use arrow_buffer::BufferBuilder; -use half::f16; - -use crate::types::*; - -/// Buffer builder for signed 8-bit integer type. -pub type Int8BufferBuilder = BufferBuilder; -/// Buffer builder for signed 16-bit integer type. -pub type Int16BufferBuilder = BufferBuilder; -/// Buffer builder for signed 32-bit integer type. -pub type Int32BufferBuilder = BufferBuilder; -/// Buffer builder for signed 64-bit integer type. 
-pub type Int64BufferBuilder = BufferBuilder; -/// Buffer builder for usigned 8-bit integer type. -pub type UInt8BufferBuilder = BufferBuilder; -/// Buffer builder for usigned 16-bit integer type. -pub type UInt16BufferBuilder = BufferBuilder; -/// Buffer builder for usigned 32-bit integer type. -pub type UInt32BufferBuilder = BufferBuilder; -/// Buffer builder for usigned 64-bit integer type. -pub type UInt64BufferBuilder = BufferBuilder; -/// Buffer builder for 16-bit floating point type. -pub type Float16BufferBuilder = BufferBuilder; -/// Buffer builder for 32-bit floating point type. -pub type Float32BufferBuilder = BufferBuilder; -/// Buffer builder for 64-bit floating point type. -pub type Float64BufferBuilder = BufferBuilder; - -/// Buffer builder for 128-bit decimal type. -pub type Decimal128BufferBuilder = BufferBuilder<::Native>; -/// Buffer builder for 256-bit decimal type. -pub type Decimal256BufferBuilder = BufferBuilder<::Native>; - -/// Buffer builder for timestamp type of second unit. -pub type TimestampSecondBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder for timestamp type of millisecond unit. -pub type TimestampMillisecondBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder for timestamp type of microsecond unit. -pub type TimestampMicrosecondBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder for timestamp type of nanosecond unit. -pub type TimestampNanosecondBufferBuilder = - BufferBuilder<::Native>; - -/// Buffer builder for 32-bit date type. -pub type Date32BufferBuilder = BufferBuilder<::Native>; -/// Buffer builder for 64-bit date type. -pub type Date64BufferBuilder = BufferBuilder<::Native>; - -/// Buffer builder for 32-bit elaspsed time since midnight of second unit. -pub type Time32SecondBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder for 32-bit elaspsed time since midnight of millisecond unit. 
-pub type Time32MillisecondBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder for 64-bit elaspsed time since midnight of microsecond unit. -pub type Time64MicrosecondBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder for 64-bit elaspsed time since midnight of nanosecond unit. -pub type Time64NanosecondBufferBuilder = - BufferBuilder<::Native>; - -/// Buffer builder for “calendar” interval in months. -pub type IntervalYearMonthBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder for “calendar” interval in days and milliseconds. -pub type IntervalDayTimeBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder “calendar” interval in months, days, and nanoseconds. -pub type IntervalMonthDayNanoBufferBuilder = - BufferBuilder<::Native>; - -/// Buffer builder for elaspsed time of second unit. -pub type DurationSecondBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder for elaspsed time of milliseconds unit. -pub type DurationMillisecondBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder for elaspsed time of microseconds unit. -pub type DurationMicrosecondBufferBuilder = - BufferBuilder<::Native>; -/// Buffer builder for elaspsed time of nanoseconds unit. 
-pub type DurationNanosecondBufferBuilder = - BufferBuilder<::Native>; - -#[cfg(test)] -mod tests { - use crate::builder::{ArrayBuilder, Int32BufferBuilder, Int8Builder, UInt8BufferBuilder}; - use crate::Array; - - #[test] - fn test_builder_i32_empty() { - let mut b = Int32BufferBuilder::new(5); - assert_eq!(0, b.len()); - assert_eq!(16, b.capacity()); - let a = b.finish(); - assert_eq!(0, a.len()); - } - - #[test] - fn test_builder_i32_alloc_zero_bytes() { - let mut b = Int32BufferBuilder::new(0); - b.append(123); - let a = b.finish(); - assert_eq!(4, a.len()); - } - - #[test] - fn test_builder_i32() { - let mut b = Int32BufferBuilder::new(5); - for i in 0..5 { - b.append(i); - } - assert_eq!(16, b.capacity()); - let a = b.finish(); - assert_eq!(20, a.len()); - } - - #[test] - fn test_builder_i32_grow_buffer() { - let mut b = Int32BufferBuilder::new(2); - assert_eq!(16, b.capacity()); - for i in 0..20 { - b.append(i); - } - assert_eq!(32, b.capacity()); - let a = b.finish(); - assert_eq!(80, a.len()); - } - - #[test] - fn test_builder_finish() { - let mut b = Int32BufferBuilder::new(5); - assert_eq!(16, b.capacity()); - for i in 0..10 { - b.append(i); - } - let mut a = b.finish(); - assert_eq!(40, a.len()); - assert_eq!(0, b.len()); - assert_eq!(0, b.capacity()); - - // Try build another buffer after cleaning up. 
- for i in 0..20 { - b.append(i) - } - assert_eq!(32, b.capacity()); - a = b.finish(); - assert_eq!(80, a.len()); - } - - #[test] - fn test_reserve() { - let mut b = UInt8BufferBuilder::new(2); - assert_eq!(64, b.capacity()); - b.reserve(64); - assert_eq!(64, b.capacity()); - b.reserve(65); - assert_eq!(128, b.capacity()); - - let mut b = Int32BufferBuilder::new(2); - assert_eq!(16, b.capacity()); - b.reserve(16); - assert_eq!(16, b.capacity()); - b.reserve(17); - assert_eq!(32, b.capacity()); - } - - #[test] - fn test_append_slice() { - let mut b = UInt8BufferBuilder::new(0); - b.append_slice(b"Hello, "); - b.append_slice(b"World!"); - let buffer = b.finish(); - assert_eq!(13, buffer.len()); - - let mut b = Int32BufferBuilder::new(0); - b.append_slice(&[32, 54]); - let buffer = b.finish(); - assert_eq!(8, buffer.len()); - } - - #[test] - fn test_append_values() { - let mut a = Int8Builder::new(); - a.append_value(1); - a.append_null(); - a.append_value(-2); - assert_eq!(a.len(), 3); - - // append values - let values = &[1, 2, 3, 4]; - let is_valid = &[true, true, false, true]; - a.append_values(values, is_valid); - - assert_eq!(a.len(), 7); - let array = a.finish(); - assert_eq!(array.value(0), 1); - assert!(array.is_null(1)); - assert_eq!(array.value(2), -2); - assert_eq!(array.value(3), 1); - assert_eq!(array.value(4), 2); - assert!(array.is_null(5)); - assert_eq!(array.value(6), 4); - } -} diff --git a/arrow-array/src/builder2/fixed_size_binary_builder.rs b/arrow-array/src/builder2/fixed_size_binary_builder.rs index ae466bd6b4c8..6c20e5562b2a 100644 --- a/arrow-array/src/builder2/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder2/fixed_size_binary_builder.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder2::{ArrayBuilder, UInt8BufferBuilder}; -use crate::{ArrayRef, FixedSizeBinaryArray}; +use crate::builder2::{SpecificArrayBuilder}; +use crate::builder::{UInt8BufferBuilder}; +use crate::{ArrayAccessor, ArrayRef, FixedSizeBinaryArray}; use arrow_buffer::Buffer; use arrow_buffer::NullBufferBuilder; use arrow_data::ArrayData; @@ -122,7 +123,9 @@ impl FixedSizeBinaryBuilder { } } -impl ArrayBuilder for FixedSizeBinaryBuilder { +impl SpecificArrayBuilder for FixedSizeBinaryBuilder { + type Output = FixedSizeBinaryArray; + /// Returns the builder as a non-mutable `Any` reference. fn as_any(&self) -> &dyn Any { self @@ -144,14 +147,26 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { } /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { + fn finish(&mut self) -> Arc { Arc::new(self.finish()) } /// Builds the array without resetting the builder. - fn finish_cloned(&self) -> ArrayRef { + fn finish_cloned(&self) -> Arc { Arc::new(self.finish_cloned()) } + + fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { + // TODO - should panic? + // should we document it or return a Result? + self.append_value(value).expect("append value failed"); + } + + fn append_null(&mut self) { + self.append_null(); + } + + // TODO - implement append nulls with better performance? } #[cfg(test)] diff --git a/arrow-array/src/builder2/fixed_size_list_builder.rs b/arrow-array/src/builder2/fixed_size_list_builder.rs index c5c7c6449592..f01c18b16c72 100644 --- a/arrow-array/src/builder2/fixed_size_list_builder.rs +++ b/arrow-array/src/builder2/fixed_size_list_builder.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder2::ArrayBuilder; -use crate::{ArrayRef, FixedSizeListArray}; +use crate::builder2::SpecificArrayBuilder; +use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray}; use arrow_buffer::NullBufferBuilder; use arrow_schema::{Field, FieldRef}; use std::any::Any; @@ -62,14 +62,14 @@ use std::sync::Arc; /// ``` /// #[derive(Debug)] -pub struct FixedSizeListBuilder { +pub struct FixedSizeListBuilder where for<'a> &'a ::Output: ArrayAccessor { null_buffer_builder: NullBufferBuilder, values_builder: T, list_len: i32, field: Option, } -impl FixedSizeListBuilder { +impl FixedSizeListBuilder where for<'a> &'a ::Output: ArrayAccessor { /// Creates a new [`FixedSizeListBuilder`] from a given values array builder /// `value_length` is the number of values within each array pub fn new(values_builder: T, value_length: i32) -> Self { @@ -107,10 +107,14 @@ impl FixedSizeListBuilder { } } -impl ArrayBuilder for FixedSizeListBuilder +impl SpecificArrayBuilder for FixedSizeListBuilder where - T: 'static, + ValuesOutput: Array, + T: 'static + SpecificArrayBuilder, + for<'a> &'a ValuesOutput: ArrayAccessor { + type Output = FixedSizeListArray; + /// Returns the builder as a non-mutable `Any` reference. fn as_any(&self) -> &dyn Any { self @@ -132,19 +136,31 @@ where } /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { + fn finish(&mut self) -> Arc { Arc::new(self.finish()) } /// Builds the array without resetting the builder. 
- fn finish_cloned(&self) -> ArrayRef { + fn finish_cloned(&self) -> Arc { Arc::new(self.finish_cloned()) } + + fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { + // our item is their output + self.values_builder.append_output(value.as_any().downcast_ref::().unwrap()); + self.append(true); + } + + fn append_null(&mut self) { + // TODO - make sure we should append nulls to the values builder + self.values_builder.append_nulls(self.list_len as usize); + self.append(false); + } } -impl FixedSizeListBuilder +impl FixedSizeListBuilder where - T: 'static, + T: 'static, for<'a> &'a ::Output: ArrayAccessor { /// Returns the child array builder as a mutable reference. /// @@ -220,14 +236,14 @@ mod tests { use super::*; use arrow_schema::DataType; - use crate::builder::Int32Builder; + use crate::builder2::{Int32Builder, PrimitiveBuilder}; use crate::Array; use crate::Int32Array; fn make_list_builder( include_null_element: bool, include_null_in_values: bool, - ) -> FixedSizeListBuilder> { + ) -> FixedSizeListBuilder> { let values_builder = Int32Builder::new(); let mut builder = FixedSizeListBuilder::new(values_builder, 3); @@ -415,7 +431,7 @@ mod tests { #[test] fn test_fixed_size_list_array_builder_with_field_empty() { - let values_builder = Int32Array::builder(0); + let values_builder = Int32Array::builder2(0); let mut builder = FixedSizeListBuilder::new(values_builder, 3).with_field(Field::new( "list_item", DataType::Int32, @@ -429,7 +445,7 @@ mod tests { #[test] fn test_fixed_size_list_array_builder_cloned_with_field_empty() { - let values_builder = Int32Array::builder(0); + let values_builder = Int32Array::builder2(0); let builder = FixedSizeListBuilder::new(values_builder, 3).with_field(Field::new( "list_item", DataType::Int32, @@ -443,7 +459,7 @@ mod tests { #[test] fn test_fixed_size_list_array_builder_empty() { - let values_builder = Int32Array::builder(5); + let values_builder = Int32Array::builder2(5); let mut builder = 
FixedSizeListBuilder::new(values_builder, 3); assert!(builder.is_empty()); let arr = builder.finish(); @@ -453,7 +469,7 @@ mod tests { #[test] fn test_fixed_size_list_array_builder_finish() { - let values_builder = Int32Array::builder(5); + let values_builder = Int32Array::builder2(5); let mut builder = FixedSizeListBuilder::new(values_builder, 3); builder.values().append_slice(&[1, 2, 3]); @@ -477,7 +493,7 @@ mod tests { expected = "Length of the child array (10) must be the multiple of the value length (3) and the array length (3)." )] fn test_fixed_size_list_array_builder_fail() { - let values_builder = Int32Array::builder(5); + let values_builder = Int32Array::builder2(5); let mut builder = FixedSizeListBuilder::new(values_builder, 3); builder.values().append_slice(&[1, 2, 3]); diff --git a/arrow-array/src/builder2/generic_byte_run_builder.rs b/arrow-array/src/builder2/generic_byte_run_builder.rs deleted file mode 100644 index 0bf5658b297e..000000000000 --- a/arrow-array/src/builder2/generic_byte_run_builder.rs +++ /dev/null @@ -1,517 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::types::bytes::ByteArrayNativeType; -use std::{any::Any, sync::Arc}; - -use crate::{ - types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type}, - ArrayRef, ArrowPrimitiveType, RunArray, -}; - -use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; - -use arrow_buffer::ArrowNativeType; - -/// Builder for [`RunArray`] of [`GenericByteArray`](crate::array::GenericByteArray) -/// -/// # Example: -/// -/// ``` -/// -/// # use arrow_array::builder::GenericByteRunBuilder; -/// # use arrow_array::{GenericByteArray, BinaryArray}; -/// # use arrow_array::types::{BinaryType, Int16Type}; -/// # use arrow_array::{Array, Int16Array}; -/// # use arrow_array::cast::AsArray; -/// -/// let mut builder = -/// GenericByteRunBuilder::::new(); -/// builder.extend([Some(b"abc"), Some(b"abc"), None, Some(b"def")].into_iter()); -/// builder.append_value(b"def"); -/// builder.append_null(); -/// let array = builder.finish(); -/// -/// assert_eq!(array.run_ends().values(), &[2, 3, 5, 6]); -/// -/// let av = array.values(); -/// -/// assert!(!av.is_null(0)); -/// assert!(av.is_null(1)); -/// assert!(!av.is_null(2)); -/// assert!(av.is_null(3)); -/// -/// // Values are polymorphic and so require a downcast. 
-/// let ava: &BinaryArray = av.as_binary(); -/// -/// assert_eq!(ava.value(0), b"abc"); -/// assert_eq!(ava.value(2), b"def"); -/// ``` -#[derive(Debug)] -pub struct GenericByteRunBuilder -where - R: ArrowPrimitiveType, - V: ByteArrayType, -{ - run_ends_builder: PrimitiveBuilder, - values_builder: GenericByteBuilder, - current_value: Vec, - has_current_value: bool, - current_run_end_index: usize, - prev_run_end_index: usize, -} - -impl Default for GenericByteRunBuilder -where - R: ArrowPrimitiveType, - V: ByteArrayType, -{ - fn default() -> Self { - Self::new() - } -} - -impl GenericByteRunBuilder -where - R: ArrowPrimitiveType, - V: ByteArrayType, -{ - /// Creates a new `GenericByteRunBuilder` - pub fn new() -> Self { - Self { - run_ends_builder: PrimitiveBuilder::new(), - values_builder: GenericByteBuilder::::new(), - current_value: Vec::new(), - has_current_value: false, - current_run_end_index: 0, - prev_run_end_index: 0, - } - } - - /// Creates a new `GenericByteRunBuilder` with the provided capacity - /// - /// `capacity`: the expected number of run-end encoded values. - /// `data_capacity`: the expected number of bytes of run end encoded values - pub fn with_capacity(capacity: usize, data_capacity: usize) -> Self { - Self { - run_ends_builder: PrimitiveBuilder::with_capacity(capacity), - values_builder: GenericByteBuilder::::with_capacity(capacity, data_capacity), - current_value: Vec::new(), - has_current_value: false, - current_run_end_index: 0, - prev_run_end_index: 0, - } - } -} - -impl ArrayBuilder for GenericByteRunBuilder -where - R: RunEndIndexType, - V: ByteArrayType, -{ - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. 
- fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the length of logical array encoded by - /// the eventual runs array. - fn len(&self) -> usize { - self.current_run_end_index - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. - fn finish_cloned(&self) -> ArrayRef { - Arc::new(self.finish_cloned()) - } -} - -impl GenericByteRunBuilder -where - R: RunEndIndexType, - V: ByteArrayType, -{ - /// Appends optional value to the logical array encoded by the RunArray. - pub fn append_option(&mut self, input_value: Option>) { - match input_value { - Some(value) => self.append_value(value), - None => self.append_null(), - } - } - - /// Appends value to the logical array encoded by the RunArray. - pub fn append_value(&mut self, input_value: impl AsRef) { - let value: &[u8] = input_value.as_ref().as_ref(); - if !self.has_current_value { - self.append_run_end(); - self.current_value.extend_from_slice(value); - self.has_current_value = true; - } else if self.current_value.as_slice() != value { - self.append_run_end(); - self.current_value.clear(); - self.current_value.extend_from_slice(value); - } - self.current_run_end_index += 1; - } - - /// Appends null to the logical array encoded by the RunArray. - pub fn append_null(&mut self) { - if self.has_current_value { - self.append_run_end(); - self.current_value.clear(); - self.has_current_value = false; - } - self.current_run_end_index += 1; - } - - /// Creates the RunArray and resets the builder. - /// Panics if RunArray cannot be built. - pub fn finish(&mut self) -> RunArray { - // write the last run end to the array. - self.append_run_end(); - - // reset the run end index to zero. 
- self.current_value.clear(); - self.has_current_value = false; - self.current_run_end_index = 0; - self.prev_run_end_index = 0; - - // build the run encoded array by adding run_ends and values array as its children. - let run_ends_array = self.run_ends_builder.finish(); - let values_array = self.values_builder.finish(); - RunArray::::try_new(&run_ends_array, &values_array).unwrap() - } - - /// Creates the RunArray and without resetting the builder. - /// Panics if RunArray cannot be built. - pub fn finish_cloned(&self) -> RunArray { - let mut run_ends_array = self.run_ends_builder.finish_cloned(); - let mut values_array = self.values_builder.finish_cloned(); - - // Add current run if one exists - if self.prev_run_end_index != self.current_run_end_index { - let mut run_end_builder = run_ends_array.into_builder().unwrap(); - let mut values_builder = values_array.into_builder().unwrap(); - self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder); - run_ends_array = run_end_builder.finish(); - values_array = values_builder.finish(); - } - - RunArray::::try_new(&run_ends_array, &values_array).unwrap() - } - - // Appends the current run to the array. - fn append_run_end(&mut self) { - // empty array or the function called without appending any value. - if self.prev_run_end_index == self.current_run_end_index { - return; - } - let run_end_index = self.run_end_index_as_native(); - self.run_ends_builder.append_value(run_end_index); - if self.has_current_value { - let slice = self.current_value.as_slice(); - let native = unsafe { - // Safety: - // As self.current_value is created from V::Native. The value V::Native can be - // built back from the bytes without validations - V::Native::from_bytes_unchecked(slice) - }; - self.values_builder.append_value(native); - } else { - self.values_builder.append_null(); - } - self.prev_run_end_index = self.current_run_end_index; - } - - // Similar to `append_run_end` but on custom builders. 
- // Used in `finish_cloned` which is not suppose to mutate `self`. - fn append_run_end_with_builders( - &self, - run_ends_builder: &mut PrimitiveBuilder, - values_builder: &mut GenericByteBuilder, - ) { - let run_end_index = self.run_end_index_as_native(); - run_ends_builder.append_value(run_end_index); - if self.has_current_value { - let slice = self.current_value.as_slice(); - let native = unsafe { - // Safety: - // As self.current_value is created from V::Native. The value V::Native can be - // built back from the bytes without validations - V::Native::from_bytes_unchecked(slice) - }; - values_builder.append_value(native); - } else { - values_builder.append_null(); - } - } - - fn run_end_index_as_native(&self) -> R::Native { - R::Native::from_usize(self.current_run_end_index).unwrap_or_else(|| { - panic!( - "Cannot convert the value {} from `usize` to native form of arrow datatype {}", - self.current_run_end_index, - R::DATA_TYPE - ) - }) - } -} - -impl Extend> for GenericByteRunBuilder -where - R: RunEndIndexType, - V: ByteArrayType, - S: AsRef, -{ - fn extend>>(&mut self, iter: T) { - for elem in iter { - self.append_option(elem); - } - } -} - -/// Builder for [`RunArray`] of [`StringArray`](crate::array::StringArray) -/// -/// ``` -/// // Create a run-end encoded array with run-end indexes data type as `i16`. -/// // The encoded values are Strings. -/// -/// # use arrow_array::builder::StringRunBuilder; -/// # use arrow_array::{Int16Array, StringArray}; -/// # use arrow_array::types::Int16Type; -/// # use arrow_array::cast::AsArray; -/// # -/// let mut builder = StringRunBuilder::::new(); -/// -/// // The builder builds the dictionary value by value -/// builder.append_value("abc"); -/// builder.append_null(); -/// builder.extend([Some("def"), Some("def"), Some("abc")]); -/// let array = builder.finish(); -/// -/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]); -/// -/// // Values are polymorphic and so require a downcast. 
-/// let av = array.values(); -/// let ava: &StringArray = av.as_string::(); -/// -/// assert_eq!(ava.value(0), "abc"); -/// assert!(av.is_null(1)); -/// assert_eq!(ava.value(2), "def"); -/// assert_eq!(ava.value(3), "abc"); -/// -/// ``` -pub type StringRunBuilder = GenericByteRunBuilder; - -/// Builder for [`RunArray`] of [`LargeStringArray`](crate::array::LargeStringArray) -pub type LargeStringRunBuilder = GenericByteRunBuilder; - -/// Builder for [`RunArray`] of [`BinaryArray`](crate::array::BinaryArray) -/// -/// ``` -/// // Create a run-end encoded array with run-end indexes data type as `i16`. -/// // The encoded data is binary values. -/// -/// # use arrow_array::builder::BinaryRunBuilder; -/// # use arrow_array::{BinaryArray, Int16Array}; -/// # use arrow_array::cast::AsArray; -/// # use arrow_array::types::Int16Type; -/// -/// let mut builder = BinaryRunBuilder::::new(); -/// -/// // The builder builds the dictionary value by value -/// builder.append_value(b"abc"); -/// builder.append_null(); -/// builder.extend([Some(b"def"), Some(b"def"), Some(b"abc")]); -/// let array = builder.finish(); -/// -/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]); -/// -/// // Values are polymorphic and so require a downcast. 
-/// let av = array.values(); -/// let ava: &BinaryArray = av.as_binary(); -/// -/// assert_eq!(ava.value(0), b"abc"); -/// assert!(av.is_null(1)); -/// assert_eq!(ava.value(2), b"def"); -/// assert_eq!(ava.value(3), b"abc"); -/// -/// ``` -pub type BinaryRunBuilder = GenericByteRunBuilder; - -/// Builder for [`RunArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray) -pub type LargeBinaryRunBuilder = GenericByteRunBuilder; - -#[cfg(test)] -mod tests { - use super::*; - - use crate::array::Array; - use crate::cast::AsArray; - use crate::types::{Int16Type, Int32Type}; - use crate::GenericByteArray; - use crate::Int16RunArray; - - fn test_bytes_run_builder(values: Vec<&T::Native>) - where - T: ByteArrayType, - ::Native: PartialEq, - ::Native: AsRef<::Native>, - { - let mut builder = GenericByteRunBuilder::::new(); - builder.append_value(values[0]); - builder.append_value(values[0]); - builder.append_value(values[0]); - builder.append_null(); - builder.append_null(); - builder.append_value(values[1]); - builder.append_value(values[1]); - builder.append_value(values[2]); - builder.append_value(values[2]); - builder.append_value(values[2]); - builder.append_value(values[2]); - let array = builder.finish(); - - assert_eq!(array.len(), 11); - assert_eq!(array.null_count(), 0); - assert_eq!(array.logical_null_count(), 2); - - assert_eq!(array.run_ends().values(), &[3, 5, 7, 11]); - - // Values are polymorphic and so require a downcast. 
- let av = array.values(); - let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); - - assert_eq!(*ava.value(0), *values[0]); - assert!(ava.is_null(1)); - assert_eq!(*ava.value(2), *values[1]); - assert_eq!(*ava.value(3), *values[2]); - } - - #[test] - fn test_string_run_builder() { - test_bytes_run_builder::(vec!["abc", "def", "ghi"]); - } - - #[test] - fn test_string_run_builder_with_empty_strings() { - test_bytes_run_builder::(vec!["abc", "", "ghi"]); - } - - #[test] - fn test_binary_run_builder() { - test_bytes_run_builder::(vec![b"abc", b"def", b"ghi"]); - } - - fn test_bytes_run_builder_finish_cloned(values: Vec<&T::Native>) - where - T: ByteArrayType, - ::Native: PartialEq, - ::Native: AsRef<::Native>, - { - let mut builder = GenericByteRunBuilder::::new(); - - builder.append_value(values[0]); - builder.append_null(); - builder.append_value(values[1]); - builder.append_value(values[1]); - builder.append_value(values[0]); - let mut array: Int16RunArray = builder.finish_cloned(); - - assert_eq!(array.len(), 5); - assert_eq!(array.null_count(), 0); - assert_eq!(array.logical_null_count(), 1); - - assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]); - - // Values are polymorphic and so require a downcast. - let av = array.values(); - let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); - - assert_eq!(ava.value(0), values[0]); - assert!(ava.is_null(1)); - assert_eq!(ava.value(2), values[1]); - assert_eq!(ava.value(3), values[0]); - - // Append last value before `finish_cloned` (`value[0]`) again and ensure it has only - // one entry in final output. - builder.append_value(values[0]); - builder.append_value(values[0]); - builder.append_value(values[1]); - array = builder.finish(); - - assert_eq!(array.len(), 8); - assert_eq!(array.null_count(), 0); - assert_eq!(array.logical_null_count(), 1); - - assert_eq!(array.run_ends().values(), &[1, 2, 4, 7, 8]); - - // Values are polymorphic and so require a downcast. 
- let av2 = array.values(); - let ava2: &GenericByteArray = - av2.as_any().downcast_ref::>().unwrap(); - - assert_eq!(ava2.value(0), values[0]); - assert!(ava2.is_null(1)); - assert_eq!(ava2.value(2), values[1]); - // The value appended before and after `finish_cloned` has only one entry. - assert_eq!(ava2.value(3), values[0]); - assert_eq!(ava2.value(4), values[1]); - } - - #[test] - fn test_string_run_builder_finish_cloned() { - test_bytes_run_builder_finish_cloned::(vec!["abc", "def", "ghi"]); - } - - #[test] - fn test_binary_run_builder_finish_cloned() { - test_bytes_run_builder_finish_cloned::(vec![b"abc", b"def", b"ghi"]); - } - - #[test] - fn test_extend() { - let mut builder = StringRunBuilder::::new(); - builder.extend(["a", "a", "a", "", "", "b", "b"].into_iter().map(Some)); - builder.extend(["b", "cupcakes", "cupcakes"].into_iter().map(Some)); - let array = builder.finish(); - - assert_eq!(array.len(), 10); - assert_eq!(array.run_ends().values(), &[3, 5, 8, 10]); - - let str_array = array.values().as_string::(); - assert_eq!(str_array.value(0), "a"); - assert_eq!(str_array.value(1), ""); - assert_eq!(str_array.value(2), "b"); - assert_eq!(str_array.value(3), "cupcakes"); - } -} diff --git a/arrow-array/src/builder2/generic_bytes_builder.rs b/arrow-array/src/builder2/generic_bytes_builder.rs index 6b6c1a842350..0b07c43ac7d7 100644 --- a/arrow-array/src/builder2/generic_bytes_builder.rs +++ b/arrow-array/src/builder2/generic_bytes_builder.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder2::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder}; +use crate::builder2::{SpecificArrayBuilder}; +use crate::builder::{BufferBuilder, UInt8BufferBuilder}; use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; -use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait}; +use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait}; use arrow_buffer::NullBufferBuilder; use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::ArrayDataBuilder; @@ -196,19 +197,21 @@ impl Default for GenericByteBuilder { } } -impl ArrayBuilder for GenericByteBuilder { +impl SpecificArrayBuilder for GenericByteBuilder { + type Output = GenericByteArray; + /// Returns the number of binary slots in the builder fn len(&self) -> usize { self.null_buffer_builder.len() } /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { + fn finish(&mut self) -> Arc { Arc::new(self.finish()) } /// Builds the array without resetting the builder. - fn finish_cloned(&self) -> ArrayRef { + fn finish_cloned(&self) -> Arc { Arc::new(self.finish_cloned()) } @@ -226,6 +229,14 @@ impl ArrayBuilder for GenericByteBuilder { fn into_box_any(self: Box) -> Box { self } + + fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { + self.append_value(value) + } + + fn append_null(&mut self) { + self.append_null() + } } impl> Extend> for GenericByteBuilder { diff --git a/arrow-array/src/builder2/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder2/generic_bytes_dictionary_builder.rs deleted file mode 100644 index 3a1469177488..000000000000 --- a/arrow-array/src/builder2/generic_bytes_dictionary_builder.rs +++ /dev/null @@ -1,667 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder2::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; -use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType}; -use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray}; -use arrow_buffer::ArrowNativeType; -use arrow_schema::{ArrowError, DataType}; -use hashbrown::HashTable; -use std::any::Any; -use std::sync::Arc; - -/// Builder for [`DictionaryArray`] of [`GenericByteArray`] -/// -/// For example to map a set of byte indices to String values. Note that -/// the use of a `HashMap` here will not scale to very large arrays or -/// result in an ordered dictionary. 
-#[derive(Debug)] -pub struct GenericByteDictionaryBuilder -where - K: ArrowDictionaryKeyType, - T: ByteArrayType, -{ - state: ahash::RandomState, - dedup: HashTable, - - keys_builder: PrimitiveBuilder, - values_builder: GenericByteBuilder, -} - -impl Default for GenericByteDictionaryBuilder -where - K: ArrowDictionaryKeyType, - T: ByteArrayType, -{ - fn default() -> Self { - Self::new() - } -} - -impl GenericByteDictionaryBuilder -where - K: ArrowDictionaryKeyType, - T: ByteArrayType, -{ - /// Creates a new `GenericByteDictionaryBuilder` - pub fn new() -> Self { - let keys_builder = PrimitiveBuilder::new(); - let values_builder = GenericByteBuilder::::new(); - Self { - state: Default::default(), - dedup: HashTable::with_capacity(keys_builder.capacity()), - keys_builder, - values_builder, - } - } - - /// Creates a new `GenericByteDictionaryBuilder` with the provided capacities - /// - /// `keys_capacity`: the number of keys, i.e. length of array to build - /// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary - /// `data_capacity`: the total number of bytes of all distinct bytes in the dictionary - pub fn with_capacity( - keys_capacity: usize, - value_capacity: usize, - data_capacity: usize, - ) -> Self { - Self { - state: Default::default(), - dedup: Default::default(), - keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), - values_builder: GenericByteBuilder::::with_capacity(value_capacity, data_capacity), - } - } - - /// Creates a new `GenericByteDictionaryBuilder` from a keys capacity and a dictionary - /// which is initialized with the given values. - /// The indices of those dictionary values are used as keys. 
- /// - /// # Example - /// - /// ``` - /// # use arrow_array::builder::StringDictionaryBuilder; - /// # use arrow_array::{Int16Array, StringArray}; - /// - /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]); - /// - /// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap(); - /// builder.append("def").unwrap(); - /// builder.append_null(); - /// builder.append("abc").unwrap(); - /// - /// let dictionary_array = builder.finish(); - /// - /// let keys = dictionary_array.keys(); - /// - /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)])); - /// ``` - pub fn new_with_dictionary( - keys_capacity: usize, - dictionary_values: &GenericByteArray, - ) -> Result { - let state = ahash::RandomState::default(); - let dict_len = dictionary_values.len(); - - let mut dedup = HashTable::with_capacity(dict_len); - - let values_len = dictionary_values.value_data().len(); - let mut values_builder = GenericByteBuilder::::with_capacity(dict_len, values_len); - - K::Native::from_usize(dictionary_values.len()) - .ok_or(ArrowError::DictionaryKeyOverflowError)?; - - for (idx, maybe_value) in dictionary_values.iter().enumerate() { - match maybe_value { - Some(value) => { - let value_bytes: &[u8] = value.as_ref(); - let hash = state.hash_one(value_bytes); - - dedup - .entry( - hash, - |idx: &usize| value_bytes == get_bytes(&values_builder, *idx), - |idx: &usize| state.hash_one(get_bytes(&values_builder, *idx)), - ) - .or_insert(idx); - - values_builder.append_value(value); - } - None => values_builder.append_null(), - } - } - - Ok(Self { - state, - dedup, - keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), - values_builder, - }) - } -} - -impl ArrayBuilder for GenericByteDictionaryBuilder -where - K: ArrowDictionaryKeyType, - T: ByteArrayType, -{ - /// Returns the builder as an non-mutable `Any` reference. 
- fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as an mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.keys_builder.len() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. - fn finish_cloned(&self) -> ArrayRef { - Arc::new(self.finish_cloned()) - } -} - -impl GenericByteDictionaryBuilder -where - K: ArrowDictionaryKeyType, - T: ByteArrayType, -{ - fn get_or_insert_key(&mut self, value: impl AsRef) -> Result { - let value_native: &T::Native = value.as_ref(); - let value_bytes: &[u8] = value_native.as_ref(); - - let state = &self.state; - let storage = &mut self.values_builder; - let hash = state.hash_one(value_bytes); - - let idx = *self - .dedup - .entry( - hash, - |idx| value_bytes == get_bytes(storage, *idx), - |idx| state.hash_one(get_bytes(storage, *idx)), - ) - .or_insert_with(|| { - let idx = storage.len(); - storage.append_value(value); - idx - }) - .get(); - - let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?; - - Ok(key) - } - - /// Append a value to the array. Return an existing index - /// if already present in the values array or a new index if the - /// value is appended to the values array. - /// - /// Returns an error if the new index would overflow the key type. - pub fn append(&mut self, value: impl AsRef) -> Result { - let key = self.get_or_insert_key(value)?; - self.keys_builder.append_value(key); - Ok(key) - } - - /// Append a value multiple times to the array. - /// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups. 
- /// - /// Returns an error if the new index would overflow the key type. - pub fn append_n( - &mut self, - value: impl AsRef, - count: usize, - ) -> Result { - let key = self.get_or_insert_key(value)?; - self.keys_builder.append_value_n(key, count); - Ok(key) - } - - /// Infallibly append a value to this builder - /// - /// # Panics - /// - /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` - pub fn append_value(&mut self, value: impl AsRef) { - self.append(value).expect("dictionary key overflow"); - } - - /// Infallibly append a value to this builder repeatedly `count` times. - /// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups. - /// - /// # Panics - /// - /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` - pub fn append_values(&mut self, value: impl AsRef, count: usize) { - self.append_n(value, count) - .expect("dictionary key overflow"); - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) { - self.keys_builder.append_null() - } - - /// Infallibly append `n` null slots into the builder - #[inline] - pub fn append_nulls(&mut self, n: usize) { - self.keys_builder.append_nulls(n) - } - - /// Append an `Option` value into the builder - /// - /// # Panics - /// - /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` - #[inline] - pub fn append_option(&mut self, value: Option>) { - match value { - None => self.append_null(), - Some(v) => self.append_value(v), - }; - } - - /// Append an `Option` value into the builder repeatedly `count` times. - /// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups. 
- /// - /// # Panics - /// - /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` - pub fn append_options(&mut self, value: Option>, count: usize) { - match value { - None => self.keys_builder.append_nulls(count), - Some(v) => self.append_values(v, count), - }; - } - - /// Builds the `DictionaryArray` and reset this builder. - pub fn finish(&mut self) -> DictionaryArray { - self.dedup.clear(); - let values = self.values_builder.finish(); - let keys = self.keys_builder.finish(); - - let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); - - let builder = keys - .into_data() - .into_builder() - .data_type(data_type) - .child_data(vec![values.into_data()]); - - DictionaryArray::from(unsafe { builder.build_unchecked() }) - } - - /// Builds the `DictionaryArray` without resetting the builder. - pub fn finish_cloned(&self) -> DictionaryArray { - let values = self.values_builder.finish_cloned(); - let keys = self.keys_builder.finish_cloned(); - - let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); - - let builder = keys - .into_data() - .into_builder() - .data_type(data_type) - .child_data(vec![values.into_data()]); - - DictionaryArray::from(unsafe { builder.build_unchecked() }) - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.keys_builder.validity_slice() - } -} - -impl> Extend> - for GenericByteDictionaryBuilder -{ - #[inline] - fn extend>>(&mut self, iter: I) { - for v in iter { - self.append_option(v) - } - } -} - -fn get_bytes(values: &GenericByteBuilder, idx: usize) -> &[u8] { - let offsets = values.offsets_slice(); - let values = values.values_slice(); - - let end_offset = offsets[idx + 1].as_usize(); - let start_offset = offsets[idx].as_usize(); - - &values[start_offset..end_offset] -} - -/// Builder for [`DictionaryArray`] of [`StringArray`](crate::array::StringArray) -/// -/// ``` -/// // 
Create a dictionary array indexed by bytes whose values are Strings. -/// // It can thus hold up to 256 distinct string values. -/// -/// # use arrow_array::builder::StringDictionaryBuilder; -/// # use arrow_array::{Int8Array, StringArray}; -/// # use arrow_array::types::Int8Type; -/// -/// let mut builder = StringDictionaryBuilder::::new(); -/// -/// // The builder builds the dictionary value by value -/// builder.append("abc").unwrap(); -/// builder.append_null(); -/// builder.append_n("def", 2).unwrap(); // appends "def" twice with a single lookup -/// builder.append("abc").unwrap(); -/// let array = builder.finish(); -/// -/// assert_eq!( -/// array.keys(), -/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) -/// ); -/// -/// // Values are polymorphic and so require a downcast. -/// let av = array.values(); -/// let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); -/// -/// assert_eq!(ava.value(0), "abc"); -/// assert_eq!(ava.value(1), "def"); -/// -/// ``` -pub type StringDictionaryBuilder = GenericByteDictionaryBuilder>; - -/// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray) -pub type LargeStringDictionaryBuilder = GenericByteDictionaryBuilder>; - -/// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray) -/// -/// ``` -/// // Create a dictionary array indexed by bytes whose values are binary. -/// // It can thus hold up to 256 distinct binary values. 
-/// -/// # use arrow_array::builder::BinaryDictionaryBuilder; -/// # use arrow_array::{BinaryArray, Int8Array}; -/// # use arrow_array::types::Int8Type; -/// -/// let mut builder = BinaryDictionaryBuilder::::new(); -/// -/// // The builder builds the dictionary value by value -/// builder.append(b"abc").unwrap(); -/// builder.append_null(); -/// builder.append(b"def").unwrap(); -/// builder.append(b"def").unwrap(); -/// builder.append(b"abc").unwrap(); -/// let array = builder.finish(); -/// -/// assert_eq!( -/// array.keys(), -/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) -/// ); -/// -/// // Values are polymorphic and so require a downcast. -/// let av = array.values(); -/// let ava: &BinaryArray = av.as_any().downcast_ref::().unwrap(); -/// -/// assert_eq!(ava.value(0), b"abc"); -/// assert_eq!(ava.value(1), b"def"); -/// -/// ``` -pub type BinaryDictionaryBuilder = GenericByteDictionaryBuilder>; - -/// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray) -pub type LargeBinaryDictionaryBuilder = GenericByteDictionaryBuilder>; - -#[cfg(test)] -mod tests { - use super::*; - - use crate::array::Int8Array; - use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type}; - use crate::{BinaryArray, StringArray}; - - fn test_bytes_dictionary_builder(values: Vec<&T::Native>) - where - T: ByteArrayType, - ::Native: PartialEq, - ::Native: AsRef<::Native>, - { - let mut builder = GenericByteDictionaryBuilder::::new(); - builder.append(values[0]).unwrap(); - builder.append_null(); - builder.append(values[1]).unwrap(); - builder.append(values[1]).unwrap(); - builder.append(values[0]).unwrap(); - let array = builder.finish(); - - assert_eq!( - array.keys(), - &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) - ); - - // Values are polymorphic and so require a downcast. 
- let av = array.values(); - let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); - - assert_eq!(*ava.value(0), *values[0]); - assert_eq!(*ava.value(1), *values[1]); - } - - #[test] - fn test_string_dictionary_builder() { - test_bytes_dictionary_builder::>(vec!["abc", "def"]); - } - - #[test] - fn test_binary_dictionary_builder() { - test_bytes_dictionary_builder::>(vec![b"abc", b"def"]); - } - - fn test_bytes_dictionary_builder_finish_cloned(values: Vec<&T::Native>) - where - T: ByteArrayType, - ::Native: PartialEq, - ::Native: AsRef<::Native>, - { - let mut builder = GenericByteDictionaryBuilder::::new(); - - builder.append(values[0]).unwrap(); - builder.append_null(); - builder.append(values[1]).unwrap(); - builder.append(values[1]).unwrap(); - builder.append(values[0]).unwrap(); - let mut array = builder.finish_cloned(); - - assert_eq!( - array.keys(), - &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) - ); - - // Values are polymorphic and so require a downcast. - let av = array.values(); - let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); - - assert_eq!(ava.value(0), values[0]); - assert_eq!(ava.value(1), values[1]); - - builder.append(values[0]).unwrap(); - builder.append(values[2]).unwrap(); - builder.append(values[1]).unwrap(); - - array = builder.finish(); - - assert_eq!( - array.keys(), - &Int8Array::from(vec![ - Some(0), - None, - Some(1), - Some(1), - Some(0), - Some(0), - Some(2), - Some(1) - ]) - ); - - // Values are polymorphic and so require a downcast. 
- let av2 = array.values(); - let ava2: &GenericByteArray = - av2.as_any().downcast_ref::>().unwrap(); - - assert_eq!(ava2.value(0), values[0]); - assert_eq!(ava2.value(1), values[1]); - assert_eq!(ava2.value(2), values[2]); - } - - #[test] - fn test_string_dictionary_builder_finish_cloned() { - test_bytes_dictionary_builder_finish_cloned::>(vec![ - "abc", "def", "ghi", - ]); - } - - #[test] - fn test_binary_dictionary_builder_finish_cloned() { - test_bytes_dictionary_builder_finish_cloned::>(vec![ - b"abc", b"def", b"ghi", - ]); - } - - fn test_bytes_dictionary_builder_with_existing_dictionary( - dictionary: GenericByteArray, - values: Vec<&T::Native>, - ) where - T: ByteArrayType, - ::Native: PartialEq, - ::Native: AsRef<::Native>, - { - let mut builder = - GenericByteDictionaryBuilder::::new_with_dictionary(6, &dictionary) - .unwrap(); - builder.append(values[0]).unwrap(); - builder.append_null(); - builder.append(values[1]).unwrap(); - builder.append(values[1]).unwrap(); - builder.append(values[0]).unwrap(); - builder.append(values[2]).unwrap(); - let array = builder.finish(); - - assert_eq!( - array.keys(), - &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)]) - ); - - // Values are polymorphic and so require a downcast. 
- let av = array.values(); - let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); - - assert!(!ava.is_valid(0)); - assert_eq!(ava.value(1), values[1]); - assert_eq!(ava.value(2), values[0]); - assert_eq!(ava.value(3), values[2]); - } - - #[test] - fn test_string_dictionary_builder_with_existing_dictionary() { - test_bytes_dictionary_builder_with_existing_dictionary::>( - StringArray::from(vec![None, Some("def"), Some("abc")]), - vec!["abc", "def", "ghi"], - ); - } - - #[test] - fn test_binary_dictionary_builder_with_existing_dictionary() { - let values: Vec> = vec![None, Some(b"def"), Some(b"abc")]; - test_bytes_dictionary_builder_with_existing_dictionary::>( - BinaryArray::from(values), - vec![b"abc", b"def", b"ghi"], - ); - } - - fn test_bytes_dictionary_builder_with_reserved_null_value( - dictionary: GenericByteArray, - values: Vec<&T::Native>, - ) where - T: ByteArrayType, - ::Native: PartialEq, - ::Native: AsRef<::Native>, - { - let mut builder = - GenericByteDictionaryBuilder::::new_with_dictionary(4, &dictionary) - .unwrap(); - builder.append(values[0]).unwrap(); - builder.append_null(); - builder.append(values[1]).unwrap(); - builder.append(values[0]).unwrap(); - let array = builder.finish(); - - assert!(array.is_null(1)); - assert!(!array.is_valid(1)); - - let keys = array.keys(); - - assert_eq!(keys.value(0), 1); - assert!(keys.is_null(1)); - // zero initialization is currently guaranteed by Buffer allocation and resizing - assert_eq!(keys.value(1), 0); - assert_eq!(keys.value(2), 2); - assert_eq!(keys.value(3), 1); - } - - #[test] - fn test_string_dictionary_builder_with_reserved_null_value() { - let v: Vec> = vec![None]; - test_bytes_dictionary_builder_with_reserved_null_value::>( - StringArray::from(v), - vec!["abc", "def"], - ); - } - - #[test] - fn test_binary_dictionary_builder_with_reserved_null_value() { - let values: Vec> = vec![None]; - test_bytes_dictionary_builder_with_reserved_null_value::>( - BinaryArray::from(values), - 
vec![b"abc", b"def"], - ); - } - - #[test] - fn test_extend() { - let mut builder = GenericByteDictionaryBuilder::::new(); - builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some)); - builder.extend(["c", "d", "a"].into_iter().map(Some)); - let dict = builder.finish(); - assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]); - assert_eq!(dict.values().len(), 4); - } -} diff --git a/arrow-array/src/builder2/generic_bytes_view_builder.rs b/arrow-array/src/builder2/generic_bytes_view_builder.rs index 6bb1cf41438e..f23df2835de6 100644 --- a/arrow-array/src/builder2/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder2/generic_bytes_view_builder.rs @@ -25,10 +25,10 @@ use arrow_schema::ArrowError; use hashbrown::hash_table::Entry; use hashbrown::HashTable; -use crate::builder2::ArrayBuilder; +use crate::builder2::SpecificArrayBuilder; use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; -use crate::{ArrayRef, GenericByteViewArray}; +use crate::{ArrayAccessor, ArrayRef, GenericByteViewArray}; const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB @@ -426,16 +426,18 @@ impl std::fmt::Debug for GenericByteViewBuilder { } } -impl ArrayBuilder for GenericByteViewBuilder { +impl SpecificArrayBuilder for GenericByteViewBuilder { + type Output = GenericByteViewArray; + fn len(&self) -> usize { self.null_buffer_builder.len() } - fn finish(&mut self) -> ArrayRef { + fn finish(&mut self) -> Arc { Arc::new(self.finish()) } - fn finish_cloned(&self) -> ArrayRef { + fn finish_cloned(&self) -> Arc { Arc::new(self.finish_cloned()) } @@ -450,6 +452,14 @@ impl ArrayBuilder for GenericByteViewBuilder { fn into_box_any(self: Box) -> Box { self } + + fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { + self.append_value(value) + } + + fn append_null(&mut self) { + self.append_null() + } } impl> Extend> diff --git 
a/arrow-array/src/builder2/generic_list_builder.rs b/arrow-array/src/builder2/generic_list_builder.rs index d110f61e2623..8d3a780a601e 100644 --- a/arrow-array/src/builder2/generic_list_builder.rs +++ b/arrow-array/src/builder2/generic_list_builder.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::builder2::{ArrayBuilder, BufferBuilder}; -use crate::{Array, ArrayRef, GenericListArray, OffsetSizeTrait}; +use crate::builder2::{SpecificArrayBuilder}; +use crate::builder::{BufferBuilder}; +use crate::{Array, ArrayAccessor, ArrayRef, GenericListArray, OffsetSizeTrait}; use arrow_buffer::NullBufferBuilder; use arrow_buffer::{Buffer, OffsetBuffer}; use arrow_schema::{Field, FieldRef}; @@ -85,20 +86,20 @@ use std::sync::Arc; /// [`LargeListBuilder`]: crate::builder::LargeListBuilder /// [`LargeListArray`]: crate::array::LargeListArray #[derive(Debug)] -pub struct GenericListBuilder { +pub struct GenericListBuilder where for<'a> &'a ::Output: ArrayAccessor { offsets_builder: BufferBuilder, null_buffer_builder: NullBufferBuilder, values_builder: T, field: Option, } -impl Default for GenericListBuilder { +impl Default for GenericListBuilder where for<'a> &'a ::Output: ArrayAccessor { fn default() -> Self { Self::new(T::default()) } } -impl GenericListBuilder { +impl GenericListBuilder where for<'a> &'a ::Output: ArrayAccessor { /// Creates a new [`GenericListBuilder`] from a given values array builder pub fn new(values_builder: T) -> Self { let capacity = values_builder.len(); @@ -131,12 +132,15 @@ impl GenericListBuilder ArrayBuilder +impl SpecificArrayBuilder for GenericListBuilder where - T: 'static, + OffsetSize: OffsetSizeTrait, + ValuesOutput: Array + ArrayAccessor, + T: 'static + SpecificArrayBuilder, for<'a> &'a ValuesOutput: ArrayAccessor { + type Output = GenericListArray; + /// Returns the builder as a non-mutable `Any` reference. 
fn as_any(&self) -> &dyn Any { self @@ -158,19 +162,29 @@ where } /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { + fn finish(&mut self) -> Arc { Arc::new(self.finish()) } /// Builds the array without resetting the builder. - fn finish_cloned(&self) -> ArrayRef { + fn finish_cloned(&self) -> Arc { Arc::new(self.finish_cloned()) } + + fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { + // our item is their output + self.values_builder.append_output(value.as_any().downcast_ref::().unwrap()); + self.append(true); + } + + fn append_null(&mut self) { + self.append(false); + } } -impl GenericListBuilder +impl GenericListBuilder where - T: 'static, + T: 'static, for<'a> &'a ::Output: ArrayAccessor { /// Returns the child array builder as a mutable reference. /// @@ -261,11 +275,6 @@ where self.extend(std::iter::once(Some(i))) } - pub fn append_list(&mut self, list: GenericListArray) { - self.values_builder.extend(list.values().iter()); - self.append(true); - } - /// Append a null to this [`GenericListBuilder`] /// /// See [`Self::append_value`] for an example use. 
@@ -339,8 +348,8 @@ where impl Extend> for GenericListBuilder where O: OffsetSizeTrait, - B: ArrayBuilder + Extend, - V: IntoIterator, + B: SpecificArrayBuilder + Extend, + V: IntoIterator, for<'a> &'a ::Output: ArrayAccessor { #[inline] fn extend>>(&mut self, iter: T) { @@ -359,7 +368,8 @@ where #[cfg(test)] mod tests { use super::*; - use crate::builder::{make_builder, Int32Builder, Int8Builder, ListBuilder}; + use crate::builder2::{Int32Builder, Int8Builder, LargeListBuilder, ListBuilder, PrimitiveBuilder}; + use crate::builder2::SpecificArrayBuilder; use crate::cast::AsArray; use crate::types::Int32Type; use crate::{Int32Array, ListArray}; @@ -446,7 +456,7 @@ mod tests { #[test] fn test_list_array_builder_finish() { - let values_builder = Int32Array::builder(5); + let values_builder = Int32Array::builder2(5); let mut builder = ListBuilder::new(values_builder); builder.values().append_slice(&[1, 2, 3]); @@ -559,7 +569,7 @@ mod tests { #[test] fn test_boxed_primitive_array_builder() { - let values_builder = make_builder(&DataType::Int32, 5); + let values_builder = PrimitiveBuilder::::with_capacity(5); let mut builder = ListBuilder::new(values_builder); builder @@ -588,173 +598,26 @@ mod tests { #[test] fn test_boxed_list_list_array_builder() { // This test is same as `test_list_list_array_builder` but uses boxed builders. - let values_builder = make_builder( - &DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), - 10, - ); + let values_builder = ListBuilder::with_capacity( Int32Builder::with_capacity(10), 10); test_boxed_generic_list_generic_list_array_builder::(values_builder); } #[test] fn test_boxed_large_list_large_list_array_builder() { // This test is same as `test_list_list_array_builder` but uses boxed builders. 
- let values_builder = make_builder( - &DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))), - 10, + test_boxed_generic_list_generic_list_array_builder( + LargeListBuilder::with_capacity(Int32Builder::with_capacity(10), 10), ); - test_boxed_generic_list_generic_list_array_builder::(values_builder); } - fn test_boxed_generic_list_generic_list_array_builder( - values_builder: Box, - ) { - let mut builder: GenericListBuilder> = - GenericListBuilder::>::new(values_builder); + fn test_boxed_generic_list_generic_list_array_builder( + values_builder: GenericListBuilder, + ) where for<'a> &'a ::Output: ArrayAccessor { + let mut builder: GenericListBuilder> = + GenericListBuilder::>::new(values_builder); // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_value(1); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_value(2); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .append(true); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_value(3); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_value(4); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .append(true); - builder.append(true); - - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an 
(Large)ListBuilder") - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_value(5); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_value(6); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an (Large)ListBuilder") - .append_value(7); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .append(true); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .append(false); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_value(8); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .append(true); - builder.append(true); - - builder.append(false); - - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_value(9); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_value(10); - builder - .values() - .as_any_mut() - .downcast_mut::>>() - .expect("should be an (Large)ListBuilder") - .append(true); - builder.append(true); + builder.append_value(vec![vec![1, 2], vec![3, 4]]); let l1 = builder.finish(); diff --git a/arrow-array/src/builder2/map_builder.rs b/arrow-array/src/builder2/map_builder.rs deleted file mode 100644 index 1e88a34cc4dc..000000000000 --- 
a/arrow-array/src/builder2/map_builder.rs +++ /dev/null @@ -1,380 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder2::{ArrayBuilder, BufferBuilder}; -use crate::{Array, ArrayRef, MapArray, StructArray}; -use arrow_buffer::Buffer; -use arrow_buffer::{NullBuffer, NullBufferBuilder}; -use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType, Field, FieldRef}; -use std::any::Any; -use std::sync::Arc; - -/// Builder for [`MapArray`] -/// -/// ``` -/// # use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; -/// # use arrow_array::{Int32Array, StringArray}; -/// -/// let string_builder = StringBuilder::new(); -/// let int_builder = Int32Builder::with_capacity(4); -/// -/// // Construct `[{"joe": 1}, {"blogs": 2, "foo": 4}, {}, null]` -/// let mut builder = MapBuilder::new(None, string_builder, int_builder); -/// -/// builder.keys().append_value("joe"); -/// builder.values().append_value(1); -/// builder.append(true).unwrap(); -/// -/// builder.keys().append_value("blogs"); -/// builder.values().append_value(2); -/// builder.keys().append_value("foo"); -/// builder.values().append_value(4); -/// builder.append(true).unwrap(); -/// builder.append(true).unwrap(); -/// 
builder.append(false).unwrap(); -/// -/// let array = builder.finish(); -/// assert_eq!(array.value_offsets(), &[0, 1, 3, 3, 3]); -/// assert_eq!(array.values().as_ref(), &Int32Array::from(vec![1, 2, 4])); -/// assert_eq!(array.keys().as_ref(), &StringArray::from(vec!["joe", "blogs", "foo"])); -/// -/// ``` -#[derive(Debug)] -pub struct MapBuilder { - offsets_builder: BufferBuilder, - null_buffer_builder: NullBufferBuilder, - field_names: MapFieldNames, - key_builder: K, - value_builder: V, - value_field: Option, -} - -/// The [`Field`] names for a [`MapArray`] -#[derive(Debug, Clone)] -pub struct MapFieldNames { - /// [`Field`] name for map entries - pub entry: String, - /// [`Field`] name for map key - pub key: String, - /// [`Field`] name for map value - pub value: String, -} - -impl Default for MapFieldNames { - fn default() -> Self { - Self { - entry: "entries".to_string(), - key: "keys".to_string(), - value: "values".to_string(), - } - } -} - -impl MapBuilder { - /// Creates a new `MapBuilder` - pub fn new(field_names: Option, key_builder: K, value_builder: V) -> Self { - let capacity = key_builder.len(); - Self::with_capacity(field_names, key_builder, value_builder, capacity) - } - - /// Creates a new `MapBuilder` with capacity - pub fn with_capacity( - field_names: Option, - key_builder: K, - value_builder: V, - capacity: usize, - ) -> Self { - let mut offsets_builder = BufferBuilder::::new(capacity + 1); - offsets_builder.append(0); - Self { - offsets_builder, - null_buffer_builder: NullBufferBuilder::new(capacity), - field_names: field_names.unwrap_or_default(), - key_builder, - value_builder, - value_field: None, - } - } - - /// Override the field passed to [`MapBuilder::new`] - /// - /// By default a nullable field is created with the name `values` - /// - /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the - /// field's data type does not match that of `V` - pub fn with_values_field(self, field: impl Into) -> Self { - Self { - 
value_field: Some(field.into()), - ..self - } - } - - /// Returns the key array builder of the map - pub fn keys(&mut self) -> &mut K { - &mut self.key_builder - } - - /// Returns the value array builder of the map - pub fn values(&mut self) -> &mut V { - &mut self.value_builder - } - - /// Returns both the key and value array builders of the map - pub fn entries(&mut self) -> (&mut K, &mut V) { - (&mut self.key_builder, &mut self.value_builder) - } - - /// Finish the current map array slot - /// - /// Returns an error if the key and values builders are in an inconsistent state. - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<(), ArrowError> { - if self.key_builder.len() != self.value_builder.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Cannot append to a map builder when its keys and values have unequal lengths of {} and {}", - self.key_builder.len(), - self.value_builder.len() - ))); - } - self.offsets_builder.append(self.key_builder.len() as i32); - self.null_buffer_builder.append(is_valid); - Ok(()) - } - - /// Builds the [`MapArray`] - pub fn finish(&mut self) -> MapArray { - let len = self.len(); - // Build the keys - let keys_arr = self.key_builder.finish(); - let values_arr = self.value_builder.finish(); - let offset_buffer = self.offsets_builder.finish(); - self.offsets_builder.append(0); - let null_bit_buffer = self.null_buffer_builder.finish(); - - self.finish_helper(keys_arr, values_arr, offset_buffer, null_bit_buffer, len) - } - - /// Builds the [`MapArray`] without resetting the builder. 
- pub fn finish_cloned(&self) -> MapArray { - let len = self.len(); - // Build the keys - let keys_arr = self.key_builder.finish_cloned(); - let values_arr = self.value_builder.finish_cloned(); - let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); - let nulls = self.null_buffer_builder.finish_cloned(); - self.finish_helper(keys_arr, values_arr, offset_buffer, nulls, len) - } - - fn finish_helper( - &self, - keys_arr: Arc, - values_arr: Arc, - offset_buffer: Buffer, - nulls: Option, - len: usize, - ) -> MapArray { - assert!( - keys_arr.null_count() == 0, - "Keys array must have no null values, found {} null value(s)", - keys_arr.null_count() - ); - - let keys_field = Arc::new(Field::new( - self.field_names.key.as_str(), - keys_arr.data_type().clone(), - false, // always non-nullable - )); - let values_field = match &self.value_field { - Some(f) => f.clone(), - None => Arc::new(Field::new( - self.field_names.value.as_str(), - values_arr.data_type().clone(), - true, - )), - }; - - let struct_array = - StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]); - - let map_field = Arc::new(Field::new( - self.field_names.entry.as_str(), - struct_array.data_type().clone(), - false, // always non-nullable - )); - let array_data = ArrayData::builder(DataType::Map(map_field, false)) // TODO: support sorted keys - .len(len) - .add_buffer(offset_buffer) - .add_child_data(struct_array.into_data()) - .nulls(nulls); - - let array_data = unsafe { array_data.build_unchecked() }; - - MapArray::from(array_data) - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.null_buffer_builder.as_slice() - } -} - -impl ArrayBuilder for MapBuilder { - fn len(&self) -> usize { - self.null_buffer_builder.len() - } - - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. 
- fn finish_cloned(&self) -> ArrayRef { - Arc::new(self.finish_cloned()) - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - fn into_box_any(self: Box) -> Box { - self - } -} - -#[cfg(test)] -mod tests { - use crate::builder::{make_builder, Int32Builder, StringBuilder}; - use crate::{Int32Array, StringArray}; - - use super::*; - - #[test] - #[should_panic(expected = "Keys array must have no null values, found 1 null value(s)")] - fn test_map_builder_with_null_keys_panics() { - let mut builder = MapBuilder::new(None, StringBuilder::new(), Int32Builder::new()); - builder.keys().append_null(); - builder.values().append_value(42); - builder.append(true).unwrap(); - - builder.finish(); - } - - #[test] - fn test_boxed_map_builder() { - let keys_builder = make_builder(&DataType::Utf8, 5); - let values_builder = make_builder(&DataType::Int32, 5); - - let mut builder = MapBuilder::new(None, keys_builder, values_builder); - builder - .keys() - .as_any_mut() - .downcast_mut::() - .expect("should be an StringBuilder") - .append_value("1"); - builder - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_value(42); - builder.append(true).unwrap(); - - let map_array = builder.finish(); - - assert_eq!( - map_array - .keys() - .as_any() - .downcast_ref::() - .expect("should be an StringArray") - .value(0), - "1" - ); - assert_eq!( - map_array - .values() - .as_any() - .downcast_ref::() - .expect("should be an Int32Array") - .value(0), - 42 - ); - } - - #[test] - fn test_with_values_field() { - let value_field = Arc::new(Field::new("bars", DataType::Int32, false)); - let mut builder = MapBuilder::new(None, Int32Builder::new(), Int32Builder::new()) - .with_values_field(value_field.clone()); - builder.keys().append_value(1); - builder.values().append_value(2); - builder.append(true).unwrap(); - builder.append(false).unwrap(); // This is fine as nullability refers to nullability of 
values - builder.keys().append_value(3); - builder.values().append_value(4); - builder.append(true).unwrap(); - let map = builder.finish(); - - assert_eq!(map.len(), 3); - assert_eq!( - map.data_type(), - &DataType::Map( - Arc::new(Field::new( - "entries", - DataType::Struct( - vec![ - Arc::new(Field::new("keys", DataType::Int32, false)), - value_field.clone() - ] - .into() - ), - false, - )), - false - ) - ); - - builder.keys().append_value(5); - builder.values().append_value(6); - builder.append(true).unwrap(); - let map = builder.finish(); - - assert_eq!(map.len(), 1); - assert_eq!( - map.data_type(), - &DataType::Map( - Arc::new(Field::new( - "entries", - DataType::Struct( - vec![ - Arc::new(Field::new("keys", DataType::Int32, false)), - value_field - ] - .into() - ), - false, - )), - false - ) - ); - } -} diff --git a/arrow-array/src/builder2/mod.rs b/arrow-array/src/builder2/mod.rs index fd5ef99a5776..723c0a999cf4 100644 --- a/arrow-array/src/builder2/mod.rs +++ b/arrow-array/src/builder2/mod.rs @@ -152,8 +152,6 @@ pub use arrow_buffer::BooleanBufferBuilder; mod boolean_builder; pub use boolean_builder::*; -mod buffer_builder; -pub use buffer_builder::*; mod fixed_size_binary_builder; pub use fixed_size_binary_builder::*; mod fixed_size_list_builder; @@ -162,30 +160,14 @@ mod generic_bytes_builder; pub use generic_bytes_builder::*; mod generic_list_builder; pub use generic_list_builder::*; -mod map_builder; -pub use map_builder::*; -mod null_builder; -pub use null_builder::*; mod primitive_builder; pub use primitive_builder::*; -mod primitive_dictionary_builder; -pub use primitive_dictionary_builder::*; -mod primitive_run_builder; -pub use primitive_run_builder::*; -mod struct_builder; -pub use struct_builder::*; -mod generic_bytes_dictionary_builder; -pub use generic_bytes_dictionary_builder::*; -mod generic_byte_run_builder; -pub use generic_byte_run_builder::*; mod generic_bytes_view_builder; pub use generic_bytes_view_builder::*; -mod union_builder; - 
-pub use union_builder::*; use crate::{Array, ArrayAccessor, ArrayRef}; use std::any::Any; +use std::sync::Arc; /// Trait for dealing with different array builders at runtime /// @@ -235,8 +217,11 @@ use std::any::Any; /// "🍎" /// ); /// ``` -pub trait SpecificArrayBuilder: Any + Send + Sync { - type Output: Array + ArrayAccessor; +/// +// TODO - require extend or allow to append from iterator +trait SpecificArrayBuilder: Any + Send + Sync where for<'a> &'a ::Output: ArrayAccessor { + type Output: Array; + /// Returns the number of array slots in the builder fn len(&self) -> usize; @@ -247,10 +232,10 @@ pub trait SpecificArrayBuilder: Any + Send + Sync { } /// Builds the array - fn finish(&mut self) -> Self::Output; + fn finish(&mut self) -> Arc; /// Builds the array without resetting the underlying builder. - fn finish_cloned(&self) -> Self::Output; + fn finish_cloned(&self) -> Arc; /// Returns the builder as a non-mutable `Any` reference. /// @@ -270,45 +255,89 @@ pub trait SpecificArrayBuilder: Any + Send + Sync { fn into_box_any(self: Box) -> Box; // Append a value to the builder - fn append_value(&mut self, value: ::Item); -} - -impl SpecificArrayBuilder for Box> { - type Output = T; - - fn len(&self) -> usize { - (**self).len() - } - - fn is_empty(&self) -> bool { - (**self).is_empty() - } - - fn finish(&mut self) -> ArrayRef { - (**self).finish() - } - - fn finish_cloned(&self) -> ArrayRef { - (**self).finish_cloned() - } + fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item); - fn as_any(&self) -> &dyn Any { - (**self).as_any() - } + /// Appends a null slot into the builder + fn append_null(&mut self) ; - fn as_any_mut(&mut self) -> &mut dyn Any { - (**self).as_any_mut() + /// Appends `n` `null`s into the builder. 
+ #[inline] + fn append_nulls(&mut self, n: usize) { + for _ in 0..n { + self.append_null(); + } } - fn into_box_any(self: Box) -> Box { - self + /// Appends an `Option` into the builder + #[inline] + fn append_option(&mut self, v: Option<<&Self::Output as ArrayAccessor>::Item>) { + match v { + None => self.append_null(), + Some(v) => self.append_value(v), + }; } - fn append_value(&mut self, value: ::Item) { - (**self).append_value(value) + #[inline] + fn append_output(&mut self, output: &Self::Output) { + // TODO - if iterator exists try it? + for i in 0..output.len() { + if output.is_null(i) { + self.append_null(); + } else { + self.append_value(output.value(i)); + } + } } } +// impl SpecificArrayBuilder for Box> +// where +// T: Array + 'static, +// for<'a> &'a T: ArrayAccessor, +// { +// type Output = dyn Array; +// +// fn len(&self) -> usize { +// (**self).len() +// } +// +// fn is_empty(&self) -> bool { +// (**self).is_empty() +// } +// +// fn finish(&mut self) -> Arc { +// (**self).finish() +// } +// +// fn finish_cloned(&self) -> Arc { +// (**self).finish_cloned() +// } +// +// fn as_any(&self) -> &dyn Any { +// (**self).as_any() +// } +// +// fn as_any_mut(&mut self) -> &mut dyn Any { +// (**self).as_any_mut() +// } +// +// fn into_box_any(self: Box) -> Box { +// self +// } +// +// fn append_value(&mut self, value: ::Item) { +// (**self).append_value(value) +// } +// +// fn append_null(&mut self) { +// (**self).append_null() +// } +// +// fn append_option(&mut self, v: Option<::Item>) { +// (**self).append_option(v) +// } +// } + /// Builder for [`ListArray`](crate::array::ListArray) pub type ListBuilder = GenericListBuilder; diff --git a/arrow-array/src/builder2/null_builder.rs b/arrow-array/src/builder2/null_builder.rs deleted file mode 100644 index 10e6d38274a5..000000000000 --- a/arrow-array/src/builder2/null_builder.rs +++ /dev/null @@ -1,182 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license 
agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder2::ArrayBuilder; -use crate::{ArrayRef, NullArray}; -use arrow_data::ArrayData; -use arrow_schema::DataType; -use std::any::Any; -use std::sync::Arc; - -/// Builder for [`NullArray`] -/// -/// # Example -/// -/// Create a `NullArray` from a `NullBuilder` -/// -/// ``` -/// -/// # use arrow_array::{Array, NullArray, builder::NullBuilder}; -/// -/// let mut b = NullBuilder::new(); -/// b.append_empty_value(); -/// b.append_null(); -/// b.append_nulls(3); -/// b.append_empty_values(3); -/// let arr = b.finish(); -/// -/// assert_eq!(8, arr.len()); -/// assert_eq!(0, arr.null_count()); -/// ``` -#[derive(Debug)] -pub struct NullBuilder { - len: usize, -} - -impl Default for NullBuilder { - fn default() -> Self { - Self::new() - } -} - -impl NullBuilder { - /// Creates a new null builder - pub fn new() -> Self { - Self { len: 0 } - } - - /// Creates a new null builder with space for `capacity` elements without re-allocating - #[deprecated = "there is no actual notion of capacity in the NullBuilder, so emulating it makes little sense"] - pub fn with_capacity(_capacity: usize) -> Self { - Self::new() - } - - /// Returns the capacity of this builder measured in slots of type `T` - #[deprecated = "there is no actual notion of capacity in the 
NullBuilder, so emulating it makes little sense"] - pub fn capacity(&self) -> usize { - self.len - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) { - self.len += 1; - } - - /// Appends `n` `null`s into the builder. - #[inline] - pub fn append_nulls(&mut self, n: usize) { - self.len += n; - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_empty_value(&mut self) { - self.append_null(); - } - - /// Appends `n` `null`s into the builder. - #[inline] - pub fn append_empty_values(&mut self, n: usize) { - self.append_nulls(n); - } - - /// Builds the [NullArray] and reset this builder. - pub fn finish(&mut self) -> NullArray { - let len = self.len(); - let builder = ArrayData::new_null(&DataType::Null, len).into_builder(); - - let array_data = unsafe { builder.build_unchecked() }; - NullArray::from(array_data) - } - - /// Builds the [NullArray] without resetting the builder. - pub fn finish_cloned(&self) -> NullArray { - let len = self.len(); - let builder = ArrayData::new_null(&DataType::Null, len).into_builder(); - - let array_data = unsafe { builder.build_unchecked() }; - NullArray::from(array_data) - } -} - -impl ArrayBuilder for NullBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.len - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. 
- fn finish_cloned(&self) -> ArrayRef { - Arc::new(self.finish_cloned()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::Array; - - #[test] - fn test_null_array_builder() { - let mut builder = NullArray::builder(10); - builder.append_null(); - builder.append_nulls(4); - builder.append_empty_value(); - builder.append_empty_values(4); - - let arr = builder.finish(); - assert_eq!(10, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - assert!(arr.is_nullable()); - } - - #[test] - fn test_null_array_builder_finish_cloned() { - let mut builder = NullArray::builder(16); - builder.append_null(); - builder.append_empty_value(); - builder.append_empty_values(3); - let mut array = builder.finish_cloned(); - assert_eq!(5, array.len()); - - builder.append_empty_values(5); - array = builder.finish(); - assert_eq!(10, array.len()); - } -} diff --git a/arrow-array/src/builder2/primitive_builder.rs b/arrow-array/src/builder2/primitive_builder.rs index 93cfb6695f50..830007d52c25 100644 --- a/arrow-array/src/builder2/primitive_builder.rs +++ b/arrow-array/src/builder2/primitive_builder.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::builder2::{ArrayBuilder, BufferBuilder}; +use crate::builder2::{SpecificArrayBuilder}; +use crate::builder::BufferBuilder; use crate::types::*; -use crate::{ArrayRef, PrimitiveArray}; +use crate::{ArrayAccessor, ArrayRef, PrimitiveArray}; use arrow_buffer::NullBufferBuilder; use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::ArrayData; @@ -100,7 +101,9 @@ pub struct PrimitiveBuilder { data_type: DataType, } -impl ArrayBuilder for PrimitiveBuilder { +impl SpecificArrayBuilder for PrimitiveBuilder { + type Output = PrimitiveArray; + /// Returns the builder as a non-mutable `Any` reference. fn as_any(&self) -> &dyn Any { self @@ -122,14 +125,30 @@ impl ArrayBuilder for PrimitiveBuilder { } /// Builds the array and reset this builder. 
- fn finish(&mut self) -> ArrayRef { + fn finish(&mut self) -> Arc { Arc::new(self.finish()) } /// Builds the array without resetting the builder. - fn finish_cloned(&self) -> ArrayRef { + fn finish_cloned(&self) -> Arc { Arc::new(self.finish_cloned()) } + + fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { + self.append_value(value) + } + + fn append_null(&mut self) { + self.append_null() + } + + fn append_nulls(&mut self, n: usize) { + self.append_nulls(n) + } + + fn append_output(&mut self, output: &Self::Output) { + self.extend(output.iter()) + } } impl Default for PrimitiveBuilder { diff --git a/arrow-array/src/builder2/primitive_dictionary_builder.rs b/arrow-array/src/builder2/primitive_dictionary_builder.rs deleted file mode 100644 index 91d60003e187..000000000000 --- a/arrow-array/src/builder2/primitive_dictionary_builder.rs +++ /dev/null @@ -1,446 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::builder2::{ArrayBuilder, PrimitiveBuilder}; -use crate::types::ArrowDictionaryKeyType; -use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray}; -use arrow_buffer::{ArrowNativeType, ToByteSlice}; -use arrow_schema::{ArrowError, DataType}; -use std::any::Any; -use std::collections::HashMap; -use std::sync::Arc; - -/// Wraps a type implementing `ToByteSlice` implementing `Hash` and `Eq` for it -/// -/// This is necessary to handle types such as f32, which don't natively implement these -#[derive(Debug)] -struct Value(T); - -impl std::hash::Hash for Value { - fn hash(&self, state: &mut H) { - self.0.to_byte_slice().hash(state) - } -} - -impl PartialEq for Value { - fn eq(&self, other: &Self) -> bool { - self.0.to_byte_slice().eq(other.0.to_byte_slice()) - } -} - -impl Eq for Value {} - -/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray) -/// -/// # Example: -/// -/// ``` -/// -/// # use arrow_array::builder::PrimitiveDictionaryBuilder; -/// # use arrow_array::types::{UInt32Type, UInt8Type}; -/// # use arrow_array::{Array, UInt32Array, UInt8Array}; -/// -/// let mut builder = PrimitiveDictionaryBuilder::::new(); -/// builder.append(12345678).unwrap(); -/// builder.append_null(); -/// builder.append(22345678).unwrap(); -/// let array = builder.finish(); -/// -/// assert_eq!( -/// array.keys(), -/// &UInt8Array::from(vec![Some(0), None, Some(1)]) -/// ); -/// -/// // Values are polymorphic and so require a downcast. 
-/// let av = array.values(); -/// let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); -/// let avs: &[u32] = ava.values(); -/// -/// assert!(!array.is_null(0)); -/// assert!(array.is_null(1)); -/// assert!(!array.is_null(2)); -/// -/// assert_eq!(avs, &[12345678, 22345678]); -/// ``` -#[derive(Debug)] -pub struct PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - keys_builder: PrimitiveBuilder, - values_builder: PrimitiveBuilder, - map: HashMap, usize>, -} - -impl Default for PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - fn default() -> Self { - Self::new() - } -} - -impl PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - /// Creates a new `PrimitiveDictionaryBuilder`. - pub fn new() -> Self { - Self { - keys_builder: PrimitiveBuilder::new(), - values_builder: PrimitiveBuilder::new(), - map: HashMap::new(), - } - } - - /// Creates a new `PrimitiveDictionaryBuilder` from the provided keys and values builders. - /// - /// # Panics - /// - /// This method panics if `keys_builder` or `values_builder` is not empty. - pub fn new_from_empty_builders( - keys_builder: PrimitiveBuilder, - values_builder: PrimitiveBuilder, - ) -> Self { - assert!( - keys_builder.is_empty() && values_builder.is_empty(), - "keys and values builders must be empty" - ); - Self { - keys_builder, - values_builder, - map: HashMap::new(), - } - } - - /// Creates a new `PrimitiveDictionaryBuilder` from existing `PrimitiveBuilder`s of keys and values. - /// - /// # Safety - /// - /// caller must ensure that the passed in builders are valid for DictionaryArray. 
- pub unsafe fn new_from_builders( - keys_builder: PrimitiveBuilder, - values_builder: PrimitiveBuilder, - ) -> Self { - let keys = keys_builder.values_slice(); - let values = values_builder.values_slice(); - let mut map = HashMap::with_capacity(values.len()); - - keys.iter().zip(values.iter()).for_each(|(key, value)| { - map.insert(Value(*value), K::Native::to_usize(*key).unwrap()); - }); - - Self { - keys_builder, - values_builder, - map, - } - } - - /// Creates a new `PrimitiveDictionaryBuilder` with the provided capacities - /// - /// `keys_capacity`: the number of keys, i.e. length of array to build - /// `values_capacity`: the number of distinct dictionary values, i.e. size of dictionary - pub fn with_capacity(keys_capacity: usize, values_capacity: usize) -> Self { - Self { - keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), - values_builder: PrimitiveBuilder::with_capacity(values_capacity), - map: HashMap::with_capacity(values_capacity), - } - } -} - -impl ArrayBuilder for PrimitiveDictionaryBuilder -where - K: ArrowDictionaryKeyType, - V: ArrowPrimitiveType, -{ - /// Returns the builder as an non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as an mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.keys_builder.len() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. 
- fn finish_cloned(&self) -> ArrayRef { - Arc::new(self.finish_cloned()) - } -} - -impl PrimitiveDictionaryBuilder -where - K: ArrowDictionaryKeyType, - V: ArrowPrimitiveType, -{ - #[inline] - fn get_or_insert_key(&mut self, value: V::Native) -> Result { - match self.map.get(&Value(value)) { - Some(&key) => { - Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?) - } - None => { - let key = self.values_builder.len(); - self.values_builder.append_value(value); - self.map.insert(Value(value), key); - Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?) - } - } - } - - /// Append a primitive value to the array. Return an existing index - /// if already present in the values array or a new index if the - /// value is appended to the values array. - #[inline] - pub fn append(&mut self, value: V::Native) -> Result { - let key = self.get_or_insert_key(value)?; - self.keys_builder.append_value(key); - Ok(key) - } - - /// Append a value multiple times to the array. - /// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups. - /// - /// Returns an error if the new index would overflow the key type. - pub fn append_n(&mut self, value: V::Native, count: usize) -> Result { - let key = self.get_or_insert_key(value)?; - self.keys_builder.append_value_n(key, count); - Ok(key) - } - - /// Infallibly append a value to this builder - /// - /// # Panics - /// - /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` - #[inline] - pub fn append_value(&mut self, value: V::Native) { - self.append(value).expect("dictionary key overflow"); - } - - /// Infallibly append a value to this builder repeatedly `count` times. - /// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups. 
- /// - /// # Panics - /// - /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` - pub fn append_values(&mut self, value: V::Native, count: usize) { - self.append_n(value, count) - .expect("dictionary key overflow"); - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) { - self.keys_builder.append_null() - } - - /// Append `n` null slots into the builder - #[inline] - pub fn append_nulls(&mut self, n: usize) { - self.keys_builder.append_nulls(n) - } - - /// Append an `Option` value into the builder - /// - /// # Panics - /// - /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` - #[inline] - pub fn append_option(&mut self, value: Option) { - match value { - None => self.append_null(), - Some(v) => self.append_value(v), - }; - } - - /// Append an `Option` value into the builder repeatedly `count` times. - /// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups. - /// - /// # Panics - /// - /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` - pub fn append_options(&mut self, value: Option, count: usize) { - match value { - None => self.keys_builder.append_nulls(count), - Some(v) => self.append_values(v, count), - }; - } - - /// Builds the `DictionaryArray` and reset this builder. - pub fn finish(&mut self) -> DictionaryArray { - self.map.clear(); - let values = self.values_builder.finish(); - let keys = self.keys_builder.finish(); - - let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone())); - - let builder = keys - .into_data() - .into_builder() - .data_type(data_type) - .child_data(vec![values.into_data()]); - - DictionaryArray::from(unsafe { builder.build_unchecked() }) - } - - /// Builds the `DictionaryArray` without resetting the builder. 
- pub fn finish_cloned(&self) -> DictionaryArray { - let values = self.values_builder.finish_cloned(); - let keys = self.keys_builder.finish_cloned(); - - let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE)); - - let builder = keys - .into_data() - .into_builder() - .data_type(data_type) - .child_data(vec![values.into_data()]); - - DictionaryArray::from(unsafe { builder.build_unchecked() }) - } - - /// Returns the current dictionary values buffer as a slice - pub fn values_slice(&self) -> &[V::Native] { - self.values_builder.values_slice() - } - - /// Returns the current dictionary values buffer as a mutable slice - pub fn values_slice_mut(&mut self) -> &mut [V::Native] { - self.values_builder.values_slice_mut() - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.keys_builder.validity_slice() - } -} - -impl Extend> - for PrimitiveDictionaryBuilder -{ - #[inline] - fn extend>>(&mut self, iter: T) { - for v in iter { - self.append_option(v) - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::array::UInt32Array; - use crate::array::UInt8Array; - use crate::builder::Decimal128Builder; - use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; - - #[test] - fn test_primitive_dictionary_builder() { - let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); - builder.append(12345678).unwrap(); - builder.append_null(); - builder.append(22345678).unwrap(); - let array = builder.finish(); - - assert_eq!( - array.keys(), - &UInt8Array::from(vec![Some(0), None, Some(1)]) - ); - - // Values are polymorphic and so require a downcast. 
- let av = array.values(); - let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); - let avs: &[u32] = ava.values(); - - assert!(!array.is_null(0)); - assert!(array.is_null(1)); - assert!(!array.is_null(2)); - - assert_eq!(avs, &[12345678, 22345678]); - } - - #[test] - fn test_extend() { - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some)); - builder.extend([4, 5, 1, 3, 1].into_iter().map(Some)); - let dict = builder.finish(); - assert_eq!( - dict.keys().values(), - &[0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 0, 2, 0] - ); - assert_eq!(dict.values().len(), 5); - } - - #[test] - #[should_panic(expected = "DictionaryKeyOverflowError")] - fn test_primitive_dictionary_overflow() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(257, 257); - // 256 unique keys. - for i in 0..256 { - builder.append(i + 1000).unwrap(); - } - // Special error if the key overflows (256th entry) - builder.append(1257).unwrap(); - } - - #[test] - fn test_primitive_dictionary_with_builders() { - let keys_builder = PrimitiveBuilder::::new(); - let values_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); - let mut builder = - PrimitiveDictionaryBuilder::::new_from_empty_builders( - keys_builder, - values_builder, - ); - let dict_array = builder.finish(); - assert_eq!(dict_array.value_type(), DataType::Decimal128(1, 2)); - assert_eq!( - dict_array.data_type(), - &DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Decimal128(1, 2)), - ) - ); - } -} diff --git a/arrow-array/src/builder2/primitive_run_builder.rs b/arrow-array/src/builder2/primitive_run_builder.rs deleted file mode 100644 index 1db9c91e081d..000000000000 --- a/arrow-array/src/builder2/primitive_run_builder.rs +++ /dev/null @@ -1,313 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::{any::Any, sync::Arc}; - -use crate::{types::RunEndIndexType, ArrayRef, ArrowPrimitiveType, RunArray}; - -use super::{ArrayBuilder, PrimitiveBuilder}; - -use arrow_buffer::ArrowNativeType; - -/// Builder for [`RunArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray) -/// -/// # Example: -/// -/// ``` -/// -/// # use arrow_array::builder::PrimitiveRunBuilder; -/// # use arrow_array::cast::AsArray; -/// # use arrow_array::types::{UInt32Type, Int16Type}; -/// # use arrow_array::{Array, UInt32Array, Int16Array}; -/// -/// let mut builder = -/// PrimitiveRunBuilder::::new(); -/// builder.append_value(1234); -/// builder.append_value(1234); -/// builder.append_value(1234); -/// builder.append_null(); -/// builder.append_value(5678); -/// builder.append_value(5678); -/// let array = builder.finish(); -/// -/// assert_eq!(array.run_ends().values(), &[3, 4, 6]); -/// -/// let av = array.values(); -/// -/// assert!(!av.is_null(0)); -/// assert!(av.is_null(1)); -/// assert!(!av.is_null(2)); -/// -/// // Values are polymorphic and so require a downcast. 
-/// let ava: &UInt32Array = av.as_primitive::(); -/// -/// assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); -/// ``` -#[derive(Debug)] -pub struct PrimitiveRunBuilder -where - R: RunEndIndexType, - V: ArrowPrimitiveType, -{ - run_ends_builder: PrimitiveBuilder, - values_builder: PrimitiveBuilder, - current_value: Option, - current_run_end_index: usize, - prev_run_end_index: usize, -} - -impl Default for PrimitiveRunBuilder -where - R: RunEndIndexType, - V: ArrowPrimitiveType, -{ - fn default() -> Self { - Self::new() - } -} - -impl PrimitiveRunBuilder -where - R: RunEndIndexType, - V: ArrowPrimitiveType, -{ - /// Creates a new `PrimitiveRunBuilder` - pub fn new() -> Self { - Self { - run_ends_builder: PrimitiveBuilder::new(), - values_builder: PrimitiveBuilder::new(), - current_value: None, - current_run_end_index: 0, - prev_run_end_index: 0, - } - } - - /// Creates a new `PrimitiveRunBuilder` with the provided capacity - /// - /// `capacity`: the expected number of run-end encoded values. - pub fn with_capacity(capacity: usize) -> Self { - Self { - run_ends_builder: PrimitiveBuilder::with_capacity(capacity), - values_builder: PrimitiveBuilder::with_capacity(capacity), - current_value: None, - current_run_end_index: 0, - prev_run_end_index: 0, - } - } -} - -impl ArrayBuilder for PrimitiveRunBuilder -where - R: RunEndIndexType, - V: ArrowPrimitiveType, -{ - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the length of logical array encoded by - /// the eventual runs array. - fn len(&self) -> usize { - self.current_run_end_index - } - - /// Builds the array and reset this builder. 
- fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. - fn finish_cloned(&self) -> ArrayRef { - Arc::new(self.finish_cloned()) - } -} - -impl PrimitiveRunBuilder -where - R: RunEndIndexType, - V: ArrowPrimitiveType, -{ - /// Appends optional value to the logical array encoded by the RunArray. - pub fn append_option(&mut self, value: Option) { - if self.current_run_end_index == 0 { - self.current_run_end_index = 1; - self.current_value = value; - return; - } - if self.current_value != value { - self.append_run_end(); - self.current_value = value; - } - - self.current_run_end_index += 1; - } - - /// Appends value to the logical array encoded by the run-ends array. - pub fn append_value(&mut self, value: V::Native) { - self.append_option(Some(value)) - } - - /// Appends null to the logical array encoded by the run-ends array. - pub fn append_null(&mut self) { - self.append_option(None) - } - - /// Creates the RunArray and resets the builder. - /// Panics if RunArray cannot be built. - pub fn finish(&mut self) -> RunArray { - // write the last run end to the array. - self.append_run_end(); - - // reset the run index to zero. - self.current_value = None; - self.current_run_end_index = 0; - - // build the run encoded array by adding run_ends and values array as its children. - let run_ends_array = self.run_ends_builder.finish(); - let values_array = self.values_builder.finish(); - RunArray::::try_new(&run_ends_array, &values_array).unwrap() - } - - /// Creates the RunArray and without resetting the builder. - /// Panics if RunArray cannot be built. 
- pub fn finish_cloned(&self) -> RunArray { - let mut run_ends_array = self.run_ends_builder.finish_cloned(); - let mut values_array = self.values_builder.finish_cloned(); - - // Add current run if one exists - if self.prev_run_end_index != self.current_run_end_index { - let mut run_end_builder = run_ends_array.into_builder().unwrap(); - let mut values_builder = values_array.into_builder().unwrap(); - self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder); - run_ends_array = run_end_builder.finish(); - values_array = values_builder.finish(); - } - - RunArray::try_new(&run_ends_array, &values_array).unwrap() - } - - // Appends the current run to the array. - fn append_run_end(&mut self) { - // empty array or the function called without appending any value. - if self.prev_run_end_index == self.current_run_end_index { - return; - } - let run_end_index = self.run_end_index_as_native(); - self.run_ends_builder.append_value(run_end_index); - self.values_builder.append_option(self.current_value); - self.prev_run_end_index = self.current_run_end_index; - } - - // Similar to `append_run_end` but on custom builders. - // Used in `finish_cloned` which is not suppose to mutate `self`. 
- fn append_run_end_with_builders( - &self, - run_ends_builder: &mut PrimitiveBuilder, - values_builder: &mut PrimitiveBuilder, - ) { - let run_end_index = self.run_end_index_as_native(); - run_ends_builder.append_value(run_end_index); - values_builder.append_option(self.current_value); - } - - fn run_end_index_as_native(&self) -> R::Native { - R::Native::from_usize(self.current_run_end_index) - .unwrap_or_else(|| panic!( - "Cannot convert `current_run_end_index` {} from `usize` to native form of arrow datatype {}", - self.current_run_end_index, - R::DATA_TYPE - )) - } -} - -impl Extend> for PrimitiveRunBuilder -where - R: RunEndIndexType, - V: ArrowPrimitiveType, -{ - fn extend>>(&mut self, iter: T) { - for elem in iter { - self.append_option(elem); - } - } -} - -#[cfg(test)] -mod tests { - use crate::builder::PrimitiveRunBuilder; - use crate::cast::AsArray; - use crate::types::{Int16Type, UInt32Type}; - use crate::{Array, UInt32Array}; - - #[test] - fn test_primitive_ree_array_builder() { - let mut builder = PrimitiveRunBuilder::::new(); - builder.append_value(1234); - builder.append_value(1234); - builder.append_value(1234); - builder.append_null(); - builder.append_value(5678); - builder.append_value(5678); - - let array = builder.finish(); - - assert_eq!(array.null_count(), 0); - assert_eq!(array.logical_null_count(), 1); - assert_eq!(array.len(), 6); - - assert_eq!(array.run_ends().values(), &[3, 4, 6]); - - let av = array.values(); - - assert!(!av.is_null(0)); - assert!(av.is_null(1)); - assert!(!av.is_null(2)); - - // Values are polymorphic and so require a downcast. 
- let ava: &UInt32Array = av.as_primitive::(); - - assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); - } - - #[test] - fn test_extend() { - let mut builder = PrimitiveRunBuilder::::new(); - builder.extend([1, 2, 2, 5, 5, 4, 4].into_iter().map(Some)); - builder.extend([4, 4, 6, 2].into_iter().map(Some)); - let array = builder.finish(); - - assert_eq!(array.len(), 11); - assert_eq!(array.null_count(), 0); - assert_eq!(array.logical_null_count(), 0); - assert_eq!(array.run_ends().values(), &[1, 3, 5, 9, 10, 11]); - assert_eq!( - array.values().as_primitive::().values(), - &[1, 2, 5, 4, 6, 2] - ); - } -} diff --git a/arrow-array/src/builder2/struct_builder.rs b/arrow-array/src/builder2/struct_builder.rs deleted file mode 100644 index b765a6d2e179..000000000000 --- a/arrow-array/src/builder2/struct_builder.rs +++ /dev/null @@ -1,872 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::StructArray; -use crate::{ - builder2::*, - types::{Int16Type, Int32Type, Int64Type, Int8Type}, -}; -use arrow_buffer::NullBufferBuilder; -use arrow_schema::{DataType, Fields, IntervalUnit, SchemaBuilder, TimeUnit}; -use std::sync::Arc; - -/// Builder for [`StructArray`] -/// -/// Note that callers should make sure that methods of all the child field builders are -/// properly called to maintain the consistency of the data structure. -/// -/// -/// Handling arrays with complex layouts, such as `List>>`, in Rust can be challenging due to its strong typing system. -/// To construct a collection builder ([`ListBuilder`], [`LargeListBuilder`], or [`MapBuilder`]) using [`make_builder`], multiple calls are required. This complexity arises from the recursive approach utilized by [`StructBuilder::from_fields`]. -/// -/// Initially, [`StructBuilder::from_fields`] invokes [`make_builder`], which returns a `Box`. To obtain the specific collection builder, one must first use [`StructBuilder::field_builder`] to get a `Collection<[Box]>`. Subsequently, the `values()` result from this operation can be downcast to the desired builder type. -/// -/// For example, when working with [`ListBuilder`], you would first call [`StructBuilder::field_builder::>>`] and then downcast the [`Box`] to the specific [`StructBuilder`] you need. 
-/// -/// For a practical example see the code below: -/// -/// ```rust -/// use arrow_array::builder::{ArrayBuilder, ListBuilder, StringBuilder, StructBuilder}; -/// use arrow_schema::{DataType, Field, Fields}; -/// use std::sync::Arc; -/// -/// // This is an example column that has a List>> layout -/// let mut example_col = ListBuilder::new(StructBuilder::from_fields( -/// vec![Field::new( -/// "value_list", -/// DataType::List(Arc::new(Field::new_list_field( -/// DataType::Struct(Fields::from(vec![ -/// Field::new("key", DataType::Utf8, true), -/// Field::new("value", DataType::Utf8, true), -/// ])), //In this example we are trying to get to this builder and insert key/value pairs -/// true, -/// ))), -/// true, -/// )], -/// 0, -/// )); -/// -/// // We can obtain the StructBuilder without issues, because example_col was created with StructBuilder -/// let col_struct_builder: &mut StructBuilder = example_col.values(); -/// -/// // We can't obtain the ListBuilder with the expected generic types, because under the hood -/// // the StructBuilder was returned as a Box and passed as such to the ListBuilder constructor -/// -/// // This panics in runtime, even though we know that the builder is a ListBuilder. -/// // let sb = col_struct_builder -/// // .field_builder::>(0) -/// // .as_mut() -/// // .unwrap(); -/// -/// //To keep in line with Rust's strong typing, we fetch a ListBuilder> from the column StructBuilder first... -/// let mut list_builder_option = -/// col_struct_builder.field_builder::>>(0); -/// -/// let list_builder = list_builder_option.as_mut().unwrap(); -/// -/// // ... 
and then downcast the key/value pair values to a StructBuilder -/// let struct_builder = list_builder -/// .values() -/// .as_any_mut() -/// .downcast_mut::() -/// .unwrap(); -/// -/// // We can now append values to the StructBuilder -/// let key_builder = struct_builder.field_builder::(0).unwrap(); -/// key_builder.append_value("my key"); -/// -/// let value_builder = struct_builder.field_builder::(1).unwrap(); -/// value_builder.append_value("my value"); -/// -/// struct_builder.append(true); -/// list_builder.append(true); -/// col_struct_builder.append(true); -/// example_col.append(true); -/// -/// let array = example_col.finish(); -/// -/// println!("My array: {:?}", array); -/// ``` -/// -pub struct StructBuilder { - fields: Fields, - field_builders: Vec>, - null_buffer_builder: NullBufferBuilder, -} - -impl std::fmt::Debug for StructBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("StructBuilder") - .field("fields", &self.fields) - .field("bitmap_builder", &self.null_buffer_builder) - .field("len", &self.len()) - .finish() - } -} - -impl ArrayBuilder for StructBuilder { - /// Returns the number of array slots in the builder. - /// - /// Note that this always return the first child field builder's length, and it is - /// the caller's responsibility to maintain the consistency that all the child field - /// builder should have the equal number of elements. - fn len(&self) -> usize { - self.null_buffer_builder.len() - } - - /// Builds the array. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. - fn finish_cloned(&self) -> ArrayRef { - Arc::new(self.finish_cloned()) - } - - /// Returns the builder as a non-mutable `Any` reference. - /// - /// This is most useful when one wants to call non-mutable APIs on a specific builder - /// type. 
In this case, one can first cast this into a `Any`, and then use - /// `downcast_ref` to get a reference on the specific builder. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - /// - /// This is most useful when one wants to call mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_mut` to get a reference on the specific builder. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } -} - -/// Returns a builder with capacity for `capacity` elements of datatype -/// `DataType`. -/// -/// This function is useful to construct arrays from an arbitrary vectors with -/// known/expected schema. -/// -/// See comments on [StructBuilder] for retrieving collection builders built by -/// make_builder. -pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { - use crate::builder::*; - match datatype { - DataType::Null => Box::new(NullBuilder::new()), - DataType::Boolean => Box::new(BooleanBuilder::with_capacity(capacity)), - DataType::Int8 => Box::new(Int8Builder::with_capacity(capacity)), - DataType::Int16 => Box::new(Int16Builder::with_capacity(capacity)), - DataType::Int32 => Box::new(Int32Builder::with_capacity(capacity)), - DataType::Int64 => Box::new(Int64Builder::with_capacity(capacity)), - DataType::UInt8 => Box::new(UInt8Builder::with_capacity(capacity)), - DataType::UInt16 => Box::new(UInt16Builder::with_capacity(capacity)), - DataType::UInt32 => Box::new(UInt32Builder::with_capacity(capacity)), - DataType::UInt64 => Box::new(UInt64Builder::with_capacity(capacity)), - DataType::Float16 => Box::new(Float16Builder::with_capacity(capacity)), - DataType::Float32 => Box::new(Float32Builder::with_capacity(capacity)), - DataType::Float64 => Box::new(Float64Builder::with_capacity(capacity)), - DataType::Binary => 
Box::new(BinaryBuilder::with_capacity(capacity, 1024)), - DataType::LargeBinary => Box::new(LargeBinaryBuilder::with_capacity(capacity, 1024)), - DataType::FixedSizeBinary(len) => { - Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len)) - } - DataType::Decimal128(p, s) => Box::new( - Decimal128Builder::with_capacity(capacity).with_data_type(DataType::Decimal128(*p, *s)), - ), - DataType::Decimal256(p, s) => Box::new( - Decimal256Builder::with_capacity(capacity).with_data_type(DataType::Decimal256(*p, *s)), - ), - DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, 1024)), - DataType::LargeUtf8 => Box::new(LargeStringBuilder::with_capacity(capacity, 1024)), - DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)), - DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)), - DataType::Time32(TimeUnit::Second) => { - Box::new(Time32SecondBuilder::with_capacity(capacity)) - } - DataType::Time32(TimeUnit::Millisecond) => { - Box::new(Time32MillisecondBuilder::with_capacity(capacity)) - } - DataType::Time64(TimeUnit::Microsecond) => { - Box::new(Time64MicrosecondBuilder::with_capacity(capacity)) - } - DataType::Time64(TimeUnit::Nanosecond) => { - Box::new(Time64NanosecondBuilder::with_capacity(capacity)) - } - DataType::Timestamp(TimeUnit::Second, tz) => Box::new( - TimestampSecondBuilder::with_capacity(capacity) - .with_data_type(DataType::Timestamp(TimeUnit::Second, tz.clone())), - ), - DataType::Timestamp(TimeUnit::Millisecond, tz) => Box::new( - TimestampMillisecondBuilder::with_capacity(capacity) - .with_data_type(DataType::Timestamp(TimeUnit::Millisecond, tz.clone())), - ), - DataType::Timestamp(TimeUnit::Microsecond, tz) => Box::new( - TimestampMicrosecondBuilder::with_capacity(capacity) - .with_data_type(DataType::Timestamp(TimeUnit::Microsecond, tz.clone())), - ), - DataType::Timestamp(TimeUnit::Nanosecond, tz) => Box::new( - TimestampNanosecondBuilder::with_capacity(capacity) - 
.with_data_type(DataType::Timestamp(TimeUnit::Nanosecond, tz.clone())), - ), - DataType::Interval(IntervalUnit::YearMonth) => { - Box::new(IntervalYearMonthBuilder::with_capacity(capacity)) - } - DataType::Interval(IntervalUnit::DayTime) => { - Box::new(IntervalDayTimeBuilder::with_capacity(capacity)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - Box::new(IntervalMonthDayNanoBuilder::with_capacity(capacity)) - } - DataType::Duration(TimeUnit::Second) => { - Box::new(DurationSecondBuilder::with_capacity(capacity)) - } - DataType::Duration(TimeUnit::Millisecond) => { - Box::new(DurationMillisecondBuilder::with_capacity(capacity)) - } - DataType::Duration(TimeUnit::Microsecond) => { - Box::new(DurationMicrosecondBuilder::with_capacity(capacity)) - } - DataType::Duration(TimeUnit::Nanosecond) => { - Box::new(DurationNanosecondBuilder::with_capacity(capacity)) - } - DataType::List(field) => { - let builder = make_builder(field.data_type(), capacity); - Box::new(ListBuilder::with_capacity(builder, capacity).with_field(field.clone())) - } - DataType::LargeList(field) => { - let builder = make_builder(field.data_type(), capacity); - Box::new(LargeListBuilder::with_capacity(builder, capacity).with_field(field.clone())) - } - DataType::FixedSizeList(field, size) => { - let size = *size; - let values_builder_capacity = { - let size: usize = size.try_into().unwrap(); - capacity * size - }; - let builder = make_builder(field.data_type(), values_builder_capacity); - Box::new( - FixedSizeListBuilder::with_capacity(builder, size, capacity) - .with_field(field.clone()), - ) - } - DataType::Map(field, _) => match field.data_type() { - DataType::Struct(fields) => { - let map_field_names = MapFieldNames { - key: fields[0].name().clone(), - value: fields[1].name().clone(), - entry: field.name().clone(), - }; - let key_builder = make_builder(fields[0].data_type(), capacity); - let value_builder = make_builder(fields[1].data_type(), capacity); - Box::new( - 
MapBuilder::with_capacity( - Some(map_field_names), - key_builder, - value_builder, - capacity, - ) - .with_values_field(fields[1].clone()), - ) - } - t => panic!("The field of Map data type {t:?} should has a child Struct field"), - }, - DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)), - t @ DataType::Dictionary(key_type, value_type) => { - macro_rules! dict_builder { - ($key_type:ty) => { - match &**value_type { - DataType::Utf8 => { - let dict_builder: StringDictionaryBuilder<$key_type> = - StringDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) - } - DataType::LargeUtf8 => { - let dict_builder: LargeStringDictionaryBuilder<$key_type> = - LargeStringDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) - } - DataType::Binary => { - let dict_builder: BinaryDictionaryBuilder<$key_type> = - BinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) - } - DataType::LargeBinary => { - let dict_builder: LargeBinaryDictionaryBuilder<$key_type> = - LargeBinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) - } - t => panic!("Dictionary value type {t:?} is not currently supported"), - } - }; - } - match &**key_type { - DataType::Int8 => dict_builder!(Int8Type), - DataType::Int16 => dict_builder!(Int16Type), - DataType::Int32 => dict_builder!(Int32Type), - DataType::Int64 => dict_builder!(Int64Type), - _ => { - panic!("Data type {t:?} with key type {key_type:?} is not currently supported") - } - } - } - t => panic!("Data type {t:?} is not currently supported"), - } -} - -impl StructBuilder { - /// Creates a new `StructBuilder` - pub fn new(fields: impl Into, field_builders: Vec>) -> Self { - Self { - field_builders, - fields: fields.into(), - null_buffer_builder: NullBufferBuilder::new(0), - } - } - - /// Creates a new `StructBuilder` from [`Fields`] and `capacity` - pub fn from_fields(fields: impl Into, capacity: 
usize) -> Self { - let fields = fields.into(); - let mut builders = Vec::with_capacity(fields.len()); - for field in &fields { - builders.push(make_builder(field.data_type(), capacity)); - } - Self::new(fields, builders) - } - - /// Returns a mutable reference to the child field builder at index `i`. - /// Result will be `None` if the input type `T` provided doesn't match the actual - /// field builder's type. - pub fn field_builder(&mut self, i: usize) -> Option<&mut T> { - self.field_builders[i].as_any_mut().downcast_mut::() - } - - /// Returns the number of fields for the struct this builder is building. - pub fn num_fields(&self) -> usize { - self.field_builders.len() - } - - /// Appends an element (either null or non-null) to the struct. The actual elements - /// should be appended for each child sub-array in a consistent way. - #[inline] - pub fn append(&mut self, is_valid: bool) { - self.null_buffer_builder.append(is_valid); - } - - /// Appends a null element to the struct. - #[inline] - pub fn append_null(&mut self) { - self.append(false) - } - - /// Builds the `StructArray` and reset this builder. - pub fn finish(&mut self) -> StructArray { - self.validate_content(); - if self.fields.is_empty() { - return StructArray::new_empty_fields(self.len(), self.null_buffer_builder.finish()); - } - - let arrays = self.field_builders.iter_mut().map(|f| f.finish()).collect(); - let nulls = self.null_buffer_builder.finish(); - StructArray::new(self.fields.clone(), arrays, nulls) - } - - /// Builds the `StructArray` without resetting the builder. 
- pub fn finish_cloned(&self) -> StructArray { - self.validate_content(); - - if self.fields.is_empty() { - return StructArray::new_empty_fields( - self.len(), - self.null_buffer_builder.finish_cloned(), - ); - } - - let arrays = self - .field_builders - .iter() - .map(|f| f.finish_cloned()) - .collect(); - - let nulls = self.null_buffer_builder.finish_cloned(); - - StructArray::new(self.fields.clone(), arrays, nulls) - } - - /// Constructs and validates contents in the builder to ensure that - /// - fields and field_builders are of equal length - /// - the number of items in individual field_builders are equal to self.len() - fn validate_content(&self) { - if self.fields.len() != self.field_builders.len() { - panic!("Number of fields is not equal to the number of field_builders."); - } - self.field_builders.iter().enumerate().for_each(|(idx, x)| { - if x.len() != self.len() { - let builder = SchemaBuilder::from(&self.fields); - let schema = builder.finish(); - - panic!("{}", format!( - "StructBuilder ({:?}) and field_builder with index {} ({:?}) are of unequal lengths: ({} != {}).", - schema, - idx, - self.fields[idx].data_type(), - self.len(), - x.len() - )); - } - }); - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.null_buffer_builder.as_slice() - } -} - -#[cfg(test)] -mod tests { - use std::any::type_name; - - use super::*; - use arrow_buffer::Buffer; - use arrow_data::ArrayData; - use arrow_schema::Field; - - use crate::{array::Array, types::ArrowDictionaryKeyType}; - - #[test] - fn test_struct_array_builder() { - let string_builder = StringBuilder::new(); - let int_builder = Int32Builder::new(); - - let fields = vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ]; - let field_builders = vec![ - Box::new(string_builder) as Box, - Box::new(int_builder) as Box, - ]; - - let mut builder = StructBuilder::new(fields, field_builders); - assert_eq!(2, 
builder.num_fields()); - - let string_builder = builder - .field_builder::(0) - .expect("builder at field 0 should be string builder"); - string_builder.append_value("joe"); - string_builder.append_null(); - string_builder.append_null(); - string_builder.append_value("mark"); - - let int_builder = builder - .field_builder::(1) - .expect("builder at field 1 should be int builder"); - int_builder.append_value(1); - int_builder.append_value(2); - int_builder.append_null(); - int_builder.append_value(4); - - builder.append(true); - builder.append(true); - builder.append_null(); - builder.append(true); - - let struct_data = builder.finish().into_data(); - - assert_eq!(4, struct_data.len()); - assert_eq!(1, struct_data.null_count()); - assert_eq!(&[11_u8], struct_data.nulls().unwrap().validity()); - - let expected_string_data = ArrayData::builder(DataType::Utf8) - .len(4) - .null_bit_buffer(Some(Buffer::from(&[9_u8]))) - .add_buffer(Buffer::from_slice_ref([0, 3, 3, 3, 7])) - .add_buffer(Buffer::from_slice_ref(b"joemark")) - .build() - .unwrap(); - - let expected_int_data = ArrayData::builder(DataType::Int32) - .len(4) - .null_bit_buffer(Some(Buffer::from_slice_ref([11_u8]))) - .add_buffer(Buffer::from_slice_ref([1, 2, 0, 4])) - .build() - .unwrap(); - - assert_eq!(expected_string_data, struct_data.child_data()[0]); - assert_eq!(expected_int_data, struct_data.child_data()[1]); - } - - #[test] - fn test_struct_array_builder_finish() { - let int_builder = Int32Builder::new(); - let bool_builder = BooleanBuilder::new(); - - let fields = vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Boolean, false), - ]; - let field_builders = vec![ - Box::new(int_builder) as Box, - Box::new(bool_builder) as Box, - ]; - - let mut builder = StructBuilder::new(fields, field_builders); - builder - .field_builder::(0) - .unwrap() - .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); - builder - .field_builder::(1) - .unwrap() - .append_slice(&[ - false, true, false, 
true, false, true, false, true, false, true, - ]); - - // Append slot values - all are valid. - for _ in 0..10 { - builder.append(true); - } - - assert_eq!(10, builder.len()); - - let arr = builder.finish(); - - assert_eq!(10, arr.len()); - assert_eq!(0, builder.len()); - - builder - .field_builder::(0) - .unwrap() - .append_slice(&[1, 3, 5, 7, 9]); - builder - .field_builder::(1) - .unwrap() - .append_slice(&[false, true, false, true, false]); - - // Append slot values - all are valid. - for _ in 0..5 { - builder.append(true); - } - - assert_eq!(5, builder.len()); - - let arr = builder.finish(); - - assert_eq!(5, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_build_fixed_size_list() { - const LIST_LENGTH: i32 = 4; - let fixed_size_list_dtype = - DataType::new_fixed_size_list(DataType::Int32, LIST_LENGTH, false); - let mut builder = make_builder(&fixed_size_list_dtype, 10); - let builder = builder - .as_any_mut() - .downcast_mut::>>(); - match builder { - Some(builder) => { - assert_eq!(builder.value_length(), LIST_LENGTH); - assert!(builder - .values() - .as_any_mut() - .downcast_mut::() - .is_some()); - } - None => panic!("expected FixedSizeListBuilder, got a different builder type"), - } - } - - #[test] - fn test_struct_array_builder_finish_cloned() { - let int_builder = Int32Builder::new(); - let bool_builder = BooleanBuilder::new(); - - let fields = vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Boolean, false), - ]; - let field_builders = vec![ - Box::new(int_builder) as Box, - Box::new(bool_builder) as Box, - ]; - - let mut builder = StructBuilder::new(fields, field_builders); - builder - .field_builder::(0) - .unwrap() - .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); - builder - .field_builder::(1) - .unwrap() - .append_slice(&[ - false, true, false, true, false, true, false, true, false, true, - ]); - - // Append slot values - all are valid. 
- for _ in 0..10 { - builder.append(true); - } - - assert_eq!(10, builder.len()); - - let mut arr = builder.finish_cloned(); - - assert_eq!(10, arr.len()); - assert_eq!(10, builder.len()); - - builder - .field_builder::(0) - .unwrap() - .append_slice(&[1, 3, 5, 7, 9]); - builder - .field_builder::(1) - .unwrap() - .append_slice(&[false, true, false, true, false]); - - // Append slot values - all are valid. - for _ in 0..5 { - builder.append(true); - } - - assert_eq!(15, builder.len()); - - arr = builder.finish(); - - assert_eq!(15, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_struct_array_builder_from_schema() { - let mut fields = vec![ - Field::new("f1", DataType::Float32, false), - Field::new("f2", DataType::Utf8, false), - ]; - let sub_fields = vec![ - Field::new("g1", DataType::Int32, false), - Field::new("g2", DataType::Boolean, false), - ]; - let struct_type = DataType::Struct(sub_fields.into()); - fields.push(Field::new("f3", struct_type, false)); - - let mut builder = StructBuilder::from_fields(fields, 5); - assert_eq!(3, builder.num_fields()); - assert!(builder.field_builder::(0).is_some()); - assert!(builder.field_builder::(1).is_some()); - assert!(builder.field_builder::(2).is_some()); - } - - #[test] - fn test_datatype_properties() { - let fields = Fields::from(vec![ - Field::new("f1", DataType::Decimal128(1, 2), false), - Field::new( - "f2", - DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())), - false, - ), - ]); - let mut builder = StructBuilder::from_fields(fields.clone(), 1); - builder - .field_builder::(0) - .unwrap() - .append_value(1); - builder - .field_builder::(1) - .unwrap() - .append_value(1); - builder.append(true); - let array = builder.finish(); - - assert_eq!(array.data_type(), &DataType::Struct(fields.clone())); - assert_eq!(array.column(0).data_type(), fields[0].data_type()); - assert_eq!(array.column(1).data_type(), fields[1].data_type()); - } - - #[test] - fn 
test_struct_array_builder_from_dictionary_type_int8_key() { - test_struct_array_builder_from_dictionary_type_inner::(DataType::Int8); - } - - #[test] - fn test_struct_array_builder_from_dictionary_type_int16_key() { - test_struct_array_builder_from_dictionary_type_inner::(DataType::Int16); - } - - #[test] - fn test_struct_array_builder_from_dictionary_type_int32_key() { - test_struct_array_builder_from_dictionary_type_inner::(DataType::Int32); - } - - #[test] - fn test_struct_array_builder_from_dictionary_type_int64_key() { - test_struct_array_builder_from_dictionary_type_inner::(DataType::Int64); - } - - fn test_struct_array_builder_from_dictionary_type_inner( - key_type: DataType, - ) { - let dict_field = Field::new( - "f1", - DataType::Dictionary(Box::new(key_type), Box::new(DataType::Utf8)), - false, - ); - let fields = vec![dict_field.clone()]; - let expected_dtype = DataType::Struct(fields.into()); - let cloned_dict_field = dict_field.clone(); - let expected_child_dtype = dict_field.data_type(); - let mut struct_builder = StructBuilder::from_fields(vec![cloned_dict_field], 5); - let Some(dict_builder) = struct_builder.field_builder::>(0) - else { - panic!( - "Builder should be StringDictionaryBuilder<{}>", - type_name::() - ) - }; - dict_builder.append_value("dict string"); - struct_builder.append(true); - let array = struct_builder.finish(); - - assert_eq!(array.data_type(), &expected_dtype); - assert_eq!(array.column(0).data_type(), expected_child_dtype); - assert_eq!(array.column(0).len(), 1); - } - - #[test] - #[should_panic( - expected = "Data type Dictionary(UInt64, Utf8) with key type UInt64 is not currently supported" - )] - fn test_struct_array_builder_from_schema_unsupported_type() { - let fields = vec![ - Field::new("f1", DataType::UInt64, false), - Field::new( - "f2", - DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - false, - ), - ]; - - let _ = StructBuilder::from_fields(fields, 5); - } - - #[test] - 
#[should_panic(expected = "Dictionary value type Int32 is not currently supported")] - fn test_struct_array_builder_from_dict_with_unsupported_value_type() { - let fields = vec![Field::new( - "f1", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Int32)), - false, - )]; - - let _ = StructBuilder::from_fields(fields, 5); - } - - #[test] - fn test_struct_array_builder_field_builder_type_mismatch() { - let int_builder = Int32Builder::with_capacity(10); - - let fields = vec![Field::new("f1", DataType::Int32, false)]; - let field_builders = vec![Box::new(int_builder) as Box]; - - let mut builder = StructBuilder::new(fields, field_builders); - assert!(builder.field_builder::(0).is_none()); - } - - #[test] - #[should_panic( - expected = "StructBuilder (Schema { fields: [Field { name: \"f1\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"f2\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }) and field_builder with index 1 (Boolean) are of unequal lengths: (2 != 1)." 
- )] - fn test_struct_array_builder_unequal_field_builders_lengths() { - let mut int_builder = Int32Builder::with_capacity(10); - let mut bool_builder = BooleanBuilder::new(); - - int_builder.append_value(1); - int_builder.append_value(2); - bool_builder.append_value(true); - - let fields = vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Boolean, false), - ]; - let field_builders = vec![ - Box::new(int_builder) as Box, - Box::new(bool_builder) as Box, - ]; - - let mut builder = StructBuilder::new(fields, field_builders); - builder.append(true); - builder.append(true); - builder.finish(); - } - - #[test] - #[should_panic(expected = "Number of fields is not equal to the number of field_builders.")] - fn test_struct_array_builder_unequal_field_field_builders() { - let int_builder = Int32Builder::with_capacity(10); - - let fields = vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Boolean, false), - ]; - let field_builders = vec![Box::new(int_builder) as Box]; - - let mut builder = StructBuilder::new(fields, field_builders); - builder.finish(); - } - - #[test] - #[should_panic( - expected = "Incorrect datatype for StructArray field \\\"timestamp\\\", expected Timestamp(Nanosecond, Some(\\\"UTC\\\")) got Timestamp(Nanosecond, None)" - )] - fn test_struct_array_mismatch_builder() { - let fields = vec![Field::new( - "timestamp", - DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_owned().into())), - false, - )]; - - let field_builders: Vec> = - vec![Box::new(TimestampNanosecondBuilder::new())]; - - let mut sa = StructBuilder::new(fields, field_builders); - sa.finish(); - } - - #[test] - fn test_empty() { - let mut builder = StructBuilder::new(Fields::empty(), vec![]); - builder.append(true); - builder.append(false); - - let a1 = builder.finish_cloned(); - let a2 = builder.finish(); - assert_eq!(a1, a2); - assert_eq!(a1.len(), 2); - assert_eq!(a1.null_count(), 1); - assert!(a1.is_valid(0)); - 
assert!(a1.is_null(1)); - } -} diff --git a/arrow-array/src/builder2/union_builder.rs b/arrow-array/src/builder2/union_builder.rs deleted file mode 100644 index 0ddc38000899..000000000000 --- a/arrow-array/src/builder2/union_builder.rs +++ /dev/null @@ -1,313 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder2::buffer_builder::{Int32BufferBuilder, Int8BufferBuilder}; -use crate::builder2::BufferBuilder; -use crate::{make_array, ArrowPrimitiveType, UnionArray}; -use arrow_buffer::NullBufferBuilder; -use arrow_buffer::{ArrowNativeType, Buffer}; -use arrow_data::ArrayDataBuilder; -use arrow_schema::{ArrowError, DataType, Field}; -use std::any::Any; -use std::collections::BTreeMap; -use std::sync::Arc; - -/// `FieldData` is a helper struct to track the state of the fields in the `UnionBuilder`. 
-#[derive(Debug)] -struct FieldData { - /// The type id for this field - type_id: i8, - /// The Arrow data type represented in the `values_buffer`, which is untyped - data_type: DataType, - /// A buffer containing the values for this field in raw bytes - values_buffer: Box, - /// The number of array slots represented by the buffer - slots: usize, - /// A builder for the null bitmap - null_buffer_builder: NullBufferBuilder, -} - -/// A type-erased [`BufferBuilder`] used by [`FieldData`] -trait FieldDataValues: std::fmt::Debug { - fn as_mut_any(&mut self) -> &mut dyn Any; - - fn append_null(&mut self); - - fn finish(&mut self) -> Buffer; -} - -impl FieldDataValues for BufferBuilder { - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn append_null(&mut self) { - self.advance(1) - } - - fn finish(&mut self) -> Buffer { - self.finish() - } -} - -impl FieldData { - /// Creates a new `FieldData`. - fn new(type_id: i8, data_type: DataType, capacity: usize) -> Self { - Self { - type_id, - data_type, - slots: 0, - values_buffer: Box::new(BufferBuilder::::new(capacity)), - null_buffer_builder: NullBufferBuilder::new(capacity), - } - } - - /// Appends a single value to this `FieldData`'s `values_buffer`. - fn append_value(&mut self, v: T::Native) { - self.values_buffer - .as_mut_any() - .downcast_mut::>() - .expect("Tried to append unexpected type") - .append(v); - - self.null_buffer_builder.append(true); - self.slots += 1; - } - - /// Appends a null to this `FieldData`. 
- fn append_null(&mut self) { - self.values_buffer.append_null(); - self.null_buffer_builder.append(false); - self.slots += 1; - } -} - -/// Builder for [`UnionArray`] -/// -/// Example: **Dense Memory Layout** -/// -/// ``` -/// # use arrow_array::builder::UnionBuilder; -/// # use arrow_array::types::{Float64Type, Int32Type}; -/// -/// let mut builder = UnionBuilder::new_dense(); -/// builder.append::("a", 1).unwrap(); -/// builder.append::("b", 3.0).unwrap(); -/// builder.append::("a", 4).unwrap(); -/// let union = builder.build().unwrap(); -/// -/// assert_eq!(union.type_id(0), 0); -/// assert_eq!(union.type_id(1), 1); -/// assert_eq!(union.type_id(2), 0); -/// -/// assert_eq!(union.value_offset(0), 0); -/// assert_eq!(union.value_offset(1), 0); -/// assert_eq!(union.value_offset(2), 1); -/// ``` -/// -/// Example: **Sparse Memory Layout** -/// ``` -/// # use arrow_array::builder::UnionBuilder; -/// # use arrow_array::types::{Float64Type, Int32Type}; -/// -/// let mut builder = UnionBuilder::new_sparse(); -/// builder.append::("a", 1).unwrap(); -/// builder.append::("b", 3.0).unwrap(); -/// builder.append::("a", 4).unwrap(); -/// let union = builder.build().unwrap(); -/// -/// assert_eq!(union.type_id(0), 0); -/// assert_eq!(union.type_id(1), 1); -/// assert_eq!(union.type_id(2), 0); -/// -/// assert_eq!(union.value_offset(0), 0); -/// assert_eq!(union.value_offset(1), 1); -/// assert_eq!(union.value_offset(2), 2); -/// ``` -#[derive(Debug)] -pub struct UnionBuilder { - /// The current number of slots in the array - len: usize, - /// Maps field names to `FieldData` instances which track the builders for that field - fields: BTreeMap, - /// Builder to keep track of type ids - type_id_builder: Int8BufferBuilder, - /// Builder to keep track of offsets (`None` for sparse unions) - value_offset_builder: Option, - initial_capacity: usize, -} - -impl UnionBuilder { - /// Creates a new dense array builder. 
- pub fn new_dense() -> Self { - Self::with_capacity_dense(1024) - } - - /// Creates a new sparse array builder. - pub fn new_sparse() -> Self { - Self::with_capacity_sparse(1024) - } - - /// Creates a new dense array builder with capacity. - pub fn with_capacity_dense(capacity: usize) -> Self { - Self { - len: 0, - fields: Default::default(), - type_id_builder: Int8BufferBuilder::new(capacity), - value_offset_builder: Some(Int32BufferBuilder::new(capacity)), - initial_capacity: capacity, - } - } - - /// Creates a new sparse array builder with capacity. - pub fn with_capacity_sparse(capacity: usize) -> Self { - Self { - len: 0, - fields: Default::default(), - type_id_builder: Int8BufferBuilder::new(capacity), - value_offset_builder: None, - initial_capacity: capacity, - } - } - - /// Appends a null to this builder, encoding the null in the array - /// of the `type_name` child / field. - /// - /// Since `UnionArray` encodes nulls as an entry in its children - /// (it doesn't have a validity bitmap itself), and where the null - /// is part of the final array, appending a NULL requires - /// specifying which field (child) to use. - #[inline] - pub fn append_null( - &mut self, - type_name: &str, - ) -> Result<(), ArrowError> { - self.append_option::(type_name, None) - } - - /// Appends a value to this builder. 
- #[inline] - pub fn append( - &mut self, - type_name: &str, - v: T::Native, - ) -> Result<(), ArrowError> { - self.append_option::(type_name, Some(v)) - } - - fn append_option( - &mut self, - type_name: &str, - v: Option, - ) -> Result<(), ArrowError> { - let type_name = type_name.to_string(); - - let mut field_data = match self.fields.remove(&type_name) { - Some(data) => { - if data.data_type != T::DATA_TYPE { - return Err(ArrowError::InvalidArgumentError(format!( - "Attempt to write col \"{}\" with type {} doesn't match existing type {}", - type_name, - T::DATA_TYPE, - data.data_type - ))); - } - data - } - None => match self.value_offset_builder { - Some(_) => FieldData::new::( - self.fields.len() as i8, - T::DATA_TYPE, - self.initial_capacity, - ), - // In the case of a sparse union, we should pass the maximum of the currently length and the capacity. - None => { - let mut fd = FieldData::new::( - self.fields.len() as i8, - T::DATA_TYPE, - self.len.max(self.initial_capacity), - ); - for _ in 0..self.len { - fd.append_null(); - } - fd - } - }, - }; - self.type_id_builder.append(field_data.type_id); - - match &mut self.value_offset_builder { - // Dense Union - Some(offset_builder) => { - offset_builder.append(field_data.slots as i32); - } - // Sparse Union - None => { - for (_, fd) in self.fields.iter_mut() { - // Append to all bar the FieldData currently being appended to - fd.append_null(); - } - } - } - - match v { - Some(v) => field_data.append_value::(v), - None => field_data.append_null(), - } - - self.fields.insert(type_name, field_data); - self.len += 1; - Ok(()) - } - - /// Builds this builder creating a new `UnionArray`. 
- pub fn build(self) -> Result { - let mut children = Vec::with_capacity(self.fields.len()); - let union_fields = self - .fields - .into_iter() - .map( - |( - name, - FieldData { - type_id, - data_type, - mut values_buffer, - slots, - mut null_buffer_builder, - }, - )| { - let array_ref = make_array(unsafe { - ArrayDataBuilder::new(data_type.clone()) - .add_buffer(values_buffer.finish()) - .len(slots) - .nulls(null_buffer_builder.finish()) - .build_unchecked() - }); - children.push(array_ref); - (type_id, Arc::new(Field::new(name, data_type, false))) - }, - ) - .collect(); - UnionArray::try_new( - union_fields, - self.type_id_builder.into(), - self.value_offset_builder.map(Into::into), - children, - ) - } -} From a8cb7d0c0e454850d18feabd8c5f03bfb61f031a Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:28:45 +0200 Subject: [PATCH 4/5] implement better builder --- arrow-array/src/array/boolean_array.rs | 12 + arrow-array/src/array/list_array.rs | 12 + arrow-array/src/array/primitive_array.rs | 13 + arrow-array/src/builder/boolean_builder.rs | 47 +- .../src/builder/fixed_size_binary_builder.rs | 39 +- .../src/builder/fixed_size_list_builder.rs | 55 +- .../src/builder/generic_byte_run_builder.rs | 7 +- .../src/builder/generic_bytes_builder.rs | 42 +- .../src/builder/generic_bytes_view_builder.rs | 40 +- .../src/builder/generic_list_builder.rs | 128 ++- arrow-array/src/builder/mod.rs | 43 +- arrow-array/src/builder/primitive_builder.rs | 38 +- arrow-array/src/builder2/boolean_builder.rs | 318 -------- .../src/builder2/fixed_size_binary_builder.rs | 270 ------- .../src/builder2/fixed_size_list_builder.rs | 508 ------------ .../src/builder2/generic_bytes_builder.rs | 607 -------------- .../builder2/generic_bytes_view_builder.rs | 743 ------------------ .../src/builder2/generic_list_builder.rs | 740 ----------------- arrow-array/src/builder2/mod.rs | 365 --------- arrow-array/src/builder2/primitive_builder.rs 
| 637 --------------- arrow-array/src/lib.rs | 1 - 21 files changed, 456 insertions(+), 4209 deletions(-) delete mode 100644 arrow-array/src/builder2/boolean_builder.rs delete mode 100644 arrow-array/src/builder2/fixed_size_binary_builder.rs delete mode 100644 arrow-array/src/builder2/fixed_size_list_builder.rs delete mode 100644 arrow-array/src/builder2/generic_bytes_builder.rs delete mode 100644 arrow-array/src/builder2/generic_bytes_view_builder.rs delete mode 100644 arrow-array/src/builder2/generic_list_builder.rs delete mode 100644 arrow-array/src/builder2/mod.rs delete mode 100644 arrow-array/src/builder2/primitive_builder.rs diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 9c2d4af8c454..22af62471b44 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -352,6 +352,18 @@ impl ArrayAccessor for &BooleanArray { } } +impl ArrayAccessor for BooleanArray { + type Item = bool; + + fn value(&self, index: usize) -> Self::Item { + self.value(index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + self.value_unchecked(index) + } +} + impl From> for BooleanArray { fn from(data: Vec) -> Self { let mut mut_buf = MutableBuffer::new_null(data.len()); diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index bed0bdf889b2..34becd35b663 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -537,6 +537,18 @@ impl ArrayAccessor for &GenericListArray ArrayAccessor for GenericListArray { + type Item = ArrayRef; + + fn value(&self, index: usize) -> Self::Item { + self.value(index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + self.value_unchecked(index) + } +} + impl std::fmt::Debug for GenericListArray { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let prefix = OffsetSize::PREFIX; diff --git a/arrow-array/src/array/primitive_array.rs 
b/arrow-array/src/array/primitive_array.rs index 57aa23bf9040..fc9ad155ef92 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1197,6 +1197,19 @@ impl ArrayAccessor for &PrimitiveArray { } } +impl ArrayAccessor for PrimitiveArray { + type Item = T::Native; + + fn value(&self, index: usize) -> Self::Item { + self.value(index) + } + + #[inline] + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + self.value_unchecked(index) + } +} + impl PrimitiveArray where i64: From, diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 60ed86ce80b4..f630d4ea89b1 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::{ArrayBuilder, BooleanBufferBuilder}; -use crate::{ArrayRef, BooleanArray}; +use crate::builder::{SpecificArrayBuilder, ArrayBuilder, BooleanBufferBuilder}; +use crate::{Array, ArrayRef, BooleanArray}; use arrow_buffer::Buffer; use arrow_buffer::NullBufferBuilder; use arrow_data::ArrayData; @@ -219,6 +219,49 @@ impl ArrayBuilder for BooleanBuilder { } } + +impl SpecificArrayBuilder for BooleanBuilder { + type Output = BooleanArray; + type Item<'a> = bool; + + /// Builds the array and reset this builder. + fn finish(&mut self) -> Arc { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. 
+ fn finish_cloned(&self) -> Arc { + Arc::new(self.finish_cloned()) + } + + fn append_value(&mut self, value: bool) { + self.append_value(value) + } + + fn append_value_ref<'a>(&'a mut self, value: &'a Self::Item<'a>) { + self.append_value(*value) + } + + fn append_null(&mut self) { + self.append_null() + } + + fn append_nulls(&mut self, n: usize) { + self.append_nulls(n) + } + + fn append_output<'a>(&'a mut self, output: &'a Self::Output) { + // TODO - if iterator exists try it? + for i in 0..output.len() { + if output.is_null(i) { + self.append_null(); + } else { + self.append_value(output.value(i)); + } + } + } +} + impl Extend> for BooleanBuilder { #[inline] fn extend>>(&mut self, iter: T) { diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs index 65072a09f603..d742cb7dccc9 100644 --- a/arrow-array/src/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder::{ArrayBuilder, UInt8BufferBuilder}; -use crate::{ArrayRef, FixedSizeBinaryArray}; +use crate::builder::{ArrayBuilder, SpecificArrayBuilder, UInt8BufferBuilder}; +use crate::{Array, ArrayRef, FixedSizeBinaryArray}; use arrow_buffer::Buffer; use arrow_buffer::NullBufferBuilder; use arrow_data::ArrayData; @@ -154,6 +154,41 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { } } +impl SpecificArrayBuilder for FixedSizeBinaryBuilder { + type Output = FixedSizeBinaryArray; + type Item<'a> = &'a [u8]; + + fn finish(&mut self) -> Arc { + Arc::new(self.finish()) + } + + fn finish_cloned(&self) -> Arc { + Arc::new(self.finish_cloned()) + } + + fn append_value<'a>(&'a mut self, value: Self::Item<'a>) { + self.append_value(value).unwrap() + } + + fn append_value_ref<'a>(&'a mut self, value: &'a Self::Item<'a>) { + self.append_value(value).unwrap() + } + + fn append_null(&mut self) { + self.append_null() + } + + fn append_output<'a>(&'a mut self, output: &'a Self::Output) { + for i in 0..output.len() { + if output.is_null(i) { + self.append_null(); + } else { + self.append_value(output.value(i)).unwrap(); + } + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index 5c142b277d14..0e82fa4ede9f 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder::ArrayBuilder; -use crate::{ArrayRef, FixedSizeListArray}; +use crate::builder::{ArrayBuilder, SpecificArrayBuilder}; +use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray}; use arrow_buffer::NullBufferBuilder; use arrow_schema::{Field, FieldRef}; use std::any::Any; @@ -215,6 +215,57 @@ where } } + +impl SpecificArrayBuilder for FixedSizeListBuilder +where + ValuesOutput: Array + 'static, + T: SpecificArrayBuilder, + for<'a> &'a ValuesOutput: ArrayAccessor, + for<'a> ::Item<'a>: From<<&'a ValuesOutput as ArrayAccessor>::Item> +{ + type Output = FixedSizeListArray; + type Item<'a> = T::Output; + + /// Builds the array and reset this builder. + fn finish(&mut self) -> Arc { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> Arc { + Arc::new(self.finish_cloned()) + } + + fn append_value<'a>(&'a mut self, value: Self::Item<'a>) { + // our item is their output + self.values_builder.append_output(value.as_any().downcast_ref::().unwrap()); + self.append(true); + } + + fn append_value_ref<'a>(&'a mut self, value: &'a Self::Item<'a>) { + self.values_builder.append_output(value.as_any().downcast_ref::().unwrap()); + self.append(true); + } + + fn append_null(&mut self) { + // TODO - make sure we should append nulls to the values builder + self.values_builder.append_nulls(self.list_len as usize); + self.append(false); + } + + fn append_output<'a>(&'a mut self, output: &'a Self::Output) { + // TODO - if iterator exists try it? 
+ for i in 0..output.len() { + if output.is_null(i) { + self.append_null(); + } else { + self.values_builder.append_output(output.value(i).as_any().downcast_ref::().unwrap()); + self.append(true); + } + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 0bf5658b297e..66d7d979b3db 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -18,12 +18,9 @@ use crate::types::bytes::ByteArrayNativeType; use std::{any::Any, sync::Arc}; -use crate::{ - types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type}, - ArrayRef, ArrowPrimitiveType, RunArray, -}; +use crate::{types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type}, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, RunArray}; -use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; +use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder, SpecificArrayBuilder}; use arrow_buffer::ArrowNativeType; diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index e2be96615b61..9e265d22c5ea 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder}; +use crate::builder::{ArrayBuilder, BufferBuilder, SpecificArrayBuilder, UInt8BufferBuilder}; use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; -use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait}; +use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait}; use arrow_buffer::NullBufferBuilder; use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::ArrayDataBuilder; @@ -228,6 +228,44 @@ impl ArrayBuilder for GenericByteBuilder { } } +impl SpecificArrayBuilder for GenericByteBuilder { + type Output = GenericByteArray; + type Item<'a> = &'a T::Native; + + /// Builds the array and reset this builder. + fn finish(&mut self) -> Arc> { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> Arc> { + Arc::new(self.finish_cloned()) + } + + fn append_value(&mut self, value: &T::Native) { + self.append_value(value) + } + + fn append_value_ref<'a>(&'a mut self, value: &'a Self::Item<'a>) { + self.append_value(value) + } + + fn append_null(&mut self) { + self.append_null() + } + + fn append_output<'a>(&'a mut self, output: &'a Self::Output) { + // TODO - if iterator exists try it? 
+ for i in 0..output.len() { + if output.is_null(i) { + self.append_null(); + } else { + self.append_value(output.value(i)); + } + } + } +} + impl> Extend> for GenericByteBuilder { #[inline] fn extend>>(&mut self, iter: I) { diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 7268e751b149..c242ba220905 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -25,10 +25,10 @@ use arrow_schema::ArrowError; use hashbrown::hash_table::Entry; use hashbrown::HashTable; -use crate::builder::ArrayBuilder; +use crate::builder::{ArrayBuilder, SpecificArrayBuilder}; use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; -use crate::{ArrayRef, GenericByteViewArray}; +use crate::{Array, ArrayRef, GenericByteViewArray}; const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB @@ -452,6 +452,42 @@ impl ArrayBuilder for GenericByteViewBuilder { } } +impl SpecificArrayBuilder for GenericByteViewBuilder { + type Output = GenericByteViewArray; + type Item<'a> = &'a T::Native; + + fn finish(&mut self) -> Arc> { + Arc::new(self.finish()) + } + + fn finish_cloned(&self) -> Arc> { + Arc::new(self.finish_cloned()) + } + + fn append_value(&mut self, value: &T::Native) { + self.append_value(value) + } + + fn append_value_ref<'a>(&'a mut self, value: &'a Self::Item<'a>) { + self.append_value(value) + } + + fn append_null(&mut self) { + self.append_null() + } + + fn append_output<'a>(&'a mut self, output: &'a Self::Output) { + // TODO - if iterator exists try it? 
+ for i in 0..output.len() { + if output.is_null(i) { + self.append_null(); + } else { + self.append_value(output.value(i)); + } + } + } +} + impl> Extend> for GenericByteViewBuilder { diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index a9c88ec6c586..0d2d6f893389 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::{ArrayBuilder, BufferBuilder}; +use crate::builder::{ArrayBuilder, BufferBuilder, SpecificArrayBuilder}; use crate::{Array, ArrayRef, GenericListArray, OffsetSizeTrait}; use arrow_buffer::NullBufferBuilder; use arrow_buffer::{Buffer, OffsetBuffer}; @@ -168,6 +168,59 @@ where } } +impl SpecificArrayBuilder for GenericListBuilder +where + OffsetSize: OffsetSizeTrait, +{ + type Output = GenericListArray; + type Item<'a> = T::Output; + + /// Builds the array and reset this builder. + fn finish(&mut self) -> Arc> { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> Arc> { + Arc::new(self.finish_cloned()) + } + + fn append_value<'a>(&'a mut self, value: Self::Item<'a>) { + // our item is their output + self.values_builder + .append_output(value.as_any().downcast_ref::>().unwrap()); + self.append(true); + } + + fn append_value_ref<'a>(&'a mut self, value: &'a Self::Item<'a>) { + self.values_builder + .append_output(value.as_any().downcast_ref::>().unwrap()); + self.append(true); + } + + fn append_null(&mut self) { + self.append(false); + } + + fn append_output<'a>(&'a mut self, output: &'a Self::Output) { + // TODO - if iterator exists try it? 
+ for i in 0..output.len() { + if output.is_null(i) { + self.append_null(); + } else { + let current_value = output.value(i); + self.values_builder.append_output( + current_value + .as_any() + .downcast_ref::>() + .unwrap(), + ); + self.append(true); + } + } + } +} + impl GenericListBuilder where T: 'static, @@ -353,11 +406,12 @@ where #[cfg(test)] mod tests { + use arrow_buffer::ArrowNativeType; use super::*; - use crate::builder::{make_builder, Int32Builder, ListBuilder}; + use crate::builder::{make_builder, Int32Builder, ListBuilder, PrimitiveBuilder}; use crate::cast::AsArray; use crate::types::Int32Type; - use crate::Int32Array; + use crate::{Int32Array, ListArray}; use arrow_schema::DataType; fn _test_generic_list_array_builder() { @@ -803,4 +857,72 @@ mod tests { builder.append_value([Some(1)]); builder.finish(); } + + #[test] + fn should_be_able_to_add_from_list_as_is() { + let from: Arc = { + let primitive_builder = Int32Builder::with_capacity(10); + let values_builder = ListBuilder::new(primitive_builder); + let mut builder = ListBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true); + builder.values().values().append_value(3); + builder.values().values().append_value(4); + builder.values().append(true); + builder.append(true); + + builder.values().values().append_value(5); + builder.values().values().append_value(6); + builder.values().values().append_value(7); + builder.values().append(true); + builder.values().append(false); + builder.values().values().append_value(8); + builder.values().append(true); + builder.append(true); + + builder.append(false); + + builder.values().values().append_value(9); + builder.values().values().append_value(10); + builder.values().append(true); + builder.append(true); + + Arc::new(builder.finish()) + }; + let mut to = 
ListBuilder::new(ListBuilder::new(Int32Builder::new())); + + for i in 0..from.len() { + if from.is_valid(i) { + let item = from.value(i); + let inner_list = item + .as_any() + .downcast_ref::>() + .unwrap(); + SpecificArrayBuilder::append_value_ref(&mut to, inner_list); + } else { + to.append_null(); + } + } + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + let l1 = to.finish(); + + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6].map(i32::usize_as)); + let l2 = l1.values().as_list::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10].map(i32::usize_as)); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } } diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 89a96280eb87..ad5c28c97cc5 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -184,8 +184,9 @@ mod union_builder; pub use union_builder::*; -use crate::ArrayRef; +use crate::{Array, ArrayRef}; use std::any::Any; +use std::sync::Arc; /// Trait for dealing with different array builders at runtime /// @@ -298,6 +299,46 @@ impl ArrayBuilder for Box { } } +pub trait SpecificArrayBuilder: Any + Send + Sync + ArrayBuilder { + type Output: Array; + type Item<'a>; + + /// Builds the array + fn finish(&mut self) -> Arc; + + /// Builds the array without resetting the underlying builder. + fn finish_cloned(&self) -> Arc; + + // Append a value to the builder + fn append_value<'a>(&'a mut self, value: Self::Item<'a>); + + // Append a value to the builder + fn append_value_ref<'a>(&'a mut self, value: &'a Self::Item<'a>); + + /// Appends a null slot into the builder + fn append_null(&mut self); + + /// Appends `n` `null`s into the builder. 
+ #[inline] + fn append_nulls(&mut self, n: usize) { + for _ in 0..n { + self.append_null(); + } + } + + /// Appends an `Option` into the builder + #[inline] + fn append_option<'a>(&'a mut self, v: Option>) { + match v { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + + #[inline] + fn append_output<'a>(&'a mut self, output: &'a Self::Output); +} + /// Builder for [`ListArray`](crate::array::ListArray) pub type ListBuilder = GenericListBuilder; diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 3191fea6e407..102ebc77ad01 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::{ArrayBuilder, BufferBuilder}; +use crate::builder::{ArrayBuilder, BufferBuilder, SpecificArrayBuilder}; use crate::types::*; use crate::{ArrayRef, PrimitiveArray}; use arrow_buffer::NullBufferBuilder; @@ -132,6 +132,42 @@ impl ArrayBuilder for PrimitiveBuilder { } } + +impl SpecificArrayBuilder for PrimitiveBuilder { + type Output = PrimitiveArray; + type Item<'a> = T::Native; + + /// Builds the array and reset this builder. + fn finish(&mut self) -> Arc> { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. 
+ fn finish_cloned(&self) -> Arc> { + Arc::new(self.finish_cloned()) + } + + fn append_value(&mut self, value: T::Native) { + self.append_value(value) + } + + fn append_value_ref<'a>(&'a mut self, value: &'a Self::Item<'a>) { + self.append_value(*value) + } + + fn append_null(&mut self) { + self.append_null() + } + + fn append_nulls(&mut self, n: usize) { + self.append_nulls(n) + } + + fn append_output<'a>(&'a mut self, output: &'a PrimitiveArray) { + self.extend(output) + } +} + impl Default for PrimitiveBuilder { fn default() -> Self { Self::new() diff --git a/arrow-array/src/builder2/boolean_builder.rs b/arrow-array/src/builder2/boolean_builder.rs deleted file mode 100644 index 52ec4661d797..000000000000 --- a/arrow-array/src/builder2/boolean_builder.rs +++ /dev/null @@ -1,318 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::builder2::{SpecificArrayBuilder, BooleanBufferBuilder}; -use crate::{ArrayAccessor, ArrayRef, BooleanArray}; -use arrow_buffer::Buffer; -use arrow_buffer::NullBufferBuilder; -use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType}; -use std::any::Any; -use std::sync::Arc; - -/// Builder for [`BooleanArray`] -/// -/// # Example -/// -/// Create a `BooleanArray` from a `BooleanBuilder` -/// -/// ``` -/// -/// # use arrow_array::{Array, BooleanArray, builder::BooleanBuilder}; -/// -/// let mut b = BooleanBuilder::new(); -/// b.append_value(true); -/// b.append_null(); -/// b.append_value(false); -/// b.append_value(true); -/// let arr = b.finish(); -/// -/// assert_eq!(4, arr.len()); -/// assert_eq!(1, arr.null_count()); -/// assert_eq!(true, arr.value(0)); -/// assert!(arr.is_valid(0)); -/// assert!(!arr.is_null(0)); -/// assert!(!arr.is_valid(1)); -/// assert!(arr.is_null(1)); -/// assert_eq!(false, arr.value(2)); -/// assert!(arr.is_valid(2)); -/// assert!(!arr.is_null(2)); -/// assert_eq!(true, arr.value(3)); -/// assert!(arr.is_valid(3)); -/// assert!(!arr.is_null(3)); -/// ``` -#[derive(Debug)] -pub struct BooleanBuilder { - values_builder: BooleanBufferBuilder, - null_buffer_builder: NullBufferBuilder, -} - -impl Default for BooleanBuilder { - fn default() -> Self { - Self::new() - } -} - -impl BooleanBuilder { - /// Creates a new boolean builder - pub fn new() -> Self { - Self::with_capacity(1024) - } - - /// Creates a new boolean builder with space for `capacity` elements without re-allocating - pub fn with_capacity(capacity: usize) -> Self { - Self { - values_builder: BooleanBufferBuilder::new(capacity), - null_buffer_builder: NullBufferBuilder::new(capacity), - } - } - - /// Returns the capacity of this builder measured in slots of type `T` - pub fn capacity(&self) -> usize { - self.values_builder.capacity() - } - - /// Appends a value of type `T` into the builder - #[inline] - pub fn append_value(&mut self, v: bool) { - 
self.values_builder.append(v); - self.null_buffer_builder.append_non_null(); - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) { - self.null_buffer_builder.append_null(); - self.values_builder.advance(1); - } - - /// Appends `n` `null`s into the builder. - #[inline] - pub fn append_nulls(&mut self, n: usize) { - self.null_buffer_builder.append_n_nulls(n); - self.values_builder.advance(n); - } - - /// Appends an `Option` into the builder - #[inline] - pub fn append_option(&mut self, v: Option) { - match v { - None => self.append_null(), - Some(v) => self.append_value(v), - }; - } - - /// Appends a slice of type `T` into the builder - #[inline] - pub fn append_slice(&mut self, v: &[bool]) { - self.values_builder.append_slice(v); - self.null_buffer_builder.append_n_non_nulls(v.len()); - } - - /// Appends n `additional` bits of value `v` into the buffer - #[inline] - pub fn append_n(&mut self, additional: usize, v: bool) { - self.values_builder.append_n(additional, v); - self.null_buffer_builder.append_n_non_nulls(additional); - } - - /// Appends values from a slice of type `T` and a validity boolean slice. - /// - /// Returns an error if the slices are of different lengths - #[inline] - pub fn append_values(&mut self, values: &[bool], is_valid: &[bool]) -> Result<(), ArrowError> { - if values.len() != is_valid.len() { - Err(ArrowError::InvalidArgumentError( - "Value and validity lengths must be equal".to_string(), - )) - } else { - self.null_buffer_builder.append_slice(is_valid); - self.values_builder.append_slice(values); - Ok(()) - } - } - - /// Builds the [BooleanArray] and reset this builder. 
- pub fn finish(&mut self) -> BooleanArray { - let len = self.len(); - let null_bit_buffer = self.null_buffer_builder.finish(); - let builder = ArrayData::builder(DataType::Boolean) - .len(len) - .add_buffer(self.values_builder.finish().into_inner()) - .nulls(null_bit_buffer); - - let array_data = unsafe { builder.build_unchecked() }; - BooleanArray::from(array_data) - } - - /// Builds the [BooleanArray] without resetting the builder. - pub fn finish_cloned(&self) -> BooleanArray { - let len = self.len(); - let nulls = self.null_buffer_builder.finish_cloned(); - let value_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); - let builder = ArrayData::builder(DataType::Boolean) - .len(len) - .add_buffer(value_buffer) - .nulls(nulls); - - let array_data = unsafe { builder.build_unchecked() }; - BooleanArray::from(array_data) - } - - /// Returns the current values buffer as a slice - /// - /// Boolean values are bit-packed into bytes. To extract the i-th boolean - /// from the bytes, you can use `arrow_buffer::bit_util::get_bit()`. 
- pub fn values_slice(&self) -> &[u8] { - self.values_builder.as_slice() - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.null_buffer_builder.as_slice() - } -} - - -impl Extend> for BooleanBuilder { - #[inline] - fn extend>>(&mut self, iter: T) { - for v in iter { - self.append_option(v) - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::Array; - - #[test] - fn test_boolean_array_builder() { - // 00000010 01001000 - let buf = Buffer::from([72_u8, 2_u8]); - let mut builder = BooleanArray::builder(10); - for i in 0..10 { - if i == 3 || i == 6 || i == 9 { - builder.append_value(true); - } else { - builder.append_value(false); - } - } - - let arr = builder.finish(); - assert_eq!(&buf, arr.values().inner()); - assert_eq!(10, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..10 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {i}") - } - } - - #[test] - fn test_boolean_array_builder_append_slice() { - let arr1 = BooleanArray::from(vec![Some(true), Some(false), None, None, Some(false)]); - - let mut builder = BooleanArray::builder(0); - builder.append_slice(&[true, false]); - builder.append_null(); - builder.append_null(); - builder.append_value(false); - let arr2 = builder.finish(); - - assert_eq!(arr1, arr2); - } - - #[test] - fn test_boolean_array_builder_append_slice_large() { - let arr1 = BooleanArray::from(vec![true; 513]); - - let mut builder = BooleanArray::builder(512); - builder.append_slice(&[true; 513]); - let arr2 = builder.finish(); - - assert_eq!(arr1, arr2); - } - - #[test] - fn test_boolean_array_builder_no_null() { - let mut builder = BooleanArray::builder(0); - builder.append_option(Some(true)); - builder.append_value(false); - builder.append_slice(&[true, false, true]); - builder - .append_values(&[false, false, true], &[true, true, true]) - .unwrap(); - - 
let array = builder.finish(); - assert_eq!(0, array.null_count()); - assert!(array.nulls().is_none()); - } - - #[test] - fn test_boolean_array_builder_finish_cloned() { - let mut builder = BooleanArray::builder(16); - builder.append_option(Some(true)); - builder.append_value(false); - builder.append_slice(&[true, false, true]); - let mut array = builder.finish_cloned(); - assert_eq!(3, array.true_count()); - assert_eq!(2, array.false_count()); - - builder - .append_values(&[false, false, true], &[true, true, true]) - .unwrap(); - - array = builder.finish(); - assert_eq!(4, array.true_count()); - assert_eq!(4, array.false_count()); - - assert_eq!(0, array.null_count()); - assert!(array.nulls().is_none()); - } - - #[test] - fn test_extend() { - let mut builder = BooleanBuilder::new(); - builder.extend([false, false, true, false, false].into_iter().map(Some)); - builder.extend([true, true, false].into_iter().map(Some)); - let array = builder.finish(); - let values = array.iter().map(|x| x.unwrap()).collect::>(); - assert_eq!( - &values, - &[false, false, true, false, false, true, true, false] - ) - } - - #[test] - fn test_boolean_array_builder_append_n() { - let mut builder = BooleanBuilder::new(); - builder.append_n(3, true); - builder.append_n(2, false); - let array = builder.finish(); - assert_eq!(3, array.true_count()); - assert_eq!(2, array.false_count()); - assert_eq!(0, array.null_count()); - - let values = array.iter().map(|x| x.unwrap()).collect::>(); - assert_eq!(&values, &[true, true, true, false, false]) - } -} diff --git a/arrow-array/src/builder2/fixed_size_binary_builder.rs b/arrow-array/src/builder2/fixed_size_binary_builder.rs deleted file mode 100644 index 6c20e5562b2a..000000000000 --- a/arrow-array/src/builder2/fixed_size_binary_builder.rs +++ /dev/null @@ -1,270 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder2::{SpecificArrayBuilder}; -use crate::builder::{UInt8BufferBuilder}; -use crate::{ArrayAccessor, ArrayRef, FixedSizeBinaryArray}; -use arrow_buffer::Buffer; -use arrow_buffer::NullBufferBuilder; -use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType}; -use std::any::Any; -use std::sync::Arc; - -/// Builder for [`FixedSizeBinaryArray`] -/// ``` -/// # use arrow_array::builder::FixedSizeBinaryBuilder; -/// # use arrow_array::Array; -/// # -/// let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); -/// // [b"hello", null, b"arrow"] -/// builder.append_value(b"hello").unwrap(); -/// builder.append_null(); -/// builder.append_value(b"arrow").unwrap(); -/// -/// let array = builder.finish(); -/// assert_eq!(array.value(0), b"hello"); -/// assert!(array.is_null(1)); -/// assert_eq!(array.value(2), b"arrow"); -/// ``` -#[derive(Debug)] -pub struct FixedSizeBinaryBuilder { - values_builder: UInt8BufferBuilder, - null_buffer_builder: NullBufferBuilder, - value_length: i32, -} - -impl FixedSizeBinaryBuilder { - /// Creates a new [`FixedSizeBinaryBuilder`] - pub fn new(byte_width: i32) -> Self { - Self::with_capacity(1024, byte_width) - } - - /// Creates a new [`FixedSizeBinaryBuilder`], `capacity` is the number of byte slices - /// that can 
be appended without reallocating - pub fn with_capacity(capacity: usize, byte_width: i32) -> Self { - assert!( - byte_width >= 0, - "value length ({byte_width}) of the array must >= 0" - ); - Self { - values_builder: UInt8BufferBuilder::new(capacity * byte_width as usize), - null_buffer_builder: NullBufferBuilder::new(capacity), - value_length: byte_width, - } - } - - /// Appends a byte slice into the builder. - /// - /// Automatically update the null buffer to delimit the slice appended in as a - /// distinct value element. - #[inline] - pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<(), ArrowError> { - if self.value_length != value.as_ref().len() as i32 { - Err(ArrowError::InvalidArgumentError( - "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths" - .to_string(), - )) - } else { - self.values_builder.append_slice(value.as_ref()); - self.null_buffer_builder.append_non_null(); - Ok(()) - } - } - - /// Append a null value to the array. - #[inline] - pub fn append_null(&mut self) { - self.values_builder - .append_slice(&vec![0u8; self.value_length as usize][..]); - self.null_buffer_builder.append_null(); - } - - /// Builds the [`FixedSizeBinaryArray`] and reset this builder. - pub fn finish(&mut self) -> FixedSizeBinaryArray { - let array_length = self.len(); - let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) - .add_buffer(self.values_builder.finish()) - .nulls(self.null_buffer_builder.finish()) - .len(array_length); - let array_data = unsafe { array_data_builder.build_unchecked() }; - FixedSizeBinaryArray::from(array_data) - } - - /// Builds the [`FixedSizeBinaryArray`] without resetting the builder. 
- pub fn finish_cloned(&self) -> FixedSizeBinaryArray { - let array_length = self.len(); - let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); - let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) - .add_buffer(values_buffer) - .nulls(self.null_buffer_builder.finish_cloned()) - .len(array_length); - let array_data = unsafe { array_data_builder.build_unchecked() }; - FixedSizeBinaryArray::from(array_data) - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.null_buffer_builder.as_slice() - } -} - -impl SpecificArrayBuilder for FixedSizeBinaryBuilder { - type Output = FixedSizeBinaryArray; - - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.null_buffer_builder.len() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> Arc { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. - fn finish_cloned(&self) -> Arc { - Arc::new(self.finish_cloned()) - } - - fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { - // TODO - should panic? - // should we document it or return a Result? - self.append_value(value).expect("append value failed"); - } - - fn append_null(&mut self) { - self.append_null(); - } - - // TODO - implement append nulls with better performance? 
-} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::Array; - - #[test] - fn test_fixed_size_binary_builder() { - let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); - - // [b"hello", null, "arrow"] - builder.append_value(b"hello").unwrap(); - builder.append_null(); - builder.append_value(b"arrow").unwrap(); - let array: FixedSizeBinaryArray = builder.finish(); - - assert_eq!(&DataType::FixedSizeBinary(5), array.data_type()); - assert_eq!(3, array.len()); - assert_eq!(1, array.null_count()); - assert_eq!(10, array.value_offset(2)); - assert_eq!(5, array.value_length()); - } - - #[test] - fn test_fixed_size_binary_builder_finish_cloned() { - let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); - - // [b"hello", null, "arrow"] - builder.append_value(b"hello").unwrap(); - builder.append_null(); - builder.append_value(b"arrow").unwrap(); - let mut array: FixedSizeBinaryArray = builder.finish_cloned(); - - assert_eq!(&DataType::FixedSizeBinary(5), array.data_type()); - assert_eq!(3, array.len()); - assert_eq!(1, array.null_count()); - assert_eq!(10, array.value_offset(2)); - assert_eq!(5, array.value_length()); - - // [b"finis", null, "clone"] - builder.append_value(b"finis").unwrap(); - builder.append_null(); - builder.append_value(b"clone").unwrap(); - - array = builder.finish(); - - assert_eq!(&DataType::FixedSizeBinary(5), array.data_type()); - assert_eq!(6, array.len()); - assert_eq!(2, array.null_count()); - assert_eq!(25, array.value_offset(5)); - assert_eq!(5, array.value_length()); - } - - #[test] - fn test_fixed_size_binary_builder_with_zero_value_length() { - let mut builder = FixedSizeBinaryBuilder::new(0); - - builder.append_value(b"").unwrap(); - builder.append_null(); - builder.append_value(b"").unwrap(); - assert!(!builder.is_empty()); - - let array: FixedSizeBinaryArray = builder.finish(); - assert_eq!(&DataType::FixedSizeBinary(0), array.data_type()); - assert_eq!(3, array.len()); - assert_eq!(1, array.null_count()); - 
assert_eq!(0, array.value_offset(2)); - assert_eq!(0, array.value_length()); - assert_eq!(b"", array.value(0)); - assert_eq!(b"", array.value(2)); - } - - #[test] - #[should_panic( - expected = "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths" - )] - fn test_fixed_size_binary_builder_with_inconsistent_value_length() { - let mut builder = FixedSizeBinaryBuilder::with_capacity(1, 4); - builder.append_value(b"hello").unwrap(); - } - #[test] - fn test_fixed_size_binary_builder_empty() { - let mut builder = FixedSizeBinaryBuilder::new(5); - assert!(builder.is_empty()); - - let fixed_size_binary_array = builder.finish(); - assert_eq!( - &DataType::FixedSizeBinary(5), - fixed_size_binary_array.data_type() - ); - assert_eq!(0, fixed_size_binary_array.len()); - } - - #[test] - #[should_panic(expected = "value length (-1) of the array must >= 0")] - fn test_fixed_size_binary_builder_invalid_value_length() { - let _ = FixedSizeBinaryBuilder::with_capacity(15, -1); - } -} diff --git a/arrow-array/src/builder2/fixed_size_list_builder.rs b/arrow-array/src/builder2/fixed_size_list_builder.rs deleted file mode 100644 index f01c18b16c72..000000000000 --- a/arrow-array/src/builder2/fixed_size_list_builder.rs +++ /dev/null @@ -1,508 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder2::SpecificArrayBuilder; -use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray}; -use arrow_buffer::NullBufferBuilder; -use arrow_schema::{Field, FieldRef}; -use std::any::Any; -use std::sync::Arc; - -/// Builder for [`FixedSizeListArray`] -/// ``` -/// use arrow_array::{builder::{Int32Builder, FixedSizeListBuilder}, Array, Int32Array}; -/// let values_builder = Int32Builder::new(); -/// let mut builder = FixedSizeListBuilder::new(values_builder, 3); -/// -/// // [[0, 1, 2], null, [3, null, 5], [6, 7, null]] -/// builder.values().append_value(0); -/// builder.values().append_value(1); -/// builder.values().append_value(2); -/// builder.append(true); -/// builder.values().append_null(); -/// builder.values().append_null(); -/// builder.values().append_null(); -/// builder.append(false); -/// builder.values().append_value(3); -/// builder.values().append_null(); -/// builder.values().append_value(5); -/// builder.append(true); -/// builder.values().append_value(6); -/// builder.values().append_value(7); -/// builder.values().append_null(); -/// builder.append(true); -/// let list_array = builder.finish(); -/// assert_eq!( -/// *list_array.value(0), -/// Int32Array::from(vec![Some(0), Some(1), Some(2)]) -/// ); -/// assert!(list_array.is_null(1)); -/// assert_eq!( -/// *list_array.value(2), -/// Int32Array::from(vec![Some(3), None, Some(5)]) -/// ); -/// assert_eq!( -/// *list_array.value(3), -/// Int32Array::from(vec![Some(6), Some(7), None]) -/// ) -/// ``` -/// -#[derive(Debug)] -pub struct FixedSizeListBuilder where for<'a> &'a ::Output: ArrayAccessor { - null_buffer_builder: NullBufferBuilder, - values_builder: T, - list_len: i32, - field: Option, -} - -impl FixedSizeListBuilder where for<'a> &'a ::Output: ArrayAccessor { - /// Creates a new [`FixedSizeListBuilder`] from a given values array builder - /// `value_length` is the 
number of values within each array - pub fn new(values_builder: T, value_length: i32) -> Self { - let capacity = values_builder - .len() - .checked_div(value_length as _) - .unwrap_or_default(); - - Self::with_capacity(values_builder, value_length, capacity) - } - - /// Creates a new [`FixedSizeListBuilder`] from a given values array builder - /// `value_length` is the number of values within each array - /// `capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(values_builder: T, value_length: i32, capacity: usize) -> Self { - Self { - null_buffer_builder: NullBufferBuilder::new(capacity), - values_builder, - list_len: value_length, - field: None, - } - } - - /// Override the field passed to [`FixedSizeListArray::new`] - /// - /// By default, a nullable field is created with the name `item` - /// - /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the - /// field's data type does not match that of `T` - pub fn with_field(self, field: impl Into) -> Self { - Self { - field: Some(field.into()), - ..self - } - } -} - -impl SpecificArrayBuilder for FixedSizeListBuilder -where - ValuesOutput: Array, - T: 'static + SpecificArrayBuilder, - for<'a> &'a ValuesOutput: ArrayAccessor -{ - type Output = FixedSizeListArray; - - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.null_buffer_builder.len() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> Arc { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. 
- fn finish_cloned(&self) -> Arc { - Arc::new(self.finish_cloned()) - } - - fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { - // our item is their output - self.values_builder.append_output(value.as_any().downcast_ref::().unwrap()); - self.append(true); - } - - fn append_null(&mut self) { - // TODO - make sure we should append nulls to the values builder - self.values_builder.append_nulls(self.list_len as usize); - self.append(false); - } -} - -impl FixedSizeListBuilder -where - T: 'static, for<'a> &'a ::Output: ArrayAccessor -{ - /// Returns the child array builder as a mutable reference. - /// - /// This mutable reference can be used to append values into the child array builder, - /// but you must call [`append`](#method.append) to delimit each distinct list value. - pub fn values(&mut self) -> &mut T { - &mut self.values_builder - } - - /// Returns the length of the list - pub fn value_length(&self) -> i32 { - self.list_len - } - - /// Finish the current fixed-length list array slot - #[inline] - pub fn append(&mut self, is_valid: bool) { - self.null_buffer_builder.append(is_valid); - } - - /// Builds the [`FixedSizeListBuilder`] and reset this builder. - pub fn finish(&mut self) -> FixedSizeListArray { - let len = self.len(); - let values = self.values_builder.finish(); - let nulls = self.null_buffer_builder.finish(); - - assert_eq!( - values.len(), len * self.list_len as usize, - "Length of the child array ({}) must be the multiple of the value length ({}) and the array length ({}).", - values.len(), - self.list_len, - len, - ); - - let field = self - .field - .clone() - .unwrap_or_else(|| Arc::new(Field::new_list_field(values.data_type().clone(), true))); - - FixedSizeListArray::new(field, self.list_len, values, nulls) - } - - /// Builds the [`FixedSizeListBuilder`] without resetting the builder. 
- pub fn finish_cloned(&self) -> FixedSizeListArray { - let len = self.len(); - let values = self.values_builder.finish_cloned(); - let nulls = self.null_buffer_builder.finish_cloned(); - - assert_eq!( - values.len(), len * self.list_len as usize, - "Length of the child array ({}) must be the multiple of the value length ({}) and the array length ({}).", - values.len(), - self.list_len, - len, - ); - - let field = self - .field - .clone() - .unwrap_or_else(|| Arc::new(Field::new_list_field(values.data_type().clone(), true))); - - FixedSizeListArray::new(field, self.list_len, values, nulls) - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.null_buffer_builder.as_slice() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_schema::DataType; - - use crate::builder2::{Int32Builder, PrimitiveBuilder}; - use crate::Array; - use crate::Int32Array; - - fn make_list_builder( - include_null_element: bool, - include_null_in_values: bool, - ) -> FixedSizeListBuilder> { - let values_builder = Int32Builder::new(); - let mut builder = FixedSizeListBuilder::new(values_builder, 3); - - builder.values().append_value(0); - builder.values().append_value(1); - builder.values().append_value(2); - builder.append(true); - - builder.values().append_value(2); - builder.values().append_value(3); - builder.values().append_value(4); - builder.append(true); - - if include_null_element { - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_null(); - builder.append(false); - } else { - builder.values().append_value(2); - builder.values().append_value(3); - builder.values().append_value(4); - builder.append(true); - } - - if include_null_in_values { - builder.values().append_value(3); - builder.values().append_null(); - builder.values().append_value(5); - builder.append(true); - } else { - builder.values().append_value(3); - builder.values().append_value(4); - 
builder.values().append_value(5); - builder.append(true); - } - - builder - } - - #[test] - fn test_fixed_size_list_array_builder() { - let mut builder = make_list_builder(true, true); - - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); - assert_eq!(3, list_array.value_length()); - } - - #[test] - fn test_fixed_size_list_array_builder_with_field() { - let builder = make_list_builder(false, false); - let mut builder = builder.with_field(Field::new("list_element", DataType::Int32, false)); - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); - assert_eq!(3, list_array.value_length()); - } - - #[test] - fn test_fixed_size_list_array_builder_with_field_and_null() { - let builder = make_list_builder(true, false); - let mut builder = builder.with_field(Field::new("list_element", DataType::Int32, false)); - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); - assert_eq!(3, list_array.value_length()); - } - - #[test] - #[should_panic(expected = "Found unmasked nulls for non-nullable FixedSizeListArray")] - fn test_fixed_size_list_array_builder_with_field_null_panic() { - let builder = make_list_builder(true, true); - let mut builder = builder.with_field(Field::new("list_item", DataType::Int32, false)); - - builder.finish(); - } - - #[test] - #[should_panic(expected = "FixedSizeListArray expected data type Int64 got Int32")] - fn test_fixed_size_list_array_builder_with_field_type_panic() { - let values_builder = Int32Builder::new(); - let builder = 
FixedSizeListBuilder::new(values_builder, 3); - let mut builder = builder.with_field(Field::new("list_item", DataType::Int64, true)); - - // [[0, 1, 2], null, [3, null, 5], [6, 7, null]] - builder.values().append_value(0); - builder.values().append_value(1); - builder.values().append_value(2); - builder.append(true); - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_null(); - builder.append(false); - builder.values().append_value(3); - builder.values().append_value(4); - builder.values().append_value(5); - builder.append(true); - - builder.finish(); - } - - #[test] - fn test_fixed_size_list_array_builder_cloned_with_field() { - let builder = make_list_builder(true, true); - let builder = builder.with_field(Field::new("list_element", DataType::Int32, true)); - - let list_array = builder.finish_cloned(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); - assert_eq!(3, list_array.value_length()); - } - - #[test] - #[should_panic(expected = "Found unmasked nulls for non-nullable FixedSizeListArray")] - fn test_fixed_size_list_array_builder_cloned_with_field_null_panic() { - let builder = make_list_builder(true, true); - let builder = builder.with_field(Field::new("list_item", DataType::Int32, false)); - - builder.finish_cloned(); - } - - #[test] - fn test_fixed_size_list_array_builder_cloned_with_field_and_null() { - let builder = make_list_builder(true, false); - let mut builder = builder.with_field(Field::new("list_element", DataType::Int32, false)); - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); - assert_eq!(3, list_array.value_length()); - } - - #[test] - #[should_panic(expected = "FixedSizeListArray expected data type Int64 
got Int32")] - fn test_fixed_size_list_array_builder_cloned_with_field_type_panic() { - let builder = make_list_builder(false, false); - let builder = builder.with_field(Field::new("list_item", DataType::Int64, true)); - - builder.finish_cloned(); - } - - #[test] - fn test_fixed_size_list_array_builder_finish_cloned() { - let mut builder = make_list_builder(true, true); - - let mut list_array = builder.finish_cloned(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(3, list_array.value_length()); - - builder.values().append_value(6); - builder.values().append_value(7); - builder.values().append_null(); - builder.append(true); - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_null(); - builder.append(false); - list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(6, list_array.len()); - assert_eq!(2, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); - assert_eq!(3, list_array.value_length()); - } - - #[test] - fn test_fixed_size_list_array_builder_with_field_empty() { - let values_builder = Int32Array::builder2(0); - let mut builder = FixedSizeListBuilder::new(values_builder, 3).with_field(Field::new( - "list_item", - DataType::Int32, - false, - )); - assert!(builder.is_empty()); - let arr = builder.finish(); - assert_eq!(0, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_fixed_size_list_array_builder_cloned_with_field_empty() { - let values_builder = Int32Array::builder2(0); - let builder = FixedSizeListBuilder::new(values_builder, 3).with_field(Field::new( - "list_item", - DataType::Int32, - false, - )); - assert!(builder.is_empty()); - let arr = builder.finish_cloned(); - assert_eq!(0, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_fixed_size_list_array_builder_empty() { - let values_builder = 
Int32Array::builder2(5); - let mut builder = FixedSizeListBuilder::new(values_builder, 3); - assert!(builder.is_empty()); - let arr = builder.finish(); - assert_eq!(0, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_fixed_size_list_array_builder_finish() { - let values_builder = Int32Array::builder2(5); - let mut builder = FixedSizeListBuilder::new(values_builder, 3); - - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5, 6]); - builder.append(true); - - let mut arr = builder.finish(); - assert_eq!(2, arr.len()); - assert_eq!(0, builder.len()); - - builder.values().append_slice(&[7, 8, 9]); - builder.append(true); - arr = builder.finish(); - assert_eq!(1, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - #[should_panic( - expected = "Length of the child array (10) must be the multiple of the value length (3) and the array length (3)." - )] - fn test_fixed_size_list_array_builder_fail() { - let values_builder = Int32Array::builder2(5); - let mut builder = FixedSizeListBuilder::new(values_builder, 3); - - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5, 6]); - builder.append(true); - builder.values().append_slice(&[7, 8, 9, 10]); - builder.append(true); - - builder.finish(); - } -} diff --git a/arrow-array/src/builder2/generic_bytes_builder.rs b/arrow-array/src/builder2/generic_bytes_builder.rs deleted file mode 100644 index 0b07c43ac7d7..000000000000 --- a/arrow-array/src/builder2/generic_bytes_builder.rs +++ /dev/null @@ -1,607 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder2::{SpecificArrayBuilder}; -use crate::builder::{BufferBuilder, UInt8BufferBuilder}; -use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; -use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait}; -use arrow_buffer::NullBufferBuilder; -use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; -use arrow_data::ArrayDataBuilder; -use std::any::Any; -use std::sync::Arc; - -/// Builder for [`GenericByteArray`] -/// -/// For building strings, see docs on [`GenericStringBuilder`]. -/// For building binary, see docs on [`GenericBinaryBuilder`]. -pub struct GenericByteBuilder { - value_builder: UInt8BufferBuilder, - offsets_builder: BufferBuilder, - null_buffer_builder: NullBufferBuilder, -} - -impl GenericByteBuilder { - /// Creates a new [`GenericByteBuilder`]. - pub fn new() -> Self { - Self::with_capacity(1024, 1024) - } - - /// Creates a new [`GenericByteBuilder`]. - /// - /// - `item_capacity` is the number of items to pre-allocate. - /// The size of the preallocated buffer of offsets is the number of items plus one. - /// - `data_capacity` is the total number of bytes of data to pre-allocate - /// (for all items, not per item). 
- pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { - let mut offsets_builder = BufferBuilder::::new(item_capacity + 1); - offsets_builder.append(T::Offset::from_usize(0).unwrap()); - Self { - value_builder: UInt8BufferBuilder::new(data_capacity), - offsets_builder, - null_buffer_builder: NullBufferBuilder::new(item_capacity), - } - } - - /// Creates a new [`GenericByteBuilder`] from buffers. - /// - /// # Safety - /// - /// This doesn't verify buffer contents as it assumes the buffers are from - /// existing and valid [`GenericByteArray`]. - pub unsafe fn new_from_buffer( - offsets_buffer: MutableBuffer, - value_buffer: MutableBuffer, - null_buffer: Option, - ) -> Self { - let offsets_builder = BufferBuilder::::new_from_buffer(offsets_buffer); - let value_builder = BufferBuilder::::new_from_buffer(value_buffer); - - let null_buffer_builder = null_buffer - .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1)) - .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1)); - - Self { - offsets_builder, - value_builder, - null_buffer_builder, - } - } - - #[inline] - fn next_offset(&self) -> T::Offset { - T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow") - } - - /// Appends a value into the builder. - /// - /// See the [GenericStringBuilder] documentation for examples of - /// incrementally building string values with multiple `write!` calls. - /// - /// # Panics - /// - /// Panics if the resulting length of [`Self::values_slice`] would exceed - /// `T::Offset::MAX` bytes. 
- /// - /// For example, this can happen with [`StringArray`] or [`BinaryArray`] - /// where the total length of all values exceeds 2GB - /// - /// [`StringArray`]: crate::StringArray - /// [`BinaryArray`]: crate::BinaryArray - #[inline] - pub fn append_value(&mut self, value: impl AsRef) { - self.value_builder.append_slice(value.as_ref().as_ref()); - self.null_buffer_builder.append(true); - self.offsets_builder.append(self.next_offset()); - } - - /// Append an `Option` value into the builder. - /// - /// - A `None` value will append a null value. - /// - A `Some` value will append the value. - /// - /// See [`Self::append_value`] for more panic information. - #[inline] - pub fn append_option(&mut self, value: Option>) { - match value { - None => self.append_null(), - Some(v) => self.append_value(v), - }; - } - - /// Append a null value into the builder. - #[inline] - pub fn append_null(&mut self) { - self.null_buffer_builder.append(false); - self.offsets_builder.append(self.next_offset()); - } - - /// Builds the [`GenericByteArray`] and reset this builder. - pub fn finish(&mut self) -> GenericByteArray { - let array_type = T::DATA_TYPE; - let array_builder = ArrayDataBuilder::new(array_type) - .len(self.len()) - .add_buffer(self.offsets_builder.finish()) - .add_buffer(self.value_builder.finish()) - .nulls(self.null_buffer_builder.finish()); - - self.offsets_builder.append(self.next_offset()); - let array_data = unsafe { array_builder.build_unchecked() }; - GenericByteArray::from(array_data) - } - - /// Builds the [`GenericByteArray`] without resetting the builder. 
- pub fn finish_cloned(&self) -> GenericByteArray { - let array_type = T::DATA_TYPE; - let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); - let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice()); - let array_builder = ArrayDataBuilder::new(array_type) - .len(self.len()) - .add_buffer(offset_buffer) - .add_buffer(value_buffer) - .nulls(self.null_buffer_builder.finish_cloned()); - - let array_data = unsafe { array_builder.build_unchecked() }; - GenericByteArray::from(array_data) - } - - /// Returns the current values buffer as a slice - pub fn values_slice(&self) -> &[u8] { - self.value_builder.as_slice() - } - - /// Returns the current offsets buffer as a slice - pub fn offsets_slice(&self) -> &[T::Offset] { - self.offsets_builder.as_slice() - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.null_buffer_builder.as_slice() - } - - /// Returns the current null buffer as a mutable slice - pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> { - self.null_buffer_builder.as_slice_mut() - } -} - -impl std::fmt::Debug for GenericByteBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?; - f.debug_struct("") - .field("value_builder", &self.value_builder) - .field("offsets_builder", &self.offsets_builder) - .field("null_buffer_builder", &self.null_buffer_builder) - .finish() - } -} - -impl Default for GenericByteBuilder { - fn default() -> Self { - Self::new() - } -} - -impl SpecificArrayBuilder for GenericByteBuilder { - type Output = GenericByteArray; - - /// Returns the number of binary slots in the builder - fn len(&self) -> usize { - self.null_buffer_builder.len() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> Arc { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. 
- fn finish_cloned(&self) -> Arc { - Arc::new(self.finish_cloned()) - } - - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { - self.append_value(value) - } - - fn append_null(&mut self) { - self.append_null() - } -} - -impl> Extend> for GenericByteBuilder { - #[inline] - fn extend>>(&mut self, iter: I) { - for v in iter { - self.append_option(v) - } - } -} - -/// Array builder for [`GenericStringArray`][crate::GenericStringArray] -/// -/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with -/// [`GenericByteBuilder::append_null`]. -/// -/// This builder also implements [`std::fmt::Write`] with any written data -/// included in the next appended value. This allows using [`std::fmt::Display`] -/// with standard Rust idioms like `write!` and `writeln!` to write data -/// directly to the builder without intermediate allocations. 
-/// -/// # Example writing strings with `append_value` -/// ``` -/// # use arrow_array::builder::GenericStringBuilder; -/// let mut builder = GenericStringBuilder::::new(); -/// -/// // Write one string value -/// builder.append_value("foobarbaz"); -/// -/// // Write a second string -/// builder.append_value("v2"); -/// -/// let array = builder.finish(); -/// assert_eq!(array.value(0), "foobarbaz"); -/// assert_eq!(array.value(1), "v2"); -/// ``` -/// -/// # Example incrementally writing strings with `std::fmt::Write` -/// -/// ``` -/// # use std::fmt::Write; -/// # use arrow_array::builder::GenericStringBuilder; -/// let mut builder = GenericStringBuilder::::new(); -/// -/// // Write data in multiple `write!` calls -/// write!(builder, "foo").unwrap(); -/// write!(builder, "bar").unwrap(); -/// // The next call to append_value finishes the current string -/// // including all previously written strings. -/// builder.append_value("baz"); -/// -/// // Write second value with a single write call -/// write!(builder, "v2").unwrap(); -/// // finish the value by calling append_value with an empty string -/// builder.append_value(""); -/// -/// let array = builder.finish(); -/// assert_eq!(array.value(0), "foobarbaz"); -/// assert_eq!(array.value(1), "v2"); -/// ``` -pub type GenericStringBuilder = GenericByteBuilder>; - -impl std::fmt::Write for GenericStringBuilder { - fn write_str(&mut self, s: &str) -> std::fmt::Result { - self.value_builder.append_slice(s.as_bytes()); - Ok(()) - } -} - -/// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray] -/// -/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with -/// [`GenericByteBuilder::append_null`]. 
-/// -/// # Example -/// ``` -/// # use arrow_array::builder::GenericBinaryBuilder; -/// let mut builder = GenericBinaryBuilder::::new(); -/// -/// // Write data -/// builder.append_value("foo"); -/// -/// // Write second value -/// builder.append_value(&[0,1,2]); -/// -/// let array = builder.finish(); -/// // binary values -/// assert_eq!(array.value(0), b"foo"); -/// assert_eq!(array.value(1), b"\x00\x01\x02"); -/// ``` -/// -/// # Example incrementally writing bytes with `write_bytes` -/// -/// ``` -/// # use std::io::Write; -/// # use arrow_array::builder::GenericBinaryBuilder; -/// let mut builder = GenericBinaryBuilder::::new(); -/// -/// // Write data in multiple `write_bytes` calls -/// write!(builder, "foo").unwrap(); -/// write!(builder, "bar").unwrap(); -/// // The next call to append_value finishes the current string -/// // including all previously written strings. -/// builder.append_value("baz"); -/// -/// // Write second value with a single write call -/// write!(builder, "v2").unwrap(); -/// // finish the value by calling append_value with an empty string -/// builder.append_value(""); -/// -/// let array = builder.finish(); -/// assert_eq!(array.value(0), "foobarbaz".as_bytes()); -/// assert_eq!(array.value(1), "v2".as_bytes()); -/// ``` -pub type GenericBinaryBuilder = GenericByteBuilder>; - -impl std::io::Write for GenericBinaryBuilder { - fn write(&mut self, bs: &[u8]) -> std::io::Result { - self.value_builder.append_slice(bs); - Ok(bs.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::array::Array; - use crate::GenericStringArray; - use std::fmt::Write as _; - use std::io::Write as _; - - fn _test_generic_binary_builder() { - let mut builder = GenericBinaryBuilder::::new(); - - builder.append_value(b"hello"); - builder.append_value(b""); - builder.append_null(); - builder.append_value(b"rust"); - - let array = builder.finish(); - - assert_eq!(4, array.len()); - 
assert_eq!(1, array.null_count()); - assert_eq!(b"hello", array.value(0)); - assert_eq!([] as [u8; 0], array.value(1)); - assert!(array.is_null(2)); - assert_eq!(b"rust", array.value(3)); - assert_eq!(O::from_usize(5).unwrap(), array.value_offsets()[2]); - assert_eq!(O::from_usize(4).unwrap(), array.value_length(3)); - } - - #[test] - fn test_binary_builder() { - _test_generic_binary_builder::() - } - - #[test] - fn test_large_binary_builder() { - _test_generic_binary_builder::() - } - - fn _test_generic_binary_builder_all_nulls() { - let mut builder = GenericBinaryBuilder::::new(); - builder.append_null(); - builder.append_null(); - builder.append_null(); - assert_eq!(3, builder.len()); - assert!(!builder.is_empty()); - - let array = builder.finish(); - assert_eq!(3, array.null_count()); - assert_eq!(3, array.len()); - assert!(array.is_null(0)); - assert!(array.is_null(1)); - assert!(array.is_null(2)); - } - - #[test] - fn test_binary_builder_all_nulls() { - _test_generic_binary_builder_all_nulls::() - } - - #[test] - fn test_large_binary_builder_all_nulls() { - _test_generic_binary_builder_all_nulls::() - } - - fn _test_generic_binary_builder_reset() { - let mut builder = GenericBinaryBuilder::::new(); - - builder.append_value(b"hello"); - builder.append_value(b""); - builder.append_null(); - builder.append_value(b"rust"); - builder.finish(); - - assert!(builder.is_empty()); - - builder.append_value(b"parquet"); - builder.append_null(); - builder.append_value(b"arrow"); - builder.append_value(b""); - let array = builder.finish(); - - assert_eq!(4, array.len()); - assert_eq!(1, array.null_count()); - assert_eq!(b"parquet", array.value(0)); - assert!(array.is_null(1)); - assert_eq!(b"arrow", array.value(2)); - assert_eq!(b"", array.value(1)); - assert_eq!(O::zero(), array.value_offsets()[0]); - assert_eq!(O::from_usize(7).unwrap(), array.value_offsets()[2]); - assert_eq!(O::from_usize(5).unwrap(), array.value_length(2)); - } - - #[test] - fn 
test_binary_builder_reset() { - _test_generic_binary_builder_reset::() - } - - #[test] - fn test_large_binary_builder_reset() { - _test_generic_binary_builder_reset::() - } - - fn _test_generic_string_array_builder() { - let mut builder = GenericStringBuilder::::new(); - let owned = "arrow".to_owned(); - - builder.append_value("hello"); - builder.append_value(""); - builder.append_value(&owned); - builder.append_null(); - builder.append_option(Some("rust")); - builder.append_option(None::<&str>); - builder.append_option(None::); - assert_eq!(7, builder.len()); - - assert_eq!( - GenericStringArray::::from(vec![ - Some("hello"), - Some(""), - Some("arrow"), - None, - Some("rust"), - None, - None - ]), - builder.finish() - ); - } - - #[test] - fn test_string_array_builder() { - _test_generic_string_array_builder::() - } - - #[test] - fn test_large_string_array_builder() { - _test_generic_string_array_builder::() - } - - fn _test_generic_string_array_builder_finish() { - let mut builder = GenericStringBuilder::::with_capacity(3, 11); - - builder.append_value("hello"); - builder.append_value("rust"); - builder.append_null(); - - builder.finish(); - assert!(builder.is_empty()); - assert_eq!(&[O::zero()], builder.offsets_slice()); - - builder.append_value("arrow"); - builder.append_value("parquet"); - let arr = builder.finish(); - // array should not have null buffer because there is not `null` value. 
- assert!(arr.nulls().is_none()); - assert_eq!(GenericStringArray::::from(vec!["arrow", "parquet"]), arr,) - } - - #[test] - fn test_string_array_builder_finish() { - _test_generic_string_array_builder_finish::() - } - - #[test] - fn test_large_string_array_builder_finish() { - _test_generic_string_array_builder_finish::() - } - - fn _test_generic_string_array_builder_finish_cloned() { - let mut builder = GenericStringBuilder::::with_capacity(3, 11); - - builder.append_value("hello"); - builder.append_value("rust"); - builder.append_null(); - - let mut arr = builder.finish_cloned(); - assert!(!builder.is_empty()); - assert_eq!(3, arr.len()); - - builder.append_value("arrow"); - builder.append_value("parquet"); - arr = builder.finish(); - - assert!(arr.nulls().is_some()); - assert_eq!(&[O::zero()], builder.offsets_slice()); - assert_eq!(5, arr.len()); - } - - #[test] - fn test_string_array_builder_finish_cloned() { - _test_generic_string_array_builder_finish_cloned::() - } - - #[test] - fn test_large_string_array_builder_finish_cloned() { - _test_generic_string_array_builder_finish_cloned::() - } - - #[test] - fn test_extend() { - let mut builder = GenericStringBuilder::::new(); - builder.extend(["a", "b", "c", "", "a", "b", "c"].into_iter().map(Some)); - builder.extend(["d", "cupcakes", "hello"].into_iter().map(Some)); - let array = builder.finish(); - assert_eq!(array.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]); - assert_eq!(array.value_data(), b"abcabcdcupcakeshello"); - } - - #[test] - fn test_write_str() { - let mut builder = GenericStringBuilder::::new(); - write!(builder, "foo").unwrap(); - builder.append_value(""); - writeln!(builder, "bar").unwrap(); - builder.append_value(""); - write!(builder, "fiz").unwrap(); - write!(builder, "buz").unwrap(); - builder.append_value(""); - let a = builder.finish(); - let r: Vec<_> = a.iter().flatten().collect(); - assert_eq!(r, &["foo", "bar\n", "fizbuz"]) - } - - #[test] - fn test_write_bytes() { - let mut 
builder = GenericBinaryBuilder::::new(); - write!(builder, "foo").unwrap(); - builder.append_value(""); - writeln!(builder, "bar").unwrap(); - builder.append_value(""); - write!(builder, "fiz").unwrap(); - write!(builder, "buz").unwrap(); - builder.append_value(""); - let a = builder.finish(); - let r: Vec<_> = a.iter().flatten().collect(); - assert_eq!( - r, - &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()] - ) - } -} diff --git a/arrow-array/src/builder2/generic_bytes_view_builder.rs b/arrow-array/src/builder2/generic_bytes_view_builder.rs deleted file mode 100644 index f23df2835de6..000000000000 --- a/arrow-array/src/builder2/generic_bytes_view_builder.rs +++ /dev/null @@ -1,743 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use std::any::Any; -use std::marker::PhantomData; -use std::sync::Arc; - -use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; -use arrow_data::ByteView; -use arrow_schema::ArrowError; -use hashbrown::hash_table::Entry; -use hashbrown::HashTable; - -use crate::builder2::SpecificArrayBuilder; -use crate::types::bytes::ByteArrayNativeType; -use crate::types::{BinaryViewType, ByteViewType, StringViewType}; -use crate::{ArrayAccessor, ArrayRef, GenericByteViewArray}; - -const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB -const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB - -enum BlockSizeGrowthStrategy { - Fixed { size: u32 }, - Exponential { current_size: u32 }, -} - -impl BlockSizeGrowthStrategy { - fn next_size(&mut self) -> u32 { - match self { - Self::Fixed { size } => *size, - Self::Exponential { current_size } => { - if *current_size < MAX_BLOCK_SIZE { - // we have fixed start/end block sizes, so we can't overflow - *current_size = current_size.saturating_mul(2); - *current_size - } else { - MAX_BLOCK_SIZE - } - } - } - } -} - -/// A builder for [`GenericByteViewArray`] -/// -/// A [`GenericByteViewArray`] consists of a list of data blocks containing string data, -/// and a list of views into those buffers. -/// -/// See examples on [`StringViewBuilder`] and [`BinaryViewBuilder`] -/// -/// This builder can be used in two ways -/// -/// # Append Values -/// -/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable -/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`] -/// writes values larger than 12 bytes to the current in-progress block, with values smaller -/// than 12 bytes inlined into the views. 
If a value is appended that will not fit in the -/// in-progress block, it will be closed, and a new block of sufficient size allocated -/// -/// # Append Views -/// -/// Some use-cases may wish to reuse an existing allocation containing string data, for example, -/// when parsing data from a parquet data page. In such a case entire blocks can be appended -/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended -/// using [`GenericByteViewBuilder::try_append_view`] -pub struct GenericByteViewBuilder { - views_builder: BufferBuilder, - null_buffer_builder: NullBufferBuilder, - completed: Vec, - in_progress: Vec, - block_size: BlockSizeGrowthStrategy, - /// Some if deduplicating strings - /// map ` -> ` - string_tracker: Option<(HashTable, ahash::RandomState)>, - phantom: PhantomData, -} - -impl GenericByteViewBuilder { - /// Creates a new [`GenericByteViewBuilder`]. - pub fn new() -> Self { - Self::with_capacity(1024) - } - - /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values. - pub fn with_capacity(capacity: usize) -> Self { - Self { - views_builder: BufferBuilder::new(capacity), - null_buffer_builder: NullBufferBuilder::new(capacity), - completed: vec![], - in_progress: vec![], - block_size: BlockSizeGrowthStrategy::Exponential { - current_size: STARTING_BLOCK_SIZE, - }, - string_tracker: None, - phantom: Default::default(), - } - } - - /// Set a fixed buffer size for variable length strings - /// - /// The block size is the size of the buffer used to store values greater - /// than 12 bytes. The builder allocates new buffers when the current - /// buffer is full. - /// - /// By default the builder balances buffer size and buffer count by - /// growing buffer size exponentially from 8KB up to 2MB. The - /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB. - /// - /// If this method is used, any new buffers allocated are - /// exactly this size. 
This can be useful for advanced users - /// that want to control the memory usage and buffer count. - /// - /// See for more details on the implications. - pub fn with_fixed_block_size(self, block_size: u32) -> Self { - debug_assert!(block_size > 0, "Block size must be greater than 0"); - Self { - block_size: BlockSizeGrowthStrategy::Fixed { size: block_size }, - ..self - } - } - - /// Override the size of buffers to allocate for holding string data - /// Use `with_fixed_block_size` instead. - #[deprecated(since = "53.0.0", note = "Use `with_fixed_block_size` instead")] - pub fn with_block_size(self, block_size: u32) -> Self { - self.with_fixed_block_size(block_size) - } - - /// Deduplicate strings while building the array - /// - /// This will potentially decrease the memory usage if the array have repeated strings - /// It will also increase the time to build the array as it needs to hash the strings - pub fn with_deduplicate_strings(self) -> Self { - Self { - string_tracker: Some(( - HashTable::with_capacity(self.views_builder.capacity()), - Default::default(), - )), - ..self - } - } - - /// Append a new data block returning the new block offset - /// - /// Note: this will first flush any in-progress block - /// - /// This allows appending views from blocks added using [`Self::append_block`]. 
See - /// [`Self::append_value`] for appending individual values - /// - /// ``` - /// # use arrow_array::builder::StringViewBuilder; - /// let mut builder = StringViewBuilder::new(); - /// - /// let block = builder.append_block(b"helloworldbingobongo".into()); - /// - /// builder.try_append_view(block, 0, 5).unwrap(); - /// builder.try_append_view(block, 5, 5).unwrap(); - /// builder.try_append_view(block, 10, 5).unwrap(); - /// builder.try_append_view(block, 15, 5).unwrap(); - /// builder.try_append_view(block, 0, 15).unwrap(); - /// let array = builder.finish(); - /// - /// let actual: Vec<_> = array.iter().flatten().collect(); - /// let expected = &["hello", "world", "bingo", "bongo", "helloworldbingo"]; - /// assert_eq!(actual, expected); - /// ``` - pub fn append_block(&mut self, buffer: Buffer) -> u32 { - assert!(buffer.len() < u32::MAX as usize); - - self.flush_in_progress(); - let offset = self.completed.len(); - self.push_completed(buffer); - offset as u32 - } - - /// Append a view of the given `block`, `offset` and `length` - /// - /// # Safety - /// (1) The block must have been added using [`Self::append_block`] - /// (2) The range `offset..offset+length` must be within the bounds of the block - /// (3) The data in the block must be valid of type `T` - pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) { - let b = self.completed.get_unchecked(block as usize); - let start = offset as usize; - let end = start.saturating_add(len as usize); - let b = b.get_unchecked(start..end); - - let view = make_view(b, block, offset); - self.views_builder.append(view); - self.null_buffer_builder.append_non_null(); - } - - /// Try to append a view of the given `block`, `offset` and `length` - /// - /// See [`Self::append_block`] - pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) -> Result<(), ArrowError> { - let b = self.completed.get(block as usize).ok_or_else(|| { - ArrowError::InvalidArgumentError(format!("No block 
found with index {block}")) - })?; - let start = offset as usize; - let end = start.saturating_add(len as usize); - - let b = b.get(start..end).ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Range {start}..{end} out of bounds for block of length {}", - b.len() - )) - })?; - - if T::Native::from_bytes_checked(b).is_none() { - return Err(ArrowError::InvalidArgumentError( - "Invalid view data".to_string(), - )); - } - - unsafe { - self.append_view_unchecked(block, offset, len); - } - Ok(()) - } - - /// Flushes the in progress block if any - #[inline] - fn flush_in_progress(&mut self) { - if !self.in_progress.is_empty() { - let f = Buffer::from_vec(std::mem::take(&mut self.in_progress)); - self.push_completed(f) - } - } - - /// Append a block to `self.completed`, checking for overflow - #[inline] - fn push_completed(&mut self, block: Buffer) { - assert!(block.len() < u32::MAX as usize, "Block too large"); - assert!(self.completed.len() < u32::MAX as usize, "Too many blocks"); - self.completed.push(block); - } - - /// Returns the value at the given index - /// Useful if we want to know what value has been inserted to the builder - /// The index has to be smaller than `self.len()`, otherwise it will panic - pub fn get_value(&self, index: usize) -> &[u8] { - let view = self.views_builder.as_slice().get(index).unwrap(); - let len = *view as u32; - if len <= 12 { - // # Safety - // The view is valid from the builder - unsafe { GenericByteViewArray::::inline_value(view, len as usize) } - } else { - let view = ByteView::from(*view); - if view.buffer_index < self.completed.len() as u32 { - let block = &self.completed[view.buffer_index as usize]; - &block[view.offset as usize..view.offset as usize + view.length as usize] - } else { - &self.in_progress[view.offset as usize..view.offset as usize + view.length as usize] - } - } - } - - /// Appends a value into the builder - /// - /// # Panics - /// - /// Panics if - /// - String buffer count exceeds `u32::MAX` - 
/// - String length exceeds `u32::MAX` - #[inline] - pub fn append_value(&mut self, value: impl AsRef) { - let v: &[u8] = value.as_ref().as_ref(); - let length: u32 = v.len().try_into().unwrap(); - if length <= 12 { - let mut view_buffer = [0; 16]; - view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); - view_buffer[4..4 + v.len()].copy_from_slice(v); - self.views_builder.append(u128::from_le_bytes(view_buffer)); - self.null_buffer_builder.append_non_null(); - return; - } - - // Deduplication if: - // (1) deduplication is enabled. - // (2) len > 12 - if let Some((mut ht, hasher)) = self.string_tracker.take() { - let hash_val = hasher.hash_one(v); - let hasher_fn = |v: &_| hasher.hash_one(v); - - let entry = ht.entry( - hash_val, - |idx| { - let stored_value = self.get_value(*idx); - v == stored_value - }, - hasher_fn, - ); - match entry { - Entry::Occupied(occupied) => { - // If the string already exists, we will directly use the view - let idx = occupied.get(); - self.views_builder - .append(self.views_builder.as_slice()[*idx]); - self.null_buffer_builder.append_non_null(); - self.string_tracker = Some((ht, hasher)); - return; - } - Entry::Vacant(vacant) => { - // o.w. 
we insert the (string hash -> view index) - // the idx is current length of views_builder, as we are inserting a new view - vacant.insert(self.views_builder.len()); - } - } - self.string_tracker = Some((ht, hasher)); - } - - let required_cap = self.in_progress.len() + v.len(); - if self.in_progress.capacity() < required_cap { - self.flush_in_progress(); - let to_reserve = v.len().max(self.block_size.next_size() as usize); - self.in_progress.reserve(to_reserve); - }; - let offset = self.in_progress.len() as u32; - self.in_progress.extend_from_slice(v); - - let view = ByteView { - length, - prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), - buffer_index: self.completed.len() as u32, - offset, - }; - self.views_builder.append(view.into()); - self.null_buffer_builder.append_non_null(); - } - - /// Append an `Option` value into the builder - #[inline] - pub fn append_option(&mut self, value: Option>) { - match value { - None => self.append_null(), - Some(v) => self.append_value(v), - }; - } - - /// Append a null value into the builder - #[inline] - pub fn append_null(&mut self) { - self.null_buffer_builder.append_null(); - self.views_builder.append(0); - } - - /// Builds the [`GenericByteViewArray`] and reset this builder - pub fn finish(&mut self) -> GenericByteViewArray { - self.flush_in_progress(); - let completed = std::mem::take(&mut self.completed); - let len = self.views_builder.len(); - let views = ScalarBuffer::new(self.views_builder.finish(), 0, len); - let nulls = self.null_buffer_builder.finish(); - if let Some((ref mut ht, _)) = self.string_tracker.as_mut() { - ht.clear(); - } - // SAFETY: valid by construction - unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } - } - - /// Builds the [`GenericByteViewArray`] without resetting the builder - pub fn finish_cloned(&self) -> GenericByteViewArray { - let mut completed = self.completed.clone(); - if !self.in_progress.is_empty() { - 
completed.push(Buffer::from_slice_ref(&self.in_progress)); - } - let len = self.views_builder.len(); - let views = Buffer::from_slice_ref(self.views_builder.as_slice()); - let views = ScalarBuffer::new(views, 0, len); - let nulls = self.null_buffer_builder.finish_cloned(); - // SAFETY: valid by construction - unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.null_buffer_builder.as_slice() - } - - /// Return the allocated size of this builder in bytes, useful for memory accounting. - pub fn allocated_size(&self) -> usize { - let views = self.views_builder.capacity() * std::mem::size_of::(); - let null = self.null_buffer_builder.allocated_size(); - let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::(); - let in_progress = self.in_progress.capacity(); - let tracker = match &self.string_tracker { - Some((ht, _)) => ht.capacity() * std::mem::size_of::(), - None => 0, - }; - buffer_size + in_progress + tracker + views + null - } -} - -impl Default for GenericByteViewBuilder { - fn default() -> Self { - Self::new() - } -} - -impl std::fmt::Debug for GenericByteViewBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}ViewBuilder", T::PREFIX)?; - f.debug_struct("") - .field("views_builder", &self.views_builder) - .field("in_progress", &self.in_progress) - .field("completed", &self.completed) - .field("null_buffer_builder", &self.null_buffer_builder) - .finish() - } -} - -impl SpecificArrayBuilder for GenericByteViewBuilder { - type Output = GenericByteViewArray; - - fn len(&self) -> usize { - self.null_buffer_builder.len() - } - - fn finish(&mut self) -> Arc { - Arc::new(self.finish()) - } - - fn finish_cloned(&self) -> Arc { - Arc::new(self.finish_cloned()) - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - fn 
into_box_any(self: Box) -> Box { - self - } - - fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { - self.append_value(value) - } - - fn append_null(&mut self) { - self.append_null() - } -} - -impl> Extend> - for GenericByteViewBuilder -{ - #[inline] - fn extend>>(&mut self, iter: I) { - for v in iter { - self.append_option(v) - } - } -} - -/// Array builder for [`StringViewArray`][crate::StringViewArray] -/// -/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with -/// [`GenericByteViewBuilder::append_null`] as normal. -/// -/// # Example -/// ``` -/// # use arrow_array::builder::StringViewBuilder; -/// # use arrow_array::StringViewArray; -/// let mut builder = StringViewBuilder::new(); -/// builder.append_value("hello"); -/// builder.append_null(); -/// builder.append_value("world"); -/// let array = builder.finish(); -/// -/// let expected = vec![Some("hello"), None, Some("world")]; -/// let actual: Vec<_> = array.iter().collect(); -/// assert_eq!(expected, actual); -/// ``` -pub type StringViewBuilder = GenericByteViewBuilder; - -/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] -/// -/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with -/// [`GenericByteViewBuilder::append_null`] as normal. 
-/// -/// # Example -/// ``` -/// # use arrow_array::builder::BinaryViewBuilder; -/// use arrow_array::BinaryViewArray; -/// let mut builder = BinaryViewBuilder::new(); -/// builder.append_value("hello"); -/// builder.append_null(); -/// builder.append_value("world"); -/// let array = builder.finish(); -/// -/// let expected: Vec> = vec![Some(b"hello"), None, Some(b"world")]; -/// let actual: Vec<_> = array.iter().collect(); -/// assert_eq!(expected, actual); -/// ``` -/// -pub type BinaryViewBuilder = GenericByteViewBuilder; - -/// Creates a view from a fixed length input (the compiler can generate -/// specialized code for this) -fn make_inlined_view(data: &[u8]) -> u128 { - let mut view_buffer = [0; 16]; - view_buffer[0..4].copy_from_slice(&(LEN as u32).to_le_bytes()); - view_buffer[4..4 + LEN].copy_from_slice(&data[..LEN]); - u128::from_le_bytes(view_buffer) -} - -/// Create a view based on the given data, block id and offset. -/// -/// Note that the code below is carefully examined with x86_64 assembly code: -/// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined), -/// which slows down things. -#[inline(never)] -pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 { - let len = data.len(); - - // Generate specialized code for each potential small string length - // to improve performance - match len { - 0 => make_inlined_view::<0>(data), - 1 => make_inlined_view::<1>(data), - 2 => make_inlined_view::<2>(data), - 3 => make_inlined_view::<3>(data), - 4 => make_inlined_view::<4>(data), - 5 => make_inlined_view::<5>(data), - 6 => make_inlined_view::<6>(data), - 7 => make_inlined_view::<7>(data), - 8 => make_inlined_view::<8>(data), - 9 => make_inlined_view::<9>(data), - 10 => make_inlined_view::<10>(data), - 11 => make_inlined_view::<11>(data), - 12 => make_inlined_view::<12>(data), - // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead. 
- _ => { - let view = ByteView { - length: len as u32, - prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()), - buffer_index: block_id, - offset, - }; - view.as_u128() - } - } -} - -#[cfg(test)] -mod tests { - use core::str; - - use super::*; - use crate::Array; - - #[test] - fn test_string_view_deduplicate() { - let value_1 = "long string to test string view"; - let value_2 = "not so similar string but long"; - - let mut builder = StringViewBuilder::new() - .with_deduplicate_strings() - .with_fixed_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers - - let values = vec![ - Some(value_1), - Some(value_2), - Some("short"), - Some(value_1), - None, - Some(value_2), - Some(value_1), - ]; - builder.extend(values.clone()); - - let array = builder.finish_cloned(); - array.to_data().validate_full().unwrap(); - assert_eq!(array.data_buffers().len(), 1); // without duplication we would need 3 buffers. - let actual: Vec<_> = array.iter().collect(); - assert_eq!(actual, values); - - let view0 = array.views().first().unwrap(); - let view3 = array.views().get(3).unwrap(); - let view6 = array.views().get(6).unwrap(); - - assert_eq!(view0, view3); - assert_eq!(view0, view6); - - assert_eq!(array.views().get(1), array.views().get(5)); - } - - #[test] - fn test_string_view_deduplicate_after_finish() { - let mut builder = StringViewBuilder::new().with_deduplicate_strings(); - - let value_1 = "long string to test string view"; - let value_2 = "not so similar string but long"; - builder.append_value(value_1); - let _array = builder.finish(); - builder.append_value(value_2); - let _array = builder.finish(); - builder.append_value(value_1); - let _array = builder.finish(); - } - - #[test] - fn test_string_view() { - let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81"); - let b2 = Buffer::from(b"cupcakes"); - let b3 = Buffer::from(b"Many strings are here contained of great length and verbosity"); - - let mut v = StringViewBuilder::new(); - 
assert_eq!(v.append_block(b1), 0); - - v.append_value("This is a very long string that exceeds the inline length"); - v.append_value("This is another very long string that exceeds the inline length"); - - assert_eq!(v.append_block(b2), 2); - assert_eq!(v.append_block(b3), 3); - - // Test short strings - v.try_append_view(0, 0, 5).unwrap(); // world - v.try_append_view(0, 6, 7).unwrap(); // bananas - v.try_append_view(2, 3, 5).unwrap(); // cake - v.try_append_view(2, 0, 3).unwrap(); // cup - v.try_append_view(2, 0, 8).unwrap(); // cupcakes - v.try_append_view(0, 13, 4).unwrap(); // 😁 - v.try_append_view(0, 13, 0).unwrap(); // - - // Test longer strings - v.try_append_view(3, 0, 16).unwrap(); // Many strings are - v.try_append_view(1, 0, 19).unwrap(); // This is a very long - v.try_append_view(3, 13, 27).unwrap(); // here contained of great length - - v.append_value("I do so like long strings"); - - let array = v.finish_cloned(); - array.to_data().validate_full().unwrap(); - assert_eq!(array.data_buffers().len(), 5); - let actual: Vec<_> = array.iter().flatten().collect(); - assert_eq!( - actual, - &[ - "This is a very long string that exceeds the inline length", - "This is another very long string that exceeds the inline length", - "world", - "bananas", - "cakes", - "cup", - "cupcakes", - "😁", - "", - "Many strings are", - "This is a very long", - "are here contained of great", - "I do so like long strings" - ] - ); - - let err = v.try_append_view(0, u32::MAX, 1).unwrap_err(); - assert_eq!(err.to_string(), "Invalid argument error: Range 4294967295..4294967296 out of bounds for block of length 17"); - - let err = v.try_append_view(0, 1, u32::MAX).unwrap_err(); - assert_eq!( - err.to_string(), - "Invalid argument error: Range 1..4294967296 out of bounds for block of length 17" - ); - - let err = v.try_append_view(0, 13, 2).unwrap_err(); - assert_eq!(err.to_string(), "Invalid argument error: Invalid view data"); - - let err = v.try_append_view(0, 40, 0).unwrap_err(); - 
assert_eq!( - err.to_string(), - "Invalid argument error: Range 40..40 out of bounds for block of length 17" - ); - - let err = v.try_append_view(5, 0, 0).unwrap_err(); - assert_eq!( - err.to_string(), - "Invalid argument error: No block found with index 5" - ); - } - - #[test] - fn test_string_view_with_block_size_growth() { - let mut exp_builder = StringViewBuilder::new(); - let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE); - - let long_string = str::from_utf8(&[b'a'; STARTING_BLOCK_SIZE as usize]).unwrap(); - - for i in 0..9 { - // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M - for _ in 0..(2_u32.pow(i)) { - exp_builder.append_value(long_string); - fixed_builder.append_value(long_string); - } - exp_builder.flush_in_progress(); - fixed_builder.flush_in_progress(); - - // Every step only add one buffer, but the buffer size is much larger - assert_eq!(exp_builder.completed.len(), i as usize + 1); - assert_eq!( - exp_builder.completed[i as usize].len(), - STARTING_BLOCK_SIZE as usize * 2_usize.pow(i) - ); - - // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1 - assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1); - - // Every buffer is fixed size - assert!(fixed_builder - .completed - .iter() - .all(|b| b.len() == STARTING_BLOCK_SIZE as usize)); - } - - // Add one more value, and the buffer stop growing. - exp_builder.append_value(long_string); - exp_builder.flush_in_progress(); - assert_eq!( - exp_builder.completed.last().unwrap().capacity(), - MAX_BLOCK_SIZE as usize - ); - } -} diff --git a/arrow-array/src/builder2/generic_list_builder.rs b/arrow-array/src/builder2/generic_list_builder.rs deleted file mode 100644 index 8d3a780a601e..000000000000 --- a/arrow-array/src/builder2/generic_list_builder.rs +++ /dev/null @@ -1,740 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder2::{SpecificArrayBuilder}; -use crate::builder::{BufferBuilder}; -use crate::{Array, ArrayAccessor, ArrayRef, GenericListArray, OffsetSizeTrait}; -use arrow_buffer::NullBufferBuilder; -use arrow_buffer::{Buffer, OffsetBuffer}; -use arrow_schema::{Field, FieldRef}; -use std::any::Any; -use std::sync::Arc; - -/// Builder for [`GenericListArray`] -/// -/// Use [`ListBuilder`] to build [`ListArray`]s and [`LargeListBuilder`] to build [`LargeListArray`]s. 
-/// -/// # Example -/// -/// Here is code that constructs a ListArray with the contents: -/// `[[A,B,C], [], NULL, [D], [NULL, F]]` -/// -/// ``` -/// # use std::sync::Arc; -/// # use arrow_array::{builder::ListBuilder, builder::StringBuilder, ArrayRef, StringArray, Array}; -/// # -/// let values_builder = StringBuilder::new(); -/// let mut builder = ListBuilder::new(values_builder); -/// -/// // [A, B, C] -/// builder.values().append_value("A"); -/// builder.values().append_value("B"); -/// builder.values().append_value("C"); -/// builder.append(true); -/// -/// // [ ] (empty list) -/// builder.append(true); -/// -/// // Null -/// builder.append(false); -/// -/// // [D] -/// builder.values().append_value("D"); -/// builder.append(true); -/// -/// // [NULL, F] -/// builder.values().append_null(); -/// builder.values().append_value("F"); -/// builder.append(true); -/// -/// // Build the array -/// let array = builder.finish(); -/// -/// // Values is a string array -/// // "A", "B" "C", "?", "D", NULL, "F" -/// assert_eq!( -/// array.values().as_ref(), -/// &StringArray::from(vec![ -/// Some("A"), Some("B"), Some("C"), -/// Some("D"), None, Some("F") -/// ]) -/// ); -/// -/// // Offsets are indexes into the values array -/// assert_eq!( -/// array.value_offsets(), -/// &[0, 3, 3, 3, 4, 6] -/// ); -/// ``` -/// -/// [`ListBuilder`]: crate::builder::ListBuilder -/// [`ListArray`]: crate::array::ListArray -/// [`LargeListBuilder`]: crate::builder::LargeListBuilder -/// [`LargeListArray`]: crate::array::LargeListArray -#[derive(Debug)] -pub struct GenericListBuilder where for<'a> &'a ::Output: ArrayAccessor { - offsets_builder: BufferBuilder, - null_buffer_builder: NullBufferBuilder, - values_builder: T, - field: Option, -} - -impl Default for GenericListBuilder where for<'a> &'a ::Output: ArrayAccessor { - fn default() -> Self { - Self::new(T::default()) - } -} - -impl GenericListBuilder where for<'a> &'a ::Output: ArrayAccessor { - /// Creates a new 
[`GenericListBuilder`] from a given values array builder - pub fn new(values_builder: T) -> Self { - let capacity = values_builder.len(); - Self::with_capacity(values_builder, capacity) - } - - /// Creates a new [`GenericListBuilder`] from a given values array builder - /// `capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(values_builder: T, capacity: usize) -> Self { - let mut offsets_builder = BufferBuilder::::new(capacity + 1); - offsets_builder.append(OffsetSize::zero()); - Self { - offsets_builder, - null_buffer_builder: NullBufferBuilder::new(capacity), - values_builder, - field: None, - } - } - - /// Override the field passed to [`GenericListArray::new`] - /// - /// By default a nullable field is created with the name `item` - /// - /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the - /// field's data type does not match that of `T` - pub fn with_field(self, field: impl Into) -> Self { - Self { - field: Some(field.into()), - ..self - } - } -} -impl SpecificArrayBuilder - for GenericListBuilder -where - OffsetSize: OffsetSizeTrait, - ValuesOutput: Array + ArrayAccessor, - T: 'static + SpecificArrayBuilder, for<'a> &'a ValuesOutput: ArrayAccessor -{ - type Output = GenericListArray; - - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.null_buffer_builder.len() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> Arc { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. 
- fn finish_cloned(&self) -> Arc { - Arc::new(self.finish_cloned()) - } - - fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { - // our item is their output - self.values_builder.append_output(value.as_any().downcast_ref::().unwrap()); - self.append(true); - } - - fn append_null(&mut self) { - self.append(false); - } -} - -impl GenericListBuilder -where - T: 'static, for<'a> &'a ::Output: ArrayAccessor -{ - /// Returns the child array builder as a mutable reference. - /// - /// This mutable reference can be used to append values into the child array builder, - /// but you must call [`append`](#method.append) to delimit each distinct list value. - pub fn values(&mut self) -> &mut T { - &mut self.values_builder - } - - /// Returns the child array builder as an immutable reference - pub fn values_ref(&self) -> &T { - &self.values_builder - } - - /// Finish the current variable-length list array slot - /// - /// # Panics - /// - /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` - #[inline] - pub fn append(&mut self, is_valid: bool) { - self.offsets_builder.append(self.next_offset()); - self.null_buffer_builder.append(is_valid); - } - - /// Returns the next offset - /// - /// # Panics - /// - /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` - #[inline] - fn next_offset(&self) -> OffsetSize { - OffsetSize::from_usize(self.values_builder.len()).unwrap() - } - - /// Append a value to this [`GenericListBuilder`] - /// - /// ``` - /// # use arrow_array::builder::{Int32Builder, ListBuilder}; - /// # use arrow_array::cast::AsArray; - /// # use arrow_array::{Array, Int32Array}; - /// # use arrow_array::types::Int32Type; - /// let mut builder = ListBuilder::new(Int32Builder::new()); - /// - /// builder.append_value([Some(1), Some(2), Some(3)]); - /// builder.append_value([]); - /// builder.append_value([None]); - /// - /// let array = builder.finish(); - /// assert_eq!(array.len(), 3); - /// - /// 
assert_eq!(array.value_offsets(), &[0, 3, 3, 4]); - /// let values = array.values().as_primitive::(); - /// assert_eq!(values, &Int32Array::from(vec![Some(1), Some(2), Some(3), None])); - /// ``` - /// - /// This is an alternative API to appending directly to [`Self::values`] and - /// delimiting the result with [`Self::append`] - /// - /// ``` - /// # use arrow_array::builder::{Int32Builder, ListBuilder}; - /// # use arrow_array::cast::AsArray; - /// # use arrow_array::{Array, Int32Array}; - /// # use arrow_array::types::Int32Type; - /// let mut builder = ListBuilder::new(Int32Builder::new()); - /// - /// builder.values().append_value(1); - /// builder.values().append_value(2); - /// builder.values().append_value(3); - /// builder.append(true); - /// builder.append(true); - /// builder.values().append_null(); - /// builder.append(true); - /// - /// let array = builder.finish(); - /// assert_eq!(array.len(), 3); - /// - /// assert_eq!(array.value_offsets(), &[0, 3, 3, 4]); - /// let values = array.values().as_primitive::(); - /// assert_eq!(values, &Int32Array::from(vec![Some(1), Some(2), Some(3), None])); - /// ``` - #[inline] - pub fn append_value(&mut self, i: I) - where - T: Extend>, - I: IntoIterator>, - { - self.extend(std::iter::once(Some(i))) - } - - /// Append a null to this [`GenericListBuilder`] - /// - /// See [`Self::append_value`] for an example use. - #[inline] - pub fn append_null(&mut self) { - self.offsets_builder.append(self.next_offset()); - self.null_buffer_builder.append_null(); - } - - /// Appends an optional value into this [`GenericListBuilder`] - /// - /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`] - #[inline] - pub fn append_option(&mut self, i: Option) - where - T: Extend>, - I: IntoIterator>, - { - match i { - Some(i) => self.append_value(i), - None => self.append_null(), - } - } - - /// Builds the [`GenericListArray`] and reset this builder. 
- pub fn finish(&mut self) -> GenericListArray { - let values = self.values_builder.finish(); - let nulls = self.null_buffer_builder.finish(); - - let offsets = self.offsets_builder.finish(); - // Safety: Safe by construction - let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; - self.offsets_builder.append(OffsetSize::zero()); - - let field = match &self.field { - Some(f) => f.clone(), - None => Arc::new(Field::new_list_field(values.data_type().clone(), true)), - }; - - GenericListArray::new(field, offsets, values, nulls) - } - - /// Builds the [`GenericListArray`] without resetting the builder. - pub fn finish_cloned(&self) -> GenericListArray { - let values = self.values_builder.finish_cloned(); - let nulls = self.null_buffer_builder.finish_cloned(); - - let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice()); - // Safety: safe by construction - let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; - - let field = match &self.field { - Some(f) => f.clone(), - None => Arc::new(Field::new_list_field(values.data_type().clone(), true)), - }; - - GenericListArray::new(field, offsets, values, nulls) - } - - /// Returns the current offsets buffer as a slice - pub fn offsets_slice(&self) -> &[OffsetSize] { - self.offsets_builder.as_slice() - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.null_buffer_builder.as_slice() - } -} - -impl Extend> for GenericListBuilder -where - O: OffsetSizeTrait, - B: SpecificArrayBuilder + Extend, - V: IntoIterator, for<'a> &'a ::Output: ArrayAccessor -{ - #[inline] - fn extend>>(&mut self, iter: T) { - for v in iter { - match v { - Some(elements) => { - self.values_builder.extend(elements); - self.append(true); - } - None => self.append(false), - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::builder2::{Int32Builder, Int8Builder, LargeListBuilder, ListBuilder, PrimitiveBuilder}; - use 
crate::builder2::SpecificArrayBuilder; - use crate::cast::AsArray; - use crate::types::Int32Type; - use crate::{Int32Array, ListArray}; - use arrow_schema::DataType; - - fn _test_generic_list_array_builder() { - let values_builder = Int32Builder::with_capacity(10); - let mut builder = GenericListBuilder::::new(values_builder); - - // [[0, 1, 2], [3, 4, 5], [6, 7]] - builder.values().append_value(0); - builder.values().append_value(1); - builder.values().append_value(2); - builder.append(true); - builder.values().append_value(3); - builder.values().append_value(4); - builder.values().append_value(5); - builder.append(true); - builder.values().append_value(6); - builder.values().append_value(7); - builder.append(true); - let list_array = builder.finish(); - - let list_values = list_array.values().as_primitive::(); - assert_eq!(list_values.values(), &[0, 1, 2, 3, 4, 5, 6, 7]); - assert_eq!(list_array.value_offsets(), [0, 3, 6, 8].map(O::usize_as)); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(O::from_usize(6).unwrap(), list_array.value_offsets()[2]); - assert_eq!(O::from_usize(2).unwrap(), list_array.value_length(2)); - for i in 0..3 { - assert!(list_array.is_valid(i)); - assert!(!list_array.is_null(i)); - } - } - - #[test] - fn test_list_array_builder() { - _test_generic_list_array_builder::() - } - - #[test] - fn test_large_list_array_builder() { - _test_generic_list_array_builder::() - } - - fn _test_generic_list_array_builder_nulls() { - let values_builder = Int32Builder::with_capacity(10); - let mut builder = GenericListBuilder::::new(values_builder); - - // [[0, 1, 2], null, [3, null, 5], [6, 7]] - builder.values().append_value(0); - builder.values().append_value(1); - builder.values().append_value(2); - builder.append(true); - builder.append(false); - builder.values().append_value(3); - builder.values().append_null(); - builder.values().append_value(5); - 
builder.append(true); - builder.values().append_value(6); - builder.values().append_value(7); - builder.append(true); - - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(O::from_usize(3).unwrap(), list_array.value_offsets()[2]); - assert_eq!(O::from_usize(3).unwrap(), list_array.value_length(2)); - } - - #[test] - fn test_list_array_builder_nulls() { - _test_generic_list_array_builder_nulls::() - } - - #[test] - fn test_large_list_array_builder_nulls() { - _test_generic_list_array_builder_nulls::() - } - - #[test] - fn test_list_array_builder_finish() { - let values_builder = Int32Array::builder2(5); - let mut builder = ListBuilder::new(values_builder); - - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5, 6]); - builder.append(true); - - let mut arr = builder.finish(); - assert_eq!(2, arr.len()); - assert!(builder.is_empty()); - - builder.values().append_slice(&[7, 8, 9]); - builder.append(true); - arr = builder.finish(); - assert_eq!(1, arr.len()); - assert!(builder.is_empty()); - } - - #[test] - fn test_list_array_builder_finish_cloned() { - let values_builder = Int32Array::builder(5); - let mut builder = ListBuilder::new(values_builder); - - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5, 6]); - builder.append(true); - - let mut arr = builder.finish_cloned(); - assert_eq!(2, arr.len()); - assert!(!builder.is_empty()); - - builder.values().append_slice(&[7, 8, 9]); - builder.append(true); - arr = builder.finish(); - assert_eq!(3, arr.len()); - assert!(builder.is_empty()); - } - - #[test] - fn test_list_list_array_builder() { - let primitive_builder = Int32Builder::with_capacity(10); - let values_builder = ListBuilder::new(primitive_builder); - let mut builder = ListBuilder::new(values_builder); - - // [[[1, 2], [3, 
4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] - builder.values().values().append_value(1); - builder.values().values().append_value(2); - builder.values().append(true); - builder.values().values().append_value(3); - builder.values().values().append_value(4); - builder.values().append(true); - builder.append(true); - - builder.values().values().append_value(5); - builder.values().values().append_value(6); - builder.values().values().append_value(7); - builder.values().append(true); - builder.values().append(false); - builder.values().values().append_value(8); - builder.values().append(true); - builder.append(true); - - builder.append(false); - - builder.values().values().append_value(9); - builder.values().values().append_value(10); - builder.values().append(true); - builder.append(true); - - let l1 = builder.finish(); - - assert_eq!(4, l1.len()); - assert_eq!(1, l1.null_count()); - - assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6]); - let l2 = l1.values().as_list::(); - - assert_eq!(6, l2.len()); - assert_eq!(1, l2.null_count()); - assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10]); - - let i1 = l2.values().as_primitive::(); - assert_eq!(10, i1.len()); - assert_eq!(0, i1.null_count()); - assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); - } - - #[test] - fn test_extend() { - let mut builder = ListBuilder::new(Int32Builder::new()); - builder.extend([ - Some(vec![Some(1), Some(2), Some(7), None]), - Some(vec![]), - Some(vec![Some(4), Some(5)]), - None, - ]); - - let array = builder.finish(); - assert_eq!(array.value_offsets(), [0, 4, 4, 6, 6]); - assert_eq!(array.null_count(), 1); - assert_eq!(array.logical_null_count(), 1); - assert!(array.is_null(3)); - let elements = array.values().as_primitive::(); - assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]); - assert_eq!(elements.null_count(), 1); - assert_eq!(elements.logical_null_count(), 1); - assert!(elements.is_null(3)); - } - - #[test] - fn test_boxed_primitive_array_builder() { - let values_builder 
= PrimitiveBuilder::::with_capacity(5); - let mut builder = ListBuilder::new(values_builder); - - builder - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_slice(&[1, 2, 3]); - builder.append(true); - - builder - .values() - .as_any_mut() - .downcast_mut::() - .expect("should be an Int32Builder") - .append_slice(&[4, 5, 6]); - builder.append(true); - - let arr = builder.finish(); - assert_eq!(2, arr.len()); - - let elements = arr.values().as_primitive::(); - assert_eq!(elements.values(), &[1, 2, 3, 4, 5, 6]); - } - - #[test] - fn test_boxed_list_list_array_builder() { - // This test is same as `test_list_list_array_builder` but uses boxed builders. - let values_builder = ListBuilder::with_capacity( Int32Builder::with_capacity(10), 10); - test_boxed_generic_list_generic_list_array_builder::(values_builder); - } - - #[test] - fn test_boxed_large_list_large_list_array_builder() { - // This test is same as `test_list_list_array_builder` but uses boxed builders. 
- test_boxed_generic_list_generic_list_array_builder( - LargeListBuilder::with_capacity(Int32Builder::with_capacity(10), 10), - ); - } - - fn test_boxed_generic_list_generic_list_array_builder( - values_builder: GenericListBuilder, - ) where for<'a> &'a ::Output: ArrayAccessor { - let mut builder: GenericListBuilder> = - GenericListBuilder::>::new(values_builder); - - // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] - builder.append_value(vec![vec![1, 2], vec![3, 4]]); - - let l1 = builder.finish(); - - assert_eq!(4, l1.len()); - assert_eq!(1, l1.null_count()); - - assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6].map(O::usize_as)); - let l2 = l1.values().as_list::(); - - assert_eq!(6, l2.len()); - assert_eq!(1, l2.null_count()); - assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10].map(O::usize_as)); - - let i1 = l2.values().as_primitive::(); - assert_eq!(10, i1.len()); - assert_eq!(0, i1.null_count()); - assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); - } - - #[test] - fn test_with_field() { - let field = Arc::new(Field::new("bar", DataType::Int32, false)); - let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); - builder.append_value([Some(1), Some(2), Some(3)]); - builder.append_null(); // This is fine as nullability refers to nullability of values - builder.append_value([Some(4)]); - let array = builder.finish(); - assert_eq!(array.len(), 3); - assert_eq!(array.data_type(), &DataType::List(field.clone())); - - builder.append_value([Some(4), Some(5)]); - let array = builder.finish(); - assert_eq!(array.data_type(), &DataType::List(field)); - assert_eq!(array.len(), 1); - } - - #[test] - #[should_panic(expected = "Non-nullable field of ListArray \\\"item\\\" cannot contain nulls")] - fn test_checks_nullability() { - let field = Arc::new(Field::new_list_field(DataType::Int32, false)); - let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); - builder.append_value([Some(1), None]); 
- builder.finish(); - } - - #[test] - #[should_panic(expected = "ListArray expected data type Int64 got Int32")] - fn test_checks_data_type() { - let field = Arc::new(Field::new_list_field(DataType::Int64, false)); - let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); - builder.append_value([Some(1)]); - builder.finish(); - } - - #[test] - fn should_run() { - let from: Arc = create_test_list(); - let mut to = ListBuilder::new( - ListBuilder::new( - Int32Builder::new() - ) - ); - let indices: &[usize] = &[0, 1, 2]; - let data_type = DataType::List( - Arc::new(Field::new( - "item", - DataType::List( - Arc::new(Field::new( - "item", - DataType::Int32, - false - )) - ), - false - )) - ); - - for &i in indices { - if from.is_valid(i) { - let inner_list = from.value(i).as_any().downcast_ref::>().unwrap(); - // to.append_value(inner_list); - } else { - to.append_null(); - } - } - } - - fn create_test_list() -> Arc { - let primitive_builder = Int32Builder::with_capacity(10); - let values_builder = ListBuilder::new(primitive_builder); - let mut builder = ListBuilder::new(values_builder); - - // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] - builder.values().values().append_value(1); - builder.values().values().append_value(2); - builder.values().append(true); - builder.values().values().append_value(3); - builder.values().values().append_value(4); - builder.values().append(true); - builder.append(true); - - builder.values().values().append_value(5); - builder.values().values().append_value(6); - builder.values().values().append_value(7); - builder.values().append(true); - builder.values().append(false); - builder.values().values().append_value(8); - builder.values().append(true); - builder.append(true); - - builder.append(false); - - builder.values().values().append_value(9); - builder.values().values().append_value(10); - builder.values().append(true); - builder.append(true); - - Arc::new(builder.finish()) - } -} diff --git 
a/arrow-array/src/builder2/mod.rs b/arrow-array/src/builder2/mod.rs deleted file mode 100644 index 723c0a999cf4..000000000000 --- a/arrow-array/src/builder2/mod.rs +++ /dev/null @@ -1,365 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines push-based APIs for constructing arrays -//! -//! # Basic Usage -//! -//! Builders can be used to build simple, non-nested arrays -//! -//! ``` -//! # use arrow_array::builder::Int32Builder; -//! # use arrow_array::PrimitiveArray; -//! let mut a = Int32Builder::new(); -//! a.append_value(1); -//! a.append_null(); -//! a.append_value(2); -//! let a = a.finish(); -//! -//! assert_eq!(a, PrimitiveArray::from(vec![Some(1), None, Some(2)])); -//! ``` -//! -//! ``` -//! # use arrow_array::builder::StringBuilder; -//! # use arrow_array::{Array, StringArray}; -//! let mut a = StringBuilder::new(); -//! a.append_value("foo"); -//! a.append_value("bar"); -//! a.append_null(); -//! let a = a.finish(); -//! -//! assert_eq!(a, StringArray::from_iter([Some("foo"), Some("bar"), None])); -//! ``` -//! -//! # Nested Usage -//! -//! Builders can also be used to build more complex nested arrays, such as lists -//! -//! ``` -//! 
# use arrow_array::builder::{Int32Builder, ListBuilder}; -//! # use arrow_array::ListArray; -//! # use arrow_array::types::Int32Type; -//! let mut a = ListBuilder::new(Int32Builder::new()); -//! // [1, 2] -//! a.values().append_value(1); -//! a.values().append_value(2); -//! a.append(true); -//! // null -//! a.append(false); -//! // [] -//! a.append(true); -//! // [3, null] -//! a.values().append_value(3); -//! a.values().append_null(); -//! a.append(true); -//! -//! // [[1, 2], null, [], [3, null]] -//! let a = a.finish(); -//! -//! assert_eq!(a, ListArray::from_iter_primitive::([ -//! Some(vec![Some(1), Some(2)]), -//! None, -//! Some(vec![]), -//! Some(vec![Some(3), None])] -//! )) -//! ``` -//! -//! # Custom Builders -//! -//! It is common to have a collection of statically defined Rust types that -//! you want to convert to Arrow arrays. -//! -//! An example of doing so is below -//! -//! ``` -//! # use std::any::Any; -//! # use arrow_array::builder::{ArrayBuilder, Int32Builder, ListBuilder, StringBuilder}; -//! # use arrow_array::{ArrayRef, RecordBatch, StructArray}; -//! # use arrow_schema::{DataType, Field}; -//! # use std::sync::Arc; -//! /// A custom row representation -//! struct MyRow { -//! i32: i32, -//! optional_i32: Option, -//! string: Option, -//! i32_list: Option>>, -//! } -//! -//! /// Converts `Vec` into `StructArray` -//! #[derive(Debug, Default)] -//! struct MyRowBuilder { -//! i32: Int32Builder, -//! string: StringBuilder, -//! i32_list: ListBuilder, -//! } -//! -//! impl MyRowBuilder { -//! fn append(&mut self, row: &MyRow) { -//! self.i32.append_value(row.i32); -//! self.string.append_option(row.string.as_ref()); -//! self.i32_list.append_option(row.i32_list.as_ref().map(|x| x.iter().copied())); -//! } -//! -//! /// Note: returns StructArray to allow nesting within another array if desired -//! fn finish(&mut self) -> StructArray { -//! let i32 = Arc::new(self.i32.finish()) as ArrayRef; -//! 
let i32_field = Arc::new(Field::new("i32", DataType::Int32, false)); -//! -//! let string = Arc::new(self.string.finish()) as ArrayRef; -//! let string_field = Arc::new(Field::new("i32", DataType::Utf8, false)); -//! -//! let i32_list = Arc::new(self.i32_list.finish()) as ArrayRef; -//! let value_field = Arc::new(Field::new_list_field(DataType::Int32, true)); -//! let i32_list_field = Arc::new(Field::new("i32_list", DataType::List(value_field), true)); -//! -//! StructArray::from(vec![ -//! (i32_field, i32), -//! (string_field, string), -//! (i32_list_field, i32_list), -//! ]) -//! } -//! } -//! -//! impl<'a> Extend<&'a MyRow> for MyRowBuilder { -//! fn extend>(&mut self, iter: T) { -//! iter.into_iter().for_each(|row| self.append(row)); -//! } -//! } -//! -//! /// Converts a slice of [`MyRow`] to a [`RecordBatch`] -//! fn rows_to_batch(rows: &[MyRow]) -> RecordBatch { -//! let mut builder = MyRowBuilder::default(); -//! builder.extend(rows); -//! RecordBatch::from(&builder.finish()) -//! } -//! 
``` - -pub use arrow_buffer::BooleanBufferBuilder; - -mod boolean_builder; -pub use boolean_builder::*; -mod fixed_size_binary_builder; -pub use fixed_size_binary_builder::*; -mod fixed_size_list_builder; -pub use fixed_size_list_builder::*; -mod generic_bytes_builder; -pub use generic_bytes_builder::*; -mod generic_list_builder; -pub use generic_list_builder::*; -mod primitive_builder; -pub use primitive_builder::*; -mod generic_bytes_view_builder; -pub use generic_bytes_view_builder::*; - -use crate::{Array, ArrayAccessor, ArrayRef}; -use std::any::Any; -use std::sync::Arc; - -/// Trait for dealing with different array builders at runtime -/// -/// # Example -/// -/// ``` -/// // Create -/// # use arrow_array::{ArrayRef, StringArray}; -/// # use arrow_array::builder::{ArrayBuilder, Float64Builder, Int64Builder, StringBuilder}; -/// -/// let mut data_builders: Vec> = vec![ -/// Box::new(Float64Builder::new()), -/// Box::new(Int64Builder::new()), -/// Box::new(StringBuilder::new()), -/// ]; -/// -/// // Fill -/// data_builders[0] -/// .as_any_mut() -/// .downcast_mut::() -/// .unwrap() -/// .append_value(3.14); -/// data_builders[1] -/// .as_any_mut() -/// .downcast_mut::() -/// .unwrap() -/// .append_value(-1); -/// data_builders[2] -/// .as_any_mut() -/// .downcast_mut::() -/// .unwrap() -/// .append_value("🍎"); -/// -/// // Finish -/// let array_refs: Vec = data_builders -/// .iter_mut() -/// .map(|builder| builder.finish()) -/// .collect(); -/// assert_eq!(array_refs[0].len(), 1); -/// assert_eq!(array_refs[1].is_null(0), false); -/// assert_eq!( -/// array_refs[2] -/// .as_any() -/// .downcast_ref::() -/// .unwrap() -/// .value(0), -/// "🍎" -/// ); -/// ``` -/// -// TODO - require extend or allow to append from iterator -trait SpecificArrayBuilder: Any + Send + Sync where for<'a> &'a ::Output: ArrayAccessor { - type Output: Array; - - - /// Returns the number of array slots in the builder - fn len(&self) -> usize; - - /// Returns whether number of array slots 
is zero - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Builds the array - fn finish(&mut self) -> Arc; - - /// Builds the array without resetting the underlying builder. - fn finish_cloned(&self) -> Arc; - - /// Returns the builder as a non-mutable `Any` reference. - /// - /// This is most useful when one wants to call non-mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_ref` to get a reference on the specific builder. - fn as_any(&self) -> &dyn Any; - - /// Returns the builder as a mutable `Any` reference. - /// - /// This is most useful when one wants to call mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_mut` to get a reference on the specific builder. - fn as_any_mut(&mut self) -> &mut dyn Any; - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box; - - // Append a value to the builder - fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item); - - /// Appends a null slot into the builder - fn append_null(&mut self) ; - - /// Appends `n` `null`s into the builder. - #[inline] - fn append_nulls(&mut self, n: usize) { - for _ in 0..n { - self.append_null(); - } - } - - /// Appends an `Option` into the builder - #[inline] - fn append_option(&mut self, v: Option<<&Self::Output as ArrayAccessor>::Item>) { - match v { - None => self.append_null(), - Some(v) => self.append_value(v), - }; - } - - #[inline] - fn append_output(&mut self, output: &Self::Output) { - // TODO - if iterator exists try it? 
- for i in 0..output.len() { - if output.is_null(i) { - self.append_null(); - } else { - self.append_value(output.value(i)); - } - } - } -} - -// impl SpecificArrayBuilder for Box> -// where -// T: Array + 'static, -// for<'a> &'a T: ArrayAccessor, -// { -// type Output = dyn Array; -// -// fn len(&self) -> usize { -// (**self).len() -// } -// -// fn is_empty(&self) -> bool { -// (**self).is_empty() -// } -// -// fn finish(&mut self) -> Arc { -// (**self).finish() -// } -// -// fn finish_cloned(&self) -> Arc { -// (**self).finish_cloned() -// } -// -// fn as_any(&self) -> &dyn Any { -// (**self).as_any() -// } -// -// fn as_any_mut(&mut self) -> &mut dyn Any { -// (**self).as_any_mut() -// } -// -// fn into_box_any(self: Box) -> Box { -// self -// } -// -// fn append_value(&mut self, value: ::Item) { -// (**self).append_value(value) -// } -// -// fn append_null(&mut self) { -// (**self).append_null() -// } -// -// fn append_option(&mut self, v: Option<::Item>) { -// (**self).append_option(v) -// } -// } - -/// Builder for [`ListArray`](crate::array::ListArray) -pub type ListBuilder = GenericListBuilder; - -/// Builder for [`LargeListArray`](crate::array::LargeListArray) -pub type LargeListBuilder = GenericListBuilder; - -/// Builder for [`BinaryArray`](crate::array::BinaryArray) -/// -/// See examples on [`GenericBinaryBuilder`] -pub type BinaryBuilder = GenericBinaryBuilder; - -/// Builder for [`LargeBinaryArray`](crate::array::LargeBinaryArray) -/// -/// See examples on [`GenericBinaryBuilder`] -pub type LargeBinaryBuilder = GenericBinaryBuilder; - -/// Builder for [`StringArray`](crate::array::StringArray) -/// -/// See examples on [`GenericStringBuilder`] -pub type StringBuilder = GenericStringBuilder; - -/// Builder for [`LargeStringArray`](crate::array::LargeStringArray) -/// -/// See examples on [`GenericStringBuilder`] -pub type LargeStringBuilder = GenericStringBuilder; diff --git a/arrow-array/src/builder2/primitive_builder.rs 
b/arrow-array/src/builder2/primitive_builder.rs deleted file mode 100644 index 830007d52c25..000000000000 --- a/arrow-array/src/builder2/primitive_builder.rs +++ /dev/null @@ -1,637 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder2::{SpecificArrayBuilder}; -use crate::builder::BufferBuilder; -use crate::types::*; -use crate::{ArrayAccessor, ArrayRef, PrimitiveArray}; -use arrow_buffer::NullBufferBuilder; -use arrow_buffer::{Buffer, MutableBuffer}; -use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType}; -use std::any::Any; -use std::sync::Arc; - -/// A signed 8-bit integer array builder. -pub type Int8Builder = PrimitiveBuilder; -/// A signed 16-bit integer array builder. -pub type Int16Builder = PrimitiveBuilder; -/// A signed 32-bit integer array builder. -pub type Int32Builder = PrimitiveBuilder; -/// A signed 64-bit integer array builder. -pub type Int64Builder = PrimitiveBuilder; -/// An usigned 8-bit integer array builder. -pub type UInt8Builder = PrimitiveBuilder; -/// An usigned 16-bit integer array builder. -pub type UInt16Builder = PrimitiveBuilder; -/// An usigned 32-bit integer array builder. 
-pub type UInt32Builder = PrimitiveBuilder; -/// An usigned 64-bit integer array builder. -pub type UInt64Builder = PrimitiveBuilder; -/// A 16-bit floating point array builder. -pub type Float16Builder = PrimitiveBuilder; -/// A 32-bit floating point array builder. -pub type Float32Builder = PrimitiveBuilder; -/// A 64-bit floating point array builder. -pub type Float64Builder = PrimitiveBuilder; - -/// A timestamp second array builder. -pub type TimestampSecondBuilder = PrimitiveBuilder; -/// A timestamp millisecond array builder. -pub type TimestampMillisecondBuilder = PrimitiveBuilder; -/// A timestamp microsecond array builder. -pub type TimestampMicrosecondBuilder = PrimitiveBuilder; -/// A timestamp nanosecond array builder. -pub type TimestampNanosecondBuilder = PrimitiveBuilder; - -/// A 32-bit date array builder. -pub type Date32Builder = PrimitiveBuilder; -/// A 64-bit date array builder. -pub type Date64Builder = PrimitiveBuilder; - -/// A 32-bit elaspsed time in seconds array builder. -pub type Time32SecondBuilder = PrimitiveBuilder; -/// A 32-bit elaspsed time in milliseconds array builder. -pub type Time32MillisecondBuilder = PrimitiveBuilder; -/// A 64-bit elaspsed time in microseconds array builder. -pub type Time64MicrosecondBuilder = PrimitiveBuilder; -/// A 64-bit elaspsed time in nanoseconds array builder. -pub type Time64NanosecondBuilder = PrimitiveBuilder; - -/// A “calendar” interval in months array builder. -pub type IntervalYearMonthBuilder = PrimitiveBuilder; -/// A “calendar” interval in days and milliseconds array builder. -pub type IntervalDayTimeBuilder = PrimitiveBuilder; -/// A “calendar” interval in months, days, and nanoseconds array builder. -pub type IntervalMonthDayNanoBuilder = PrimitiveBuilder; - -/// An elapsed time in seconds array builder. -pub type DurationSecondBuilder = PrimitiveBuilder; -/// An elapsed time in milliseconds array builder. 
-pub type DurationMillisecondBuilder = PrimitiveBuilder; -/// An elapsed time in microseconds array builder. -pub type DurationMicrosecondBuilder = PrimitiveBuilder; -/// An elapsed time in nanoseconds array builder. -pub type DurationNanosecondBuilder = PrimitiveBuilder; - -/// A decimal 128 array builder -pub type Decimal128Builder = PrimitiveBuilder; -/// A decimal 256 array builder -pub type Decimal256Builder = PrimitiveBuilder; - -/// Builder for [`PrimitiveArray`] -#[derive(Debug)] -pub struct PrimitiveBuilder { - values_builder: BufferBuilder, - null_buffer_builder: NullBufferBuilder, - data_type: DataType, -} - -impl SpecificArrayBuilder for PrimitiveBuilder { - type Output = PrimitiveArray; - - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.values_builder.len() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> Arc { - Arc::new(self.finish()) - } - - /// Builds the array without resetting the builder. 
- fn finish_cloned(&self) -> Arc { - Arc::new(self.finish_cloned()) - } - - fn append_value(&mut self, value: <&Self::Output as ArrayAccessor>::Item) { - self.append_value(value) - } - - fn append_null(&mut self) { - self.append_null() - } - - fn append_nulls(&mut self, n: usize) { - self.append_nulls(n) - } - - fn append_output(&mut self, output: &Self::Output) { - self.extend(output.iter()) - } -} - -impl Default for PrimitiveBuilder { - fn default() -> Self { - Self::new() - } -} - -impl PrimitiveBuilder { - /// Creates a new primitive array builder - pub fn new() -> Self { - Self::with_capacity(1024) - } - - /// Creates a new primitive array builder with capacity no of items - pub fn with_capacity(capacity: usize) -> Self { - Self { - values_builder: BufferBuilder::::new(capacity), - null_buffer_builder: NullBufferBuilder::new(capacity), - data_type: T::DATA_TYPE, - } - } - - /// Creates a new primitive array builder from buffers - pub fn new_from_buffer( - values_buffer: MutableBuffer, - null_buffer: Option, - ) -> Self { - let values_builder = BufferBuilder::::new_from_buffer(values_buffer); - - let null_buffer_builder = null_buffer - .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, values_builder.len())) - .unwrap_or_else(|| NullBufferBuilder::new_with_len(values_builder.len())); - - Self { - values_builder, - null_buffer_builder, - data_type: T::DATA_TYPE, - } - } - - /// By default [`PrimitiveBuilder`] uses [`ArrowPrimitiveType::DATA_TYPE`] as the - /// data type of the generated array. 
- /// - /// This method allows overriding the data type, to allow specifying timezones - /// for [`DataType::Timestamp`] or precision and scale for [`DataType::Decimal128`] and [`DataType::Decimal256`] - /// - /// # Panics - /// - /// This method panics if `data_type` is not [PrimitiveArray::is_compatible] - pub fn with_data_type(self, data_type: DataType) -> Self { - assert!( - PrimitiveArray::::is_compatible(&data_type), - "incompatible data type for builder, expected {} got {}", - T::DATA_TYPE, - data_type - ); - Self { data_type, ..self } - } - - /// Returns the capacity of this builder measured in slots of type `T` - pub fn capacity(&self) -> usize { - self.values_builder.capacity() - } - - /// Appends a value of type `T` into the builder - #[inline] - pub fn append_value(&mut self, v: T::Native) { - self.null_buffer_builder.append_non_null(); - self.values_builder.append(v); - } - - /// Appends a value of type `T` into the builder `n` times - #[inline] - pub fn append_value_n(&mut self, v: T::Native, n: usize) { - self.null_buffer_builder.append_n_non_nulls(n); - self.values_builder.append_n(n, v); - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) { - self.null_buffer_builder.append_null(); - self.values_builder.advance(1); - } - - /// Appends `n` no. 
of null's into the builder - #[inline] - pub fn append_nulls(&mut self, n: usize) { - self.null_buffer_builder.append_n_nulls(n); - self.values_builder.advance(n); - } - - /// Appends an `Option` into the builder - #[inline] - pub fn append_option(&mut self, v: Option) { - match v { - None => self.append_null(), - Some(v) => self.append_value(v), - }; - } - - /// Appends a slice of type `T` into the builder - #[inline] - pub fn append_slice(&mut self, v: &[T::Native]) { - self.null_buffer_builder.append_n_non_nulls(v.len()); - self.values_builder.append_slice(v); - } - - /// Appends values from a slice of type `T` and a validity boolean slice - /// - /// # Panics - /// - /// Panics if `values` and `is_valid` have different lengths - #[inline] - pub fn append_values(&mut self, values: &[T::Native], is_valid: &[bool]) { - assert_eq!( - values.len(), - is_valid.len(), - "Value and validity lengths must be equal" - ); - self.null_buffer_builder.append_slice(is_valid); - self.values_builder.append_slice(values); - } - - /// Appends values from a trusted length iterator. - /// - /// # Safety - /// This requires the iterator be a trusted length. This could instead require - /// the iterator implement `TrustedLen` once that is stabilized. - #[inline] - pub unsafe fn append_trusted_len_iter(&mut self, iter: impl IntoIterator) { - let iter = iter.into_iter(); - let len = iter - .size_hint() - .1 - .expect("append_trusted_len_iter requires an upper bound"); - - self.null_buffer_builder.append_n_non_nulls(len); - self.values_builder.append_trusted_len_iter(iter); - } - - /// Builds the [`PrimitiveArray`] and reset this builder. 
- pub fn finish(&mut self) -> PrimitiveArray { - let len = self.len(); - let nulls = self.null_buffer_builder.finish(); - let builder = ArrayData::builder(self.data_type.clone()) - .len(len) - .add_buffer(self.values_builder.finish()) - .nulls(nulls); - - let array_data = unsafe { builder.build_unchecked() }; - PrimitiveArray::::from(array_data) - } - - /// Builds the [`PrimitiveArray`] without resetting the builder. - pub fn finish_cloned(&self) -> PrimitiveArray { - let len = self.len(); - let nulls = self.null_buffer_builder.finish_cloned(); - let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); - let builder = ArrayData::builder(self.data_type.clone()) - .len(len) - .add_buffer(values_buffer) - .nulls(nulls); - - let array_data = unsafe { builder.build_unchecked() }; - PrimitiveArray::::from(array_data) - } - - /// Returns the current values buffer as a slice - pub fn values_slice(&self) -> &[T::Native] { - self.values_builder.as_slice() - } - - /// Returns the current values buffer as a mutable slice - pub fn values_slice_mut(&mut self) -> &mut [T::Native] { - self.values_builder.as_slice_mut() - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.null_buffer_builder.as_slice() - } - - /// Returns the current null buffer as a mutable slice - pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> { - self.null_buffer_builder.as_slice_mut() - } - - /// Returns the current values buffer and null buffer as a slice - pub fn slices_mut(&mut self) -> (&mut [T::Native], Option<&mut [u8]>) { - ( - self.values_builder.as_slice_mut(), - self.null_buffer_builder.as_slice_mut(), - ) - } -} - -impl PrimitiveBuilder

{ - /// Sets the precision and scale - pub fn with_precision_and_scale(self, precision: u8, scale: i8) -> Result { - validate_decimal_precision_and_scale::

(precision, scale)?; - Ok(Self { - data_type: P::TYPE_CONSTRUCTOR(precision, scale), - ..self - }) - } -} - -impl PrimitiveBuilder

{ - /// Sets the timezone - pub fn with_timezone(self, timezone: impl Into>) -> Self { - self.with_timezone_opt(Some(timezone.into())) - } - - /// Sets an optional timezone - pub fn with_timezone_opt>>(self, timezone: Option) -> Self { - Self { - data_type: DataType::Timestamp(P::UNIT, timezone.map(Into::into)), - ..self - } - } -} - -impl Extend> for PrimitiveBuilder

{ - #[inline] - fn extend>>(&mut self, iter: T) { - for v in iter { - self.append_option(v) - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_schema::TimeUnit; - - use crate::array::Array; - use crate::array::BooleanArray; - use crate::array::Date32Array; - use crate::array::Int32Array; - use crate::array::TimestampSecondArray; - - #[test] - fn test_primitive_array_builder_i32() { - let mut builder = Int32Array::builder(5); - for i in 0..5 { - builder.append_value(i); - } - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i32, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_i32_append_iter() { - let mut builder = Int32Array::builder(5); - unsafe { builder.append_trusted_len_iter(0..5) }; - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i32, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_i32_append_nulls() { - let mut builder = Int32Array::builder(5); - builder.append_nulls(5); - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(5, arr.null_count()); - for i in 0..5 { - assert!(arr.is_null(i)); - assert!(!arr.is_valid(i)); - } - } - - #[test] - fn test_primitive_array_builder_date32() { - let mut builder = Date32Array::builder(5); - for i in 0..5 { - builder.append_value(i); - } - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i32, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_timestamp_second() { - let mut builder = 
TimestampSecondArray::builder(5); - for i in 0..5 { - builder.append_value(i); - } - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i64, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_bool() { - // 00000010 01001000 - let buf = Buffer::from([72_u8, 2_u8]); - let mut builder = BooleanArray::builder(10); - for i in 0..10 { - if i == 3 || i == 6 || i == 9 { - builder.append_value(true); - } else { - builder.append_value(false); - } - } - - let arr = builder.finish(); - assert_eq!(&buf, arr.values().inner()); - assert_eq!(10, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..10 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {i}") - } - } - - #[test] - fn test_primitive_array_builder_append_option() { - let arr1 = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); - - let mut builder = Int32Array::builder(5); - builder.append_option(Some(0)); - builder.append_option(None); - builder.append_option(Some(2)); - builder.append_option(None); - builder.append_option(Some(4)); - let arr2 = builder.finish(); - - assert_eq!(arr1.len(), arr2.len()); - assert_eq!(arr1.offset(), arr2.offset()); - assert_eq!(arr1.null_count(), arr2.null_count()); - for i in 0..5 { - assert_eq!(arr1.is_null(i), arr2.is_null(i)); - assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); - if arr1.is_valid(i) { - assert_eq!(arr1.value(i), arr2.value(i)); - } - } - } - - #[test] - fn test_primitive_array_builder_append_null() { - let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); - - let mut builder = Int32Array::builder(5); - builder.append_value(0); - builder.append_value(2); - builder.append_null(); - builder.append_null(); - builder.append_value(4); - let arr2 = 
builder.finish(); - - assert_eq!(arr1.len(), arr2.len()); - assert_eq!(arr1.offset(), arr2.offset()); - assert_eq!(arr1.null_count(), arr2.null_count()); - for i in 0..5 { - assert_eq!(arr1.is_null(i), arr2.is_null(i)); - assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); - if arr1.is_valid(i) { - assert_eq!(arr1.value(i), arr2.value(i)); - } - } - } - - #[test] - fn test_primitive_array_builder_append_slice() { - let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); - - let mut builder = Int32Array::builder(5); - builder.append_slice(&[0, 2]); - builder.append_null(); - builder.append_null(); - builder.append_value(4); - let arr2 = builder.finish(); - - assert_eq!(arr1.len(), arr2.len()); - assert_eq!(arr1.offset(), arr2.offset()); - assert_eq!(arr1.null_count(), arr2.null_count()); - for i in 0..5 { - assert_eq!(arr1.is_null(i), arr2.is_null(i)); - assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); - if arr1.is_valid(i) { - assert_eq!(arr1.value(i), arr2.value(i)); - } - } - } - - #[test] - fn test_primitive_array_builder_finish() { - let mut builder = Int32Builder::new(); - builder.append_slice(&[2, 4, 6, 8]); - let mut arr = builder.finish(); - assert_eq!(4, arr.len()); - assert_eq!(0, builder.len()); - - builder.append_slice(&[1, 3, 5, 7, 9]); - arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_primitive_array_builder_finish_cloned() { - let mut builder = Int32Builder::new(); - builder.append_value(23); - builder.append_value(45); - let result = builder.finish_cloned(); - assert_eq!(result, Int32Array::from(vec![23, 45])); - builder.append_value(56); - assert_eq!(builder.finish_cloned(), Int32Array::from(vec![23, 45, 56])); - - builder.append_slice(&[2, 4, 6, 8]); - let mut arr = builder.finish(); - assert_eq!(7, arr.len()); - assert_eq!(arr, Int32Array::from(vec![23, 45, 56, 2, 4, 6, 8])); - assert_eq!(0, builder.len()); - - builder.append_slice(&[1, 3, 5, 7, 9]); - arr = 
builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_primitive_array_builder_with_data_type() { - let mut builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); - builder.append_value(1); - let array = builder.finish(); - assert_eq!(array.precision(), 1); - assert_eq!(array.scale(), 2); - - let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())); - let mut builder = TimestampNanosecondBuilder::new().with_data_type(data_type.clone()); - builder.append_value(1); - let array = builder.finish(); - assert_eq!(array.data_type(), &data_type); - } - - #[test] - #[should_panic(expected = "incompatible data type for builder, expected Int32 got Int64")] - fn test_invalid_with_data_type() { - Int32Builder::new().with_data_type(DataType::Int64); - } - - #[test] - fn test_extend() { - let mut builder = PrimitiveBuilder::::new(); - builder.extend([1, 2, 3, 5, 2, 4, 4].into_iter().map(Some)); - builder.extend([2, 4, 6, 2].into_iter().map(Some)); - let array = builder.finish(); - assert_eq!(array.values(), &[1, 2, 3, 5, 2, 4, 4, 2, 4, 6, 2]); - } -} diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 3e8ac2302c48..0fc9d30ab6e3 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -254,7 +254,6 @@ pub mod temporal_conversions; pub mod timezone; mod trusted_len; pub mod types; -pub mod builder2; #[cfg(test)] mod tests { From 878ec3eca54120fd370a40cbf6603967e2fa1104 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 10 Dec 2024 09:41:50 +0000 Subject: [PATCH 5/5] implement specific array builder for NullBuilder --- arrow-array/src/builder/null_builder.rs | 39 ++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/builder/null_builder.rs b/arrow-array/src/builder/null_builder.rs index 59086dffa907..1da3fcd64afa 100644 --- a/arrow-array/src/builder/null_builder.rs 
+++ b/arrow-array/src/builder/null_builder.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::builder::ArrayBuilder;
+use crate::builder::{ArrayBuilder, SpecificArrayBuilder};
 use crate::{ArrayRef, NullArray};
 use arrow_data::ArrayData;
 use arrow_schema::DataType;
@@ -146,6 +146,51 @@ impl ArrayBuilder for NullBuilder {
     }
 }
 
+/// Typed builder interface for [`NullBuilder`].
+///
+/// A [`NullArray`] carries no values, so the value passed to any `append_*`
+/// method is ignored and each call only adds null slots.
+impl SpecificArrayBuilder for NullBuilder {
+    type Output = NullArray;
+    type Item<'a> = ();
+
+    // NOTE(review): the same-named calls below are meant to reach the inherent
+    // `NullBuilder` methods rather than recurse into this impl; confirm each
+    // inherent method exists.
+    fn finish(&mut self) -> Arc<Self::Output> {
+        Arc::new(self.finish())
+    }
+
+    fn finish_cloned(&self) -> Arc<Self::Output> {
+        Arc::new(self.finish_cloned())
+    }
+
+    fn append_value<'a>(&'a mut self, _value: Self::Item<'a>) {
+        self.append_null();
+    }
+
+    fn append_value_ref<'a>(&'a mut self, _value: &'a Self::Item<'a>) {
+        self.append_null();
+    }
+
+    fn append_null(&mut self) {
+        self.append_null();
+    }
+
+    fn append_output<'a>(&'a mut self, output: &'a Self::Output) {
+        // Only the slot count is carried over from `output`.
+        self.len += output.len();
+    }
+
+    fn append_nulls(&mut self, n: usize) {
+        self.append_nulls(n)
+    }
+
+    fn append_option<'a>(&'a mut self, _v: Option<Self::Item<'a>>) {
+        self.append_null()
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;