Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add extend_dictionary in dictionary builder for improved performance #6875

Merged
merged 7 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 103 additions & 1 deletion arrow-array/src/builder/generic_bytes_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray};
use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray, TypedDictionaryArray};
use arrow_buffer::ArrowNativeType;
use arrow_schema::{ArrowError, DataType};
use hashbrown::HashTable;
Expand Down Expand Up @@ -305,6 +305,52 @@ where
};
}

/// Extends builder with dictionary
rluvaton marked this conversation as resolved.
Show resolved Hide resolved
///
/// This is the same as `extends` but avoid lookup for each item in the iterator
///
pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray<K, GenericByteArray<T>>) -> Result<(), ArrowError> {
let values = dictionary.values();

let v_len = values.len();
let k_len = dictionary.keys().len();
if v_len == 0 && k_len == 0 {
return Ok(());
}

// All nulls
if v_len == 0 {
self.append_nulls(k_len);
return Ok(());
}

if k_len == 0 {
return Err(ArrowError::InvalidArgumentError("Dictionary keys should not be empty when values are not empty".to_string()));
}

// Orphan values will be carried over to the new dictionary
let mapped_values = values.iter()
// Dictionary values should not be null as the keys are null if the value is null
rluvaton marked this conversation as resolved.
Show resolved Hide resolved
.map(|dict_value| self.get_or_insert_key(dict_value.expect("Dictionary value should not be null")))
.collect::<Result<Vec<_>, _>>()?;

// Just insert the keys without additional lookups
dictionary
.keys()
.iter()
.for_each(|key| {
match key {
None => self.append_null(),
Some(original_dict_index) => {
let index = original_dict_index.as_usize().min(v_len - 1);
self.keys_builder.append_value(mapped_values[index]);
}
}
});

Ok(())
}

/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.dedup.clear();
Expand Down Expand Up @@ -664,4 +710,60 @@ mod tests {
assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]);
assert_eq!(dict.values().len(), 4);
}

#[test]
fn test_extend_dictionary() {
let some_dict = {
let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
builder.extend([None::<&str>].into_iter());
builder.extend(["c", "d", "a"].into_iter().map(Some));
builder.append_null();
builder.finish()
};

let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some));
builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap();
let dict = builder.finish();

assert_eq!(dict.values().len(), 6);

let values = dict
.downcast_dict::<GenericByteArray<Utf8Type>>()
.unwrap()
.into_iter()
.collect::<Vec<_>>();

assert_eq!(
values,
[Some("e"), Some("e"), Some("f"), Some("e"), Some("d"), Some("a"), Some("b"), Some("c"), Some("a"), Some("b"), Some("c"), None, Some("c"), Some("d"), Some("a"), None]
);
}

#[test]
fn test_extend_all_null_dictionary() {
let some_dict = {
let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder.append_nulls(2);
builder.finish()
};

let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap();
let dict = builder.finish();

assert_eq!(dict.values().len(), 0);

let values = dict
.downcast_dict::<GenericByteArray<Utf8Type>>()
.unwrap()
.into_iter()
.collect::<Vec<_>>();

assert_eq!(
values,
[None, None]
);
}
}
108 changes: 105 additions & 3 deletions arrow-array/src/builder/primitive_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

use crate::builder::{ArrayBuilder, PrimitiveBuilder};
use crate::types::ArrowDictionaryKeyType;
use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray};
use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray};
use arrow_buffer::{ArrowNativeType, ToByteSlice};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
Expand Down Expand Up @@ -303,6 +303,52 @@ where
};
}

/// Extends builder with dictionary
///
/// This is the same as `extends` but avoid lookup for each item in the iterator
///
pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray<K, PrimitiveArray<V>>) -> Result<(), ArrowError> {
let values = dictionary.values();

let v_len = values.len();
let k_len = dictionary.keys().len();
if v_len == 0 && k_len == 0 {
return Ok(());
}

// All nulls
if v_len == 0 {
self.append_nulls(k_len);
return Ok(());
}

if k_len == 0 {
return Err(ArrowError::InvalidArgumentError("Dictionary keys should not be empty when values are not empty".to_string()));
}

// Orphan values will be carried over to the new dictionary
let mapped_values = values.iter()
// Dictionary values should not be null as the keys are null if the value is null
.map(|dict_value| self.get_or_insert_key(dict_value.expect("Dictionary value should not be null")))
rluvaton marked this conversation as resolved.
Show resolved Hide resolved
.collect::<Result<Vec<_>, _>>()?;

// Just insert the keys without additional lookups
dictionary
.keys()
.iter()
.for_each(|key| {
match key {
None => self.append_null(),
Some(original_dict_index) => {
let index = original_dict_index.as_usize().min(v_len - 1);
self.keys_builder.append_value(mapped_values[index]);
}
}
});

Ok(())
}

/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.map.clear();
Expand Down Expand Up @@ -368,8 +414,7 @@ impl<K: ArrowDictionaryKeyType, P: ArrowPrimitiveType> Extend<Option<P::Native>>
mod tests {
use super::*;

use crate::array::UInt32Array;
use crate::array::UInt8Array;
use crate::array::{UInt32Array, UInt8Array, Int32Array};
use crate::builder::Decimal128Builder;
use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type};

Expand Down Expand Up @@ -443,4 +488,61 @@ mod tests {
)
);
}


#[test]
fn test_extend_dictionary() {
let some_dict = {
let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some));
builder.extend([None::<i32>].into_iter());
builder.extend([4, 5, 1, 3, 1].into_iter().map(Some));
builder.append_null();
builder.finish()
};

let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
builder.extend([6, 6, 7, 6, 5].into_iter().map(Some));
builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap();
let dict = builder.finish();

assert_eq!(dict.values().len(), 7);

let values = dict
.downcast_dict::<Int32Array>()
.unwrap()
.into_iter()
.collect::<Vec<_>>();

assert_eq!(
values,
[Some(6), Some(6), Some(7), Some(6), Some(5), Some(1), Some(2), Some(3), Some(1), Some(2), Some(3), Some(1), Some(2), Some(3), None, Some(4), Some(5), Some(1), Some(3), Some(1), None]
);
}

#[test]
fn test_extend_all_null_dictionary() {
let some_dict = {
let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
builder.append_nulls(2);
builder.finish()
};

let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap();
let dict = builder.finish();

assert_eq!(dict.values().len(), 0);

let values = dict
.downcast_dict::<Int32Array>()
.unwrap()
.into_iter()
.collect::<Vec<_>>();

assert_eq!(
values,
[None, None]
);
}
}
Loading