Skip to content

Commit

Permalink
Merge branch 'master' into interleave-bloom
Browse files Browse the repository at this point in the history
  • Loading branch information
progval authored Jun 13, 2024
2 parents d2a7ab8 + d32f68d commit b434eea
Show file tree
Hide file tree
Showing 24 changed files with 870 additions and 208 deletions.
15 changes: 9 additions & 6 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,17 +119,20 @@ jobs:
run: cargo update -p ahash --precise 0.8.7
- name: Check arrow
working-directory: arrow
run: cargo msrv verify
run: cargo msrv --log-target stdout verify
- name: Check parquet
working-directory: parquet
run: cargo msrv verify
run: cargo msrv --log-target stdout verify
- name: Check arrow-flight
working-directory: arrow-flight
run: cargo msrv verify
run: cargo msrv --log-target stdout verify
- name: Downgrade object_store dependencies
working-directory: object_store
# Necessary because 1.30.0 updates MSRV to 1.63
run: cargo update -p tokio --precise 1.29.1
# Necessary because tokio 1.30.0 updates MSRV to 1.63
# and url 2.5.1, updates to 1.67
run: |
cargo update -p tokio --precise 1.29.1
cargo update -p url --precise 2.5.0
- name: Check object_store
working-directory: object_store
run: cargo msrv verify
run: cargo msrv --log-target stdout verify
47 changes: 32 additions & 15 deletions arrow-array/src/builder/generic_bytes_view_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,36 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
offset as u32
}

/// Append a view of the given `block`, `offset` and `length`
///
/// # Safety
/// (1) The block must have been added using [`Self::append_block`]
/// (2) The range `offset..offset+length` must be within the bounds of the block
/// (3) The data in the block must be valid of type `T`
pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
let b = self.completed.get_unchecked(block as usize);
let start = offset as usize;
let end = start.saturating_add(len as usize);
let b = b.get_unchecked(start..end);

if len <= 12 {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
view_buffer[4..4 + b.len()].copy_from_slice(b);
self.views_builder.append(u128::from_le_bytes(view_buffer));
} else {
let view = ByteView {
length: len,
prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
buffer_index: block,
offset,
};
self.views_builder.append(view.into());
}

self.null_buffer_builder.append_non_null();
}

/// Try to append a view of the given `block`, `offset` and `length`
///
/// See [`Self::append_block`]
Expand All @@ -139,22 +169,9 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
));
}

if len <= 12 {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
view_buffer[4..4 + b.len()].copy_from_slice(b);
self.views_builder.append(u128::from_le_bytes(view_buffer));
} else {
let view = ByteView {
length: len,
prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
buffer_index: block,
offset,
};
self.views_builder.append(view.into());
unsafe {
self.append_view_unchecked(block, offset, len);
}

self.null_buffer_builder.append_non_null();
Ok(())
}

Expand Down
143 changes: 139 additions & 4 deletions arrow-cast/src/cast/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,69 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(

Ok(new_array)
}
Utf8View => {
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
// we handle it here to avoid the copy.
let dict_array = array
.as_dictionary::<K>()
.downcast_dict::<StringArray>()
.unwrap();

let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
dict_array.values(),
dict_array.keys(),
);
Ok(Arc::new(string_view))
}
BinaryView => {
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
// we handle it here to avoid the copy.
let dict_array = array
.as_dictionary::<K>()
.downcast_dict::<BinaryArray>()
.unwrap();

let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
dict_array.values(),
dict_array.keys(),
);
Ok(Arc::new(binary_view))
}
_ => unpack_dictionary::<K>(array, to_type, cast_options),
}
}

fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
array: &GenericByteArray<V>,
keys: &PrimitiveArray<K>,
) -> GenericByteViewArray<T> {
let value_buffer = array.values();
let value_offsets = array.value_offsets();
let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
builder.append_block(value_buffer.clone());
for i in keys.iter() {
match i {
Some(v) => {
let idx = v.to_usize().unwrap();

// Safety
// (1) The index is within bounds as they are offsets
// (2) The append_view is safe
unsafe {
let offset = value_offsets.get_unchecked(idx).as_usize();
let end = value_offsets.get_unchecked(idx + 1).as_usize();
let length = end - offset;
builder.append_view_unchecked(0, offset as u32, length as u32)
}
}
None => {
builder.append_null();
}
}
}
builder.finish()
}

// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
pub(crate) fn unpack_dictionary<K>(
array: &dyn Array,
Expand Down Expand Up @@ -129,10 +188,34 @@ pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
Decimal256(_, _) => {
pack_numeric_to_dictionary::<K, Decimal256Type>(array, dict_value_type, cast_options)
}
Utf8 => pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options),
LargeUtf8 => pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options),
Binary => pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options),
LargeBinary => pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options),
Utf8 => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::Utf8View {
return string_view_to_dictionary::<K, i32>(array);
}
pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
}
LargeUtf8 => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::Utf8View {
return string_view_to_dictionary::<K, i64>(array);
}
pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
}
Binary => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::BinaryView {
return binary_view_to_dictionary::<K, i32>(array);
}
pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
}
LargeBinary => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::BinaryView {
return binary_view_to_dictionary::<K, i64>(array);
}
pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
"Unsupported output type for dictionary packing: {dict_value_type:?}"
))),
Expand Down Expand Up @@ -167,6 +250,58 @@ where
Ok(Arc::new(b.finish()))
}

pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError>
where
K: ArrowDictionaryKeyType,
{
let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
array.len(),
1024,
1024,
);
let string_view = array.as_any().downcast_ref::<StringViewArray>().unwrap();
for v in string_view.iter() {
match v {
Some(v) => {
b.append(v)?;
}
None => {
b.append_null();
}
}
}

Ok(Arc::new(b.finish()))
}

pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError>
where
K: ArrowDictionaryKeyType,
{
let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
array.len(),
1024,
1024,
);
let binary_view = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
for v in binary_view.iter() {
match v {
Some(v) => {
b.append(v)?;
}
None => {
b.append_null();
}
}
}

Ok(Arc::new(b.finish()))
}

// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
// key types of K
pub(crate) fn pack_byte_to_dictionary<K, T>(
Expand Down
Loading

0 comments on commit b434eea

Please sign in to comment.