diff --git a/arrow-cast/src/cast/dictionary.rs b/arrow-cast/src/cast/dictionary.rs index d929277a4da3..ee2021d15b60 100644 --- a/arrow-cast/src/cast/dictionary.rs +++ b/arrow-cast/src/cast/dictionary.rs @@ -188,10 +188,34 @@ pub(crate) fn cast_to_dictionary( Decimal256(_, _) => { pack_numeric_to_dictionary::(array, dict_value_type, cast_options) } - Utf8 => pack_byte_to_dictionary::>(array, cast_options), - LargeUtf8 => pack_byte_to_dictionary::>(array, cast_options), - Binary => pack_byte_to_dictionary::>(array, cast_options), - LargeBinary => pack_byte_to_dictionary::>(array, cast_options), + Utf8 => { + // If the input is a view type, we can avoid casting (thus copying) the data + if array.data_type() == &DataType::Utf8View { + return string_view_to_dictionary::(array); + } + pack_byte_to_dictionary::>(array, cast_options) + } + LargeUtf8 => { + // If the input is a view type, we can avoid casting (thus copying) the data + if array.data_type() == &DataType::Utf8View { + return string_view_to_dictionary::(array); + } + pack_byte_to_dictionary::>(array, cast_options) + } + Binary => { + // If the input is a view type, we can avoid casting (thus copying) the data + if array.data_type() == &DataType::BinaryView { + return binary_view_to_dictionary::(array); + } + pack_byte_to_dictionary::>(array, cast_options) + } + LargeBinary => { + // If the input is a view type, we can avoid casting (thus copying) the data + if array.data_type() == &DataType::BinaryView { + return binary_view_to_dictionary::(array); + } + pack_byte_to_dictionary::>(array, cast_options) + } _ => Err(ArrowError::CastError(format!( "Unsupported output type for dictionary packing: {dict_value_type:?}" ))), @@ -226,6 +250,58 @@ where Ok(Arc::new(b.finish())) } +pub(crate) fn string_view_to_dictionary( + array: &dyn Array, +) -> Result +where + K: ArrowDictionaryKeyType, +{ + let mut b = GenericByteDictionaryBuilder::>::with_capacity( + array.len(), + 1024, + 1024, + ); + let string_view = array.as_any().downcast_ref::().unwrap(); + for v in string_view.iter() { + match v { + Some(v) => { + b.append(v)?; + } + None => { + b.append_null(); + } + } + } + + Ok(Arc::new(b.finish())) +} + +pub(crate) fn binary_view_to_dictionary( + array: &dyn Array, +) -> Result +where + K: ArrowDictionaryKeyType, +{ + let mut b = GenericByteDictionaryBuilder::>::with_capacity( + array.len(), + 1024, + 1024, + ); + let binary_view = array.as_any().downcast_ref::().unwrap(); + for v in binary_view.iter() { + match v { + Some(v) => { + b.append(v)?; + } + None => { + b.append_null(); + } + } + } + + Ok(Arc::new(b.finish())) +} + // Packs the data as a GenericByteDictionaryBuilder, if possible, with the // key types of K pub(crate) fn pack_byte_to_dictionary( diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index e073e34cb6e4..354c31af6958 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -5205,10 +5205,10 @@ mod tests { const VIEW_TEST_DATA: [Option<&str>; 5] = [ Some("hello"), - Some("world"), + Some("repeated"), None, Some("large payload over 12 bytes"), - Some("lulu"), + Some("repeated"), ]; fn _test_string_to_view() @@ -5291,6 +5291,26 @@ mod tests { assert_eq!(casted_binary_array.as_ref(), &binary_view_array); } + #[test] + fn test_view_to_dict() { + let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA); + let string_dict_array: DictionaryArray = VIEW_TEST_DATA.into_iter().collect(); + let casted_type = string_dict_array.data_type(); + let casted_dict_array = cast(&string_view_array, casted_type).unwrap(); + assert_eq!(casted_dict_array.data_type(), casted_type); + assert_eq!(casted_dict_array.as_ref(), &string_dict_array); + + let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); + let binary_dict_array = string_dict_array.downcast_dict::().unwrap(); + let binary_buffer = cast(&binary_dict_array.values(), &DataType::Binary).unwrap(); + let binary_dict_array = + DictionaryArray::::new(binary_dict_array.keys().clone(), binary_buffer); + let casted_type = binary_dict_array.data_type(); + let casted_binary_array = cast(&binary_view_array, casted_type).unwrap(); + assert_eq!(casted_binary_array.data_type(), casted_type); + assert_eq!(casted_binary_array.as_ref(), &binary_dict_array); + } + #[test] fn test_view_to_string() { _test_view_to_string::(); @@ -5330,23 +5350,15 @@ mod tests { where O: OffsetSizeTrait, { - let data: Vec> = vec![ - Some(b"hello"), - Some(b"world"), - None, - Some(b"large payload over 12 bytes"), - Some(b"lulu"), - ]; - let view_array = { let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers. - for s in data.iter() { + for s in VIEW_TEST_DATA.iter() { builder.append_option(*s); } builder.finish() }; - let expected_binary_array = GenericBinaryArray::::from(data); + let expected_binary_array = GenericBinaryArray::::from_iter(VIEW_TEST_DATA); let expected_type = expected_binary_array.data_type(); assert!(can_cast_types(view_array.data_type(), expected_type));