From 7a5155c5f0e21559203d4a5363cffd7ec0394817 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Mon, 9 Sep 2024 11:29:53 -0400 Subject: [PATCH] Add support for Utf8View in arrow_string::length (#6345) * Add support for Utf8View in arrow_string::length #6305 * Cargo fmt. --- arrow-string/src/length.rs | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 79fa46026912..479c444edce2 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -48,7 +48,7 @@ fn bit_length_impl( /// For list array, length is the number of elements in each list. /// For string array and binary array, length is the number of bytes of each value. /// -/// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray, BinaryArray/LargeBinaryArray, and FixedSizeListArray, +/// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray/StringViewArray, BinaryArray/LargeBinaryArray, and FixedSizeListArray, /// or DictionaryArray with above Arrays as values /// * length of null is null. pub fn length(array: &dyn Array) -> Result { @@ -74,6 +74,14 @@ pub fn length(array: &dyn Array) -> Result { let list = array.as_string::(); Ok(length_impl::(list.offsets(), list.nulls())) } + DataType::Utf8View => { + let list = array.as_string_view(); + let v = list.views().iter().map(|v| *v as i32).collect::>(); + Ok(Arc::new(PrimitiveArray::::new( + v.into(), + list.nulls().cloned(), + ))) + } DataType::Binary => { let list = array.as_binary::(); Ok(length_impl::(list.offsets(), list.nulls())) @@ -147,9 +155,15 @@ mod tests { fn length_cases_string() -> Vec<(Vec<&'static str>, usize, Vec)> { // a large array - let values = ["one", "on", "o", ""]; + let values = [ + "one", + "on", + "o", + "", + "this is a longer string to test string array with", + ]; let values = values.into_iter().cycle().take(4096).collect(); - let expected = [3, 2, 1, 0].into_iter().cycle().take(4096).collect(); + let expected = [3, 2, 1, 0, 49].into_iter().cycle().take(4096).collect(); vec![ (vec!["hello", " ", "world"], 3, vec![5, 1, 5]), @@ -210,6 +224,21 @@ mod tests { }) } + #[test] + fn length_test_string_view() { + length_cases_string() + .into_iter() + .for_each(|(input, len, expected)| { + let array = StringViewArray::from(input); + let result = length(&array).unwrap(); + assert_eq!(len, result.len()); + let result = result.as_any().downcast_ref::().unwrap(); + expected.iter().enumerate().for_each(|(i, value)| { + assert_eq!(*value, result.value(i)); + }); + }) + } + #[test] fn length_test_binary() { let value: Vec<&[u8]> = vec![b"zero", b"one", &[0xff, 0xf8]];