From 855666d9e9283c1ef11648762fe92c7c188b68f1 Mon Sep 17 00:00:00 2001 From: Xin Li <33629085+xinlifoobar@users.noreply.github.com> Date: Sun, 25 Aug 2024 06:24:21 -0700 Subject: [PATCH] Specialize Prefix/Suffix Match for `Like/ILike` between Array and Scalar for StringViewArray (#6231) * v2 impl * Add bench * fix clippy * fix endswith * Finalize the prefix_v2 implementation * stop reverse string for ends_with * Fix comments * fix bad comment * Correct equals sematics --- arrow-array/src/array/byte_view_array.rs | 64 ++++++++++++++ arrow-string/src/like.rs | 21 +++++ arrow-string/src/predicate.rs | 108 +++++++++++++++++++---- 3 files changed, 176 insertions(+), 17 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index a155b6ab22e2..3f5a952c282e 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -24,6 +24,7 @@ use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, S use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder, ByteView}; use arrow_schema::{ArrowError, DataType}; +use core::str; use num::ToPrimitive; use std::any::Any; use std::fmt::Debug; @@ -301,6 +302,69 @@ impl GenericByteViewArray { ArrayIter::new(self) } + /// Returns an iterator over the bytes of this array. + pub fn bytes_iter(&self) -> impl Iterator { + self.views.iter().map(move |v| { + let len = *v as u32; + if len <= 12 { + unsafe { Self::inline_value(v, len as usize) } + } else { + let view = ByteView::from(*v); + let data = &self.buffers[view.buffer_index as usize]; + let offset = view.offset as usize; + unsafe { data.get_unchecked(offset..offset + len as usize) } + } + }) + } + + /// Returns an iterator over the prefix bytes of this array with respect to the prefix length. + /// If the prefix length is larger than the string length, it will return the empty slice. + pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator { + self.views().into_iter().map(move |v| { + let len = (*v as u32) as usize; + + if len < prefix_len { + return &[] as &[u8]; + } + + if prefix_len <= 4 || len <= 12 { + unsafe { StringViewArray::inline_value(v, prefix_len) } + } else { + let view = ByteView::from(*v); + let data = unsafe { + self.data_buffers() + .get_unchecked(view.buffer_index as usize) + }; + let offset = view.offset as usize; + unsafe { data.get_unchecked(offset..offset + prefix_len) } + } + }) + } + + /// Returns an iterator over the suffix bytes of this array with respect to the suffix length. + /// If the suffix length is larger than the string length, it will return the empty slice. + pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator { + self.views().into_iter().map(move |v| { + let len = (*v as u32) as usize; + + if len < suffix_len { + return &[] as &[u8]; + } + + if len <= 12 { + unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] } + } else { + let view = ByteView::from(*v); + let data = unsafe { + self.data_buffers() + .get_unchecked(view.buffer_index as usize) + }; + let offset = view.offset as usize; + unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) } + } + }) + } + /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { Self { diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index e878e241853a..4626be1362e9 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -989,6 +989,27 @@ mod tests { vec![false, true, true, false, false, false, false, true, true, true, true] ); + // 😈 is four bytes long. + test_utf8_scalar!( + test_uff8_array_like_multibyte, + vec![ + "sdlkdfFooßsdfs", + "sdlkdfFooSSdggs", + "sdlkdfFoosssdsd", + "FooS", + "Foos", + "ffooSS", + "ffooß", + "😃sadlksffofsSsh😈klF", + "😱slgffoesSsh😈klF", + "FFKoSS", + "longer than 12 bytes FFKoSS", + ], + "%Ssh😈klF", + like, + vec![false, false, false, false, false, false, false, true, true, false, false] + ); + test_utf8_scalar!( test_utf8_array_ilike_scalar_one, vec![ diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index ec0c4827830c..f559088e6c96 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::{ArrayAccessor, BooleanArray}; +use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray}; use arrow_schema::ArrowError; use memchr::memchr2; use memchr::memmem::Finder; @@ -114,21 +114,92 @@ impl<'a> Predicate<'a> { Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { haystack.eq_ignore_ascii_case(v) != negate }), - Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| { - finder.find(haystack.as_bytes()).is_some() != negate - }), - Predicate::StartsWith(v) => BooleanArray::from_unary(array, |haystack| { - starts_with(haystack, v, equals_kernel) != negate - }), - Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, |haystack| { - starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate - }), - Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| { - ends_with(haystack, v, equals_kernel) != negate - }), - Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, |haystack| { - ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate - }), + Predicate::Contains(finder) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + BooleanArray::from( + string_view_array + .bytes_iter() + .map(|haystack| finder.find(haystack).is_some() != negate) + .collect::>(), + ) + } else { + BooleanArray::from_unary(array, |haystack| { + finder.find(haystack.as_bytes()).is_some() != negate + }) + } + } + Predicate::StartsWith(v) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + BooleanArray::from( + string_view_array + .prefix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate + }) + .collect::>(), + ) + } else { + BooleanArray::from_unary(array, |haystack| { + starts_with(haystack, v, equals_kernel) != negate + }) + } + } + Predicate::IStartsWithAscii(v) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + BooleanArray::from( + string_view_array + .prefix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes( + haystack, + v.as_bytes(), + equals_ignore_ascii_case_kernel, + ) != negate + }) + .collect::>(), + ) + } else { + BooleanArray::from_unary(array, |haystack| { + starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + }) + } + } + Predicate::EndsWith(v) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + BooleanArray::from( + string_view_array + .suffix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate + }) + .collect::>(), + ) + } else { + BooleanArray::from_unary(array, |haystack| { + ends_with(haystack, v, equals_kernel) != negate + }) + } + } + Predicate::IEndsWithAscii(v) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + BooleanArray::from( + string_view_array + .suffix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes( + haystack, + v.as_bytes(), + equals_ignore_ascii_case_kernel, + ) != negate + }) + .collect::>(), + ) + } else { + BooleanArray::from_unary(array, |haystack| { + ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + }) + } + } Predicate::Regex(v) => { BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate) } @@ -136,6 +207,10 @@ impl<'a> Predicate<'a> { } } +fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { + lhs.len() == rhs.len() && zip(lhs, rhs).all(byte_eq_kernel) +} + /// This is faster than `str::starts_with` for small strings. /// See for more details. fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { @@ -145,7 +220,6 @@ fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel) } } - /// This is faster than `str::ends_with` for small strings. /// See for more details. fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {