
Commit

Specialize Prefix/Suffix Match for Like/ILike between Array and Scalar for StringViewArray (#6231)

* v2 impl

* Add bench

* fix clippy

* fix endswith

* Finalize the prefix_v2 implementation

* stop reverse string for ends_with

* Fix comments

* fix bad comment

* Correct equals semantics
xinlifoobar authored Aug 25, 2024
1 parent f73dbc3 commit 855666d
Showing 3 changed files with 176 additions and 17 deletions.
64 changes: 64 additions & 0 deletions arrow-array/src/array/byte_view_array.rs
@@ -24,6 +24,7 @@ use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, S
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
use arrow_schema::{ArrowError, DataType};
use core::str;
use num::ToPrimitive;
use std::any::Any;
use std::fmt::Debug;
@@ -301,6 +302,69 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
ArrayIter::new(self)
}

/// Returns an iterator over the bytes of this array.
pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
self.views.iter().map(move |v| {
let len = *v as u32;
if len <= 12 {
unsafe { Self::inline_value(v, len as usize) }
} else {
let view = ByteView::from(*v);
let data = &self.buffers[view.buffer_index as usize];
let offset = view.offset as usize;
unsafe { data.get_unchecked(offset..offset + len as usize) }
}
})
}

/// Returns an iterator over the prefix bytes of this array with respect to the prefix length.
/// If the prefix length is larger than the string length, it will return the empty slice.
pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
self.views().into_iter().map(move |v| {
let len = (*v as u32) as usize;

if len < prefix_len {
return &[] as &[u8];
}

if prefix_len <= 4 || len <= 12 {
unsafe { StringViewArray::inline_value(v, prefix_len) }
} else {
let view = ByteView::from(*v);
let data = unsafe {
self.data_buffers()
.get_unchecked(view.buffer_index as usize)
};
let offset = view.offset as usize;
unsafe { data.get_unchecked(offset..offset + prefix_len) }
}
})
}

/// Returns an iterator over the suffix bytes of this array with respect to the suffix length.
/// If the suffix length is larger than the string length, it will return the empty slice.
pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
self.views().into_iter().map(move |v| {
let len = (*v as u32) as usize;

if len < suffix_len {
return &[] as &[u8];
}

if len <= 12 {
unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
} else {
let view = ByteView::from(*v);
let data = unsafe {
self.data_buffers()
.get_unchecked(view.buffer_index as usize)
};
let offset = view.offset as usize;
unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
}
})
}

/// Returns a zero-copy slice of this array with the indicated offset and length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
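
A minimal usage sketch of the three new iterators (not part of the commit; the constructor and asserted values are illustrative only):

use arrow_array::StringViewArray;

fn main() {
    let array = StringViewArray::from(vec!["short", "a string longer than twelve bytes"]);

    // Full byte slice of every value, whether it is inlined in the view
    // or stored out-of-line in a data buffer.
    let all: Vec<&[u8]> = array.bytes_iter().collect();
    assert_eq!(all[0], b"short");

    // First three bytes of every value; values shorter than the prefix yield &[].
    let prefixes: Vec<&[u8]> = array.prefix_bytes_iter(3).collect();
    assert_eq!(prefixes[0], b"sho");

    // Last three bytes of every value; values shorter than the suffix yield &[].
    let suffixes: Vec<&[u8]> = array.suffix_bytes_iter(3).collect();
    assert_eq!(suffixes[0], b"ort");
}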
21 changes: 21 additions & 0 deletions arrow-string/src/like.rs
@@ -989,6 +989,27 @@ mod tests {
vec![false, true, true, false, false, false, false, true, true, true, true]
);

// 😈 is four bytes long.
test_utf8_scalar!(
test_uff8_array_like_multibyte,
vec![
"sdlkdfFooßsdfs",
"sdlkdfFooSSdggs",
"sdlkdfFoosssdsd",
"FooS",
"Foos",
"ffooSS",
"ffooß",
"😃sadlksffofsSsh😈klF",
"😱slgffoesSsh😈klF",
"FFKoSS",
"longer than 12 bytes FFKoSS",
],
"%Ssh😈klF",
like,
vec![false, false, false, false, false, false, false, true, true, false, false]
);

test_utf8_scalar!(
test_utf8_array_ilike_scalar_one,
vec![
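
For context, a sketch of how the new multibyte test's path is reached from user code (not part of the commit; it assumes arrow's Datum-based `like` kernel and mirrors two rows of the test data above):

use arrow_array::{BooleanArray, Scalar, StringViewArray};
use arrow_string::like::like;

fn main() -> Result<(), arrow_schema::ArrowError> {
    let haystacks = StringViewArray::from(vec!["😃sadlksffofsSsh😈klF", "FFKoSS"]);
    // Leading `%` and no other wildcards, so the kernel can take the
    // specialized suffix (ends-with) path for StringViewArray.
    let pattern = Scalar::new(StringViewArray::from(vec!["%Ssh😈klF"]));

    let matched = like(&haystacks, &pattern)?;
    assert_eq!(matched, BooleanArray::from(vec![true, false]));
    Ok(())
}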
108 changes: 91 additions & 17 deletions arrow-string/src/predicate.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use arrow_array::{ArrayAccessor, BooleanArray};
use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray};
use arrow_schema::ArrowError;
use memchr::memchr2;
use memchr::memmem::Finder;
@@ -114,28 +114,103 @@ impl<'a> Predicate<'a> {
Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| {
haystack.eq_ignore_ascii_case(v) != negate
}),
Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| {
finder.find(haystack.as_bytes()).is_some() != negate
}),
Predicate::StartsWith(v) => BooleanArray::from_unary(array, |haystack| {
starts_with(haystack, v, equals_kernel) != negate
}),
Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, |haystack| {
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
}),
Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| {
ends_with(haystack, v, equals_kernel) != negate
}),
Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, |haystack| {
ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
}),
Predicate::Contains(finder) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.bytes_iter()
.map(|haystack| finder.find(haystack).is_some() != negate)
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
finder.find(haystack.as_bytes()).is_some() != negate
})
}
}
Predicate::StartsWith(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.prefix_bytes_iter(v.len())
.map(|haystack| {
equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate
})
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
starts_with(haystack, v, equals_kernel) != negate
})
}
}
Predicate::IStartsWithAscii(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.prefix_bytes_iter(v.len())
.map(|haystack| {
equals_bytes(
haystack,
v.as_bytes(),
equals_ignore_ascii_case_kernel,
) != negate
})
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
})
}
}
Predicate::EndsWith(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.suffix_bytes_iter(v.len())
.map(|haystack| {
equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate
})
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
ends_with(haystack, v, equals_kernel) != negate
})
}
}
Predicate::IEndsWithAscii(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.suffix_bytes_iter(v.len())
.map(|haystack| {
equals_bytes(
haystack,
v.as_bytes(),
equals_ignore_ascii_case_kernel,
) != negate
})
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
})
}
}
Predicate::Regex(v) => {
BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate)
}
}
}
}

fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {
lhs.len() == rhs.len() && zip(lhs, rhs).all(byte_eq_kernel)
}

/// This is faster than `str::starts_with` for small strings.
/// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {
@@ -145,7 +220,6 @@ fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8))
zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel)
}
}

/// This is faster than `str::ends_with` for small strings.
/// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {
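
To illustrate the shape of the specialized arms above, a simplified, standalone version of the `StartsWith` case (the free function here is hypothetical; the committed code goes through `equals_bytes` so the same loop can also serve the ASCII case-insensitive variants):

use arrow_array::{BooleanArray, StringViewArray};

fn starts_with_view(array: &StringViewArray, needle: &str, negate: bool) -> BooleanArray {
    BooleanArray::from(
        array
            .prefix_bytes_iter(needle.len())
            // Values shorter than the needle yield an empty prefix, which can
            // never equal a non-empty needle — the same outcome as the length
            // check inside `equals_bytes`.
            .map(|prefix| (prefix == needle.as_bytes()) != negate)
            .collect::<Vec<_>>(),
    )
}

fn main() {
    let array = StringViewArray::from(vec!["Foobar", "barFoo"]);
    assert_eq!(
        starts_with_view(&array, "Foo", false),
        BooleanArray::from(vec![true, false])
    );
}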
