From 72c9505bed6260865fa95a637f37219cdc129a0e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 23 Dec 2023 02:09:55 -0800 Subject: [PATCH] Improve regexp kernels performance by avoiding cloning Regex (#5235) * Improve regexp_match performance by avoiding cloning Regex * For review --- arrow-string/src/regexp.rs | 10 +++----- arrow/Cargo.toml | 5 ++++ arrow/benches/regexp_kernels.rs | 44 +++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 arrow/benches/regexp_kernels.rs diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index 34bb1b0b4c41..25c712d20f08 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -81,15 +81,14 @@ pub fn regexp_is_match_utf8( (Some(value), Some(pattern)) => { let existing_pattern = patterns.get(&pattern); let re = match existing_pattern { - Some(re) => re.clone(), + Some(re) => re, None => { let re = Regex::new(pattern.as_str()).map_err(|e| { ArrowError::ComputeError(format!( "Regular expression did not compile: {e:?}" )) })?; - patterns.insert(pattern, re.clone()); - re + patterns.entry(pattern).or_insert(re) } }; result.append(re.is_match(value)); @@ -216,15 +215,14 @@ pub fn regexp_match( (Some(value), Some(pattern)) => { let existing_pattern = patterns.get(&pattern); let re = match existing_pattern { - Some(re) => re.clone(), + Some(re) => re, None => { let re = Regex::new(pattern.as_str()).map_err(|e| { ArrowError::ComputeError(format!( "Regular expression did not compile: {e:?}" )) })?; - patterns.insert(pattern, re.clone()); - re + patterns.entry(pattern).or_insert(re) } }; match re.captures(value) { diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index a6b4ddf51dfb..168a58b295f9 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -247,6 +247,11 @@ name = "substring_kernels" harness = false required-features = ["test_utils"] +[[bench]] +name = "regexp_kernels" +harness = false +required-features = ["test_utils"] + [[bench]] name = 
"array_data_validate" harness = false diff --git a/arrow/benches/regexp_kernels.rs b/arrow/benches/regexp_kernels.rs new file mode 100644 index 000000000000..eb38ba6783bc --- /dev/null +++ b/arrow/benches/regexp_kernels.rs @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +extern crate criterion; +use criterion::Criterion; + +extern crate arrow; + +use arrow::array::*; +use arrow::compute::kernels::regexp::*; +use arrow::util::bench_util::*; + +fn bench_regexp(arr: &GenericStringArray<i32>, regex_array: &GenericStringArray<i32>) { + regexp_match(criterion::black_box(arr), regex_array, None).unwrap(); +} + +fn add_benchmark(c: &mut Criterion) { + let size = 65536; + let val_len = 1000; + + let arr_string = create_string_array_with_len::<i32>(size, 0.0, val_len); + let pattern_values = vec![r".*-(\d*)-.*"; size]; + let pattern = GenericStringArray::<i32>::from(pattern_values); + + c.bench_function("regexp", |b| b.iter(|| bench_regexp(&arr_string, &pattern))); +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches);