Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: scalar regex match physical expr #12270

Open
wants to merge 38 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
d357df7
feat: scalar regex match physical expr
zhuliquan Aug 31, 2024
02f58b2
bench: add scalar regex match benchmarks
zhuliquan Sep 17, 2024
2dcb317
feat: apply scalar_regex_match optimize to similar_to case
zhuliquan Sep 18, 2024
3a33a71
minor: regen datafusion protobuf
Sep 18, 2024
930b7a8
bench: improve scalar_regex_match
zhuliquan Nov 9, 2024
5f2b555
minor: update cargo.lock
zhuliquan Nov 9, 2024
f309341
fix: fix wrong merge conflict
zhuliquan Dec 3, 2024
a0ba73a
bench: init expr in scalar_regex_match bench iter
zhuliquan Dec 6, 2024
25d9f93
Merge branch 'apache:main' into feature-scalar_regexp_match_expr
zhuliquan Dec 6, 2024
7014ac3
bench: diff batch run over in scalar_regex_match
zhuliquan Dec 7, 2024
ef66c56
improve: improve performance of scalar_regex_match
zhuliquan Dec 8, 2024
13adab5
Minor: Comment temporary function for documentation migration (#13669)
comphead Dec 6, 2024
7cfaf1e
Minor: Rephrase MSRV policy to be more explanatory (#13668)
comphead Dec 6, 2024
98b7488
fix: repartitioned reads of CSV with custom line terminator (#13677)
korowa Dec 7, 2024
14dcf20
chore: macros crate cleanup (#13685)
findepi Dec 7, 2024
f6cafba
Refactor regexplike signature (#13394)
jiashenC Dec 8, 2024
cebf94f
Performance: enable array allocation reuse (`ScalarFunctionArgs` gets…
alamb Dec 8, 2024
6dd3f3a
Temporary fix for CI (#13689)
jonahgao Dec 8, 2024
de36fb6
refactor: use `LazyLock` in the `user_doc` macro (#13684)
jonahgao Dec 8, 2024
7728525
Unlock lexical-write-integer version. (#13693)
Alexhuszagh Dec 9, 2024
4884ac2
Minor: Use `div_ceil`
akurmustafa Dec 9, 2024
021a500
Fix hash join with sort push down (#13560)
haohuaijin Dec 9, 2024
ec5e038
Improve substr() performance by avoiding using owned string (#13688)
richox Dec 9, 2024
412d3f6
reinstate down_cast_any_ref (#13705)
andygrove Dec 9, 2024
dc17dd6
Optimize performance of `character_length` function (#13696)
tlm365 Dec 10, 2024
9b57875
Update prost-build requirement from =0.13.3 to =0.13.4 (#13698)
dependabot[bot] Dec 10, 2024
4a08545
Minor: Output elapsed time for sql logic test (#13718)
comphead Dec 10, 2024
d02d587
refactor: simplify the `make_udf_function` macro (#13712)
jonahgao Dec 11, 2024
0e41341
refactor: replace `Vec` with `IndexMap` for expression mappings in `P…
Weijun-H Dec 11, 2024
a8fc264
Handle alias when parsing sql(parse_sql_expr) (#12939)
Eason0729 Dec 11, 2024
a505610
Improve documentation for TableProvider (#13724)
alamb Dec 11, 2024
4fb668b
Reveal implementing type and return type in simple UDF implementation…
findepi Dec 11, 2024
1ab089e
minor: Extract tests for `EXTRACT` AND `date_part` to their own file …
alamb Dec 11, 2024
2b65fb3
Support unparsing `UNNEST` plan to `UNNEST` table factor SQL (#13660)
goldmedal Dec 11, 2024
79cb7d6
feat: new way to make bool_buffer in scalar_regex_match
zhuliquan Dec 15, 2024
ac5f313
Merge branch 'main' into feature-scalar_regexp_match_expr
zhuliquan Dec 15, 2024
c53ed77
fix: take fmt suggestion
zhuliquan Dec 15, 2024
6a8f1ae
Merge branch 'apache:main' into feature-scalar_regexp_match_expr
zhuliquan Dec 21, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions datafusion/physical-expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ itertools = { workspace = true, features = ["use_std"] }
log = { workspace = true }
paste = "^1.0"
petgraph = "0.6.2"
regex = { workspace = true }

[dev-dependencies]
arrow = { workspace = true, features = ["test_utils"] }
Expand All @@ -72,3 +73,7 @@ name = "case_when"
[[bench]]
harness = false
name = "is_null"

[[bench]]
harness = false
name = "scalar_regex_match"
175 changes: 175 additions & 0 deletions datafusion/physical-expr/benches/scalar_regex_match.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::sync::Arc;
use std::time::Duration;

use arrow_array::{RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema};
use criterion::{criterion_group, criterion_main, Criterion};
use datafusion_expr_common::operator::Operator;
use datafusion_physical_expr::{
expressions::{binary, col, lit, scalar_regex_match},
PhysicalExpr,
};
use rand::{
distributions::{Alphanumeric, DistString},
rngs::StdRng,
SeedableRng,
};

/// Build `batch_iter` record batches, each holding a single string column,
/// for the scalar regex match benchmarks.
///
/// Every batch contains `batch_size` random alphanumeric strings of
/// `string_len` characters, followed by one row per entry of `matched_str`.
/// Each batch therefore has `batch_size + matched_str.len()` rows in total.
///
/// The random generator is created from a fixed seed, so the generated data
/// does not depend on when the benchmark is run.
///
/// # Panics
///
/// Panics if `RecordBatch::try_new` rejects the generated column for `schema`.
fn make_record_batch(
    batch_iter: usize,
    batch_size: usize,
    string_len: usize,
    matched_str: &[&str],
    schema: &Schema,
) -> Vec<RecordBatch> {
    let mut rng = StdRng::seed_from_u64(12345);
    // The schema is the same for every batch: clone it once and share the
    // `Arc` instead of deep-cloning it on each iteration.
    let schema = Arc::new(schema.clone());
    let rows_per_batch = batch_size + matched_str.len();

    let mut batches = Vec::with_capacity(batch_iter);
    for _ in 0..batch_iter {
        let mut values: Vec<Option<String>> = Vec::with_capacity(rows_per_batch);
        // Random rows first, then the caller-supplied rows, in that order.
        values.extend(
            (0..batch_size)
                .map(|_| Some(Alphanumeric.sample_string(&mut rng, string_len))),
        );
        values.extend(matched_str.iter().map(|s| Some(s.to_string())));

        let column = StringArray::from(values);
        let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(column)])
            .expect("failed to build a record batch from the generated string column");
        batches.push(batch);
    }
    batches
}

/// Prepare everything the regex benchmarks share.
///
/// Returns `(batch_data, schema, string_col, pattern_lit)`:
///
/// * `batch_data` - one entry per data set, as
///   `(number of batches, random rows per batch, the batches themselves)`
/// * `schema` - schema with the single string column `s`
/// * `string_col` - physical column expression referring to `s`
/// * `pattern_lit` - `(name, literal expression)` pairs, one per regex pattern
#[allow(clippy::type_complexity)]
fn init_benchmark() -> (
    Vec<(usize, usize, Vec<RecordBatch>)>,
    Schema,
    Arc<dyn PhysicalExpr>,
    Vec<(String, Arc<dyn PhysicalExpr>)>,
) {
    // Schema shared by every batch: one nullable Utf8 column.
    let column = "s";
    let schema = Schema::new(vec![Field::new(column, DataType::Utf8, true)]);

    // Input data: a single data set of 128 batches with 4096 random rows each.
    let batch_iter = 128_usize;
    let batch_size = 4096_usize;
    let string_len = 100;
    // These values are appended to every batch after the random rows.
    let matched_str = [
        "example@email.com",
        "http://example.com",
        "123.4.5.6",
        "1236787788",
        "55555",
    ];
    let batches =
        make_record_batch(batch_iter, batch_size, string_len, &matched_str, &schema);
    let batch_data = vec![(batch_iter, batch_size, batches)];

    // Expression reading the string column.
    let string_col = col(column, &schema).unwrap();

    // Named regex patterns, each wrapped in a literal expression.
    let pattern_lit: Vec<(String, Arc<dyn PhysicalExpr>)> = [
        (
            "email",
            r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$",
        ),
        (
            "url",
            r"^(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]$",
        ),
        (
            "ip",
            r"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$",
        ),
        (
            "phone",
            r"^(\+\d{1,2}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$",
        ),
        ("zip_code", r"^\d{5}(?:[-\s]\d{4})?$"),
    ]
    .into_iter()
    .map(|(label, pattern)| (label.to_string(), lit(pattern)))
    .collect();

    (batch_data, schema, string_col, pattern_lit)
}

fn regex_match_benchmark(c: &mut Criterion) {
let (batch_data, schema, string_col, pattern_lit) = init_benchmark();
for (name, regexp_lit) in pattern_lit.iter() {
for (batch_iter, batch_size, batches) in batch_data.iter() {
let group_name = format!(
"regex_{}_batch_iter_{}_batch_size_{}",
name, batch_iter, batch_size
);
let mut group = c.benchmark_group(group_name.as_str());

group.sample_size(50).measurement_time(Duration::new(30, 0));

// binary expr match benchmarks
group.bench_function("binary_expr_match", |b| {
b.iter(|| {
let expr = binary(
string_col.clone(),
Operator::RegexMatch,
regexp_lit.clone(),
&schema,
)
.unwrap();
for batch in batches.iter() {
expr.evaluate(batch).unwrap();
}
});
});
// scalar regex match benchmarks
group.bench_function("scalar_regex_match", |b| {
b.iter(|| {
let expr = scalar_regex_match(
false,
false,
string_col.clone(),
regexp_lit.clone(),
&schema,
)
.unwrap();
for batch in batches.iter() {
expr.evaluate(batch).unwrap();
}
});
});
group.finish();
}
}
}

criterion_group!(benches, regex_match_benchmark);
criterion_main!(benches);
2 changes: 2 additions & 0 deletions datafusion/physical-expr/src/expressions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ mod literal;
mod negative;
mod no_op;
mod not;
mod scalar_regex_match;
mod try_cast;
mod unknown_column;

Expand All @@ -50,5 +51,6 @@ pub use literal::{lit, Literal};
pub use negative::{negative, NegativeExpr};
pub use no_op::NoOp;
pub use not::{not, NotExpr};
pub use scalar_regex_match::{scalar_regex_match, ScalarRegexMatchExpr};
pub use try_cast::{try_cast, TryCastExpr};
pub use unknown_column::UnKnownColumn;
Loading
Loading