Skip to content

Commit

Permalink
bench: improve scalar_regex_match
Browse files Browse the repository at this point in the history
  • Loading branch information
zhuliquan committed Dec 3, 2024
1 parent 7186929 commit d7516f2
Showing 1 changed file with 62 additions and 44 deletions.
106 changes: 62 additions & 44 deletions datafusion/physical-expr/benches/scalar_regex_match.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ use arrow_array::{RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr_common::operator::Operator;
use datafusion_physical_expr::expressions::{binary, col, lit, scalar_regex_match};
use hashbrown::HashMap;
use datafusion_physical_expr::{
expressions::{binary, col, lit, scalar_regex_match},
PhysicalExpr,
};
use rand::distributions::{Alphanumeric, DistString};

/// make a record batch with one column and n rows
Expand All @@ -39,83 +41,99 @@ fn make_record_batch(rows: usize, string_length: usize, schema: Schema) -> Recor
RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap()
}

fn scalar_regex_match_benchmark(c: &mut Criterion) {
/// initialize benchmark data and pattern literals
fn init_benchmark() -> (
Vec<(usize, RecordBatch)>,
Schema,
Arc<dyn PhysicalExpr>,
Vec<(String, Arc<dyn PhysicalExpr>)>,
) {
// make common schema
let column = "string";
let schema = Schema::new(vec![Field::new(column, DataType::Utf8, true)]);

// make test record batches
let test_batch = [
(10, make_record_batch(10, 100, schema.clone())),
(100, make_record_batch(100, 100, schema.clone())),
(1000, make_record_batch(1000, 100, schema.clone())),
(2000, make_record_batch(2000, 100, schema.clone())),
]
.iter()
.map(|(k, v)| (*k, v.clone()))
.collect::<HashMap<_, _>>();
let batch_data = vec![
// (10_usize, make_record_batch(10, 100, schema.clone())),
// (100_usize, make_record_batch(100, 100, schema.clone())),
// (1000_usize, make_record_batch(1000, 100, schema.clone())),
(2000_usize, make_record_batch(2000, 100, schema.clone())),
];

// string column
let string_col = col(column, &schema).unwrap();

// regex pattern literals to benchmark
let pattern_lit = [
("email".to_string(), lit(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")),
("url".to_string(), lit(r"^(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]$")),
("ip".to_string(), lit(r"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$")),
("phone".to_string(), lit(r"^(\+\d{1,2}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$")),
("zip_code".to_string(), lit(r"^\d{5}(?:[-\s]\d{4})?$")),
].iter()
.map(|(k, v)| (k.clone(), v.clone()))
.collect::<HashMap<_, _>>();
let pattern_lit = vec![
(
format!("email"),
lit(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"),
),
(
format!("url"),
lit(r"^(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]$"),
),
(
format!("ip"),
lit(
r"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$",
),
),
(
format!("phone"),
lit(r"^(\+\d{1,2}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$"),
),
(format!("zip_code"), lit(r"^\d{5}(?:[-\s]\d{4})?$")),
];
(batch_data, schema, string_col, pattern_lit)
}

fn regex_match_benchmark(c: &mut Criterion) {
let (batch_data, schema, string_col, pattern_lit) = init_benchmark();
// let record_batch_run_times = [10, 20, 50, 100];
let record_batch_run_times = [10];
for (name, regexp_lit) in pattern_lit.iter() {
for (rows, batch) in test_batch.iter() {
for iter in [10, 20, 50, 100] {
// scalar regex match benchmarks
let bench_name = format!(
"scalar_regex_match_pattern_{}_rows_{}_iter_{}",
name, rows, iter
);
c.bench_function(bench_name.as_str(), |b| {
let expr = scalar_regex_match(
false,
false,
for (rows, batch) in batch_data.iter() {
for run_time in record_batch_run_times {
let group_name =
format!("regex_{}_rows_{}_run_time_{}", name, rows, run_time);
let mut group = c.benchmark_group(group_name.as_str());
// binary expr match benchmarks
group.bench_function("binary_expr_match", |b| {
let expr = binary(
string_col.clone(),
Operator::RegexMatch,
regexp_lit.clone(),
&schema,
)
.unwrap();
b.iter(|| {
for _ in 0..iter {
for _ in 0..run_time {
expr.evaluate(black_box(batch)).unwrap();
}
});
});

// binary regex match benchmarks
let bench_name = format!(
"binary_regex_match_pattern_{}_rows_{}_iter_{}",
name, rows, iter
);
c.bench_function(bench_name.as_str(), |b| {
let expr = binary(
// scalar regex match benchmarks
group.bench_function("scalar_regex_match", |b| {
let expr = scalar_regex_match(
false,
false,
string_col.clone(),
Operator::RegexMatch,
regexp_lit.clone(),
&schema,
)
.unwrap();
b.iter(|| {
for _ in 0..iter {
for _ in 0..run_time {
expr.evaluate(black_box(batch)).unwrap();
}
});
});
group.finish();
}
}
}
}

criterion_group!(benches, scalar_regex_match_benchmark);
criterion_group!(benches, regex_match_benchmark);
criterion_main!(benches);

0 comments on commit d7516f2

Please sign in to comment.