Skip to content

Commit

Permalink
bench: add scalar regex match benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
zhuliquan committed Sep 17, 2024
1 parent a600f4d commit d49edca
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 0 deletions.
4 changes: 4 additions & 0 deletions datafusion/physical-expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,7 @@ name = "case_when"
[[bench]]
harness = false
name = "is_null"

[[bench]]
harness = false
name = "scalar_regex_match"
104 changes: 104 additions & 0 deletions datafusion/physical-expr/benches/scalar_regex_match.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
use std::sync::Arc;

use arrow_array::{RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr_common::operator::Operator;
use datafusion_physical_expr::expressions::{binary, col, lit, scalar_regex_match};
use hashbrown::HashMap;
use rand::distributions::{Alphanumeric, DistString};

/// Builds a single-column record batch of random strings for the
/// scalar regex match benchmarks.
///
/// The column holds `rows` non-null alphanumeric strings, each
/// `string_length` characters long, and is paired with the caller-supplied
/// `schema`.
///
/// # Panics
///
/// Panics if `RecordBatch::try_new` rejects the schema/column combination.
fn make_record_batch(rows: usize, string_length: usize, schema: Schema) -> RecordBatch {
    let mut rng = rand::thread_rng();
    let values: Vec<Option<String>> = (0..rows)
        .map(|_| Some(Alphanumeric.sample_string(&mut rng, string_length)))
        .collect();
    let column = StringArray::from(values);
    RecordBatch::try_new(Arc::new(schema), vec![Arc::new(column)]).unwrap()
}

fn scalar_regex_match_benchmark(c: &mut Criterion) {
// make common schema
let column = "string";
let schema = Schema::new(vec![Field::new(column, DataType::Utf8, true)]);

// meke test record batch
let test_batch = [
(10, make_record_batch(10, 100, schema.clone())),
(100, make_record_batch(100, 100, schema.clone())),
(1000, make_record_batch(1000, 100, schema.clone())),
(2000, make_record_batch(2000, 100, schema.clone())),
]
.iter()
.map(|(k, v)| (*k, v.clone()))
.collect::<HashMap<_, _>>();

// string column
let string_col = col(column, &schema).unwrap();

// some pattern literal
let pattern_lit = [
("email".to_string(), lit(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")),
("url".to_string(), lit(r"^(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]$")),
("ip".to_string(), lit(r"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$")),
("phone".to_string(), lit(r"^(\+\d{1,2}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$")),
("zip_code".to_string(), lit(r"^\d{5}(?:[-\s]\d{4})?$")),
].iter()
.map(|(k, v)| (k.clone(), v.clone()))
.collect::<HashMap<_, _>>();

for (name, regexp_lit) in pattern_lit.iter() {
for (rows, batch) in test_batch.iter() {
for iter in [10, 20, 50, 100] {
// scalar regex match benchmarks
let bench_name = format!(
"scalar_regex_match_pattern_{}_rows_{}_iter_{}",
name, rows, iter
);
c.bench_function(bench_name.as_str(), |b| {
let expr = scalar_regex_match(
false,
false,
string_col.clone(),
regexp_lit.clone(),
&schema,
)
.unwrap();
b.iter(|| {
for _ in 0..iter {
expr.evaluate(black_box(batch)).unwrap();
}
});
});

// binary regex match benchmarks
let bench_name = format!(
"binary_regex_match_pattern_{}_rows_{}_iter_{}",
name, rows, iter
);
c.bench_function(bench_name.as_str(), |b| {
let expr = binary(
string_col.clone(),
Operator::RegexMatch,
regexp_lit.clone(),
&schema,
)
.unwrap();
b.iter(|| {
for _ in 0..iter {
expr.evaluate(black_box(batch)).unwrap();
}
});
});
}
}
}
}

// Register the benchmark function under the group name `benches` and let
// criterion generate the entry point that runs that group.
criterion_group!(benches, scalar_regex_match_benchmark);
criterion_main!(benches);

0 comments on commit d49edca

Please sign in to comment.