diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index c53f7a6c47715..674d6fca5d561 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -87,3 +87,7 @@ name = "case_when"
 [[bench]]
 harness = false
 name = "is_null"
+
+[[bench]]
+harness = false
+name = "scalar_regex_match"
diff --git a/datafusion/physical-expr/benches/scalar_regex_match.rs b/datafusion/physical-expr/benches/scalar_regex_match.rs
new file mode 100644
index 0000000000000..8d1e431d02e5e
--- /dev/null
+++ b/datafusion/physical-expr/benches/scalar_regex_match.rs
@@ -0,0 +1,108 @@
+use std::sync::Arc;
+
+use arrow_array::{RecordBatch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_expr_common::operator::Operator;
+use datafusion_physical_expr::expressions::{binary, col, lit, scalar_regex_match};
+use hashbrown::HashMap;
+use rand::distributions::{Alphanumeric, DistString};
+
+/// Builds a record batch with a single string column and `rows` rows.
+///
+/// Every value is a random alphanumeric string of `string_length` characters;
+/// the batch is the input for the regex match benchmarks below.
+fn make_record_batch(rows: usize, string_length: usize, schema: Schema) -> RecordBatch {
+    let mut rng = rand::thread_rng();
+    let mut array = Vec::with_capacity(rows);
+    for _ in 0..rows {
+        let data_line = Alphanumeric.sample_string(&mut rng, string_length);
+        array.push(Some(data_line));
+    }
+    let array = StringArray::from(array);
+    RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap()
+}
+
+/// Benchmarks `scalar_regex_match` against the `binary` expression built
+/// with `Operator::RegexMatch`, for every combination of pattern,
+/// batch size and number of evaluations per measured iteration.
+fn scalar_regex_match_benchmark(c: &mut Criterion) {
+    // make common schema
+    let column = "string";
+    let schema = Schema::new(vec![Field::new(column, DataType::Utf8, true)]);
+
+    // make test record batches, keyed by row count
+    let test_batch = [
+        (10, make_record_batch(10, 100, schema.clone())),
+        (100, make_record_batch(100, 100, schema.clone())),
+        (1000, make_record_batch(1000, 100, schema.clone())),
+        (2000, make_record_batch(2000, 100, schema.clone())),
+    ]
+    .iter()
+    .map(|(k, v)| (*k, v.clone()))
+    .collect::<HashMap<_, _>>();
+
+    // string column
+    let string_col = col(column, &schema).unwrap();
+
+    // regex pattern literals, keyed by a short name used in the benchmark id
+    let pattern_lit = [
+        ("email".to_string(), lit(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")),
+        ("url".to_string(), lit(r"^(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]$")),
+        ("ip".to_string(), lit(r"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$")),
+        ("phone".to_string(), lit(r"^(\+\d{1,2}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$")),
+        ("zip_code".to_string(), lit(r"^\d{5}(?:[-\s]\d{4})?$")),
+    ].iter()
+    .map(|(k, v)| (k.clone(), v.clone()))
+    .collect::<HashMap<_, _>>();
+
+    for (name, regexp_lit) in pattern_lit.iter() {
+        for (rows, batch) in test_batch.iter() {
+            for iter in [10, 20, 50, 100] {
+                // scalar regex match benchmarks
+                let bench_name = format!(
+                    "scalar_regex_match_pattern_{}_rows_{}_iter_{}",
+                    name, rows, iter
+                );
+                c.bench_function(bench_name.as_str(), |b| {
+                    let expr = scalar_regex_match(
+                        false,
+                        false,
+                        string_col.clone(),
+                        regexp_lit.clone(),
+                        &schema,
+                    )
+                    .unwrap();
+                    b.iter(|| {
+                        for _ in 0..iter {
+                            black_box(expr.evaluate(black_box(batch)).unwrap());
+                        }
+                    });
+                });
+
+                // binary regex match benchmarks
+                let bench_name = format!(
+                    "binary_regex_match_pattern_{}_rows_{}_iter_{}",
+                    name, rows, iter
+                );
+                c.bench_function(bench_name.as_str(), |b| {
+                    let expr = binary(
+                        string_col.clone(),
+                        Operator::RegexMatch,
+                        regexp_lit.clone(),
+                        &schema,
+                    )
+                    .unwrap();
+                    b.iter(|| {
+                        for _ in 0..iter {
+                            black_box(expr.evaluate(black_box(batch)).unwrap());
+                        }
+                    });
+                });
+            }
+        }
+    }
+}
+
+criterion_group!(benches, scalar_regex_match_benchmark);
+criterion_main!(benches);