Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature scalar regexp match benchmark #13789

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ecd4793
bench: scalar regex match benchmark
zhuliquan Dec 5, 2024
c3e0951
refactor: migrate `LinearSearch` to `HashTable` (#13658)
crepererum Dec 6, 2024
1e507ad
Minor: Comment temporary function for documentation migration (#13669)
comphead Dec 6, 2024
61fd077
Minor: Rephrase MSRV policy to be more explanatory (#13668)
comphead Dec 6, 2024
67260a0
fix: repartitioned reads of CSV with custom line terminator (#13677)
korowa Dec 7, 2024
3618cfe
chore: macros crate cleanup (#13685)
findepi Dec 7, 2024
d3e0860
Refactor regexplike signature (#13394)
jiashenC Dec 8, 2024
a960c6d
Performance: enable array allocation reuse (`ScalarFunctionArgs` gets…
alamb Dec 8, 2024
d39852d
Temporary fix for CI (#13689)
jonahgao Dec 8, 2024
98372cc
refactor: use `LazyLock` in the `user_doc` macro (#13684)
jonahgao Dec 8, 2024
e8226f5
Unlock lexical-write-integer version. (#13693)
Alexhuszagh Dec 9, 2024
bd91271
Minor: Use `div_ceil`
akurmustafa Dec 9, 2024
45926ab
Fix hash join with sort push down (#13560)
haohuaijin Dec 9, 2024
16d2ab1
Improve substr() performance by avoiding using owned string (#13688)
richox Dec 9, 2024
d8c9cfb
reinstate down_cast_any_ref (#13705)
andygrove Dec 9, 2024
f8c0efe
Optimize performance of `character_length` function (#13696)
tlm365 Dec 10, 2024
2d8bd42
Update prost-build requirement from =0.13.3 to =0.13.4 (#13698)
dependabot[bot] Dec 10, 2024
5dc6e42
Minor: Output elapsed time for sql logic test (#13718)
comphead Dec 10, 2024
4fb9d2a
refactor: simplify the `make_udf_function` macro (#13712)
jonahgao Dec 11, 2024
fa0440b
refactor: replace `Vec` with `IndexMap` for expression mappings in `P…
Weijun-H Dec 11, 2024
d3c459e
Handle alias when parsing sql(parse_sql_expr) (#12939)
Eason0729 Dec 11, 2024
ddfc9e5
Improve documentation for TableProvider (#13724)
alamb Dec 11, 2024
b494157
Reveal implementing type and return type in simple UDF implementation…
findepi Dec 11, 2024
3b5daa2
minor: Extract tests for `EXTRACT` AND `date_part` to their own file …
alamb Dec 11, 2024
50ce883
Support unparsing `UNNEST` plan to `UNNEST` table factor SQL (#13660)
goldmedal Dec 11, 2024
13b581a
Merge branch 'apache:main' into feature-scalar_regexp_match_benchmark
zhuliquan Dec 12, 2024
065eb47
Merge branch 'apache:main' into feature-scalar_regexp_match_benchmark
zhuliquan Dec 15, 2024
c697bb0
fix: take taplo formatter suggestion
zhuliquan Dec 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions datafusion/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -211,3 +211,7 @@ name = "topk_aggregate"
harness = false
name = "map_query_sql"
required-features = ["nested_expressions"]

[[bench]]
harness = false
name = "scalar_regex_match_query_sql"
131 changes: 131 additions & 0 deletions datafusion/core/benches/scalar_regex_match_query_sql.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::{
datatypes::{DataType, Field, Schema},
record_batch::RecordBatch,
};
use arrow_array::StringArray;
use criterion::{criterion_group, criterion_main, Criterion};
use datafusion::prelude::SessionContext;
use datafusion::{datasource::MemTable, error::Result};
use rand::SeedableRng;
use rand::{rngs::StdRng, Rng};
use std::sync::Arc;
use tokio::runtime::Runtime;

fn query(ctx: &SessionContext, sql: &str) {
let rt = Runtime::new().unwrap();

// execute the query
let df = rt.block_on(ctx.sql(sql)).unwrap();
rt.block_on(df.collect()).unwrap();
}

/// Build a random `String` of exactly `length` characters, each drawn
/// uniformly (via `rng`) from the byte alphabet `charset`.
fn generate_random_string(rng: &mut StdRng, length: usize, charset: &[u8]) -> String {
    let mut out = String::with_capacity(length);
    for _ in 0..length {
        // one uniform draw per output character, same RNG call sequence
        // as a map/collect formulation
        let idx = rng.gen_range(0..charset.len());
        out.push(charset[idx] as char);
    }
    out
}

/// Build a `SessionContext` with one in-memory table `t(s: Utf8)`.
///
/// The table contains `batch_iter` batches of `batch_size` rows each:
/// mostly random strings of `string_len` characters, plus a fixed number
/// of copies of `correct` so every regex benchmark is guaranteed some
/// matching rows. `rand_seed` makes the data deterministic across runs.
fn create_context(
    batch_iter: usize,
    batch_size: usize,
    string_len: usize,
    rand_seed: u64,
    correct: &str,
) -> Result<SessionContext> {
    // number of rows per batch guaranteed to match the benchmark pattern
    const MATCHING_ROWS: usize = 128;

    let mut rng = StdRng::seed_from_u64(rand_seed);
    let charset = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,:/\\+-_!@#$%^&*()~'\"{}[]?";

    // define a schema: a single nullable string column
    let schema = Arc::new(Schema::new(vec![Field::new("s", DataType::Utf8, true)]));

    // define data: random rows followed by the guaranteed matches.
    // `saturating_sub` avoids an underflow panic (debug) or a wrapped,
    // enormous allocation (release) when batch_size < MATCHING_ROWS.
    let batches = (0..batch_iter)
        .map(|_| {
            let mut array = (0..batch_size.saturating_sub(MATCHING_ROWS))
                .map(|_| Some(generate_random_string(&mut rng, string_len, charset)))
                .collect::<Vec<_>>();
            for _ in 0..MATCHING_ROWS {
                array.push(Some(correct.to_string()));
            }
            let array = StringArray::from(array);
            RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap()
        })
        .collect::<Vec<_>>();

    let ctx = SessionContext::new();

    // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
    let provider = MemTable::try_new(schema, vec![batches])?;
    ctx.register_table("t", Arc::new(provider))?;

    Ok(ctx)
}

/// Benchmark scalar regex matching (`~`) over a set of common patterns.
///
/// Each case provides the benchmark name, a value guaranteed to match the
/// pattern, the random-string length, the RNG seed, and the SQL query; the
/// table data is (re)built inside each `bench_function` closure, exactly as
/// in a hand-unrolled version, so only `query` runs inside `b.iter`.
fn criterion_benchmark(c: &mut Criterion) {
    let batch_iter = 128;
    let batch_size = 4096;

    // (bench name, guaranteed-matching value, string length, seed, query)
    let cases: [(&str, &str, usize, u64, &str); 6] = [
        (
            "test email address pattern",
            "test@eaxample.com",
            64,
            11111,
            "select s from t where s ~ '^[a-zA-Z0-9_\\+\\-]+@[a-zA-Z0-9\\-]+\\.[a-zA-Z]{2,}$'",
        ),
        (
            "test ip pattern",
            "23.7.9.9",
            16,
            22222,
            "select s from t where s ~ '^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'",
        ),
        (
            "test phone number pattern",
            "1236788899",
            16,
            33333,
            "select s from t where s ~ '^(\\+\\d{1,2}\\s?)?\\(?\\d{3}\\)?[\\s.-]?\\d{3}[\\s.-]?\\d{4}$'",
        ),
        (
            "test html tag pattern",
            "<div>Hello World</div>",
            64,
            44444,
            "select s from t where s ~ '<([a-z1-6]+)>[^<]+</([a-z1-6]+)>'",
        ),
        (
            "test url pattern",
            "https://www.example.com",
            64,
            55555,
            "select s from t where s ~ '^(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]$'",
        ),
        (
            "test date pattern",
            "2024-02-03",
            16,
            66666,
            "select s from t where s ~ '[0-9]{4}-[0-9]{2}-[0-9]{2}'",
        ),
    ];

    for (name, correct, string_len, seed, sql) in cases {
        c.bench_function(name, |b| {
            let ctx =
                create_context(batch_iter, batch_size, string_len, seed, correct).unwrap();
            b.iter(|| query(&ctx, sql))
        });
    }
}

// Register the benchmark group and generate the `main` entry point for the
// Criterion harness (this target sets `harness = false` in Cargo.toml).
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
Loading