add decode benchmarks for Rust piece to measure CGO overhead
Daulet Zhanguzin committed on Jul 10, 2024 · 1 parent 8b4f0ce · commit 21d1792
Showing 9 changed files with 517 additions and 25 deletions.
@@ -0,0 +1,36 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use rand::Rng;
use std::time::Instant;
use tokenizers::tokenizer::Tokenizer;

fn decode(tokenizer: &Tokenizer, ids_slice: &[u32], skip_special_tokens: bool) -> String {
    tokenizer.decode(ids_slice, skip_special_tokens).expect("failed to decode input")
}

// Decode the same 7-token sequence on every iteration: the pure-Rust
// counterpart of the Go BenchmarkDecodeNTimes, measuring per-call cost.
fn bench_decode_n_times(c: &mut Criterion) {
    let tokenizer = Tokenizer::from_file("./test/data/bert-base-uncased.json").expect("failed to create tokenizer");
    c.bench_function("decode_n_times",
        |b| b.iter(||
            decode(&tokenizer, black_box(&[2829, 4419, 14523, 2058, 1996, 13971, 3899]), black_box(true))
        )
    );
}

// Decode `iters` random token ids in a single call; iter_custom keeps the
// token generation out of the measured time, so the result is cost per token.
fn bench_decode_n_tokens(c: &mut Criterion) {
    let tokenizer = Tokenizer::from_file("./test/data/bert-base-uncased.json").expect("failed to create tokenizer");
    let max_token_id = tokenizer.get_vocab_size(true);
    let mut rng = rand::thread_rng();

    c.bench_function("decode_n_tokens",
        move |b| { b.iter_custom(|iters| {
            let tokens: Vec<u32> = (0..iters).map(|_| rng.gen_range(0..max_token_id) as u32).collect();

            let start = Instant::now();
            decode(&tokenizer, black_box(&tokens), black_box(true));
            start.elapsed()
        })}
    );
}

criterion_group!(benches, bench_decode_n_times, bench_decode_n_tokens);
criterion_main!(benches);
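
For `cargo bench` to pick this file up, the manifest must declare the bench target with the default harness disabled, since Criterion supplies its own main. A sketch of the Cargo.toml addition, assuming the file lives at benches/decode_benchmark.rs and these dependency versions (both assumptions; the manifest diff isn't rendered in this view):

[dev-dependencies]
criterion = "0.5"   # assumed version
rand = "0.8"        # assumed version

[[bench]]
name = "decode_benchmark"  # assumes benches/decode_benchmark.rs
harness = false            # let Criterion provide the benchmark main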
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -0,0 +1,36 @@
Run with `go test -bench=Decode -count=10 -run=^\$ > test/benchmark/go_results.txt` then `benchstat test/benchmark/go_results.txt`

goos: darwin
goarch: arm64
pkg: github.com/daulet/tokenizers
BenchmarkDecodeNTimes-10     239250    4343 ns/op
BenchmarkDecodeNTimes-10     271682    4356 ns/op
BenchmarkDecodeNTimes-10     274546    4346 ns/op
BenchmarkDecodeNTimes-10     271051    4368 ns/op
BenchmarkDecodeNTimes-10     272458    4372 ns/op
BenchmarkDecodeNTimes-10     271284    4350 ns/op
BenchmarkDecodeNTimes-10     272586    4350 ns/op
BenchmarkDecodeNTimes-10     271552    4358 ns/op
BenchmarkDecodeNTimes-10     268934    4349 ns/op
BenchmarkDecodeNTimes-10     273238    4364 ns/op
BenchmarkDecodeNTokens-10    1840972   657.1 ns/op
BenchmarkDecodeNTokens-10    1817886   636.0 ns/op
BenchmarkDecodeNTokens-10    1884613   641.0 ns/op
BenchmarkDecodeNTokens-10    1823654   637.8 ns/op
BenchmarkDecodeNTokens-10    1883685   646.7 ns/op
BenchmarkDecodeNTokens-10    1852138   642.2 ns/op
BenchmarkDecodeNTokens-10    1852321   643.3 ns/op
BenchmarkDecodeNTokens-10    1850312   649.7 ns/op
BenchmarkDecodeNTokens-10    1838618   640.8 ns/op
BenchmarkDecodeNTokens-10    1881645   642.7 ns/op
PASS
ok      github.com/daulet/tokenizers    31.929s

goos: darwin
goarch: arm64
pkg: github.com/daulet/tokenizers
                  │ test/benchmark/go_results.txt │
                  │            sec/op             │
DecodeNTimes-10                        4.353µ ± 0%
DecodeNTokens-10                       642.5n ± 1%
geomean                                1.672µ
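
The Go benchmark source that produced these numbers is not rendered in this commit view. A minimal sketch of the shape such benchmarks take, assuming the library exposes `tokenizers.FromFile`, `(*Tokenizer).Decode`, `(*Tokenizer).VocabSize`, and `(*Tokenizer).Close` (names inferred from the repository, not taken from this diff):

package tokenizers_test

import (
	"math/rand"
	"testing"

	"github.com/daulet/tokenizers"
)

// Decode the same 7-token sequence once per iteration, so every op
// pays one full CGO round trip (mirrors decode_n_times in Rust).
func BenchmarkDecodeNTimes(b *testing.B) {
	tk, err := tokenizers.FromFile("./test/data/bert-base-uncased.json")
	if err != nil {
		b.Fatal(err)
	}
	defer tk.Close()
	ids := []uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		tk.Decode(ids, true)
	}
}

// Decode b.N random token ids in a single call, so the CGO crossing is
// amortized and ns/op approximates per-token cost (mirrors decode_n_tokens).
func BenchmarkDecodeNTokens(b *testing.B) {
	tk, err := tokenizers.FromFile("./test/data/bert-base-uncased.json")
	if err != nil {
		b.Fatal(err)
	}
	defer tk.Close()
	vocabSize := int(tk.VocabSize())
	ids := make([]uint32, b.N)
	for i := range ids {
		ids[i] = uint32(rand.Intn(vocabSize))
	}
	b.ResetTimer() // exclude id generation, like iter_custom on the Rust side
	tk.Decode(ids, true)
}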
@@ -0,0 +1,14 @@
Run with `cargo bench`

decode_n_times          time:   [3.9812 µs 3.9874 µs 3.9939 µs]
                        change: [-0.4103% -0.1338% +0.1275%] (p = 0.33 > 0.05)
                        No change in performance detected.
Found 7 outliers among 100 measurements (7.00%)
  7 (7.00%) high mild

decode_n_tokens         time:   [651.72 ns 661.73 ns 675.78 ns]
                        change: [+0.3504% +2.0016% +3.5507%] (p = 0.01 < 0.05)
                        Change within noise threshold.
Found 7 outliers among 100 measurements (7.00%)
  2 (2.00%) high mild
  5 (5.00%) high severe
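
Read against the Go numbers above, these results suggest the per-call CGO overhead is roughly 4.353 µs − 3.987 µs ≈ 0.37 µs (about 9% per call), while per-token cost is effectively the same in both (642.5 ns from Go vs. 651.72–675.78 ns in pure Rust) once a single call amortizes the boundary crossing over many tokens.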