From 636ac2a3aa67d291dbbf9cdd8b16a192636c2a4e Mon Sep 17 00:00:00 2001 From: Pete Gadomski Date: Wed, 5 Jun 2024 06:32:06 -0600 Subject: [PATCH] feat: stac-arrow benchmarks --- stac-arrow/Cargo.toml | 5 +++++ stac-arrow/benches/read.rs | 41 ++++++++++++++++++++++++++++++++++++++ stac-arrow/src/lib.rs | 3 +++ 3 files changed, 49 insertions(+) create mode 100644 stac-arrow/benches/read.rs diff --git a/stac-arrow/Cargo.toml b/stac-arrow/Cargo.toml index c96280493..fa7f9fd6c 100644 --- a/stac-arrow/Cargo.toml +++ b/stac-arrow/Cargo.toml @@ -19,5 +19,10 @@ thiserror = "1" wkb = "0.7" [dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } parquet = "51" stac-validate = { version = "0.1", path = "../stac-validate" } + +[[bench]] +name = "read" +harness = false diff --git a/stac-arrow/benches/read.rs b/stac-arrow/benches/read.rs new file mode 100644 index 000000000..66f7fab66 --- /dev/null +++ b/stac-arrow/benches/read.rs @@ -0,0 +1,41 @@ +//! How fast is `RecordBatch` -> `Vec<Map<String, Value>>` when going through +//! full serialization vs the deprecated `record_batches_to_json_rows`? 
+
+use arrow::array::RecordBatch;
+use arrow_json::ArrayWriter;
+use criterion::{criterion_group, criterion_main, Criterion};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use serde_json::{Map, Value};
+use std::fs::File;
+
+/// Converts a record batch to JSON rows via full serialization.
+///
+/// The batch is written to an in-memory JSON array with [`ArrayWriter`], and
+/// the resulting bytes are parsed back into `Vec<Map<String, Value>>`.
+///
+/// # Panics
+///
+/// Panics if writing the batch or parsing the produced JSON fails.
+fn writer(record_batch: &RecordBatch) {
+    let mut array_writer = ArrayWriter::new(Vec::new());
+    array_writer.write(record_batch).unwrap();
+    array_writer.finish().unwrap();
+    let _: Vec<Map<String, Value>> =
+        serde_json::from_reader(array_writer.into_inner().as_slice()).unwrap();
+}
+
+/// Converts a record batch to JSON rows with the deprecated
+/// `arrow_json::writer::record_batches_to_json_rows`.
+///
+/// # Panics
+///
+/// Panics if the conversion fails.
+// The deprecated function is one of the two code paths being compared, so the
+// deprecation lint is silenced on purpose.
+#[allow(deprecated)]
+fn record_batches_to_json_rows(record_batch: &RecordBatch) {
+    let _ = arrow_json::writer::record_batches_to_json_rows(&[record_batch]).unwrap();
+}
+
+/// Benchmarks both conversions on the first record batch of `data/naip.parquet`.
+///
+/// Each benchmark label names the code path it actually measures.
+///
+/// # Panics
+///
+/// Panics if the file cannot be opened or read as parquet, if it yields no
+/// record batch, or if the batch has no `geometry` column.
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("record batch to vec");
+    let file = File::open("data/naip.parquet").unwrap();
+    let mut reader = ParquetRecordBatchReaderBuilder::try_new(file)
+        .unwrap()
+        .build()
+        .unwrap();
+    let mut record_batch = reader.next().unwrap().unwrap();
+    // The `geometry` column is removed before anything is timed.
+    // NOTE(review): presumably because it cannot be written as JSON -- confirm.
+    let index = record_batch.schema().index_of("geometry").unwrap();
+    record_batch.remove_column(index);
+    group.bench_function("record_batches_to_json_rows", |b| {
+        b.iter(|| record_batches_to_json_rows(&record_batch))
+    });
+    group.bench_function("writer", |b| b.iter(|| writer(&record_batch)));
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/stac-arrow/src/lib.rs b/stac-arrow/src/lib.rs
index 23fe54a88..dbc55ae07 100644
--- a/stac-arrow/src/lib.rs
+++ b/stac-arrow/src/lib.rs
@@ -221,3 +221,6 @@ mod readme {
 
     external_doc_test!(include_str!("../README.md"));
 }
+
+#[cfg(test)]
+use criterion as _;