Skip to content

Commit

Permalink
Hack together metadata benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed May 16, 2024
1 parent d3e9150 commit 5c9e563
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 1 deletion.
119 changes: 119 additions & 0 deletions parquet/benches/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,129 @@

use bytes::Bytes;
use criterion::*;
use rand::Rng;
use thrift::protocol::TCompactOutputProtocol;

use arrow::util::test_util::seedable_rng;
use arrow_ipc::writer::{IpcDataGenerator, IpcWriteOptions};
use arrow_schema::{DataType, Field, Fields, Schema};
use parquet::file::reader::SerializedFileReader;
use parquet::file::serialized_reader::ReadOptionsBuilder;
use parquet::format::{
ColumnChunk, FieldRepetitionType, FileMetaData, RowGroup, SchemaElement, Type,
};
use parquet::thrift::{TCompactSliceInputProtocol, TSerializable};

const NUM_COLUMNS: usize = 10_000;
const NUM_ROW_GROUPS: usize = 1;

/// Builds a synthetic parquet [`FileMetaData`] and returns it serialized with the
/// thrift compact protocol.
///
/// The schema consists of a root element with `NUM_COLUMNS` children, each a
/// `REQUIRED` `FLOAT` column named after its index. The metadata holds
/// `NUM_ROW_GROUPS` row groups, each with one [`ColumnChunk`] per column. Offsets,
/// lengths, sizes and row counts are drawn from `seedable_rng()`.
///
/// # Panics
///
/// Panics if thrift serialization fails.
fn encoded_meta() -> Vec<u8> {
    let mut rng = seedable_rng();

    // Root schema element first, then one leaf element per column.
    let mut schema = Vec::with_capacity(NUM_COLUMNS + 1);
    schema.push(SchemaElement {
        type_: None,
        type_length: None,
        repetition_type: None,
        name: Default::default(),
        num_children: Some(NUM_COLUMNS as _),
        converted_type: None,
        scale: None,
        precision: None,
        field_id: None,
        logical_type: None,
    });
    schema.extend((0..NUM_COLUMNS).map(|i| SchemaElement {
        type_: Some(Type::FLOAT),
        type_length: None,
        repetition_type: Some(FieldRepetitionType::REQUIRED),
        name: i.to_string().into(),
        num_children: None,
        converted_type: None,
        scale: None,
        precision: None,
        field_id: None,
        logical_type: None,
    }));

    // The order of `rng.gen()` calls is kept as-is (column chunks first, then the
    // row-group level fields) so the generated bytes stay the same.
    let row_groups = (0..NUM_ROW_GROUPS)
        .map(|i| {
            let columns = (0..NUM_COLUMNS)
                .map(|_| ColumnChunk {
                    file_path: None,
                    file_offset: 0,
                    meta_data: None,
                    offset_index_offset: Some(rng.gen()),
                    offset_index_length: Some(rng.gen()),
                    column_index_offset: Some(rng.gen()),
                    column_index_length: Some(rng.gen()),
                    crypto_metadata: None,
                    encrypted_column_metadata: None,
                })
                .collect();

            RowGroup {
                columns,
                total_byte_size: rng.gen(),
                num_rows: rng.gen(),
                sorting_columns: None,
                file_offset: None,
                total_compressed_size: Some(rng.gen()),
                ordinal: Some(i as _),
            }
        })
        .collect();

    // Serialization only needs shared access, so the binding is immutable.
    let file = FileMetaData {
        schema,
        row_groups,
        version: 1,
        num_rows: rng.gen(),
        key_value_metadata: None,
        created_by: Some("parquet-rs".into()),
        column_orders: None,
        encryption_algorithm: None,
        footer_signing_key_metadata: None,
    };

    let mut buf = Vec::with_capacity(1024);
    {
        // Scope the protocol so its mutable borrow of `buf` ends before `buf` is returned.
        let mut out = TCompactOutputProtocol::new(&mut buf);
        file.write_to_out_protocol(&mut out).unwrap();
    }
    buf
}

fn encoded_ipc_schema() -> Vec<u8> {
let schema = Schema::new(Fields::from_iter(
(0..NUM_COLUMNS).map(|i| Field::new(i.to_string(), DataType::Float64, true)),
));
let data = IpcDataGenerator::default();
let r = data.schema_to_bytes(&schema, &IpcWriteOptions::default());
assert_eq!(r.arrow_data.len(), 0);
r.ipc_message
}

fn criterion_benchmark(c: &mut Criterion) {
let buf = black_box(encoded_meta());
println!("Parquet metadata {}", buf.len());

c.bench_function("decode metadata", |b| {
b.iter(|| {
let mut input = TCompactSliceInputProtocol::new(&buf);
FileMetaData::read_from_in_protocol(&mut input).unwrap()
})
});

let buf = black_box(encoded_ipc_schema());
println!("Arrow IPC metadata {}", buf.len());

c.bench_function("decode ipc metadata", |b| {
b.iter(|| arrow_ipc::root_as_message(&buf).unwrap())
});

// Read file into memory to isolate filesystem performance
let file = "../parquet-testing/data/alltypes_tiny_pages.parquet";
let data = std::fs::read(file).unwrap();
Expand Down
2 changes: 1 addition & 1 deletion parquet/src/thrift.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ pub trait TSerializable<'de>: Sized {
/// A more performant implementation of [`TCompactInputProtocol`] that reads a slice
///
/// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol
pub(crate) struct TCompactSliceInputProtocol<'a> {
pub struct TCompactSliceInputProtocol<'a> {
buf: &'a [u8],
// Identifier of the last field deserialized for a struct.
last_read_field_id: i16,
Expand Down

0 comments on commit 5c9e563

Please sign in to comment.