From 16e5258300c199fc84abd641b043091688b929a5 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 16 May 2024 19:21:33 +0300 Subject: [PATCH] Set the default size of BitWriter for DeltaBitPackEncoder to 1MB --- parquet/src/encodings/encoding/mod.rs | 4 ++-- parquet/src/util/bit_util.rs | 27 ++++++++------------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/parquet/src/encodings/encoding/mod.rs b/parquet/src/encodings/encoding/mod.rs index d797b3cb2f52..56ca68db7d75 100644 --- a/parquet/src/encodings/encoding/mod.rs +++ b/parquet/src/encodings/encoding/mod.rs @@ -249,7 +249,7 @@ impl Encoder for RleValueEncoder { // DELTA_BINARY_PACKED encoding const MAX_PAGE_HEADER_WRITER_SIZE: usize = 32; -const MAX_BIT_WRITER_SIZE: usize = 10 * 1024 * 1024; +const DEFAULT_BIT_WRITER_SIZE: usize = 1024 * 1024; const DEFAULT_NUM_MINI_BLOCKS: usize = 4; /// Delta bit packed encoder. @@ -313,7 +313,7 @@ impl DeltaBitPackEncoder { DeltaBitPackEncoder { page_header_writer: BitWriter::new(MAX_PAGE_HEADER_WRITER_SIZE), - bit_writer: BitWriter::new(MAX_BIT_WRITER_SIZE), + bit_writer: BitWriter::new(DEFAULT_BIT_WRITER_SIZE), total_values: 0, first_value: 0, current_value: 0, // current value to keep adding deltas diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index b1dd23574a19..1ec764e2c869 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -172,9 +172,9 @@ pub struct BitWriter { } impl BitWriter { - pub fn new(max_bytes: usize) -> Self { + pub fn new(initial_capacity: usize) -> Self { Self { - buffer: Vec::with_capacity(max_bytes), + buffer: Vec::with_capacity(initial_capacity), buffered_values: 0, bit_offset: 0, } @@ -304,12 +304,7 @@ impl BitWriter { /// `offset + num_bytes`. Also that if size of `T` is larger than `num_bytes`, extra /// higher ordered bytes will be ignored. 
#[inline] - pub fn put_aligned_offset( - &mut self, - val: T, - num_bytes: usize, - offset: usize, - ) { + pub fn put_aligned_offset(&mut self, val: T, num_bytes: usize, offset: usize) { let slice = val.as_bytes(); let len = num_bytes.min(slice.len()); self.buffer[offset..offset + len].copy_from_slice(&slice[..len]) @@ -405,8 +400,8 @@ impl BitReader { self.load_buffered_values() } - let mut v = trailing_bits(self.buffered_values, self.bit_offset + num_bits) - >> self.bit_offset; + let mut v = + trailing_bits(self.buffered_values, self.bit_offset + num_bits) >> self.bit_offset; self.bit_offset += num_bits; if self.bit_offset >= 64 { @@ -571,8 +566,7 @@ impl BitReader { false => num_values, }; - let end_bit_offset = - self.byte_offset * 8 + values_to_read * num_bits + self.bit_offset; + let end_bit_offset = self.byte_offset * 8 + values_to_read * num_bits + self.bit_offset; self.byte_offset = end_bit_offset / 8; self.bit_offset = end_bit_offset % 8; @@ -585,11 +579,7 @@ impl BitReader { } /// Reads up to `num_bytes` to `buf` returning the number of bytes read - pub(crate) fn get_aligned_bytes( - &mut self, - buf: &mut Vec, - num_bytes: usize, - ) -> usize { + pub(crate) fn get_aligned_bytes(&mut self, buf: &mut Vec, num_bytes: usize) -> usize { // Align to byte offset self.byte_offset = self.get_byte_offset(); self.bit_offset = 0; @@ -998,8 +988,7 @@ mod tests { .collect(); // Generic values used to check against actual values read from `get_batch`. - let expected_values: Vec = - values.iter().map(|v| from_le_slice(v.as_bytes())).collect(); + let expected_values: Vec = values.iter().map(|v| from_le_slice(v.as_bytes())).collect(); (0..total).for_each(|i| writer.put_value(values[i], num_bits));