Skip to content

Commit

Permalink
Set the default size of BitWriter for DeltaBitPackEncoder to 1MB (#5776)
Browse files Browse the repository at this point in the history
  • Loading branch information
AdamGS authored May 20, 2024
1 parent c498eb7 commit ce8363a
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 21 deletions.
4 changes: 2 additions & 2 deletions parquet/src/encodings/encoding/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ impl<T: DataType> Encoder<T> for RleValueEncoder<T> {
// DELTA_BINARY_PACKED encoding

const MAX_PAGE_HEADER_WRITER_SIZE: usize = 32;
const MAX_BIT_WRITER_SIZE: usize = 10 * 1024 * 1024;
const DEFAULT_BIT_WRITER_SIZE: usize = 1024 * 1024;
const DEFAULT_NUM_MINI_BLOCKS: usize = 4;

/// Delta bit packed encoder.
Expand Down Expand Up @@ -313,7 +313,7 @@ impl<T: DataType> DeltaBitPackEncoder<T> {

DeltaBitPackEncoder {
page_header_writer: BitWriter::new(MAX_PAGE_HEADER_WRITER_SIZE),
bit_writer: BitWriter::new(MAX_BIT_WRITER_SIZE),
bit_writer: BitWriter::new(DEFAULT_BIT_WRITER_SIZE),
total_values: 0,
first_value: 0,
current_value: 0, // current value to keep adding deltas
Expand Down
27 changes: 8 additions & 19 deletions parquet/src/util/bit_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,9 @@ pub struct BitWriter {
}

impl BitWriter {
pub fn new(max_bytes: usize) -> Self {
pub fn new(initial_capacity: usize) -> Self {
Self {
buffer: Vec::with_capacity(max_bytes),
buffer: Vec::with_capacity(initial_capacity),
buffered_values: 0,
bit_offset: 0,
}
Expand Down Expand Up @@ -304,12 +304,7 @@ impl BitWriter {
/// `offset + num_bytes`. Also note that if the size of `T` is larger than `num_bytes`, the
/// extra higher-order bytes will be ignored.
#[inline]
pub fn put_aligned_offset<T: AsBytes>(
&mut self,
val: T,
num_bytes: usize,
offset: usize,
) {
pub fn put_aligned_offset<T: AsBytes>(&mut self, val: T, num_bytes: usize, offset: usize) {
let slice = val.as_bytes();
let len = num_bytes.min(slice.len());
self.buffer[offset..offset + len].copy_from_slice(&slice[..len])
Expand Down Expand Up @@ -405,8 +400,8 @@ impl BitReader {
self.load_buffered_values()
}

let mut v = trailing_bits(self.buffered_values, self.bit_offset + num_bits)
>> self.bit_offset;
let mut v =
trailing_bits(self.buffered_values, self.bit_offset + num_bits) >> self.bit_offset;
self.bit_offset += num_bits;

if self.bit_offset >= 64 {
Expand Down Expand Up @@ -571,8 +566,7 @@ impl BitReader {
false => num_values,
};

let end_bit_offset =
self.byte_offset * 8 + values_to_read * num_bits + self.bit_offset;
let end_bit_offset = self.byte_offset * 8 + values_to_read * num_bits + self.bit_offset;

self.byte_offset = end_bit_offset / 8;
self.bit_offset = end_bit_offset % 8;
Expand All @@ -585,11 +579,7 @@ impl BitReader {
}

/// Reads up to `num_bytes` to `buf` returning the number of bytes read
pub(crate) fn get_aligned_bytes(
&mut self,
buf: &mut Vec<u8>,
num_bytes: usize,
) -> usize {
pub(crate) fn get_aligned_bytes(&mut self, buf: &mut Vec<u8>, num_bytes: usize) -> usize {
// Align to byte offset
self.byte_offset = self.get_byte_offset();
self.bit_offset = 0;
Expand Down Expand Up @@ -998,8 +988,7 @@ mod tests {
.collect();

// Generic values used to check against actual values read from `get_batch`.
let expected_values: Vec<T> =
values.iter().map(|v| from_le_slice(v.as_bytes())).collect();
let expected_values: Vec<T> = values.iter().map(|v| from_le_slice(v.as_bytes())).collect();

(0..total).for_each(|i| writer.put_value(values[i], num_bits));

Expand Down

0 comments on commit ce8363a

Please sign in to comment.