From 06a015770098a569b67855dfaa18bdfa7c18ff92 Mon Sep 17 00:00:00 2001 From: Onur Satici Date: Wed, 11 Dec 2024 13:23:05 +0000 Subject: [PATCH] add buffered data_pages to parquet column writer total bytes estimation (#6862) * add buffered data_pages to parquet column writer memory size estimation * move to get estimated total bytes --- parquet/src/column/writer/mod.rs | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8d0be5f9f8c..16de0ba7898 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -574,7 +574,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// anticipated encoded size. #[cfg(feature = "arrow")] pub(crate) fn get_estimated_total_bytes(&self) -> u64 { - self.column_metrics.total_bytes_written + self.data_pages + .iter() + .map(|page| page.data().len() as u64) + .sum::() + + self.column_metrics.total_bytes_written + self.encoder.estimated_data_page_size() as u64 + self.encoder.estimated_dict_page_size().unwrap_or_default() as u64 } @@ -3422,6 +3426,26 @@ mod tests { assert!(stats.max_bytes_opt().is_none()); } + #[test] + #[cfg(feature = "arrow")] + fn test_column_writer_get_estimated_total_bytes() { + let page_writer = get_test_page_writer(); + let props = Default::default(); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + assert_eq!(writer.get_estimated_total_bytes(), 0); + + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_one_page = writer.get_estimated_total_bytes(); + assert_eq!(size_with_one_page, 20); + + writer.write_batch(&[5, 6, 7, 8], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_two_pages = writer.get_estimated_total_bytes(); + // different pages have different compressed lengths + assert_eq!(size_with_two_pages, 20 + 21); + } + fn write_multiple_pages( column_descr: &Arc, pages: &[&[Option]],