From 9be0eb530de85a68358dc31c146f71ccb97739ca Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 17 Jul 2024 16:07:47 -0400 Subject: [PATCH] Minor: Improve parquet PageIndex documentation (#6042) * Minor: Improve parquet PageIndex documentation * More improvements * Add reasons for data page being without null * Apply suggestions from code review Co-authored-by: Val Lorentz * Update parquet/src/file/page_index/index.rs --------- Co-authored-by: Val Lorentz --- parquet/src/file/metadata/mod.rs | 37 ++++++++++++++++++++++++---- parquet/src/file/page_index/index.rs | 34 +++++++++++++++---------- parquet/src/file/statistics.rs | 16 ++++++++++-- 3 files changed, 67 insertions(+), 20 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 46ebf5d0f2b8..3722f2cbe48d 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -54,7 +54,12 @@ use crate::schema::types::{ Type as SchemaType, }; -/// [`Index`] for each row group of each column. +/// Page level statistics for each column chunk of each row group. +/// +/// This structure is an in-memory representation of multiple [`ColumnIndex`] +/// structures in a parquet file footer, as described in the Parquet [PageIndex +/// documentation]. Each [`Index`] holds statistics about all the pages in a +/// particular column chunk. /// /// `column_index[row_group_number][column_number]` holds the /// [`Index`] corresponding to column `column_number` of row group @@ -62,9 +67,14 @@ use crate::schema::types::{ /// /// For example `column_index[2][3]` holds the [`Index`] for the forth /// column in the third row group of the parquet file. +/// +/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub type ParquetColumnIndex = Vec>; -/// [`PageLocation`] for each data page of each row group of each column. +/// [`PageLocation`] for each data page of each row group of each column +/// +/// This structure is the parsed representation of the [`OffsetIndex`] from the +/// Parquet file footer, as described in the Parquet [PageIndex documentation]. /// /// `offset_index[row_group_number][column_number][page_number]` holds /// the [`PageLocation`] corresponding to page `page_number` of column @@ -73,6 +83,8 @@ pub type ParquetColumnIndex = Vec>; /// For example `offset_index[2][3][4]` holds the [`PageLocation`] for /// the fifth page of the forth column in the third row group of the /// parquet file. +/// +/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub type ParquetOffsetIndex = Vec>>; /// Parsed metadata for a single Parquet file @@ -946,14 +958,22 @@ impl ColumnChunkMetaDataBuilder { } } -/// Builder for column index +/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex] +/// +/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub struct ColumnIndexBuilder { null_pages: Vec, min_values: Vec>, max_values: Vec>, null_counts: Vec, boundary_order: BoundaryOrder, - // If one page can't get build index, need to ignore all index in this column + /// Is the information in the builder valid? + /// + /// Set to `false` if any entry in the page doesn't have statistics for + /// some reason, so statistics for that page won't be written to the file. + /// This might happen if the page is entirely null, or + /// is a floating point column without any non-nan values + /// e.g. valid: bool, } @@ -975,6 +995,7 @@ impl ColumnIndexBuilder { } } + /// Append statistics for the next page pub fn append( &mut self, null_page: bool, @@ -992,15 +1013,19 @@ impl ColumnIndexBuilder { self.boundary_order = boundary_order; } + /// Mark this column index as invalid pub fn to_invalid(&mut self) { self.valid = false; } + /// Is the information in the builder valid? pub fn valid(&self) -> bool { self.valid } /// Build and get the thrift metadata of column index + /// + /// Note: callers should check [`Self::valid`] before calling this method pub fn build_to_thrift(self) -> ColumnIndex { ColumnIndex::new( self.null_pages, @@ -1012,7 +1037,9 @@ impl ColumnIndexBuilder { } } -/// Builder for offset index +/// Builder for offset index, part of the Parquet [PageIndex]. +/// +/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub struct OffsetIndexBuilder { offset_array: Vec, compressed_page_size_array: Vec, diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 71fb47afa960..7eba949042f1 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -25,14 +25,9 @@ use crate::format::{BoundaryOrder, ColumnIndex}; use crate::util::bit_util::from_le_slice; use std::fmt::Debug; -/// PageIndex Statistics for one data page, as described in [Column Index]. +/// Typed statistics for one data page /// -/// One significant difference from the row group level -/// [`Statistics`](crate::format::Statistics) is that page level -/// statistics may not store actual column values as min and max -/// (e.g. they may store truncated strings to save space) -/// -/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// See [`NativeIndex`] for more details #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct PageIndex { /// The minimum value, It is None when all values are null @@ -70,11 +65,9 @@ where #[derive(Debug, Clone, PartialEq)] #[allow(non_camel_case_types)] -/// Typed statistics for a data page in a column chunk. +/// Statistics for data pages in a column chunk. /// -/// This structure is part of the "Page Index" and is optionally part of -/// [ColumnIndex] in the parquet file and can be used to skip decoding pages -/// while reading the file data. +/// See [`NativeIndex`] for more information pub enum Index { /// Sometimes reading page index from parquet file /// will only return pageLocations without min_max index, @@ -117,10 +110,25 @@ impl Index { } } -/// Stores the [`PageIndex`] for each page of a column +/// Strongly typed statistics for data pages in a column chunk. +/// +/// This structure is a natively typed, in memory representation of the +/// [`ColumnIndex`] structure in a parquet file footer, as described in the +/// Parquet [PageIndex documentation]. The statistics stored in this structure +/// can be used by query engines to skip decoding pages while reading parquet +/// data. +/// +/// # Differences with Row Group Level Statistics +/// +/// One significant difference between `NativeIndex` and row group level +/// [`Statistics`] is that page level statistics may not store actual column +/// values as min and max (e.g. they may store truncated strings to save space) +/// +/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// [`Statistics`]: crate::file::statistics::Statistics #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct NativeIndex { - /// The indexes, one item per page + /// The actual column indexes, one item per page pub indexes: Vec>, /// If the min/max elements are ordered, and if so in which /// direction. See [source] for details. diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 7d704cc138fc..dded9c9c1192 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -287,7 +287,17 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { Some(thrift_stats) } -/// Statistics for a column chunk and data page. +/// Strongly typed statistics for a column chunk within a row group. +/// +/// This structure is a natively typed, in memory representation of the +/// [`Statistics`] structure in a parquet file footer. The statistics stored in +/// this structure can be used by query engines to skip decoding pages while +/// reading parquet data. +/// +/// Page level statistics are stored separately, in [NativeIndex]. +/// +/// [`Statistics`]: crate::format::Statistics +/// [NativeIndex]: crate::file::page_index::index::NativeIndex #[derive(Debug, Clone, PartialEq)] pub enum Statistics { Boolean(ValueStatistics), @@ -445,7 +455,9 @@ impl fmt::Display for Statistics { /// Typed implementation for [`Statistics`]. pub type TypedStatistics = ValueStatistics<::T>; -/// Statistics for a particular `ParquetValueType` +/// Typed statistics for one column chunk +/// +/// See [`Statistics`] for more details #[derive(Clone, Eq, PartialEq)] pub struct ValueStatistics { min: Option,