Refine parquet documentation on types and metadata (#5786)
* Refine parquet documentation on types and metadata

* Update regen.sh and thrift.rs

* Clarify page index encompasses offset index and column index

* revert unexpected diff
alamb authored May 20, 2024
1 parent cf59b6c commit c6b3eaa
Showing 9 changed files with 123 additions and 60 deletions.
6 changes: 5 additions & 1 deletion parquet/regen.sh
@@ -21,7 +21,10 @@ REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2

SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"

docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
COMMENT='//! See [`crate::file`] for easier to use APIs.'

# Note: add argument --platform=linux/amd64 to run on mac
docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
pacman -Sy --noconfirm wget thrift && \
wget https://raw.githubusercontent.com/apache/parquet-format/$REVISION/src/main/thrift/parquet.thrift -O /tmp/parquet.thrift && \
thrift --gen rs /tmp/parquet.thrift && \
@@ -35,5 +38,6 @@ docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
sed -i 's/fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol)/fn read_from_in_protocol<T: TInputProtocol>(i_prot: \&mut T)/g' parquet.rs && \
echo 'Rewriting return value expectations' && \
sed -i 's/Ok(ret.expect(\"return value should have been constructed\"))/ret.ok_or_else(|| thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, \"return value should have been constructed\")))/g' parquet.rs && \
sed -i '1i${COMMENT}' parquet.rs && \
mv parquet.rs /thrift/src/format.rs
"
64 changes: 43 additions & 21 deletions parquet/src/file/metadata.rs
@@ -15,23 +15,20 @@
// specific language governing permissions and limitations
// under the License.

//! Contains information about available Parquet metadata.
//! Parquet metadata structures
//!
//! The hierarchy of metadata is as follows:
//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
//! file footer.
//!
//! [`ParquetMetaData`](struct.ParquetMetaData.html) contains
//! [`FileMetaData`](struct.FileMetaData.html) and zero or more
//! [`RowGroupMetaData`](struct.RowGroupMetaData.html) for each row group.
//! * [`FileMetaData`]: File level metadata such as schema, row counts and
//! version.
//!
//! [`FileMetaData`](struct.FileMetaData.html) includes file version, application specific
//! metadata.
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
//! its location, number of rows, and column chunks.
//!
//! Each [`RowGroupMetaData`](struct.RowGroupMetaData.html) contains information about row
//! group and one or more [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) for
//! each column chunk.
//!
//! [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) has information about column
//! chunk (primitive leaf column), including encoding/compression, number of values, etc.
//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
//! within a Row Group, including encoding and compression information,
//! number of values, statistics, etc.
use std::ops::Range;
use std::sync::Arc;
@@ -61,7 +58,7 @@ use crate::schema::types::{
/// column in the third row group of the parquet file.
pub type ParquetColumnIndex = Vec<Vec<Index>>;

/// [`PageLocation`] for each datapage of each row group of each column.
/// [`PageLocation`] for each data page of each row group of each column.
///
/// `offset_index[row_group_number][column_number][page_number]` holds
/// the [`PageLocation`] corresponding to page `page_number` of column
@@ -72,14 +69,30 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
/// parquet file.
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;

/// Global Parquet metadata.
/// Global Parquet metadata, including [`FileMetaData`] and [`RowGroupMetaData`].
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`]. It contains:
///
/// * File level metadata: [`FileMetaData`]
/// * Row Group level metadata: [`RowGroupMetaData`]
/// * (Optional) "Page Index" structures: [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`parse_metadata`] function.
///
/// [`parse_metadata`]: crate::file::footer::parse_metadata
#[derive(Debug, Clone)]
pub struct ParquetMetaData {
/// File level metadata
file_metadata: FileMetaData,
/// Row group metadata
row_groups: Vec<RowGroupMetaData>,
/// Page index for all pages in each column chunk
/// Page level index for each page in each column chunk
column_index: Option<ParquetColumnIndex>,
/// Offset index for all pages in each column chunk
/// Offset index for each page in each column chunk
offset_index: Option<ParquetOffsetIndex>,
}

@@ -172,7 +185,9 @@ pub type KeyValue = crate::format::KeyValue;
/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;

/// Metadata for a Parquet file.
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, application specific key/value metadata, number of rows, schema, and column orders.
#[derive(Debug, Clone)]
pub struct FileMetaData {
version: i32,
@@ -271,16 +286,20 @@ impl FileMetaData {
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group.
/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows,
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
columns: Vec<ColumnChunkMetaData>,
num_rows: i64,
sorting_columns: Option<Vec<SortingColumn>>,
total_byte_size: i64,
schema_descr: SchemaDescPtr,
// We can't infer from file offset of first column since there may empty columns in row group.
/// We can't infer this from the file offset of the first column since there may be empty columns in the row group.
file_offset: Option<i64>,
/// Ordinal position of this row group in file
ordinal: Option<i16>,
}

@@ -335,7 +354,10 @@ impl RowGroupMetaData {
self.schema_descr.clone()
}

/// Returns ordinal of this row group in file
/// Returns ordinal position of this row group in file.
///
/// For example if this is the first row group in the file, this will return 0.
/// If this is the second row group in the file, this will return 1.
#[inline(always)]
pub fn ordinal(&self) -> Option<i16> {
self.ordinal
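To make the metadata hierarchy above concrete, here is a minimal sketch (not part of this commit) that reads a `ParquetMetaData` from a file and walks the file, row group, and column chunk levels; the path `data.parquet` is a placeholder:

```rust
use std::fs::File;
use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Decoding the footer yields the top-level ParquetMetaData
    let file = File::open("data.parquet")?;
    let reader = SerializedFileReader::new(file)?;
    let metadata = reader.metadata();

    // File level metadata: version, row counts, etc.
    let file_meta = metadata.file_metadata();
    println!("version: {}, rows: {}", file_meta.version(), file_meta.num_rows());

    // Row group level, then column chunk level metadata
    for rg in metadata.row_groups() {
        println!("row group: {} rows, {} bytes", rg.num_rows(), rg.total_byte_size());
        for col in rg.columns() {
            println!("  {}: {} values", col.column_path(), col.num_values());
        }
    }
    Ok(())
}
```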
11 changes: 7 additions & 4 deletions parquet/src/file/mod.rs
@@ -19,10 +19,13 @@
//!
//! Provides access to file and row group readers and writers, record API, metadata, etc.
//!
//! See [`serialized_reader::SerializedFileReader`](serialized_reader/struct.SerializedFileReader.html) or
//! [`writer::SerializedFileWriter`](writer/struct.SerializedFileWriter.html) for a
//! starting reference, [`metadata::ParquetMetaData`](metadata/index.html) for file
//! metadata, and [`statistics`](statistics/index.html) for working with statistics.
//! # See Also:
//! * [`SerializedFileReader`] and [`SerializedFileWriter`] for reading / writing parquet
//! * [`metadata`]: for working with metadata such as schema
//! * [`statistics`]: for working with statistics in metadata
//!
//! [`SerializedFileReader`]: serialized_reader::SerializedFileReader
//! [`SerializedFileWriter`]: writer::SerializedFileWriter
//!
//! # Example of writing a new file
//!
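The "Example of writing a new file" is cut off by the diff; as a hedged sketch of the low-level write path that `SerializedFileWriter` provides (the path `example.parquet` and the one-column schema are placeholders):

```rust
use std::{fs::File, sync::Arc};
use parquet::{
    data_type::Int32Type,
    file::{properties::WriterProperties, writer::SerializedFileWriter},
    schema::parser::parse_message_type,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Schema with a single required int32 column named "id"
    let schema = Arc::new(parse_message_type("message schema { REQUIRED INT32 id; }")?);
    let props = Arc::new(WriterProperties::builder().build());
    let file = File::create("example.parquet")?;

    let mut writer = SerializedFileWriter::new(file, schema, props)?;
    let mut row_group = writer.next_row_group()?;
    while let Some(mut col) = row_group.next_column()? {
        // The only column is int32; write three values with no nulls
        col.typed::<Int32Type>().write_batch(&[1, 2, 3], None, None)?;
        col.close()?;
    }
    row_group.close()?;
    writer.close()?;
    Ok(())
}
```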
9 changes: 5 additions & 4 deletions parquet/src/file/page_index/index.rs
@@ -57,10 +57,11 @@ impl<T> PageIndex<T> {

#[derive(Debug, Clone, PartialEq)]
#[allow(non_camel_case_types)]
/// Typed statistics for a data page in a column chunk. This structure
/// is obtained from decoding the [ColumnIndex] in the parquet file
/// and can be used to skip decoding pages while reading the file
/// data.
/// Typed statistics for a data page in a column chunk.
///
/// This structure is part of the optional "Page Index" ([ColumnIndex] in the
/// parquet file) and can be used to skip decoding pages while reading the
/// file data.
pub enum Index {
/// Sometimes reading the page index from a parquet file
/// will only return page locations without the min/max index,
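A hedged sketch of how this enum might be consumed; the `with_page_index` read option and the placeholder path `data.parquet` are assumptions based on the crate's reader API:

```rust
use std::fs::File;
use parquet::file::{
    page_index::index::Index,
    reader::{FileReader, SerializedFileReader},
    serialized_reader::ReadOptionsBuilder,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Ask the reader to also decode the optional Page Index structures
    let file = File::open("data.parquet")?;
    let options = ReadOptionsBuilder::new().with_page_index().build();
    let reader = SerializedFileReader::new_with_options(file, options)?;

    // column_index[row_group][column] is an Index of per-page statistics;
    // look at the first column of the first row group, if present
    let first = reader
        .metadata()
        .column_index()
        .and_then(|ci| ci.first())
        .and_then(|rg| rg.first());
    if let Some(index) = first {
        match index {
            Index::INT32(native) => println!("{} pages of int32 stats", native.indexes.len()),
            Index::NONE => println!("no page statistics for this column"),
            other => println!("other physical type: {other:?}"),
        }
    }
    Ok(())
}
```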
1 change: 1 addition & 0 deletions parquet/src/file/statistics.rs
@@ -20,6 +20,7 @@
//! Though some common methods are available on the enum, use a pattern match to extract
//! the actual min and max values from the statistics, as shown below:
//!
//! # Examples
//! ```rust
//! use parquet::file::statistics::Statistics;
//!
Expand Down
3 changes: 2 additions & 1 deletion parquet/src/format.rs

(Diff not shown: parquet/src/format.rs is a generated file.)

23 changes: 14 additions & 9 deletions parquet/src/lib.rs
@@ -28,25 +28,30 @@
//! # Format Overview
//!
//! Parquet is a columnar format, which means that unlike row formats like [CSV], values are
//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], with Parquet
//! focusing on storage efficiency whereas Arrow prioritizes compute efficiency.
//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], but
//! focuses on storage efficiency whereas Arrow prioritizes compute efficiency.
//!
//! Parquet files are partitioned for scalability. Each file contains metadata,
//! along with zero or more "row groups", each row group containing one or
//! more columns. The APIs in this crate reflect this structure.
//!
//! Parquet distinguishes between "logical" and "physical" data types.
//! For instance, strings (logical type) are stored as byte arrays (physical type).
//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
//! are stored as integers (physical type). This crate exposes both kinds of types.
//! Data in Parquet files is strongly typed and differentiates between logical
//! and physical types (see [`schema`]). In addition, Parquet files may contain
//! other metadata, such as statistics, which can be used to optimize reading
//! (see [`file::metadata`]).
//! For more details about the Parquet format itself, see the [Parquet spec].
//!
//! For more details about the Parquet format, see the
//! [Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
//! [Parquet spec]: https://github.com/apache/parquet-format/blob/master/README.md#file-format
//!
//! # APIs
//!
//! This crate exposes a number of APIs for different use-cases.
//!
//! ## Metadata and Schema
//!
//! The [`schema`] module provides APIs to work with Parquet schemas. The
//! [`file::metadata`] module provides APIs to work with Parquet metadata.
//!
//! ## Read/Write Arrow
//!
//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow `RecordBatch`.
Expand All @@ -64,7 +69,7 @@
//!
//! ## Read/Write Parquet
//!
//! Workloads needing finer-grained control, or looking to not take a dependency on arrow,
//! Workloads needing finer-grained control, or wishing to avoid a dependency on arrow,
//! can use the lower-level APIs in [`mod@file`]. These APIs expose the underlying parquet
//! data model, and therefore require knowledge of the underlying parquet format,
//! including the details of [Dremel] record shredding and [Logical Types]. Most workloads
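As a sketch of the Arrow-based read/write path the crate docs describe (the file name and column are placeholders; assumes the `arrow` feature is enabled):

```rust
use std::{fs::File, sync::Arc};
use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Write a RecordBatch with one int32 column to a parquet file
    let ids = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
    let batch = RecordBatch::try_from_iter([("id", ids)])?;
    let file = File::create("example.parquet")?;
    let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Read it back as a stream of RecordBatches
    let file = File::open("example.parquet")?;
    let reader = ParquetRecordBatchReaderBuilder::try_new(file)?.build()?;
    for batch in reader {
        println!("read batch with {} rows", batch?.num_rows());
    }
    Ok(())
}
```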
14 changes: 14 additions & 0 deletions parquet/src/schema/mod.rs
@@ -17,6 +17,20 @@

//! Parquet schema definitions and methods to print and parse schema.
//!
//! * [`SchemaDescriptor`]: Describes the data types of the columns stored in a file
//! * [`ColumnDescriptor`]: Describes the schema of a single (leaf) column
//! * [`ColumnPath`]: Represents the location of a column in the schema (e.g. a nested field)
//!
//! Parquet distinguishes between "logical" and "physical" data types.
//! For instance, strings (logical type) are stored as byte arrays (physical type).
//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
//! are stored as integers (physical type).
//!
//! [`SchemaDescriptor`]: types::SchemaDescriptor
//! [`ColumnDescriptor`]: types::ColumnDescriptor
//! [`ColumnPath`]: types::ColumnPath
//!
//! # Example
//!
//! ```rust
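The module's example is truncated by the diff; a minimal sketch of parsing and printing a schema (the schema text is illustrative):

```rust
use parquet::schema::{parser::parse_message_type, printer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Parse a schema from its textual "message type" representation...
    let message = "
        message spark_schema {
            REQUIRED INT32 id;
            OPTIONAL BINARY name (UTF8);
        }
    ";
    let schema = parse_message_type(message)?;

    // ...and print it back out
    let mut buf = Vec::new();
    printer::print_schema(&mut buf, &schema);
    println!("{}", String::from_utf8(buf)?);
    Ok(())
}
```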
52 changes: 32 additions & 20 deletions parquet/src/schema/types.rs
@@ -37,8 +37,10 @@ pub type SchemaDescPtr = Arc<SchemaDescriptor>;
pub type ColumnDescPtr = Arc<ColumnDescriptor>;

/// Representation of a Parquet type.
///
/// Used to describe primitive leaf fields and structs, including top-level schema.
/// Note that the top-level schema type is represented using `GroupType` whose
///
/// Note that the top-level schema is represented using [`Type::GroupType`] whose
/// repetition is `None`.
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
@@ -662,7 +664,7 @@ impl BasicTypeInfo {
// ----------------------------------------------------------------------
// Parquet descriptor definitions

/// Represents a path in a nested schema
/// Represents the location of a column in a Parquet schema
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub struct ColumnPath {
parts: Vec<String>,
@@ -737,21 +739,22 @@ impl AsRef<[String]> for ColumnPath {
}
}

/// A descriptor for leaf-level primitive columns.
/// This encapsulates information such as definition and repetition levels and is used to
/// Descriptor for a leaf-level primitive column, including its physical type.
///
/// Also includes the maximum definition and repetition levels required to
/// re-assemble nested data.
#[derive(Debug, PartialEq)]
pub struct ColumnDescriptor {
// The "leaf" primitive type of this column
/// The "leaf" primitive type of this column
primitive_type: TypePtr,

// The maximum definition level for this column
/// The maximum definition level for this column
max_def_level: i16,

// The maximum repetition level for this column
/// The maximum repetition level for this column
max_rep_level: i16,

// The path of this column. For instance, "a.b.c.d".
/// The path of this column. For instance, "a.b.c.d".
path: ColumnPath,
}

@@ -860,24 +863,33 @@ impl ColumnDescriptor {
}
}

/// A schema descriptor. This encapsulates the top-level schemas for all the columns,
/// as well as all descriptors for all the primitive columns.
/// Schema of a Parquet file.
///
/// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
/// each primitive (leaf) column.
#[derive(PartialEq)]
pub struct SchemaDescriptor {
// The top-level schema (the "message" type).
// This must be a `GroupType` where each field is a root column type in the schema.
/// The top-level logical schema (the "message" type).
///
/// This must be a [`Type::GroupType`] where each field is a root
/// column type in the schema.
schema: TypePtr,

// All the descriptors for primitive columns in this schema, constructed from
// `schema` in DFS order.
/// The descriptors for the physical type of each leaf column in this schema.
///
/// Constructed from `schema` in DFS order.
leaves: Vec<ColumnDescPtr>,

// Mapping from a leaf column's index to the root column index that it
// comes from. For instance: the leaf `a.b.c.d` would have a link back to `a`:
// -- a <-----+
// -- -- b |
// -- -- -- c |
// -- -- -- -- d
/// Mapping from a leaf column's index to the root column index that it
/// comes from.
///
/// For instance: the leaf `a.b.c.d` would have a link back to `a`:
/// ```text
/// -- a <-----+
/// -- -- b |
/// -- -- -- c |
/// -- -- -- -- d
/// ```
leaf_to_base: Vec<usize>,
}

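To illustrate the descriptor relationships described above, a hedged sketch (the schema text is illustrative) that builds a `SchemaDescriptor` and inspects each leaf column's levels and path:

```rust
use std::sync::Arc;
use parquet::schema::{parser::parse_message_type, types::SchemaDescriptor};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Leaf "b.c" has max definition level 2 (b and c are both optional)
    let message = "
        message schema {
            REQUIRED INT32 a;
            OPTIONAL group b {
                OPTIONAL INT64 c;
            }
        }
    ";
    let descr = SchemaDescriptor::new(Arc::new(parse_message_type(message)?));

    // Leaves are stored in DFS order: "a", then "b.c"
    for i in 0..descr.num_columns() {
        let col = descr.column(i);
        println!(
            "{}: max_def={}, max_rep={}",
            col.path(),
            col.max_def_level(),
            col.max_rep_level()
        );
    }
    Ok(())
}
```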
