Refine parquet documentation on types and metadata (#5786)
* Refine parquet documentation on types and metadata

* Update regen.sh and thrift.rs

* Clarify page index encompasses offset index and column index

* revert unexpected diff
alamb authored May 20, 2024
1 parent cf59b6c commit c6b3eaa
Showing 9 changed files with 123 additions and 60 deletions.
6 changes: 5 additions & 1 deletion parquet/regen.sh
@@ -21,7 +21,10 @@ REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2

SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"

docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
COMMENT='//! See [`crate::file`] for easier to use APIs.'

# Note: add argument --platform=linux/amd64 to run on mac
docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
pacman -Sy --noconfirm wget thrift && \
wget https://raw.githubusercontent.com/apache/parquet-format/$REVISION/src/main/thrift/parquet.thrift -O /tmp/parquet.thrift && \
thrift --gen rs /tmp/parquet.thrift && \
@@ -35,5 +38,6 @@ docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
sed -i 's/fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol)/fn read_from_in_protocol<T: TInputProtocol>(i_prot: \&mut T)/g' parquet.rs && \
echo 'Rewriting return value expectations' && \
sed -i 's/Ok(ret.expect(\"return value should have been constructed\"))/ret.ok_or_else(|| thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, \"return value should have been constructed\")))/g' parquet.rs && \
sed -i '1i${COMMENT}' parquet.rs && \
mv parquet.rs /thrift/src/format.rs
"
64 changes: 43 additions & 21 deletions parquet/src/file/metadata.rs
@@ -15,23 +15,20 @@
// specific language governing permissions and limitations
// under the License.

//! Contains information about available Parquet metadata.
//! Parquet metadata structures
//!
//! The hierarchy of metadata is as follows:
//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
//! file footer.
//!
//! [`ParquetMetaData`](struct.ParquetMetaData.html) contains
//! [`FileMetaData`](struct.FileMetaData.html) and zero or more
//! [`RowGroupMetaData`](struct.RowGroupMetaData.html) for each row group.
//! * [`FileMetaData`]: File level metadata such as schema, row counts and
//! version.
//!
//! [`FileMetaData`](struct.FileMetaData.html) includes file version, application specific
//! metadata.
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
//! its location, number of rows, and column chunks.
//!
//! Each [`RowGroupMetaData`](struct.RowGroupMetaData.html) contains information about row
//! group and one or more [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) for
//! each column chunk.
//!
//! [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) has information about column
//! chunk (primitive leaf column), including encoding/compression, number of values, etc.
//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
//! within a Row Group, including encoding and compression information,
//! number of values, statistics, etc.
use std::ops::Range;
use std::sync::Arc;
@@ -61,7 +58,7 @@ use crate::schema::types::{
/// column in the third row group of the parquet file.
pub type ParquetColumnIndex = Vec<Vec<Index>>;

/// [`PageLocation`] for each datapage of each row group of each column.
/// [`PageLocation`] for each data page of each row group of each column.
///
/// `offset_index[row_group_number][column_number][page_number]` holds
/// the [`PageLocation`] corresponding to page `page_number` of column
@@ -72,14 +69,30 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
/// parquet file.
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;

/// Global Parquet metadata.
/// Global Parquet metadata, including [`FileMetaData`] and [`RowGroupMetaData`].
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`]. It contains:
///
/// * File level metadata: [`FileMetaData`]
/// * Row Group level metadata: [`RowGroupMetaData`]
/// * (Optional) "Page Index" structures: [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`parse_metadata`] function.
///
/// [`parse_metadata`]: crate::file::footer::parse_metadata
#[derive(Debug, Clone)]
pub struct ParquetMetaData {
/// File level metadata
file_metadata: FileMetaData,
/// Row group metadata
row_groups: Vec<RowGroupMetaData>,
/// Page index for all pages in each column chunk
/// Page level index for each page in each column chunk
column_index: Option<ParquetColumnIndex>,
/// Offset index for all pages in each column chunk
/// Offset index for each page in each column chunk
offset_index: Option<ParquetOffsetIndex>,
}

@@ -172,7 +185,9 @@ pub type KeyValue = crate::format::KeyValue;
/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;

/// Metadata for a Parquet file.
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, application specific key/value metadata, number of rows, schema, and column orders.
#[derive(Debug, Clone)]
pub struct FileMetaData {
version: i32,
@@ -271,16 +286,20 @@ impl FileMetaData {
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group.
/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows,
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
columns: Vec<ColumnChunkMetaData>,
num_rows: i64,
sorting_columns: Option<Vec<SortingColumn>>,
total_byte_size: i64,
schema_descr: SchemaDescPtr,
// We can't infer from file offset of first column since there may empty columns in row group.
/// We can't infer this from the file offset of the first column since there may be empty columns in the row group.
file_offset: Option<i64>,
/// Ordinal position of this row group in file
ordinal: Option<i16>,
}

@@ -335,7 +354,10 @@ impl RowGroupMetaData {
self.schema_descr.clone()
}

/// Returns ordinal of this row group in file
/// Returns ordinal position of this row group in file.
///
/// For example if this is the first row group in the file, this will return 0.
/// If this is the second row group in the file, this will return 1.
#[inline(always)]
pub fn ordinal(&self) -> Option<i16> {
self.ordinal
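To make the metadata hierarchy above concrete, here is a minimal sketch (not part of this commit) that reads a `ParquetMetaData` from a file and walks the file, row group, and column chunk levels; the path `data.parquet` is a placeholder:

```rust
use std::fs::File;
use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Decoding the footer yields the top-level ParquetMetaData
    let file = File::open("data.parquet")?;
    let reader = SerializedFileReader::new(file)?;
    let metadata = reader.metadata();

    // File level metadata: version, row counts, etc.
    let file_meta = metadata.file_metadata();
    println!("version: {}, rows: {}", file_meta.version(), file_meta.num_rows());

    // Row group level, then column chunk level metadata
    for rg in metadata.row_groups() {
        println!("row group: {} rows, {} bytes", rg.num_rows(), rg.total_byte_size());
        for col in rg.columns() {
            println!("  {}: {} values", col.column_path(), col.num_values());
        }
    }
    Ok(())
}
```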
11 changes: 7 additions & 4 deletions parquet/src/file/mod.rs
@@ -19,10 +19,13 @@
//!
//! Provides access to file and row group readers and writers, record API, metadata, etc.
//!
//! See [`serialized_reader::SerializedFileReader`](serialized_reader/struct.SerializedFileReader.html) or
//! [`writer::SerializedFileWriter`](writer/struct.SerializedFileWriter.html) for a
//! starting reference, [`metadata::ParquetMetaData`](metadata/index.html) for file
//! metadata, and [`statistics`](statistics/index.html) for working with statistics.
//! # See Also:
//! * [`SerializedFileReader`] and [`SerializedFileWriter`] for reading / writing parquet
//! * [`metadata`]: for working with metadata such as schema
//! * [`statistics`]: for working with statistics in metadata
//!
//! [`SerializedFileReader`]: serialized_reader::SerializedFileReader
//! [`SerializedFileWriter`]: writer::SerializedFileWriter
//!
//! # Example of writing a new file
//!
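The "Example of writing a new file" is cut off by the diff; as a hedged sketch of the low-level write path that `SerializedFileWriter` provides (the path `example.parquet` and the one-column schema are placeholders):

```rust
use std::{fs::File, sync::Arc};
use parquet::{
    data_type::Int32Type,
    file::{properties::WriterProperties, writer::SerializedFileWriter},
    schema::parser::parse_message_type,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Schema with a single required int32 column named "id"
    let schema = Arc::new(parse_message_type("message schema { REQUIRED INT32 id; }")?);
    let props = Arc::new(WriterProperties::builder().build());
    let file = File::create("example.parquet")?;

    let mut writer = SerializedFileWriter::new(file, schema, props)?;
    let mut row_group = writer.next_row_group()?;
    while let Some(mut col) = row_group.next_column()? {
        // The only column is int32; write three values with no nulls
        col.typed::<Int32Type>().write_batch(&[1, 2, 3], None, None)?;
        col.close()?;
    }
    row_group.close()?;
    writer.close()?;
    Ok(())
}
```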
9 changes: 5 additions & 4 deletions parquet/src/file/page_index/index.rs
@@ -57,10 +57,11 @@ impl<T> PageIndex<T> {

#[derive(Debug, Clone, PartialEq)]
#[allow(non_camel_case_types)]
/// Typed statistics for a data page in a column chunk. This structure
/// is obtained from decoding the [ColumnIndex] in the parquet file
/// and can be used to skip decoding pages while reading the file
/// data.
/// Typed statistics for a data page in a column chunk.
///
/// This structure is part of the optional "Page Index" ([ColumnIndex] in the
/// parquet file) and can be used to skip decoding pages while reading the
/// file data.
pub enum Index {
/// Sometimes reading the page index from a parquet file
/// will only return page locations without the min/max index,
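A hedged sketch of how this enum might be consumed; the `with_page_index` read option and the placeholder path `data.parquet` are assumptions based on the crate's reader API:

```rust
use std::fs::File;
use parquet::file::{
    page_index::index::Index,
    reader::{FileReader, SerializedFileReader},
    serialized_reader::ReadOptionsBuilder,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Ask the reader to also decode the optional Page Index structures
    let file = File::open("data.parquet")?;
    let options = ReadOptionsBuilder::new().with_page_index().build();
    let reader = SerializedFileReader::new_with_options(file, options)?;

    // column_index[row_group][column] is an Index of per-page statistics;
    // look at the first column of the first row group, if present
    let first = reader
        .metadata()
        .column_index()
        .and_then(|ci| ci.first())
        .and_then(|rg| rg.first());
    if let Some(index) = first {
        match index {
            Index::INT32(native) => println!("{} pages of int32 stats", native.indexes.len()),
            Index::NONE => println!("no page statistics for this column"),
            other => println!("other physical type: {other:?}"),
        }
    }
    Ok(())
}
```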
1 change: 1 addition & 0 deletions parquet/src/file/statistics.rs
@@ -20,6 +20,7 @@
//! Though some common methods are available on the enum, use a pattern match to extract
//! the actual min and max values from the statistics, as shown below:
//!
//! # Examples
//! ```rust
//! use parquet::file::statistics::Statistics;
//!
Expand Down
3 changes: 2 additions & 1 deletion parquet/src/format.rs

(Diff not shown: parquet/src/format.rs is a generated file.)

23 changes: 14 additions & 9 deletions parquet/src/lib.rs
@@ -28,25 +28,30 @@
//! # Format Overview
//!
//! Parquet is a columnar format, which means that unlike row formats like [CSV], values are
//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], with Parquet
//! focusing on storage efficiency whereas Arrow prioritizes compute efficiency.
//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], but
//! focuses on storage efficiency whereas Arrow prioritizes compute efficiency.
//!
//! Parquet files are partitioned for scalability. Each file contains metadata,
//! along with zero or more "row groups", each row group containing one or
//! more columns. The APIs in this crate reflect this structure.
//!
//! Parquet distinguishes between "logical" and "physical" data types.
//! For instance, strings (logical type) are stored as byte arrays (physical type).
//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
//! are stored as integers (physical type). This crate exposes both kinds of types.
//! Data in Parquet files is strongly typed and differentiates between logical
//! and physical types (see [`schema`]). In addition, Parquet files may contain
//! other metadata, such as statistics, which can be used to optimize reading
//! (see [`file::metadata`]).
//! For more details about the Parquet format itself, see the [Parquet spec].
//!
//! For more details about the Parquet format, see the
//! [Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
//! [Parquet spec]: https://github.com/apache/parquet-format/blob/master/README.md#file-format
//!
//! # APIs
//!
//! This crate exposes a number of APIs for different use-cases.
//!
//! ## Metadata and Schema
//!
//! The [`schema`] module provides APIs to work with Parquet schemas. The
//! [`file::metadata`] module provides APIs to work with Parquet metadata.
//!
//! ## Read/Write Arrow
//!
//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow `RecordBatch`.
Expand All @@ -64,7 +69,7 @@
//!
//! ## Read/Write Parquet
//!
//! Workloads needing finer-grained control, or looking to not take a dependency on arrow,
//! Workloads needing finer-grained control, or wishing to avoid a dependency on arrow,
//! can use the lower-level APIs in [`mod@file`]. These APIs expose the underlying parquet
//! data model, and therefore require knowledge of the underlying parquet format,
//! including the details of [Dremel] record shredding and [Logical Types]. Most workloads
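As a sketch of the Arrow-based read/write path the crate docs describe (the file name and column are placeholders; assumes the `arrow` feature is enabled):

```rust
use std::{fs::File, sync::Arc};
use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Write a RecordBatch with one int32 column to a parquet file
    let ids = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
    let batch = RecordBatch::try_from_iter([("id", ids)])?;
    let file = File::create("example.parquet")?;
    let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Read it back as a stream of RecordBatches
    let file = File::open("example.parquet")?;
    let reader = ParquetRecordBatchReaderBuilder::try_new(file)?.build()?;
    for batch in reader {
        println!("read batch with {} rows", batch?.num_rows());
    }
    Ok(())
}
```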
14 changes: 14 additions & 0 deletions parquet/src/schema/mod.rs
@@ -17,6 +17,20 @@

//! Parquet schema definitions and methods to print and parse schema.
//!
//! * [`SchemaDescriptor`]: Describes the data types of the columns stored in a file
//! * [`ColumnDescriptor`]: Describes the schema of a single (leaf) column
//! * [`ColumnPath`]: Represents the location of a column in the schema (e.g. a nested field)
//!
//! Parquet distinguishes between "logical" and "physical" data types.
//! For instance, strings (logical type) are stored as byte arrays (physical type).
//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
//! are stored as integers (physical type).
//!
//! [`SchemaDescriptor`]: types::SchemaDescriptor
//! [`ColumnDescriptor`]: types::ColumnDescriptor
//! [`ColumnPath`]: types::ColumnPath
//!
//! # Example
//!
//! ```rust
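The module's example is truncated by the diff; a minimal sketch of parsing and printing a schema (the schema text is illustrative):

```rust
use parquet::schema::{parser::parse_message_type, printer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Parse a schema from its textual "message type" representation...
    let message = "
        message spark_schema {
            REQUIRED INT32 id;
            OPTIONAL BINARY name (UTF8);
        }
    ";
    let schema = parse_message_type(message)?;

    // ...and print it back out
    let mut buf = Vec::new();
    printer::print_schema(&mut buf, &schema);
    println!("{}", String::from_utf8(buf)?);
    Ok(())
}
```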
52 changes: 32 additions & 20 deletions parquet/src/schema/types.rs
@@ -37,8 +37,10 @@ pub type SchemaDescPtr = Arc<SchemaDescriptor>;
pub type ColumnDescPtr = Arc<ColumnDescriptor>;

/// Representation of a Parquet type.
///
/// Used to describe primitive leaf fields and structs, including top-level schema.
/// Note that the top-level schema type is represented using `GroupType` whose
///
/// Note that the top-level schema is represented using [`Type::GroupType`] whose
/// repetition is `None`.
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
@@ -662,7 +664,7 @@ impl BasicTypeInfo {
// ----------------------------------------------------------------------
// Parquet descriptor definitions

/// Represents a path in a nested schema
/// Represents the location of a column in a Parquet schema
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub struct ColumnPath {
parts: Vec<String>,
@@ -737,21 +739,22 @@ impl AsRef<[String]> for ColumnPath {
}
}

/// A descriptor for leaf-level primitive columns.
/// This encapsulates information such as definition and repetition levels and is used to
/// Descriptor for a leaf-level primitive column, including its physical type.
///
/// Also includes the maximum definition and repetition levels required to
/// re-assemble nested data.
#[derive(Debug, PartialEq)]
pub struct ColumnDescriptor {
// The "leaf" primitive type of this column
/// The "leaf" primitive type of this column
primitive_type: TypePtr,

// The maximum definition level for this column
/// The maximum definition level for this column
max_def_level: i16,

// The maximum repetition level for this column
/// The maximum repetition level for this column
max_rep_level: i16,

// The path of this column. For instance, "a.b.c.d".
/// The path of this column. For instance, "a.b.c.d".
path: ColumnPath,
}

@@ -860,24 +863,33 @@ impl ColumnDescriptor {
}
}

/// A schema descriptor. This encapsulates the top-level schemas for all the columns,
/// as well as all descriptors for all the primitive columns.
/// Schema of a Parquet file.
///
/// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
/// each primitive (leaf) column.
#[derive(PartialEq)]
pub struct SchemaDescriptor {
// The top-level schema (the "message" type).
// This must be a `GroupType` where each field is a root column type in the schema.
/// The top-level logical schema (the "message" type).
///
/// This must be a [`Type::GroupType`] where each field is a root
/// column type in the schema.
schema: TypePtr,

// All the descriptors for primitive columns in this schema, constructed from
// `schema` in DFS order.
/// The descriptors for the physical type of each leaf column in this schema.
///
/// Constructed from `schema` in DFS order.
leaves: Vec<ColumnDescPtr>,

// Mapping from a leaf column's index to the root column index that it
// comes from. For instance: the leaf `a.b.c.d` would have a link back to `a`:
// -- a <-----+
// -- -- b |
// -- -- -- c |
// -- -- -- -- d
/// Mapping from a leaf column's index to the root column index that it
/// comes from.
///
/// For instance: the leaf `a.b.c.d` would have a link back to `a`:
/// ```text
/// -- a <-----+
/// -- -- b |
/// -- -- -- c |
/// -- -- -- -- d
/// ```
leaf_to_base: Vec<usize>,
}

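To illustrate the descriptor relationships described above, a hedged sketch (the schema text is illustrative) that builds a `SchemaDescriptor` and inspects each leaf column's levels and path:

```rust
use std::sync::Arc;
use parquet::schema::{parser::parse_message_type, types::SchemaDescriptor};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Leaf "b.c" has max definition level 2 (b and c are both optional)
    let message = "
        message schema {
            REQUIRED INT32 a;
            OPTIONAL group b {
                OPTIONAL INT64 c;
            }
        }
    ";
    let descr = SchemaDescriptor::new(Arc::new(parse_message_type(message)?));

    // Leaves are stored in DFS order: "a", then "b.c"
    for i in 0..descr.num_columns() {
        let col = descr.column(i);
        println!(
            "{}: max_def={}, max_rep={}",
            col.path(),
            col.max_def_level(),
            col.max_rep_level()
        );
    }
    Ok(())
}
```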
