From 28c1cae83f6a10644a0b92e8b185587b0cd72f9f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 17 May 2024 11:40:41 -0400 Subject: [PATCH] Fix documentation for parquet `parse_metadata`, `decode_metadata` and `decode_footer` (#5781) --- parquet/src/file/footer.rs | 44 ++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 9695dbeae6e1..7a75576c3645 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -27,14 +27,30 @@ use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE, PARQUET_MAGIC}; use crate::schema::types::{self, SchemaDescriptor}; -/// Layout of Parquet file +/// Reads the [ParquetMetaData] from the footer of the parquet file. +/// +/// # Layout of Parquet file +/// ```text /// +---------------------------+-----+---+ /// | Rest of file | B | A | /// +---------------------------+-----+---+ -/// where A: parquet footer, B: parquet metadata. +/// ``` +/// where +/// * `A`: parquet footer which stores the length of the metadata. +/// * `B`: parquet metadata. +/// +/// # I/O +/// +/// This method first reads the last 8 bytes of the file via +/// [`ChunkReader::get_read`] to get the parquet footer which contains the +/// metadata length. +/// +/// It then issues a second `get_read` to read the encoded +/// metadata. /// -/// The reader first reads DEFAULT_FOOTER_SIZE bytes from the end of the file. -/// If it is not enough according to the length indicated in the footer, it reads more bytes. +/// # See Also +/// [`decode_metadata`] for decoding the metadata from the bytes. +/// [`decode_footer`] for decoding the metadata length from the footer. 
pub fn parse_metadata(chunk_reader: &R) -> Result { // check file is large enough to hold footer let file_size = chunk_reader.len(); @@ -65,7 +81,13 @@ pub fn parse_metadata(chunk_reader: &R) -> Result Result { // TODO: row group filtering let mut prot = TCompactSliceInputProtocol::new(buf); @@ -90,7 +112,17 @@ pub fn decode_metadata(buf: &[u8]) -> Result { Ok(ParquetMetaData::new(file_metadata, row_groups)) } -/// Decodes the footer returning the metadata length in bytes +/// Decodes the Parquet footer returning the metadata length in bytes +/// +/// A parquet footer is 8 bytes long and has the following layout: +/// * 4 bytes for the metadata length +/// * 4 bytes for the magic bytes 'PAR1' +/// +/// ```text +/// +-----+--------+ +/// | len | 'PAR1' | +/// +-----+--------+ +/// ``` pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { // check this is indeed a parquet file if slice[4..] != PARQUET_MAGIC {