From 97d8991e8d29c111c18d2b601aa2902fae4d69ae Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 5 Dec 2024 15:22:06 -0500
Subject: [PATCH 01/15] Add ArrowToParquetSchemaConverter, deprecate
 `arrow_to_parquet_schema` et al

---
 parquet/src/arrow/arrow_writer/mod.rs |  23 +++--
 parquet/src/arrow/mod.rs              |   8 +-
 parquet/src/arrow/schema/mod.rs       | 137 ++++++++++++++++++++++----
 parquet/src/file/properties.rs        |  22 ++---
 4 files changed, 146 insertions(+), 44 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 222d86131e0a..cdcf9c3570ec 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -30,12 +30,10 @@ use arrow_array::types::*;
 use arrow_array::{ArrayRef, RecordBatch, RecordBatchWriter};
 use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, IntervalUnit, SchemaRef};
 
-use super::schema::{
-    add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema,
-    arrow_to_parquet_schema_with_root, decimal_length_from_precision,
-};
+use super::schema::{add_encoded_arrow_schema_to_metadata, decimal_length_from_precision};
 
 use crate::arrow::arrow_writer::byte_array::ByteArrayEncoder;
+use crate::arrow::ArrowToParquetSchemaConverter;
 use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter};
 use crate::column::writer::encoder::ColumnValueEncoder;
 use crate::column::writer::{
@@ -181,10 +179,12 @@ impl ArrowWriter {
         options: ArrowWriterOptions,
     ) -> Result<Self> {
         let mut props = options.properties;
-        let schema = match options.schema_root {
-            Some(s) => arrow_to_parquet_schema_with_root(&arrow_schema, &s, props.coerce_types())?,
-            None => arrow_to_parquet_schema(&arrow_schema, props.coerce_types())?,
-        };
+        let mut converter = ArrowToParquetSchemaConverter::new(&arrow_schema)
+            .with_coerce_types(props.coerce_types());
+        if let Some(s) = &options.schema_root {
+            converter = converter.schema_root(s);
+        }
+        let schema = converter.build()?;
         if !options.skip_arrow_metadata {
             // add serialized arrow schema
             add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props);
@@ -538,7 +538,7 @@ impl ArrowColumnChunk {
 /// # use std::sync::Arc;
 /// # use arrow_array::*;
 /// # use arrow_schema::*;
-/// # use parquet::arrow::arrow_to_parquet_schema;
+/// # use parquet::arrow::ArrowToParquetSchemaConverter;
 /// # use parquet::arrow::arrow_writer::{ArrowLeafColumn, compute_leaves, get_column_writers};
 /// # use parquet::file::properties::WriterProperties;
 /// # use parquet::file::writer::SerializedFileWriter;
@@ -550,7 +550,10 @@ impl ArrowColumnChunk {
 ///
 /// // Compute the parquet schema
 /// let props = Arc::new(WriterProperties::default());
-/// let parquet_schema = arrow_to_parquet_schema(schema.as_ref(), props.coerce_types()).unwrap();
+/// let parquet_schema = ArrowToParquetSchemaConverter::new(schema.as_ref())
+///     .with_coerce_types(props.coerce_types())
+///     .build()
+///     .unwrap();
 ///
 /// // Create writers for each of the leaf columns
 /// let col_writers = get_column_writers(&parquet_schema, &props, &schema).unwrap();
diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index 2d09cd19203f..df8e2d74c026 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -116,9 +116,13 @@ pub use self::async_writer::AsyncArrowWriter;
 use crate::schema::types::SchemaDescriptor;
 use arrow_schema::{FieldRef, Schema};
 
+// continue to until functions are removed
+#[allow(deprecated)]
+pub use self::schema::arrow_to_parquet_schema;
+
 pub use self::schema::{
-    arrow_to_parquet_schema, parquet_to_arrow_field_levels, parquet_to_arrow_schema,
-    parquet_to_arrow_schema_by_columns, FieldLevels,
+    parquet_to_arrow_field_levels, parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns,
+    ArrowToParquetSchemaConverter, FieldLevels,
 };
 
 /// Schema metadata key used to store serialized Arrow IPC schema
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index ec34840d858f..7947e77894fa 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -225,29 +225,121 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
     }
 }
 
+/// Converter for arrow schema to parquet schema
+///
+/// Example:
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow_schema::{Field, Schema, DataType};
+/// # use parquet::arrow::ArrowToParquetSchemaConverter;
+/// use parquet::schema::types::{SchemaDescriptor, Type};
+/// use parquet::basic; // note there are two `Type`s in the following example
+/// let arrow_schema = Schema::new(vec![
+///     Field::new("a", DataType::Int64, true),
+///     Field::new("b", DataType::Date32, true),
+/// ]);
+///
+/// let parquet_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+///     .build()
+///     .unwrap();
+/// //
+/// let expected_parquet_schema = SchemaDescriptor::new(
+///     Arc::new(
+///         Type::group_type_builder("arrow_schema")
+///             .with_fields(vec![
+///                 Arc::new(
+///                     Type::primitive_type_builder("a", basic::Type::INT64)
+///                         .build().unwrap()
+///                 ),
+///                 Arc::new(
+///                     Type::primitive_type_builder("b", basic::Type::INT32)
+///                         .with_converted_type(basic::ConvertedType::DATE)
+///                         .with_logical_type(Some(basic::LogicalType::Date))
+///                         .build().unwrap()
+///                 ),
+///             ])
+///             .build().unwrap()
+///     )
+/// );
+///
+/// assert_eq!(parquet_schema, expected_parquet_schema);
+/// ```
+#[derive(Debug)]
+pub struct ArrowToParquetSchemaConverter<'a> {
+    /// The schema to convert
+    schema: &'a Schema,
+    /// Name of the root schema in Parquet
+    schema_root: &'a str,
+    /// Should we Coerce arrow types to compatible Parquet types?
+    ///
+    /// See docs on [Self::with_coerce_types]`
+    coerce_types: bool
+}
+
+impl <'a> ArrowToParquetSchemaConverter<'a> {
+    /// Create a new converter
+    pub fn new(schema: &'a Schema) -> Self {
+        Self {
+            schema,
+            schema_root: "arrow_schema",
+            coerce_types: false,
+        }
+    }
+
+    /// Should arrow types be coerced into parquet native types (default false).
+    ///
+    /// Setting this option to `true` will result in parquet files that can be
+    /// read by more readers, but may lose precision for arrow types such as
+    /// [`DataType::Date64`] which have no direct corresponding Parquet type.
+    ///
+    /// # Discussion
+    ///
+    /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
+    /// corresponding Parquet logical type. Thus, they can not be losslessly
+    /// round-tripped when stored using the appropriate Parquet logical type.
+    ///
+    /// For example, some Date64 values may be truncated when stored with
+    /// parquet's native 32 bit date type.
+    ///
+    /// By default, the arrow writer does not coerce to native parquet types. It
+    /// writes data in such a way that it can be lossless round tripped.
+    /// However, this means downstream readers must be aware of and correctly
+    /// interpret the embedded Arrow schema.
+    pub fn with_coerce_types(mut self, coerce_types: bool) -> Self {
+        self.coerce_types = coerce_types;
+        self
+    }
+
+    /// Set the root schema element name (defaults to `"arrow_schema"`).
+    pub fn schema_root(mut self, schema_root: &'a str) -> Self {
+        self.schema_root = schema_root;
+        self
+    }
+
+    /// Build the desired parquet [`SchemaDescriptor`]
+    pub fn build(self) -> Result<SchemaDescriptor> {
+        let Self { schema, schema_root: root_schema_name, coerce_types } = self;
+        let fields = schema
+            .fields()
+            .iter()
+            .map(|field| arrow_to_parquet_type(field, coerce_types).map(Arc::new))
+            .collect::<Result<_>>()?;
+        let group = Type::group_type_builder(root_schema_name).with_fields(fields).build()?;
+        Ok(SchemaDescriptor::new(Arc::new(group)))
+    }
+}
+
 /// Convert arrow schema to parquet schema
 ///
 /// The name of the root schema element defaults to `"arrow_schema"`, this can be
-/// overridden with [`arrow_to_parquet_schema_with_root`]
-pub fn arrow_to_parquet_schema(schema: &Schema, coerce_types: bool) -> Result<SchemaDescriptor> {
-    arrow_to_parquet_schema_with_root(schema, "arrow_schema", coerce_types)
-}
+/// overridden with [`ArrowToParquetSchemaConverter`]
+#[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter` instead")]
+pub fn arrow_to_parquet_schema(schema: &Schema) -> Result<SchemaDescriptor> {
 
-/// Convert arrow schema to parquet schema specifying the name of the root schema element
-pub fn arrow_to_parquet_schema_with_root(
-    schema: &Schema,
-    root: &str,
-    coerce_types: bool,
-) -> Result<SchemaDescriptor> {
-    let fields = schema
-        .fields()
-        .iter()
-        .map(|field| arrow_to_parquet_type(field, coerce_types).map(Arc::new))
-        .collect::<Result<_>>()?;
-    let group = Type::group_type_builder(root).with_fields(fields).build()?;
-    Ok(SchemaDescriptor::new(Arc::new(group)))
+    ArrowToParquetSchemaConverter::new(schema).build()
 }
 
+
 fn parse_key_value_metadata(
     key_value_metadata: Option<&Vec<KeyValue>>,
 ) -> Option<HashMap<String, String>> {
@@ -1569,7 +1661,7 @@ mod tests {
             Field::new("decimal256", DataType::Decimal256(39, 2), false),
         ];
         let arrow_schema = Schema::new(arrow_fields);
-        let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, false).unwrap();
+        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema).build().unwrap();
 
         assert_eq!(
             parquet_schema.columns().len(),
@@ -1606,9 +1698,10 @@ mod tests {
             false,
         )];
         let arrow_schema = Schema::new(arrow_fields);
-        let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, true);
+        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+            .with_coerce_types(true)
+            .build();
 
-        assert!(converted_arrow_schema.is_err());
         converted_arrow_schema.unwrap();
     }
 
@@ -1878,7 +1971,9 @@ mod tests {
         // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
         let arrow_schema = crate::arrow::parquet_to_arrow_schema(&schema_descriptor, None)?;
 
-        let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema, true)?;
+        let parq_schema_descr = crate::arrow::ArrowToParquetSchemaConverter::new(&arrow_schema)
+            .with_coerce_types(true)
+            .build()?;
         let parq_fields = parq_schema_descr.root_schema().get_fields();
         assert_eq!(parq_fields.len(), 2);
         assert_eq!(parq_fields[0].get_basic_info().id(), 1);
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 1e8a4868dfc3..d3b4459843db 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -16,14 +16,13 @@
 // under the License.
 
 //! Configuration via [`WriterProperties`] and [`ReaderProperties`]
-use std::str::FromStr;
-use std::{collections::HashMap, sync::Arc};
-
 use crate::basic::{Compression, Encoding};
 use crate::compression::{CodecOptions, CodecOptionsBuilder};
 use crate::file::metadata::KeyValue;
 use crate::format::SortingColumn;
 use crate::schema::types::ColumnPath;
+use std::str::FromStr;
+use std::{collections::HashMap, sync::Arc};
 
 /// Default value for [`WriterProperties::data_page_size_limit`]
 pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
@@ -287,15 +286,16 @@ impl WriterProperties {
         self.statistics_truncate_length
     }
 
-    /// Returns `coerce_types` boolean
+    /// Should the writer coerce types to parquet native types.
+    ///
+    /// Setting this option to `true` will result in parquet files that can be
+    /// read by more readers, but may lose precision for arrow types such as
+    /// [`DataType::Date64`] which have no direct corresponding Parquet type.
+    ///
+    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
     ///
-    /// Some Arrow types do not have a corresponding Parquet logical type.
-    /// Affected Arrow data types include `Date64`, `Timestamp` and `Interval`.
-    /// Writers have the option to coerce these into native Parquet types. Type
-    /// coercion allows for meaningful representations that do not require
-    /// downstream readers to consider the embedded Arrow schema. However, type
-    /// coercion also prevents the data from being losslessly round-tripped. This method
-    /// returns `true` if type coercion enabled.
+    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
+    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowToParquetSchemaConverter::with_coerce_types
     pub fn coerce_types(&self) -> bool {
         self.coerce_types
     }

From af744292f8e94df5015a7516e6e32e3609eacf45 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 5 Dec 2024 17:57:53 -0500
Subject: [PATCH 02/15] Fmt

---
 parquet/src/arrow/schema/mod.rs | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 7947e77894fa..f49b3eeaa0f9 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -273,10 +273,10 @@ pub struct ArrowToParquetSchemaConverter<'a> {
     /// Should we Coerce arrow types to compatible Parquet types?
     ///
     /// See docs on [Self::with_coerce_types]`
-    coerce_types: bool
+    coerce_types: bool,
 }
 
-impl <'a> ArrowToParquetSchemaConverter<'a> {
+impl<'a> ArrowToParquetSchemaConverter<'a> {
     /// Create a new converter
     pub fn new(schema: &'a Schema) -> Self {
         Self {
@@ -318,13 +318,19 @@ impl <'a> ArrowToParquetSchemaConverter<'a> {
 
     /// Build the desired parquet [`SchemaDescriptor`]
     pub fn build(self) -> Result<SchemaDescriptor> {
-        let Self { schema, schema_root: root_schema_name, coerce_types } = self;
+        let Self {
+            schema,
+            schema_root: root_schema_name,
+            coerce_types,
+        } = self;
         let fields = schema
             .fields()
             .iter()
             .map(|field| arrow_to_parquet_type(field, coerce_types).map(Arc::new))
             .collect::<Result<_>>()?;
-        let group = Type::group_type_builder(root_schema_name).with_fields(fields).build()?;
+        let group = Type::group_type_builder(root_schema_name)
+            .with_fields(fields)
+            .build()?;
         Ok(SchemaDescriptor::new(Arc::new(group)))
     }
 }
@@ -335,11 +341,9 @@ impl <'a> ArrowToParquetSchemaConverter<'a> {
 /// overridden with [`ArrowToParquetSchemaConverter`]
 #[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter` instead")]
 pub fn arrow_to_parquet_schema(schema: &Schema) -> Result<SchemaDescriptor> {
-
     ArrowToParquetSchemaConverter::new(schema).build()
 }
 
-
 fn parse_key_value_metadata(
     key_value_metadata: Option<&Vec<KeyValue>>,
 ) -> Option<HashMap<String, String>> {
@@ -1661,7 +1665,9 @@ mod tests {
             Field::new("decimal256", DataType::Decimal256(39, 2), false),
         ];
         let arrow_schema = Schema::new(arrow_fields);
-        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema).build().unwrap();
+        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+            .build()
+            .unwrap();
 
         assert_eq!(
             parquet_schema.columns().len(),

From b519da41f5ac9416d7cdcf403f62638d0944bf2b Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 5 Dec 2024 18:14:10 -0500
Subject: [PATCH 03/15] update test

---
 parquet/src/arrow/schema/mod.rs | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 8c3a9bd57e64..e3fb23e5dff1 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -1590,7 +1590,10 @@ mod tests {
         ";
         let parquet_group_type = parse_message_type(message_type).unwrap();
         let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type));
-        let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, true).unwrap();
+        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+            .with_coerce_types(true)
+            .build()
+            .unwrap();
         assert_eq!(
             parquet_schema.columns().len(),
             converted_arrow_schema.columns().len()
@@ -1614,7 +1617,10 @@ mod tests {
         ";
         let parquet_group_type = parse_message_type(message_type).unwrap();
         let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type));
-        let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, false).unwrap();
+        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+            .with_coerce_types(false)
+            .build()
+            .unwrap();
         assert_eq!(
             parquet_schema.columns().len(),
             converted_arrow_schema.columns().len()

From 40ce6224e4536b04d7edca572a13af06f43a00fb Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 6 Dec 2024 17:13:49 -0500
Subject: [PATCH 04/15] Update parquet/src/arrow/schema/mod.rs

Co-authored-by: Ed Seidl

---
 parquet/src/arrow/schema/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index e3fb23e5dff1..51639a988724 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -242,7 +242,7 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
 /// let parquet_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
 ///     .build()
 ///     .unwrap();
-/// //
+///
 /// let expected_parquet_schema = SchemaDescriptor::new(
 ///     Arc::new(
 ///         Type::group_type_builder("arrow_schema")

From 1535b42d4d5768107af31d1e7bc9a9744fbdf1c9 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 6 Dec 2024 17:15:37 -0500
Subject: [PATCH 05/15] Apply suggestions from code review

Co-authored-by: Ed Seidl

---
 parquet/src/arrow/schema/mod.rs | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 51639a988724..fee7e37f072b 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -286,13 +286,13 @@ impl<'a> ArrowToParquetSchemaConverter<'a> {
         }
     }
 
-    /// Should arrow types be coerced into parquet native types (default false).
+    /// Should arrow types be coerced into parquet native types (default `false`).
     ///
     /// Setting this option to `true` will result in parquet files that can be
     /// read by more readers, but may lose precision for arrow types such as
     /// [`DataType::Date64`] which have no direct corresponding Parquet type.
     ///
-    /// By default, does not coerce to native parquet types. Enabling type
+    /// By default, this converter does not coerce to native parquet types. Enabling type
     /// coercion allows for meaningful representations that do not require
     /// downstream readers to consider the embedded Arrow schema, and can allow
     /// for greater compatibility with other Parquet implementations. However,
@@ -303,11 +303,14 @@ impl<'a> ArrowToParquetSchemaConverter<'a> {
     /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
     /// corresponding Parquet logical type. Thus, they can not be losslessly
    /// round-tripped when stored using the appropriate Parquet logical type.
-    ///
     /// For example, some Date64 values may be truncated when stored with
-    /// parquet's native 32 bit date type. For [`List`] and [`Map`] types, some
+    /// parquet's native 32 bit date type.
+    ///
+    /// For [`List`] and [`Map`] types, some
     /// Parquet readers expect certain schema elements to have specific names
-    /// (earlier versions of the spec was somewhat ambiguous on this point).
+    /// (earlier versions of the spec were somewhat ambiguous on this point).
+    /// Type coercion will use the names prescribed by the Parquet specification,
+    /// potentially losing naming metadata from the Arrow schema.
     ///
     /// [`List`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
     /// [`Map`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps

From cec4f8d758110a3cc37b7d68b46afb3a180ab69b Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 6 Dec 2024 17:15:51 -0500
Subject: [PATCH 06/15] Improve comments

---
 parquet/src/arrow/mod.rs        | 2 +-
 parquet/src/arrow/schema/mod.rs | 8 +-------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index df8e2d74c026..fdbd9e29f3b7 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -116,7 +116,7 @@ pub use self::async_writer::AsyncArrowWriter;
 use crate::schema::types::SchemaDescriptor;
 use arrow_schema::{FieldRef, Schema};
 
-// continue to until functions are removed
+// continue to export deprecated methods until they are removed
 #[allow(deprecated)]
 pub use self::schema::arrow_to_parquet_schema;
 
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index e3fb23e5dff1..1cd33032aa0e 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -15,13 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Provides API for converting parquet schema to arrow schema and vice versa.
-//!
-//! The main interfaces for converting parquet schema to arrow schema are
-//! `parquet_to_arrow_schema`, `parquet_to_arrow_schema_by_columns` and
-//! `parquet_to_arrow_field`.
-//!
-//! The interfaces for converting arrow schema to parquet schema is coming.
+//! Converting Parquet schema <--> Arrow schema: [`ArrowToParquetSchemaConverter`] and [parquet_to_arrow_schema]
 
 use base64::prelude::BASE64_STANDARD;
 use base64::Engine;

From e7b7d20eb3bdbd1a39291d8dfe7fc31935b471bb Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 6 Dec 2024 17:19:51 -0500
Subject: [PATCH 07/15] Add more detail to WriterPropertiesBuilder docs

---
 parquet/src/file/properties.rs | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index a1a5c98bf5a6..7010ca886023 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -781,9 +781,17 @@ impl WriterPropertiesBuilder {
 
     /// Should the writer coerce types to parquet native types (defaults to `false`).
     ///
+    /// Leaving this option the default `false` will ensure the exact same data
+    /// written to parquet using this library will be read.
+    ///
     /// Setting this option to `true` will result in parquet files that can be
-    /// read by more readers, but may lose precision for arrow types such as
-    /// [`DataType::Date64`] which have no direct corresponding Parquet type.
+    /// read by more readers, but potentially lose information in the process.
+    ///
+    /// * Types such as [`DataType::Date64`], which have no direct corresponding
+    ///   Parquet type, may be stored with lower precision.
+    ///
+    /// * The internal field names of [`List`] and [`Map`] will be renamed if
+    ///   necessary to match what is required by the newest Parquet specification.
     ///
     /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
     ///
     /// [`DataType::Date64`]: arrow_schema::DataType::Date64

From 4e8c4d695145c9e43f730fb7449d12bbad015057 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Sat, 7 Dec 2024 07:20:02 -0500
Subject: [PATCH 08/15] Update parquet/src/file/properties.rs

Co-authored-by: Ed Seidl

---
 parquet/src/file/properties.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 7010ca886023..970c18ca482b 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -790,7 +790,7 @@ impl WriterPropertiesBuilder {
     /// * Types such as [`DataType::Date64`], which have no direct corresponding
     ///   Parquet type, may be stored with lower precision.
     ///
-    /// * The internal field names of [`List`] and [`Map`] will be renamed if
+    /// * The internal field names of `List` and `Map` types will be renamed if
     ///   necessary to match what is required by the newest Parquet specification.
     ///
     /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details

From 3ffd4d42a8619f8ab8e648cd348de0a24e7ac12e Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Sat, 7 Dec 2024 07:21:19 -0500
Subject: [PATCH 09/15] Fix some more capitalization and add a link to Parquet
 date spec

---
 parquet/src/arrow/schema/mod.rs | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index c13f3cc9fdf9..c859e14e30d2 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -264,7 +264,7 @@ pub struct ArrowToParquetSchemaConverter<'a> {
     schema: &'a Schema,
     /// Name of the root schema in Parquet
     schema_root: &'a str,
-    /// Should we Coerce arrow types to compatible Parquet types?
+    /// Should we coerce Arrow types to compatible Parquet types?
     ///
     /// See docs on [Self::with_coerce_types]`
     coerce_types: bool,
 }
@@ -280,13 +280,13 @@ impl<'a> ArrowToParquetSchemaConverter<'a> {
         }
     }
 
-    /// Should arrow types be coerced into parquet native types (default `false`).
+    /// Should Arrow types be coerced into Parquet native types (default `false`).
     ///
-    /// Setting this option to `true` will result in parquet files that can be
-    /// read by more readers, but may lose precision for arrow types such as
-    /// [`DataType::Date64`] which have no direct corresponding Parquet type.
+    /// Setting this option to `true` will result in Parquet files that can be
+    /// read by more readers, but may lose precision for Arrow types such as
+    /// [`DataType::Date64`] which have no direct [corresponding Parquet type].
     ///
-    /// By default, this converter does not coerce to native parquet types. Enabling type
+    /// By default, this converter does not coerce to native Parquet types. Enabling type
     /// coercion allows for meaningful representations that do not require
     /// downstream readers to consider the embedded Arrow schema, and can allow
     /// for greater compatibility with other Parquet implementations. However,
@@ -300,14 +300,15 @@ impl<'a> ArrowToParquetSchemaConverter<'a> {
     /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
     /// corresponding Parquet logical type. Thus, they can not be losslessly
     /// round-tripped when stored using the appropriate Parquet logical type.
     /// For example, some Date64 values may be truncated when stored with
     /// parquet's native 32 bit date type.
     ///
-    /// For [`List`] and [`Map`] types, some
-    /// Parquet readers expect certain schema elements to have specific names
-    /// (earlier versions of the spec were somewhat ambiguous on this point).
-    /// Type coercion will use the names prescribed by the Parquet specification,
-    /// potentially losing naming metadata from the Arrow schema.
+    /// For [`List`] and [`Map`] types, some Parquet readers expect certain
+    /// schema elements to have specific names (earlier versions of the spec
+    /// were somewhat ambiguous on this point). Type coercion will use the names
+    /// prescribed by the Parquet specification, potentially losing naming
+    /// metadata from the Arrow schema.
     ///
     /// [`List`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
     /// [`Map`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps
+    /// [corresponding Parquet type]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date
     ///
     pub fn with_coerce_types(mut self, coerce_types: bool) -> Self {
         self.coerce_types = coerce_types;
         self

From bd4e2d5135e3e4fe2d3dc61e043659243f4423bb Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 16 Dec 2024 15:44:01 -0500
Subject: [PATCH 10/15] Update parquet/src/arrow/schema/mod.rs

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>

---
 parquet/src/arrow/schema/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index c859e14e30d2..bdb1d800c6aa 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -316,7 +316,7 @@ impl<'a> ArrowToParquetSchemaConverter<'a> {
     }
 
     /// Set the root schema element name (defaults to `"arrow_schema"`).
-    pub fn schema_root(mut self, schema_root: &'a str) -> Self {
+    pub fn with_root(mut self, schema_root: &'a str) -> Self {
         self.schema_root = schema_root;
         self
     }

From 17267ec0226d04fd16228c5d81829ba654b78c96 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 16 Dec 2024 15:51:52 -0500
Subject: [PATCH 11/15] Revert "Update parquet/src/arrow/schema/mod.rs"

This reverts commit bd4e2d5135e3e4fe2d3dc61e043659243f4423bb.

---
 parquet/src/arrow/schema/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 92b0ebe559fb..44e6c7c9e15c 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -317,7 +317,7 @@ impl<'a> ArrowToParquetSchemaConverter<'a> {
     }
 
     /// Set the root schema element name (defaults to `"arrow_schema"`).
-    pub fn with_root(mut self, schema_root: &'a str) -> Self {
+    pub fn schema_root(mut self, schema_root: &'a str) -> Self {
         self.schema_root = schema_root;
         self
     }

From 8892810aff2900c29754424ad4542a7c26b870a5 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 16 Dec 2024 15:55:20 -0500
Subject: [PATCH 12/15] rename to ArrowSchemaConverter

---
 parquet/src/arrow/arrow_writer/mod.rs | 18 +++++++++---------
 parquet/src/arrow/mod.rs              |  2 +-
 parquet/src/arrow/schema/mod.rs       | 24 ++++++++++++------------
 parquet/src/file/properties.rs        |  2 +-
 4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 6c3d62a1a299..85febd42b076 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -33,7 +33,7 @@ use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, IntervalUnit, S
 use super::schema::{add_encoded_arrow_schema_to_metadata, decimal_length_from_precision};
 
 use crate::arrow::arrow_writer::byte_array::ByteArrayEncoder;
-use crate::arrow::ArrowToParquetSchemaConverter;
+use crate::arrow::ArrowSchemaConverter;
 use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter};
 use crate::column::writer::encoder::ColumnValueEncoder;
 use crate::column::writer::{
@@ -179,10 +179,10 @@ impl ArrowWriter {
         options: ArrowWriterOptions,
     ) -> Result<Self> {
         let mut props = options.properties;
-        let mut converter = ArrowToParquetSchemaConverter::new(&arrow_schema)
-            .with_coerce_types(props.coerce_types());
-        if let Some(s) = &options.schema_root {
-            converter = converter.schema_root(s);
+        let mut converter =
+            ArrowSchemaConverter::new(&arrow_schema).with_coerce_types(props.coerce_types());
+        if let Some(schema_root) = &options.schema_root {
+            converter = converter.schema_root(schema_root);
         }
         let schema = converter.build()?;
         if !options.skip_arrow_metadata {
@@ -390,9 +390,9 @@ impl ArrowWriterOptions {
     }
 
     /// Set the name of the root parquet schema element (defaults to `"arrow_schema"`)
-    pub fn with_schema_root(self, name: String) -> Self {
+    pub fn with_schema_root(self, schema_root: String) -> Self {
         Self {
-            schema_root: Some(name),
+            schema_root: Some(schema_root),
             ..self
         }
     }
@@ -538,7 +538,7 @@ impl ArrowColumnChunk {
 /// # use std::sync::Arc;
 /// # use arrow_array::*;
 /// # use arrow_schema::*;
-/// # use parquet::arrow::ArrowToParquetSchemaConverter;
+/// # use parquet::arrow::ArrowSchemaConverter;
 /// # use parquet::arrow::arrow_writer::{ArrowLeafColumn, compute_leaves, get_column_writers};
 /// # use parquet::file::properties::WriterProperties;
 /// # use parquet::file::writer::SerializedFileWriter;
@@ -550,7 +550,7 @@ impl ArrowColumnChunk {
 ///
 /// // Compute the parquet schema
 /// let props = Arc::new(WriterProperties::default());
-/// let parquet_schema = ArrowToParquetSchemaConverter::new(schema.as_ref())
+/// let parquet_schema = ArrowSchemaConverter::new(schema.as_ref())
 ///     .with_coerce_types(props.coerce_types())
 ///     .build()
 ///     .unwrap();
diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index fdbd9e29f3b7..d77436bc1ff7 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -122,7 +122,7 @@ pub use self::schema::arrow_to_parquet_schema;
 
 pub use self::schema::{
     parquet_to_arrow_field_levels, parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns,
-    ArrowToParquetSchemaConverter, FieldLevels,
+    ArrowSchemaConverter, FieldLevels,
 };
 
 /// Schema metadata key used to store serialized Arrow IPC schema
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 44e6c7c9e15c..d1f3375b345f 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Converting Parquet schema <--> Arrow schema: [`ArrowToParquetSchemaConverter`] and [parquet_to_arrow_schema]
+//! Converting Parquet schema <--> Arrow schema: [`ArrowSchemaConverter`] and [parquet_to_arrow_schema]
 
 use base64::prelude::BASE64_STANDARD;
 use base64::Engine;
@@ -226,7 +226,7 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
 /// ```
 /// # use std::sync::Arc;
 /// # use arrow_schema::{Field, Schema, DataType};
-/// # use parquet::arrow::ArrowToParquetSchemaConverter;
+/// # use parquet::arrow::ArrowSchemaConverter;
 /// use parquet::schema::types::{SchemaDescriptor, Type};
 /// use parquet::basic; // note there are two `Type`s in the following example
 /// let arrow_schema = Schema::new(vec![
@@ -234,7 +234,7 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
 ///     Field::new("b", DataType::Date32, true),
 /// ]);
 ///
-/// let parquet_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+/// let parquet_schema = ArrowSchemaConverter::new(&arrow_schema)
 ///     .build()
 ///     .unwrap();
 ///
@@ -260,7 +260,7 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
 /// assert_eq!(parquet_schema, expected_parquet_schema);
 /// ```
 #[derive(Debug)]
-pub struct ArrowToParquetSchemaConverter<'a> {
+pub struct ArrowSchemaConverter<'a> {
     /// The schema to convert
     schema: &'a Schema,
     /// Name of the root schema in Parquet
@@ -271,7 +271,7 @@ pub struct ArrowToParquetSchemaConverter<'a> {
     coerce_types: bool,
 }
 
-impl<'a> ArrowToParquetSchemaConverter<'a> {
+impl<'a> ArrowSchemaConverter<'a> {
     /// Create a new converter
     pub fn new(schema: &'a Schema) -> Self {
         Self {
@@ -344,10 +344,10 @@ impl<'a> ArrowToParquetSchemaConverter<'a> {
 /// Convert arrow schema to parquet schema
 ///
 /// The name of the root schema element defaults to `"arrow_schema"`, this can be
-/// overridden with [`ArrowToParquetSchemaConverter`]
+/// overridden with [`ArrowSchemaConverter`]
 #[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter` instead")]
 pub fn arrow_to_parquet_schema(schema: &Schema) -> Result<SchemaDescriptor> {
-    ArrowToParquetSchemaConverter::new(schema).build()
+    ArrowSchemaConverter::new(schema).build()
 }
 
 fn parse_key_value_metadata(
@@ -1589,7 +1589,7 @@ mod tests {
         ";
         let parquet_group_type = parse_message_type(message_type).unwrap();
         let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type));
-        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+        let converted_arrow_schema = ArrowSchemaConverter::new(&arrow_schema)
             .with_coerce_types(true)
             .build()
             .unwrap();
@@ -1616,7 +1616,7 @@ mod tests {
         ";
         let parquet_group_type = parse_message_type(message_type).unwrap();
         let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type));
-        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+        let converted_arrow_schema = ArrowSchemaConverter::new(&arrow_schema)
             .with_coerce_types(false)
             .build()
             .unwrap();
@@ -1775,7 +1775,7 @@ mod tests {
             Field::new("decimal256", DataType::Decimal256(39, 2), false),
         ];
         let arrow_schema = Schema::new(arrow_fields);
-        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+        let converted_arrow_schema = ArrowSchemaConverter::new(&arrow_schema)
             .build()
             .unwrap();
 
         assert_eq!(
@@ -1814,7 +1814,7 @@ mod tests {
             false,
         )];
         let arrow_schema = Schema::new(arrow_fields);
-        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+        let converted_arrow_schema = ArrowSchemaConverter::new(&arrow_schema)
             .with_coerce_types(true)
             .build();
 
@@ -2088,7 +2088,7 @@ mod tests {
         // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
         let arrow_schema = crate::arrow::parquet_to_arrow_schema(&schema_descriptor, None)?;
 
-        let parq_schema_descr = crate::arrow::ArrowToParquetSchemaConverter::new(&arrow_schema)
+        let parq_schema_descr = crate::arrow::ArrowSchemaConverter::new(&arrow_schema)
             .with_coerce_types(true)
             .build()?;
         let parq_fields = parq_schema_descr.root_schema().get_fields();
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 970c18ca482b..7b688333e540 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -796,7 +796,7 @@ impl WriterPropertiesBuilder {
     /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
     ///
     /// [`DataType::Date64`]: arrow_schema::DataType::Date64
-    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowToParquetSchemaConverter::with_coerce_types
+    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
     pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
         self.coerce_types = coerce_types;
         self

From 0ab4eb4ccc149a4ab27faa6ad6a0273bab72c3ef Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 16 Dec 2024 16:00:48 -0500
Subject: [PATCH 13/15] change from build --> convert

---
 parquet/src/arrow/arrow_writer/mod.rs |  5 +--
 parquet/src/arrow/schema/mod.rs       | 58 +++++++++++++--------------
 2 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 85febd42b076..31993966bbdb 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -179,12 +179,11 @@ impl ArrowWriter {
         options: ArrowWriterOptions,
     ) -> Result<Self> {
         let mut props = options.properties;
-        let mut converter =
-            ArrowSchemaConverter::new(&arrow_schema).with_coerce_types(props.coerce_types());
+        let mut converter = ArrowSchemaConverter::new().with_coerce_types(props.coerce_types());
         if let Some(schema_root) = &options.schema_root {
             converter = converter.schema_root(schema_root);
         }
-        let schema = converter.build()?;
+        let schema = converter.convert(&arrow_schema)?;
         if !options.skip_arrow_metadata {
             // add serialized arrow schema
             add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props);
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index d1f3375b345f..d190220d9e92 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -220,7 +220,7 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
     }
 }
 
-/// Converter for arrow schema to parquet schema
+/// Converter for Arrow schema to Parquet schema
 ///
 /// Example:
 /// ```
@@ -229,13 +229,14 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
 /// # use parquet::arrow::ArrowSchemaConverter;
 /// use parquet::schema::types::{SchemaDescriptor, Type};
 /// use parquet::basic; // note there are two `Type`s in the following example
+/// // create an Arrow Schema
 /// let arrow_schema = Schema::new(vec![
 ///     Field::new("a", DataType::Int64, true),
 ///     Field::new("b", DataType::Date32, true),
 /// ]);
-///
+/// // convert the Arrow schema to a Parquet schema
-/// let parquet_schema = ArrowSchemaConverter::new(&arrow_schema)
-///     .build()
+/// let parquet_schema = ArrowSchemaConverter::new()
+///     .convert(&arrow_schema)
 ///     .unwrap();
 ///
@@ -256,13 +257,10 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
 ///             .build().unwrap()
 ///     )
 /// );
-///
 /// assert_eq!(parquet_schema, expected_parquet_schema);
 /// ```
 #[derive(Debug)]
 pub struct ArrowSchemaConverter<'a> {
-    /// The schema to convert
-    schema: &'a Schema,
     /// Name of the root schema in Parquet
     schema_root: &'a str,
     /// Should we coerce Arrow types to compatible Parquet types?
@@ -271,11 +269,16 @@ pub struct ArrowSchemaConverter<'a> {
     coerce_types: bool,
 }
 
+impl Default for ArrowSchemaConverter<'_> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<'a> ArrowSchemaConverter<'a> {
     /// Create a new converter
-    pub fn new(schema: &'a Schema) -> Self {
+    pub fn new() -> Self {
         Self {
-            schema,
             schema_root: "arrow_schema",
             coerce_types: false,
         }
@@ -322,19 +325,16 @@ impl<'a> ArrowSchemaConverter<'a> {
         self
     }
 
-    /// Build the desired parquet [`SchemaDescriptor`]
-    pub fn build(self) -> Result<SchemaDescriptor> {
-        let Self {
-            schema,
-            schema_root: root_schema_name,
-            coerce_types,
-        } = self;
+    /// Convert the specified Arrow [`Schema`] to the desired Parquet [`SchemaDescriptor`]
+    ///
+    /// See example in [`ArrowSchemaConverter`]
+    pub fn convert(&self, schema: &Schema) -> Result<SchemaDescriptor> {
         let fields = schema
             .fields()
             .iter()
-            .map(|field| arrow_to_parquet_type(field, coerce_types).map(Arc::new))
+            .map(|field| arrow_to_parquet_type(field, self.coerce_types).map(Arc::new))
             .collect::<Result<_>>()?;
-        let group = Type::group_type_builder(root_schema_name)
+        let group = Type::group_type_builder(self.schema_root)
             .with_fields(fields)
             .build()?;
         Ok(SchemaDescriptor::new(Arc::new(group)))
     }
 }
 
 /// Convert arrow schema to parquet schema
 ///
 /// The name of the root schema element defaults to `"arrow_schema"`, this can be
 /// overridden with [`ArrowSchemaConverter`]
 #[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter` instead")]
 pub fn arrow_to_parquet_schema(schema: &Schema) -> Result<SchemaDescriptor> {
-    ArrowSchemaConverter::new(schema).build()
+    ArrowSchemaConverter::new().convert(schema)
 }
 
 fn parse_key_value_metadata(
@@ -1589,7 +1589,7 @@ mod tests {
         ";
         let parquet_group_type = parse_message_type(message_type).unwrap();
         let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type));
-        let converted_arrow_schema = ArrowSchemaConverter::new(&arrow_schema)
+        let converted_arrow_schema = ArrowSchemaConverter::new()
             .with_coerce_types(true)
-            .build()
+            .convert(&arrow_schema)
             .unwrap();
         assert_eq!(
             parquet_schema.columns().len(),
@@ -1616,7 +1616,7 @@ mod tests {
         ";
         let parquet_group_type = parse_message_type(message_type).unwrap();
         let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type));
-        let converted_arrow_schema = ArrowSchemaConverter::new(&arrow_schema)
+        let converted_arrow_schema = ArrowSchemaConverter::new()
             .with_coerce_types(false)
-            .build()
+            .convert(&arrow_schema)
             .unwrap();
         assert_eq!(
             parquet_schema.columns().len(),
@@ -1775,7 +1775,7 @@ mod tests {
             Field::new("decimal256", DataType::Decimal256(39, 2), false),
         ];
         let arrow_schema = Schema::new(arrow_fields);
-        let converted_arrow_schema = ArrowSchemaConverter::new(&arrow_schema)
-            .build()
+        let converted_arrow_schema = ArrowSchemaConverter::new()
+            .convert(&arrow_schema)
             .unwrap();
 
         assert_eq!(
@@ -1814,7 +1814,7 @@ mod tests {
             false,
         )];
         let arrow_schema = Schema::new(arrow_fields);
-        let converted_arrow_schema = ArrowSchemaConverter::new(&arrow_schema)
+        let converted_arrow_schema = ArrowSchemaConverter::new()
             .with_coerce_types(true)
-            .build();
+            .convert(&arrow_schema);
 
         converted_arrow_schema.unwrap();
     }
 
@@ -2088,7 +2088,7 @@ mod tests {
         // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
         let arrow_schema = crate::arrow::parquet_to_arrow_schema(&schema_descriptor, None)?;
 
-        let parq_schema_descr = crate::arrow::ArrowSchemaConverter::new(&arrow_schema)
+        let parq_schema_descr = ArrowSchemaConverter::new()
             .with_coerce_types(true)
-            .build()?;
+            .convert(&arrow_schema)?;
         let parq_fields = parq_schema_descr.root_schema().get_fields();
         assert_eq!(parq_fields.len(), 2);
         assert_eq!(parq_fields[0].get_basic_info().id(), 1);

From df3072cddcda3b045088c066d2fa24077a62c272 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 16 Dec 2024 16:16:56 -0500
Subject: [PATCH 14/15] update doc

---
 parquet/src/arrow/arrow_writer/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 31993966bbdb..871b140768cb 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -549,9 +549,9 @@ impl ArrowColumnChunk {
 ///
 /// // Compute the parquet schema
 /// let props = Arc::new(WriterProperties::default());
-/// let parquet_schema = ArrowSchemaConverter::new(schema.as_ref())
+/// let parquet_schema = ArrowSchemaConverter::new()
 ///     .with_coerce_types(props.coerce_types())
-///     .build()
+///     .convert(&schema)
 ///     .unwrap();
 ///
 /// // Create writers for each of the leaf columns

From 46e57d41d4a00e718c5af9681168d38394abc874 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 16 Dec 2024 16:22:30 -0500
Subject: [PATCH 15/15] fix fmt

---
 parquet/src/arrow/schema/mod.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index d190220d9e92..4ae3fdb8e5cf 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -1775,9 +1775,7 @@ mod tests {
             Field::new("decimal256", DataType::Decimal256(39, 2), false),
         ];
         let arrow_schema = Schema::new(arrow_fields);
-        let converted_arrow_schema = ArrowSchemaConverter::new()
-            .convert(&arrow_schema)
-            .unwrap();
+        let converted_arrow_schema = ArrowSchemaConverter::new().convert(&arrow_schema).unwrap();
 
         assert_eq!(
             parquet_schema.columns().len(),
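
Taken together, the series leaves the public API as `ArrowSchemaConverter` with a
`convert` method, replacing the deprecated free function `arrow_to_parquet_schema`.
For reference, a minimal usage sketch of the final API (a sketch only, assuming the
post-series `parquet` crate and `arrow-schema`; the schema fields here are
hypothetical, not taken from the patches):

    use arrow_schema::{DataType, Field, Schema};
    use parquet::arrow::ArrowSchemaConverter;

    fn main() -> parquet::errors::Result<()> {
        // Hypothetical Arrow schema for illustration
        let schema = Schema::new(vec![
            Field::new("id", DataType::Int64, false),
            Field::new("day", DataType::Date32, false),
        ]);

        // `convert` replaces the deprecated `arrow_to_parquet_schema`;
        // `with_coerce_types(true)` opts in to Parquet-native representations
        // (per the docs above, e.g. Date64 values may be truncated to fit
        // Parquet's native 32-bit date type).
        let descriptor = ArrowSchemaConverter::new()
            .with_coerce_types(true)
            .convert(&schema)?;

        // The resulting SchemaDescriptor has one leaf column per Arrow field here
        assert_eq!(descriptor.num_columns(), 2);
        Ok(())
    }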