diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index a1644ee49b8d..1b65c5057de1 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -100,6 +100,13 @@ jobs:
         run: rustup component add rustfmt
       - name: Format arrow
         run: cargo fmt --all -- --check
+      - name: Format parquet
+        # Many modules in parquet are skipped, so check parquet separately. If this check fails, run:
+        # cargo fmt -p parquet -- --config skip_children=true `find ./parquet -name "*.rs" \! -name format.rs`
+        # from the top level arrow-rs directory and check in the result.
+        # https://github.com/apache/arrow-rs/issues/6179
+        working-directory: parquet
+        run: cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs`
       - name: Format object_store
         working-directory: object_store
         run: cargo fmt --all -- --check
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a53604d0f6ae..e0adc18a9a60 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -160,6 +160,13 @@ PR be sure to run the following and check for lint issues:
 cargo +stable fmt --all -- --check
 ```
 
+Note that currently the above will not check all source files in the parquet crate. To check all
+parquet files run the following from the top-level `arrow-rs` directory:
+
+```bash
+cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs`
+```
+
 ## Breaking Changes
 
 Our [release schedule] allows breaking API changes only in major releases.
diff --git a/parquet/src/arrow/array_reader/empty_array.rs b/parquet/src/arrow/array_reader/empty_array.rs
index 51673f2f8cf2..9aba08b4e853 100644
--- a/parquet/src/arrow/array_reader/empty_array.rs
+++ b/parquet/src/arrow/array_reader/empty_array.rs
@@ -17,9 +17,9 @@
 
 use crate::arrow::array_reader::ArrayReader;
 use crate::errors::Result;
-use arrow_schema::{DataType as ArrowType, Fields};
 use arrow_array::{ArrayRef, StructArray};
 use arrow_data::ArrayDataBuilder;
+use arrow_schema::{DataType as ArrowType, Fields};
 use std::any::Any;
 use std::sync::Arc;
diff --git a/parquet/src/arrow/array_reader/fixed_size_list_array.rs b/parquet/src/arrow/array_reader/fixed_size_list_array.rs
index 4cf68a06601c..75099d018fc9 100644
--- a/parquet/src/arrow/array_reader/fixed_size_list_array.rs
+++ b/parquet/src/arrow/array_reader/fixed_size_list_array.rs
@@ -138,7 +138,7 @@ impl ArrayReader for FixedSizeListArrayReader {
                     "Encountered misaligned row with length {} (expected length {})",
                     row_len,
                     self.fixed_size
-                ))
+                ));
             }
 
             row_len = 0;
@@ -226,9 +226,7 @@ mod tests {
     use super::*;
     use crate::arrow::{
         array_reader::{test_util::InMemoryArrayReader, ListArrayReader},
-        arrow_reader::{
-            ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReader,
-        },
+        arrow_reader::{ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReader},
         ArrowWriter,
     };
     use arrow::datatypes::{Field, Int32Type};
@@ -279,10 +277,7 @@ mod tests {
         let mut list_array_reader = FixedSizeListArrayReader::new(
             Box::new(item_array_reader),
             3,
-            ArrowType::FixedSizeList(
-                Arc::new(Field::new("item", ArrowType::Int32, true)),
-                3,
-            ),
+            ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 3),
             2,
             1,
             true,
@@ -328,10 +323,7 @@ mod tests {
         let mut list_array_reader = FixedSizeListArrayReader::new(
             Box::new(item_array_reader),
             2,
-            ArrowType::FixedSizeList(
-                Arc::new(Field::new("item", ArrowType::Int32, true)),
-                2,
-            ),
+            ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 2),
             1,
             1,
             false,
@@ -354,14 +346,10 @@ mod tests {
         //     [[4, 5]],
         //     [[null, null]],
         // ]
-        let l2_type = ArrowType::FixedSizeList(
-            Arc::new(Field::new("item", ArrowType::Int32, true)),
-            2,
-        );
-        let l1_type = ArrowType::FixedSizeList(
-            Arc::new(Field::new("item", l2_type.clone(), false)),
-            1,
-        );
+        let l2_type =
+            ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 2);
+        let l1_type =
+            ArrowType::FixedSizeList(Arc::new(Field::new("item", l2_type.clone(), false)), 1);
 
         let array = PrimitiveArray::<Int32Type>::from(vec![
             None,
@@ -413,14 +401,8 @@ mod tests {
             Some(vec![0, 0, 2, 0, 2, 0, 0, 2, 0, 2]),
         );
 
-        let l2 = FixedSizeListArrayReader::new(
-            Box::new(item_array_reader),
-            2,
-            l2_type,
-            4,
-            2,
-            false,
-        );
+        let l2 =
+            FixedSizeListArrayReader::new(Box::new(item_array_reader), 2, l2_type, 4, 2, false);
         let mut l1 = FixedSizeListArrayReader::new(Box::new(l2), 1, l1_type, 3, 1, true);
 
         let expected_1 = expected.slice(0, 2);
@@ -454,10 +436,7 @@ mod tests {
         let mut list_array_reader = FixedSizeListArrayReader::new(
             Box::new(item_array_reader),
             0,
-            ArrowType::FixedSizeList(
-                Arc::new(Field::new("item", ArrowType::Int32, true)),
-                0,
-            ),
+            ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 0),
             2,
             1,
             true,
@@ -473,8 +452,7 @@ mod tests {
     #[test]
     fn test_nested_var_list() {
         // [[[1, null, 3], null], [[4], []], [[5, 6], [null, null]], null]
-        let mut builder =
-            FixedSizeListBuilder::new(ListBuilder::new(Int32Builder::new()), 2);
+        let mut builder = FixedSizeListBuilder::new(ListBuilder::new(Int32Builder::new()), 2);
         builder.values().append_value([Some(1), None, Some(3)]);
         builder.values().append_null();
         builder.append(true);
@@ -503,12 +481,9 @@ mod tests {
             None,
         ]));
 
-        let inner_type =
-            ArrowType::List(Arc::new(Field::new("item", ArrowType::Int32, true)));
-        let list_type = ArrowType::FixedSizeList(
-            Arc::new(Field::new("item", inner_type.clone(), true)),
-            2,
-        );
+        let inner_type = ArrowType::List(Arc::new(Field::new("item", ArrowType::Int32, true)));
+        let list_type =
+            ArrowType::FixedSizeList(Arc::new(Field::new("item", inner_type.clone(), true)), 2);
 
         let item_array_reader = InMemoryArrayReader::new(
             ArrowType::Int32,
@@ -517,22 +492,11 @@ mod tests {
             Some(vec![0, 2, 2, 1, 0, 1, 0, 2, 1, 2, 0]),
         );
 
-        let inner_array_reader = ListArrayReader::<i32>::new(
-            Box::new(item_array_reader),
-            inner_type,
-            4,
-            2,
-            true,
-        );
+        let inner_array_reader =
+            ListArrayReader::<i32>::new(Box::new(item_array_reader), inner_type, 4, 2, true);
 
-        let mut list_array_reader = FixedSizeListArrayReader::new(
-            Box::new(inner_array_reader),
-            2,
-            list_type,
-            2,
-            1,
-            true,
-        );
+        let mut list_array_reader =
+            FixedSizeListArrayReader::new(Box::new(inner_array_reader), 2, list_type, 2, 1, true);
         let actual = list_array_reader.next_batch(1024).unwrap();
         let actual = actual
             .as_any()
@@ -564,21 +528,13 @@ mod tests {
         );
 
         // [null, 2, 3, null, 5]
-        let primitive = PrimitiveArray::<Int32Type>::from_iter(vec![
-            None,
-            Some(2),
-            Some(3),
-            None,
-            Some(5),
-        ]);
+        let primitive =
+            PrimitiveArray::<Int32Type>::from_iter(vec![None, Some(2), Some(3), None, Some(5)]);
 
         let schema = Arc::new(Schema::new(vec![
             Field::new(
                 "list",
-                ArrowType::FixedSizeList(
-                    Arc::new(Field::new("item", ArrowType::Int32, true)),
-                    4,
-                ),
+                ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 4),
                 true,
             ),
             Field::new("primitive", ArrowType::Int32, true),
@@ -643,10 +599,7 @@ mod tests {
 
         let schema = Arc::new(Schema::new(vec![Field::new(
             "list",
-            ArrowType::FixedSizeList(
-                Arc::new(Field::new("item", ArrowType::Int32, true)),
-                4,
-            ),
+            ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 4),
             true,
         )]));
 
diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs
index e1752f30cea8..ebff3286bed5 100644
--- a/parquet/src/arrow/array_reader/list_array.rs
+++ b/parquet/src/arrow/array_reader/list_array.rs
@@ -125,8 +125,7 @@ impl<OffsetSize: OffsetSizeTrait> ArrayReader for ListArrayReader<OffsetSize> {
         // lists, and for consistency we do the same for nulls.
 
         // The output offsets for the computed ListArray
-        let mut list_offsets: Vec<OffsetSize> =
-            Vec::with_capacity(next_batch_array.len() + 1);
+        let mut list_offsets: Vec<OffsetSize> = Vec::with_capacity(next_batch_array.len() + 1);
 
         // The validity mask of the computed ListArray if nullable
         let mut validity = self
@@ -270,9 +269,7 @@ mod tests {
         GenericListArray::<OffsetSize>::DATA_TYPE_CONSTRUCTOR(field)
     }
 
-    fn downcast<OffsetSize: OffsetSizeTrait>(
-        array: &ArrayRef,
-    ) -> &'_ GenericListArray<OffsetSize> {
+    fn downcast<OffsetSize: OffsetSizeTrait>(array: &ArrayRef) -> &'_ GenericListArray<OffsetSize> {
         array
             .as_any()
             .downcast_ref::<GenericListArray<OffsetSize>>()
@@ -383,18 +380,12 @@ mod tests {
             Some(vec![0, 3, 2, 2, 2, 1, 1, 1, 1, 3, 3, 2, 3, 3, 2, 0, 0, 0]),
         );
 
-        let l3 = ListArrayReader::<OffsetSize>::new(
-            Box::new(item_array_reader),
-            l3_type,
-            5,
-            3,
-            true,
-        );
+        let l3 =
+            ListArrayReader::<OffsetSize>::new(Box::new(item_array_reader), l3_type, 5, 3, true);
 
         let l2 = ListArrayReader::<OffsetSize>::new(Box::new(l3), l2_type, 3, 2, false);
 
-        let mut l1 =
-            ListArrayReader::<OffsetSize>::new(Box::new(l2), l1_type, 2, 1, true);
+        let mut l1 = ListArrayReader::<OffsetSize>::new(Box::new(l2), l1_type, 2, 1, true);
 
         let expected_1 = expected.slice(0, 2);
         let expected_2 = expected.slice(2, 2);
@@ -560,8 +551,7 @@ mod tests {
             .unwrap();
         writer.close().unwrap();
 
-        let file_reader: Arc<dyn FileReader> =
-            Arc::new(SerializedFileReader::new(file).unwrap());
+        let file_reader: Arc<dyn FileReader> = Arc::new(SerializedFileReader::new(file).unwrap());
 
         let file_metadata = file_reader.metadata().file_metadata();
         let schema = file_metadata.schema_descr();
@@ -573,8 +563,7 @@ mod tests {
         )
         .unwrap();
 
-        let mut array_reader =
-            build_array_reader(fields.as_ref(), &mask, &file_reader).unwrap();
+        let mut array_reader = build_array_reader(fields.as_ref(), &mask, &file_reader).unwrap();
 
         let batch = array_reader.next_batch(100).unwrap();
         assert_eq!(batch.data_type(), array_reader.get_data_type());
@@ -584,9 +573,7 @@ mod tests {
             "table_info",
             ArrowType::List(Arc::new(Field::new(
                 "table_info",
-                ArrowType::Struct(
-                    vec![Field::new("name", ArrowType::Binary, false)].into()
-                ),
+                ArrowType::Struct(vec![Field::new("name", ArrowType::Binary, false)].into()),
                 false
             ))),
             false
diff --git a/parquet/src/arrow/array_reader/map_array.rs b/parquet/src/arrow/array_reader/map_array.rs
index 9bfc047322a7..4bdec602ba4f 100644
--- a/parquet/src/arrow/array_reader/map_array.rs
+++ b/parquet/src/arrow/array_reader/map_array.rs
@@ -184,21 +184,19 @@ mod tests {
         map_builder.append(true).expect("adding map entry");
 
         // Create record batch
-        let batch =
-            RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_builder.finish())])
-                .expect("create record batch");
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_builder.finish())])
+            .expect("create record batch");
 
         // Write record batch to file
         let mut buffer = Vec::with_capacity(1024);
-        let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None)
-            .expect("creat file writer");
+        let mut writer =
+            ArrowWriter::try_new(&mut buffer, batch.schema(), None).expect("creat file writer");
         writer.write(&batch).expect("writing file");
         writer.close().expect("close writer");
 
         // Read file
         let reader = Bytes::from(buffer);
 
-        let record_batch_reader =
-            ParquetRecordBatchReader::try_new(reader, 1024).unwrap();
+        let record_batch_reader = ParquetRecordBatchReader::try_new(reader, 1024).unwrap();
         for maybe_record_batch in record_batch_reader {
             let record_batch = maybe_record_batch.expect("Getting current batch");
             let col = record_batch.column(0);
diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs
index 4af194774bfb..fb2f2f8928b9 100644
--- a/parquet/src/arrow/array_reader/struct_array.rs
+++ b/parquet/src/arrow/array_reader/struct_array.rs
@@ -112,10 +112,10 @@ impl ArrayReader for StructArrayReader {
             .collect::<Result<Vec<_>>>()?;
 
         // check that array child data has same size
-        let children_array_len =
-            children_array.first().map(|arr| arr.len()).ok_or_else(|| {
-                general_err!("Struct array reader should have at least one child!")
-            })?;
+        let children_array_len = children_array
+            .first()
+            .map(|arr| arr.len())
+            .ok_or_else(|| general_err!("Struct array reader should have at least one child!"))?;
 
         let all_children_len_eq = children_array
             .iter()
@@ -169,8 +169,7 @@ impl ArrayReader for StructArrayReader {
                 return Err(general_err!("Failed to decode level data for struct array"));
             }
 
-            array_data_builder =
-                array_data_builder.null_bit_buffer(Some(bitmap_builder.into()));
+            array_data_builder = array_data_builder.null_bit_buffer(Some(bitmap_builder.into()));
         }
 
         let array_data = unsafe { array_data_builder.build_unchecked() };
@@ -282,13 +281,12 @@ mod tests {
         //         null,
         //     ]
 
-        let expected_l =
-            Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
-                Some(vec![Some(1), Some(2), None]),
-                Some(vec![]),
-                None,
-                None,
-            ]));
+        let expected_l = Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(1), Some(2), None]),
+            Some(vec![]),
+            None,
+            None,
+        ]));
 
         let validity = Buffer::from([0b00000111]);
         let struct_fields = vec![(
diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs
index a5f6d242ef91..e487feabb848 100644
--- a/parquet/src/arrow/schema/complex.rs
+++ b/parquet/src/arrow/schema/complex.rs
@@ -41,7 +41,7 @@ pub struct ParquetField {
     /// i.e. guaranteed to be > 0 for an element of list type
     pub rep_level: i16,
     /// The level at which this field is fully defined,
-    /// i.e. guaranteed to be > 0 for a nullable type or child of a
+    /// i.e. guaranteed to be > 0 for a nullable type or child of a
     /// nullable type
     pub def_level: i16,
     /// Whether this field is nullable
@@ -64,11 +64,7 @@ impl ParquetField {
             rep_level: self.rep_level,
             def_level: self.def_level,
             nullable: false,
-            arrow_type: DataType::List(Arc::new(Field::new(
-                name,
-                self.arrow_type.clone(),
-                false,
-            ))),
+            arrow_type: DataType::List(Arc::new(Field::new(name, self.arrow_type.clone(), false))),
             field_type: ParquetFieldType::Group {
                 children: vec![self],
             },
@@ -289,7 +285,7 @@ impl Visitor {
 
         match map_key.get_basic_info().repetition() {
             Repetition::REPEATED => {
-                return Err(arrow_err!("Map keys cannot be repeated"));
+                return Err(arrow_err!("Map keys cannot be repeated"));
             }
             Repetition::REQUIRED | Repetition::OPTIONAL => {
                 // Relaxed check for having repetition REQUIRED as there exists
@@ -317,10 +313,7 @@ impl Visitor {
                     (Some(field), Some(&*fields[0]), Some(&*fields[1]), *sorted)
                 }
                 d => {
-                    return Err(arrow_err!(
-                        "Map data type should contain struct got {}",
-                        d
-                    ));
+                    return Err(arrow_err!("Map data type should contain struct got {}", d));
                 }
             },
             Some(d) => {
@@ -416,9 +409,7 @@ impl Visitor {
         let (def_level, nullable) = match list_type.get_basic_info().repetition() {
             Repetition::REQUIRED => (context.def_level, false),
             Repetition::OPTIONAL => (context.def_level + 1, true),
-            Repetition::REPEATED => {
-                return Err(arrow_err!("List type cannot be repeated"))
-            }
+            Repetition::REPEATED => return Err(arrow_err!("List type cannot be repeated")),
         };
 
         let arrow_field = match &context.data_type {
@@ -542,11 +533,7 @@ impl Visitor {
 ///
 /// The resulting [`Field`] will have the type dictated by `field`, a name
 /// dictated by the `parquet_type`, and any metadata from `arrow_hint`
-fn convert_field(
-    parquet_type: &Type,
-    field: &ParquetField,
-    arrow_hint: Option<&Field>,
-) -> Field {
+fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<&Field>) -> Field {
     let name = parquet_type.name();
     let data_type = field.arrow_type.clone();
     let nullable = field.nullable;
@@ -568,11 +555,14 @@ fn convert_field(
             let basic_info = parquet_type.get_basic_info();
             if basic_info.has_id() {
                 let mut meta = HashMap::with_capacity(1);
-                meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string());
+                meta.insert(
+                    PARQUET_FIELD_ID_META_KEY.to_string(),
+                    basic_info.id().to_string(),
+                );
                 ret.set_metadata(meta);
             }
             ret
-        },
+        }
     }
 }
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index a3528b6c8adb..fab8966952b2 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -1483,11 +1483,25 @@ mod tests {
             ),
             Field::new("date", DataType::Date32, true),
             Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
-            Field::new("time_milli_utc", DataType::Time32(TimeUnit::Millisecond), true)
-                .with_metadata(HashMap::from_iter(vec![("adjusted_to_utc".to_string(), "".to_string())])),
+            Field::new(
+                "time_milli_utc",
+                DataType::Time32(TimeUnit::Millisecond),
+                true,
+            )
+            .with_metadata(HashMap::from_iter(vec![(
+                "adjusted_to_utc".to_string(),
+                "".to_string(),
+            )])),
             Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true),
-            Field::new("time_micro_utc", DataType::Time64(TimeUnit::Microsecond), true)
-                .with_metadata(HashMap::from_iter(vec![("adjusted_to_utc".to_string(), "".to_string())])),
+            Field::new(
+                "time_micro_utc",
+                DataType::Time64(TimeUnit::Microsecond),
+                true,
+            )
+            .with_metadata(HashMap::from_iter(vec![(
+                "adjusted_to_utc".to_string(),
"".to_string(), + )])), Field::new( "ts_milli", DataType::Timestamp(TimeUnit::Millisecond, None), diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index 10560210e4e8..edf675f1302a 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -150,35 +150,47 @@ pub fn create_codec(codec: CodecType, _options: &CodecOptions) -> Result { #[cfg(any(feature = "brotli", test))] return Ok(Some(Box::new(BrotliCodec::new(level)))); - Err(ParquetError::General("Disabled feature at compile time: brotli".into())) - }, + Err(ParquetError::General( + "Disabled feature at compile time: brotli".into(), + )) + } CodecType::GZIP(level) => { #[cfg(any(feature = "flate2", test))] return Ok(Some(Box::new(GZipCodec::new(level)))); - Err(ParquetError::General("Disabled feature at compile time: flate2".into())) - }, + Err(ParquetError::General( + "Disabled feature at compile time: flate2".into(), + )) + } CodecType::SNAPPY => { #[cfg(any(feature = "snap", test))] return Ok(Some(Box::new(SnappyCodec::new()))); - Err(ParquetError::General("Disabled feature at compile time: snap".into())) - }, + Err(ParquetError::General( + "Disabled feature at compile time: snap".into(), + )) + } CodecType::LZ4 => { #[cfg(any(feature = "lz4", test))] return Ok(Some(Box::new(LZ4HadoopCodec::new( _options.backward_compatible_lz4, )))); - Err(ParquetError::General("Disabled feature at compile time: lz4".into())) - }, + Err(ParquetError::General( + "Disabled feature at compile time: lz4".into(), + )) + } CodecType::ZSTD(level) => { #[cfg(any(feature = "zstd", test))] return Ok(Some(Box::new(ZSTDCodec::new(level)))); - Err(ParquetError::General("Disabled feature at compile time: zstd".into())) - }, + Err(ParquetError::General( + "Disabled feature at compile time: zstd".into(), + )) + } CodecType::LZ4_RAW => { #[cfg(any(feature = "lz4", test))] return Ok(Some(Box::new(LZ4RawCodec::new()))); - Err(ParquetError::General("Disabled feature at compile time: lz4".into())) - }, + Err(ParquetError::General( + "Disabled feature at compile time: lz4".into(), + )) + } CodecType::UNCOMPRESSED => Ok(None), _ => Err(nyi_err!("The codec type {} is not supported yet", codec)), } diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs index 98283b574ebb..8efb845219d3 100644 --- a/parquet/src/encodings/encoding/dict_encoder.rs +++ b/parquet/src/encodings/encoding/dict_encoder.rs @@ -148,7 +148,6 @@ impl DictEncoder { fn bit_width(&self) -> u8 { num_required_bits(self.num_entries().saturating_sub(1) as u64) } - } impl Encoder for DictEncoder { diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs index 8d4e48ba16bd..6f662b614fca 100644 --- a/parquet/src/encodings/levels.rs +++ b/parquet/src/encodings/levels.rs @@ -27,11 +27,7 @@ use crate::util::bit_util::{ceil, num_required_bits, BitWriter}; /// repetition/definition level and number of total buffered values (includes null /// values). 
 #[inline]
-pub fn max_buffer_size(
-    encoding: Encoding,
-    max_level: i16,
-    num_buffered_values: usize,
-) -> usize {
+pub fn max_buffer_size(encoding: Encoding, max_level: i16, num_buffered_values: usize) -> usize {
     let bit_width = num_required_bits(max_level as u64);
     match encoding {
         Encoding::RLE => RleEncoder::max_buffer_size(bit_width, num_buffered_values),
diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs
index 97a122941f17..e1ca8cd745e3 100644
--- a/parquet/src/encodings/rle.rs
+++ b/parquet/src/encodings/rle.rs
@@ -199,13 +199,9 @@ impl RleEncoder {
     /// internal writer.
     #[inline]
     pub fn flush(&mut self) {
-        if self.bit_packed_count > 0
-            || self.repeat_count > 0
-            || self.num_buffered_values > 0
-        {
+        if self.bit_packed_count > 0 || self.repeat_count > 0 || self.num_buffered_values > 0 {
             let all_repeat = self.bit_packed_count == 0
-                && (self.repeat_count == self.num_buffered_values
-                    || self.num_buffered_values == 0);
+                && (self.repeat_count == self.num_buffered_values || self.num_buffered_values == 0);
             if self.repeat_count > 0 && all_repeat {
                 self.flush_rle_run();
             } else {
@@ -250,11 +246,8 @@ impl RleEncoder {
         // Write the indicator byte to the reserved position in `bit_writer`
         let num_groups = self.bit_packed_count / 8;
         let indicator_byte = ((num_groups << 1) | 1) as u8;
-        self.bit_writer.put_aligned_offset(
-            indicator_byte,
-            1,
-            self.indicator_byte_pos as usize,
-        );
+        self.bit_writer
+            .put_aligned_offset(indicator_byte, 1, self.indicator_byte_pos as usize);
         self.indicator_byte_pos = -1;
         self.bit_packed_count = 0;
     }
@@ -288,9 +281,7 @@ impl RleEncoder {
 
     /// return the estimated memory size of this encoder.
     pub(crate) fn estimated_memory_size(&self) -> usize {
-        self.bit_writer.estimated_memory_size()
-            + std::mem::size_of::<Self>()
-
+        self.bit_writer.estimated_memory_size() + std::mem::size_of::<Self>()
     }
 }
@@ -384,12 +375,10 @@ impl RleDecoder {
         let mut values_read = 0;
         while values_read < buffer.len() {
             if self.rle_left > 0 {
-                let num_values =
-                    cmp::min(buffer.len() - values_read, self.rle_left as usize);
+                let num_values = cmp::min(buffer.len() - values_read, self.rle_left as usize);
                 for i in 0..num_values {
-                    let repeated_value = T::try_from_le_slice(
-                        &self.current_value.as_mut().unwrap().to_ne_bytes(),
-                    )?;
+                    let repeated_value =
+                        T::try_from_le_slice(&self.current_value.as_mut().unwrap().to_ne_bytes())?;
                     buffer[values_read + i] = repeated_value;
                 }
                 self.rle_left -= num_values as u32;
@@ -397,8 +386,7 @@ impl RleDecoder {
             } else if self.bit_packed_left > 0 {
                 let mut num_values =
                     cmp::min(buffer.len() - values_read, self.bit_packed_left as usize);
-                let bit_reader =
-                    self.bit_reader.as_mut().expect("bit_reader should be set");
+                let bit_reader = self.bit_reader.as_mut().expect("bit_reader should be set");
 
                 num_values = bit_reader.get_batch::<T>(
                     &mut buffer[values_read..values_read + num_values],
@@ -424,15 +412,13 @@ impl RleDecoder {
         let mut values_skipped = 0;
         while values_skipped < num_values {
             if self.rle_left > 0 {
-                let num_values =
-                    cmp::min(num_values - values_skipped, self.rle_left as usize);
+                let num_values = cmp::min(num_values - values_skipped, self.rle_left as usize);
                 self.rle_left -= num_values as u32;
                 values_skipped += num_values;
             } else if self.bit_packed_left > 0 {
                 let mut num_values =
                     cmp::min(num_values - values_skipped, self.bit_packed_left as usize);
-                let bit_reader =
-                    self.bit_reader.as_mut().expect("bit_reader should be set");
+                let bit_reader = self.bit_reader.as_mut().expect("bit_reader should be set");
 
                 num_values = bit_reader.skip(num_values, self.bit_width as usize);
                 if num_values == 0 {
@@ -467,8 +453,7 @@ impl RleDecoder {
             let index_buf = self.index_buf.get_or_insert_with(|| Box::new([0; 1024]));
 
             if self.rle_left > 0 {
-                let num_values =
-                    cmp::min(max_values - values_read, self.rle_left as usize);
+                let num_values = cmp::min(max_values - values_read, self.rle_left as usize);
                 let dict_idx = self.current_value.unwrap() as usize;
                 for i in 0..num_values {
                     buffer[values_read + i].clone_from(&dict[dict_idx]);
@@ -476,8 +461,7 @@ impl RleDecoder {
                 self.rle_left -= num_values as u32;
                 values_read += num_values;
             } else if self.bit_packed_left > 0 {
-                let bit_reader =
-                    self.bit_reader.as_mut().expect("bit_reader should be set");
+                let bit_reader = self.bit_reader.as_mut().expect("bit_reader should be set");
 
                 loop {
                     let to_read = index_buf
@@ -489,10 +473,8 @@ impl RleDecoder {
                        .len()
                        .min(max_values - values_read)
                        .min(self.bit_packed_left as usize);
                    if to_read == 0 {
                        break;
                    }
 
-                    let num_values = bit_reader.get_batch::<i32>(
-                        &mut index_buf[..to_read],
-                        self.bit_width as usize,
-                    );
+                    let num_values = bit_reader
+                        .get_batch::<i32>(&mut index_buf[..to_read], self.bit_width as usize);
                     if num_values == 0 {
                         // Handle writers which truncate the final block
                         self.bit_packed_left = 0;
@@ -708,14 +690,10 @@ mod tests {
         decoder.set_data(data.into());
         let mut buffer = vec![""; 12];
         let expected = vec![
-            "ddd", "eee", "fff", "ddd", "eee", "fff", "ddd", "eee", "fff", "eee", "fff",
-            "fff",
+            "ddd", "eee", "fff", "ddd", "eee", "fff", "ddd", "eee", "fff", "eee", "fff", "fff",
         ];
-        let result = decoder.get_batch_with_dict::<&str>(
-            dict.as_slice(),
-            buffer.as_mut_slice(),
-            12,
-        );
+        let result =
+            decoder.get_batch_with_dict::<&str>(dict.as_slice(), buffer.as_mut_slice(), 12);
         assert!(result.is_ok());
         assert_eq!(buffer, expected);
     }
@@ -1042,8 +1020,7 @@ mod tests {
         for _ in 0..niters {
             values.clear();
             let rng = thread_rng();
-            let seed_vec: Vec<u8> =
-                rng.sample_iter::<u8, _>(&Standard).take(seed_len).collect();
+            let seed_vec: Vec<u8> = rng.sample_iter::<u8, _>(&Standard).take(seed_len).collect();
             let mut seed = [0u8; 32];
             seed.copy_from_slice(&seed_vec[0..seed_len]);
             let mut gen = rand::rngs::StdRng::from_seed(seed);
diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs
index a17202254cd6..062f93270386 100644
--- a/parquet/src/util/bit_util.rs
+++ b/parquet/src/util/bit_util.rs
@@ -1027,7 +1027,10 @@ mod tests {
             .collect();
 
         // Generic values used to check against actual values read from `get_batch`.
-        let expected_values: Vec<T> = values.iter().map(|v| T::try_from_le_slice(v.as_bytes()).unwrap()).collect();
+        let expected_values: Vec<T> = values
+            .iter()
+            .map(|v| T::try_from_le_slice(v.as_bytes()).unwrap())
+            .collect();
 
         (0..total).for_each(|i| writer.put_value(values[i], num_bits));
diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs
index f57fc3a71409..a804419b5da7 100644
--- a/parquet/src/util/interner.rs
+++ b/parquet/src/util/interner.rs
@@ -35,7 +35,7 @@ pub trait Storage {
 
     /// Return an estimate of the memory used in this storage, in bytes
     #[allow(dead_code)] // not used in parquet_derive, so is dead there
-    fn estimated_memory_size(&self) -> usize; 
+    fn estimated_memory_size(&self) -> usize;
 }
 
 /// A generic value interner supporting various different [`Storage`]
diff --git a/parquet/src/util/test_common/file_util.rs b/parquet/src/util/test_common/file_util.rs
index c2dcd677360d..6c031358e795 100644
--- a/parquet/src/util/test_common/file_util.rs
+++ b/parquet/src/util/test_common/file_util.rs
@@ -19,8 +19,7 @@ use std::{fs, path::PathBuf, str::FromStr};
 
 /// Returns path to the test parquet file in 'data' directory
 pub fn get_test_path(file_name: &str) -> PathBuf {
-    let mut pathbuf =
-        PathBuf::from_str(&arrow::util::test_util::parquet_test_data()).unwrap();
+    let mut pathbuf = PathBuf::from_str(&arrow::util::test_util::parquet_test_data()).unwrap();
     pathbuf.push(file_name);
     pathbuf
 }
diff --git a/parquet/src/util/test_common/mod.rs b/parquet/src/util/test_common/mod.rs
index 504219ecae19..8cfc1e6dd423 100644
--- a/parquet/src/util/test_common/mod.rs
+++ b/parquet/src/util/test_common/mod.rs
@@ -21,4 +21,4 @@ pub mod page_util;
 pub mod file_util;
 
 #[cfg(test)]
-pub mod rand_gen;
\ No newline at end of file
+pub mod rand_gen;
diff --git a/parquet/src/util/test_common/rand_gen.rs b/parquet/src/util/test_common/rand_gen.rs
index a267c34840c1..ec80d3a593ae 100644
--- a/parquet/src/util/test_common/rand_gen.rs
+++ b/parquet/src/util/test_common/rand_gen.rs
@@ -173,8 +173,7 @@ pub fn make_pages<T: DataType>(
 
         // Generate the current page
 
-        let mut pb =
-            DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2);
+        let mut pb = DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2);
         if max_rep_level > 0 {
             pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]);
         }