diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs
index 3573e3020e5c..dad960790e6c 100644
--- a/parquet/src/file/metadata/writer.rs
+++ b/parquet/src/file/metadata/writer.rs
@@ -450,19 +450,57 @@ mod tests {
             true,
         )]));
 
-        let a: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(2)]));
+        // build row groups / pages that exercise different combinations of nulls and values
+        // note that below we set the row group and page sizes to 4 and 2 respectively
+        // so that these "groupings" make sense
+        let a: ArrayRef = Arc::new(Int32Array::from(vec![
+            // a row group that has all values
+            Some(i32::MIN),
+            Some(-1),
+            Some(1),
+            Some(i32::MAX),
+            // a row group with a page of all nulls and a page of all values
+            None,
+            None,
+            Some(2),
+            Some(3),
+            // a row group that has all null pages
+            None,
+            None,
+            None,
+            None,
+            // a row group having 1 page with all values and 1 page with some nulls
+            Some(4),
+            Some(5),
+            None,
+            Some(6),
+            // a row group having 1 page with all nulls and 1 page with some nulls
+            None,
+            None,
+            Some(7),
+            None,
+            // a row group having all pages with some nulls
+            None,
+            Some(8),
+            Some(9),
+            None,
+        ]));
         let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap();
 
-        let writer_props = match write_page_index {
-            true => WriterProperties::builder()
-                .set_statistics_enabled(EnabledStatistics::Page)
-                .build(),
-            false => WriterProperties::builder()
-                .set_statistics_enabled(EnabledStatistics::Chunk)
-                .build(),
+        let writer_props_builder = match write_page_index {
+            true => WriterProperties::builder().set_statistics_enabled(EnabledStatistics::Page),
+            false => WriterProperties::builder().set_statistics_enabled(EnabledStatistics::Chunk),
         };
 
+        // tune the size of pages to the data above
+        // to make sure we exercise code paths where all items in a page are null, etc.
+        let writer_props = writer_props_builder
+            .set_max_row_group_size(4)
+            .set_data_page_row_count_limit(2)
+            .set_write_batch_size(2)
+            .build();
+
         let mut writer = ArrowWriter::try_new(&mut buf, schema, Some(writer_props)).unwrap();
         writer.write(&batch).unwrap();
         writer.close().unwrap();
 
@@ -610,6 +648,29 @@ mod tests {
             decoded_metadata.num_row_groups()
         );
 
+        // check that the mins and maxes are what we expect for each page
+        // also indirectly checking that the pages were written out as we expected them to be laid out
+        // (if they're not, or something gets refactored in the future that breaks that assumption,
+        // this test may have to drop down to a lower level and create metadata directly instead of
+        // relying on writing an entire file)
+        let column_indexes = metadata.metadata.column_index().unwrap();
+        assert_eq!(column_indexes.len(), 6);
+        // make sure each row group has 2 pages by checking the first column
+        // page counts for each column for each row group should all be the same, and there should be
+        // 12 pages in total across 6 row groups / 1 column
+        let mut page_counts = vec![];
+        for row_group in column_indexes {
+            for column in row_group {
+                match column {
+                    Index::INT32(column_index) => {
+                        page_counts.push(column_index.indexes.len());
+                    }
+                    _ => panic!("unexpected column index type"),
+                }
+            }
+        }
+        assert_eq!(page_counts, vec![2; 6]);
+
         metadata
             .metadata
             .row_groups()
diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs
index 662ba45621ab..2f30abead25c 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -230,16 +230,14 @@ impl<T: ParquetValueType> NativeIndex<T> {
         let min_values = self
             .indexes
             .iter()
-            .map(|x| x.min_bytes().map(|x| x.to_vec()))
-            .collect::<Option<Vec<_>>>()
-            .unwrap_or_else(|| vec![vec![]; self.indexes.len()]);
+            .map(|x| x.min_bytes().unwrap_or(&[]).to_vec())
+            .collect::<Vec<_>>();
 
         let max_values = self
             .indexes
             .iter()
-            .map(|x| x.max_bytes().map(|x| x.to_vec()))
-            .collect::<Option<Vec<_>>>()
-            .unwrap_or_else(|| vec![vec![]; self.indexes.len()]);
+            .map(|x| x.max_bytes().unwrap_or(&[]).to_vec())
+            .collect::<Vec<_>>();
 
         let null_counts = self
             .indexes
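
Why the index.rs change matters for the all-null pages exercised above: with the old
code, a single page lacking min/max statistics made `collect::<Option<Vec<_>>>()`
short-circuit to `None`, so the fallback replaced every page's min/max with an empty
byte array; with the new code, only the pages that actually lack statistics get an
empty array, and pages with values keep their real bounds. A minimal standalone
sketch of the two behaviors (the `pages` vector stands in for `self.indexes`, and
`Option<Vec<u8>>` for what `PageIndex::min_bytes()` yields; this is an illustration,
not code from the patch):

    fn main() {
        // one all-null page (None) between two pages that have real min bytes
        let pages: Vec<Option<Vec<u8>>> = vec![Some(vec![1]), None, Some(vec![3])];

        // old behavior: the Option-collect yields None if any page is missing
        // stats, so the fallback wipes out the mins of pages that had them
        let old: Vec<Vec<u8>> = pages
            .iter()
            .map(|x| x.as_deref().map(|b| b.to_vec()))
            .collect::<Option<Vec<_>>>()
            .unwrap_or_else(|| vec![vec![]; pages.len()]);
        assert_eq!(old, vec![vec![], vec![], vec![]]);

        // new behavior: substitute an empty byte array only where stats are
        // missing; pages with statistics keep their real min bytes
        let new: Vec<Vec<u8>> = pages
            .iter()
            .map(|x| x.as_deref().unwrap_or(&[]).to_vec())
            .collect::<Vec<_>>();
        assert_eq!(new, vec![vec![1], vec![], vec![3]]);
    }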
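
A note on the writer property tuning in the writer.rs test above (the figures follow
from the diff itself; the remark about batch boundaries is an assumption based on
`WriterProperties`' documented best-effort page limits): the 24-value array with
`set_max_row_group_size(4)` produces 6 row groups, and `set_data_page_row_count_limit(2)`
splits each 4-row group into 2 pages, giving the 12 pages across 6 row groups / 1 column
that the `vec![2; 6]` assertion checks. Setting `set_write_batch_size(2)` matters because
the page row-count limit is only checked between write batches. A worked sketch of that
arithmetic (hypothetical helper values, not code from the patch):

    fn main() {
        let total_rows = 24;    // length of the Int32Array built in the test
        let rows_per_group = 4; // set_max_row_group_size(4)
        let rows_per_page = 2;  // set_data_page_row_count_limit(2)
        let row_groups = total_rows / rows_per_group;          // 6 row groups
        let pages_per_group = rows_per_group / rows_per_page;  // 2 pages each
        assert_eq!((row_groups, pages_per_group), (6, 2));     // matches vec![2; 6]
    }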