Skip to content

Commit

Permalink
Added tests for parquet files with empty row group in middle
Browse files Browse the repository at this point in the history
  • Loading branch information
malhotrashivam committed Oct 9, 2024
1 parent 54314b7 commit da5259c
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -558,33 +558,6 @@ public void testParquetZstdCompressionCodec() {
compressionCodecTestHelper(ParquetTools.ZSTD);
}

/**
 * Checks that parquet data containing empty row groups can be read, first from a single reference file and then
 * from a directory holding three parquet files.
 */
@Test
public void testReadingParquetDataWithEmptyRowgroups() {
    // Scenario 1: a single parquet file with an empty row group; the resulting table must have no rows.
    {
        final String emptyFilePath =
                TestParquetTools.class.getResource("/ReferenceParquetWithEmptyRowGroup.parquet").getFile();
        final Table loaded = readTable(
                emptyFilePath,
                EMPTY.withLayout(ParquetInstructions.ParquetFileLayout.SINGLE_FILE))
                .select();
        assertEquals(0, loaded.size());
        assertTrue(loaded.getRowSet().isEmpty());
    }

    // Scenario 2: a parquet dataset with three files. The first and third files have three row groups each (two
    // non-empty followed by an empty one); the second file has just one empty row group.
    {
        final String datasetDir = TestParquetTools.class.getResource("/datasetWithEmptyRowgroups").getFile();

        // Each file is checked individually before the directory is read as a whole.
        assertFalse(readTable(datasetDir + "/parquet_sample1.gz.parquet").isEmpty());
        assertTrue(readTable(datasetDir + "/parquet_sample2.gz.parquet").isEmpty());
        assertFalse(readTable(datasetDir + "/parquet_sample3.gz.parquet").isEmpty());

        final Table combined = readTable(datasetDir).select();
        assertEquals(2_138_182, combined.size());
        assertEquals(4, combined.numColumns());
        assertEquals(1_068_950, combined.selectDistinct("price").size());
    }
}

@Test
public void testParquetGzipCompressionCodec() {
compressionCodecTestHelper(ParquetTools.GZIP);
Expand Down Expand Up @@ -1762,6 +1735,47 @@ public void testAllNonPartitioningColumnTypes() {
}
}

/**
 * Checks that parquet data containing empty row groups can be read in three situations: a single file with an
 * empty row group, a single file with an empty row group between two non-empty ones, and a directory holding
 * three parquet files.
 */
@Test
public void testReadingParquetDataWithEmptyRowGroups() {
    // Scenario 1: a single parquet file with an empty row group; the resulting table must have no rows.
    {
        final String emptyFilePath =
                TestParquetTools.class.getResource("/ReferenceParquetWithEmptyRowGroup1.parquet").getFile();
        final Table loaded = readTable(
                emptyFilePath,
                EMPTY.withLayout(ParquetInstructions.ParquetFileLayout.SINGLE_FILE))
                .select();
        assertEquals(0, loaded.size());
        assertTrue(loaded.getRowSet().isEmpty());
    }

    // Scenario 2: a single parquet file with three row groups, where the first and third are non-empty and the
    // second is empty. The reference file was generated using the following branch:
    // https://github.com/malhotrashivam/deephaven-core/tree/sm-ref-branch
    {
        final String mixedFilePath =
                TestParquetTools.class.getResource("/ReferenceParquetWithEmptyRowGroup2.parquet").getFile();
        final Table loaded = readTable(
                mixedFilePath,
                EMPTY.withLayout(ParquetInstructions.ParquetFileLayout.SINGLE_FILE))
                .select();
        assertEquals(20, loaded.size());

        // The expected content is the same ten-row table stacked twice.
        final Table tenRows = TableTools.emptyTable(10).update("integers = (int)(ii%3)");
        final Table stackedTwice = merge(tenRows, tenRows);
        assertTableEquals(stackedTwice, loaded);
    }

    // Scenario 3: a parquet dataset with three files. The first and third files have three row groups each (two
    // non-empty followed by an empty one); the second file has just one empty row group.
    {
        final String datasetDir = TestParquetTools.class.getResource("/datasetWithEmptyRowgroups").getFile();

        // Each file is checked individually before the directory is read as a whole.
        assertFalse(readTable(datasetDir + "/file1.parquet").isEmpty());
        assertTrue(readTable(datasetDir + "/file2.parquet").isEmpty());
        assertFalse(readTable(datasetDir + "/file3.parquet").isEmpty());

        final Table combined = readTable(datasetDir).select();
        assertEquals(2_138_182, combined.size());
        assertEquals(4, combined.numColumns());
        assertEquals(1_068_950, combined.selectDistinct("price").size());
    }
}

@Test
public void decimalLogicalTypeTest() {
final Table expected = TableTools.emptyTable(100_000).update(
Expand Down
Git LFS file not shown

0 comments on commit da5259c

Please sign in to comment.