From da5259cd6cce4f3bc76598a1167053e63c4dc23d Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 9 Oct 2024 11:06:35 -0500 Subject: [PATCH] Added tests for parquet files with empty row group in middle --- .../table/ParquetTableReadWriteTest.java | 68 +++++++++++-------- ...eferenceParquetWithEmptyRowGroup1.parquet} | 0 ...ReferenceParquetWithEmptyRowGroup2.parquet | 3 + ...rquet_sample1.gz.parquet => file1.parquet} | 0 ...rquet_sample2.gz.parquet => file2.parquet} | 0 ...rquet_sample3.gz.parquet => file3.parquet} | 0 6 files changed, 44 insertions(+), 27 deletions(-) rename extensions/parquet/table/src/test/resources/{ReferenceParquetWithEmptyRowGroup.parquet => ReferenceParquetWithEmptyRowGroup1.parquet} (100%) create mode 100644 extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup2.parquet rename extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/{parquet_sample1.gz.parquet => file1.parquet} (100%) rename extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/{parquet_sample2.gz.parquet => file2.parquet} (100%) rename extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/{parquet_sample3.gz.parquet => file3.parquet} (100%) diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index ad7b278d919..065f708e5b3 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -558,33 +558,6 @@ public void testParquetZstdCompressionCodec() { compressionCodecTestHelper(ParquetTools.ZSTD); } - @Test - public void testReadingParquetDataWithEmptyRowgroups() { - { - // Single parquet file with empty row group - final String path = - TestParquetTools.class.getResource("/ReferenceParquetWithEmptyRowGroup.parquet").getFile(); - final Table table = - readTable(path, EMPTY.withLayout(ParquetInstructions.ParquetFileLayout.SINGLE_FILE)).select(); - assertEquals(0, table.size()); - assertTrue(table.getRowSet().isEmpty()); - } - - { - // Parquet dataset with three files, first and third file have three row groups, two non-empty followed by - // an empty row group, and second file has just one empty row group. - final String dirPath = TestParquetTools.class.getResource("/datasetWithEmptyRowgroups").getFile(); - assertFalse(readTable(dirPath + "/parquet_sample1.gz.parquet").isEmpty()); - assertTrue(readTable(dirPath + "/parquet_sample2.gz.parquet").isEmpty()); - assertFalse(readTable(dirPath + "/parquet_sample3.gz.parquet").isEmpty()); - - final Table table = readTable(dirPath).select(); - assertEquals(2138182, table.size()); - assertEquals(4, table.numColumns()); - assertEquals(1068950, table.selectDistinct("price").size()); - } - } - @Test public void testParquetGzipCompressionCodec() { compressionCodecTestHelper(ParquetTools.GZIP); @@ -1762,6 +1735,47 @@ public void testAllNonPartitioningColumnTypes() { } } + @Test + public void testReadingParquetDataWithEmptyRowGroups() { + { + // Single parquet file with empty row group + final String path = + TestParquetTools.class.getResource("/ReferenceParquetWithEmptyRowGroup1.parquet").getFile(); + final Table fromDisk = + readTable(path, EMPTY.withLayout(ParquetInstructions.ParquetFileLayout.SINGLE_FILE)).select(); + assertEquals(0, fromDisk.size()); + assertTrue(fromDisk.getRowSet().isEmpty()); + } + + { + // Single parquet file with three row groups, first and third row group are non-empty, and second row group + // is empty. To generate this file, the following branch was used: + // https://github.com/malhotrashivam/deephaven-core/tree/sm-ref-branch + final String path = + TestParquetTools.class.getResource("/ReferenceParquetWithEmptyRowGroup2.parquet").getFile(); + final Table fromDisk = + readTable(path, EMPTY.withLayout(ParquetInstructions.ParquetFileLayout.SINGLE_FILE)).select(); + assertEquals(20, fromDisk.size()); + final Table table = TableTools.emptyTable(10).update("integers = (int)(ii%3)"); + final Table expected = merge(table, table); + assertTableEquals(expected, fromDisk); + } + + { + // Parquet dataset with three files, first and third file have three row groups, two non-empty followed by + // an empty row group, and second file has just one empty row group. + final String dirPath = TestParquetTools.class.getResource("/datasetWithEmptyRowgroups").getFile(); + assertFalse(readTable(dirPath + "/file1.parquet").isEmpty()); + assertTrue(readTable(dirPath + "/file2.parquet").isEmpty()); + assertFalse(readTable(dirPath + "/file3.parquet").isEmpty()); + + final Table table = readTable(dirPath).select(); + assertEquals(2138182, table.size()); + assertEquals(4, table.numColumns()); + assertEquals(1068950, table.selectDistinct("price").size()); + } + } + @Test public void decimalLogicalTypeTest() { final Table expected = TableTools.emptyTable(100_000).update( diff --git a/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup.parquet b/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup1.parquet similarity index 100% rename from extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup.parquet rename to extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup1.parquet diff --git a/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup2.parquet b/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup2.parquet new file mode 100644 index 00000000000..2ce0097288d --- /dev/null +++ b/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d13c8bb7cb58dba15328674290eb12ae559aae5d175dd33f9a769137acb9f31 +size 489 diff --git a/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/parquet_sample1.gz.parquet b/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file1.parquet similarity index 100% rename from extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/parquet_sample1.gz.parquet rename to extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file1.parquet diff --git a/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/parquet_sample2.gz.parquet b/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file2.parquet similarity index 100% rename from extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/parquet_sample2.gz.parquet rename to extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file2.parquet diff --git a/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/parquet_sample3.gz.parquet b/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file3.parquet similarity index 100% rename from extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/parquet_sample3.gz.parquet rename to extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file3.parquet