From 5483dbcfc17f0ea9eb0b7d67712d5e4a53be5a86 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Thu, 11 Jul 2024 17:35:13 -0500 Subject: [PATCH] fix: For NPE when sorting dictionary-encoded null string columns (#5758) --- .../engine/table/impl/SortHelpers.java | 5 + .../regioned/ColumnRegionChunkDictionary.java | 5 +- .../table/ParquetTableReadWriteTest.java | 123 ++++++++++++++++++ .../ReferenceNullStringDictEncoded1.parquet | 3 + .../ReferenceNullStringDictEncoded2.parquet | 3 + 5 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 extensions/parquet/table/src/test/resources/ReferenceNullStringDictEncoded1.parquet create mode 100644 extensions/parquet/table/src/test/resources/ReferenceNullStringDictEncoded2.parquet diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/SortHelpers.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/SortHelpers.java index 76d0c398e2d..4a7b67b077a 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/SortHelpers.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/SortHelpers.java @@ -359,6 +359,11 @@ private static SortMapping doSymbolTableMapping(SortingOrder order, ColumnSource final ColumnSource reinterpreted = columnSource.reinterpret(long.class); final Table symbolTable = ((SymbolTableSource) columnSource).getStaticSymbolTable(rowSet, true); + if (symbolTable.isEmpty()) { + // All nulls, so we can just return the row set as the sort mapping + return new IndexedSortMapping(rowSet.size(), new long[] {rowSet.size()}, new RowSet[] {rowSet}); + } + if (symbolTable.size() >= sortSize) { // the very first thing we will do is sort the symbol table, using a regular sort; if it is larger than the // actual table we care to sort, then it is wasteful to use the symbol table sorting diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/ColumnRegionChunkDictionary.java 
b/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/ColumnRegionChunkDictionary.java index c56c58097fd..74a747f4e18 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/ColumnRegionChunkDictionary.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/ColumnRegionChunkDictionary.java @@ -86,7 +86,10 @@ public boolean gatherDictionaryValuesRowSet( @NotNull final RowSequence.Iterator knownKeys, @NotNull final RowSetBuilderSequential sequentialBuilder) { final long dictSize = getDictionaryChunk().size(); - + if (dictSize == 0) { + advanceToNextPage(knownKeys); + return advanceToNextPage(keysToVisit); + } final long pageFirstKey = firstRow(keysToVisit.currentValue()); final long pageLastKey = pageFirstKey + dictSize - 1; diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index ee9ab2b081e..1766da71b26 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -79,6 +79,8 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.net.URI; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; import java.time.Instant; import java.time.LocalDate; import java.time.LocalTime; @@ -1565,6 +1567,127 @@ public void stringDictionaryTest() { } } + /** + * Test if we can read/sort a parquet file with a null string column encoded as a dictionary. + */ + @Test + public void nullStringDictEncodingTest() { + final int NUM_ROWS = 2048; + { + // The following file has a null string column encoded as a dictionary. We should be able to read/sort + // it without any exceptions. 
+ final String path = + ParquetTableReadWriteTest.class.getResource("/ReferenceNullStringDictEncoded1.parquet").getFile(); + final Table expected = TableTools.emptyTable(NUM_ROWS).update("someLong = i", + "someString = (String)null"); + nullStringDictEncodingTestHelper(path, expected); + } + + { + // The following file has a string column with all values null except for the last one encoded as a + // dictionary. We should be able to read/sort it without any exceptions. + final String path = + ParquetTableReadWriteTest.class.getResource("/ReferenceNullStringDictEncoded2.parquet").getFile(); + final Table expected = TableTools.emptyTable(NUM_ROWS).update("someLong = i", + "someString = i == 2047 ? `Hello` : (String)null"); + nullStringDictEncodingTestHelper(path, expected); + } + } + + private static void nullStringDictEncodingTestHelper(final String path, final Table expected) { + // Verify that the column uses dictionary encoding. + final ParquetMetadata metadata = + new ParquetTableLocationKey(new File(path).toURI(), 0, null, ParquetInstructions.EMPTY) + .getMetadata(); + final String strColumnMetadata = metadata.getBlocks().get(0).getColumns().get(1).toString(); + assertTrue(strColumnMetadata.contains("someString") && strColumnMetadata.contains("RLE_DICTIONARY")); + + // Sort and compare + assertTableEquals(expected, readTable(path).sort("someString")); + assertTableEquals(expected.sortDescending("someString"), + readTable(path).sortDescending("someString")); + } + + /** + * Extension of {@link #nullStringDictEncodingTest()} test, for testing if we can read/sort a table with regions + * having null string columns encoded as a dictionary. + */ + @Test + public void nullRegionDictionaryEncodingTest() throws IOException { + // The following file has a null string column encoded as a dictionary. 
+ final File allNullsFile = new File( + ParquetTableReadWriteTest.class.getResource("/ReferenceNullStringDictEncoded1.parquet").getFile()); + final Table allNullsTable = readTable(allNullsFile.toString()).select(); + final File allButOneNullFile = new File( + ParquetTableReadWriteTest.class.getResource("/ReferenceNullStringDictEncoded2.parquet").getFile()); + final Table allButOneNullTable = readTable(allButOneNullFile.toString()).select(); + final File testDir = new File(rootFile, "nullRegionDictionaryEncodingTest"); + final File firstRegion = new File(testDir, "Region1.parquet"); + final File secondRegion = new File(testDir, "Region2.parquet"); + final File thirdRegion = new File(testDir, "Region3.parquet"); + final File fourthRegion = new File(testDir, "Region4.parquet"); + + // The first test is for a table where all three regions have null values for the string column. + nullRegionDictionaryEncodingTestHelper( + allNullsFile, allNullsFile, allNullsFile, + allNullsTable, allNullsTable, allNullsTable, + firstRegion, secondRegion, thirdRegion); + + // The next test is for a table where the first-region has non-null values and the second and third have null + // values for the string column. + nullRegionDictionaryEncodingTestHelper( + allButOneNullFile, allNullsFile, allNullsFile, + allButOneNullTable, allNullsTable, allNullsTable, + firstRegion, secondRegion, thirdRegion); + + // The next test is for a table where the second-region has non-null values, and the first and third have null + // values for the string column. + nullRegionDictionaryEncodingTestHelper( + allNullsFile, allButOneNullFile, allNullsFile, + allNullsTable, allButOneNullTable, allNullsTable, + firstRegion, secondRegion, thirdRegion); + + // The next test is for a table where the third-region has non-null values, and the first two have null values + // for the string column. 
+ nullRegionDictionaryEncodingTestHelper( + allNullsFile, allNullsFile, allButOneNullFile, + allNullsTable, allNullsTable, allButOneNullTable, + firstRegion, secondRegion, thirdRegion); + + { + // The next test is for a table where the first and third regions have null values and the second and + // fourth regions have non-null values for the string column. + testDir.mkdir(); + Files.copy(allNullsFile.toPath(), firstRegion.toPath(), StandardCopyOption.REPLACE_EXISTING); + Files.copy(allButOneNullFile.toPath(), secondRegion.toPath(), StandardCopyOption.REPLACE_EXISTING); + Files.copy(allNullsFile.toPath(), thirdRegion.toPath(), StandardCopyOption.REPLACE_EXISTING); + Files.copy(allButOneNullFile.toPath(), fourthRegion.toPath(), StandardCopyOption.REPLACE_EXISTING); + final Table expected = merge(allNullsTable, allButOneNullTable, allNullsTable, allButOneNullTable); + assertTableEquals(expected.sort("someString"), + readTable(testDir.toString()).sort("someString")); + assertTableEquals(expected.sortDescending("someString"), + readTable(testDir.toString()).sortDescending("someString")); + FileUtils.deleteRecursively(testDir); + } + } + + private static void nullRegionDictionaryEncodingTestHelper( + final File firstSource, final File secondSource, final File thirdSource, + final Table firstTable, final Table secondTable, final Table thirdTable, + final File firstRegion, final File secondRegion, final File thirdRegion) throws IOException { + final File testDir = new File(rootFile, "nullRegionDictionaryEncodingTest"); + testDir.mkdir(); + Files.copy(firstSource.toPath(), firstRegion.toPath(), StandardCopyOption.REPLACE_EXISTING); + Files.copy(secondSource.toPath(), secondRegion.toPath(), StandardCopyOption.REPLACE_EXISTING); + Files.copy(thirdSource.toPath(), thirdRegion.toPath(), StandardCopyOption.REPLACE_EXISTING); + final Table expected = merge(firstTable, secondTable, thirdTable); + assertTableEquals(expected.sort("someString"), + readTable(testDir.toString()).sort("someString")); 
+ assertTableEquals(expected.sortDescending("someString"), + readTable(testDir.toString()).sortDescending("someString")); + FileUtils.deleteRecursively(testDir); + } + /** * Encoding bigDecimal is tricky -- the writer will try to pick the precision and scale automatically. Because of * that tableTools.assertTableEquals will fail because, even though the numbers are identical, the representation diff --git a/extensions/parquet/table/src/test/resources/ReferenceNullStringDictEncoded1.parquet b/extensions/parquet/table/src/test/resources/ReferenceNullStringDictEncoded1.parquet new file mode 100644 index 00000000000..e01037545de --- /dev/null +++ b/extensions/parquet/table/src/test/resources/ReferenceNullStringDictEncoded1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f736ad4bce9fb4e6433ed6a4bb58b5a1a2b1b967383e6bb630340c705e0a7612 +size 8734 diff --git a/extensions/parquet/table/src/test/resources/ReferenceNullStringDictEncoded2.parquet b/extensions/parquet/table/src/test/resources/ReferenceNullStringDictEncoded2.parquet new file mode 100644 index 00000000000..eb40644f2b7 --- /dev/null +++ b/extensions/parquet/table/src/test/resources/ReferenceNullStringDictEncoded2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5cf40486008b9b0dab8feffa3a2df40db17119431845e2e4dd7f5e24416dac +size 8777