ORC-1553: Row grouping reads should be skipped when the statistics are written without any values for the SArg column

### What changes were proposed in this pull request?
This PR fixes an issue where column statistics were evaluated incorrectly when no values had been written to the column, which prevented the reader from skipping the affected row groups.

### Why are the changes needed?
The fix refines the statistics evaluation logic so that row groups which don't need to be read are skipped, improving performance.
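
For context, here is a minimal sketch of the read path this optimization serves, assuming the standard ORC Java reader and Hive `SearchArgument` APIs: a caller pushes a predicate down to the reader, and row groups whose statistics show the predicate can never match are skipped. The file path is hypothetical, and the `x.z` column name merely mirrors the new unit test below.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class SArgSkipSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(
        new Path("/tmp/example.orc"),   // hypothetical file path
        OrcFile.readerOptions(conf));

    // Predicate on x.z: with this fix, row groups whose statistics were
    // written without any values for x.z evaluate to TruthValue.NO and
    // are skipped instead of being decoded.
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startAnd()
        .equals("x.z", PredicateLeaf.Type.FLOAT, 1.0)
        .end()
        .build();

    Reader.Options options = reader.options()
        .searchArgument(sarg, new String[]{"x.z"});

    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    try (RecordReader rows = reader.rows(options)) {
      while (rows.nextBatch(batch)) {
        // process batch.size rows ...
      }
    }
  }
}
```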

### How was this patch tested?
Unit tests have been added to validate the changes.

Closes #1692 from guiyanakuang/ORC-1553.

Authored-by: Yiqun Zhang <guiyanakuang@gmail.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
(cherry picked from commit 06ee13f)
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
guiyanakuang authored and dongjoon-hyun committed Dec 24, 2023
1 parent de89122 commit 11d8ad5
Showing 2 changed files with 63 additions and 1 deletion.
12 changes: 11 additions & 1 deletion java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -667,6 +667,14 @@ static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto,
                                            TypeDescription type,
                                            boolean writerUsedProlepticGregorian,
                                            boolean useUTCTimestamp) {
+
+    // When statsProto is EMPTY_COLUMN_STATISTICS
+    // this column does not actually provide statistics
+    // we cannot make any assumptions, so we return YES_NO_NULL.
+    if (statsProto == EMPTY_COLUMN_STATISTICS) {
+      return TruthValue.YES_NO_NULL;
+    }
+
     ColumnStatistics cs = ColumnStatisticsImpl.deserialize(
         null, statsProto, writerUsedProlepticGregorian, true);
     ValueRange range = getValueRange(cs, predicate, useUTCTimestamp);
@@ -748,8 +756,10 @@ static TruthValue evaluatePredicateRange(PredicateLeaf predicate,
                                            ValueRange range,
                                            BloomFilter bloomFilter,
                                            boolean useUTCTimestamp) {
+    // If range is invalid, that means that no value (including null) is written to this column
+    // we should return TruthValue.NO for any predicate.
     if (!range.isValid()) {
-      return TruthValue.YES_NO_NULL;
+      return TruthValue.NO;
     }
 
     // if we didn't have any values, everything must have been null
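
As an aside (not part of the diff): the reader skips a row group only when predicate evaluation says no row can match. Assuming `TruthValue.isNeeded()` in the Hive `SearchArgument` API behaves as in current releases, the behavior change looks like this:

```java
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;

public class TruthValueSketch {
  public static void main(String[] args) {
    // YES_NO_NULL: any outcome is still possible, so the row group
    // must be read. This was the old answer for unwritten columns.
    System.out.println(TruthValue.YES_NO_NULL.isNeeded()); // true

    // NO: no row in the group can match, so the reader may skip it.
    // This is the new answer when no value was ever written.
    System.out.println(TruthValue.NO.isNeeded()); // false
  }
}
```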
52 changes: 52 additions & 0 deletions java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -29,6 +29,7 @@
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
@@ -2475,6 +2476,57 @@ public void testWithoutStatistics() {
     assertEquals(TruthValue.YES_NO_NULL, truthValue);
   }
 
+  @Test
+  public void testStatisticsWithNoWrites() throws Exception {
+    Path testFilePath = new Path(workDir, "rowIndexStrideNegative.orc");
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+    fs.delete(testFilePath, true);
+
+    TypeDescription schema = TypeDescription.fromString("struct<x:struct<z:double>,y:int>");
+    Writer writer = OrcFile.createWriter(
+        testFilePath,
+        OrcFile.writerOptions(conf).setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    StructColumnVector structColumnVector = (StructColumnVector) batch.cols[0];
+    LongColumnVector longColumnVector = (LongColumnVector) batch.cols[1];
+    structColumnVector.ensureSize(1024, false);
+    structColumnVector.noNulls = false;
+    for (int i = 0; i < 1024; i++) {
+      structColumnVector.isNull[i] = true;
+      longColumnVector.vector[i] = i;
+    }
+    batch.size = 1024;
+    writer.addRowBatch(batch);
+    batch.reset();
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+
+    try (RecordReader rr = reader.rows()) {
+      RecordReaderImpl rri = (RecordReaderImpl) rr;
+      // x.z id is 2, We just need to read this column
+      OrcIndex orcIndex = rri.readRowIndex(0,
+          new boolean[] { false, false, true, false },
+          new boolean[] { false, false, true, false });
+      OrcProto.RowIndex[] rowGroupIndex = orcIndex.getRowGroupIndex();
+      OrcProto.ColumnStatistics statistics = rowGroupIndex[2].getEntry(0).getStatistics();
+      OrcProto.ColumnEncoding encoding = OrcProto.ColumnEncoding.newBuilder()
+          .setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
+          .build();
+      PredicateLeaf pred = createPredicateLeaf(
+          PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x.z", 1.0, null);
+
+      TruthValue truthValue = RecordReaderImpl.evaluatePredicateProto(
+          statistics,
+          pred, null, encoding, null,
+          CURRENT_WRITER, TypeDescription.createInt());
+
+      assertEquals(TruthValue.NO, truthValue);
+    }
+  }
+
   @Test
   public void testDoubleColumnWithoutDoubleStatistics() throws Exception {
     // orc-file-no-double-statistic.orc is an orc file created by cudf with a schema of
