ORC-1553: Row grouping reads should be skipped when the statistics are written without any values for the SArg column

### What changes were proposed in this pull request?
This PR fixes an issue where column statistics were evaluated incorrectly when no values had been written to the column, which prevented the reader from skipping the affected row groups.

### Why are the changes needed?
The fix refines the statistics evaluation logic so that row groups which don't need to be read are skipped, improving performance.
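
For context, here is a minimal sketch of the read path this optimization serves, assuming the standard ORC Java reader and Hive `SearchArgument` APIs: a caller pushes a predicate down to the reader, and row groups whose statistics show the predicate can never match are skipped. The file path is hypothetical, and the `x.z` column name merely mirrors the new unit test below.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class SArgSkipSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(
        new Path("/tmp/example.orc"),   // hypothetical file path
        OrcFile.readerOptions(conf));

    // Predicate on x.z: with this fix, row groups whose statistics were
    // written without any values for x.z evaluate to TruthValue.NO and
    // are skipped instead of being decoded.
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startAnd()
        .equals("x.z", PredicateLeaf.Type.FLOAT, 1.0)
        .end()
        .build();

    Reader.Options options = reader.options()
        .searchArgument(sarg, new String[]{"x.z"});

    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    try (RecordReader rows = reader.rows(options)) {
      while (rows.nextBatch(batch)) {
        // process batch.size rows ...
      }
    }
  }
}
```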

### How was this patch tested?
Unit tests have been added to validate the changes.

Closes #1692 from guiyanakuang/ORC-1553.

Authored-by: Yiqun Zhang <guiyanakuang@gmail.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
(cherry picked from commit 06ee13f)
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
guiyanakuang authored and dongjoon-hyun committed Dec 24, 2023
1 parent de89122 commit 11d8ad5
Showing 2 changed files with 63 additions and 1 deletion.
12 changes: 11 additions & 1 deletion java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -667,6 +667,14 @@ static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto,
                                            TypeDescription type,
                                            boolean writerUsedProlepticGregorian,
                                            boolean useUTCTimestamp) {
+
+    // When statsProto is EMPTY_COLUMN_STATISTICS
+    // this column does not actually provide statistics
+    // we cannot make any assumptions, so we return YES_NO_NULL.
+    if (statsProto == EMPTY_COLUMN_STATISTICS) {
+      return TruthValue.YES_NO_NULL;
+    }
+
     ColumnStatistics cs = ColumnStatisticsImpl.deserialize(
         null, statsProto, writerUsedProlepticGregorian, true);
     ValueRange range = getValueRange(cs, predicate, useUTCTimestamp);
@@ -748,8 +756,10 @@ static TruthValue evaluatePredicateRange(PredicateLeaf predicate,
                                            ValueRange range,
                                            BloomFilter bloomFilter,
                                            boolean useUTCTimestamp) {
+    // If range is invalid, that means that no value (including null) is written to this column
+    // we should return TruthValue.NO for any predicate.
     if (!range.isValid()) {
-      return TruthValue.YES_NO_NULL;
+      return TruthValue.NO;
     }
 
     // if we didn't have any values, everything must have been null
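
As an aside (not part of the diff): the reader skips a row group only when predicate evaluation says no row can match. Assuming `TruthValue.isNeeded()` in the Hive `SearchArgument` API behaves as in current releases, the behavior change looks like this:

```java
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;

public class TruthValueSketch {
  public static void main(String[] args) {
    // YES_NO_NULL: any outcome is still possible, so the row group
    // must be read. This was the old answer for unwritten columns.
    System.out.println(TruthValue.YES_NO_NULL.isNeeded()); // true

    // NO: no row in the group can match, so the reader may skip it.
    // This is the new answer when no value was ever written.
    System.out.println(TruthValue.NO.isNeeded()); // false
  }
}
```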
52 changes: 52 additions & 0 deletions java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -29,6 +29,7 @@
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
@@ -2475,6 +2476,57 @@ public void testWithoutStatistics() {
     assertEquals(TruthValue.YES_NO_NULL, truthValue);
   }
 
+  @Test
+  public void testStatisticsWithNoWrites() throws Exception {
+    Path testFilePath = new Path(workDir, "rowIndexStrideNegative.orc");
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+    fs.delete(testFilePath, true);
+
+    TypeDescription schema = TypeDescription.fromString("struct<x:struct<z:double>,y:int>");
+    Writer writer = OrcFile.createWriter(
+        testFilePath,
+        OrcFile.writerOptions(conf).setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    StructColumnVector structColumnVector = (StructColumnVector) batch.cols[0];
+    LongColumnVector longColumnVector = (LongColumnVector) batch.cols[1];
+    structColumnVector.ensureSize(1024, false);
+    structColumnVector.noNulls = false;
+    for (int i = 0; i < 1024; i++) {
+      structColumnVector.isNull[i] = true;
+      longColumnVector.vector[i] = i;
+    }
+    batch.size = 1024;
+    writer.addRowBatch(batch);
+    batch.reset();
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+
+    try (RecordReader rr = reader.rows()) {
+      RecordReaderImpl rri = (RecordReaderImpl) rr;
+      // x.z id is 2, We just need to read this column
+      OrcIndex orcIndex = rri.readRowIndex(0,
+          new boolean[] { false, false, true, false },
+          new boolean[] { false, false, true, false });
+      OrcProto.RowIndex[] rowGroupIndex = orcIndex.getRowGroupIndex();
+      OrcProto.ColumnStatistics statistics = rowGroupIndex[2].getEntry(0).getStatistics();
+      OrcProto.ColumnEncoding encoding = OrcProto.ColumnEncoding.newBuilder()
+          .setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
+          .build();
+      PredicateLeaf pred = createPredicateLeaf(
+          PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x.z", 1.0, null);
+
+      TruthValue truthValue = RecordReaderImpl.evaluatePredicateProto(
+          statistics,
+          pred, null, encoding, null,
+          CURRENT_WRITER, TypeDescription.createInt());
+
+      assertEquals(TruthValue.NO, truthValue);
+    }
+  }
+
   @Test
   public void testDoubleColumnWithoutDoubleStatistics() throws Exception {
     // orc-file-no-double-statistic.orc is an orc file created by cudf with a schema of
