From b3016d779e79d71ab70a283fb292390a5ae37100 Mon Sep 17 00:00:00 2001 From: Yiqun Zhang Date: Thu, 24 Aug 2023 20:11:54 -0700 Subject: [PATCH] ORC-1482: Adaptation to read ORC files created by CUDF This PR adapts the reader to handle ORC files created by CUDF, which may be missing statistics for their DOUBLE/FLOAT columns. Official ORC readers currently cannot read CUDF-created ORC files properly. A unit test was added. Closes #1598 from guiyanakuang/ORC-1482-to-1.8. Authored-by: Yiqun Zhang Signed-off-by: Dongjoon Hyun --- .../org/apache/orc/impl/RecordReaderImpl.java | 12 ++++++++++-- .../org/apache/orc/impl/TestRecordReaderImpl.java | 1 + .../resources/orc-file-no-double-statistic.orc | Bin 0 -> 161 bytes 3 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 java/core/src/test/resources/orc-file-no-double-statistic.orc diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java index 333cf5b9e1..48ba952606 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -650,8 +650,8 @@ static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto, " include ORC-517. Writer version: {}", predicate.getColumnName(), writerVersion); return TruthValue.YES_NO_NULL; - } else if (category == TypeDescription.Category.DOUBLE - || category == TypeDescription.Category.FLOAT) { + } else if ((category == TypeDescription.Category.DOUBLE || + category == TypeDescription.Category.FLOAT) && cs instanceof DoubleColumnStatistics) { DoubleColumnStatistics dstas = (DoubleColumnStatistics) cs; if (Double.isNaN(dstas.getSum())) { LOG.debug("Not using predication pushdown on {} because stats contain NaN values", @@ -1654,4 +1654,12 @@ public CompressionCodec getCompressionCodec() { public int getMaxDiskRangeChunkLimit() { return maxDiskRangeChunkLimit; } + + /** + * Get sargApplier for testing. 
+ * @return sargApplier in record reader. + */ + SargApplier getSargApp() { + return sargApp; + } } diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java index 85e502afbd..555954b080 100644 --- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java @@ -41,6 +41,7 @@ import org.apache.orc.CompressionCodec; import org.apache.orc.CompressionKind; import org.apache.orc.DataReader; +import org.apache.orc.DoubleColumnStatistics; import org.apache.orc.OrcConf; import org.apache.orc.OrcFile; import org.apache.orc.OrcProto; diff --git a/java/core/src/test/resources/orc-file-no-double-statistic.orc b/java/core/src/test/resources/orc-file-no-double-statistic.orc new file mode 100644 index 0000000000000000000000000000000000000000..9da6e42e326b43a1e2def90004ed5af2863327fc GIT binary patch literal 161 zcmW-XF$%&!5JmsYnsr=OSQi3AAP}|+*rc`+(kGbKi&zFv;9>1`l7Tm`_-|gu6A(C{ zHO1huUuHoC{6N6N1}{EAkGn}PW+c+eOlG&`8?~hpSjfN=5SOMV*jJVA+EWpUnW&*x dIf%~~=w`8j`)86Nv94<>jM$&wU+3m!tp6yC4Q~Jd literal 0 HcmV?d00001