Skip to content

Commit

Permalink
ORC-1577: Use ZSTD as the default compression
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR aims to use `ZSTD` as the default compression from Apache ORC 2.0.0.

### Why are the changes needed?

Apache ORC has supported ZStandard since version 1.6.0.

ZStandard is known to be better than Gzip in terms of both compressed size and speed.

- _The Rise of ZStandard: Apache Spark/Parquet/ORC/Avro_
    - [Slides](https://www.slideshare.net/databricks/the-rise-of-zstandard-apache-sparkparquetorcavro)
    - [Youtube](https://youtu.be/dTGxhHwjONY)

### How was this patch tested?

Pass the CIs.

Closes #1733 from dongjoon-hyun/ORC-1577.

Authored-by: Dongjoon Hyun <dongjoon@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
(cherry picked from commit baf4c23)
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
  • Loading branch information
dongjoon-hyun committed Jan 9, 2024
1 parent 73ea5b8 commit 40d1d26
Show file tree
Hide file tree
Showing 7 changed files with 11 additions and 10 deletions.
2 changes: 1 addition & 1 deletion c++/src/Writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ namespace orc {
stripeSize = 64 * 1024 * 1024; // 64M
compressionBlockSize = 64 * 1024; // 64K
rowIndexStride = 10000;
compression = CompressionKind_ZLIB;
compression = CompressionKind_ZSTD;
compressionStrategy = CompressionStrategy_SPEED;
memoryPool = getDefaultPool();
paddingTolerance = 0.0;
Expand Down
2 changes: 1 addition & 1 deletion java/core/src/java/org/apache/orc/OrcConf.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public enum OrcConf {
BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding",
true,
"Define whether stripes should be padded to the HDFS block boundaries."),
COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB",
COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZSTD",
"Define the default compression codec for ORC file"),
WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12",
"Define the version of the file to write. Possible values are 0.11 and\n"+
Expand Down
8 changes: 4 additions & 4 deletions java/core/src/test/org/apache/orc/TestVectorOrcFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ public void testStringAndBinaryStatistics(Version fileFormat) throws Exception {

assertEquals(3, stats[1].getNumberOfValues());
assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
assertEquals("count: 3 hasNull: true bytesOnDisk: 28 sum: 15", stats[1].toString());
assertEquals("count: 3 hasNull: true bytesOnDisk: 30 sum: 15", stats[1].toString());

assertEquals(3, stats[2].getNumberOfValues());
assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
Expand Down Expand Up @@ -1255,7 +1255,7 @@ public void test1(Version fileFormat) throws Exception {
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
assertEquals("count: 2 hasNull: false bytesOnDisk: 15 min: -15.0 max: -5.0 sum: -20.0",
assertEquals("count: 2 hasNull: false bytesOnDisk: 19 min: -15.0 max: -5.0 sum: -20.0",
stats[7].toString());

assertEquals("count: 2 hasNull: false bytesOnDisk: " +
Expand Down Expand Up @@ -3961,7 +3961,7 @@ public void testEncryptMerge(Version fileFormat) throws Exception {
// test reading with no keys
Reader reader = OrcFile.createReader(merge1, OrcFile.readerOptions(conf));
assertEquals(9 * 1024, reader.getNumberOfRows());
assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
assertEquals(1000, reader.getRowIndexStride());
assertEquals(0xc00, reader.getCompressionSize());
assertEquals(fileFormat, reader.getFileVersion());
Expand Down Expand Up @@ -4107,7 +4107,7 @@ public void testEncryptMerge(Version fileFormat) throws Exception {

reader = OrcFile.createReader(merge2, OrcFile.readerOptions(conf));
assertEquals(2 * 3 * 1024, reader.getNumberOfRows());
assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
assertEquals(0x800, reader.getCompressionSize());
assertEquals(1000, reader.getRowIndexStride());
assertEquals(fileFormat, reader.getFileVersion());
Expand Down
1 change: 1 addition & 0 deletions java/tools/src/test/org/apache/orc/tools/TestFileDump.java
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,7 @@ public void testHasNull() throws Exception {
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.setSchema(schema)
.compress(CompressionKind.ZLIB)
.rowIndexStride(1000)
.stripeSize(10000)
.bufferSize(10000));
Expand Down
4 changes: 2 additions & 2 deletions site/_docs/core-java-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ permalink: /docs/core-java-config.html
</tr>
<tr>
<td><code>orc.compress</code></td>
<td>ZLIB</td>
<td>ZSTD</td>
<td>
Define the default compression codec for ORC file
</td>
Expand Down Expand Up @@ -396,4 +396,4 @@ permalink: /docs/core-java-config.html
The maximum number of child elements to buffer before the ORC row writer writes the batch to the file.
</td>
</tr>
</table>
</table>
2 changes: 1 addition & 1 deletion site/_docs/hive-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ with the same options.

Key | Default | Notes
:----------------------- | :---------- | :------------------------
orc.compress | ZLIB | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
orc.compress | ZSTD | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
orc.compress.size | 262,144 | compression chunk size
orc.stripe.size | 67,108,864 | memory buffer in bytes for writing
orc.row.index.stride | 10,000 | number of rows between index entries
Expand Down
2 changes: 1 addition & 1 deletion site/_docs/spark-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ with the same options.

Key | Default | Notes
:----------------------- | :---------- | :------------------------
orc.compress | ZLIB | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
orc.compress | ZSTD | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
orc.compress.size | 262,144 | compression chunk size
orc.stripe.size | 67,108,864 | memory buffer in bytes for writing
orc.row.index.stride | 10,000 | number of rows between index entries
Expand Down

0 comments on commit 40d1d26

Please sign in to comment.