Skip to content

Commit

Permalink
ORC-1577: Use ZSTD as the default compression
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR aims to use `ZSTD` as the default compression from Apache ORC 2.0.0.

### Why are the changes needed?

Apache ORC has supported ZStandard since version 1.6.0.

ZStandard is known to be better than Gzip in terms of both compressed size and speed.

- _The Rise of ZStandard: Apache Spark/Parquet/ORC/Avro_
    - [Slides](https://www.slideshare.net/databricks/the-rise-of-zstandard-apache-sparkparquetorcavro)
    - [Youtube](https://youtu.be/dTGxhHwjONY)

### How was this patch tested?

Pass the CIs.

Closes #1733 from dongjoon-hyun/ORC-1577.

Authored-by: Dongjoon Hyun <dongjoon@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
(cherry picked from commit baf4c23)
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
  • Loading branch information
dongjoon-hyun committed Jan 9, 2024
1 parent 73ea5b8 commit 40d1d26
Show file tree
Hide file tree
Showing 7 changed files with 11 additions and 10 deletions.
2 changes: 1 addition & 1 deletion c++/src/Writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ namespace orc {
stripeSize = 64 * 1024 * 1024; // 64M
compressionBlockSize = 64 * 1024; // 64K
rowIndexStride = 10000;
compression = CompressionKind_ZLIB;
compression = CompressionKind_ZSTD;
compressionStrategy = CompressionStrategy_SPEED;
memoryPool = getDefaultPool();
paddingTolerance = 0.0;
Expand Down
2 changes: 1 addition & 1 deletion java/core/src/java/org/apache/orc/OrcConf.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public enum OrcConf {
BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding",
true,
"Define whether stripes should be padded to the HDFS block boundaries."),
COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB",
COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZSTD",
"Define the default compression codec for ORC file"),
WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12",
"Define the version of the file to write. Possible values are 0.11 and\n"+
Expand Down
8 changes: 4 additions & 4 deletions java/core/src/test/org/apache/orc/TestVectorOrcFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ public void testStringAndBinaryStatistics(Version fileFormat) throws Exception {

assertEquals(3, stats[1].getNumberOfValues());
assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
assertEquals("count: 3 hasNull: true bytesOnDisk: 28 sum: 15", stats[1].toString());
assertEquals("count: 3 hasNull: true bytesOnDisk: 30 sum: 15", stats[1].toString());

assertEquals(3, stats[2].getNumberOfValues());
assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
Expand Down Expand Up @@ -1255,7 +1255,7 @@ public void test1(Version fileFormat) throws Exception {
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
assertEquals("count: 2 hasNull: false bytesOnDisk: 15 min: -15.0 max: -5.0 sum: -20.0",
assertEquals("count: 2 hasNull: false bytesOnDisk: 19 min: -15.0 max: -5.0 sum: -20.0",
stats[7].toString());

assertEquals("count: 2 hasNull: false bytesOnDisk: " +
Expand Down Expand Up @@ -3961,7 +3961,7 @@ public void testEncryptMerge(Version fileFormat) throws Exception {
// test reading with no keys
Reader reader = OrcFile.createReader(merge1, OrcFile.readerOptions(conf));
assertEquals(9 * 1024, reader.getNumberOfRows());
assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
assertEquals(1000, reader.getRowIndexStride());
assertEquals(0xc00, reader.getCompressionSize());
assertEquals(fileFormat, reader.getFileVersion());
Expand Down Expand Up @@ -4107,7 +4107,7 @@ public void testEncryptMerge(Version fileFormat) throws Exception {

reader = OrcFile.createReader(merge2, OrcFile.readerOptions(conf));
assertEquals(2 * 3 * 1024, reader.getNumberOfRows());
assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
assertEquals(0x800, reader.getCompressionSize());
assertEquals(1000, reader.getRowIndexStride());
assertEquals(fileFormat, reader.getFileVersion());
Expand Down
1 change: 1 addition & 0 deletions java/tools/src/test/org/apache/orc/tools/TestFileDump.java
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,7 @@ public void testHasNull() throws Exception {
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.setSchema(schema)
.compress(CompressionKind.ZLIB)
.rowIndexStride(1000)
.stripeSize(10000)
.bufferSize(10000));
Expand Down
4 changes: 2 additions & 2 deletions site/_docs/core-java-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ permalink: /docs/core-java-config.html
</tr>
<tr>
<td><code>orc.compress</code></td>
<td>ZLIB</td>
<td>ZSTD</td>
<td>
Define the default compression codec for ORC file
</td>
Expand Down Expand Up @@ -396,4 +396,4 @@ permalink: /docs/core-java-config.html
The maximum number of child elements to buffer before the ORC row writer writes the batch to the file.
</td>
</tr>
</table>
</table>
2 changes: 1 addition & 1 deletion site/_docs/hive-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ with the same options.

Key | Default | Notes
:----------------------- | :---------- | :------------------------
orc.compress | ZLIB | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
orc.compress | ZSTD | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
orc.compress.size | 262,144 | compression chunk size
orc.stripe.size | 67,108,864 | memory buffer in bytes for writing
orc.row.index.stride | 10,000 | number of rows between index entries
Expand Down
2 changes: 1 addition & 1 deletion site/_docs/spark-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ with the same options.

Key | Default | Notes
:----------------------- | :---------- | :------------------------
orc.compress | ZLIB | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
orc.compress | ZSTD | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
orc.compress.size | 262,144 | compression chunk size
orc.stripe.size | 67,108,864 | memory buffer in bytes for writing
orc.row.index.stride | 10,000 | number of rows between index entries
Expand Down

0 comments on commit 40d1d26

Please sign in to comment.