From 739476cda88f0f4ca1b88d3669c373a7df001978 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun <dongjoon@apache.org>
Date: Mon, 8 Jan 2024 19:20:28 -0800
Subject: [PATCH] ORC-1578: Fix `SparkBenchmark` on `sales` data according to
 SPARK-40918
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What changes were proposed in this pull request?

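This PR aims to fix `SparkBenchmark` according to the requirement of SPARK-40918 by setting the `returning_batch` read option to `true` for the `orc` format in `fullRead`, `partialRead`, and `pushDown`. It also sets `spark.log.level` to `ERROR` for the benchmark's Spark session.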

Note that this fixes only the synthetic benchmark on `sales` data. The other, real-life datasets (`github` and `taxi`) will be revisited separately.

### Why are the changes needed?

1. Generate `Sales` data
```
$ java -jar core/target/orc-benchmarks-core-*-uber.jar generate data -f orc -d sales -s 1000000
```

2. Run Spark Benchmark
```
$ java -jar spark/target/orc-benchmarks-spark-2.1.0-SNAPSHOT.jar spark data -d sales -f orc
# Run complete. Total time: 00:10:45

Benchmark                                  (compression)  (dataset)  (format)  Mode  Cnt        Score       Error  Units
SparkBenchmark.fullRead                               gz      sales       orc  avgt    5   686792.235 ±  4398.971  us/op
SparkBenchmark.fullRead:bytesPerRecord                gz      sales       orc  avgt    5        0.192                  #
SparkBenchmark.fullRead:ops                           gz      sales       orc  avgt    5       40.000                  #
SparkBenchmark.fullRead:perRecord                     gz      sales       orc  avgt    5        0.687 ±     0.004  us/op
SparkBenchmark.fullRead:records                       gz      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.fullRead                           snappy      sales       orc  avgt    5   286166.380 ± 19864.429  us/op
SparkBenchmark.fullRead:bytesPerRecord            snappy      sales       orc  avgt    5        0.201                  #
SparkBenchmark.fullRead:ops                       snappy      sales       orc  avgt    5       40.000                  #
SparkBenchmark.fullRead:perRecord                 snappy      sales       orc  avgt    5        0.286 ±     0.020  us/op
SparkBenchmark.fullRead:records                   snappy      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.fullRead                             zstd      sales       orc  avgt    5   384394.233 ± 10057.315  us/op
SparkBenchmark.fullRead:bytesPerRecord              zstd      sales       orc  avgt    5        0.192                  #
SparkBenchmark.fullRead:ops                         zstd      sales       orc  avgt    5       40.000                  #
SparkBenchmark.fullRead:perRecord                   zstd      sales       orc  avgt    5        0.384 ±     0.010  us/op
SparkBenchmark.fullRead:records                     zstd      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.partialRead                            gz      sales       orc  avgt    5    41683.914 ±  4046.077  us/op
SparkBenchmark.partialRead:bytesPerRecord             gz      sales       orc  avgt    5        0.192                  #
SparkBenchmark.partialRead:ops                        gz      sales       orc  avgt    5       40.000                  #
SparkBenchmark.partialRead:perRecord                  gz      sales       orc  avgt    5        0.042 ±     0.004  us/op
SparkBenchmark.partialRead:records                    gz      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.partialRead                        snappy      sales       orc  avgt    5    23981.054 ± 17874.229  us/op
SparkBenchmark.partialRead:bytesPerRecord         snappy      sales       orc  avgt    5        0.201                  #
SparkBenchmark.partialRead:ops                    snappy      sales       orc  avgt    5       40.000                  #
SparkBenchmark.partialRead:perRecord              snappy      sales       orc  avgt    5        0.024 ±     0.018  us/op
SparkBenchmark.partialRead:records                snappy      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.partialRead                          zstd      sales       orc  avgt    5    41433.277 ± 25110.021  us/op
SparkBenchmark.partialRead:bytesPerRecord           zstd      sales       orc  avgt    5        0.192                  #
SparkBenchmark.partialRead:ops                      zstd      sales       orc  avgt    5       40.000                  #
SparkBenchmark.partialRead:perRecord                zstd      sales       orc  avgt    5        0.041 ±     0.025  us/op
SparkBenchmark.partialRead:records                  zstd      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.pushDown                               gz      sales       orc  avgt    5    23760.997 ±   833.034  us/op
SparkBenchmark.pushDown:bytesPerRecord                gz      sales       orc  avgt    5       19.153                  #
SparkBenchmark.pushDown:ops                           gz      sales       orc  avgt    5       40.000                  #
SparkBenchmark.pushDown:perRecord                     gz      sales       orc  avgt    5        2.376 ±     0.083  us/op
SparkBenchmark.pushDown:records                       gz      sales       orc  avgt    5    50000.000                  #
SparkBenchmark.pushDown                           snappy      sales       orc  avgt    5    14062.508 ±  1793.691  us/op
SparkBenchmark.pushDown:bytesPerRecord            snappy      sales       orc  avgt    5       20.105                  #
SparkBenchmark.pushDown:ops                       snappy      sales       orc  avgt    5       40.000                  #
SparkBenchmark.pushDown:perRecord                 snappy      sales       orc  avgt    5        1.406 ±     0.179  us/op
SparkBenchmark.pushDown:records                   snappy      sales       orc  avgt    5    50000.000                  #
SparkBenchmark.pushDown                             zstd      sales       orc  avgt    5    15597.651 ±  1307.246  us/op
SparkBenchmark.pushDown:bytesPerRecord              zstd      sales       orc  avgt    5       19.213                  #
SparkBenchmark.pushDown:ops                         zstd      sales       orc  avgt    5       40.000                  #
SparkBenchmark.pushDown:perRecord                   zstd      sales       orc  avgt    5        1.560 ±     0.131  us/op
SparkBenchmark.pushDown:records                     zstd      sales       orc  avgt    5    50000.000                  #
```

### How was this patch tested?

Pass the CIs.

Closes #1734 from dongjoon-hyun/ORC-1578.

Authored-by: Dongjoon Hyun <dongjoon@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
(cherry picked from commit fbe49d71e0e66903508a52952be1cb0ca9d09ac1)
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
---
 .../org/apache/orc/bench/spark/SparkBenchmark.java     | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java b/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java
index 4991c030c0..e8fcc6d351 100644
--- a/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java
+++ b/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java
@@ -122,6 +122,7 @@ public static class InputSource {
     public void setup() {
       session = SparkSession.builder().appName("benchmark")
           .config("spark.master", "local[4]")
+          .config("spark.log.level", "ERROR")
           .config("spark.sql.orc.filterPushdown", true)
           .config("spark.sql.orc.impl", "native")
           .getOrCreate();
@@ -189,6 +190,9 @@ public void fullRead(InputSource source,
       case "json":
         options.add(new Tuple2<>("timestampFormat", "yyyy-MM-dd HH:mm:ss.SSS"));
         break;
+      case "orc":
+        options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918
+        break;
       default:
         break;
     }
@@ -218,6 +222,9 @@ public void partialRead(InputSource source,
       case "json":
       case "avro":
         throw new IllegalArgumentException(source.format + " can't handle projection");
+      case "orc":
+        options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918
+        break;
       default:
         break;
     }
@@ -289,6 +296,9 @@ public void pushDown(InputSource source,
       case "json":
       case "avro":
         throw new IllegalArgumentException(source.format + " can't handle pushdown");
+      case "orc":
+        options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918
+        break;
       default:
         break;
     }