ORC-1578: Fix `SparkBenchmark` on `sales` data according to SPARK-40918 · apache/orc@739476c

Commit

ORC-1578: Fix SparkBenchmark on sales data according to SPARK-40918

### What changes were proposed in this pull request?

This PR aims to fix `SparkBenchmark` according to the requirement of SPARK-40918.

Note that this fixes only the synthetic benchmark on the `Sales` data. The other real-life datasets (`github` and `taxi`) will be revisited separately.

### Why are the changes needed?

1. Generate `Sales` data
```
$ java -jar core/target/orc-benchmarks-core-*-uber.jar generate data -f orc -d sales -s 1000000
```

2. Run Spark Benchmark
```
$ java -jar spark/target/orc-benchmarks-spark-2.1.0-SNAPSHOT.jar spark data -d sales -f orc
# Run complete. Total time: 00:10:45

Benchmark                                  (compression)  (dataset)  (format)  Mode  Cnt        Score       Error  Units
SparkBenchmark.fullRead                               gz      sales       orc  avgt    5   686792.235 ±  4398.971  us/op
SparkBenchmark.fullRead:bytesPerRecord                gz      sales       orc  avgt    5        0.192                  #
SparkBenchmark.fullRead:ops                           gz      sales       orc  avgt    5       40.000                  #
SparkBenchmark.fullRead:perRecord                     gz      sales       orc  avgt    5        0.687 ±     0.004  us/op
SparkBenchmark.fullRead:records                       gz      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.fullRead                           snappy      sales       orc  avgt    5   286166.380 ± 19864.429  us/op
SparkBenchmark.fullRead:bytesPerRecord            snappy      sales       orc  avgt    5        0.201                  #
SparkBenchmark.fullRead:ops                       snappy      sales       orc  avgt    5       40.000                  #
SparkBenchmark.fullRead:perRecord                 snappy      sales       orc  avgt    5        0.286 ±     0.020  us/op
SparkBenchmark.fullRead:records                   snappy      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.fullRead                             zstd      sales       orc  avgt    5   384394.233 ± 10057.315  us/op
SparkBenchmark.fullRead:bytesPerRecord              zstd      sales       orc  avgt    5        0.192                  #
SparkBenchmark.fullRead:ops                         zstd      sales       orc  avgt    5       40.000                  #
SparkBenchmark.fullRead:perRecord                   zstd      sales       orc  avgt    5        0.384 ±     0.010  us/op
SparkBenchmark.fullRead:records                     zstd      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.partialRead                            gz      sales       orc  avgt    5    41683.914 ±  4046.077  us/op
SparkBenchmark.partialRead:bytesPerRecord             gz      sales       orc  avgt    5        0.192                  #
SparkBenchmark.partialRead:ops                        gz      sales       orc  avgt    5       40.000                  #
SparkBenchmark.partialRead:perRecord                  gz      sales       orc  avgt    5        0.042 ±     0.004  us/op
SparkBenchmark.partialRead:records                    gz      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.partialRead                        snappy      sales       orc  avgt    5    23981.054 ± 17874.229  us/op
SparkBenchmark.partialRead:bytesPerRecord         snappy      sales       orc  avgt    5        0.201                  #
SparkBenchmark.partialRead:ops                    snappy      sales       orc  avgt    5       40.000                  #
SparkBenchmark.partialRead:perRecord              snappy      sales       orc  avgt    5        0.024 ±     0.018  us/op
SparkBenchmark.partialRead:records                snappy      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.partialRead                          zstd      sales       orc  avgt    5    41433.277 ± 25110.021  us/op
SparkBenchmark.partialRead:bytesPerRecord           zstd      sales       orc  avgt    5        0.192                  #
SparkBenchmark.partialRead:ops                      zstd      sales       orc  avgt    5       40.000                  #
SparkBenchmark.partialRead:perRecord                zstd      sales       orc  avgt    5        0.041 ±     0.025  us/op
SparkBenchmark.partialRead:records                  zstd      sales       orc  avgt    5  5000000.000                  #
SparkBenchmark.pushDown                               gz      sales       orc  avgt    5    23760.997 ±   833.034  us/op
SparkBenchmark.pushDown:bytesPerRecord                gz      sales       orc  avgt    5       19.153                  #
SparkBenchmark.pushDown:ops                           gz      sales       orc  avgt    5       40.000                  #
SparkBenchmark.pushDown:perRecord                     gz      sales       orc  avgt    5        2.376 ±     0.083  us/op
SparkBenchmark.pushDown:records                       gz      sales       orc  avgt    5    50000.000                  #
SparkBenchmark.pushDown                           snappy      sales       orc  avgt    5    14062.508 ±  1793.691  us/op
SparkBenchmark.pushDown:bytesPerRecord            snappy      sales       orc  avgt    5       20.105                  #
SparkBenchmark.pushDown:ops                       snappy      sales       orc  avgt    5       40.000                  #
SparkBenchmark.pushDown:perRecord                 snappy      sales       orc  avgt    5        1.406 ±     0.179  us/op
SparkBenchmark.pushDown:records                   snappy      sales       orc  avgt    5    50000.000                  #
SparkBenchmark.pushDown                             zstd      sales       orc  avgt    5    15597.651 ±  1307.246  us/op
SparkBenchmark.pushDown:bytesPerRecord              zstd      sales       orc  avgt    5       19.213                  #
SparkBenchmark.pushDown:ops                         zstd      sales       orc  avgt    5       40.000                  #
SparkBenchmark.pushDown:perRecord                   zstd      sales       orc  avgt    5        1.560 ±     0.131  us/op
SparkBenchmark.pushDown:records                     zstd      sales       orc  avgt    5    50000.000                  #
```

### How was this patch tested?

Pass the CIs.

Closes #1734 from dongjoon-hyun/ORC-1578.

Authored-by: Dongjoon Hyun <dongjoon@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
(cherry picked from commit fbe49d7)
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>

Loading branch information

dongjoon-hyun committed Jan 9, 2024

1 parent 645f41f commit 739476c

java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java

-Original file line number
+Diff line change
@@ -122,6 +122,7 @@ public static class InputSource {
         public void setup() {
           session = SparkSession.builder().appName("benchmark")
               .config("spark.master", "local[4]")
+              .config("spark.log.level", "ERROR")
               .config("spark.sql.orc.filterPushdown", true)
               .config("spark.sql.orc.impl", "native")
               .getOrCreate();
@@ -189,6 +190,9 @@ public void fullRead(InputSource source,
           case "json":
             options.add(new Tuple2<>("timestampFormat", "yyyy-MM-dd HH:mm:ss.SSS"));
             break;
+          case "orc":
+            options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918
+            break;
           default:
             break;
         }
@@ -218,6 +222,9 @@ public void partialRead(InputSource source,
           case "json":
           case "avro":
             throw new IllegalArgumentException(source.format + " can't handle projection");
+          case "orc":
+            options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918
+            break;
           default:
             break;
         }
@@ -289,6 +296,9 @@ public void pushDown(InputSource source,
           case "json":
           case "avro":
             throw new IllegalArgumentException(source.format + " can't handle pushdown");
+          case "orc":
+            options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918
+            break;
           default:
             break;
         }
@@ Expand Down @@

0 comments on commit `739476c`

Please sign in to comment.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Commit

There are no files selected for viewing

0 comments on commit `739476c`

Commit

There are no files selected for viewing

0 comments on commit 739476c

0 comments on commit `739476c`