From 739476cda88f0f4ca1b88d3669c373a7df001978 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 8 Jan 2024 19:20:28 -0800 Subject: [PATCH] ORC-1578: Fix `SparkBenchmark` on `sales` data according to SPARK-40918 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? This PR aims to fix `SparkBenchmark` according to the requirement of SPARK-40918. Note that this fixes the synthetic benchmark on `sales` data only. The other real-life datasets (`github` and `taxi`) will be revisited separately. ### Why are the changes needed? 1. Generate `sales` data ``` $ java -jar core/target/orc-benchmarks-core-*-uber.jar generate data -f orc -d sales -s 1000000 ``` 2. Run Spark Benchmark ``` $ java -jar spark/target/orc-benchmarks-spark-2.1.0-SNAPSHOT.jar spark data -d sales -f orc # Run complete. Total time: 00:10:45 Benchmark (compression) (dataset) (format) Mode Cnt Score Error Units SparkBenchmark.fullRead gz sales orc avgt 5 686792.235 ± 4398.971 us/op SparkBenchmark.fullRead:bytesPerRecord gz sales orc avgt 5 0.192 # SparkBenchmark.fullRead:ops gz sales orc avgt 5 40.000 # SparkBenchmark.fullRead:perRecord gz sales orc avgt 5 0.687 ± 0.004 us/op SparkBenchmark.fullRead:records gz sales orc avgt 5 5000000.000 # SparkBenchmark.fullRead snappy sales orc avgt 5 286166.380 ± 19864.429 us/op SparkBenchmark.fullRead:bytesPerRecord snappy sales orc avgt 5 0.201 # SparkBenchmark.fullRead:ops snappy sales orc avgt 5 40.000 # SparkBenchmark.fullRead:perRecord snappy sales orc avgt 5 0.286 ± 0.020 us/op SparkBenchmark.fullRead:records snappy sales orc avgt 5 5000000.000 # SparkBenchmark.fullRead zstd sales orc avgt 5 384394.233 ± 10057.315 us/op SparkBenchmark.fullRead:bytesPerRecord zstd sales orc avgt 5 0.192 # SparkBenchmark.fullRead:ops zstd sales orc avgt 5 40.000 # SparkBenchmark.fullRead:perRecord zstd sales orc avgt 5 0.384 ± 0.010 us/op SparkBenchmark.fullRead:records zstd sales orc avgt 5 
5000000.000 # SparkBenchmark.partialRead gz sales orc avgt 5 41683.914 ± 4046.077 us/op SparkBenchmark.partialRead:bytesPerRecord gz sales orc avgt 5 0.192 # SparkBenchmark.partialRead:ops gz sales orc avgt 5 40.000 # SparkBenchmark.partialRead:perRecord gz sales orc avgt 5 0.042 ± 0.004 us/op SparkBenchmark.partialRead:records gz sales orc avgt 5 5000000.000 # SparkBenchmark.partialRead snappy sales orc avgt 5 23981.054 ± 17874.229 us/op SparkBenchmark.partialRead:bytesPerRecord snappy sales orc avgt 5 0.201 # SparkBenchmark.partialRead:ops snappy sales orc avgt 5 40.000 # SparkBenchmark.partialRead:perRecord snappy sales orc avgt 5 0.024 ± 0.018 us/op SparkBenchmark.partialRead:records snappy sales orc avgt 5 5000000.000 # SparkBenchmark.partialRead zstd sales orc avgt 5 41433.277 ± 25110.021 us/op SparkBenchmark.partialRead:bytesPerRecord zstd sales orc avgt 5 0.192 # SparkBenchmark.partialRead:ops zstd sales orc avgt 5 40.000 # SparkBenchmark.partialRead:perRecord zstd sales orc avgt 5 0.041 ± 0.025 us/op SparkBenchmark.partialRead:records zstd sales orc avgt 5 5000000.000 # SparkBenchmark.pushDown gz sales orc avgt 5 23760.997 ± 833.034 us/op SparkBenchmark.pushDown:bytesPerRecord gz sales orc avgt 5 19.153 # SparkBenchmark.pushDown:ops gz sales orc avgt 5 40.000 # SparkBenchmark.pushDown:perRecord gz sales orc avgt 5 2.376 ± 0.083 us/op SparkBenchmark.pushDown:records gz sales orc avgt 5 50000.000 # SparkBenchmark.pushDown snappy sales orc avgt 5 14062.508 ± 1793.691 us/op SparkBenchmark.pushDown:bytesPerRecord snappy sales orc avgt 5 20.105 # SparkBenchmark.pushDown:ops snappy sales orc avgt 5 40.000 # SparkBenchmark.pushDown:perRecord snappy sales orc avgt 5 1.406 ± 0.179 us/op SparkBenchmark.pushDown:records snappy sales orc avgt 5 50000.000 # SparkBenchmark.pushDown zstd sales orc avgt 5 15597.651 ± 1307.246 us/op SparkBenchmark.pushDown:bytesPerRecord zstd sales orc avgt 5 19.213 # SparkBenchmark.pushDown:ops zstd sales orc avgt 5 40.000 # 
SparkBenchmark.pushDown:perRecord zstd sales orc avgt 5 1.560 ± 0.131 us/op SparkBenchmark.pushDown:records zstd sales orc avgt 5 50000.000 # ``` ### How was this patch tested? Pass the CIs. Closes #1734 from dongjoon-hyun/ORC-1578. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit fbe49d71e0e66903508a52952be1cb0ca9d09ac1) Signed-off-by: Dongjoon Hyun --- .../org/apache/orc/bench/spark/SparkBenchmark.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java b/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java index 4991c030c0..e8fcc6d351 100644 --- a/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java +++ b/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java @@ -122,6 +122,7 @@ public static class InputSource { public void setup() { session = SparkSession.builder().appName("benchmark") .config("spark.master", "local[4]") + .config("spark.log.level", "ERROR") .config("spark.sql.orc.filterPushdown", true) .config("spark.sql.orc.impl", "native") .getOrCreate(); @@ -189,6 +190,9 @@ public void fullRead(InputSource source, case "json": options.add(new Tuple2<>("timestampFormat", "yyyy-MM-dd HH:mm:ss.SSS")); break; + case "orc": + options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 + break; default: break; } @@ -218,6 +222,9 @@ public void partialRead(InputSource source, case "json": case "avro": throw new IllegalArgumentException(source.format + " can't handle projection"); + case "orc": + options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 + break; default: break; } @@ -289,6 +296,9 @@ public void pushDown(InputSource source, case "json": case "avro": throw new IllegalArgumentException(source.format + " can't handle pushdown"); + case "orc": + options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 + break; default: break; }