diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index 5e056d040e4c..93044ccf069e 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -46,8 +46,10 @@ substrait = { version = "0.50", features = ["serde"] } url = { workspace = true } [dev-dependencies] -datafusion = { workspace = true, features = ["nested_expressions"] } +datafusion = { workspace = true, features = ["nested_expressions", "serde"] } datafusion-functions-aggregate = { workspace = true } +datafusion-proto = { workspace = true, features = ["json"] } +datafusion-proto-common = { workspace = true, features = ["json"] } serde_json = "1.0" tokio = { workspace = true } diff --git a/datafusion/substrait/tests/cases/mod.rs b/datafusion/substrait/tests/cases/mod.rs index 777246e4139b..7d6553f37655 100644 --- a/datafusion/substrait/tests/cases/mod.rs +++ b/datafusion/substrait/tests/cases/mod.rs @@ -22,5 +22,6 @@ mod logical_plans; mod roundtrip_logical_plan; #[cfg(feature = "physical")] mod roundtrip_physical_plan; +mod roundtrip_substrait_tpch; mod serialize; mod substrait_validations; diff --git a/datafusion/substrait/tests/cases/roundtrip_substrait_tpch.rs b/datafusion/substrait/tests/cases/roundtrip_substrait_tpch.rs new file mode 100644 index 000000000000..a01a07dcc4ae --- /dev/null +++ b/datafusion/substrait/tests/cases/roundtrip_substrait_tpch.rs @@ -0,0 +1,356 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! TPCH `roundtrip_substrait_tpch` tests +//! +//! This module tests that substrait queries in sql can be read and that the optimized +//! logiccal plans produced remain the same after a round trip to substrait and back. +//! +//! +//! The input sql queries are generated from +//! +//! + +use async_trait::async_trait; +use datafusion::arrow::datatypes::{Schema, SchemaRef}; +use datafusion::catalog::TableProvider; +use datafusion::common::{internal_datafusion_err, Result}; +use datafusion::datasource::TableType; +use datafusion::error::DataFusionError; +use datafusion::execution::{SessionState, SessionStateBuilder}; +use datafusion::logical_expr::TableProviderFilterPushDown; +use datafusion::prelude::DataFrame; +use datafusion::sql::TableReference; +use datafusion::{ + execution::runtime_env::RuntimeEnv, + prelude::{SessionConfig, SessionContext}, +}; +use datafusion_proto::protobuf; +use std::fs::read_to_string; +use std::sync::Arc; + +#[derive(Debug)] +struct FakeTableProvider { + schema: Arc, +} + +#[async_trait] +impl TableProvider for FakeTableProvider { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + async fn scan( + &self, + _state: &dyn datafusion::catalog::Session, + _projection: Option<&Vec>, + _filters: &[datafusion::prelude::Expr], + _limit: Option, + ) -> Result> { + println!("uh oh"); + unimplemented!("scan") + } + + fn supports_filters_pushdown( + &self, + filters: &[&datafusion::prelude::Expr], + ) -> Result> { + Ok(std::vec![ + TableProviderFilterPushDown::Inexact; + filters.len() + ]) + } +} + +async fn create_context() -> Result { + let state = SessionStateBuilder::new() + .with_config(SessionConfig::default()) + .with_runtime_env(Arc::new(RuntimeEnv::default())) + .with_default_features() + .build(); + + let ctx = SessionContext::new_with_state(state); + + let tables = [ + "customer", "lineitem", "nation", "orders", "part", "partsupp", "region", + "supplier", + ]; + + tables.into_iter().try_for_each(|table_name| { + let schema_path = format!("tests/testdata/tpch_schemas/{table_name}_schema.json"); + let schema_json_data = read_to_string(schema_path)?; + + let proto: protobuf::Schema = serde_json::from_str(&schema_json_data) + .map_err(|e| internal_datafusion_err!("Error parsing schema JSON: {}", e))?; + + let schema = Schema::try_from(&proto)?; + + let provider = FakeTableProvider { + schema: Arc::new(schema), + }; + ctx.register_table(TableReference::bare(table_name), Arc::new(provider))?; + + Ok::<_, DataFusionError>(()) + })?; + + Ok(ctx) +} + +async fn get_dataframe(query_id: i32) -> Result<(DataFrame, SessionState)> { + let path = format!("tests/testdata/tpch_queries/query_{query_id:02}.sql"); + println!("path = {}", path); + let query = read_to_string(path)?; + println!("Query: \n{}", query); + + let ctx = create_context().await?; + + Ok((ctx.sql(&query).await?, ctx.state())) +} + +#[cfg(test)] +mod tests { + use datafusion::common::Result; + use datafusion_substrait::logical_plan::{ + consumer::from_substrait_plan, producer::to_substrait_plan, + }; + + use super::*; + + async fn tpch_round_trip_optimized(query_id: i32) -> Result<()> { + let (df, state) = get_dataframe(query_id).await?; + + let oplan = df.clone().into_optimized_plan()?; + println!("Optimized plan:\n{}", oplan.display_indent()); + + let ssplan = to_substrait_plan(&oplan, &state)?; + let roundtrip_plan = from_substrait_plan(&state, &ssplan).await?; + let roundtrip_plan_optimized = state.optimize(&roundtrip_plan)?; + println!( + "Roundtrip optimized plan:\n{}", + roundtrip_plan_optimized.display_indent() + ); + + assert_eq!( + oplan.display_indent().to_string(), + roundtrip_plan_optimized.display_indent().to_string() + ); + Ok(()) + } + + async fn tpch_round_trip_unoptimized(query_id: i32) -> Result<()> { + let (df, state) = get_dataframe(query_id).await?; + + let plan = df.logical_plan(); + println!("Logical plan:\n{}", plan.display_indent()); + + let ssplan = to_substrait_plan(plan, &state)?; + let roundtrip_plan = from_substrait_plan(&state, &ssplan).await?; + println!("Roundtrip plan:\n{}", roundtrip_plan.display_indent()); + + assert_eq!( + plan.display_indent().to_string(), + roundtrip_plan.display_indent().to_string() + ); + Ok(()) + } + + #[tokio::test] + async fn tpch_round_trip_test_optimized_01() -> Result<()> { + tpch_round_trip_optimized(1).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_02() -> Result<()> { + tpch_round_trip_optimized(2).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_03() -> Result<()> { + tpch_round_trip_optimized(3).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_04() -> Result<()> { + tpch_round_trip_optimized(4).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_05() -> Result<()> { + tpch_round_trip_optimized(5).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_06() -> Result<()> { + tpch_round_trip_optimized(6).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_07() -> Result<()> { + tpch_round_trip_optimized(7).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_08() -> Result<()> { + tpch_round_trip_optimized(8).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_09() -> Result<()> { + tpch_round_trip_optimized(9).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_10() -> Result<()> { + tpch_round_trip_optimized(10).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_11() -> Result<()> { + tpch_round_trip_optimized(11).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_12() -> Result<()> { + tpch_round_trip_optimized(12).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_13() -> Result<()> { + tpch_round_trip_optimized(13).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_14() -> Result<()> { + tpch_round_trip_optimized(14).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_15() -> Result<()> { + tpch_round_trip_optimized(15).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_16() -> Result<()> { + tpch_round_trip_optimized(16).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_17() -> Result<()> { + tpch_round_trip_optimized(17).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_18() -> Result<()> { + tpch_round_trip_optimized(18).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_19() -> Result<()> { + tpch_round_trip_optimized(19).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_20() -> Result<()> { + tpch_round_trip_optimized(20).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_21() -> Result<()> { + tpch_round_trip_optimized(21).await + } + #[tokio::test] + async fn tpch_round_trip_test_optimized_22() -> Result<()> { + tpch_round_trip_optimized(22).await + } + + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_01() -> Result<()> { + tpch_round_trip_unoptimized(1).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_02() -> Result<()> { + tpch_round_trip_unoptimized(2).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_03() -> Result<()> { + tpch_round_trip_unoptimized(3).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_04() -> Result<()> { + tpch_round_trip_unoptimized(4).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_05() -> Result<()> { + tpch_round_trip_unoptimized(5).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_06() -> Result<()> { + tpch_round_trip_unoptimized(6).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_07() -> Result<()> { + tpch_round_trip_unoptimized(7).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_08() -> Result<()> { + tpch_round_trip_unoptimized(8).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_09() -> Result<()> { + tpch_round_trip_unoptimized(9).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_10() -> Result<()> { + tpch_round_trip_unoptimized(10).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_11() -> Result<()> { + tpch_round_trip_unoptimized(11).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_12() -> Result<()> { + tpch_round_trip_unoptimized(12).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_13() -> Result<()> { + tpch_round_trip_unoptimized(13).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_14() -> Result<()> { + tpch_round_trip_unoptimized(14).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_15() -> Result<()> { + tpch_round_trip_unoptimized(15).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_16() -> Result<()> { + tpch_round_trip_unoptimized(16).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_17() -> Result<()> { + tpch_round_trip_unoptimized(17).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_18() -> Result<()> { + tpch_round_trip_unoptimized(18).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_19() -> Result<()> { + tpch_round_trip_unoptimized(19).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_20() -> Result<()> { + tpch_round_trip_unoptimized(20).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_21() -> Result<()> { + tpch_round_trip_unoptimized(21).await + } + #[tokio::test] + async fn tpch_round_trip_test_unoptimized_22() -> Result<()> { + tpch_round_trip_unoptimized(22).await + } +} diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_01.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_01.sql new file mode 100644 index 000000000000..72d232fd3fe1 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_01.sql @@ -0,0 +1,22 @@ +SELECT + l_returnflag, + l_linestatus, + sum(l_quantity) AS sum_qty, + sum(l_extendedprice) AS sum_base_price, + sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + avg(l_quantity) AS avg_qty, + avg(l_extendedprice) AS avg_price, + avg(l_discount) AS avg_disc, + count(*) AS count_order +FROM + lineitem +WHERE + l_shipdate <= CAST('1998-09-02' AS date) +GROUP BY + l_returnflag, + l_linestatus +ORDER BY + l_returnflag, + l_linestatus; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_02.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_02.sql new file mode 100644 index 000000000000..568bf610a90c --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_02.sql @@ -0,0 +1,44 @@ +SELECT + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +FROM + part, + supplier, + partsupp, + nation, + region +WHERE + p_partkey = ps_partkey + AND s_suppkey = ps_suppkey + AND p_size = 15 + AND p_type LIKE '%BRASS' + AND s_nationkey = n_nationkey + AND n_regionkey = r_regionkey + AND r_name = 'EUROPE' + AND ps_supplycost = ( + SELECT + min(ps_supplycost) + FROM + partsupp, + supplier, + nation, + region + WHERE + p_partkey = ps_partkey + AND s_suppkey = ps_suppkey + AND s_nationkey = n_nationkey + AND n_regionkey = r_regionkey + AND r_name = 'EUROPE') +ORDER BY + s_acctbal DESC, + n_name, + s_name, + p_partkey +LIMIT 100; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_03.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_03.sql new file mode 100644 index 000000000000..61e163624120 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_03.sql @@ -0,0 +1,24 @@ +SELECT + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) AS revenue, + o_orderdate, + o_shippriority +FROM + customer, + orders, + lineitem +WHERE + c_mktsegment = 'BUILDING' + AND c_custkey = o_custkey + AND l_orderkey = o_orderkey + AND o_orderdate < CAST('1995-03-15' AS date) + AND l_shipdate > CAST('1995-03-15' AS date) +GROUP BY + l_orderkey, + o_orderdate, + o_shippriority +ORDER BY + revenue DESC, + o_orderdate +LIMIT 10; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_04.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_04.sql new file mode 100644 index 000000000000..5e173030ff0b --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_04.sql @@ -0,0 +1,21 @@ +SELECT + o_orderpriority, + count(*) AS order_count +FROM + orders +WHERE + o_orderdate >= CAST('1993-07-01' AS date) + AND o_orderdate < CAST('1993-10-01' AS date) + AND EXISTS ( + SELECT + * + FROM + lineitem + WHERE + l_orderkey = o_orderkey + AND l_commitdate < l_receiptdate) +GROUP BY + o_orderpriority +ORDER BY + o_orderpriority; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_05.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_05.sql new file mode 100644 index 000000000000..340a000994ad --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_05.sql @@ -0,0 +1,25 @@ +SELECT + n_name, + sum(l_extendedprice * (1 - l_discount)) AS revenue +FROM + customer, + orders, + lineitem, + supplier, + nation, + region +WHERE + c_custkey = o_custkey + AND l_orderkey = o_orderkey + AND l_suppkey = s_suppkey + AND c_nationkey = s_nationkey + AND s_nationkey = n_nationkey + AND n_regionkey = r_regionkey + AND r_name = 'ASIA' + AND o_orderdate >= CAST('1994-01-01' AS date) + AND o_orderdate < CAST('1995-01-01' AS date) +GROUP BY + n_name +ORDER BY + revenue DESC; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_06.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_06.sql new file mode 100644 index 000000000000..1dfe795d1f2f --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_06.sql @@ -0,0 +1,11 @@ +SELECT + sum(l_extendedprice * l_discount) AS revenue +FROM + lineitem +WHERE + l_shipdate >= CAST('1994-01-01' AS date) + AND l_shipdate < CAST('1995-01-01' AS date) + AND l_discount BETWEEN 0.05 + AND 0.07 + AND l_quantity < 24; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_07.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_07.sql new file mode 100644 index 000000000000..d4a196137116 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_07.sql @@ -0,0 +1,39 @@ +SELECT + supp_nation, + cust_nation, + l_year, + sum(volume) AS revenue +FROM ( + SELECT + n1.n_name AS supp_nation, + n2.n_name AS cust_nation, + extract(year FROM l_shipdate) AS l_year, + l_extendedprice * (1 - l_discount) AS volume + FROM + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + WHERE + s_suppkey = l_suppkey + AND o_orderkey = l_orderkey + AND c_custkey = o_custkey + AND s_nationkey = n1.n_nationkey + AND c_nationkey = n2.n_nationkey + AND ((n1.n_name = 'FRANCE' + AND n2.n_name = 'GERMANY') + OR (n1.n_name = 'GERMANY' + AND n2.n_name = 'FRANCE')) + AND l_shipdate BETWEEN CAST('1995-01-01' AS date) + AND CAST('1996-12-31' AS date)) AS shipping +GROUP BY + supp_nation, + cust_nation, + l_year +ORDER BY + supp_nation, + cust_nation, + l_year; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_08.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_08.sql new file mode 100644 index 000000000000..7cd01af5f5c1 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_08.sql @@ -0,0 +1,39 @@ +SELECT + o_year, + sum( + CASE WHEN nation = 'BRAZIL' THEN + volume + ELSE + 0 + END) / sum(volume) AS mkt_share +FROM ( + SELECT + extract(year FROM o_orderdate) AS o_year, + l_extendedprice * (1 - l_discount) AS volume, + n2.n_name AS nation + FROM + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + WHERE + p_partkey = l_partkey + AND s_suppkey = l_suppkey + AND l_orderkey = o_orderkey + AND o_custkey = c_custkey + AND c_nationkey = n1.n_nationkey + AND n1.n_regionkey = r_regionkey + AND r_name = 'AMERICA' + AND s_nationkey = n2.n_nationkey + AND o_orderdate BETWEEN CAST('1995-01-01' AS date) + AND CAST('1996-12-31' AS date) + AND p_type = 'ECONOMY ANODIZED STEEL') AS all_nations +GROUP BY + o_year +ORDER BY + o_year; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_09.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_09.sql new file mode 100644 index 000000000000..d731284a05fe --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_09.sql @@ -0,0 +1,31 @@ +SELECT + nation, + o_year, + sum(amount) AS sum_profit +FROM ( + SELECT + n_name AS nation, + extract(year FROM o_orderdate) AS o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity AS amount + FROM + part, + supplier, + lineitem, + partsupp, + orders, + nation + WHERE + s_suppkey = l_suppkey + AND ps_suppkey = l_suppkey + AND ps_partkey = l_partkey + AND p_partkey = l_partkey + AND o_orderkey = l_orderkey + AND s_nationkey = n_nationkey + AND p_name LIKE '%green%') AS profit +GROUP BY + nation, + o_year +ORDER BY + nation, + o_year DESC; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_10.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_10.sql new file mode 100644 index 000000000000..ac9d2ffe6ef2 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_10.sql @@ -0,0 +1,33 @@ +SELECT + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) AS revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +FROM + customer, + orders, + lineitem, + nation +WHERE + c_custkey = o_custkey + AND l_orderkey = o_orderkey + AND o_orderdate >= CAST('1993-10-01' AS date) + AND o_orderdate < CAST('1994-01-01' AS date) + AND l_returnflag = 'R' + AND c_nationkey = n_nationkey +GROUP BY + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +ORDER BY + revenue DESC +LIMIT 20; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_11.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_11.sql new file mode 100644 index 000000000000..5c9a02962308 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_11.sql @@ -0,0 +1,28 @@ +SELECT + ps_partkey, + sum(ps_supplycost * ps_availqty) AS value +FROM + partsupp, + supplier, + nation +WHERE + ps_suppkey = s_suppkey + AND s_nationkey = n_nationkey + AND n_name = 'GERMANY' +GROUP BY + ps_partkey +HAVING + sum(ps_supplycost * ps_availqty) > ( + SELECT + sum(ps_supplycost * ps_availqty) * 0.0001000000 + FROM + partsupp, + supplier, + nation + WHERE + ps_suppkey = s_suppkey + AND s_nationkey = n_nationkey + AND n_name = 'GERMANY') +ORDER BY + value DESC; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_12.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_12.sql new file mode 100644 index 000000000000..fe72a62f7565 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_12.sql @@ -0,0 +1,31 @@ +SELECT + l_shipmode, + sum( + CASE WHEN o_orderpriority = '1-URGENT' + OR o_orderpriority = '2-HIGH' THEN + 1 + ELSE + 0 + END) AS high_line_count, + sum( + CASE WHEN o_orderpriority <> '1-URGENT' + AND o_orderpriority <> '2-HIGH' THEN + 1 + ELSE + 0 + END) AS low_line_count +FROM + orders, + lineitem +WHERE + o_orderkey = l_orderkey + AND l_shipmode IN ('MAIL', 'SHIP') + AND l_commitdate < l_receiptdate + AND l_shipdate < l_commitdate + AND l_receiptdate >= CAST('1994-01-01' AS date) + AND l_receiptdate < CAST('1995-01-01' AS date) +GROUP BY + l_shipmode +ORDER BY + l_shipmode; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_13.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_13.sql new file mode 100644 index 000000000000..7e5f91808683 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_13.sql @@ -0,0 +1,20 @@ +SELECT + c_count, + count(*) AS custdist +FROM ( + SELECT + c_custkey, + count(o_orderkey) + FROM + customer + LEFT OUTER JOIN orders ON c_custkey = o_custkey + AND o_comment NOT LIKE '%special%requests%' +GROUP BY + c_custkey) AS c_orders (c_custkey, + c_count) +GROUP BY + c_count +ORDER BY + custdist DESC, + c_count DESC; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_14.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_14.sql new file mode 100644 index 000000000000..49bc3b460ebe --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_14.sql @@ -0,0 +1,15 @@ +SELECT + 100.00 * sum( + CASE WHEN p_type LIKE 'PROMO%' THEN + l_extendedprice * (1 - l_discount) + ELSE + 0 + END) / sum(l_extendedprice * (1 - l_discount)) AS promo_revenue +FROM + lineitem, + part +WHERE + l_partkey = p_partkey + AND l_shipdate >= date '1995-09-01' + AND l_shipdate < CAST('1995-10-01' AS date); + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_15.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_15.sql new file mode 100644 index 000000000000..88235d3d3f5d --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_15.sql @@ -0,0 +1,30 @@ +WITH revenue AS ( + SELECT + l_suppkey AS supplier_no, + sum(l_extendedprice * (1 - l_discount)) AS total_revenue + FROM + lineitem + WHERE + l_shipdate >= CAST('1996-01-01' AS date) + AND l_shipdate < CAST('1996-04-01' AS date) + GROUP BY + supplier_no +) +SELECT + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +FROM + supplier, + revenue +WHERE + s_suppkey = supplier_no + AND total_revenue = ( + SELECT + max(total_revenue) + FROM revenue) +ORDER BY + s_suppkey; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_16.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_16.sql new file mode 100644 index 000000000000..c8227b9fbf23 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_16.sql @@ -0,0 +1,30 @@ +SELECT + p_brand, + p_type, + p_size, + count(DISTINCT ps_suppkey) AS supplier_cnt +FROM + partsupp, + part +WHERE + p_partkey = ps_partkey + AND p_brand <> 'Brand#45' + AND p_type NOT LIKE 'MEDIUM POLISHED%' + AND p_size IN (49, 14, 23, 45, 19, 3, 36, 9) + AND ps_suppkey NOT IN ( + SELECT + s_suppkey + FROM + supplier + WHERE + s_comment LIKE '%Customer%Complaints%') +GROUP BY + p_brand, + p_type, + p_size +ORDER BY + supplier_cnt DESC, + p_brand, + p_type, + p_size; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_17.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_17.sql new file mode 100644 index 000000000000..5659b44f38c0 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_17.sql @@ -0,0 +1,17 @@ +SELECT + sum(l_extendedprice) / 7.0 AS avg_yearly +FROM + lineitem, + part +WHERE + p_partkey = l_partkey + AND p_brand = 'Brand#23' + AND p_container = 'MED BOX' + AND l_quantity < ( + SELECT + 0.2 * avg(l_quantity) + FROM + lineitem + WHERE + l_partkey = p_partkey); + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_18.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_18.sql new file mode 100644 index 000000000000..8f879a724a56 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_18.sql @@ -0,0 +1,34 @@ +SELECT + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +FROM + customer, + orders, + lineitem +WHERE + o_orderkey IN ( + SELECT + l_orderkey + FROM + lineitem + GROUP BY + l_orderkey + HAVING + sum(l_quantity) > 300) + AND c_custkey = o_custkey + AND o_orderkey = l_orderkey +GROUP BY + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +ORDER BY + o_totalprice DESC, + o_orderdate +LIMIT 100; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_19.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_19.sql new file mode 100644 index 000000000000..fae736ba1b4b --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_19.sql @@ -0,0 +1,30 @@ +SELECT + sum(l_extendedprice * (1 - l_discount)) AS revenue +FROM + lineitem, + part +WHERE (p_partkey = l_partkey + AND p_brand = 'Brand#12' + AND p_container IN ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + AND l_quantity >= 1 + AND l_quantity <= 1 + 10 + AND p_size BETWEEN 1 AND 5 + AND l_shipmode IN ('AIR', 'AIR REG') + AND l_shipinstruct = 'DELIVER IN PERSON') + OR (p_partkey = l_partkey + AND p_brand = 'Brand#23' + AND p_container IN ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + AND l_quantity >= 10 + AND l_quantity <= 10 + 10 + AND p_size BETWEEN 1 AND 10 + AND l_shipmode IN ('AIR', 'AIR REG') + AND l_shipinstruct = 'DELIVER IN PERSON') + OR (p_partkey = l_partkey + AND p_brand = 'Brand#34' + AND p_container IN ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + AND l_quantity >= 20 + AND l_quantity <= 20 + 10 + AND p_size BETWEEN 1 AND 15 + AND l_shipmode IN ('AIR', 'AIR REG') + AND l_shipinstruct = 'DELIVER IN PERSON'); + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_20.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_20.sql new file mode 100644 index 000000000000..cb5249748d45 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_20.sql @@ -0,0 +1,35 @@ +SELECT + s_name, + s_address +FROM + supplier, + nation +WHERE + s_suppkey IN ( + SELECT + ps_suppkey + FROM + partsupp + WHERE + ps_partkey IN ( + SELECT + p_partkey + FROM + part + WHERE + p_name LIKE 'forest%') + AND ps_availqty > ( + SELECT + 0.5 * sum(l_quantity) + FROM + lineitem + WHERE + l_partkey = ps_partkey + AND l_suppkey = ps_suppkey + AND l_shipdate >= CAST('1994-01-01' AS date) + AND l_shipdate < CAST('1995-01-01' AS date))) + AND s_nationkey = n_nationkey + AND n_name = 'CANADA' + ORDER BY + s_name; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_21.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_21.sql new file mode 100644 index 000000000000..c358cbb05d55 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_21.sql @@ -0,0 +1,39 @@ +SELECT + s_name, + count(*) AS numwait +FROM + supplier, + lineitem l1, + orders, + nation +WHERE + s_suppkey = l1.l_suppkey + AND o_orderkey = l1.l_orderkey + AND o_orderstatus = 'F' + AND l1.l_receiptdate > l1.l_commitdate + AND EXISTS ( + SELECT + * + FROM + lineitem l2 + WHERE + l2.l_orderkey = l1.l_orderkey + AND l2.l_suppkey <> l1.l_suppkey) + AND NOT EXISTS ( + SELECT + * + FROM + lineitem l3 + WHERE + l3.l_orderkey = l1.l_orderkey + AND l3.l_suppkey <> l1.l_suppkey + AND l3.l_receiptdate > l3.l_commitdate) + AND s_nationkey = n_nationkey + AND n_name = 'SAUDI ARABIA' +GROUP BY + s_name +ORDER BY + numwait DESC, + s_name +LIMIT 100; + diff --git a/datafusion/substrait/tests/testdata/tpch_queries/query_22.sql b/datafusion/substrait/tests/testdata/tpch_queries/query_22.sql new file mode 100644 index 000000000000..00b958a34fcb --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_queries/query_22.sql @@ -0,0 +1,32 @@ +SELECT + cntrycode, + count(*) AS numcust, + sum(c_acctbal) AS totacctbal +FROM ( + SELECT + substring(c_phone FROM 1 FOR 2) AS cntrycode, + c_acctbal + FROM + customer + WHERE + substring(c_phone FROM 1 FOR 2) IN ('13', '31', '23', '29', '30', '18', '17') + AND c_acctbal > ( + SELECT + avg(c_acctbal) + FROM + customer + WHERE + c_acctbal > 0.00 + AND substring(c_phone FROM 1 FOR 2) IN ('13', '31', '23', '29', '30', '18', '17')) + AND NOT EXISTS ( + SELECT + * + FROM + orders + WHERE + o_custkey = c_custkey)) AS custsale +GROUP BY + cntrycode +ORDER BY + cntrycode; + diff --git a/datafusion/substrait/tests/testdata/tpch_schemas/customer_schema.json b/datafusion/substrait/tests/testdata/tpch_schemas/customer_schema.json new file mode 100644 index 000000000000..92fab75b7e0e --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_schemas/customer_schema.json @@ -0,0 +1,63 @@ +{ + "columns": [ + { + "name": "c_custkey", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "c_name", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "c_address", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "c_nationkey", + "arrowType": { + "INT32": {} + }, + "nullable": true + }, + { + "name": "c_phone", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "c_acctbal", + "arrowType": { + "DECIMAL": { + "precision": 15, + "scale": 2 + } + }, + "nullable": true + }, + { + "name": "c_mktsegment", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "c_comment", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + } + ] +} diff --git a/datafusion/substrait/tests/testdata/tpch_schemas/lineitem_schema.json b/datafusion/substrait/tests/testdata/tpch_schemas/lineitem_schema.json new file mode 100644 index 000000000000..08f837b849a9 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_schemas/lineitem_schema.json @@ -0,0 +1,128 @@ +{ + "columns": [ + { + "name": "l_orderkey", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "l_partkey", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "l_suppkey", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "l_linenumber", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "l_quantity", + "arrowType": { + "DECIMAL": { + "precision": 15, + "scale": 2 + } + }, + "nullable": true + }, + { + "name": "l_extendedprice", + "arrowType": { + "DECIMAL": { + "precision": 15, + "scale": 2 + } + }, + "nullable": true + }, + { + "name": "l_discount", + "arrowType": { + "DECIMAL": { + "precision": 15, + "scale": 2 + } + }, + "nullable": true + }, + { + "name": "l_tax", + "arrowType": { + "DECIMAL": { + "precision": 15, + "scale": 2 + } + }, + "nullable": true + }, + { + "name": "l_returnflag", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "l_linestatus", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "l_shipdate", + "arrowType": { + "DATE32": {} + }, + "nullable": true + }, + { + "name": "l_commitdate", + "arrowType": { + "DATE32": {} + }, + "nullable": true + }, + { + "name": "l_receiptdate", + "arrowType": { + "DATE32": {} + }, + "nullable": true + }, + { + "name": "l_shipinstruct", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "l_shipmode", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "l_comment", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + } + ] +} diff --git a/datafusion/substrait/tests/testdata/tpch_schemas/nation_schema.json b/datafusion/substrait/tests/testdata/tpch_schemas/nation_schema.json new file mode 100644 index 000000000000..389414f7add2 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_schemas/nation_schema.json @@ -0,0 +1,32 @@ +{ + "columns": [ + { + "name": "n_nationkey", + "arrowType": { + "INT32": {} + }, + "nullable": true + }, + { + "name": "n_name", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "n_regionkey", + "arrowType": { + "INT32": {} + }, + "nullable": true + }, + { + "name": "n_comment", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + } + ] +} diff --git a/datafusion/substrait/tests/testdata/tpch_schemas/orders_schema.json b/datafusion/substrait/tests/testdata/tpch_schemas/orders_schema.json new file mode 100644 index 000000000000..3a71444560a3 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_schemas/orders_schema.json @@ -0,0 +1,70 @@ +{ + "columns": [ + { + "name": "o_orderkey", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "o_custkey", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "o_orderstatus", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "o_totalprice", + "arrowType": { + "DECIMAL": { + "precision": 15, + "scale": 2 + } + }, + "nullable": true + }, + { + "name": "o_orderdate", + "arrowType": { + "DATE32": {} + }, + "nullable": true + }, + { + "name": "o_orderpriority", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "o_clerk", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "o_shippriority", + "arrowType": { + "INT32": {} + }, + "nullable": true + }, + { + "name": "o_comment", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + } + ] +} diff --git a/datafusion/substrait/tests/testdata/tpch_schemas/part_schema.json b/datafusion/substrait/tests/testdata/tpch_schemas/part_schema.json new file mode 100644 index 000000000000..68d8d9449e32 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_schemas/part_schema.json @@ -0,0 +1,70 @@ +{ + "columns": [ + { + "name": "p_partkey", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "p_name", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "p_mfgr", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "p_brand", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "p_type", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "p_size", + "arrowType": { + "INT32": {} + }, + "nullable": true + }, + { + "name": "p_container", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "p_retailprice", + "arrowType": { + "DECIMAL": { + "precision": 15, + "scale": 2 + } + }, + "nullable": true + }, + { + "name": "p_comment", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + } + ] +} diff --git a/datafusion/substrait/tests/testdata/tpch_schemas/partsupp_schema.json b/datafusion/substrait/tests/testdata/tpch_schemas/partsupp_schema.json new file mode 100644 index 000000000000..5aae8a83407e --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_schemas/partsupp_schema.json @@ -0,0 +1,42 @@ +{ + "columns": [ + { + "name": "ps_partkey", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "ps_suppkey", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "ps_availqty", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "ps_supplycost", + "arrowType": { + "DECIMAL": { + "precision": 15, + "scale": 2 + } + }, + "nullable": true + }, + { + "name": "ps_comment", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + } + ] +} diff --git a/datafusion/substrait/tests/testdata/tpch_schemas/region_schema.json b/datafusion/substrait/tests/testdata/tpch_schemas/region_schema.json new file mode 100644 index 000000000000..c4710ea8ffbd --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_schemas/region_schema.json @@ -0,0 +1,25 @@ +{ + "columns": [ + { + "name": "r_regionkey", + "arrowType": { + "INT32": {} + }, + "nullable": true + }, + { + "name": "r_name", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "r_comment", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + } + ] +} diff --git a/datafusion/substrait/tests/testdata/tpch_schemas/supplier_schema.json b/datafusion/substrait/tests/testdata/tpch_schemas/supplier_schema.json new file mode 100644 index 000000000000..1869cb980a09 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_schemas/supplier_schema.json @@ -0,0 +1,56 @@ +{ + "columns": [ + { + "name": "s_suppkey", + "arrowType": { + "INT64": {} + }, + "nullable": true + }, + { + "name": "s_name", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "s_address", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "s_nationkey", + "arrowType": { + "INT32": {} + }, + "nullable": true + }, + { + "name": "s_phone", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + }, + { + "name": "s_acctbal", + "arrowType": { + "DECIMAL": { + "precision": 15, + "scale": 2 + } + }, + "nullable": true + }, + { + "name": "s_comment", + "arrowType": { + "UTF8VIEW": {} + }, + "nullable": true + } + ] +}