Merge branch 'main' into feature-scalar_regexp_match_expr
zhuliquan committed Oct 29, 2024
2 parents aa2eed2 + 444a673 commit f1a81a7
Showing 759 changed files with 46,200 additions and 19,569 deletions.
12 changes: 7 additions & 5 deletions .github/actions/setup-builder/action.yaml
@@ -28,16 +28,18 @@ runs:
   - name: Install Build Dependencies
     shell: bash
     run: |
-      apt-get update
-      apt-get install -y protobuf-compiler
+      RETRY="ci/scripts/retry"
+      "${RETRY}" apt-get update
+      "${RETRY}" apt-get install -y protobuf-compiler
   - name: Setup Rust toolchain
     shell: bash
     # rustfmt is needed for the substrait build script
     run: |
+      RETRY="ci/scripts/retry"
       echo "Installing ${{ inputs.rust-version }}"
-      rustup toolchain install ${{ inputs.rust-version }}
-      rustup default ${{ inputs.rust-version }}
-      rustup component add rustfmt
+      "${RETRY}" rustup toolchain install ${{ inputs.rust-version }}
+      "${RETRY}" rustup default ${{ inputs.rust-version }}
+      "${RETRY}" rustup component add rustfmt
   - name: Configure rust runtime env
     uses: ./.github/actions/setup-rust-runtime
   - name: Fixup git permissions
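The new `ci/scripts/retry` wrapper is referenced here, but its contents are not part of this diff. A minimal sketch of what such a helper typically looks like (hypothetical, not the actual script):

```bash
#!/usr/bin/env bash
# Hypothetical retry wrapper: run the given command, retrying a few times
# with a short pause in between, to ride out transient network failures.
set -uo pipefail

max_attempts=3
for attempt in $(seq 1 "${max_attempts}"); do
    "$@" && exit 0
    echo "attempt ${attempt}/${max_attempts} failed: $*" >&2
    sleep 5
done
echo "all ${max_attempts} attempts failed: $*" >&2
exit 1
```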
11 changes: 8 additions & 3 deletions .github/workflows/rust.yml
@@ -521,7 +521,7 @@ jobs:
       run: taplo format --check
 
   config-docs-check:
-    name: check configs.md is up-to-date
+    name: check configs.md and ***_functions.md is up-to-date
     needs: [ linux-build-lib ]
     runs-on: ubuntu-latest
     container:
@@ -542,6 +542,11 @@ jobs:
         # If you encounter an error, run './dev/update_config_docs.sh' and commit
         ./dev/update_config_docs.sh
         git diff --exit-code
+      - name: Check if any of the ***_functions.md has been modified
+        run: |
+          # If you encounter an error, run './dev/update_function_docs.sh' and commit
+          ./dev/update_function_docs.sh
+          git diff --exit-code
 
   # Verify MSRV for the crates which are directly used by other projects:
   # - datafusion
@@ -569,9 +574,9 @@ jobs:
   #
   # To reproduce:
   # 1. Install the version of Rust that is failing. Example:
-  #  rustup install 1.78.0
+  #  rustup install 1.79.0
   # 2. Run the command that failed with that version. Example:
-  #  cargo +1.78.0 check -p datafusion
+  #  cargo +1.79.0 check -p datafusion
   #
   # To resolve, either:
   # 1. Change your code to use older Rust features,
71 changes: 36 additions & 35 deletions Cargo.toml
@@ -58,8 +58,8 @@ homepage = "https://datafusion.apache.org"
 license = "Apache-2.0"
 readme = "README.md"
 repository = "https://github.com/apache/datafusion"
-rust-version = "1.78"
-version = "42.0.0"
+rust-version = "1.79"
+version = "42.1.0"
 
 [workspace.dependencies]
 # We turn off default-features for some dependencies here so the workspaces which inherit them can
@@ -70,51 +70,51 @@ version = "42.0.0"
 ahash = { version = "0.8", default-features = false, features = [
     "runtime-rng",
 ] }
-arrow = { version = "53.0.0", features = [
+arrow = { version = "53.2.0", features = [
     "prettyprint",
 ] }
-arrow-array = { version = "53.0.0", default-features = false, features = [
+arrow-array = { version = "53.2.0", default-features = false, features = [
     "chrono-tz",
 ] }
-arrow-buffer = { version = "53.0.0", default-features = false }
-arrow-flight = { version = "53.0.0", features = [
+arrow-buffer = { version = "53.2.0", default-features = false }
+arrow-flight = { version = "53.2.0", features = [
     "flight-sql-experimental",
 ] }
-arrow-ipc = { version = "53.0.0", default-features = false, features = [
+arrow-ipc = { version = "53.2.0", default-features = false, features = [
     "lz4",
 ] }
-arrow-ord = { version = "53.0.0", default-features = false }
-arrow-schema = { version = "53.0.0", default-features = false }
-arrow-string = { version = "53.0.0", default-features = false }
+arrow-ord = { version = "53.2.0", default-features = false }
+arrow-schema = { version = "53.2.0", default-features = false }
+arrow-string = { version = "53.2.0", default-features = false }
 async-trait = "0.1.73"
 bigdecimal = "=0.4.1"
 bytes = "1.4"
 chrono = { version = "0.4.38", default-features = false }
 ctor = "0.2.0"
 dashmap = "6.0.1"
-datafusion = { path = "datafusion/core", version = "42.0.0", default-features = false }
-datafusion-catalog = { path = "datafusion/catalog", version = "42.0.0" }
-datafusion-common = { path = "datafusion/common", version = "42.0.0", default-features = false }
-datafusion-common-runtime = { path = "datafusion/common-runtime", version = "42.0.0" }
-datafusion-execution = { path = "datafusion/execution", version = "42.0.0" }
-datafusion-expr = { path = "datafusion/expr", version = "42.0.0" }
-datafusion-expr-common = { path = "datafusion/expr-common", version = "42.0.0" }
-datafusion-functions = { path = "datafusion/functions", version = "42.0.0" }
-datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "42.0.0" }
-datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "42.0.0" }
-datafusion-functions-nested = { path = "datafusion/functions-nested", version = "42.0.0" }
-datafusion-functions-window = { path = "datafusion/functions-window", version = "42.0.0" }
-datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "42.0.0" }
-datafusion-optimizer = { path = "datafusion/optimizer", version = "42.0.0", default-features = false }
-datafusion-physical-expr = { path = "datafusion/physical-expr", version = "42.0.0", default-features = false }
-datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "42.0.0", default-features = false }
-datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "42.0.0" }
-datafusion-physical-plan = { path = "datafusion/physical-plan", version = "42.0.0" }
-datafusion-proto = { path = "datafusion/proto", version = "42.0.0" }
-datafusion-proto-common = { path = "datafusion/proto-common", version = "42.0.0" }
-datafusion-sql = { path = "datafusion/sql", version = "42.0.0" }
-datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "42.0.0" }
-datafusion-substrait = { path = "datafusion/substrait", version = "42.0.0" }
+datafusion = { path = "datafusion/core", version = "42.1.0", default-features = false }
+datafusion-catalog = { path = "datafusion/catalog", version = "42.1.0" }
+datafusion-common = { path = "datafusion/common", version = "42.1.0", default-features = false }
+datafusion-common-runtime = { path = "datafusion/common-runtime", version = "42.1.0" }
+datafusion-execution = { path = "datafusion/execution", version = "42.1.0" }
+datafusion-expr = { path = "datafusion/expr", version = "42.1.0" }
+datafusion-expr-common = { path = "datafusion/expr-common", version = "42.1.0" }
+datafusion-functions = { path = "datafusion/functions", version = "42.1.0" }
+datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "42.1.0" }
+datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "42.1.0" }
+datafusion-functions-nested = { path = "datafusion/functions-nested", version = "42.1.0" }
+datafusion-functions-window = { path = "datafusion/functions-window", version = "42.1.0" }
+datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "42.1.0" }
+datafusion-optimizer = { path = "datafusion/optimizer", version = "42.1.0", default-features = false }
+datafusion-physical-expr = { path = "datafusion/physical-expr", version = "42.1.0", default-features = false }
+datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "42.1.0", default-features = false }
+datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "42.1.0" }
+datafusion-physical-plan = { path = "datafusion/physical-plan", version = "42.1.0" }
+datafusion-proto = { path = "datafusion/proto", version = "42.1.0" }
+datafusion-proto-common = { path = "datafusion/proto-common", version = "42.1.0" }
+datafusion-sql = { path = "datafusion/sql", version = "42.1.0" }
+datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "42.1.0" }
+datafusion-substrait = { path = "datafusion/substrait", version = "42.1.0" }
 doc-comment = "0.3"
 env_logger = "0.11"
 futures = "0.3"
@@ -126,7 +126,7 @@ log = "^0.4"
 num_cpus = "1.13.0"
 object_store = { version = "0.11.0", default-features = false }
 parking_lot = "0.12"
-parquet = { version = "53.0.0", default-features = false, features = [
+parquet = { version = "53.2.0", default-features = false, features = [
     "arrow",
     "async",
     "object_store",
@@ -137,7 +137,7 @@ prost = "0.13.1"
 prost-derive = "0.13.1"
 rand = "0.8"
 regex = "1.8"
-rstest = "0.22.0"
+rstest = "0.23.0"
 serde_json = "1"
 sqlparser = { version = "0.51.0", features = ["visitor"] }
 tempfile = "3"
@@ -169,3 +169,4 @@ large_futures = "warn"
 
 [workspace.lints.rust]
 unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] }
+unused_qualifications = "deny"
28 changes: 21 additions & 7 deletions README.md
@@ -42,14 +42,23 @@
 </a>
 
 DataFusion is an extensible query engine written in [Rust] that
-uses [Apache Arrow] as its in-memory format. DataFusion's target users are
-developers building fast and feature rich database and analytic systems,
-customized to particular workloads. See [use cases] for examples.
+uses [Apache Arrow] as its in-memory format.
 
-"Out of the box," DataFusion offers [SQL] and [`Dataframe`] APIs,
-excellent [performance], built-in support for CSV, Parquet, JSON, and Avro,
-extensive customization, and a great community.
-[Python Bindings] are also available.
+This crate provides libraries and binaries for developers building fast and
+feature rich database and analytic systems, customized to particular workloads.
+See [use cases] for examples. The following related subprojects target end users:
+
+- [DataFusion Python](https://github.com/apache/datafusion-python/) offers a Python interface for SQL and DataFrame
+  queries.
+- [DataFusion Ray](https://github.com/apache/datafusion-ray/) provides a distributed version of DataFusion that scales
+  out on Ray clusters.
+- [DataFusion Comet](https://github.com/apache/datafusion-comet/) is an accelerator for Apache Spark based on
+  DataFusion.
+
+"Out of the box,"
+DataFusion offers [SQL] and [`Dataframe`] APIs, excellent [performance],
+built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and
+a great community.
 
 DataFusion features a full query planner, a columnar, streaming, multi-threaded,
 vectorized execution engine, and partitioned data sources. You can
@@ -125,3 +134,8 @@ For example, given the releases `1.78.0`, `1.79.0`, `1.80.0`, `1.80.1` and `1.81
 If a hotfix is released for the minimum supported Rust version (MSRV), the MSRV will be the minor version with all hotfixes, even if it surpasses the four-month window.
 
 We enforce this policy using a [MSRV CI Check](https://github.com/search?q=repo%3Aapache%2Fdatafusion+rust-version+language%3ATOML+path%3A%2F%5ECargo.toml%2F&type=code)
+
+## DataFusion API evolution policy
+
+Public methods in Apache DataFusion may evolve as part of the API lifecycle.
+Deprecated methods will be phased out in accordance with the [policy](https://datafusion.apache.org/library-user-guide/api-health.html), keeping the API stable and healthy.
38 changes: 38 additions & 0 deletions benchmarks/README.md
@@ -330,6 +330,16 @@ steps.
 The tests sort the entire dataset using several different sort
 orders.
 
+## IMDB
+
+Run the Join Order Benchmark (JOB) on the IMDB dataset.
+
+The Internet Movie Database (IMDB) dataset contains real-world movie data. Unlike synthetic datasets such as TPCH, which assume uniform data distribution and uncorrelated columns, the IMDB dataset includes skewed data and correlated columns (which are common in real datasets), making it more suitable for testing query optimizers, particularly for cardinality estimation.
+
+This benchmark is derived from the [Join Order Benchmark](https://github.com/gregrahn/join-order-benchmark).
+
+See the paper [How Good Are Query Optimizers, Really](http://www.vldb.org/pvldb/vol9/p204-leis.pdf) for more details.
+
 ## TPCH
 
 Run the tpch benchmark.
@@ -342,6 +352,34 @@ This benchmarks is derived from the [TPC-H][1] version
 [2]: https://github.com/databricks/tpch-dbgen.git,
 [2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf
 
+## External Aggregation
+
+Run the benchmark for aggregations with limited memory.
+
+When the memory limit is exceeded, the intermediate aggregation results will be spilled to disk, and finally read back with a sort-merge.
+
+External aggregation benchmarks run several aggregation queries with different memory limits, on the TPCH `lineitem` table. The queries can be found in [`external_aggr.rs`](src/bin/external_aggr.rs).
+
+This benchmark is inspired by [DuckDB's external aggregation paper](https://hannes.muehleisen.org/publications/icde2024-out-of-core-kuiper-boncz-muehleisen.pdf), specifically Section VI.
+
+### External Aggregation Example Runs
+1. Run all queries with predefined memory limits:
+```bash
+# Under the 'benchmarks/' directory
+cargo run --release --bin external_aggr -- benchmark -n 4 --iterations 3 -p '....../data/tpch_sf1' -o '/tmp/aggr.json'
+```
+
+2. Run a query with a specific memory limit (see the note after these examples on how the limit is divided across partitions):
+```bash
+cargo run --release --bin external_aggr -- benchmark -n 4 --iterations 3 -p '....../data/tpch_sf1' -o '/tmp/aggr.json' --query 1 --memory-limit 30M
+```
+
+3. Run all queries with the `bench.sh` script:
+```bash
+./bench.sh data external_aggr
+./bench.sh run external_aggr
+```
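A note on `--memory-limit`: as the comment in `bench.sh`'s `run_external_aggr` (later in this diff) explains, the per-operator budget is the total limit divided by the number of partitions. A sketch of a tighter run, assuming `-n` sets the partition count (as `bench.sh`'s `--partitions 4` does for the same binary) and that the TPC-H SF1 data lives at `./data/tpch_sf1`:

```bash
# Hypothetical run: a 20M total limit across 4 partitions leaves each
# partition roughly 20M / 4 = 5M for in-memory aggregation state;
# intermediate results beyond that spill to disk and are merged back later.
cargo run --release --bin external_aggr -- benchmark -n 4 --iterations 3 \
    -p './data/tpch_sf1' -o '/tmp/aggr.json' --query 1 --memory-limit 20M
```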


# Older Benchmarks

41 changes: 39 additions & 2 deletions benchmarks/bench.sh
@@ -78,6 +78,7 @@ sort: Benchmark of sorting speed
 clickbench_1: ClickBench queries against a single parquet file
 clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
 clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
+external_aggr: External aggregation benchmark
 **********
 * Supported Configuration (Environment Variables)
@@ -170,6 +171,10 @@ main() {
         imdb)
             data_imdb
             ;;
+        external_aggr)
+            # same data as for tpch
+            data_tpch "1"
+            ;;
         *)
             echo "Error: unknown benchmark '$BENCHMARK' for data generation"
             usage
@@ -211,6 +216,8 @@ main() {
                 run_clickbench_1
                 run_clickbench_partitioned
                 run_clickbench_extended
+                run_imdb
+                run_external_aggr
                 ;;
             tpch)
                 run_tpch "1"
@@ -239,6 +246,12 @@ main() {
             clickbench_extended)
                 run_clickbench_extended
                 ;;
+            imdb)
+                run_imdb
+                ;;
+            external_aggr)
+                run_external_aggr
+                ;;
             *)
                 echo "Error: unknown benchmark '$BENCHMARK' for run"
                 usage
@@ -353,15 +366,15 @@ run_parquet() {
     RESULTS_FILE="${RESULTS_DIR}/parquet.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running parquet filter benchmark..."
-    $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
+    $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
 }
 
 # Runs the sort benchmark
 run_sort() {
     RESULTS_FILE="${RESULTS_DIR}/sort.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running sort benchmark..."
-    $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
+    $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
 }


@@ -510,7 +523,31 @@ data_imdb() {
     fi
 }
 
+# Runs the imdb benchmark
+run_imdb() {
+    IMDB_DIR="${DATA_DIR}/imdb"
+
+    RESULTS_FILE="${RESULTS_DIR}/imdb.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running imdb benchmark..."
+    $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
+}
+
+# Runs the external aggregation benchmark
+run_external_aggr() {
+    # Use TPC-H SF1 dataset
+    TPCH_DIR="${DATA_DIR}/tpch_sf1"
+    RESULTS_FILE="${RESULTS_DIR}/external_aggr.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running external aggregation benchmark..."
+
+    # Only parquet is supported.
+    # Since the per-operator memory limit is calculated as (total-memory-limit /
+    # number-of-partitions), and by default `--partitions` is set to the number of
+    # CPU cores, we set a constant number of partitions to prevent this
+    # benchmark from failing on some machines.
+    $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
+}


compare_benchmarks() {
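With the new `data` and `run` case arms in place, the IMDB benchmark can be exercised end-to-end through the same driver as the other suites:

```bash
# Download/convert the IMDB dataset, then run the JOB queries against it.
./bench.sh data imdb
./bench.sh run imdb
```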
1 change: 1 addition & 0 deletions benchmarks/queries/imdb/10a.sql
@@ -0,0 +1 @@
+SELECT MIN(chn.name) AS uncredited_voiced_character, MIN(t.title) AS russian_movie FROM char_name AS chn, cast_info AS ci, company_name AS cn, company_type AS ct, movie_companies AS mc, role_type AS rt, title AS t WHERE ci.note like '%(voice)%' and ci.note like '%(uncredited)%' AND cn.country_code = '[ru]' AND rt.role = 'actor' AND t.production_year > 2005 AND t.id = mc.movie_id AND t.id = ci.movie_id AND ci.movie_id = mc.movie_id AND chn.id = ci.person_role_id AND rt.id = ci.role_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id;
1 change: 1 addition & 0 deletions benchmarks/queries/imdb/10b.sql
@@ -0,0 +1 @@
+SELECT MIN(chn.name) AS character, MIN(t.title) AS russian_mov_with_actor_producer FROM char_name AS chn, cast_info AS ci, company_name AS cn, company_type AS ct, movie_companies AS mc, role_type AS rt, title AS t WHERE ci.note like '%(producer)%' AND cn.country_code = '[ru]' AND rt.role = 'actor' AND t.production_year > 2010 AND t.id = mc.movie_id AND t.id = ci.movie_id AND ci.movie_id = mc.movie_id AND chn.id = ci.person_role_id AND rt.id = ci.role_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id;
1 change: 1 addition & 0 deletions benchmarks/queries/imdb/10c.sql
@@ -0,0 +1 @@
+SELECT MIN(chn.name) AS character, MIN(t.title) AS movie_with_american_producer FROM char_name AS chn, cast_info AS ci, company_name AS cn, company_type AS ct, movie_companies AS mc, role_type AS rt, title AS t WHERE ci.note like '%(producer)%' AND cn.country_code = '[us]' AND t.production_year > 1990 AND t.id = mc.movie_id AND t.id = ci.movie_id AND ci.movie_id = mc.movie_id AND chn.id = ci.person_role_id AND rt.id = ci.role_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id;
1 change: 1 addition & 0 deletions benchmarks/queries/imdb/11a.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS from_company, MIN(lt.link) AS movie_link_type, MIN(t.title) AS non_polish_sequel_movie FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND t.production_year BETWEEN 1950 AND 2000 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id;
1 change: 1 addition & 0 deletions benchmarks/queries/imdb/11b.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS from_company, MIN(lt.link) AS movie_link_type, MIN(t.title) AS sequel_movie FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follows%' AND mc.note IS NULL AND t.production_year = 1998 and t.title like '%Money%' AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id;