From 12dca5f0e58ebc99f5e6ecbf5b585c7493a3cf24 Mon Sep 17 00:00:00 2001 From: Kris Geusebroek Date: Fri, 21 Jul 2023 10:02:20 +0200 Subject: [PATCH 01/10] Fix DBT examples --- .github/workflows/whirl-ci.yml | 4 -- envs/dbt-example/.whirl.env | 6 +++ .../compose.setup.d/02_clean_dbt_log.sh | 18 ++++++++ .../compose.teardown.d/01_show_dbt_log.sh | 16 +++++++ examples/dbt-example/dag.py | 3 +- examples/dbt-example/dbt/.gitignore | 2 +- examples/dbt-example/dbt/logs/.gitkeep | 0 examples/dbt-example/dbt/profiles.yml | 2 +- .../whirl.setup.d/02_install_postgres_dbt.sh | 2 + examples/dbt-spark-example/.whirl.env | 6 +++ .../compose.setup.d/02_clean_dbt_log.sh | 18 ++++++++ .../compose.teardown.d/01_show_dbt_log.sh | 16 +++++++ examples/dbt-spark-example/dag.py | 3 +- examples/dbt-spark-example/dbt/.gitignore | 2 +- examples/dbt-spark-example/dbt/profiles.yml | 2 - whirl | 45 +++++++++++++++++++ 16 files changed, 134 insertions(+), 11 deletions(-) create mode 100644 examples/dbt-example/compose.setup.d/02_clean_dbt_log.sh create mode 100644 examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh create mode 100644 examples/dbt-example/dbt/logs/.gitkeep create mode 100644 examples/dbt-spark-example/compose.setup.d/02_clean_dbt_log.sh create mode 100644 examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh diff --git a/.github/workflows/whirl-ci.yml b/.github/workflows/whirl-ci.yml index 73629cc..9a780d1 100644 --- a/.github/workflows/whirl-ci.yml +++ b/.github/workflows/whirl-ci.yml @@ -54,8 +54,6 @@ jobs: - example_dir: ./examples/dbt-spark-example - example_dir: ./examples/spark-delta-sharing - example_dir: ./examples/spark-s3-to-hive - # Exclude failing dbt runs - - example_dir: ./examples/dbt-example env: PYTHON_VERSION: ${{ matrix.python_version }} AIRFLOW_VERSION: ${{ matrix.airflow_version }} @@ -82,8 +80,6 @@ jobs: - example: dbt-spark-example - example: spark-delta-sharing - example: spark-s3-to-hive - # Exclude failing dbt runs - - example: 
dbt-example env: PYTHON_VERSION: ${{ matrix.python_version }} AIRFLOW_VERSION: ${{ matrix.airflow_version }} diff --git a/envs/dbt-example/.whirl.env b/envs/dbt-example/.whirl.env index cde21ef..959fc05 100644 --- a/envs/dbt-example/.whirl.env +++ b/envs/dbt-example/.whirl.env @@ -11,6 +11,12 @@ POSTGRES_PASSWORD=p@ssw0rd POSTGRES_USER=postgres POSTGRES_DB=postgresdb +# DBT verbosity +DBT_LOG_LEVEL=debug +DBT_LOG_FORMAT=text +DBT_LOG_FORMAT_FILE=text +DBT_SEND_ANONYMOUS_USAGE_STATS=False + # Spark variables SPARK_VERSION=3.4.0 diff --git a/examples/dbt-example/compose.setup.d/02_clean_dbt_log.sh b/examples/dbt-example/compose.setup.d/02_clean_dbt_log.sh new file mode 100644 index 0000000..578690d --- /dev/null +++ b/examples/dbt-example/compose.setup.d/02_clean_dbt_log.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +function empty_log_dir() { + echo "=====================================" + echo "== Cleanup local DBT log mount dir ==" + echo "=====================================" + local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} ) + DBT_LOG_DIR="${SCRIPT_DIR}/../dbt/logs" + + if [ "$(ls -A ${DBT_LOG_DIR})" ]; then + echo "${DBT_LOG_DIR} is not empty. Clearing NOW!!" + find ${DBT_LOG_DIR} -mindepth 1 -not -name ".gitkeep" -delete + else + echo "${DBT_LOG_DIR} is empty. Continue" + fi +} + +empty_log_dir diff --git a/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh b/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh new file mode 100644 index 0000000..3a32cae --- /dev/null +++ b/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +function show_logs() { + echo "=======================" + echo "== Show dbt run logs ==" + echo "=======================" + local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} ) + DBT_LOG_DIR="${SCRIPT_DIR}/../dbt/logs" + + if [ "$(ls -A ${DBT_LOG_DIR})" ]; then + echo "${DBT_LOG_DIR} is not empty. Showing log!!" 
+ cat ${DBT_LOG_DIR}/dbt.log + fi +} + +show_logs \ No newline at end of file diff --git a/examples/dbt-example/dag.py b/examples/dbt-example/dag.py index 9720123..d23534b 100644 --- a/examples/dbt-example/dag.py +++ b/examples/dbt-example/dag.py @@ -29,7 +29,8 @@ 'spark.hadoop.fs.s3a.endpoint': "{}:{}".format(os.environ.get('AWS_SERVER', ''), os.environ.get('AWS_PORT', '')), 'spark.hadoop.fs.s3a.connection.ssl.enabled': 'false', 'spark.hadoop.fs.s3a.path.style.access': 'true', - 'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem' + 'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem', + 'spark.hadoop.fs.s3a.aws.credentials.provider': 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider' } dag = DAG(dag_id='whirl-dbt-example', diff --git a/examples/dbt-example/dbt/.gitignore b/examples/dbt-example/dbt/.gitignore index 3117c62..d23b15d 100644 --- a/examples/dbt-example/dbt/.gitignore +++ b/examples/dbt-example/dbt/.gitignore @@ -1,4 +1,4 @@ .idea dbt_modules -logs target +.user.yml diff --git a/examples/dbt-example/dbt/logs/.gitkeep b/examples/dbt-example/dbt/logs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/examples/dbt-example/dbt/profiles.yml b/examples/dbt-example/dbt/profiles.yml index ca48f43..4c0ddb9 100644 --- a/examples/dbt-example/dbt/profiles.yml +++ b/examples/dbt-example/dbt/profiles.yml @@ -26,5 +26,5 @@ flights: # search_path: [optional, override the default postgres search_path] # role: [optional, set the role dbt assumes when executing queries] # sslmode: [optional, set the sslmode used to connect to the database] - target: local + target: airflow diff --git a/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh b/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh index c11a6a9..e80e430 100644 --- a/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh +++ b/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh @@ -1,2 +1,4 @@ #!/usr/bin/env bash pip install 
dbt-postgres + +sudo chown airflow:root /opt/airflow/dags/dbt-example/dbt/logs \ No newline at end of file diff --git a/examples/dbt-spark-example/.whirl.env b/examples/dbt-spark-example/.whirl.env index 20064c5..b6a9883 100644 --- a/examples/dbt-spark-example/.whirl.env +++ b/examples/dbt-spark-example/.whirl.env @@ -1,2 +1,8 @@ WHIRL_ENVIRONMENT=s3-external-spark-hive MOCK_DATA_FOLDER=${DAG_FOLDER}/mock-data + +# DBT verbosity +DBT_LOG_LEVEL=debug +DBT_LOG_FORMAT=text +DBT_LOG_FORMAT_FILE=text +DBT_SEND_ANONYMOUS_USAGE_STATS=False \ No newline at end of file diff --git a/examples/dbt-spark-example/compose.setup.d/02_clean_dbt_log.sh b/examples/dbt-spark-example/compose.setup.d/02_clean_dbt_log.sh new file mode 100644 index 0000000..578690d --- /dev/null +++ b/examples/dbt-spark-example/compose.setup.d/02_clean_dbt_log.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +function empty_log_dir() { + echo "=====================================" + echo "== Cleanup local DBT log mount dir ==" + echo "=====================================" + local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} ) + DBT_LOG_DIR="${SCRIPT_DIR}/../dbt/logs" + + if [ "$(ls -A ${DBT_LOG_DIR})" ]; then + echo "${DBT_LOG_DIR} is not empty. Clearing NOW!!" + find ${DBT_LOG_DIR} -mindepth 1 -not -name ".gitkeep" -delete + else + echo "${DBT_LOG_DIR} is empty. Continue" + fi +} + +empty_log_dir diff --git a/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh b/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh new file mode 100644 index 0000000..3a32cae --- /dev/null +++ b/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +function show_logs() { + echo "=======================" + echo "== Show dbt run logs ==" + echo "=======================" + local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} ) + DBT_LOG_DIR="${SCRIPT_DIR}/../dbt/logs" + + if [ "$(ls -A ${DBT_LOG_DIR})" ]; then + echo "${DBT_LOG_DIR} is not empty. Showing log!!" 
+ cat ${DBT_LOG_DIR}/dbt.log + fi +} + +show_logs \ No newline at end of file diff --git a/examples/dbt-spark-example/dag.py b/examples/dbt-spark-example/dag.py index 4b42102..e4a4782 100644 --- a/examples/dbt-spark-example/dag.py +++ b/examples/dbt-spark-example/dag.py @@ -34,7 +34,8 @@ 'spark.hadoop.fs.s3a.connection.ssl.enabled': 'false', 'spark.hadoop.fs.s3a.path.style.access': 'true', 'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem', - 'spark.hadoop.fs.s3a.multipart.size': '104857600' + 'spark.hadoop.fs.s3a.multipart.size': '104857600', + 'spark.hadoop.fs.s3a.aws.credentials.provider': 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider' } dag = DAG(dag_id='whirl-dbt-spark-example', diff --git a/examples/dbt-spark-example/dbt/.gitignore b/examples/dbt-spark-example/dbt/.gitignore index 3117c62..d23b15d 100644 --- a/examples/dbt-spark-example/dbt/.gitignore +++ b/examples/dbt-spark-example/dbt/.gitignore @@ -1,4 +1,4 @@ .idea dbt_modules -logs target +.user.yml diff --git a/examples/dbt-spark-example/dbt/profiles.yml b/examples/dbt-spark-example/dbt/profiles.yml index 1737576..a602e1b 100644 --- a/examples/dbt-spark-example/dbt/profiles.yml +++ b/examples/dbt-spark-example/dbt/profiles.yml @@ -5,8 +5,6 @@ flights: method: thrift host: sparkthrift port: 10000 - user: hive - password: hive schema: default connect_retries: 5 connect_timeout: 60 diff --git a/whirl b/whirl index 327863f..daa5a2c 100755 --- a/whirl +++ b/whirl @@ -155,6 +155,15 @@ test_dag_state() { echo "Dagruns: $output" >&2 output=$(curl -s -u ${AIRFLOW_ADMIN_USR}:${AIRFLOW_ADMIN_PWD} "http://localhost:${AIRFLOW_UI_PORT}/${AIRFLOW_API_BASE_URI}/dags/${DAG_ID}/dagRuns/~/taskInstances" | jq ".task_instances" ) echo "Tasks: $output" >&2 + # shellcheck disable=SC2207 + log_urls=($(curl -s -u ${AIRFLOW_ADMIN_USR}:${AIRFLOW_ADMIN_PWD} "http://localhost:${AIRFLOW_UI_PORT}/${AIRFLOW_API_BASE_URI}/dags/${DAG_ID}/dagRuns/~/taskInstances" | jq -r '.task_instances[] | 
select(.state!="success" and .try_number>0) | .dag_run_id + "/taskInstances/" + .task_id + "/logs/" + (.try_number|tostring)')) + for uri in "${log_urls[@]}"; do + output=$(curl -s -u ${AIRFLOW_ADMIN_USR}:${AIRFLOW_ADMIN_PWD} "http://localhost:${AIRFLOW_UI_PORT}/${AIRFLOW_API_BASE_URI}/dags/${DAG_ID}/dagRuns/${uri}") + echo "Tasks Instance: $uri" >&2 + echo " Log Output: " >&2 + echo "" >&2 + echo "$output" >&2 + done else echo "$result" fi @@ -196,6 +205,40 @@ run_compose_setup_scripts() { . "$filename" done fi + + if [[ -d ${DAG_FOLDER}/compose.setup.d ]]; then + echo "================================================" + echo "== Setup docker-compose dag specifics ==========" + echo "================================================" + for filename in "${DAG_FOLDER}"/compose.setup.d/*.sh; do + echo "Executing compose dag prepare script: $filename" + # shellcheck disable=SC1090 + . "$filename" + done + fi +} + +run_compose_teardown_scripts() { + if [[ -d ${ENVIRONMENT_FOLDER}/compose.teardown.d ]]; then + echo "============================================" + echo "== Teardown docker-compose specifics =======" + echo "============================================" + for filename in "${ENVIRONMENT_FOLDER}"/compose.teardown.d/*.sh; do + echo "Executing compose teardown script: $filename" + # shellcheck disable=SC1090 + . "$filename" + done + fi + if [[ -d ${DAG_FOLDER}/compose.teardown.d ]]; then + echo "============================================" + echo "== Teardown docker-compose dag specifics ===" + echo "============================================" + for filename in "${DAG_FOLDER}"/compose.teardown.d/*.sh; do + echo "Executing compose teardown script: $filename" + # shellcheck disable=SC1090 + . "$filename" + done + fi } check_dagrun_result() { @@ -221,6 +264,8 @@ check_dagrun_result() { echo "Dag '${DAG_ID}' run(s) failed" # output dagrun result test_dag_state "${DAG_ID}" "${FAILURE_STATE}" true + echo "Running teardown scripts before stopping..." 
+ run_compose_teardown_scripts return 1 else echo "Dag '${DAG_ID}' neither success nor failed!!!" From 53c0af3a8bb90dc0d8528e4363f8aa5c0e67ec4c Mon Sep 17 00:00:00 2001 From: Kris Geusebroek Date: Fri, 21 Jul 2023 11:26:50 +0200 Subject: [PATCH 02/10] Fix DBT examples --- examples/dbt-example/dbt/dbt_project.yml | 4 ++-- examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh | 3 ++- examples/dbt-spark-example/dbt/dbt_project.yml | 4 ++-- examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh | 3 +++ 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/dbt-example/dbt/dbt_project.yml b/examples/dbt-example/dbt/dbt_project.yml index d7357b3..9a93834 100644 --- a/examples/dbt-example/dbt/dbt_project.yml +++ b/examples/dbt-example/dbt/dbt_project.yml @@ -13,10 +13,10 @@ profile: 'flights' # These configurations specify where dbt should look for different types of files. # The `source-paths` config, for example, states that source models can be found # in the "models/" directory. You probably won't need to change these! 
-source-paths: ["models"] +model-paths: ["models"] analysis-paths: ["analysis"] test-paths: ["tests"] -data-paths: ["data"] +seed-paths: ["data"] macro-paths: ["macros"] target-path: "target" # directory which will store compiled SQL files diff --git a/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh b/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh index e80e430..852958c 100644 --- a/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh +++ b/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash pip install dbt-postgres -sudo chown airflow:root /opt/airflow/dags/dbt-example/dbt/logs \ No newline at end of file +sudo chown -R airflow:root /opt/airflow/dags/dbt-example/dbt +sudo chmod -R 755 /opt/airflow/dags/dbt-example/dbt \ No newline at end of file diff --git a/examples/dbt-spark-example/dbt/dbt_project.yml b/examples/dbt-spark-example/dbt/dbt_project.yml index cdeb31f..47d0b74 100644 --- a/examples/dbt-spark-example/dbt/dbt_project.yml +++ b/examples/dbt-spark-example/dbt/dbt_project.yml @@ -13,10 +13,10 @@ profile: 'flights' # These configurations specify where dbt should look for different types of files. # The `source-paths` config, for example, states that source models can be found # in the "models/" directory. You probably won't need to change these! 
-source-paths: ["models"] +model-paths: ["models"] analysis-paths: ["analysis"] test-paths: ["tests"] -data-paths: ["data"] +seed-paths: ["data"] macro-paths: ["macros"] target-path: "target" # directory which will store compiled SQL files diff --git a/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh b/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh index 88a14ec..33bf7dd 100644 --- a/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh +++ b/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh @@ -1,3 +1,6 @@ #!/usr/bin/env bash sudo apt-get install -y libsasl2-dev build-essential pip install agate==1.6.1 airflow-dbt dbt-spark[PyHive] + +sudo chown -R airflow:root /opt/airflow/dags/dbt-spark-example/dbt +sudo chmod -R 755 /opt/airflow/dags/dbt-spark-example/dbt \ No newline at end of file From 36903660c66c5384fd67c42b7a1c283ca32262fd Mon Sep 17 00:00:00 2001 From: Kris Geusebroek Date: Fri, 21 Jul 2023 15:49:27 +0200 Subject: [PATCH 03/10] Fix DBT examples --- examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh | 2 +- examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh | 2 +- whirl | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh b/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh index 852958c..6e277fd 100644 --- a/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh +++ b/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh @@ -2,4 +2,4 @@ pip install dbt-postgres sudo chown -R airflow:root /opt/airflow/dags/dbt-example/dbt -sudo chmod -R 755 /opt/airflow/dags/dbt-example/dbt \ No newline at end of file +sudo chmod -R 644 /opt/airflow/dags/dbt-example/dbt \ No newline at end of file diff --git a/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh b/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh index 33bf7dd..0579030 100644 --- a/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh +++ 
b/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh @@ -3,4 +3,4 @@ sudo apt-get install -y libsasl2-dev build-essential pip install agate==1.6.1 airflow-dbt dbt-spark[PyHive] sudo chown -R airflow:root /opt/airflow/dags/dbt-spark-example/dbt -sudo chmod -R 755 /opt/airflow/dags/dbt-spark-example/dbt \ No newline at end of file +sudo chmod -R 644 /opt/airflow/dags/dbt-spark-example/dbt \ No newline at end of file diff --git a/whirl b/whirl index daa5a2c..3c58d76 100755 --- a/whirl +++ b/whirl @@ -153,8 +153,10 @@ test_dag_state() { if [ "${SHOW_RESULT}" == true ]; then output=$(curl -s -u ${AIRFLOW_ADMIN_USR}:${AIRFLOW_ADMIN_PWD} "http://localhost:${AIRFLOW_UI_PORT}/${AIRFLOW_API_BASE_URI}/dags/${DAG_ID}/dagRuns" | jq ".dag_runs" ) echo "Dagruns: $output" >&2 + sleep 5 # Wait for the task states to be up to date output=$(curl -s -u ${AIRFLOW_ADMIN_USR}:${AIRFLOW_ADMIN_PWD} "http://localhost:${AIRFLOW_UI_PORT}/${AIRFLOW_API_BASE_URI}/dags/${DAG_ID}/dagRuns/~/taskInstances" | jq ".task_instances" ) echo "Tasks: $output" >&2 + sleep 5 # Wait for the logs to be available # shellcheck disable=SC2207 log_urls=($(curl -s -u ${AIRFLOW_ADMIN_USR}:${AIRFLOW_ADMIN_PWD} "http://localhost:${AIRFLOW_UI_PORT}/${AIRFLOW_API_BASE_URI}/dags/${DAG_ID}/dagRuns/~/taskInstances" | jq -r '.task_instances[] | select(.state!="success" and .try_number>0) | .dag_run_id + "/taskInstances/" + .task_id + "/logs/" + (.try_number|tostring)')) for uri in "${log_urls[@]}"; do From bc2c135e71111921369b44c47befd95d7d74b5df Mon Sep 17 00:00:00 2001 From: Kris Geusebroek Date: Fri, 21 Jul 2023 16:44:49 +0200 Subject: [PATCH 04/10] Set correct permissions --- examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh | 3 +-- examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh b/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh index 6e277fd..4b04aad 
100644 --- a/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh +++ b/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash pip install dbt-postgres -sudo chown -R airflow:root /opt/airflow/dags/dbt-example/dbt -sudo chmod -R 644 /opt/airflow/dags/dbt-example/dbt \ No newline at end of file +sudo chmod -R ugo+rw /opt/airflow/dags/dbt-example/dbt \ No newline at end of file diff --git a/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh b/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh index 0579030..5213fda 100644 --- a/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh +++ b/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh @@ -2,5 +2,4 @@ sudo apt-get install -y libsasl2-dev build-essential pip install agate==1.6.1 airflow-dbt dbt-spark[PyHive] -sudo chown -R airflow:root /opt/airflow/dags/dbt-spark-example/dbt -sudo chmod -R 644 /opt/airflow/dags/dbt-spark-example/dbt \ No newline at end of file +sudo chmod -R ugo+rw /opt/airflow/dags/dbt-spark-example/dbt \ No newline at end of file From 19496e0ab732ed1b86684628a503c7065c455f59 Mon Sep 17 00:00:00 2001 From: Kris Geusebroek Date: Sun, 23 Jul 2023 14:17:02 +0200 Subject: [PATCH 05/10] Prevent failure when unpausing dag --- examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh | 2 +- .../dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh | 2 +- whirl | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh b/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh index 3a32cae..6f58e22 100644 --- a/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh +++ b/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh @@ -9,7 +9,7 @@ function show_logs() { if [ "$(ls -A ${DBT_LOG_DIR})" ]; then echo "${DBT_LOG_DIR} is not empty. Showing log!!" 
- cat ${DBT_LOG_DIR}/dbt.log + cat ${DBT_LOG_DIR}/dbt.log || true fi } diff --git a/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh b/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh index 3a32cae..6f58e22 100644 --- a/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh +++ b/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh @@ -9,7 +9,7 @@ function show_logs() { if [ "$(ls -A ${DBT_LOG_DIR})" ]; then echo "${DBT_LOG_DIR} is not empty. Showing log!!" - cat ${DBT_LOG_DIR}/dbt.log + cat ${DBT_LOG_DIR}/dbt.log || true fi } diff --git a/whirl b/whirl index 3c58d76..fae138d 100755 --- a/whirl +++ b/whirl @@ -355,6 +355,7 @@ start() { exit 8 fi else + sleep 10 # wait for dag to be available. if [[ "$(curl -s -o /dev/null -w %\{http_code\} -X PATCH -H 'Content-Type: application/json' --user ${AIRFLOW_ADMIN_USR}:${AIRFLOW_ADMIN_PWD} "http://localhost:${AIRFLOW_UI_PORT}/${AIRFLOW_API_BASE_URI}/dags/${DAG_ID}?update_mask=is_paused" -d '{ "is_paused": false }')" != "200" ]]; then echo "Unable to unpause dag with id ${DAG_ID}." 
echo "Command: curl -X PATCH -H 'Content-Type: application/json' --user ${AIRFLOW_ADMIN_USR}:****** \"http://localhost:${AIRFLOW_UI_PORT}/${AIRFLOW_API_BASE_URI}/dags/${DAG_ID}?update_mask=is_paused\" -d '{ \"is_paused\": false }'" From 6c43e935af73801461393335a204c177556e6f4c Mon Sep 17 00:00:00 2001 From: Kris Geusebroek Date: Fri, 25 Aug 2023 11:47:15 +0200 Subject: [PATCH 06/10] Use different dbt operator provider library --- envs/dbt-example/.gitignore | 1 + .../compose.setup.d/01_clean_pg_data_dir.sh | 18 ++++++++++++++++++ envs/dbt-example/docker-compose.yml | 2 ++ .../whirl.setup.d/04_install_dbt.sh | 10 +++++++++- examples/dbt-example/dag.py | 6 +++--- .../whirl.setup.d/02_install_postgres_dbt.sh | 2 +- .../compose.teardown.d/01_show_dbt_log.sh | 2 +- examples/dbt-spark-example/dag.py | 6 +++--- .../whirl.setup.d/02_add_dbt.sh | 10 +++++++++- 9 files changed, 47 insertions(+), 10 deletions(-) create mode 100644 envs/dbt-example/.gitignore create mode 100644 envs/dbt-example/compose.setup.d/01_clean_pg_data_dir.sh diff --git a/envs/dbt-example/.gitignore b/envs/dbt-example/.gitignore new file mode 100644 index 0000000..6600d6a --- /dev/null +++ b/envs/dbt-example/.gitignore @@ -0,0 +1 @@ +.pgdata \ No newline at end of file diff --git a/envs/dbt-example/compose.setup.d/01_clean_pg_data_dir.sh b/envs/dbt-example/compose.setup.d/01_clean_pg_data_dir.sh new file mode 100644 index 0000000..3887ae8 --- /dev/null +++ b/envs/dbt-example/compose.setup.d/01_clean_pg_data_dir.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +function empty_data_dir() { + echo "================================" + echo "== Cleanup local PG mount dir ==" + echo "================================" + local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} ) + PG_DATA_DIR="${SCRIPT_DIR}/../.pgdata" + + if [ "$(ls -A ${PG_DATA_DIR})" ]; then + echo "${PG_DATA_DIR} is not empty. Clearing NOW!!" + find ${PG_DATA_DIR} -mindepth 1 -delete + else + echo "${PG_DATA_DIR} is empty. 
Continue" + fi +} + +empty_data_dir \ No newline at end of file diff --git a/envs/dbt-example/docker-compose.yml b/envs/dbt-example/docker-compose.yml index 5581487..f9fdb58 100644 --- a/envs/dbt-example/docker-compose.yml +++ b/envs/dbt-example/docker-compose.yml @@ -45,6 +45,8 @@ services: - POSTGRES_PASSWORD - POSTGRES_USER - POSTGRES_DB + volumes: + - ./.pgdata:/var/lib/postgresql/data sparkmaster: build: diff --git a/envs/dbt-example/whirl.setup.d/04_install_dbt.sh b/envs/dbt-example/whirl.setup.d/04_install_dbt.sh index 068c35d..0a7602a 100644 --- a/envs/dbt-example/whirl.setup.d/04_install_dbt.sh +++ b/envs/dbt-example/whirl.setup.d/04_install_dbt.sh @@ -1,3 +1,11 @@ #!/usr/bin/env bash sudo apt-get install -y libsasl2-dev build-essential -pip install agate==1.6.1 airflow-dbt +pip install dbt-core==1.4.5 airflow-dbt-python + +#airflow-dbt-python depends on the fs_default connection +echo "=====================================" +echo "== Configure FS Default connection ==" +echo "=====================================" +airflow connections add fs_default \ + --conn-type fs \ + --conn-extra "{\"path\": \"/\"}" diff --git a/examples/dbt-example/dag.py b/examples/dbt-example/dag.py index d23534b..fd3b4d4 100644 --- a/examples/dbt-example/dag.py +++ b/examples/dbt-example/dag.py @@ -4,7 +4,7 @@ from airflow.operators.bash_operator import BashOperator from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator -from airflow_dbt.operators.dbt_operator import DbtRunOperator, DbtTestOperator +from airflow_dbt_python.operators.dbt import DbtRunOperator, DbtTestOperator default_args = { 'owner': 'whirl', @@ -87,7 +87,7 @@ dbt_run = DbtRunOperator( task_id='dbt_run', - dir=DBT_DIRECTORY, + project_dir=DBT_DIRECTORY, profiles_dir=DBT_DIRECTORY, target='airflow', dag=dag @@ -96,7 +96,7 @@ dbt_test = DbtTestOperator( task_id='dbt_test', - dir=DBT_DIRECTORY, + project_dir=DBT_DIRECTORY, profiles_dir=DBT_DIRECTORY, target='airflow', dag=dag diff --git 
a/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh b/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh index 4b04aad..28b877d 100644 --- a/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh +++ b/examples/dbt-example/whirl.setup.d/02_install_postgres_dbt.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash -pip install dbt-postgres +pip install dbt-postgres==1.4.5 sudo chmod -R ugo+rw /opt/airflow/dags/dbt-example/dbt \ No newline at end of file diff --git a/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh b/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh index 6f58e22..a9143e3 100644 --- a/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh +++ b/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh @@ -9,7 +9,7 @@ function show_logs() { if [ "$(ls -A ${DBT_LOG_DIR})" ]; then echo "${DBT_LOG_DIR} is not empty. Showing log!!" - cat ${DBT_LOG_DIR}/dbt.log || true + sudo cat ${DBT_LOG_DIR}/dbt.log || true fi } diff --git a/examples/dbt-spark-example/dag.py b/examples/dbt-spark-example/dag.py index e4a4782..36d7e62 100644 --- a/examples/dbt-spark-example/dag.py +++ b/examples/dbt-spark-example/dag.py @@ -4,7 +4,7 @@ from airflow.operators.bash_operator import BashOperator from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator -from airflow_dbt.operators.dbt_operator import DbtRunOperator, DbtTestOperator +from airflow_dbt_python.operators.dbt import DbtRunOperator, DbtTestOperator default_args = { 'owner': 'whirl', @@ -98,7 +98,7 @@ dbt_run = DbtRunOperator( task_id='dbt_run', - dir=DBT_DIRECTORY, + project_dir=DBT_DIRECTORY, profiles_dir=DBT_DIRECTORY, target='hive', dag=dag @@ -107,7 +107,7 @@ dbt_test = DbtTestOperator( task_id='dbt_test', - dir=DBT_DIRECTORY, + project_dir=DBT_DIRECTORY, profiles_dir=DBT_DIRECTORY, target='hive', dag=dag diff --git a/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh b/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh 
index 5213fda..26e6465 100644 --- a/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh +++ b/examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh @@ -1,5 +1,13 @@ #!/usr/bin/env bash sudo apt-get install -y libsasl2-dev build-essential -pip install agate==1.6.1 airflow-dbt dbt-spark[PyHive] +pip install dbt-core==1.4.5 dbt-spark[PyHive] airflow-dbt-python + +#airflow-dbt-python depends on the fs_default connection +echo "=====================================" +echo "== Configure FS Default connection ==" +echo "=====================================" +airflow connections add fs_default \ + --conn-type fs \ + --conn-extra "{\"path\": \"/\"}" sudo chmod -R ugo+rw /opt/airflow/dags/dbt-spark-example/dbt \ No newline at end of file From 220b9657171d790b5aa7b15a939dda5750b07021 Mon Sep 17 00:00:00 2001 From: Kris Geusebroek Date: Fri, 25 Aug 2023 13:46:36 +0200 Subject: [PATCH 07/10] Prevent failure in show log --- examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh | 6 +++--- .../dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh b/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh index 6f58e22..8068c9e 100644 --- a/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh +++ b/examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh @@ -7,9 +7,9 @@ function show_logs() { local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} ) DBT_LOG_DIR="${SCRIPT_DIR}/../dbt/logs" - if [ "$(ls -A ${DBT_LOG_DIR})" ]; then - echo "${DBT_LOG_DIR} is not empty. Showing log!!" - cat ${DBT_LOG_DIR}/dbt.log || true + if [ "$(ls -A ${DBT_LOG_DIR}/dbt.log)" ]; then + echo "${DBT_LOG_DIR}/dbt.log exists. Showing log!!" 
+ sudo cat ${DBT_LOG_DIR}/dbt.log fi } diff --git a/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh b/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh index a9143e3..8068c9e 100644 --- a/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh +++ b/examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh @@ -7,9 +7,9 @@ function show_logs() { local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} ) DBT_LOG_DIR="${SCRIPT_DIR}/../dbt/logs" - if [ "$(ls -A ${DBT_LOG_DIR})" ]; then - echo "${DBT_LOG_DIR} is not empty. Showing log!!" - sudo cat ${DBT_LOG_DIR}/dbt.log || true + if [ "$(ls -A ${DBT_LOG_DIR}/dbt.log)" ]; then + echo "${DBT_LOG_DIR}/dbt.log exists. Showing log!!" + sudo cat ${DBT_LOG_DIR}/dbt.log fi } From 37c8f0f66c0579079173c433fdb7261e7b6ba0e1 Mon Sep 17 00:00:00 2001 From: Kris Geusebroek Date: Fri, 25 Aug 2023 14:47:36 +0200 Subject: [PATCH 08/10] Prevent running dbt in parallel --- .github/workflows/whirl-ci.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.github/workflows/whirl-ci.yml b/.github/workflows/whirl-ci.yml index 9a780d1..a8d2188 100644 --- a/.github/workflows/whirl-ci.yml +++ b/.github/workflows/whirl-ci.yml @@ -54,6 +54,8 @@ jobs: - example_dir: ./examples/dbt-spark-example - example_dir: ./examples/spark-delta-sharing - example_dir: ./examples/spark-s3-to-hive + # Run without parallelism separately + - example_dir: ./examples/dbt-example env: PYTHON_VERSION: ${{ matrix.python_version }} AIRFLOW_VERSION: ${{ matrix.airflow_version }} @@ -65,6 +67,26 @@ jobs: echo Run Ci from example directory ${{ matrix.example_dir }} ../../whirl ci + whirl-ci-dbt-example: + needs: [directories] + runs-on: ubuntu-latest + strategy: + fail-fast: false + max-parallel: 1 + matrix: + python_version: ["3.8", "3.9"] + airflow_version: ["2.2.5", "2.6.3"] + env: + PYTHON_VERSION: ${{ matrix.python_version }} + AIRFLOW_VERSION: ${{ matrix.airflow_version }} + steps: + - uses: 
actions/checkout@v2 + - name: Run whirl CI dbt example + working-directory: ./examples/dbt-example + run: | + echo Run Ci from dbt-example directory + ../../whirl ci + whirl-ci-default-envs-from-root-dir: needs: [examples] runs-on: ubuntu-latest @@ -80,6 +102,8 @@ jobs: - example: dbt-spark-example - example: spark-delta-sharing - example: spark-s3-to-hive + # Run without parallelism separately + - example_dir: ./examples/dbt-example env: PYTHON_VERSION: ${{ matrix.python_version }} AIRFLOW_VERSION: ${{ matrix.airflow_version }} From 64bab57aba82a7656ca8e30ba936905723d39b54 Mon Sep 17 00:00:00 2001 From: Kris Geusebroek Date: Fri, 25 Aug 2023 16:10:32 +0200 Subject: [PATCH 09/10] Fix workflow typo --- .github/workflows/whirl-ci.yml | 40 ++++++++++++++++------------------ 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/.github/workflows/whirl-ci.yml b/.github/workflows/whirl-ci.yml index a8d2188..3053eb9 100644 --- a/.github/workflows/whirl-ci.yml +++ b/.github/workflows/whirl-ci.yml @@ -67,26 +67,6 @@ jobs: echo Run Ci from example directory ${{ matrix.example_dir }} ../../whirl ci - whirl-ci-dbt-example: - needs: [directories] - runs-on: ubuntu-latest - strategy: - fail-fast: false - max-parallel: 1 - matrix: - python_version: ["3.8", "3.9"] - airflow_version: ["2.2.5", "2.6.3"] - env: - PYTHON_VERSION: ${{ matrix.python_version }} - AIRFLOW_VERSION: ${{ matrix.airflow_version }} - steps: - - uses: actions/checkout@v2 - - name: Run whirl CI dbt example - working-directory: ./examples/dbt-example - run: | - echo Run Ci from dbt-example directory - ../../whirl ci - whirl-ci-default-envs-from-root-dir: needs: [examples] runs-on: ubuntu-latest @@ -103,7 +83,7 @@ jobs: - example: spark-delta-sharing - example: spark-s3-to-hive # Run without parallelism separately - - example_dir: ./examples/dbt-example + - example: dbt-example env: PYTHON_VERSION: ${{ matrix.python_version }} AIRFLOW_VERSION: ${{ matrix.airflow_version }} @@ -114,6 +94,24 @@ jobs: echo 
Run Ci for example ${{ matrix.example }} ./whirl -x ${{ matrix.example }} ci + whirl-ci-dbt-example: + runs-on: ubuntu-latest + strategy: + fail-fast: false + max-parallel: 1 + matrix: + python_version: ["3.8", "3.9"] + airflow_version: ["2.2.5", "2.6.3"] + env: + PYTHON_VERSION: ${{ matrix.python_version }} + AIRFLOW_VERSION: ${{ matrix.airflow_version }} + steps: + - uses: actions/checkout@v2 + - name: Run whirl CI dbt example + working-directory: ./examples/dbt-example + run: | + echo Run Ci from dbt-example directory + ../../whirl ci whirl-ci-extra-env-spark-s3-to-postgres: runs-on: ubuntu-latest From 0a0fd9fa8a7ae6d983b0665ac7b1a12b6d565d8b Mon Sep 17 00:00:00 2001 From: Kris Geusebroek Date: Sun, 15 Oct 2023 11:46:30 +0200 Subject: [PATCH 10/10] Bump airflow version to latest --- .github/workflows/whirl-ci.yml | 10 +++++----- .whirl.env | 2 +- docker/airflow-python/Dockerfile | 4 ++-- .../compose.setup.d/Dockerfile.worker | 2 +- examples/dbt-example/.whirl.env | 1 + 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/whirl-ci.yml b/.github/workflows/whirl-ci.yml index 3053eb9..5a02442 100644 --- a/.github/workflows/whirl-ci.yml +++ b/.github/workflows/whirl-ci.yml @@ -48,7 +48,7 @@ jobs: matrix: example_dir: ${{ fromJson(needs.directories.outputs.dir) }} python_version: ["3.8", "3.9"] - airflow_version: ["2.2.5", "2.6.3"] + airflow_version: ["2.2.5", "2.7.2"] exclude: # Needs more memory than available on the runner - example_dir: ./examples/dbt-spark-example @@ -76,7 +76,7 @@ jobs: matrix: example: ${{ fromJson(needs.examples.outputs.example) }} python_version: ["3.8", "3.9"] - airflow_version: ["2.2.5", "2.6.3"] + airflow_version: ["2.2.5", "2.7.2"] exclude: # Needs more memory than available on the runner - example: dbt-spark-example @@ -101,7 +101,7 @@ jobs: max-parallel: 1 matrix: python_version: ["3.8", "3.9"] - airflow_version: ["2.2.5", "2.6.3"] + airflow_version: ["2.2.5", "2.7.2"] env: PYTHON_VERSION: ${{ 
matrix.python_version }} AIRFLOW_VERSION: ${{ matrix.airflow_version }} @@ -129,7 +129,7 @@ jobs: max-parallel: 4 matrix: python_version: ["3.8", "3.9"] - airflow_version: ["2.2.5", "2.6.3"] + airflow_version: ["2.2.5", "2.7.2"] runs-on: ubuntu-latest env: PYTHON_VERSION: ${{ matrix.python_version }} @@ -148,7 +148,7 @@ jobs: max-parallel: 4 matrix: python_version: ["3.8", "3.9"] - airflow_version: ["2.2.5", "2.6.3"] + airflow_version: ["2.2.5", "2.7.2"] runs-on: ubuntu-latest env: PYTHON_VERSION: ${{ matrix.python_version }} diff --git a/.whirl.env b/.whirl.env index fe4afc0..75942eb 100644 --- a/.whirl.env +++ b/.whirl.env @@ -1,4 +1,4 @@ -AIRFLOW_VERSION=2.6.3 +AIRFLOW_VERSION=2.7.2 PYTHON_VERSION=3.9 AIRFLOW__API__AUTH_BACKEND=airflow.api.auth.backend.basic_auth MINIMAL_AIRFLOW_VERSION=2.2.5 \ No newline at end of file diff --git a/docker/airflow-python/Dockerfile b/docker/airflow-python/Dockerfile index a68c380..527a2af 100644 --- a/docker/airflow-python/Dockerfile +++ b/docker/airflow-python/Dockerfile @@ -1,10 +1,10 @@ -ARG AIRFLOW_VERSION=2.6.0 +ARG AIRFLOW_VERSION=2.7.2 ARG PYTHON_VERSION=3.9 FROM apache/airflow:${AIRFLOW_VERSION}-python${PYTHON_VERSION} USER root -ARG AIRFLOW_VERSION=2.6.0 +ARG AIRFLOW_VERSION=2.7.2 ENV AIRFLOW_VERSION=${AIRFLOW_VERSION} ENV WHIRL_SETUP_FOLDER=/etc/airflow/whirl.setup.d diff --git a/envs/api-python-s3-k8s/compose.setup.d/Dockerfile.worker b/envs/api-python-s3-k8s/compose.setup.d/Dockerfile.worker index 22cb441..6dc6e1e 100644 --- a/envs/api-python-s3-k8s/compose.setup.d/Dockerfile.worker +++ b/envs/api-python-s3-k8s/compose.setup.d/Dockerfile.worker @@ -1,4 +1,4 @@ -ARG AIRFLOW_VERSION=2.6.3 +ARG AIRFLOW_VERSION=2.7.2 ARG PYTHON_VERSION=3.9 FROM apache/airflow:${AIRFLOW_VERSION}-python${PYTHON_VERSION} diff --git a/examples/dbt-example/.whirl.env b/examples/dbt-example/.whirl.env index 591dd77..88e6d12 100644 --- a/examples/dbt-example/.whirl.env +++ b/examples/dbt-example/.whirl.env @@ -1,2 +1,3 @@ 
 WHIRL_ENVIRONMENT=dbt-example
 MOCK_DATA_FOLDER=${DAG_FOLDER}/mock-data
+PYTHON_VERSION=3.8