Merge pull request #95 from godatadriven/kg/dbt-logging-fix

Fix DBT examples

krisgeus authored Oct 15, 2023
2 parents f8c217d + 0a0fd9f commit cddeb66
Showing 27 changed files with 218 additions and 30 deletions.
30 changes: 24 additions & 6 deletions .github/workflows/whirl-ci.yml
@@ -48,13 +48,13 @@ jobs:
matrix:
example_dir: ${{ fromJson(needs.directories.outputs.dir) }}
python_version: ["3.8", "3.9"]
airflow_version: ["2.2.5", "2.6.3"]
airflow_version: ["2.2.5", "2.7.2"]
exclude:
# Needs more memory than available on the runner
- example_dir: ./examples/dbt-spark-example
- example_dir: ./examples/spark-delta-sharing
- example_dir: ./examples/spark-s3-to-hive
# Exclude failing dbt runs
# Run without parallelism separately
- example_dir: ./examples/dbt-example
env:
PYTHON_VERSION: ${{ matrix.python_version }}
@@ -76,13 +76,13 @@ jobs:
matrix:
example: ${{ fromJson(needs.examples.outputs.example) }}
python_version: ["3.8", "3.9"]
airflow_version: ["2.2.5", "2.6.3"]
airflow_version: ["2.2.5", "2.7.2"]
exclude:
# Needs more memory than available on the runner
- example: dbt-spark-example
- example: spark-delta-sharing
- example: spark-s3-to-hive
# Exclude failing dbt runs
# Run without parallelism separately
- example: dbt-example
env:
PYTHON_VERSION: ${{ matrix.python_version }}
@@ -94,6 +94,24 @@ jobs:
echo Run Ci for example ${{ matrix.example }}
./whirl -x ${{ matrix.example }} ci
whirl-ci-dbt-example:
runs-on: ubuntu-latest
strategy:
fail-fast: false
max-parallel: 1
matrix:
python_version: ["3.8", "3.9"]
airflow_version: ["2.2.5", "2.7.2"]
env:
PYTHON_VERSION: ${{ matrix.python_version }}
AIRFLOW_VERSION: ${{ matrix.airflow_version }}
steps:
- uses: actions/checkout@v2
- name: Run whirl CI dbt example
working-directory: ./examples/dbt-example
run: |
echo Run Ci from dbt-example directory
../../whirl ci
whirl-ci-extra-env-spark-s3-to-postgres:
runs-on: ubuntu-latest
@@ -111,7 +129,7 @@
max-parallel: 4
matrix:
python_version: ["3.8", "3.9"]
airflow_version: ["2.2.5", "2.6.3"]
airflow_version: ["2.2.5", "2.7.2"]
runs-on: ubuntu-latest
env:
PYTHON_VERSION: ${{ matrix.python_version }}
@@ -130,7 +148,7 @@
max-parallel: 4
matrix:
python_version: ["3.8", "3.9"]
airflow_version: ["2.2.5", "2.6.3"]
airflow_version: ["2.2.5", "2.7.2"]
runs-on: ubuntu-latest
env:
PYTHON_VERSION: ${{ matrix.python_version }}
2 changes: 1 addition & 1 deletion .whirl.env
@@ -1,4 +1,4 @@
AIRFLOW_VERSION=2.6.3
AIRFLOW_VERSION=2.7.2
PYTHON_VERSION=3.9
AIRFLOW__API__AUTH_BACKEND=airflow.api.auth.backend.basic_auth
MINIMAL_AIRFLOW_VERSION=2.2.5
4 changes: 2 additions & 2 deletions docker/airflow-python/Dockerfile
@@ -1,10 +1,10 @@
ARG AIRFLOW_VERSION=2.6.0
ARG AIRFLOW_VERSION=2.7.2
ARG PYTHON_VERSION=3.9
FROM apache/airflow:${AIRFLOW_VERSION}-python${PYTHON_VERSION}

USER root

ARG AIRFLOW_VERSION=2.6.0
ARG AIRFLOW_VERSION=2.7.2
ENV AIRFLOW_VERSION=${AIRFLOW_VERSION}
ENV WHIRL_SETUP_FOLDER=/etc/airflow/whirl.setup.d

2 changes: 1 addition & 1 deletion envs/api-python-s3-k8s/compose.setup.d/Dockerfile.worker
@@ -1,4 +1,4 @@
ARG AIRFLOW_VERSION=2.6.3
ARG AIRFLOW_VERSION=2.7.2
ARG PYTHON_VERSION=3.9
FROM apache/airflow:${AIRFLOW_VERSION}-python${PYTHON_VERSION}

1 change: 1 addition & 0 deletions envs/dbt-example/.gitignore
@@ -0,0 +1 @@
.pgdata
6 changes: 6 additions & 0 deletions envs/dbt-example/.whirl.env
@@ -11,6 +11,12 @@ POSTGRES_PASSWORD=p@ssw0rd
POSTGRES_USER=postgres
POSTGRES_DB=postgresdb

# DBT verbosity
DBT_LOG_LEVEL=debug
DBT_LOG_FORMAT=text
DBT_LOG_FORMAT_FILE=text
DBT_SEND_ANONYMOUS_USAGE_STATS=False

# Spark variables
SPARK_VERSION=3.4.0

18 changes: 18 additions & 0 deletions envs/dbt-example/compose.setup.d/01_clean_pg_data_dir.sh
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

function empty_data_dir() {
echo "================================"
echo "== Cleanup local PG mount dir =="
echo "================================"
local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} )
PG_DATA_DIR="${SCRIPT_DIR}/../.pgdata"

if [ "$(ls -A ${PG_DATA_DIR})" ]; then
echo "${PG_DATA_DIR} is not empty. Clearing NOW!!"
find ${PG_DATA_DIR} -mindepth 1 -delete
else
echo "${PG_DATA_DIR} is empty. Continue"
fi
}

empty_data_dir
2 changes: 2 additions & 0 deletions envs/dbt-example/docker-compose.yml
@@ -45,6 +45,8 @@ services:
- POSTGRES_PASSWORD
- POSTGRES_USER
- POSTGRES_DB
volumes:
- ./.pgdata:/var/lib/postgresql/data

sparkmaster:
build:
10 changes: 9 additions & 1 deletion envs/dbt-example/whirl.setup.d/04_install_dbt.sh
@@ -1,3 +1,11 @@
#!/usr/bin/env bash
sudo apt-get install -y libsasl2-dev build-essential
pip install agate==1.6.1 airflow-dbt
pip install dbt-core==1.4.5 airflow-dbt-python

#airflow-dbt-python depends on the fs_default connection
echo "====================================="
echo "== Configure FS Default connection =="
echo "====================================="
airflow connections add fs_default \
--conn-type fs \
--conn-extra "{\"path\": \"/\"}"
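
The setup script above registers an fs_default filesystem connection because, as its comment notes, airflow-dbt-python depends on that connection to resolve local dbt project files. For reference only, a minimal sketch of the same registration done through the Airflow ORM instead of the CLI (connection values copied from the script above; this code is not part of the commit):

# Sketch: register fs_default via the Airflow ORM rather than the CLI call above.
from airflow import settings
from airflow.models import Connection

session = settings.Session()
if not session.query(Connection).filter(Connection.conn_id == "fs_default").first():
    session.add(Connection(conn_id="fs_default", conn_type="fs", extra='{"path": "/"}'))
    session.commit()
session.close()
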
1 change: 1 addition & 0 deletions examples/dbt-example/.whirl.env
@@ -1,2 +1,3 @@
WHIRL_ENVIRONMENT=dbt-example
MOCK_DATA_FOLDER=${DAG_FOLDER}/mock-data
PYTHON_VERSION=3.8
18 changes: 18 additions & 0 deletions examples/dbt-example/compose.setup.d/02_clean_dbt_log.sh
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

function empty_log_dir() {
echo "====================================="
echo "== Cleanup local DBT log mount dir =="
echo "====================================="
local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} )
DBT_LOG_DIR="${SCRIPT_DIR}/../dbt/logs"

if [ "$(ls -A ${DBT_LOG_DIR})" ]; then
echo "${DBT_LOG_DIR} is not empty. Clearing NOW!!"
find ${DBT_LOG_DIR} -mindepth 1 -not -name ".gitkeep" -delete
else
echo "${DBT_LOG_DIR} is empty. Continue"
fi
}

empty_log_dir
16 changes: 16 additions & 0 deletions examples/dbt-example/compose.teardown.d/01_show_dbt_log.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

function show_logs() {
echo "======================="
echo "== Show dbt run logs =="
echo "======================="
local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} )
DBT_LOG_DIR="${SCRIPT_DIR}/../dbt/logs"

if [ "$(ls -A ${DBT_LOG_DIR}/dbt.log)" ]; then
echo "${DBT_LOG_DIR}/dbt.log exists. Showing log!!"
sudo cat ${DBT_LOG_DIR}/dbt.log
fi
}

show_logs
9 changes: 5 additions & 4 deletions examples/dbt-example/dag.py
@@ -4,7 +4,7 @@

from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow_dbt.operators.dbt_operator import DbtRunOperator, DbtTestOperator
from airflow_dbt_python.operators.dbt import DbtRunOperator, DbtTestOperator

default_args = {
'owner': 'whirl',
@@ -29,7 +29,8 @@
'spark.hadoop.fs.s3a.endpoint': "{}:{}".format(os.environ.get('AWS_SERVER', ''), os.environ.get('AWS_PORT', '')),
'spark.hadoop.fs.s3a.connection.ssl.enabled': 'false',
'spark.hadoop.fs.s3a.path.style.access': 'true',
'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem'
'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
'spark.hadoop.fs.s3a.aws.credentials.provider': 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider'
}

dag = DAG(dag_id='whirl-dbt-example',
@@ -86,7 +87,7 @@

dbt_run = DbtRunOperator(
task_id='dbt_run',
dir=DBT_DIRECTORY,
project_dir=DBT_DIRECTORY,
profiles_dir=DBT_DIRECTORY,
target='airflow',
dag=dag
@@ -95,7 +96,7 @@

dbt_test = DbtTestOperator(
task_id='dbt_test',
dir=DBT_DIRECTORY,
project_dir=DBT_DIRECTORY,
profiles_dir=DBT_DIRECTORY,
target='airflow',
dag=dag
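
The dag.py hunks above replace the airflow_dbt operators with airflow_dbt_python, whose DbtRunOperator and DbtTestOperator take project_dir instead of dir. A condensed sketch of the migrated tasks (the DAG arguments and the DBT_DIRECTORY value are illustrative; the real definitions live in examples/dbt-example/dag.py):

# Condensed sketch of the migrated tasks; only the operator arguments come from the diff above.
from datetime import datetime

from airflow import DAG
from airflow_dbt_python.operators.dbt import DbtRunOperator, DbtTestOperator

DBT_DIRECTORY = "/opt/airflow/dags/dbt-example/dbt"  # assumed path, for illustration

with DAG(dag_id="whirl-dbt-example-sketch",
         start_date=datetime(2023, 1, 1),
         schedule_interval=None) as dag:
    dbt_run = DbtRunOperator(
        task_id="dbt_run",
        project_dir=DBT_DIRECTORY,   # previously dir= with the old airflow-dbt operator
        profiles_dir=DBT_DIRECTORY,
        target="airflow",
    )
    dbt_test = DbtTestOperator(
        task_id="dbt_test",
        project_dir=DBT_DIRECTORY,
        profiles_dir=DBT_DIRECTORY,
        target="airflow",
    )
    dbt_run >> dbt_test
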
2 changes: 1 addition & 1 deletion examples/dbt-example/dbt/.gitignore
@@ -1,4 +1,4 @@
.idea
dbt_modules
logs
target
.user.yml
4 changes: 2 additions & 2 deletions examples/dbt-example/dbt/dbt_project.yml
@@ -13,10 +13,10 @@ profile: 'flights'
# These configurations specify where dbt should look for different types of files.
# The `source-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
source-paths: ["models"]
model-paths: ["models"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
data-paths: ["data"]
seed-paths: ["data"]
macro-paths: ["macros"]

target-path: "target" # directory which will store compiled SQL files
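
The dbt_project.yml edit above follows the key renames introduced in dbt 1.0: source-paths became model-paths and data-paths became seed-paths. A small, hypothetical helper for spotting the old keys before upgrading (assumes PyYAML; not part of this repository):

# Hypothetical helper: warn about pre-1.0 dbt_project.yml keys. Requires PyYAML.
import sys
import yaml

RENAMED_KEYS = {"source-paths": "model-paths", "data-paths": "seed-paths"}

with open("dbt_project.yml") as f:
    project = yaml.safe_load(f) or {}

for old_key, new_key in RENAMED_KEYS.items():
    if old_key in project:
        print(f"dbt_project.yml uses '{old_key}'; rename it to '{new_key}' for dbt >= 1.0",
              file=sys.stderr)
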
Empty file.
2 changes: 1 addition & 1 deletion examples/dbt-example/dbt/profiles.yml
@@ -26,5 +26,5 @@ flights:
# search_path: [optional, override the default postgres search_path]
# role: [optional, set the role dbt assumes when executing queries]
# sslmode: [optional, set the sslmode used to connect to the database]
target: local
target: airflow

@@ -1,2 +1,4 @@
#!/usr/bin/env bash
pip install dbt-postgres
pip install dbt-postgres==1.4.5

sudo chmod -R ugo+rw /opt/airflow/dags/dbt-example/dbt
6 changes: 6 additions & 0 deletions examples/dbt-spark-example/.whirl.env
@@ -1,2 +1,8 @@
WHIRL_ENVIRONMENT=s3-external-spark-hive
MOCK_DATA_FOLDER=${DAG_FOLDER}/mock-data

# DBT verbosity
DBT_LOG_LEVEL=debug
DBT_LOG_FORMAT=text
DBT_LOG_FORMAT_FILE=text
DBT_SEND_ANONYMOUS_USAGE_STATS=False
18 changes: 18 additions & 0 deletions examples/dbt-spark-example/compose.setup.d/02_clean_dbt_log.sh
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

function empty_log_dir() {
echo "====================================="
echo "== Cleanup local DBT log mount dir =="
echo "====================================="
local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} )
DBT_LOG_DIR="${SCRIPT_DIR}/../dbt/logs"

if [ "$(ls -A ${DBT_LOG_DIR})" ]; then
echo "${DBT_LOG_DIR} is not empty. Clearing NOW!!"
find ${DBT_LOG_DIR} -mindepth 1 -not -name ".gitkeep" -delete
else
echo "${DBT_LOG_DIR} is empty. Continue"
fi
}

empty_log_dir
16 changes: 16 additions & 0 deletions examples/dbt-spark-example/compose.teardown.d/01_show_dbt_log.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

function show_logs() {
echo "======================="
echo "== Show dbt run logs =="
echo "======================="
local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} )
DBT_LOG_DIR="${SCRIPT_DIR}/../dbt/logs"

if [ "$(ls -A ${DBT_LOG_DIR}/dbt.log)" ]; then
echo "${DBT_LOG_DIR}/dbt.log exists. Showing log!!"
sudo cat ${DBT_LOG_DIR}/dbt.log
fi
}

show_logs
9 changes: 5 additions & 4 deletions examples/dbt-spark-example/dag.py
@@ -4,7 +4,7 @@

from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow_dbt.operators.dbt_operator import DbtRunOperator, DbtTestOperator
from airflow_dbt_python.operators.dbt import DbtRunOperator, DbtTestOperator

default_args = {
'owner': 'whirl',
@@ -34,7 +34,8 @@
'spark.hadoop.fs.s3a.connection.ssl.enabled': 'false',
'spark.hadoop.fs.s3a.path.style.access': 'true',
'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
'spark.hadoop.fs.s3a.multipart.size': '104857600'
'spark.hadoop.fs.s3a.multipart.size': '104857600',
'spark.hadoop.fs.s3a.aws.credentials.provider': 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider'
}

dag = DAG(dag_id='whirl-dbt-spark-example',
@@ -97,7 +98,7 @@

dbt_run = DbtRunOperator(
task_id='dbt_run',
dir=DBT_DIRECTORY,
project_dir=DBT_DIRECTORY,
profiles_dir=DBT_DIRECTORY,
target='hive',
dag=dag
@@ -106,7 +107,7 @@

dbt_test = DbtTestOperator(
task_id='dbt_test',
dir=DBT_DIRECTORY,
project_dir=DBT_DIRECTORY,
profiles_dir=DBT_DIRECTORY,
target='hive',
dag=dag
2 changes: 1 addition & 1 deletion examples/dbt-spark-example/dbt/.gitignore
@@ -1,4 +1,4 @@
.idea
dbt_modules
logs
target
.user.yml
4 changes: 2 additions & 2 deletions examples/dbt-spark-example/dbt/dbt_project.yml
@@ -13,10 +13,10 @@ profile: 'flights'
# These configurations specify where dbt should look for different types of files.
# The `source-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
source-paths: ["models"]
model-paths: ["models"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
data-paths: ["data"]
seed-paths: ["data"]
macro-paths: ["macros"]

target-path: "target" # directory which will store compiled SQL files
2 changes: 0 additions & 2 deletions examples/dbt-spark-example/dbt/profiles.yml
@@ -5,8 +5,6 @@ flights:
method: thrift
host: sparkthrift
port: 10000
user: hive
password: hive
schema: default
connect_retries: 5
connect_timeout: 60
12 changes: 11 additions & 1 deletion examples/dbt-spark-example/whirl.setup.d/02_add_dbt.sh
@@ -1,3 +1,13 @@
#!/usr/bin/env bash
sudo apt-get install -y libsasl2-dev build-essential
pip install agate==1.6.1 airflow-dbt dbt-spark[PyHive]
pip install dbt-core==1.4.5 dbt-spark[PyHive] airflow-dbt-python

#airflow-dbt-python depends on the fs_default connection
echo "====================================="
echo "== Configure FS Default connection =="
echo "====================================="
airflow connections add fs_default \
--conn-type fs \
--conn-extra "{\"path\": \"/\"}"

sudo chmod -R ugo+rw /opt/airflow/dags/dbt-spark-example/dbt