upgrade airflow and dependencies
Pierlou committed Sep 3, 2024
1 parent cea71c9 commit 4f8ddfd
Showing 4 changed files with 87 additions and 75 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -6,4 +6,5 @@ pg-airflow/
ssh/
.env
variables.py
-.DS_Store
+.DS_Store
+Dockerfile
13 changes: 8 additions & 5 deletions Dockerfile
@@ -1,4 +1,5 @@
-FROM apache/airflow:2.7.3-python3.10
+FROM apache/airflow:2.10.0-python3.10

USER root

@@ -14,15 +15,17 @@ RUN pip install --upgrade pip

USER root

-# MySQL key rotation (https://dev.mysql.com/doc/refman/8.0/en/checking-gpg-signature.html)
-RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A8D3785C
+# RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 467B942D3A79BD29

RUN apt-get update -y
RUN apt-get install git -y
RUN apt-get install lftp -y
RUN apt-get install zip -y
RUN apt-get install wget -y
RUN apt-get install p7zip-full -y
RUN apt-get install nano -y
+RUN apt-get install jq -y
+RUN apt-get install libmagic1 -y

RUN chown -R "airflow:root" /opt/airflow/

@@ -41,6 +44,6 @@ ADD requirements.txt /requirements.txt

RUN pip install -r /requirements.txt

-RUN git config --global user.email "geoffrey.aldebert@data.gouv.fr"
-RUN git config --global user.name "Geoffrey Aldebert (Bot Airflow)"
+RUN git config --global user.email "pierlou.ramade@data.gouv.fr"
+RUN git config --global user.name "Pierlou Ramade (Bot Airflow)"

120 changes: 62 additions & 58 deletions airflow.cfg
@@ -23,55 +23,12 @@ default_timezone = utc
# full import path to the class when using a custom executor.
executor = SequentialExecutor

-# The SqlAlchemy connection string to the metadata database.
-# SqlAlchemy supports many different database engines.
-# More information here:
-# http://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#database-uri
-sql_alchemy_conn = sqlite:////opt/airflow/airflow.db

-# The encoding for the databases
-sql_engine_encoding = utf-8

-# Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding.
-# This is particularly useful in case of mysql with utf8mb4 encoding because
-# primary keys for XCom table has too big size and ``sql_engine_collation_for_ids`` should
-# be set to ``utf8mb3_general_ci``.
-# sql_engine_collation_for_ids =

-# If SqlAlchemy should pool database connections.
-sql_alchemy_pool_enabled = True

-# The SqlAlchemy pool size is the maximum number of database connections
-# in the pool. 0 indicates no limit.
-sql_alchemy_pool_size = 5

-# The maximum overflow size of the pool.
-# When the number of checked-out connections reaches the size set in pool_size,
-# additional connections will be returned up to this limit.
-# When those additional connections are returned to the pool, they are disconnected and discarded.
-# It follows then that the total number of simultaneous connections the pool will allow
-# is pool_size + max_overflow,
-# and the total number of "sleeping" connections the pool will allow is pool_size.
-# max_overflow can be set to ``-1`` to indicate no overflow limit;
-# no limit will be placed on the total number of concurrent connections. Defaults to ``10``.
-sql_alchemy_max_overflow = 10

-# The SqlAlchemy pool recycle is the number of seconds a connection
-# can be idle in the pool before it is invalidated. This config does
-# not apply to sqlite. If the number of DB connections is ever exceeded,
-# a lower config value will allow the system to recover faster.
-sql_alchemy_pool_recycle = 1800

-# Check connection at the start of each connection pool checkout.
-# Typically, this is a simple statement like "SELECT 1".
-# More information here:
-# https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic
-sql_alchemy_pool_pre_ping = True

-# The schema to use for the metadata database.
-# SqlAlchemy supports databases with the concept of multiple schemas.
-sql_alchemy_schema =

# Import path for connect args in SqlAlchemy. Defaults to an empty dict.
# This is useful when you want to configure db engine args that SqlAlchemy won't parse
# in connection string.
@@ -87,7 +44,7 @@ parallelism = 32
# the number of tasks that is running concurrently for a DAG, add up the number of running
# tasks for all DAG runs of the DAG. This is configurable at the DAG level with ``concurrency``,
# which is defaulted as ``dag_concurrency``.
-dag_concurrency = 16
+max_active_tasks_per_dag = 16

# Are DAGs paused by default at creation
dags_are_paused_at_creation = True
@@ -126,7 +83,7 @@ fernet_key =
donot_pickle = True

# How long before timing out a python file import
-dagbag_import_timeout = 30.0
+dagbag_import_timeout = 200.0

# Should a traceback be shown in the UI for dagbag import errors,
# instead of just the exception message
@@ -210,15 +167,10 @@ lazy_load_plugins = True
# loaded from module.
lazy_discover_providers = True

-# Number of times the code should be retried in case of DB Operational Errors.
-# Not all transactions will be retried as it can cause undesired state.
-# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``.
-max_db_retries = 3

# Hide sensitive Variables or Connection extra json keys from UI and task logs when set to True
#
# (Connection passwords are always hidden in logs)
-hide_sensitive_var_conn_fields = True
+hide_sensitive_var_conn_fields = False

# A comma-separated list of extra sensitive keywords to look for in variables names or connection's
# extra JSON.
@@ -382,7 +334,7 @@ enable_experimental_api = False
# Deny all :
# auth_backend = airflow.api.auth.backend.deny_all
# Basic Auth :
-auth_backend = airflow.api.auth.backend.basic_auth
+auth_backends = airflow.api.auth.backend.basic_auth


# Used to set the maximum page limit for API requests
@@ -529,7 +481,7 @@ expose_hostname = True
expose_stacktrace = True

# Default DAG view. Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times``
-dag_default_view = tree
+dag_default_view = grid

# Default DAG orientation. Valid values are:
# ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top)
@@ -677,6 +629,11 @@ sentry_dsn =
# otherwise via ``CeleryExecutor``
kubernetes_queue = kubernetes

+[kubernetes_executor]

+# The Kubernetes namespace where airflow workers should be created. Defaults to ``default``
+namespace = default

[celery]

# This section only applies if you are using the CeleryExecutor in
@@ -802,6 +759,56 @@ worker_precheck = False
# Example: visibility_timeout = 21600
# visibility_timeout =

+[database]

+# The SqlAlchemy connection string to the metadata database.
+# SqlAlchemy supports many different database engines.
+# More information here:
+# http://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#database-uri
+sql_alchemy_conn = sqlite:////opt/airflow/airflow.db

+# If SqlAlchemy should pool database connections.
+sql_alchemy_pool_enabled = True

+# The SqlAlchemy pool size is the maximum number of database connections
+# in the pool. 0 indicates no limit.
+sql_alchemy_pool_size = 5

+# The maximum overflow size of the pool.
+# When the number of checked-out connections reaches the size set in pool_size,
+# additional connections will be returned up to this limit.
+# When those additional connections are returned to the pool, they are disconnected and discarded.
+# It follows then that the total number of simultaneous connections the pool will allow
+# is pool_size + max_overflow,
+# and the total number of "sleeping" connections the pool will allow is pool_size.
+# max_overflow can be set to ``-1`` to indicate no overflow limit;
+# no limit will be placed on the total number of concurrent connections. Defaults to ``10``.
+sql_alchemy_max_overflow = 10

+# The SqlAlchemy pool recycle is the number of seconds a connection
+# can be idle in the pool before it is invalidated. This config does
+# not apply to sqlite. If the number of DB connections is ever exceeded,
+# a lower config value will allow the system to recover faster.
+sql_alchemy_pool_recycle = 1800

+# Check connection at the start of each connection pool checkout.
+# Typically, this is a simple statement like "SELECT 1".
+# More information here:
+# https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic
+sql_alchemy_pool_pre_ping = True

+# The encoding for the databases
+sql_engine_encoding = utf-8

+# The schema to use for the metadata database.
+# SqlAlchemy supports databases with the concept of multiple schemas.
+sql_alchemy_schema =

+# Number of times the code should be retried in case of DB Operational Errors.
+# Not all transactions will be retried as it can cause undesired state.
+# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``.
+max_db_retries = 3

[dask]

# This section only applies if you are using the DaskExecutor in
@@ -834,7 +841,7 @@ scheduler_heartbeat_sec = 5
num_runs = -1

# The number of seconds to wait between consecutive DAG file processing
-processor_poll_interval = 1
+scheduler_idle_sleep_time = 1

# Number of seconds after which a DAG file is parsed. The DAG file is parsed every
# ``min_file_process_interval`` number of seconds. Updates to DAGs are reflected after
@@ -942,7 +949,7 @@ api_rev = v3
host =

# Format of the log_id, which is used to query for a given tasks logs
-log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number}
+log_id_template = {dag_id}-{task_id}-{run_id}-{map_index}-{try_number}

# Used to mark the end of a log stream for a task
end_of_log_mark = end_of_log
@@ -981,9 +988,6 @@ worker_container_repository =
# The tag of the Kubernetes Image for the Worker to Run
worker_container_tag =

-# The Kubernetes namespace where airflow workers should be created. Defaults to ``default``
-namespace = default

# If True, all worker pods will be deleted upon termination
delete_worker_pods = True

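
The [database] section is where Airflow 2.3+ expects the metadata-DB settings that previously lived in [core], which is why the sql_alchemy_* block moves in this diff. Deployments that override the connection string via environment variable should also move from AIRFLOW__CORE__SQL_ALCHEMY_CONN to AIRFLOW__DATABASE__SQL_ALCHEMY_CONN; the legacy name currently still works through a deprecation shim, but with a warning. A minimal sketch for checking the effective value from inside the container:

from airflow.configuration import conf

# The canonical lookup is now the [database] section; Airflow maps the
# legacy [core] key onto it for backward compatibility.
print(conf.get("database", "sql_alchemy_conn"))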
26 changes: 15 additions & 11 deletions requirements.txt
@@ -1,29 +1,33 @@
GitPython
-table-schema-to-markdown==0.4.6
+table-schema-to-markdown==0.4.12
frictionless==4.25.1
jsonschema
-Unidecode==1.3.2
+Unidecode==1.3.6
PyYAML
geojson
shapely
-minio==7.1.3
-boto3==1.28.0
+minio==7.2.8
+boto3==1.35.0
emails==0.6
-pandas==1.5.3
-papermill==2.3.4
-plotly==5.6.0
+pandas==2.2.2
+papermill==2.6.0
+plotly==5.24.0
plotly_express==0.4.1
kaleido==0.2.1
ipykernel==5.5.6
nbconvert==6.5.1
-openpyxl==3.0.9
+openpyxl==3.1.5
elasticsearch==7.17.0
elasticsearch_dsl==7.4.0
-requests==2.32.0
python-dotenv==0.21.0
-swifter==1.1.3
tweepy==4.8.0
pytest==7.2.1
langdetect==1.0.9
pydantic==2.4.0
-pyproj==3.6.1
+pyproj==3.6.1
+requests==2.32.3
+swifter==1.4.0
+rdflib==6.3.2
+feedgen==1.0.0
+duckdb==0.10.2
+python-magic==0.4.27
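
Several of these bumps cross major versions; pandas 1.5 to 2.2 in particular removes APIs deprecated in 1.x (e.g. DataFrame.append), so DAGs that manipulate DataFrames deserve a smoke test against the new pins. A minimal sketch for confirming the resolved versions inside the rebuilt image; the package list is just a sample:

from importlib.metadata import version

# Print a few of the bumped pins to confirm the image was rebuilt
# against the new requirements.txt.
for pkg in ("pandas", "minio", "boto3", "duckdb", "python-magic"):
    print(f"{pkg}=={version(pkg)}")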
