diff --git a/.dockerignore b/.dockerignore index 8953f43..9c99e23 100644 --- a/.dockerignore +++ b/.dockerignore @@ -3,6 +3,5 @@ tests/data/packages tests/data/processing tests/data/tmp tests/data/global -tests/data/gridfinder tests/data/countries_all.geojson .git \ No newline at end of file diff --git a/.github/workflows/build_dev.yaml b/.github/workflows/build_dev.yaml index c7f357a..d7d228a 100644 --- a/.github/workflows/build_dev.yaml +++ b/.github/workflows/build_dev.yaml @@ -6,14 +6,15 @@ on: push: branches: - develop + paths: ['api/**', 'dataproc/**', 'tests/**', 'config.py', 'Dockerfile', 'requirements.txt', '.github/**'] env: - VERSION: 0.2.4 REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} + TEST_IMAGE_TAG: test jobs: - build-and-push-image: + build-test-push-dev-image: runs-on: ubuntu-latest permissions: contents: read @@ -23,6 +24,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 + - name: Set variables + run: | + VER=$(cat ./VERSION) + echo "VERSION=$VER" >> $GITHUB_ENV + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -33,7 +39,19 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Build and push Docker image + - name: Build Test Docker image + uses: docker/build-push-action@v3 + with: + context: . + load: true + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.VERSION }}-dev-${{ env.TEST_IMAGE_TAG }} + + - name: Run Test Suite + env: + AUTOPKG_VERSION: ${{ env.VERSION }}-dev-${{ env.TEST_IMAGE_TAG }} + run: docker-compose -f .github/workflows/test/docker-compose-ci-test.yaml run test || docker-compose -f .github/workflows/test/docker-compose-ci-test.yaml logs dataproc + + - name: Build and Push Docker image uses: docker/build-push-action@v3 with: context: . @@ -41,3 +59,4 @@ jobs: cache-to: type=gha,mode=max push: true tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.VERSION }}-dev + diff --git a/.github/workflows/build_prod_and_release.yaml b/.github/workflows/build_prod_and_release.yaml index 0bc39e4..bb2e076 100644 --- a/.github/workflows/build_prod_and_release.yaml +++ b/.github/workflows/build_prod_and_release.yaml @@ -6,31 +6,16 @@ on: push: branches: - master + paths: ['api/**', 'dataproc/**', 'tests/**', 'config.py', 'Dockerfile', 'requirements.txt', '.github/**'] env: - VERSION: 0.2.4 REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} + TEST_IMAGE_TAG: test jobs: - release: - name: Create Release - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v2 - - name: Create Release - id: create_release - uses: actions/create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: v${{ env.VERSION }} - release_name: Release v${{ env.VERSION }} - draft: false - prerelease: false - build-and-push-image: + build-and-push-image-release: runs-on: ubuntu-latest permissions: contents: read @@ -40,6 +25,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 + - name: Set variables + run: | + VER=$(cat VERSION) + echo "VERSION=$VER" >> $GITHUB_ENV + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -50,6 +40,18 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Build Test Docker image + uses: docker/build-push-action@v3 + with: + context: . 
+ load: true + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.VERSION }}-${{ env.TEST_IMAGE_TAG }} + + - name: Run Test Suite + env: + AUTOPKG_VERSION: ${{ env.VERSION }}-${{ env.TEST_IMAGE_TAG }} + run: docker-compose -f .github/workflows/test/docker-compose-ci-test.yaml run test || docker-compose -f .github/workflows/test/docker-compose-ci-test.yaml logs dataproc + - name: Build and push Docker image uses: docker/build-push-action@v3 with: @@ -58,3 +60,14 @@ jobs: cache-to: type=gha,mode=max push: true tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.VERSION }} + + - name: Create Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: v${{ env.VERSION }} + release_name: Release v${{ env.VERSION }} + draft: false + prerelease: false diff --git a/.github/workflows/test/.citest.env b/.github/workflows/test/.citest.env new file mode 100644 index 0000000..9b32779 --- /dev/null +++ b/.github/workflows/test/.citest.env @@ -0,0 +1,36 @@ +# DB Vars +POSTGRES_USER=postgres +POSTGRES_DB=ccgautopkg +POSTGRES_HOST=db +POSTGRES_PASSWORD=postgres + +# Autopkg Vars +PYTHONPATH="${PYTHONPATH}:/usr/src/app" +AUTOPKG_LOG_LEVEL=DEBUG +AUTOPKG_DEPLOYMENT_ENV=test +AUTOPKG_INTEGRATION_TEST_ENDPOINT="http://api:8000" +AUTOPKG_INCLUDE_TEST_PROCESSORS=True + +# Localfs Backend +AUTOPKG_STORAGE_BACKEND=localfs +AUTOPKG_LOCALFS_STORAGE_BACKEND_ROOT_TEST=/usr/src/app/tests/data/packages +AUTOPKG_LOCALFS_PROCESSING_BACKEND_ROOT_TEST=/usr/src/app/tests/data/processing +AUTOPKG_POSTGRES_USER=postgres +AUTOPKG_POSTGRES_PASSWORD=postgres +AUTOPKG_POSTGRES_HOST=db +AUTOPKG_POSTGRES_PORT=5432 +AUTOPKG_POSTGRES_DB=ccgautopkg + +AUTOPKG_CELERY_BROKER=redis://redis:6379 +AUTOPKG_CELERY_BACKEND=redis://redis:6379 +AUTOPKG_CELERY_CONCURRENCY=1 +AUTOPKG_TASK_LOCK_TIMEOUT=600 +AUTOPKG_TASK_EXPIRY_SECS=3600 +AUTOPKG_REDIS_HOST=redis +AUTOPKG_PACKAGES_HOST_URL=http://localhost/packages + +# Raster Cropping Memory Allocation +GDAL_CACHEMAX=64 + +# Test Flags +AUTOPKG_TEST_GRI_OSM="False" \ No newline at end of file diff --git a/.github/workflows/test/docker-compose-ci-test.yaml b/.github/workflows/test/docker-compose-ci-test.yaml new file mode 100644 index 0000000..a848c38 --- /dev/null +++ b/.github/workflows/test/docker-compose-ci-test.yaml @@ -0,0 +1,56 @@ +version: '3' +services: + db: + image: postgis/postgis:13-3.1-alpine + ports: + - 5432:5432 + env_file: + - .citest.env + + redis: + image: redis:6.2-alpine + command: redis-server --save 20 1 --loglevel debug + + dataproc: + image: ghcr.io/nismod/irv-autopkg:${AUTOPKG_VERSION} + restart: always + depends_on: + - redis + env_file: + - .citest.env + volumes: + - packages:/usr/src/app/tests/data/packages + - processing:/usr/src/app/tests/data/processing + command: celery --app dataproc.tasks worker + + api: + image: ghcr.io/nismod/irv-autopkg:${AUTOPKG_VERSION} + restart: always + depends_on: + - db + - redis + env_file: + - .citest.env + volumes: + - ./wait-for-it.sh:/opt/wait-for-it.sh + - packages:/usr/src/app/tests/data/packages + - processing:/usr/src/app/tests/data/processing + command: /opt/wait-for-it.sh db:5432 --timeout=20 -- uvicorn api.main:app --host 0.0.0.0 --port 8000 + + test: + image: ghcr.io/nismod/irv-autopkg:${AUTOPKG_VERSION} + depends_on: + - api + - dataproc + env_file: + - .citest.env + volumes: + - ./wait-for-it.sh:/opt/wait-for-it.sh + - ./run_tests.sh:/opt/run_tests.sh + - packages:/usr/src/app/tests/data/packages + - 
processing:/usr/src/app/tests/data/processing + command: /opt/wait-for-it.sh api:8000 --timeout=20 -- /opt/wait-for-it.sh redis:6379 --timeout=20 -- sh /opt/run_tests.sh + +volumes: + packages: + processing: \ No newline at end of file diff --git a/.github/workflows/test/run_tests.sh b/.github/workflows/test/run_tests.sh new file mode 100644 index 0000000..ade5dec --- /dev/null +++ b/.github/workflows/test/run_tests.sh @@ -0,0 +1,7 @@ +#!/bin/sh +pip install pytest && \ +echo Executing API Tests... && \ +ls -lrt /usr/src/app/tests/data && \ +pytest -p no:warnings -v /usr/src/app/tests/api && \ +echo Executing Processor Tests... && \ +pytest -p no:warnings -v /usr/src/app/tests/dataproc diff --git a/.github/workflows/test/test_results/.gitkeep b/.github/workflows/test/test_results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/.github/workflows/test/wait-for-it.sh b/.github/workflows/test/wait-for-it.sh new file mode 100755 index 0000000..3974640 --- /dev/null +++ b/.github/workflows/test/wait-for-it.sh @@ -0,0 +1,182 @@ +#!/usr/bin/env bash +# Use this script to test if a given TCP host/port are available + +WAITFORIT_cmdname=${0##*/} + +echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi } + +usage() +{ + cat << USAGE >&2 +Usage: + $WAITFORIT_cmdname host:port [-s] [-t timeout] [-- command args] + -h HOST | --host=HOST Host or IP under test + -p PORT | --port=PORT TCP port under test + Alternatively, you specify the host and port as host:port + -s | --strict Only execute subcommand if the test succeeds + -q | --quiet Don't output any status messages + -t TIMEOUT | --timeout=TIMEOUT + Timeout in seconds, zero for no timeout + -- COMMAND ARGS Execute command with args after the test finishes +USAGE + exit 1 +} + +wait_for() +{ + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + echoerr "$WAITFORIT_cmdname: waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + else + echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout" + fi + WAITFORIT_start_ts=$(date +%s) + while : + do + if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then + nc -z $WAITFORIT_HOST $WAITFORIT_PORT + WAITFORIT_result=$? + else + (echo -n > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1 + WAITFORIT_result=$? + fi + if [[ $WAITFORIT_result -eq 0 ]]; then + WAITFORIT_end_ts=$(date +%s) + echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds" + break + fi + sleep 1 + done + return $WAITFORIT_result +} + +wait_for_wrapper() +{ + # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 + if [[ $WAITFORIT_QUIET -eq 1 ]]; then + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --quiet --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + else + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + fi + WAITFORIT_PID=$! + trap "kill -INT -$WAITFORIT_PID" INT + wait $WAITFORIT_PID + WAITFORIT_RESULT=$? 
+ if [[ $WAITFORIT_RESULT -ne 0 ]]; then + echoerr "$WAITFORIT_cmdname: timeout occurred after waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + fi + return $WAITFORIT_RESULT +} + +# process arguments +while [[ $# -gt 0 ]] +do + case "$1" in + *:* ) + WAITFORIT_hostport=(${1//:/ }) + WAITFORIT_HOST=${WAITFORIT_hostport[0]} + WAITFORIT_PORT=${WAITFORIT_hostport[1]} + shift 1 + ;; + --child) + WAITFORIT_CHILD=1 + shift 1 + ;; + -q | --quiet) + WAITFORIT_QUIET=1 + shift 1 + ;; + -s | --strict) + WAITFORIT_STRICT=1 + shift 1 + ;; + -h) + WAITFORIT_HOST="$2" + if [[ $WAITFORIT_HOST == "" ]]; then break; fi + shift 2 + ;; + --host=*) + WAITFORIT_HOST="${1#*=}" + shift 1 + ;; + -p) + WAITFORIT_PORT="$2" + if [[ $WAITFORIT_PORT == "" ]]; then break; fi + shift 2 + ;; + --port=*) + WAITFORIT_PORT="${1#*=}" + shift 1 + ;; + -t) + WAITFORIT_TIMEOUT="$2" + if [[ $WAITFORIT_TIMEOUT == "" ]]; then break; fi + shift 2 + ;; + --timeout=*) + WAITFORIT_TIMEOUT="${1#*=}" + shift 1 + ;; + --) + shift + WAITFORIT_CLI=("$@") + break + ;; + --help) + usage + ;; + *) + echoerr "Unknown argument: $1" + usage + ;; + esac +done + +if [[ "$WAITFORIT_HOST" == "" || "$WAITFORIT_PORT" == "" ]]; then + echoerr "Error: you need to provide a host and port to test." + usage +fi + +WAITFORIT_TIMEOUT=${WAITFORIT_TIMEOUT:-15} +WAITFORIT_STRICT=${WAITFORIT_STRICT:-0} +WAITFORIT_CHILD=${WAITFORIT_CHILD:-0} +WAITFORIT_QUIET=${WAITFORIT_QUIET:-0} + +# Check to see if timeout is from busybox? +WAITFORIT_TIMEOUT_PATH=$(type -p timeout) +WAITFORIT_TIMEOUT_PATH=$(realpath $WAITFORIT_TIMEOUT_PATH 2>/dev/null || readlink -f $WAITFORIT_TIMEOUT_PATH) + +WAITFORIT_BUSYTIMEFLAG="" +if [[ $WAITFORIT_TIMEOUT_PATH =~ "busybox" ]]; then + WAITFORIT_ISBUSY=1 + # Check if busybox timeout uses -t flag + # (recent Alpine versions don't support -t anymore) + if timeout &>/dev/stdout | grep -q -e '-t '; then + WAITFORIT_BUSYTIMEFLAG="-t" + fi +else + WAITFORIT_ISBUSY=0 +fi + +if [[ $WAITFORIT_CHILD -gt 0 ]]; then + wait_for + WAITFORIT_RESULT=$? + exit $WAITFORIT_RESULT +else + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + wait_for_wrapper + WAITFORIT_RESULT=$? + else + wait_for + WAITFORIT_RESULT=$? + fi +fi + +if [[ $WAITFORIT_CLI != "" ]]; then + if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then + echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess" + exit $WAITFORIT_RESULT + fi + exec "${WAITFORIT_CLI[@]}" +else + exit $WAITFORIT_RESULT +fi \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index c9e4129..f26cb20 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,31 +1,35 @@ -FROM osgeo/gdal:alpine-small-3.6.2 - -# set work directory -WORKDIR /usr/src/app +FROM osgeo/gdal:ubuntu-small-3.6.2 # set environment variables ENV PYTHONDONTWRITEBYTECODE 1 ENV PYTHONUNBUFFERED 1 -# install dependencies -COPY ./requirements.txt . 
-RUN apk add --virtual .build-deps \ - --repository http://dl-cdn.alpinelinux.org/alpine/edge/community \ - --repository http://dl-cdn.alpinelinux.org/alpine/edge/main \ - gcc libc-dev geos-dev geos && \ - apk add --update --no-cache python3 python3-dev alpine-sdk && ln -sf python3 /usr/bin/python && \ - python3 -m ensurepip && \ - pip3 install --no-cache --upgrade pip setuptools && \ - pip3 install -r requirements.txt +# copy project +WORKDIR /usr/src/app + +# install dependencies and add user +RUN apt-get update && \ + apt-get install -y python3-pip && \ + rm -rf /var/lib/apt/lists/* && \ + addgroup -gid 1002 autopkg && adduser --system --disabled-login -uid 1002 --gid 1002 autopkg -ENV PYTHONPATH "${PYTHONPATH}:/usr/src/app/" +# Load Pip deps as Autopkg +COPY requirements.txt . +USER autopkg +RUN pip3 install --user --no-cache --upgrade --no-warn-script-location pip -r requirements.txt -# copy project -COPY . . +# Load App and alter user +USER root +COPY config.py . +COPY api ./api +COPY dataproc ./dataproc +COPY tests ./tests +RUN mkdir -p /usr/src/app/tests/data/processing && mkdir /usr/src/app/tests/data/packages && chown -R autopkg:autopkg /usr/src/app -# Setup the Executing User -RUN addgroup -g 1002 autopkg && adduser -SHD autopkg -u 1002 -G autopkg && \ - chown -R autopkg:autopkg /usr/src/app +USER autopkg +# Make sure scripts in .local are usable: +ENV PATH=/home/autopkg/.local/bin:$PATH +ENV PYTHONPATH "${PYTHONPATH}:/usr/src/app/" # Run unit tests -RUN python3 -m unittest discover /usr/src/app/tests/dataproc/unit +RUN python3 -m unittest /usr/src/app/tests/dataproc/unit/processors/test_env.py diff --git a/README.md b/README.md index 2eb68a6..c747603 100644 --- a/README.md +++ b/README.md @@ -1,47 +1,14 @@ # IRV AutoPackaging -FastAPI + Celery + Individual processors per Dataset for DAG ETL Pipeline +FastAPI + Celery for executing ETL (boundary clipping, re-formatting, move to S3/Local FS) against source datasaets hosted on https://global.infrastructureresilience.org (see also https://github.com/nismod/infra-risk-vis.) -Encompasses API and backend-processing to generate / manage datapackages associated with boundaries +Encompasses API and backend-processing to generate / manage frictionless-data datapackages associated with boundaries -## Data Processing - -Celery-based worker running DAG's generated through API request - -__TODO Docs__: - -* Celery -* Dag structure -* Processors (Internal) - Boundary and Provenance -* Processors (Core) - dev deployment and testing -* Secrets required for Processors -* Concurrency -* Duplicate Task submission and execution - -### Running Locally: - -```bash -celery --app dataproc.tasks worker --loglevel=debug --concurrency=1 -``` - -Running using Docker: - -```bash -docker-compose up dataproc -``` -### Testing - -Integration tests in `tests/dataproc/integration/processors` all run standalone (without Redis / Celery), but you'll need access to the source data for each processor (see above). +## Architecture -__NOTE__: Test for geopkg (test_natural_earth_vector) loading include a load from shapefile to postgres - the API database is used for this test and configured user requires insert and delete rights on the api database for the test to succeed. 
+ -```bash -# Run tests locally -python -m unittest discover tests/dataproc -# Run tests in Docker -docker-compose run test-dataproc -``` ## API @@ -62,6 +29,29 @@ The boundaries table schema is managed by Alembic can be found under `api/db/mod __NOTE__: API Integration tests require a Db user who has RW access to this table. __NOTE__: The configured API database will be wiped during running of the integration tests and loaded with test-boundaries. +#### PG Schema Management with Alembic + +The database schema is managed through Alembic. The following serves as a guide to basic usage for extending the schema - refer to https://alembic.sqlalchemy.org/en/latest/ for more information. + +##### Schema Updates + +* Make changes as required to models +* From within the autoppkg/api folder run the following to auto-generate an upgrade/downgrade script: + +```bash +alembic revision --autogenerate -m "Added Boundary Table" +``` + +__NOTE__: CHECK the script - remove extraneous operations (in particular those relating to spatial-ref-sys) + +* When ready run the following to upgrade the database: + +```bash +# Ensure the AUTOPKG_POSTGRES_* env variables are set (see below) +cd api +alembic upgrade head +``` + ### Running Locally: ```bash @@ -76,18 +66,172 @@ docker-compose up api ### Documentation +API Docs: https://global.infrastructureresilience.org/extract/redoc + +OpenAPI JSON: https://global.infrastructureresilience.org/extract/openapi.json + #### OpenAPI * Run the app as above -* Navigate to http://:/openapi.json +* Navigate to http://`host`:`port`/openapi.json #### ReDoc * Run the app as above -* Navigate to http://:/redoc +* Navigate to http://`host`:`port`/redoc + + + +## Data Processing + +### Running + +#### Locally + +Data Processor: + +```bash +celery --app dataproc.tasks worker +``` + +#### Docker Compose + +See: `docker-compose.yaml` + +```bash +docker-compose up dataproc +``` + +### Data Storage + +Terms: + +* `Package` - All Data associated with a single boundary +* `Processor` - Code for clipping a particular Dataset and Version +* `Processing Backend` - Processor execution environment. Currently only local filesystem processing backend is supported. (Processing interim files are executed against and stored-in the local execution env) +* `Storage Backend` - Package storage environment. Currently AWS S3 and LocalFS are supported. Package files are hosted from here, either using NGINX (see `docker-compose.yaml`) or S3. + +#### Package Structure: + + + +Processors will download and store source datafiles to a configured location on the local execution environment filesystem, on first-execution. (This means source files _could_ be downloaded multiple times if multiple Celery workers were deployed across seperate filesystems.) + +Processor will generate interim files in a configured location on the local filesystem during processing of a boundary. These files are subsequently moved to the configured storage backend and deleted from temporary storage on processor exit. + +### Processors + +Dataset Core Processors (`dataproc/processors/core`) are executed as Celery Tasks and are responsible for fetching, cropping and moving the dataset-version to-which they are associated. + +Supporting Internal Processors (`dataproc/processors/internal`) generate Boundary and folder-structures, as well as providing logging. + +Celery tasks are constructued from API request and executed against source data. 
A processing request can only be executed against a single boundary, but can include multiple processors to be executed. + +The overall task for each request is executed as a Chord, with a nested Group of tasks for each processor (which can run in parallel): + +```python +dag = step_setup | group(processor_task_signatures) | step_finalise +``` + +The `step_setup` and `step_finalise` tasks are defined in `dataproc.tasks` and are responsible for setting up the processing environment and cleaning up after the processing has completed. + +The `processor_task_signatures` are generated by the API and are responsible for executing the processing for each processor. + +Duplicate execution of tasks is prevented by using a Redis-lock for a combination of boundary-dataset-version key. (see `dataproc/tasks.py`). + + +### Configuration + +All config variables are parsed by `config.py` from the execution environment. + +```bash +# Celery +AUTOPKG_LOG_LEVEL=DEBUG # API and Dataproc Logging Level +AUTOPKG_INTEGRATION_TEST_ENDPOINT="http://localhost:8000" # API Endpoint used during integration testing (integration testing deployment env) +AUTOPKG_REDIS_HOST="localhost" # Redis Host (APOI and Worker) +AUTOPKG_CELERY_BROKER="redis://localhost" # Used for Worker only +AUTOPKG_CELERY_BACKEND="redis://localhost" # Used in API and Worker +AUTOPKG_CELERY_CONCURRENCY=2 # Celery worker concurrency - dataproc only +AUTOPKG_TASK_LOCK_TIMEOUT=600 # Secs - Duplicate task lock timeout (blocks duplicate processors from executing for this time) +AUTOPKG_TASK_EXPIRY_SECS=43200 # Secs before queued tasks expire on Celery - dataproc only +GDAL_CACHEMAX=1024 # Siz eof GDAL Cache (mb) for raster crop operations - see GDAL Docs + +# Postgres Boundaries +AUTOPKG_POSTGRES_USER= # Used for API Boundaries in Prod (and test natural_earth_vector processor in Worker) +AUTOPKG_POSTGRES_HOST= # Used for API Boundaries only (and test natural_earth_vector processor in Worker) +AUTOPKG_POSTGRES_PASSWORD= # Used for API Boundaries only (and test natural_earth_vector processor in Worker) +AUTOPKG_POSTGRES_PORT= # Used for API Boundaries only (and test natural_earth_vector processor in Worker) +AUTOPKG_POSTGRES_DB= # Used for API Boundaries only (and test natural_earth_vector processor in Worker) + +# Deployment Env +AUTOPKG_DEPLOYMENT_ENV="prod" # Change to test when running integration tests. +AUTOPKG_S3_REGION="eu-west-2" # S3 region +AUTOPKG_STORAGE_BACKEND="awss3" # Either "awss3" or "localfs" Storage backend to use for final packages (see additional backend-specific flags below for more info). 
Used in API and Worker + +# Testing Backend +AUTOPKG_LOCALFS_STORAGE_BACKEND_ROOT_TEST="./tests/data/packages" # Root for backend storage folder in testing +AUTOPKG_LOCALFS_PROCESSING_BACKEND_ROOT_TEST="./tests/data/processing" # Root for backend processing folder in testing +AUTOPKG_S3_TEST_ACCESS_KEY= # S3 Access key for testing Bucket +AUTOPKG_S3_TEST_SECRET_KEY= # S3 Secret for testing bucket +AUTOPKG_S3_TEST_BUCKET="irv-autopkg-dev" # S3 Bucket for Dev / Testing + +# Prod Backend +AUTOPKG_LOCALFS_STORAGE_BACKEND_ROOT="./data/packages" # Root for backend storage folder in Prod +AUTOPKG_LOCALFS_PROCESSING_BACKEND_ROOT="./data/processing" # Root for backend storage folder in Prod +AUTOPKG_S3_ACCESS_KEY= # S3 Access key for testing Bucket +AUTOPKG_S3_SECRET_KEY= # S3 Secret for testing bucket +AUTOPKG_S3_BUCKET="irv-autopkg" # S3 Bucket for Prod + +# Testing Flags +AUTOPKG_INCLUDE_TEST_PROCESSORS="True" # Include Test Processors from the available processors list +AUTOPKG_TEST_GRI_OSM="True" # Integration tests which require access to the GRIOSM Postgres instance will be run if this is set-True (1) + +AUTOPKG_PACKAGES_HOST_URL= # Root-URL to the hosting engine for package data. e.g. "https://global.infrastructureresilience.org/packages" (localfs) or "https://irv-autopkg.s3.eu-west-2.amazonaws.com" (awss3), or http://localhost (Local testing under NGINX) +``` + +#### Processor Specific Configurations + +Some processors require their-own environment configuration(e.g. secrets for source data) + +```bash +# AWS OSM / Damages DB +AUTOPKG_OSM_PGHOST= +AUTOPKG_OSM_PORT= +AUTOPKG_OSM_PGDATABASE= +AUTOPKG_OSM_PGUSER= +AUTOPKG_OSM_PGPASSWORD= +``` + +### Scaling + +The primary means of scaling to fit available resources are: + +```bash +GDAL_CACHEMAX=1024 # This flag limits the amount of memory GDAL uses when cropping rasters. Mainly effects jrc_built_c processing because the input rasters are very large +AUTOPKG_CELERY_CONCURRENCY=2 # The number of tasks that can be executed at once. Assume you'll get into the position of executing multiple very large crops / OSM cuts this number of times in parallel. Smaller tasks will be queued behind these larger blocking tasks. +``` + +Also when running under docker-compose you can change the container resource limits in `docker-compose.yaml` to uit your execution environment. + +__NOTE__: We have not yet extensively testsed running on a distributed-cluster (i.e. workers running on separate nodes). In Theory this is supported through Celery and the Redis backend, however the processor data-folder will need to be provided through some shared persistent storage to avoid pulliung source data multiple-times. ### Testing +#### DataProcessors + +Integration tests in `tests/dataproc/integration/processors` all run standalone (without Redis / Celery), but you'll need access to the source data for each processor (see above). + +__NOTE__: Test for geopkg (test_natural_earth_vector) loading include a load from shapefile to postgres - the API database is used for this test and configured user requires insert and delete rights on the api database for the test to succeed. + +```bash +# Run tests locally +python -m unittest discover tests/dataproc +# Run tests in Docker +docker-compose run test-dataproc +``` + +#### API & DataProcessing End2End + __NOTE__: API and Dataproc tests required access to shared processing and package folders for assertion of processor outputs. __NOTE__: API tests will add and remove boundary test-data to/from the Db during execution. 
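
> Editor's note — the container resource limits mentioned in the Scaling section above are standard Compose settings. As a rough, illustrative sketch only (service name and values are placeholders taken from this README's examples, not from the project's actual `docker-compose.yaml`; plain `docker-compose` may need `--compatibility` for `deploy` limits to take effect):

```yaml
# Illustrative only - adjust the service name and values to suit your environment.
services:
  dataproc:
    deploy:
      resources:
        limits:
          cpus: "2.0"   # cap CPU available to the Celery worker
          memory: 4G    # cap memory; keep GDAL_CACHEMAX comfortably below this

```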
@@ -107,7 +251,7 @@ Ensure you also have `AUTOPKG_LOCALFS_STORAGE_BACKEND_ROOT_TEST` set in the envi ```bash export AUTOPKG_DEPLOYMENT_ENV=test # Run API -uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload +uvicorn api.main:app --host 0.0.0.0 --port 8000 # Run Worker celery --app dataproc.tasks worker --loglevel=debug --concurrency=1 @@ -126,54 +270,95 @@ docker-compose up -d db redis api dataproc docker-compose run test-api ``` -### PG Schema Management with Alembic - -The database schema is managed through Alembic. The following serves as a guide to basic usage for extending the schema - refer to https://alembic.sqlalchemy.org/en/latest/ for more information. +#### Localfs or S3 Backend -#### Schema Updates +Altering deployment env with `AUTOPKG_STORAGE_BACKEND=awss3` or `AUTOPKG_STORAGE_BACKEND=localfs` will also mean tests run against the configured +backend. -* Make changes as required to models -* From within the autoppkg/api folder run the following to auto-generate an upgrade/downgrade script: +__NOTE__ awss3 integration tests require supplied access keys to have RW permissions on the configured bucket. ```bash -alembic revision --autogenerate -m "Added Boundary Table" +export AUTOPKG_STORAGE_BACKEND=awss3 && python -m unittest discover tests/dataproc ``` -__NOTE__: CHECK the script - remove extransous operations (in particular those relating to spatial-ref-sys) -* When reaady run the following to upgrade the database: - -```bash -# Ensure the AUTOPKG_POSTGRES_* env variables are set (see below) -cd api -alembic upgrade head +### Extending / New Processor Development + +* Create a new folder for your dataset beneath `dataproc/processors/core` (e.g. `dataproc/processors/core/my_dataset`) +* Add a new Python-file for the dataset version within the folder (and supporting __init__.py). (e.g. 
`dataproc/processors/core/my_dataset/version_1.py`) +* Add a Metadata Class containing the processor-version metadata (which must sub-class MetadataABC), e.g.: + +```python +class Metadata(BaseMetadataABC): + """Processor metadata""" + + name = processor_name_from_file( + inspect.stack()[1].filename + ) # this must follow snakecase formatting, without special chars + description = "A test processor for nightlights" # Longer processor description + version = version_name_from_file( + inspect.stack()[1].filename + ) # Version of the Processor + dataset_name = "nightlights" # The dataset this processor targets + data_author = "Nightlights Author" + data_title = "" + data_title_long = "" + data_summary = "" + data_citation = "" + data_license = DataPackageLicense( + name="CC-BY-4.0", + title="Creative Commons Attribution 4.0", + path="https://creativecommons.org/licenses/by/4.0/", + ) + data_origin_url = "http://url" + data_formats = ["GeoTIFF"] ``` -## Deployment Environment - -```bash -export AUTOPKG_POSTGRES_USER= # Used for API Boundaries in Prod (and test natural_earth_vector processor in Worker) -export AUTOPKG_POSTGRES_HOST= # Used for API Boundaries only (and test natural_earth_vector processor in Worker) -export AUTOPKG_POSTGRES_PASSWORD= # Used for API Boundaries only (and test natural_earth_vector processor in Worker) -export AUTOPKG_POSTGRES_PORT= # Used for API Boundaries only (and test natural_earth_vector processor in Worker) -export AUTOPKG_POSTGRES_DB= # Used for API Boundaries only (and test natural_earth_vector processor in Worker) -export AUTOPKG_CELERY_BROKER= # Used for Worker only -export AUTOPKG_CELERY_BACKEND= # Used in API and Worker -export AUTOPKG_CELERY_CONCURRENCY= # Celery worker concurrency - dataproc only -export AUTOPKG_TASK_LOCK_TIMEOUT=600 # Secs - Duplicate task lock timeout (blocks duplicate processors from executing for this time) -export AUTOPKG_TASK_EXPIRY_SECS=3600 # Secs before tasks expire on Celery - dataproc only -export AUTOPKG_STORAGE_BACKEND= # Storage backend to use for final packages (see additional backend-specific flags below for more info). Used in API and Worker -export AUTOPKG_LOCALFS_STORAGE_BACKEND_ROOT= # Path to root-directory for packages. Used in API and Worker -export AUTOPKG_LOCALFS_PROCESSING_BACKEND_ROOT= # Path to root-directory for local interim processing data. Used by Worker only -export AUTOPKG_LOCALFS_STORAGE_BACKEND_ROOT_TEST= # Path to root-directory for packages when running integration tests. Used in API and Worker -export AUTOPKG_LOCALFS_PROCESSING_BACKEND_ROOT_TEST= # Path to root-directory for local interim processing data when running integration tests. Used by Worker only -export PACKAGES_HOST_URL= # URL to the hosting engine for package data, e.g. http://localhost -``` - -### Local FileSystem Storage Backend +* Add a Processor Class (which must sub-class BaseProcessorABC so it can be run by the global Celery Task), which runs the fetching, cropping and moving logic for your dataset-version. (__NOTE__: Helper methods are already provided for the majority of tasks - e.g. 
Storage backend classes are provided for LocalFS and AWSS3), e.g.: +```python +class Processor(BaseProcessorABC): + """A Test Processor""" + + def generate(self): + """Generate files for a given processor""" + self.update_progress(30,"waiting") + output_folder = self.paths_helper.build_absolute_path( + "test_processor", self.metadata.version, "outputs" + ) + output_fpath = os.path.join(output_folder, f"{self.boundary['name']}_test.tif") + if self.exists() is True: + raise ProcessorDatasetExists() + else: + # Generate a blank tests dataset + create_test_file(output_fpath) + result_uri = self.storage_backend.put_processor_data( + output_fpath, + self.boundary["name"], + self.metadata.name, + self.metadata.version, + ) + self.provenance_log[f"{self.metadata.name} - move to storage success"] = True + self.provenance_log[f"{self.metadata.name} - result URI"] = result_uri + # Generate the datapackage and add it to the output log + datapkg = datapackage_resource( + self.metadata, + [result_uri], + "GEOPKG", + [os.path.getsize(output_fpath)], + [data_file_hash(output_fpath)], + ) + self.provenance_log["datapackage"] = datapkg.asdict() + return self.provenance_log + + def exists(self): + """Whether all files for a given processor exist on the FS on not""" + return self.storage_backend.processor_file_exists( + self.boundary["name"], + self.metadata.name, + self.metadata.version, + f"{self.boundary['name']}_test.tif", + ) +``` -```bash -export AUTOPKG_STORAGE_BACKEND=localfs -export AUTOPKG_LOCALFS_STORAGE_BACKEND_ROOT= # Base-folder for packages -export AUTOPKG_LOCALFS_STORAGE_BACKEND_ROOT_TEST= # Base path for integration-test package data -``` \ No newline at end of file +* Write tests against the new Processor (see: `tests/dataproc/integration` for examples) +* Rebuild image and deploy: The API will expose any valid processor-folder placed under the `dataproc/core` folder. 
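
> Editor's note — for orientation, the on-disk layout implied by the processor-development steps above, for a hypothetical `my_dataset` processor, would be roughly as follows (names are illustrative):

```
dataproc/processors/core/
└── my_dataset/            # one folder per dataset
    ├── __init__.py
    └── version_1.py       # Metadata + Processor classes for this dataset version
```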
diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..967b33f --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.2.7 \ No newline at end of file diff --git a/api/db/controller.py b/api/db/controller.py index bcb0c33..c5f1554 100644 --- a/api/db/controller.py +++ b/api/db/controller.py @@ -6,7 +6,7 @@ from typing import List from fastapi.logger import logger -from config import LOG_LEVEL, NAME_SEARCH_DISTANCE +from config import LOG_LEVEL from api.db import database from api import schemas from api.db.queries import Queries @@ -58,5 +58,5 @@ async def search_boundaries_by_name(self, name: str) -> List[schemas.Boundary]: """ Get summary information about boundaries with a name similar to the given """ - boundaries = await Queries(database).search_boundaries_by_name(name, NAME_SEARCH_DISTANCE) + boundaries = await Queries(database).search_boundaries_by_name(name) return [schemas.BoundarySummary.from_orm(boundary) for boundary in boundaries] diff --git a/api/db/queries.py b/api/db/queries.py index be3e39e..0ef156f 100644 --- a/api/db/queries.py +++ b/api/db/queries.py @@ -48,13 +48,11 @@ async def get_boundary_by_name(self, name: str) -> models.Boundary: Get detailed information about a specific boundary This includes a GeoJSON repr of the geometry under field ST_AsGeoJSON """ - stmt = ( - select( - models.Boundary, - func.ST_AsGeoJSON(models.Boundary.geometry), - func.ST_AsGeoJSON(func.ST_Envelope(models.Boundary.geometry))) - .where(models.Boundary.name == name) - ) + stmt = select( + models.Boundary, + func.ST_AsGeoJSON(models.Boundary.geometry), + func.ST_AsGeoJSON(func.ST_Envelope(models.Boundary.geometry)), + ).where(models.Boundary.name == name) res = await self.database.fetch_one(stmt) if not res: raise BoundaryNotFoundException() @@ -64,20 +62,24 @@ async def get_all_boundary_summaries(self) -> List[models.Boundary]: """ Get summary information about all available boundaries """ - stmt = select(models.Boundary.id, models.Boundary.name, models.Boundary.name_long) + stmt = select( + models.Boundary.id, models.Boundary.name, models.Boundary.name_long + ).order_by(models.Boundary.name_long) res = await self.database.fetch_all(stmt) if not res: return [] return res - async def search_boundaries_by_name( - self, name: str, distance: int - ) -> List[models.Boundary]: + async def search_boundaries_by_name(self, name: str) -> List[models.Boundary]: """ Search for boundaries by fuzzy matching matching name """ - stmt = select(models.Boundary).where( - func.difference(name, models.Boundary.name) > distance + stmt = ( + select(models.Boundary) + .where( + func.like(func.lower(models.Boundary.name_long), f"%{name.lower()}%") + ) + .order_by(models.Boundary.name_long) ) res = await self.database.fetch_all(stmt) if not res: diff --git a/api/helpers.py b/api/helpers.py index f3ffd8a..323d550 100644 --- a/api/helpers.py +++ b/api/helpers.py @@ -2,6 +2,7 @@ API Helpers """ +from enum import Enum import traceback from typing import Any, List @@ -14,13 +15,15 @@ from dataproc.tasks import boundary_setup, generate_provenance from dataproc.helpers import ( get_processor_meta_by_name, + list_processors, + build_processor_name_version ) from dataproc.exceptions import InvalidProcessorException from api.exceptions import ( CannotGetCeleryTasksInfoException, ) -from config import CELERY_APP, PACKAGES_HOST_URL +from config import CELERY_APP, INCLUDE_TEST_PROCESSORS # API @@ -60,7 +63,6 @@ def handle_exception(logger, err: Exception): # DAGs and Processing - def get_processor_task(name: str) -> Any: 
"""Get task related to a processor task by its name""" return getattr(tasks, name) @@ -95,7 +97,7 @@ def random_task_uuid(): def processor_meta( processor_name_version: str, executing: bool = False -) -> schemas.ProcessorVersion: +) -> schemas.ProcessorVersionMetadata: """ Generate ProcessorVersion (with nested metadata) for a given processor version """ @@ -106,26 +108,32 @@ def processor_meta( raise InvalidProcessorException() if meta_cls is not None: meta = meta_cls() - return schemas.ProcessorVersion( - processor=schemas.ProcessorMetadata( - name=processor_name_version, - description=meta.description, - dataset=meta.dataset_name, - author=meta.data_author, - license=meta.data_license.asdict(), - origin_url=meta.data_origin_url, - version=meta.version, - status="executing" if executing is True else "complete", - ), - version=meta.version + return schemas.ProcessorVersionMetadata( + name=processor_name_version, + description=meta.description, + version=meta.version, + status="executing" if executing is True else "complete", + data_author=meta.data_author, + data_title=meta.data_title, + data_title_long=meta.data_title_long, + data_summary=meta.data_summary, + data_citation=meta.data_citation, + data_license=meta.data_license.asdict(), + data_origin_url=meta.data_origin_url, + data_formats=meta.data_formats ) # Celery Queue Interactions -def extract_group_state_info(group_result: GroupResult) -> schemas.JobGroupStatus: +def extract_group_state_info(group_result: GroupResult, missing_proc_name_msg: str = "processor details not available") -> schemas.JobGroupStatus: """ Generate job status info from a GroupStatus object + Internally we ensure the Chord (DAG) succeeds so we can generate provenance at the end of each run. + As a result DAG internal Processor tasks that fail are reported as SUCCESS in Celery because the errors are caught / handled, + so we can use the sink (log) in the next DAG (Chord) stage. + NOTE: The API will report tasks as FAILED or SKIPPED depending on the contents of the job result. 
+ state=PENDING - info = None state=EXECUTING - info = {'progress': int, 'current_task': str} Comes from the processor updating its states @@ -144,19 +152,50 @@ def extract_group_state_info(group_result: GroupResult) -> schemas.JobGroupStatu continue if not isinstance(task_meta, dict): continue + # check if task_meta contains tips about failure or skip + if "failed" in task_meta.keys(): + _state = "FAILURE" + elif "skipped" in task_meta.keys(): + _state = "SKIPPED" + else: + _state = result.state # While its progressing we report the progress, otherwise we show the result + processors.append( + schemas.JobStatus( + processor_name=proc_name, + job_id=result.id, + job_status=_state, + job_progress=schemas.JobProgress( + percent_complete=task_meta['progress'] if isinstance(task_meta, dict) and 'progress' in task_meta.keys() else 0, + current_task=task_meta['current_task'] if isinstance(task_meta, dict) and 'current_task' in task_meta.keys() else "UNKNOWN", + ) if _state not in ["SUCCESS", "FAILURE", "SKIPPED"] else None, # Progressing if not successful, failed or skipped + job_result=task_meta if _state in ["SUCCESS", "FAILURE", "SKIPPED"] else None, + ) + ) + else: + # Awaiting execution - attempt to get info direct + try: + _result = get_celery_task_info(result.id) + host = list(_result.keys())[0] + proc_name = (_result[host][result.id][1]['args'][2]) processors.append(schemas.JobStatus( processor_name=proc_name, job_id=result.id, job_status=result.state, - job_progress=schemas.JobProgress( - percent_complete=task_meta['progress'] if isinstance(task_meta, dict) and 'progress' in task_meta.keys() else 0, - current_task=task_meta['current_task'] if isinstance(task_meta, dict) and 'current_task' in task_meta.keys() else "UNKNOWN", - ) if result.state not in ["SUCCESS", "FAILURE"] else None, - job_result=task_meta if result.state in ["SUCCESS", "FAILURE"] else None, + job_progress=None, + job_result=None, + )) + except Exception: + # Sometimes Celery fails to return a result object - when under heavy load + processors.append(schemas.JobStatus( + processor_name=missing_proc_name_msg, + job_id=result.id, + job_status=result.state, + job_progress=None, + job_result=None, )) return schemas.JobGroupStatus( - job_group_status="COMPLETE" if all(global_status) else "PENDING", + job_group_status="COMPLETE" if all(global_status) else "PENDING", job_group_percent_complete=sum(global_perc_complete), job_group_processors=processors ) @@ -187,7 +226,6 @@ def get_celery_scheduled_tasks() -> dict: def get_celery_task_info(task_id: str) -> dict: """ - DEPRECATED Information about a specific task Example Output diff --git a/api/routers/boundaries.py b/api/routers/boundaries.py index 86e5d39..3a32c5a 100644 --- a/api/routers/boundaries.py +++ b/api/routers/boundaries.py @@ -46,7 +46,7 @@ async def search_boundary( """Search for boundaries by name or coordinates.""" try: logger.debug("performing %s with query %s", inspect.stack()[0][3], [name, latitude, longitude]) - if latitude and longitude: + if latitude is not None and longitude is not None: result = await DBController().search_boundaries_by_coordinates( latitude, longitude ) diff --git a/api/routers/jobs.py b/api/routers/jobs.py index 02bb112..ca16330 100644 --- a/api/routers/jobs.py +++ b/api/routers/jobs.py @@ -58,7 +58,7 @@ def extract_job_id(node): return proc_id -@router.get(JOB_STATUS_ROUTE, response_model=schemas.JobGroupStatus) +@router.get(JOB_STATUS_ROUTE, response_model=schemas.JobGroupStatus, response_model_exclude_none=True) def 
get_status(job_id: str): """Get status of a DAG associated with a given package""" try: @@ -77,10 +77,10 @@ def get_status(job_id: str): # Remove Boundary processor from each jobs info logger.debug( "Group Status: %s", - [[result.state, result.info] for result in group_result.results], + [[result.state, result.info, result.args, result.name] for result in group_result.results], ) # Create response object - response = extract_group_state_info(group_result) + response = extract_group_state_info(group_result, missing_proc_name_msg=schemas.MISSING_PROC_MSG) logger.debug("completed %s with result: %s", inspect.stack()[0][3], response) return response except JobNotFoundException as err: @@ -99,7 +99,11 @@ async def submit_processing_job(job: schemas.Job, status_code=status.HTTP_202_AC # Collect boundary geojson boundary_db = await DBController().get_boundary_by_name(job.boundary_name) boundary_dataproc = DataProcBoundary( - job.boundary_name, boundary_db.geometry, boundary_db.envelope + job.boundary_name, + boundary_db.geometry.dict()["__root__"], + boundary_db.envelope.dict()[ + "__root__" + ], # Due to GeoJSON dynamic type in external schema ) # Check processors are all valid for processor_name_version in job.processors: diff --git a/api/routers/packages.py b/api/routers/packages.py index 37ebffd..fd563b0 100644 --- a/api/routers/packages.py +++ b/api/routers/packages.py @@ -48,7 +48,7 @@ logger.setLevel(LOG_LEVEL) # Initialise the storage backend helpers -storage_backend = init_storage_backend(STORAGE_BACKEND)(LOCALFS_STORAGE_BACKEND_ROOT) +storage_backend = init_storage_backend(STORAGE_BACKEND) @router.get(PACKAGES_BASE_ROUTE, response_model=List[PackageSummary]) @@ -56,9 +56,10 @@ async def get_packages(): """Retrieve information on available top-level packages (which are created from boundaries)""" try: logger.debug("performing %s", inspect.stack()[0][3]) - logger.debug("found packages in backend: %s", storage_backend.packages()) + packages = storage_backend.packages(summary=True) + logger.debug("found packages in backend: %s", packages) result = [] - for boundary_name in storage_backend.packages(): + for boundary_name in packages: result.append( PackageSummary( boundary_name=boundary_name, diff --git a/api/routers/processors.py b/api/routers/processors.py index a1db36e..bfc6e05 100644 --- a/api/routers/processors.py +++ b/api/routers/processors.py @@ -2,16 +2,21 @@ Data Processors """ import logging -import inspect -from types import ModuleType from typing import List from fastapi import APIRouter, HTTPException -from config import LOG_LEVEL -from dataproc.helpers import list_processors, build_processor_name_version, get_processor_meta_by_name -from api.routes import PROCESSORS_BASE_ROUTE -from api.db import database +from config import LOG_LEVEL, INCLUDE_TEST_PROCESSORS +from dataproc.helpers import ( + list_processors, + build_processor_name_version, + get_processor_meta_by_name, +) +from api.routes import ( + PROCESSORS_BASE_ROUTE, + PROCESSORS_NAME_ROUTE, + PROCESSORS_VERSION_ROUTE, +) from api import schemas from api.helpers import handle_exception @@ -23,40 +28,107 @@ logger = logging.getLogger("uvicorn.access") logger.setLevel(LOG_LEVEL) + @router.get(PROCESSORS_BASE_ROUTE, response_model=List[schemas.Processor]) async def get_processors(): - """Retrieve information about all available data processors""" + """Metadata for all available data processors""" try: results = [] - for proc_name, proc_versions in list_processors().items(): + for proc_name, proc_versions in 
list_processors(include_test_processors=INCLUDE_TEST_PROCESSORS).items(): output_versions = [] for version in proc_versions: - meta = get_processor_meta_by_name( - build_processor_name_version(proc_name, version) - )() - version = schemas.ProcessorVersion( - version=version, - processor=schemas.ProcessorMetadata( - name=build_processor_name_version(meta.name, version), - description=meta.description, - dataset=meta.dataset_name, - author=meta.data_author, - license=meta.data_license.asdict(), - origin_url=meta.data_origin_url, - version=meta.version - ) + name_version = build_processor_name_version(proc_name, version) + meta = get_processor_meta_by_name(name_version)() + version = schemas.ProcessorVersionMetadata( + name=name_version, + description=meta.description, + version=meta.version, + data_author=meta.data_author, + data_title=meta.data_title, + data_title_long=meta.data_title_long, + data_summary=meta.data_summary, + data_citation=meta.data_citation, + data_license=meta.data_license.asdict(), + data_origin_url=meta.data_origin_url, + data_formats=meta.data_formats ) output_versions.append(version) - results.append( - schemas.Processor( - name=proc_name, - versions=output_versions - ) - ) + results.append(schemas.Processor(name=proc_name, versions=output_versions)) return results except Exception as err: handle_exception(logger, err) raise HTTPException(status_code=500) - - + +@router.get( + PROCESSORS_NAME_ROUTE, + response_model=schemas.Processor, + response_model_exclude_none=True, + response_model_exclude_unset=True, +) +async def get_processor(name: str): + """Metadata for all versions of a single processor""" + try: + for proc_name, proc_versions in list_processors(include_test_processors=INCLUDE_TEST_PROCESSORS).items(): + output_versions = [] + if proc_name == name: + for version in proc_versions: + name_version = build_processor_name_version(name, version) + meta = get_processor_meta_by_name(name_version)() + version = schemas.ProcessorVersionMetadata( + name=name_version, + description=meta.description, + version=meta.version, + data_author=meta.data_author, + data_title=meta.data_title, + data_title_long=meta.data_title_long, + data_summary=meta.data_summary, + data_citation=meta.data_citation, + data_license=meta.data_license.asdict(), + data_origin_url=meta.data_origin_url, + data_formats=meta.data_formats + ) + output_versions.append(version) + return schemas.Processor(name=proc_name, versions=output_versions) + raise HTTPException(status_code=404, detail=f"no such processor: {name}") + except HTTPException: + raise + except Exception as err: + handle_exception(logger, err) + raise HTTPException(status_code=500) + + +@router.get( + PROCESSORS_VERSION_ROUTE, + response_model=schemas.ProcessorVersionMetadata, + response_model_exclude_none=True, + response_model_exclude_unset=True, +) +async def get_processor_version(name: str, version: str): + """Metadata for a single version of a processor""" + try: + name_version = build_processor_name_version(name, version) + try: + meta = get_processor_meta_by_name(name_version)() + except: + raise HTTPException( + status_code=404, detail=f"no such processor version: {name_version}" + ) + return schemas.ProcessorVersionMetadata( + name=name_version, + description=meta.description, + version=meta.version, + data_author=meta.data_author, + data_title=meta.data_title, + data_title_long=meta.data_title_long, + data_summary=meta.data_summary, + data_citation=meta.data_citation, + data_license=meta.data_license.asdict(), + 
data_origin_url=meta.data_origin_url, + data_formats=meta.data_formats + ) + except HTTPException: + raise + except Exception as err: + handle_exception(logger, err) + raise HTTPException(status_code=500) diff --git a/api/routes.py b/api/routes.py index 58d5643..43cacc3 100644 --- a/api/routes.py +++ b/api/routes.py @@ -14,8 +14,10 @@ BOUNDARY_SEARCH_ROUTE = BOUNDARIES_BASE_ROUTE + "/search" BOUNDARY_ROUTE = BOUNDARIES_BASE_ROUTE + "/{name}" -# Retrieval of information about available processors +# Retrieval of information about all available processors PROCESSORS_BASE_ROUTE = API_ROUTE_BASE + "/processors" +PROCESSORS_NAME_ROUTE = PROCESSORS_BASE_ROUTE + "/{name}" +PROCESSORS_VERSION_ROUTE = PROCESSORS_NAME_ROUTE + "/{version}" # Retrieval of information about packages PACKAGES_BASE_ROUTE = API_ROUTE_BASE + "/packages" diff --git a/api/schemas.py b/api/schemas.py index c4469ca..b46491c 100644 --- a/api/schemas.py +++ b/api/schemas.py @@ -3,94 +3,178 @@ """ from typing import List, Optional +from enum import Enum from pydantic import BaseModel, validator +from dataproc.helpers import processors_as_enum +from config import INCLUDE_TEST_PROCESSORS + +MISSING_PROC_MSG = "processor details not available" + + +class Polygon(BaseModel): + """Reference to the external GeoJSON Polygon JSON Schema""" + + __root__: dict + + class Config: + @staticmethod + def schema_extra(schema: dict): + schema.clear() + schema["$ref"] = "https://geojson.org/schema/Polygon.json" + + +class MultiPolygon(BaseModel): + """Reference to the external GeoJSON MultiPolygon JSON Schema""" + + __root__: dict + + class Config: + @staticmethod + def schema_extra(schema: dict): + schema.clear() + schema["$ref"] = "https://geojson.org/schema/MultiPolygon.json" + + +class DataPackage(BaseModel): + """Reference to the external DataPackage JSON Schema""" + + __root__: dict + + class Config: + @staticmethod + def schema_extra(schema: dict): + schema.clear() + schema[ + "$ref" + ] = "https://specs.frictionlessdata.io/schemas/data-package.json" + + class BoundarySummary(BaseModel): """Summary of a boundary""" - id=int - name:str - name_long:str + + id = int + name: str + name_long: str class Config: orm_mode = True + class Boundary(BoundarySummary): """Complete boundary information""" - admin_level:str - geometry:dict # GeoJSON - envelope:dict # GeoJSON + + admin_level: str + geometry: MultiPolygon + envelope: Polygon class Config: orm_mode = True -class ProcessorMetadata(BaseModel): + +class ProcessorVersionMetadata(BaseModel): """Detail about a Data Processor""" + name: str description: str - dataset: str - author: str - license: dict - origin_url: str version: str - status: Optional[str]="" + status: Optional[str] = "" # Used while executing + uri: Optional[str] = "" # Used when package is available + data_author: str + data_title: str + data_title_long: str + data_summary: str + data_citation: str + data_license: dict + data_origin_url: str + data_formats: List[str] -class ProcessorVersion(BaseModel): - """A Version of a Processor""" - version: str - processor: ProcessorMetadata # Metadata about the versioned processor which created this dataset - uri: Optional[str]="" class Processor(BaseModel): """Summary information about a Processor""" - name: str # Name of the processor - versions: List[ProcessorVersion] # Versions of the processor, which are created by versioned processors of the same name + + name: str # Name of the processor + versions: List[ + ProcessorVersionMetadata + ] # Versions of the processor, which are created by 
versioned processors of the same name + class PackageSummary(BaseModel): """Summary information about a top-level package (which is formed from a boundary)""" - boundary_name: str # Name of the Boundary the package was created from - uri: str # URI to the package + + boundary_name: str # Name of the Boundary the package was created from + uri: str # URI to the package + class Package(PackageSummary): """Detailed information about a package""" - boundary: Boundary # Boundary from-which the package has been created - processors: List[Processor] # Datasets within this package - datapackage: dict # Datapackage.json parsed from the FS and nested within the Package response + + boundary: Boundary # Boundary from-which the package has been created + processors: List[Processor] # Datasets within this package + datapackage: DataPackage # Datapackage.json parsed from the FS and nested within the Package response + # Jobs + class Job(BaseModel): boundary_name: str - processors: List[str] # List of processor names + processors: List[str] # List of processor names - @validator('processors') + @validator("processors") def no_dups(cls, v): if len(set(v)) != len(v): - raise ValueError('duplicate processors not allowed') + raise ValueError("duplicate processors not allowed") return v + class SubmittedJob(BaseModel): """A successfully submitted Job""" + job_id: str + class JobProgress(BaseModel): """ Specifics about the progress of an individual Processors Job """ - percent_complete: Optional[int]=0 + + percent_complete: Optional[int] = 0 current_task: Optional[str] + +class JobStateEnum(str, Enum): + """Possible Job States""" + + PENDING = "PENDING" + SUCCESS = "SUCCESS" + FAILURE = "FAILURE" + EXECUTING = "EXECUTING" + RETRY = "RETRY" + SKIPPED = "SKIPPED" + REVOKED = "REVOKED" + + class JobStatus(SubmittedJob): """Status of a Submitted Job""" - processor_name: str - job_status: str + + processor_name: processors_as_enum( + include_test_processors=INCLUDE_TEST_PROCESSORS, additions=[MISSING_PROC_MSG] + ) + job_status: JobStateEnum job_progress: Optional[JobProgress] job_result: Optional[dict] + class Config: + use_enum_values = True + + class JobGroupStatus(BaseModel): """ Status of the Processor Group in a submited DAG """ + job_group_status: str - job_group_percent_complete: Optional[int]=0 - job_group_processors: List[JobStatus] \ No newline at end of file + job_group_percent_complete: Optional[int] = 0 + job_group_processors: List[JobStatus] diff --git a/config.py b/config.py index 4513f35..dd77cec 100644 --- a/config.py +++ b/config.py @@ -73,7 +73,7 @@ def get_db_uri_sync( ) # seconds before locked tasks timeout # Deployment Env -DEPLOYMENT_ENV = getenv("AUTOPKG_DEPLOYMENT_ENV", "dev") +DEPLOYMENT_ENV = getenv("AUTOPKG_DEPLOYMENT_ENV", "prod") LOG_LEVEL = logging.getLevelName(getenv("AUTOPKG_LOG_LEVEL", "DEBUG")) INTEGRATION_TEST_ENDPOINT = getenv( "AUTOPKG_INTEGRATION_TEST_ENDPOINT", "http://localhost:8000" @@ -99,7 +99,7 @@ def get_db_uri_sync( # Storage backend to use STORAGE_BACKEND = getenv("AUTOPKG_STORAGE_BACKEND", "localfs") # Dev / Prod switch for testing -if getenv("AUTOPKG_DEPLOYMENT_ENV", "prod") == "test": +if DEPLOYMENT_ENV == "test": # TEST # The root-level folder when using localfs storage backend LOCALFS_STORAGE_BACKEND_ROOT = getenv( @@ -112,16 +112,25 @@ def get_db_uri_sync( path.join(path.dirname(path.abspath(__file__)), "tests", "data", "processing"), ) # Integration tests which require access to the GRIOSM Postgres instance will be run if this is set-True (1) - TEST_GRI_OSM = 
bool(int(getenv("TEST_GRI_OSM", "0"))) + TEST_GRI_OSM = True if getenv("AUTOPKG_TEST_GRI_OSM", "True") == "True" else False + # AWSS3 Storage Backend + S3_ACCESS_KEY = getenv("AUTOPKG_S3_TEST_ACCESS_KEY") + S3_SECRET_KEY = getenv("AUTOPKG_S3_TEST_SECRET_KEY") + # Top level S3 bucket, under-which packages are stored if using AWSS3 backend + S3_BUCKET = getenv("AUTOPKG_S3_TEST_BUCKET", "irv-autopkg-dev") + S3_REGION = getenv("AUTOPKG_S3_REGION", "eu-west-2") else: # PROD # The root-level folder when using localfs storage backend LOCALFS_STORAGE_BACKEND_ROOT = getenv("AUTOPKG_LOCALFS_STORAGE_BACKEND_ROOT") # The root-level folder when using localfs processing backend LOCALFS_PROCESSING_BACKEND_ROOT = getenv("AUTOPKG_LOCALFS_PROCESSING_BACKEND_ROOT") - -# Name matching Soundex Distance Default -NAME_SEARCH_DISTANCE = int(getenv("AUTOPKG_NAME_SEARCH_DISTANCE", "2")) + # AWSS3 Storage Backend + S3_ACCESS_KEY = getenv("AUTOPKG_S3_ACCESS_KEY") + S3_SECRET_KEY = getenv("AUTOPKG_S3_SECRET_KEY") + # Top level S3 bucket, under-which packages are stored if using AWSS3 backend + S3_BUCKET = getenv("AUTOPKG_S3_BUCKET", "irv-autopkg") + S3_REGION = getenv("AUTOPKG_S3_REGION", "eu-west-2") # Initialised Startup Data DBURI_API = get_db_uri(API_POSTGRES_DB) @@ -131,7 +140,11 @@ def get_db_uri_sync( worker_concurrency=CELERY_CONCURRENCY, broker_url=CELERY_BROKER, result_backend=CELERY_BACKEND, + result_extended=True ) # Seconds before submitted tasks expire TASK_EXPIRY_SECS = int(getenv("AUTOPKG_TASK_EXPIRY_SECS", "3600")) + +# Remove Test Processors from the available processors list +INCLUDE_TEST_PROCESSORS = True if getenv("AUTOPKG_INCLUDE_TEST_PROCESSORS", "True") == "True" else False diff --git a/dataproc/README.md b/dataproc/README.md index a06e711..614e031 100644 --- a/dataproc/README.md +++ b/dataproc/README.md @@ -15,7 +15,11 @@ class Metadata(BaseMetadataABC): description="A test processor for nightlights" # Logner processor description version="1" # Version of the Processor dataset_name="nightlights" # The dataset this processor targets + data_title="Night-time lights" + data_title_long="Night-time lights annual composite from VIIRS 2022" data_author="Nightlights Author" + data_summary = "A few paragraphs about night-time lights" + data_citation = "Author, N. (2020) Night-time lights. 
Available at: https://example.com" data_license="Nightlights License" data_origin_url="http://url" @@ -39,4 +43,4 @@ class Processor(BaseProcessorABC): pass ``` -- Reboot API and Celery Worker to make processor live \ No newline at end of file +- Reboot API and Celery Worker to make processor live diff --git a/dataproc/__init__.py b/dataproc/__init__.py index 629640c..759ad0a 100644 --- a/dataproc/__init__.py +++ b/dataproc/__init__.py @@ -5,7 +5,7 @@ class Boundary(dict): """Encapsulates the definition of a boundary required for downstream processing""" - def __init__(self, name, geojson, envelope_geojson): + def __init__(self, name: str, geojson: dict, envelope_geojson: dict): dict.__init__(self, name=name, geojson=geojson, envelope_geojson=envelope_geojson) self.name = name self.geojson = geojson diff --git a/dataproc/backends/storage/__init__.py b/dataproc/backends/storage/__init__.py index 1f6152c..4523208 100644 --- a/dataproc/backends/storage/__init__.py +++ b/dataproc/backends/storage/__init__.py @@ -1,16 +1,28 @@ - - +"""""" +from config import ( + LOCALFS_STORAGE_BACKEND_ROOT, + S3_ACCESS_KEY, + S3_SECRET_KEY, + S3_BUCKET, + S3_REGION, +) from dataproc.backends import StorageBackend from dataproc.exceptions import ConfigException from .localfs import LocalFSStorageBackend +from .awss3 import AWSS3StorageBackend + def init_storage_backend(storage_backend: str) -> StorageBackend: """ Initialise a StorageBackend by name """ if storage_backend == "localfs": - return LocalFSStorageBackend + return LocalFSStorageBackend(LOCALFS_STORAGE_BACKEND_ROOT) + elif storage_backend == "awss3": + return AWSS3StorageBackend( + S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY, s3_region=S3_REGION + ) else: raise ConfigException( f"Unsupported / Unset StorageBackend {storage_backend} - check env" - ) \ No newline at end of file + ) diff --git a/dataproc/backends/storage/awss3.py b/dataproc/backends/storage/awss3.py new file mode 100644 index 0000000..84b59c7 --- /dev/null +++ b/dataproc/backends/storage/awss3.py @@ -0,0 +1,503 @@ +""" +AWS S3 Filesystem Backend +""" + +import os +from typing import List, Tuple +import json +from datetime import datetime +import warnings + +from pyarrow import fs + +from dataproc.exceptions import ( + FolderCreationException, + FileCreationException, + PackageNotFoundException, + DatasetNotFoundException, + S3Exception, +) +from dataproc import DataPackageResource +from dataproc import helpers +from config import PACKAGES_HOST_URL +from ..base import StorageBackend + + +class S3Manager: + """ + S3 FS Context Manager + + ::arg access_key str + ::arg secret_key str + ::kwarg region str + + """ + + def __init__(self, *args, region="eu-west-2"): + self.access_key = args[0] + self.secret_key = args[1] + self.region = region + self.s3_fs = None + + def __enter__(self) -> fs.S3FileSystem: + self.s3_fs = fs.S3FileSystem( + region=self.region, access_key=self.access_key, secret_key=self.secret_key + ) + return self.s3_fs + + def __exit__(self, exc_type, exc_value, exc_tb): + if self.s3_fs: + del self.s3_fs + + +class AWSS3StorageBackend(StorageBackend): + """Backend for AWS S3 filesystem""" + + def __init__( + self, + bucket: str, + s3_access_key: str, + s3_secret_key: str, + s3_region="eu-west-2", + ) -> None: + """ + + ::param bucket str S3 bucket under-which packages are stored + ::param s3_access_key str S3 access key + ::param s3_secret_key str S3 secret key + """ + dict.__init__(self) + self.bucket = bucket + self.s3_access_key = s3_access_key + self.s3_secret_key = 
s3_secret_key + self.s3_region = s3_region + _ = self._check_env() + + def _parse_env(self) -> Tuple[str, str]: + """ + Parse the S3 secrets from env + + ::returns Tuple[str, str] access_key, secret_key + """ + return self.s3_access_key, self.s3_secret_key + + def _check_env(self) -> bool: + """ + Check the env required for S3 appears to be valid + """ + if not all([self.s3_access_key, self.s3_secret_key]): + warnings.warn( + "AWSS3StorageBackend - s3_access_key and s3_secret_key required for S3 initialisation" + ) + return False + return True + + def _build_absolute_path(self, *args) -> str: + """ + Build an absolute path from a relative path, by pre-pending the configured top level directory + """ + return os.path.join(self.bucket, *args) + + def _build_uri(self, absolute_fpath: str) -> str: + """Build the internet-accessible URI from a given s3 fpath""" + return absolute_fpath.replace(self.bucket, PACKAGES_HOST_URL) + + def _remove_bucket_from_s3path(self, s3_path: str) -> str: + """ + Remove the bucket name from a given as3_path + """ + return s3_path.replace(self.bucket + "/", "") + + def _list_directories(self, absolute_s3_path: str, recursive=False) -> List[str]: + """ + List the paths to directories in a given absolute_s3_path path (i.e. includes the bucket name) + + ::kwarg recursive optionally recurse down the tree + """ + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + contents = s3_fs.get_file_info( + fs.FileSelector(absolute_s3_path, recursive=recursive) + ) + return [ + os.path.basename(item.path) + for item in contents + if item.type == fs.FileType.Directory + ] + + def _exists(self, absolute_s3_path: str) -> bool: + """ + Check if an object at the given path exists + """ + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + chk = s3_fs.get_file_info(absolute_s3_path) + return chk.type != fs.FileType.NotFound + + def tree(self, summary: bool = False) -> dict: + """ + Generate a source-of-truth tree for the + FS showing Packages and Processors + (with their versions) + { + "package_name":{ + "processor_name": ["version_id", ...] + }, + "package_name":{ + "processor_name": ["version_id", ...] 
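
The `_list_directories` and `_exists` helpers above wrap pyarrow's S3 filesystem; a minimal standalone sketch of that listing call (bucket name, region and credentials are placeholders):

```python
from pyarrow import fs

# Minimal sketch of the pyarrow S3 listing wrapped by _list_directories / _exists.
# The bucket name, region and credentials here are placeholders.
s3 = fs.S3FileSystem(region="eu-west-2", access_key="KEY", secret_key="SECRET")
infos = s3.get_file_info(fs.FileSelector("my-bucket", recursive=False))
packages = [info.base_name for info in infos if info.type == fs.FileType.Directory]
print(packages)
```
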
+            }
+        }
+
+        ::kwarg summary bool Return only boundary names (packages), not including dataset_versions
+        """
+        tree = {}
+        for package in self._list_directories(self._build_absolute_path("")):
+            # First level packages
+            tree[package] = {}
+        if summary is True:
+            return tree
+        # Descend into datasets and versions
+        # One listing call per package
+        for package, _ in tree.items():
+            for dataset in self._list_directories(
+                self._build_absolute_path(package, self.datasets_folder_name)
+            ):
+                tree[package][dataset] = []
+        # Descend into versions
+        for package, _ in tree.items():
+            for dataset, _ in tree[package].items():
+                for version in self._list_directories(
+                    self._build_absolute_path(
+                        package, self.datasets_folder_name, dataset
+                    )
+                ):
+                    tree[package][dataset].append(version)
+        return tree
+
+    def packages(self, summary: bool = False) -> List[str]:
+        """List of Packages that currently exist under the top-level storage backend"""
+        tree = self.tree(summary=summary)
+        return list(tree.keys())
+
+    def package_datasets(self, package: str) -> List[str]:
+        """
+        List of Datasets that currently exist for a given Package
+
+        ::param package str The name of the package
+            (which maps directly to a Boundary name)
+        """
+        try:
+            return self._list_directories(
+                self._build_absolute_path(package, self.datasets_folder_name)
+            )
+        except:
+            # The package does not exist
+            raise PackageNotFoundException(f"{package}")
+
+    def dataset_versions(self, package: str, dataset: str) -> List[str]:
+        """
+        List of versions that currently exist for a given Dataset
+
+        ::param package str The name of the package
+            (which maps directly to a Boundary name)
+
+        ::param dataset str the name of the dataset for which to retrieve versions
+        """
+        try:
+            return self._list_directories(
+                self._build_absolute_path(package, self.datasets_folder_name, dataset)
+            )
+        except:
+            # The dataset does not exist
+            raise DatasetNotFoundException(f"{dataset}")
+
+    def add_provenance(
+        self,
+        boundary_name: str,
+        processing_log: List[dict],
+        filename: str = "provenance.json",
+    ) -> bool:
+        """
+        Generate new and/or append given processing log to a boundary's provenance file
+
+        {
+            "isoformat dtg": {log}, ...
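
The `add_provenance` method that follows is a fetch / append / overwrite of a small JSON log; the same pattern in isolation with pyarrow, assuming placeholder bucket key, region and credentials:

```python
import json
from datetime import datetime

from pyarrow import fs

# Sketch of the fetch / append / overwrite pattern add_provenance applies to
# provenance.json. Bucket key, region and credentials are placeholders.
s3 = fs.S3FileSystem(region="eu-west-2", access_key="KEY", secret_key="SECRET")
key = "my-bucket/ghana/provenance.json"

with s3.open_input_stream(key) as stream:
    log = json.loads(stream.readall().decode())
log[datetime.utcnow().isoformat()] = [{"boundary_processor": "boundary setup ok"}]
with s3.open_output_stream(key) as stream:
    stream.write(json.dumps(log).encode())
```
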
+ } + """ + dest_abs_path = self._build_absolute_path(boundary_name, filename) + # If no exist - stream new to path + if not self._exists(dest_abs_path): + log = {datetime.utcnow().isoformat(): processing_log} + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + with s3_fs.open_output_stream(dest_abs_path) as stream: + stream.write(json.dumps(log).encode()) + else: + # If exist - fetch, update and upload (overwrite) + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + with s3_fs.open_input_stream(dest_abs_path) as stream: + log = json.loads(stream.readall().decode()) + log[datetime.utcnow().isoformat()] = processing_log + with s3_fs.open_output_stream(dest_abs_path) as stream: + stream.write(json.dumps(log).encode()) + return True + + def boundary_folder_exists(self, boundary_name: str): + """If a given boundary folder exists""" + return self._exists(self._build_absolute_path(boundary_name)) + + def boundary_data_folder_exists(self, boundary_name: str): + """If a given boundary data folder exists""" + return self._exists( + self._build_absolute_path(boundary_name, self.datasets_folder_name) + ) + + def boundary_file_exists(self, boundary_name: str, filename: str): + """If a given file for a boundary exists""" + return self._exists(self._build_absolute_path(boundary_name, filename)) + + def create_boundary_folder(self, boundary_name: str): + """ + Create a boundary folder + """ + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + s3_fs.create_dir(self._build_absolute_path(boundary_name)) + if not self.boundary_folder_exists(boundary_name): + raise FolderCreationException( + f"boundary folder path {boundary_name} not found" + ) + + def create_boundary_data_folder(self, boundary_name: str): + """ + Create a boundary data folder + """ + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + s3_fs.create_dir( + self._build_absolute_path(boundary_name, self.datasets_folder_name) + ) + if not self.boundary_data_folder_exists(boundary_name): + raise FolderCreationException( + f"boundary data-folder path {boundary_name} not found" + ) + + def put_boundary_data( + self, + local_source_fpath: str, + boundary_name: str, + ): + """Put a boundary supporting data onto the backend""" + filename = os.path.basename(local_source_fpath) + dest_abs_path = self._build_absolute_path(boundary_name, filename) + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + fs.copy_files( + local_source_fpath, + dest_abs_path, + source_filesystem=fs.LocalFileSystem(), + destination_filesystem=s3_fs, + ) + if not self._exists(dest_abs_path): + raise FileCreationException( + f"destination file path {dest_abs_path} not found after creation attempt" + ) + + def processor_dataset_exists( + self, boundary_name: str, processor_dataset: str, version: str + ) -> bool: + """ + Test if a given dataset folder exists within the given boundary and dataset version + """ + return self._exists( + self._build_absolute_path( + boundary_name, + self.datasets_folder_name, + processor_dataset, + version, + ) + ) + + def processor_file_exists( + self, boundary_name: str, dataset_name: str, version: str, filename: str + ): + """If a given file for a dataset processor exists""" + return self._exists( + self._build_absolute_path( + boundary_name, + self.datasets_folder_name, + dataset_name, + version, + self.dataset_data_folder_name, + filename, + ) + ) + + def put_processor_data( + self, + local_source_fpath: str, + boundary_name: str, + dataset_name: str, + 
version: str, + remove_local_source=False, + ) -> str: + """ + Put data output from a processor for a particular dataset and + version onto the backend + + ::kwarg remove_local_source bool Whether to delete the local source file + after a successful move + + ::returns dest_abs_path str URI of the moved file + """ + filename = os.path.basename(local_source_fpath) + dest_abs_path = self._build_absolute_path( + boundary_name, + self.datasets_folder_name, + dataset_name, + version, + self.dataset_data_folder_name, + filename, + ) + # Create the output dirs + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + # Creates directories as necessary + fs.copy_files( + local_source_fpath, + dest_abs_path, + source_filesystem=fs.LocalFileSystem(), + destination_filesystem=s3_fs, + ) + if not self._exists(dest_abs_path): + raise FileCreationException( + f"destination file path {dest_abs_path} not found after creation attempt" + ) + if remove_local_source is True: + os.remove(local_source_fpath) + return self._build_uri(dest_abs_path) + + def put_processor_metadata( + self, + local_source_fpath: str, + boundary_name: str, + dataset_name: str, + version: str, + remove_local_source=False, + ) -> str: + """ + Put an a processor metadata file for a particular dataset and + version onto the backend + + ::kwarg remove_local_source bool Whether to delete the local source file + after a successful move + + ::returns dest_abs_path str URI of the moved file + """ + filename = os.path.basename(local_source_fpath) + dest_abs_path = self._build_absolute_path( + boundary_name, + self.datasets_folder_name, + dataset_name, + version, + filename, + ) + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + # Creates directories as necessary + fs.copy_files( + local_source_fpath, + dest_abs_path, + source_filesystem=fs.LocalFileSystem(), + destination_filesystem=s3_fs, + ) + if not self._exists(dest_abs_path): + raise FileCreationException( + f"destination file path {dest_abs_path} not found after creation attempt" + ) + if remove_local_source is True: + os.remove(local_source_fpath) + return self._build_uri(dest_abs_path) + + def count_file_types_in_folder(self, folder_path: str, file_type="tif") -> int: + """ + Count the number of files of a type in a folder + """ + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + contents = s3_fs.get_file_info( + fs.FileSelector(folder_path, recursive=False) + ) + return len( + [ + item + for item in contents + if item.type == fs.FileType.File and item.extension == file_type + ] + ) + + def count_boundary_data_files( + self, + boundary_name: str, + dataset_name: str, + version: str, + datafile_ext: str = ".tif", + ) -> int: + """ + Count the number of datafiles for a given boundary folder + """ + folder = self._build_absolute_path( + boundary_name, + self.datasets_folder_name, + dataset_name, + version, + self.dataset_data_folder_name, + ) + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + contents = s3_fs.get_file_info(fs.FileSelector(folder, recursive=False)) + return len( + [ + item + for item in contents + if item.type == fs.FileType.File + and item.extension == datafile_ext.replace(".", "") + ] + ) + + def remove_boundary_data_files( + self, + boundary_name: str, + dataset_name: str, + version: str, + ): + """Remove all datafiles associated with a particular boundary + processing version""" + folder = self._build_absolute_path( + boundary_name, + self.datasets_folder_name, + dataset_name, + version, + 
self.dataset_data_folder_name, + ) + if not self._exists(folder): + raise FileNotFoundError() + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + s3_fs.delete_dir_contents(folder, missing_dir_ok=True) + + def update_datapackage(self, boundary_name: str, dp_resource: DataPackageResource): + """ + Update a packages datapackage.json file with details of a given dataset. + + __NOTE__: Assumes the Boundary processor has already run and datapackage exists (even as just template) + """ + datapackage_fpath = self._build_absolute_path(boundary_name, "datapackage.json") + if not self._exists(datapackage_fpath): + raise S3Exception(f"datapackage does not exist: {datapackage_fpath}") + # Fetch, append, write + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + with s3_fs.open_input_stream(datapackage_fpath) as stream: + datapackage = json.loads(stream.readall().decode()) + datapackage = helpers.add_dataset_to_datapackage( + dp_resource, + datapackage, + ) + with s3_fs.open_output_stream(datapackage_fpath) as stream: + stream.write(json.dumps(datapackage).encode()) + + def load_datapackage(self, boundary_name: str) -> dict: + """Load the datapackage.json file from backend and return""" + datapackage_fpath = self._build_absolute_path(boundary_name, "datapackage.json") + with S3Manager(*self._parse_env(), region=self.s3_region) as s3_fs: + with s3_fs.open_input_stream(datapackage_fpath) as stream: + datapackage = json.loads(stream.readall().decode()) + return datapackage diff --git a/dataproc/backends/storage/localfs.py b/dataproc/backends/storage/localfs.py index fa2131a..8caeed5 100644 --- a/dataproc/backends/storage/localfs.py +++ b/dataproc/backends/storage/localfs.py @@ -38,7 +38,7 @@ def _build_uri(self, absolute_fpath: str) -> str: """Build the internet-accessible URI from a given localFS absolute fpath""" return absolute_fpath.replace(self.top_level_folder_path, PACKAGES_HOST_URL) - def tree(self) -> dict: + def tree(self, summary: bool = False) -> dict: """ Generate a source-of-truth tree for the FS showing Packages and Processors @@ -51,6 +51,7 @@ def tree(self) -> dict: "processor_name": ["version_id", ...] 
} } + ::kwarg summary bool Return only boundary names (packages), not included dataset_versions """ tree = {} for _, dirs, _ in os.walk(os.path.join(self.top_level_folder_path)): @@ -59,6 +60,8 @@ def tree(self) -> dict: tree[package] = {} # Dont recurse further break + if summary is True: + return tree # Descend into datasets for package, _ in tree.items(): for _, dataset_dirs, _ in os.walk( @@ -89,9 +92,9 @@ def tree(self) -> dict: break return tree - def packages(self) -> List[str]: + def packages(self, summary: bool = False) -> List[str]: """List of Packages that currently exist under the top-level storage backend""" - tree = self.tree() + tree = self.tree(summary=summary) return list(tree.keys()) def package_datasets(self, package: str) -> List[str]: @@ -170,8 +173,8 @@ def create_boundary_folder(self, boundary_name: str): """ full_path = self._build_absolute_path(boundary_name) os.mkdir(full_path) - if not self.boundary_folder_exists(full_path): - raise FolderCreationException(f"boundary folder path {full_path} not found") + if not self.boundary_folder_exists(boundary_name): + raise FolderCreationException(f"boundary folder path {boundary_name} not found") def create_boundary_data_folder(self, boundary_name: str): """ @@ -179,8 +182,8 @@ def create_boundary_data_folder(self, boundary_name: str): """ full_path = self._build_absolute_path(boundary_name, self.datasets_folder_name) os.mkdir(full_path) - if not self.boundary_folder_exists(full_path): - raise FolderCreationException(f"boundary folder path {full_path} not found") + if not self.boundary_data_folder_exists(boundary_name): + raise FolderCreationException(f"boundary data-folder path {boundary_name} not found") def put_boundary_data( self, @@ -234,7 +237,7 @@ def put_processor_data( remove_local_source=False ) -> str: """ - Put an data output from a processor for a particular dataset and + Put data output from a processor for a particular dataset and version onto the backend ::kwarg remove_local_source bool Whether to delete the local source file @@ -301,7 +304,7 @@ def put_processor_metadata( @staticmethod def count_file_types_in_folder(folder_path: str, file_type="tif") -> int: """ - Count the number of tiffs in a folder + Count the number of files of a type in a folder """ count = 0 for dir_info in os.scandir(folder_path): @@ -327,7 +330,7 @@ def count_boundary_data_files( self.dataset_data_folder_name, ) if not os.path.exists(folder): - raise FolderNotFoundException() + raise FileNotFoundError() return self.count_file_types_in_folder(folder, datafile_ext) def remove_boundary_data_files( @@ -345,7 +348,7 @@ def remove_boundary_data_files( self.dataset_data_folder_name, ) if not os.path.exists(folder): - raise FolderNotFoundException() + raise FileNotFoundError() for filename in os.listdir(folder): file_path = os.path.join(folder, filename) try: @@ -360,7 +363,7 @@ def update_datapackage(self, boundary_name: str, dp_resource: DataPackageResourc """ Update a packages datapackage.json file with details of a given dataset. 
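
In both backends this is a read-modify-write of `datapackage.json`; a minimal local sketch, assuming a placeholder file path and reusing `add_dataset_to_datapackage` from `dataproc.helpers`:

```python
import json

from dataproc.helpers import add_dataset_to_datapackage


def append_resource(datapackage_fpath: str, dp_resource) -> dict:
    """Read datapackage.json, append one resource, write it back (local path placeholder)."""
    with open(datapackage_fpath) as fh:
        datapackage = json.load(fh)
    datapackage = add_dataset_to_datapackage(dp_resource, datapackage)
    with open(datapackage_fpath, "w") as fh:
        json.dump(datapackage, fh)
    return datapackage
```
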
- __NOTE__: Assumes the Boundary processor has already run and datapcakge exists (even as just template) + __NOTE__: Assumes the Boundary processor has already run and datapackage exists (even as just template) """ # Load existing datapackage datapackage_fpath = self._build_absolute_path(boundary_name, "datapackage.json") diff --git a/dataproc/exceptions.py b/dataproc/exceptions.py index 7de99ef..256e27b 100644 --- a/dataproc/exceptions.py +++ b/dataproc/exceptions.py @@ -8,6 +8,15 @@ class DataProcException(Exception): class ConfigException(DataProcException): """Error with configuration """ +class ProcessorExecutionFailed(DataProcException): + """A Processor failed during execution""" + +class ProcessorExecutionSkipped(DataProcException): + """A Processor was skipped during execution""" + +class ProcessorDatasetExists(DataProcException): + """Processor output for a given boundary already exists""" + class ProcessorAlreadyExecutingException(DataProcException): """A Given processor, boundary combination is already executing""" @@ -38,3 +47,5 @@ class UnexpectedFilesException(DataProcException): class ZenodoGetFailedException(DataProcException): """Zenodo Get command returned non-zero result""" +class S3Exception(DataProcException): + """Wrapper for exceptions generated by S3 filesystem""" \ No newline at end of file diff --git a/dataproc/helpers.py b/dataproc/helpers.py index e3aff94..5909391 100644 --- a/dataproc/helpers.py +++ b/dataproc/helpers.py @@ -1,8 +1,9 @@ """ Helper methods / classes """ +from enum import Enum import inspect -from typing import Generator, List +from typing import Generator, List, Tuple from types import ModuleType import os import requests @@ -15,6 +16,10 @@ import csv import warnings +import rasterio +from rasterio import sample +import numpy as np + from dataproc.processors.internal.base import BaseProcessorABC, BaseMetadataABC from dataproc.backends import StorageBackend from dataproc import Boundary, DataPackageLicense, DataPackageResource @@ -27,6 +32,18 @@ # DAGs and Processing +def processors_as_enum(include_test_processors: str = False, additions: List[str] = []) -> Enum: + """Generate an Enum of the currently available processors""" + procs = {} + for proc_name, proc_versions in list_processors(include_test_processors=include_test_processors).items(): + for version in proc_versions: + name_version = build_processor_name_version(proc_name, version) + procs[name_version] = name_version + # Add in any additional fields + for addition in additions: + if not addition in procs.keys(): + procs[addition] = addition + return Enum("ProcessorsEnum", procs) def processor_name(dataset: str, version: str) -> str: """Generate a processor name from a dataset and version""" @@ -69,15 +86,18 @@ def build_processor_name_version(processor_base_name: str, version: str) -> str: return f"{processor_base_name}.{version}" -def list_processors() -> List[BaseProcessorABC]: +def list_processors(include_test_processors: bool=False) -> List[BaseProcessorABC]: """Retrieve a list of available processors and their versions""" # Iterate through Core processors and collect metadata import dataproc.processors.core as available_processors valid_processors = {} # {name: [versions]} for name, processor in inspect.getmembers(available_processors): + if include_test_processors is False: + if "test" in name: + continue # Check validity - if not valid_processor(name, processor): + if valid_processor(name, processor) is False: continue # Split name and version proc_name, proc_version = 
name.split(".") @@ -261,6 +281,15 @@ def generate_datapackage( # FILE OPERATIONS +def output_filename(dataset_name: str, dataset_version: str, boundary_name: str, file_format: str, dataset_subfilename: str = None) -> str: + """ + Generate a standardized output filename + """ + base = f"{dataset_name}-{dataset_version}" + if not dataset_subfilename: + return f"{base}-{boundary_name}.{file_format.replace('.', '')}" + else: + return f"{base}-{dataset_subfilename}-{boundary_name}.{file_format.replace('.', '')}" def unpack_zip(zip_fpath: str, target_folder: str): """ @@ -408,89 +437,130 @@ def fetch_zenodo_doi( # RASTER OPERATIONS +def is_bigtiff(filename): + """ + https://stackoverflow.com/questions/60427572/how-to-determine-if-a-tiff-was-written-in-bigtiff-format + """ + import struct + with open(filename, 'rb') as f: + header = f.read(4) + byteorder = {b'II': '<', b'MM': '>', b'EP': '<'}[header[:2]] + version = struct.unpack(byteorder + "H", header[2:4])[0] + return version == 43 +def sample_geotiff_coords(fpath: str, num_coords: int = 10) -> np.ndarray: + """ + Retrieve a set of coordinates within the bounds of the given raster + """ + with rasterio.open(fpath, 'r') as src: + return np.column_stack(( + np.random.uniform(low=src.bounds.left, high=src.bounds.right, size=(num_coords,)), + np.random.uniform(low=src.bounds.bottom, high=src.bounds.top, size=(num_coords,)) + )) -def assert_geotiff(fpath: str, check_crs: str = "EPSG:4326", check_compression=True): +def sample_geotiff(fpath: str, coords: np.ndarray = None, num_samples: int = 10) -> Tuple[np.ndarray, List[np.ndarray]]: """ - Check a given file is a valid geotiff + Retrieve a sample of given GeoTIFF file. - ::param fpath str Absolute filepath + Optionally provide coords from-which to sample pixels. 
(2d np array of coordinates within the raster bounds) + + If not provided coords will be sampled at random from the raster bounds + + ::returns Tuple (coords, samples (shape = num_samplesx(np.array(pixel per band)))) + """ + if coords is None: + # Take a random sample of coords within bounds + coords = sample_geotiff_coords(fpath, num_samples) + with rasterio.open(fpath, 'r') as src: + samples = sample.sample_gen(src, coords) + return coords, [sample for sample in samples] + +def assert_geotiff( + fpath: str, + check_crs: str = "EPSG:4326", + check_compression=True, + check_is_bigtiff=False, + check_pixel_coords: np.ndarray=None, + check_pixel_expected_samples: List[np.ndarray]=None + ): """ - import rasterio + Check a given file is a valid geotiff, optionally checking: + Coordinate Reference System Match + Compression exists (Any) + The TIFF has BIGTIFF tags + The TIFF pixels match some expected data samples at given coordinates - with rasterio.open(fpath) as src: + ::param fpath str Absolute filepath + """ + with rasterio.open(fpath, 'r') as src: if check_crs is not None: assert ( src.meta["crs"] == check_crs ), f"raster CRS {src.meta['crs']} doesnt not match expected {check_crs}" + if check_compression is True: assert src.compression is not None, "raster did not have any compression" + if check_pixel_coords is not None and check_pixel_expected_samples is not None: + src_samples = sample.sample_gen(src, check_pixel_coords) + for idx, src_sample in enumerate(src_samples): + # Special case for nan comparison + if all(np.isnan(src_sample)) and all(np.isnan(check_pixel_expected_samples[idx])): + continue + assert np.array_equal(src_sample, check_pixel_expected_samples[idx]) is True, \ + f"source pixels did not match expected pixel samples at coords: {check_pixel_coords[idx]}, {src_sample} != {check_pixel_expected_samples[idx]}" + + if check_is_bigtiff is True: + assert is_bigtiff(fpath) is True, f"raster is not a bigtiff when it was expected to be: {fpath}" def crop_raster( raster_input_fpath: str, raster_output_fpath: str, boundary: Boundary, - preserve_raster_crs=False, + creation_options=["COMPRESS=PACKBITS"], + debug=False ) -> bool: """ - Crop a raster file to the given boundary (EPSG:4326) - - Generates a geotiff. 
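
The reworked `crop_raster` below shells out to `gdal_translate` rather than masking with rasterio; a minimal sketch of the `-projwin` window it builds, with placeholder paths:

```python
import shlex
import shutil
import subprocess

import shapely

# gdal_translate takes -projwin as upper-left then lower-right corners
# (left, top, right, bottom), hence the reordering of the shapely bounds tuple.
# Input/output paths are placeholders; assumes gdal_translate is on PATH.
envelope = shapely.box(-1.0, 5.0, 1.0, 7.0)  # minx, miny, maxx, maxy
left, bottom, right, top = envelope.bounds
cmd = (
    f"{shutil.which('gdal_translate')} "
    f"-projwin {left} {top} {right} {bottom} "
    "-co COMPRESS=PACKBITS input.tif output.tif"
)
subprocess.run(shlex.split(cmd), capture_output=True, check=False)
```
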
- - __NOTE__ if the input raster CRS is not EPSG:4326 the boundary will be rep - - ::param raster_input_fpath str Absolute Filepath of input - ::param raster_output_fpath str Absolute Filepath of output - ::kwarg preserve_raster_crs bool If True the source raster CRS will be preserved in the result - (input boundary will be reprojected to source CRS before clip) + Crop a raster using GDAL translate """ - - import rasterio - import rasterio.mask + from osgeo import gdal import shapely - from shapely.ops import transform import pyproj + from shapely.ops import transform + import shlex + import subprocess + + # # Gather the resolution + inds = gdal.Open(raster_input_fpath) + + source_boundary_crs = pyproj.CRS("EPSG:4326") + target_boundary_crs = pyproj.crs.CRS.from_wkt(inds.GetProjection()) + if source_boundary_crs != target_boundary_crs: + # Reproject boundary to source raster for projwin + project = pyproj.Transformer.from_crs( + source_boundary_crs, target_boundary_crs, always_xy=True + ).transform + inshape = shapely.from_geojson(json.dumps(boundary["envelope_geojson"])) + shape = transform(project, inshape) + bounds = shape.bounds + else: + shape = shapely.from_geojson(json.dumps(boundary["envelope_geojson"])) + bounds = shape.bounds - # Create the path to output if it doesnt exist - os.makedirs(os.path.dirname(raster_output_fpath), exist_ok=True) - shape = shapely.from_geojson(json.dumps(boundary["envelope_geojson"])) - with rasterio.open(raster_input_fpath) as src: - # Project the source boundary () to source raster if requested output is to match source raster CRS - source_raster_epsg = ":".join(src.crs.to_authority()) - if preserve_raster_crs is True: - source_boundary_crs = pyproj.CRS("EPSG:4326") - target_boundary_crs = pyproj.CRS(source_raster_epsg) - - project = pyproj.Transformer.from_crs( - source_boundary_crs, target_boundary_crs, always_xy=True - ).transform - shape = transform(project, shape) - else: - # Abort if source raster is not matching 4326 - if source_raster_epsg != "EPSG:4326": - raise SourceRasterProjectionException( - f"Aborting unknown reproject - Source raster is {source_raster_epsg} and preserve_raster_crs is False" - ) - - out_image, out_transform = rasterio.mask.mask(src, [shape], crop=True) - out_meta = src.meta - - out_meta.update( - { - "driver": "GTiff", - "height": out_image.shape[1], - "width": out_image.shape[2], - "transform": out_transform, - } - ) - - with rasterio.open( - raster_output_fpath, "w", **out_meta, compress="PACKBITS" - ) as dest: - dest.write(out_image) + gdal_translate = shutil.which('gdal_translate') + if not gdal_translate: + raise Exception("gdal_translate not found") + cmd = f'{gdal_translate} -projwin {bounds[0]} {bounds[3]} {bounds[2]} {bounds[1]} {raster_input_fpath} {raster_output_fpath}' + # Add Creation Options + for creation_option in creation_options: + cmd = cmd + f' -co {creation_option}' + if debug is True: + print ("Raster Crop Command:", cmd) - return os.path.exists(raster_output_fpath) + result = subprocess.run(shlex.split(cmd), capture_output=True) + if debug is True: + print ("Raster Crop Result:", result) + return os.path.exists(raster_output_fpath) # VECTOR OPERATIONS @@ -504,17 +574,20 @@ def assert_vector_file( Optionally assert the data shape and CRS authority string - ::param fpath str Absolute filepath + ::arg fpath str Absolute filepath + ::kwarg expected_crs str CRS with authority - e.g. 
"EPSG:4326" """ - import geopandas as gp + import fiona - gdf = gp.read_file(fpath) - assert isinstance(gdf, gp.geodataframe.GeoDataFrame) - if expected_shape is not None: - assert gdf.shape == expected_shape, f"shape did not match expected: {gdf.shape}, {expected_shape}" - if expected_crs is not None: - crs = ":".join(gdf.crs.to_authority()) - assert crs == expected_crs, f"crs did not match expected: {crs}, {expected_crs}" + with fiona.open(fpath, 'r') as fptr: + if expected_shape is not None: + shape = (len(fptr), len(fptr.schema['properties'].keys()) + 1) # Add geom col to count of cols + assert ( + shape == expected_shape + ), f"shape did not match expected: {shape}, {expected_shape}" + if expected_crs is not None: + crs = ":".join(fptr.crs.to_authority()) + assert crs == expected_crs, f"crs did not match expected: {crs}, {expected_crs}" def ogr2ogr_load_shapefile_to_pg(shapefile_fpath: str, pg_uri: str): @@ -541,13 +614,14 @@ def copy_from_pg_table(pg_uri: str, sql: str, output_csv_fpath: str) -> int: ::returns filesize int """ import psycopg2 + sql = f"""COPY ({sql}) TO STDOUT WITH CSV HEADER""" with psycopg2.connect(dsn=pg_uri) as conn: - with open(output_csv_fpath, 'w') as fptr: + with open(output_csv_fpath, "w") as fptr: with conn.cursor() as cur: cur.copy_expert(sql, fptr) - with open(output_csv_fpath, 'rb') as fptr: - total_lines = sum(1 for i in fptr) - 1 # Remove header line + with open(output_csv_fpath, "rb") as fptr: + total_lines = sum(1 for i in fptr) - 1 # Remove header line return total_lines @@ -556,9 +630,9 @@ def crop_osm_to_geopkg( pg_uri: str, pg_table: str, output_fpath: str, - geometry_column: str = 'geom', + geometry_column: str = "geom", extract_type: str = "clip", - limit : int = None, + limit: int = None, batch_size: int = 1000, ) -> Generator: """ @@ -576,11 +650,11 @@ def crop_osm_to_geopkg( Either "intersect" - keep the entire intersecting feature in the output or "clip" includes only the clipped geometry in the output - ::returns Generator[int, int, int, int] + ::returns Generator[int, int, int, int] Progress yield: csv_line_count, current_idx, lines_success, lines_skipped, lines_failed """ import fiona - from fiona.crs import from_epsg as crs_from_epsg + from fiona.crs import CRS from shapely import from_wkt, to_geojson, from_wkb geojson = json.dumps(boundary["geojson"]) @@ -597,36 +671,40 @@ def crop_osm_to_geopkg( WHERE ST_Intersects({pg_table}.{geometry_column}, clip_geom.geometry) """ if limit is not None and int(limit): - stmt = f'{stmt} LIMIT {limit}' + stmt = f"{stmt} LIMIT {limit}" try: # Generate CSV using COPY command - tmp_csv_fpath = os.path.join(os.path.dirname(output_fpath), f'{time()}_tmp.csv') + tmp_csv_fpath = os.path.join(os.path.dirname(output_fpath), f"{time()}_tmp.csv") csv_line_count = copy_from_pg_table(pg_uri, stmt, tmp_csv_fpath) # Load CSV to geopkg - crs = crs_from_epsg(4326) + crs = CRS.from_epsg(4326) schema = { - 'geometry': 'LineString', - 'properties': OrderedDict({ - 'asset_id': 'float:16', - 'osm_way_id': 'str', - 'asset_type': 'str', - 'paved': 'bool', - 'material': 'str', - 'lanes': 'int', - '_asset_type': 'str', - 'rehab_cost_USD_per_km': 'float:16', - 'sector': 'str', - 'subsector': 'str', - 'tag_bridge': 'str', - 'bridge': 'bool', - 'wkt': 'str' - }) + "geometry": "LineString", + "properties": OrderedDict( + { + "asset_id": "float:16", + "osm_way_id": "str", + "asset_type": "str", + "paved": "bool", + "material": "str", + "lanes": "int", + "_asset_type": "str", + "rehab_cost_USD_per_km": "float:16", + "sector": 
"str", + "subsector": "str", + "tag_bridge": "str", + "bridge": "bool", + "wkt": "str", + } + ), } - template = {_k:None for _k, _ in schema['properties'].items()} - with fiona.open(output_fpath, 'w', driver='GPKG', crs=crs, schema=schema) as output: - with open(tmp_csv_fpath, newline='') as csvfile: - reader = csv.reader(csvfile, delimiter=',', quotechar='"') - next(reader, None) # Skip header + template = {_k: None for _k, _ in schema["properties"].items()} + with fiona.open( + output_fpath, "w", driver="GPKG", crs=crs, schema=schema + ) as output: + with open(tmp_csv_fpath, newline="") as csvfile: + reader = csv.reader(csvfile, delimiter=",", quotechar='"') + next(reader, None) # Skip header lines_skipped = 0 lines_failed = 0 lines_success = 0 @@ -636,43 +714,46 @@ def crop_osm_to_geopkg( data = json.loads(row[1]) outrow = {} geom = from_wkb(row[0]) - if geom.geom_type != 'LineString': - lines_skipped+=1 + if geom.geom_type != "LineString": + lines_skipped += 1 continue - outrow['geometry'] = json.loads(to_geojson(geom)) + outrow["geometry"] = json.loads(to_geojson(geom)) # Null missing fields - outrow['properties'] = OrderedDict(template | data) + outrow["properties"] = OrderedDict(template | data) batch.append(outrow) if len(batch) >= batch_size: output.writerecords(batch) output.flush() - lines_success+=len(batch) + lines_success += len(batch) batch = [] - yield csv_line_count, idx+1, lines_success, lines_skipped, lines_failed + yield csv_line_count, idx + 1, lines_success, lines_skipped, lines_failed except Exception as err: - warnings.warn(f'failed to load rows to due: {err}') + warnings.warn(f"failed to load rows to due: {err}") # Attempt to load everything in the batch apart from the failed row if batch: for outrow in batch: try: output.write(outrow) output.flush() - lines_success+=1 + lines_success += 1 except Exception as rowerr: - warnings.warn(f"failed to load row: {outrow} due to {rowerr}") - lines_failed+=1 + warnings.warn( + f"failed to load row: {outrow} due to {rowerr}" + ) + lines_failed += 1 finally: batch = [] # Final batch leftover if len(batch) > 0: output.writerecords(batch) - lines_success+=len(batch) - yield csv_line_count, idx+1, lines_success, lines_skipped, lines_failed + lines_success += len(batch) + yield csv_line_count, idx + 1, lines_success, lines_skipped, lines_failed finally: # Cleanup if os.path.exists(tmp_csv_fpath): os.remove(tmp_csv_fpath) - yield csv_line_count, idx+1, lines_success, lines_skipped, lines_failed + yield csv_line_count, idx + 1, lines_success, lines_skipped, lines_failed + def gdal_crop_pg_table_to_geopkg( boundary: Boundary, @@ -697,6 +778,7 @@ def gdal_crop_pg_table_to_geopkg( Defaults to "both" """ from osgeo import gdal + if debug: gdal.UseExceptions() gdal.SetConfigOption("CPL_DEBUG", "ON") @@ -727,26 +809,45 @@ def gdal_crop_pg_table_to_geopkg( ) gdal.VectorTranslate(output_fpath, ds, options=vector_options) -def gp_crop_file_to_geopkg( + +def fiona_crop_file_to_geopkg( input_fpath: str, boundary: Boundary, output_fpath: str, - mask_type: str = "boundary", + output_schema: dict, + output_crs: int = 4326 ) -> bool: """ - Geopandas - crop file by given boundary mask + Crop file by given boundary mask, streaming data from the given input to output GPKG. + + Intersects using Shapely.interects - ::kwarg mask_type str One of 'boundary' or 'envelope' - Crop the input file by the boundary, or the envolope of the boundary. + ::arg schema Fiona schema of format. 
Must match input schema, e.g.: + { + "geometry": "LineString", + "properties": OrderedDict( + { + "asset_id": "float:16", + "osm_way_id": "str", + "asset_type": "str", + ... + } + ), + } """ - import geopandas as gp - gdf_clipped = gp.read_file( - input_fpath, - mask=boundary["geojson"] if mask_type == "boundary" else boundary["envelope"], - ) - gdf_clipped.to_file(output_fpath) - return os.path.exists(output_fpath) + import fiona + from fiona.crs import CRS + import shapely + clip_geom = shapely.from_geojson(json.dumps(boundary['geojson'])) + with fiona.open( + output_fpath, "w", driver="GPKG", crs=CRS.from_epsg(output_crs), schema=output_schema + ) as fptr_output: + with fiona.open(input_fpath) as fptr_input: + for input_row in fptr_input: + if shapely.geometry.shape(input_row.geometry).intersects(clip_geom): + fptr_output.write(input_row) + return os.path.exists(output_fpath) def csv_to_gpkg( input_csv_fpath: str, @@ -764,26 +865,26 @@ def csv_to_gpkg( df = pd.read_csv( input_csv_fpath, dtype={ - "country" : str, - "country_long" : str, - "name" : str, - "gppd_idnr" : str, - "primary_fuel" : str, - "other_fuel1" : str, - "other_fuel2" : str, - "other_fuel3" : str, - "owner" : str, - "source" : str, - "url" : str, - "geolocation_source" : str, - "wepp_id" : str, - "generation_data_source" : str, - "estimated_generation_note_2013" : str, - "estimated_generation_note_2014" : str, - "estimated_generation_note_2015" : str, - "estimated_generation_note_2016" : str, - "estimated_generation_note_2017" : str, - } + "country": str, + "country_long": str, + "name": str, + "gppd_idnr": str, + "primary_fuel": str, + "other_fuel1": str, + "other_fuel2": str, + "other_fuel3": str, + "owner": str, + "source": str, + "url": str, + "geolocation_source": str, + "wepp_id": str, + "generation_data_source": str, + "estimated_generation_note_2013": str, + "estimated_generation_note_2014": str, + "estimated_generation_note_2015": str, + "estimated_generation_note_2016": str, + "estimated_generation_note_2017": str, + }, ) if not latitude_col in df.columns or not longitude_col in df.columns: raise Exception( diff --git a/dataproc/processors/core/gri_osm/roads_and_rail_version_1.py b/dataproc/processors/core/gri_osm/roads_and_rail_version_1.py index f5df9e5..b03bc69 100644 --- a/dataproc/processors/core/gri_osm/roads_and_rail_version_1.py +++ b/dataproc/processors/core/gri_osm/roads_and_rail_version_1.py @@ -15,8 +15,10 @@ processor_name_from_file, generate_index_file, generate_license_file, - generate_datapackage + generate_datapackage, + output_filename ) +from dataproc.exceptions import ProcessorDatasetExists from config import ( get_db_uri_ogr ) @@ -35,7 +37,23 @@ class Metadata(BaseMetadataABC): dataset_name = ( "gri_osm_road_and_rail" # The dataset this processor targets ) - data_author = "GRI/OSM" + data_author = "nismod/open-gira contributors and OpenStreetMap contributors" + data_title = "Road and Rail networks derived from OpenStreetMap" + data_title_long = "Road and Rail networks derived from OpenStreetMap" + data_summary = """ +OpenStreetMap provides map data, including on road and railway networks. +This dataset is a derived, processed extract from the global OpenStreetMap +database, produced by researchers at the University of Oxford to support +infrastructure systems analysis and climate risk and resilience assessments. 
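
The `fiona_crop_file_to_geopkg` helper added above replaces the geopandas-based crop; a usage sketch with placeholder boundary geometry and file paths:

```python
from collections import OrderedDict

from dataproc.helpers import fiona_crop_file_to_geopkg

# Placeholder boundary in the dataproc.Boundary dict shape (only "geojson" is used here)
boundary = {
    "geojson": {
        "type": "Polygon",
        "coordinates": [[[-1.0, 5.0], [1.0, 5.0], [1.0, 7.0], [-1.0, 7.0], [-1.0, 5.0]]],
    }
}
ok = fiona_crop_file_to_geopkg(
    "grid.gpkg",          # input vector file (placeholder)
    boundary,
    "grid-cropped.gpkg",  # output GeoPackage (placeholder)
    output_schema={"geometry": "LineString", "properties": OrderedDict({"source": "str"})},
    output_crs=4326,
)
```
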
+ +The data is produced from a snapshot of OpenStreetMap (the current version is +taken from November 2022) by a reproducible pipeline which is under development +and made freely available at https://github.com/nismod/open-gira. + """ + data_citation = """ +Russell T., Thomas F., nismod/open-gira contributors and OpenStreetMap contributors (2022) +Global Road and Rail networks derived from OpenStreetMap. [Dataset] Available at https://global.infrastructureresilience.org + """ data_license = DataPackageLicense( name="ODbL-1.0", title="Open Data Commons Open Database License 1.0", @@ -44,6 +62,7 @@ class Metadata(BaseMetadataABC): data_origin_url = ( "https://global.infrastructureresilience.org" ) + data_formats = ["Geopackage"] class Processor(BaseProcessorABC): @@ -68,20 +87,18 @@ def exists(self): self.boundary["name"], self.metadata.name, self.metadata.version, - f"{self.boundary['name']}.gpkg", + output_filename(self.metadata.name, self.metadata.version, self.boundary["name"], 'gpkg') ) def generate(self): """Generate files for a given processor""" if self.exists() is True: - self.provenance_log[self.metadata.name] = "exists" - return self.provenance_log + raise ProcessorDatasetExists() # Setup output path in the processing backend - output_folder = self.paths_helper.build_absolute_path( - self.boundary["name"], self.metadata.name, self.metadata.version, "outputs" + output_fpath = os.path.join( + self.tmp_processing_folder, + output_filename(self.metadata.name, self.metadata.version, self.boundary["name"], 'gpkg') ) - os.makedirs(output_folder, exist_ok=True) - output_fpath = os.path.join(output_folder, f"{self.boundary['name']}.gpkg") # Crop to given boundary self.update_progress(10, "cropping source") diff --git a/dataproc/processors/core/gridfinder/version_1.py b/dataproc/processors/core/gridfinder/version_1.py index 13f8383..d65c30a 100644 --- a/dataproc/processors/core/gridfinder/version_1.py +++ b/dataproc/processors/core/gridfinder/version_1.py @@ -1,5 +1,5 @@ """ -Test vector Processor +Gridfinder Processor """ import os @@ -7,8 +7,8 @@ from typing import List from dataproc import DataPackageLicense +from dataproc.exceptions import ProcessorDatasetExists from dataproc.processors.internal.base import BaseProcessorABC, BaseMetadataABC -from dataproc.exceptions import FolderNotFoundException from dataproc.helpers import ( processor_name_from_file, version_name_from_file, @@ -20,8 +20,9 @@ generate_datapackage, generate_license_file, fetch_zenodo_doi, - gp_crop_file_to_geopkg, + fiona_crop_file_to_geopkg, assert_vector_file, + output_filename ) @@ -38,13 +39,43 @@ class Metadata(BaseMetadataABC): inspect.stack()[1].filename ) # Version of the Processor dataset_name = "gridfinder" # The dataset this processor targets - data_author = "Arderne, Christopher; NIcolas, Claire; Zorn, Conrad; Koks, Elco E" + data_author = "Arderne, Christopher; Nicolas, Claire; Zorn, Conrad; Koks, Elco E" + data_title = "Gridfinder" + data_title_long = "Gridfinder data from 'Predictive mapping of the global power system using open data'" + data_summary = """ +Three primary global data outputs from the research: + +grid.gpkg: Vectorized predicted distribution and transmission line network, with existing OpenStreetMap lines tagged in the 'source' column +targets.tif: Binary raster showing locations predicted to be connected to distribution grid. +lv.tif: Raster of predicted low-voltage infrastructure in kilometres per cell. 
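
Both this processor and the gri_osm processor above now derive their output names from the `output_filename` helper; the pattern it produces, shown with placeholder values:

```python
from dataproc.helpers import output_filename

# Illustrative placeholder arguments; the pattern is
# <dataset_name>-<version>[-<subfilename>]-<boundary_name>.<extension>
print(output_filename("gri_osm_road_and_rail", "version_1", "ghana", "gpkg"))
# gri_osm_road_and_rail-version_1-ghana.gpkg
print(output_filename("gridfinder", "version_1", "ghana", ".tif", dataset_subfilename="targets"))
# gridfinder-version_1-targets-ghana.tif
```
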
+ +This data was created with code in the following three repositories: + +https://github.com/carderne/gridfinder +https://github.com/carderne/predictive-mapping-global-power +https://github.com/carderne/access-estimator + +Full steps to reproduce are contained in this file: + +https://github.com/carderne/predictive-mapping-global-power/blob/master/README.md + +The data can be visualized at the following location: + +https://gridfinder.org + """ + data_citation = """ +Arderne, Christopher, Nicolas, Claire, Zorn, Conrad, & Koks, Elco E. (2020). +Data from: Predictive mapping of the global power system using open data [Data +set]. In Nature Scientific Data (1.1.1, Vol. 7, Number Article 19). Zenodo. +https://doi.org/10.5281/zenodo.3628142 +""" data_license = DataPackageLicense( name="CC-BY-4.0", title="Creative Commons Attribution 4.0", path="https://creativecommons.org/licenses/by/4.0/", ) data_origin_url = "https://doi.org/10.5281/zenodo.3628142" + data_formats = ["Geopackage", "GeoTIFF"] class Processor(BaseProcessorABC): @@ -65,15 +96,14 @@ def exists(self): self.metadata.version, datafile_ext=".tif", ) - except FolderNotFoundException: + except FileNotFoundError: return False return count_on_backend == self.total_expected_files def generate(self): """Generate files for a given processor""" if self.exists() is True: - self.provenance_log[self.metadata.name] = "exists" - return self.provenance_log + raise ProcessorDatasetExists() else: # Ensure we start with a blank output folder on the storage backend try: @@ -82,7 +112,7 @@ def generate(self): self.metadata.name, self.metadata.version, ) - except FolderNotFoundException: + except FileNotFoundError: pass # Check if the source TIFF exists and fetch it if not self.update_progress(10, "fetching and verifying source") @@ -94,18 +124,31 @@ def generate(self): self.update_progress( 10 + int(idx * (80 / len(source_fpaths))), "cropping source" ) + + subfilename = os.path.splitext(os.path.basename(source_fpath))[0] + file_format = os.path.splitext(os.path.basename(source_fpath))[1] + output_fpath = os.path.join( - self.tmp_processing_folder, os.path.basename(source_fpath) + self.tmp_processing_folder, + output_filename( + self.metadata.name, + self.metadata.version, + self.boundary["name"], + file_format, + dataset_subfilename=subfilename + ) ) - if os.path.splitext(os.path.basename(source_fpath))[1] == ".tif": + if file_format == ".tif": crop_success = crop_raster( - source_fpath, output_fpath, self.boundary, preserve_raster_crs=True + source_fpath, output_fpath, self.boundary ) - elif os.path.splitext(os.path.basename(source_fpath))[1] == ".gpkg": - crop_success = gp_crop_file_to_geopkg( + elif file_format == ".gpkg": + crop_success = fiona_crop_file_to_geopkg( source_fpath, self.boundary, output_fpath, + output_schema = {'properties': {'source': 'str'}, 'geometry': 'LineString'}, + output_crs=4326 ) else: continue diff --git a/dataproc/processors/core/isimp_drought/__init__.py b/dataproc/processors/core/isimp_drought/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataproc/processors/core/isimp_drought/helpers.py b/dataproc/processors/core/isimp_drought/helpers.py new file mode 100644 index 0000000..bb3143d --- /dev/null +++ b/dataproc/processors/core/isimp_drought/helpers.py @@ -0,0 +1,251 @@ +""" +Singletons for ISIMP Drought V1 +""" + +VERSION_1_SOURCE_FILES = [ + "lange2020_clm45_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + 
"lange2020_lpjml_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_clm45_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_pcr-globwb_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_hwmid-humidex_hadgem2-es_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_clm45_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_pcr-globwb_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_orchidee_miroc5_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_lpjml_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_pcr-globwb_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_miroc5_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_pcr-globwb_ipsl-cm5a-lr_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_jules-w1_hadgem2-es_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_h08_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_hadgem2-es_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_watergap2_miroc5_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_mpi-hm_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_mpi-hm_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_mpi-hm_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_jules-w1_ipsl-cm5a-lr_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_orchidee_hadgem2-es_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_gfdl-esm2m_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_h08_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_hwmid-humidex_ipsl-cm5a-lr_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_jules-w1_gfdl-esm2m_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_clm45_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_watergap2_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_h08_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_jules-w1_ipsl-cm5a-lr_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_orchidee_hadgem2-es_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_h08_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_watergap2_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + 
"lange2020_pcr-globwb_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_mpi-hm_miroc5_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_pcr-globwb_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_clm45_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_pcr-globwb_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_clm45_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_pcr-globwb_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_clm45_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_lpjml_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_lpjml_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_hwmid-humidex_ipsl-cm5a-lr_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_orchidee_miroc5_ewembi_historical_nosoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_clm45_hadgem2-es_ewembi_historical_2005soc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_h08_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_watergap2_gfdl-esm2m_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_clm45_miroc5_ewembi_historical_2005soc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_hwmid-humidex_gfdl-esm2m_ewembi_historical_nosoc_co2_leh_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_mpi-hm_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_mpi-hm_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_mpi-hm_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_jules-w1_miroc5_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_pcr-globwb_miroc5_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_orchidee_gfdl-esm2m_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_h08_gfdl-esm2m_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_hwmid-humidex_hadgem2-es_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_pcr-globwb_gfdl-esm2m_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_watergap2_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_watergap2_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_h08_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_clm45_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_jules-w1_ipsl-cm5a-lr_ewembi_historical_nosoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_h08_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + 
"lange2020_jules-w1_hadgem2-es_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_mpi-hm_gfdl-esm2m_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_mpi-hm_ipsl-cm5a-lr_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_orchidee_ipsl-cm5a-lr_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_lpjml_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_miroc5_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_lpjml_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_h08_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_clm45_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_jules-w1_gfdl-esm2m_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_ipsl-cm5a-lr_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_hwmid-humidex_gfdl-esm2m_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_orchidee_hadgem2-es_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_orchidee_ipsl-cm5a-lr_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_miroc5_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_orchidee_hadgem2-es_ewembi_historical_nosoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_clm45_ipsl-cm5a-lr_ewembi_historical_2005soc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_orchidee_miroc5_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_orchidee_gfdl-esm2m_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_pcr-globwb_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_clm45_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_pcr-globwb_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_h08_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_jules-w1_miroc5_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_h08_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_watergap2_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_jules-w1_hadgem2-es_ewembi_historical_nosoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_hwmid-humidex_miroc5_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2030_occurrence.tif", + 
"lange2020_lpjml_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_orchidee_ipsl-cm5a-lr_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_jules-w1_hadgem2-es_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_mpi-hm_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_jules-w1_miroc5_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_mpi-hm_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_mpi-hm_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_watergap2_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_hadgem2-es_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_orchidee_miroc5_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_orchidee_gfdl-esm2m_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_jules-w1_gfdl-esm2m_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_h08_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_watergap2_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_jules-w1_ipsl-cm5a-lr_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_watergap2_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_h08_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_hwmid-humidex_gfdl-esm2m_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_pcr-globwb_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_clm45_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_lpjml_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_pcr-globwb_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_clm45_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_lpjml_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_pcr-globwb_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_miroc5_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_jules-w1_gfdl-esm2m_ewembi_historical_nosoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_lpjml_gfdl-esm2m_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_jules-w1_hadgem2-es_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_h08_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_orchidee_ipsl-cm5a-lr_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_pcr-globwb_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + 
"lange2020_clm45_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_clm45_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_pcr-globwb_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_lpjml_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_orchidee_miroc5_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_pcr-globwb_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_orchidee_gfdl-esm2m_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_hadgem2-es_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_pcr-globwb_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_clm45_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_jules-w1_miroc5_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_h08_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_watergap2_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_lpjml_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_h08_miroc5_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_h08_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_watergap2_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_h08_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_clm45_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_jules-w1_gfdl-esm2m_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_mpi-hm_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_watergap2_ipsl-cm5a-lr_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_hwmid-humidex_gfdl-esm2m_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_mpi-hm_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_jules-w1_ipsl-cm5a-lr_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_mpi-hm_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_jules-w1_gfdl-esm2m_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_ipsl-cm5a-lr_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_h08_hadgem2-es_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_pcr-globwb_hadgem2-es_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_h08_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + 
"lange2020_orchidee_hadgem2-es_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_hwmid-humidex_gfdl-esm2m_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_watergap2_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_clm45_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_pcr-globwb_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_pcr-globwb_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_clm45_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_clm45_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_pcr-globwb_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_clm45_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_orchidee_gfdl-esm2m_ewembi_historical_nosoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_watergap2_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_h08_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_h08_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_orchidee_ipsl-cm5a-lr_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_mpi-hm_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_mpi-hm_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_jules-w1_miroc5_ewembi_historical_nosoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_clm45_gfdl-esm2m_ewembi_historical_2005soc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_mpi-hm_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_jules-w1_miroc5_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_hadgem2-es_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_orchidee_miroc5_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_orchidee_gfdl-esm2m_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_mpi-hm_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_jules-w1_ipsl-cm5a-lr_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_mpi-hm_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_orchidee_hadgem2-es_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_mpi-hm_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + 
"lange2020_hwmid-humidex_ipsl-cm5a-lr_ewembi_historical_nosoc_co2_leh_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_h08_ipsl-cm5a-lr_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_lpjml_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_hwmid-humidex_ipsl-cm5a-lr_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_h08_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_hadgem2-es_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_clm45_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_pcr-globwb_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_orchidee_gfdl-esm2m_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_lpjml_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_clm45_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_pcr-globwb_miroc5_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_lpjml_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_lpjml_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_pcr-globwb_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_watergap2_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_h08_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_hwmid-humidex_hadgem2-es_ewembi_historical_nosoc_co2_leh_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_watergap2_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_jules-w1_miroc5_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_orchidee_ipsl-cm5a-lr_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_jules-w1_hadgem2-es_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_miroc5_ewembi_historical_nosoc_co2_leh_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_h08_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_miroc5_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_orchidee_miroc5_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_hwmid-humidex_hadgem2-es_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_ipsl-cm5a-lr_ewembi_historical_histsoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_h08_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_clm45_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_lpjml_ipsl-cm5a-lr_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + 
"lange2020_hwmid-humidex_miroc5_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_jules-w1_hadgem2-es_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_hadgem2-es_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_orchidee_ipsl-cm5a-lr_ewembi_historical_nosoc_co2_led_global_annual_1861_2005_baseline_occurrence.tif", + "lange2020_hwmid-humidex_gfdl-esm2m_ewembi_rcp26_nosoc_co2_leh_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_jules-w1_ipsl-cm5a-lr_ewembi_rcp60_nosoc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_h08_ipsl-cm5a-lr_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_watergap2_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_orchidee_hadgem2-es_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_clm45_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_lpjml_gfdl-esm2m_ewembi_rcp26_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_pcr-globwb_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif", + "lange2020_pcr-globwb_hadgem2-es_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_hwmid-humidex_ipsl-cm5a-lr_ewembi_rcp60_nosoc_co2_leh_global_annual_2006_2099_2050_occurrence.tif", + "lange2020_jules-w1_gfdl-esm2m_ewembi_rcp26_nosoc_co2_led_global_annual_2006_2099_2030_occurrence.tif", +] diff --git a/dataproc/processors/core/isimp_drought/templates/version_1/index.html b/dataproc/processors/core/isimp_drought/templates/version_1/index.html new file mode 100644 index 0000000..408cba9 --- /dev/null +++ b/dataproc/processors/core/isimp_drought/templates/version_1/index.html @@ -0,0 +1,23 @@ + + + + + CC0 + + + + + + +

CC0

+ +

CC0 - CC0 enables scientists, educators, artists and other creators and owners of copyright- or + database-protected content to waive those interests in their works and thereby place them as completely as + possible in the public domain, so that others may freely build upon, enhance and reuse the works for any + purposes without restriction under copyright or database law. https://creativecommons.org/share-your-work/public-domain/cc0/ +

+ + + + \ No newline at end of file diff --git a/dataproc/processors/core/isimp_drought/templates/version_1/license.html b/dataproc/processors/core/isimp_drought/templates/version_1/license.html new file mode 100644 index 0000000..fc55292 --- /dev/null +++ b/dataproc/processors/core/isimp_drought/templates/version_1/license.html @@ -0,0 +1,20 @@ + + + + + CC0 1.0 Universal Public Domain Dedication + + + + + + +

CC0 1.0 Universal Public Domain Dedication

+ +

ISIMIP extreme event occurrence data (Lange et al. 2020), from which this dataset is derived, are made available under the + CC0 1.0 Universal Public Domain Dedication: ./license.html or https://creativecommons.org/share-your-work/public-domain/cc0/

+ + + + \ No newline at end of file diff --git a/dataproc/processors/core/isimp_drought/version_1.py b/dataproc/processors/core/isimp_drought/version_1.py new file mode 100644 index 0000000..d89b005 --- /dev/null +++ b/dataproc/processors/core/isimp_drought/version_1.py @@ -0,0 +1,297 @@ +""" +ISIMIP Drought V1 Processor +""" + +import os +import inspect +import shutil +from typing import List + +from dataproc import DataPackageLicense +from dataproc.exceptions import ProcessorDatasetExists +from dataproc.processors.internal.base import BaseProcessorABC, BaseMetadataABC +from dataproc.helpers import ( + processor_name_from_file, + tiffs_in_folder, + version_name_from_file, + crop_raster, + assert_geotiff, + data_file_hash, + data_file_size, + generate_index_file, + generate_datapackage, + generate_license_file, + fetch_zenodo_doi, + output_filename, + unpack_zip, +) +from .helpers import VERSION_1_SOURCE_FILES + + +class Metadata(BaseMetadataABC): + """ + Processor metadata + """ + + name = processor_name_from_file( + inspect.stack()[1].filename + ) # this must follow snakecase formatting, without special chars + description = "ISIMIP Drought v1 processor" # Longer processor description + version = version_name_from_file( + inspect.stack()[1].filename + ) # Version of the Processor + dataset_name = "ISIMIP Drought" # The dataset this processor targets + data_author = "Lange, S., Volkholz, J., Geiger, T., Zhao, F., Vega, I., Veldkamp, T., et al. (2020)" + data_title = "ISIMIP Drought" + data_title_long = "Annual probability of extreme heat and drought events, derived from Lange et al. 2020" + data_summary = """ +The time series of extreme events given by Lange et al. has been processed into an annual probability of occurrence by researchers at the University of Oxford, using the pipeline available online at https://github.com/nismod/infra-risk-vis/blob/45d8974c311067141ee6fcaa1321c7ecdaa59752/etl/pipelines/isimip/Snakefile - this is a draft dataset, used for visualisation in https://global.infrastructureresilience.org/ but not otherwise reviewed or published. + +If you use this, please cite: Lange, S., Volkholz, J., Geiger, T., Zhao, F., Vega, I., Veldkamp, T., et al. (2020). Projecting exposure to extreme climate impact events across six event categories and three spatial scales. Earth's Future, 8, e2020EF001616. DOI 10.1029/2020EF001616 + +This is shared under a CC0 1.0 Universal Public Domain Dedication (CC0 1.0). When using ISIMIP data for your research, please appropriately credit the data providers, e.g. either by citing the DOI for the dataset, or by appropriate acknowledgment. + +Annual probability of drought (soil moisture below a baseline threshold) or extreme heat (temperature and humidity-based indicators over a threshold) events on a 0.5° grid. 8 hydrological models forced by 4 GCMs under baseline, RCP 2.6 & 6.0 emission scenarios. Current and future maps in 2030, 2050 and 2080. + +The ISIMIP2b climate input data and impact model output data analyzed in this study are available in the ISIMIP data repository at ESGF, see https://esg.pik-potsdam.de/search/isimip/?project=ISIMIP2b&product=input and https://esg.pik-potsdam.de/search/isimip/?project=ISIMIP2b&product=output, respectively. More information about the GHM, GGCM, and GVM output data is provided by Gosling et al. (2020), Arneth et al. (2020), and Reyer et al. (2019), respectively. + +Event definitions are given in Lange et al., table 1. 
Land area is exposed to drought if monthly soil moisture falls below the 2.5th percentile of the preindustrial baseline distribution for at least seven consecutive months. Land area is exposed to extreme heat if both a relative indicator based on temperature (Russo et al. 2015, 2017) and an absolute indicator based on temperature and relative humidity (Masterton & Richardson, 1979) exceed their respective threshold value. + """ + data_citation = """ +Lange, S., Volkholz, J., Geiger, T., Zhao, F., Vega, I., Veldkamp, T., et al. (2020). Projecting exposure to extreme climate impact events across six event categories and three spatial scales. Earth's Future, 8, e2020EF001616. DOI 10.1029/2020EF001616 + """ + data_license = DataPackageLicense( + name="CC0", + title="CC0", + path="https://creativecommons.org/share-your-work/public-domain/cc0/", + ) + data_origin_url = "https://doi.org/10.5281/zenodo.7732393" + data_formats = ["GeoTIFF"] + + +class Processor(BaseProcessorABC): + """A Processor for ISIMIP Drought V1""" + + zenodo_doi = "10.5281/zenodo.7732393" + source_files = VERSION_1_SOURCE_FILES + total_expected_files = len(source_files) + index_filename = "index.html" + license_filename = "license.html" + + def exists(self): + """Whether all output files for a given processor & boundary exist on the FS or not""" + try: + count_on_backend = self.storage_backend.count_boundary_data_files( + self.boundary["name"], + self.metadata.name, + self.metadata.version, + datafile_ext=".tif", + ) + except FileNotFoundError: + return False + return count_on_backend == self.total_expected_files + + def generate(self): + """Generate files for a given processor""" + if self.exists() is True: + raise ProcessorDatasetExists() + else: + # Ensure we start with a blank output folder on the storage backend + try: + self.storage_backend.remove_boundary_data_files( + self.boundary["name"], + self.metadata.name, + self.metadata.version, + ) + except FileNotFoundError: + pass + # Check if the source TIFF exists and fetch it if not + self.update_progress(10, "fetching and verifying source") + source_fpaths = self._fetch_source() + + self.log.debug("%s - cropping source", self.metadata.name) + results_fpaths = [] + for idx, source_fpath in enumerate(source_fpaths): + self.update_progress( + 10 + int(idx * (80 / len(source_fpaths))), "cropping source" + ) + + subfilename = os.path.splitext(os.path.basename(source_fpath))[0] + file_format = os.path.splitext(os.path.basename(source_fpath))[1] + + output_fpath = os.path.join( + self.tmp_processing_folder, + output_filename( + self.metadata.name, + self.metadata.version, + self.boundary["name"], + file_format, + dataset_subfilename=subfilename, + ), + ) + crop_success = crop_raster(source_fpath, output_fpath, self.boundary) + + self.log.debug( + "%s crop %s - success: %s", + self.metadata.name, + os.path.basename(source_fpath), + crop_success, + ) + if crop_success: + results_fpaths.append( + { + "fpath": output_fpath, + "hash": data_file_hash(output_fpath), + "size": data_file_size(output_fpath), + } + ) + # Check results look sensible + assert ( + len(results_fpaths) == self.total_expected_files + ), f"{self.metadata.name} - number of successfully cropped files {len(results_fpaths)} does not match expected {self.total_expected_files}" + + self.update_progress(85, "moving result") + self.log.debug("%s - moving cropped data to backend", self.metadata.name) + result_uris = [] + for result in results_fpaths: + result_uri = self.storage_backend.put_processor_data( + 
result["fpath"], + self.boundary["name"], + self.metadata.name, + self.metadata.version, + ) + result_uris.append(result_uri) + self.provenance_log[f"{self.metadata.name} - move to storage success"] = ( + len(result_uris) == self.total_expected_files + ) + self.provenance_log[f"{self.metadata.name} - result URIs"] = ",".join( + result_uris + ) + + # Generate documentation on backend + self.update_progress(90, "generate documentation & datapackage") + self.generate_documentation() + + # Generate datapackage in log (using directory for URI) + datapkg = generate_datapackage( + self.metadata, + result_uris, + "GeoTIFF", + [i["size"] for i in results_fpaths], + [i["hash"] for i in results_fpaths], + ) + self.provenance_log["datapackage"] = datapkg + self.log.debug( + "%s generated datapackage in log: %s", self.metadata.name, datapkg + ) + + return self.provenance_log + + def generate_documentation(self): + """Generate documentation for the processor + on the result backend""" + # Generate Documentation + index_fpath = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "templates", + self.metadata.version, + self.index_filename, + ) + index_create = generate_index_file( + self.storage_backend, index_fpath, self.boundary["name"], self.metadata + ) + self.provenance_log[ + f"{self.metadata.name} - created index documentation" + ] = index_create + license_fpath = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "templates", + self.metadata.version, + self.license_filename, + ) + license_create = generate_license_file( + self.storage_backend, license_fpath, self.boundary["name"], self.metadata + ) + self.provenance_log[ + f"{self.metadata.name} - created license documentation" + ] = license_create + self.log.debug("%s generated documentation on backend", self.metadata.name) + + def _fetch_source(self) -> List[str]: + """ + Fetch and unpack the required source data if required. 
+ + ::returns source_fpaths List[str] Filepaths of all source data + """ + # Build Source Path + os.makedirs(self.source_folder, exist_ok=True) + if self._all_source_exists(): + self.log.debug( + "%s - all source files appear to exist and are valid", + self.metadata.name, + ) + return [ + os.path.join(self.source_folder, _file) for _file in self.source_files + ] + else: + downloaded_files = fetch_zenodo_doi( + self.zenodo_doi, self.source_folder, return_only_tifs=False + ) + # Should be only one zip + try: + downloaded_zip = [ + i + for i in downloaded_files + if os.path.basename(i) == "lange2020_expected_occurrence.zip" + ][0] + except IndexError: + self.log.error( + "after %s download - required zip file lange2020_expected_occurrence.zip was not present", + self.metadata.name, + ) + raise Exception(f"{self.metadata.name} download failed") + # Unpack zip + unpack_zip(downloaded_zip, self.source_folder) + # Moved nested tiffs up to source folder + for tiff_fpath in tiffs_in_folder( + os.path.join(self.source_folder, "lange2020_expected_occurrence"), + full_paths=True, + ): + shutil.move(tiff_fpath, self.source_folder) + shutil.rmtree( + os.path.join(self.source_folder, "lange2020_expected_occurrence"), + ignore_errors=True, + ) + # Count the Tiffs + self.log.debug("%s - Download Complete", self.metadata.name) + assert ( + self._all_source_exists() + ), f"after {self.metadata.name} download - not all source files were present" + # Filter to just the files we support + return [ + os.path.join(self.source_folder, _file) for _file in self.source_files + ] + + def _all_source_exists(self, remove_invalid=True) -> bool: + """ + Check if all source files exist and are valid + If not source will be removed + """ + source_valid = [True for _ in range(len(self.source_files))] + for idx, _file in enumerate(self.source_files): + fpath = os.path.join(self.source_folder, _file) + try: + assert_geotiff(fpath, check_compression=False, check_crs=None) + except Exception as err: + # remove the file and flag we should need to re-fetch, then move on + self.log.warning( + "%s source file %s appears to be invalid due to %s", + self.metadata.name, + fpath, + err, + ) + if remove_invalid: + if os.path.exists(fpath): + os.remove(fpath) + source_valid[idx] = False + return all(source_valid) diff --git a/dataproc/processors/core/jrc_ghsl_built_c/r2022_epoch2018_10m_mszfun.py b/dataproc/processors/core/jrc_ghsl_built_c/r2022_epoch2018_10m_mszfun.py index afb9d03..72f9434 100644 --- a/dataproc/processors/core/jrc_ghsl_built_c/r2022_epoch2018_10m_mszfun.py +++ b/dataproc/processors/core/jrc_ghsl_built_c/r2022_epoch2018_10m_mszfun.py @@ -8,6 +8,7 @@ from typing import List from celery.app import task +from dataproc.exceptions import ProcessorDatasetExists from dataproc.processors.internal.base import ( BaseProcessorABC, @@ -25,8 +26,8 @@ generate_index_file, generate_license_file, generate_datapackage, + output_filename ) -from dataproc.exceptions import FolderNotFoundException from dataproc.processors.core.jrc_ghsl_built_c.helpers import JRCBuiltCFetcher @@ -39,20 +40,58 @@ class Metadata(BaseMetadataABC): inspect.stack()[1].filename ) # this must follow snakecase formatting, without special chars description = """ - A Processor for JRC GHSL Built-Up Characteristics - - R2022 release, Epoch 2018, 10m resolution, Morphological Settlement Zone and Functional classification +A Processor for JRC GHSL Built-Up Characteristics - +R2022 release, Epoch 2018, 10m resolution, Morphological Settlement Zone and Functional 
classification """ # Longer processor description version = version_name_from_file( inspect.stack()[1].filename ) # Version of the Processor dataset_name = "r2022_epoch2018_10m_mszfun" # The dataset this processor targets data_author = "Joint Research Centre" + data_title = "GHS-BUILT-C MSZ and FC, R2022 E2018 10m" + data_title_long = "JRC Global Human Settlement Layer - Built-Up Characteristics (GHS-BUILT-C - MSZ & FC) - Release 2022 - Epoch 2018 - 10m resolution - Morphological Settlement Zone & Functional Classification" + data_summary = """ +The spatial raster dataset delineates the boundaries of the human settlements at +10m resolution, and describe their inner characteristics in terms of the +morphology of the built environment and the functional use. The Morphological +Settlement Zone (MSZ) delineates the spatial domain of all the human settlements +at the neighboring scale of approx. 100m, based on the spatial generalization of +the built-up surface fraction (BUFRAC) function. The objective is to fill the +open spaces that are surrounded by large patches of built space. MSZ, open +spaces, and built spaces basic class abstractions are derived by mathematical +morphology spatial filtering (opening, closing, regional maxima) from the BUFRAC +function. They are further classified according to the information regarding +vegetation intensity (GHS-BUILT-C_VEG_GLOBE_R2022A), water surfaces +(GHS_LAND_GLOBE_R2022A), road surfaces (OSM highways), functional use +(GHS-BUILT-C_FUN_GLOBE_R2022A), and building height (GHS-BUILT-H_GLOBE_R2022A). + +The main characteristics of this dataset are listed below. The complete +information about the GHSL main products can be found in the GHSL Data Package +2022 report (10.33 MB): +https://ghsl.jrc.ec.europa.eu/documents/GHSL_Data_Package_2022.pdf + """ + data_citation = """ +Dataset: + +Pesaresi M., P. Panagiotis (2022): GHS-BUILT-C R2022A - GHS Settlement +Characteristics, derived from Sentinel2 composite (2018) and other GHS R2022A +data.European Commission, Joint Research Centre (JRC) PID: +http://data.europa.eu/89h/dde11594-2a66-4c1b-9a19-821382aed36e, +doi:10.2905/DDE11594-2A66-4C1B-9A19-821382AED36E + +Concept & Methodology: + +Schiavina M., Melchiorri M., Pesaresi M., Politis P., Freire S., Maffenini L., +Florio P., Ehrlich D., Goch K., Tommasi P., Kemper T. 
GHSL Data Package 2022, +JRC 129516, ISBN 978-92-76-53071-8 doi:10.2760/19817 + """ data_license = DataPackageLicense( name="CC-BY-4.0", title="Creative Commons Attribution 4.0", path="https://creativecommons.org/licenses/by/4.0/", ) data_origin_url = "https://ghsl.jrc.ec.europa.eu/download.php?ds=builtC" + data_formats = ["GeoTIFF"] class Processor(BaseProcessorABC): @@ -88,15 +127,14 @@ def exists(self): self.metadata.version, datafile_ext=".tif", ) - except FolderNotFoundException: + except FileNotFoundError: return False return count_on_backend == self.total_expected_files def generate(self): """Generate files for a given processor""" if self.exists() is True: - self.provenance_log[self.metadata.name] = "exists" - return self.provenance_log + raise ProcessorDatasetExists() else: # Ensure we start with a blank output folder on the storage backend try: @@ -105,10 +143,8 @@ def generate(self): self.metadata.name, self.metadata.version, ) - except FolderNotFoundException: + except FileNotFoundError: pass - # Cleanup anything in tmp processing - self._clean_tmp_processing() # Check if the source TIFF exists and fetch it if not self.log.debug( "%s - collecting source geotiffs into %s", @@ -122,14 +158,27 @@ def generate(self): results_fpaths = [] for idx, source_fpath in enumerate(source_fpaths): self.update_progress( - 10 + int(idx * (80 / len(source_fpaths))), "cropping source" + 20 + int(idx * (80 / len(source_fpaths))), "cropping source" ) output_fpath = os.path.join( self.tmp_processing_folder, os.path.basename(source_fpath) ) - # Crop Source - preserve Molleweide + output_fpath = os.path.join( + self.tmp_processing_folder, + output_filename( + self.metadata.name, + self.metadata.version, + self.boundary["name"], + 'tif', + dataset_subfilename=os.path.splitext(os.path.basename(source_fpath))[0] + ) + ) + # Crop Source - preserve Molleweide, assume we'll need BIGTIFF for this dataset crop_success = crop_raster( - source_fpath, output_fpath, self.boundary, preserve_raster_crs=True + source_fpath, + output_fpath, + self.boundary, + creation_options=["COMPRESS=DEFLATE", "PREDICTOR=2", "ZLEVEL=6", "BIGTIFF=YES"], ) self.log.debug( "%s %s - success: %s", @@ -137,6 +186,9 @@ def generate(self): os.path.basename(source_fpath), crop_success, ) + self.update_progress( + 20 + int(idx * (80 / len(source_fpaths))), "generating hash" + ) if crop_success: results_fpaths.append( { @@ -221,14 +273,6 @@ def generate_documentation(self): ] = license_create self.log.debug("%s generated documentation on backend", self.metadata.name) - def _clean_tmp_processing(self): - """Remove the tmp processing folder and recreate""" - # Remove partial previous tmp results if they exist - if os.path.exists(self.tmp_processing_folder): - shutil.rmtree(self.tmp_processing_folder) - # Generate the tmp output directory - os.makedirs(self.tmp_processing_folder, exist_ok=True) - def _fetch_source(self) -> List[str]: """ Fetch and unpack the required source data if required. 
diff --git a/dataproc/processors/core/jrc_ghsl_population/r2022_epoch2020_1km.py b/dataproc/processors/core/jrc_ghsl_population/r2022_epoch2020_1km.py index 7d0b921..7c36a17 100644 --- a/dataproc/processors/core/jrc_ghsl_population/r2022_epoch2020_1km.py +++ b/dataproc/processors/core/jrc_ghsl_population/r2022_epoch2020_1km.py @@ -5,6 +5,7 @@ import os import inspect import shutil +from dataproc.exceptions import ProcessorDatasetExists from dataproc.processors.internal.base import ( BaseProcessorABC, @@ -21,8 +22,8 @@ generate_datapackage, generate_index_file, generate_license_file, + output_filename ) -from dataproc.exceptions import FolderNotFoundException from dataproc.processors.core.jrc_ghsl_population.helpers import JRCPopFetcher @@ -40,12 +41,43 @@ class Metadata(BaseMetadataABC): ) # Version of the Processor dataset_name = "r2022_epoch2020_1km" # The dataset this processor targets data_author = "Joint Research Centre" + data_title = "GHS-POP - R2022A" + data_title_long = "GHS-POP R2022A - extract from GHS population grid for 2020, 1km resolution" + data_summary = """ +The spatial raster dataset depicts the distribution of residential population, +expressed as the number of people per cell. Residential population estimates +between 1975 and 2020 in 5-year intervals and projections to 2025 and 2030 +derived from CIESIN GPWv4.11 were disaggregated from census or administrative +units to grid cells, informed by the distribution, density, and classification +of built-up as mapped in the Global Human Settlement Layer (GHSL) global layer +per corresponding epoch. + +The complete information about the GHSL main products can be found in the GHSL +Data Package 2022 report (10.33 MB): +https://ghsl.jrc.ec.europa.eu/documents/GHSL_Data_Package_2022.pdf + """ + data_citation = """ +Dataset: + +Schiavina M., Freire S., MacManus K. (2022): GHS-POP R2022A - GHS population +grid multitemporal (1975-2030).European Commission, Joint Research Centre (JRC) +PID: http://data.europa.eu/89h/d6d86a90-4351-4508-99c1-cb074b022c4a, +doi:10.2905/D6D86A90-4351-4508-99C1-CB074B022C4A + +Concept & Methodology: + +Freire S., MacManus K., Pesaresi M., Doxsey-Whitfield E., Mills J. (2016) +Development of new open and free multi-temporal global population grids at 250 m +resolution. 
Geospatial Data in a Changing World; Association of Geographic +Information Laboratories in Europe (AGILE), AGILE 2016 + """ data_license = DataPackageLicense( name="CC-BY-4.0", title="Creative Commons Attribution 4.0", path="https://creativecommons.org/licenses/by/4.0/", ) data_origin_url = "https://ghsl.jrc.ec.europa.eu/download.php?ds=pop" + data_formats = ["GeoTIFF"] class Processor(BaseProcessorABC): @@ -66,15 +98,14 @@ def exists(self): self.metadata.version, datafile_ext=".tif", ) - except FolderNotFoundException: + except FileNotFoundError: return False return count_on_backend == self.total_expected_files def generate(self): """Generate files for a given processor""" if self.exists() is True: - self.provenance_log[self.metadata.name] = "exists" - return self.provenance_log + raise ProcessorDatasetExists() else: # Ensure we start with a blank output folder on the storage backend try: @@ -83,10 +114,8 @@ def generate(self): self.metadata.name, self.metadata.version, ) - except FolderNotFoundException: + except FileNotFoundError: pass - # Cleanup anything in tmp processing - self._clean_tmp_processing() # Check if the source TIFF exists and fetch it if not self.log.debug( "%s - collecting source geotiffs into %s", @@ -96,13 +125,15 @@ def generate(self): self.update_progress(10, "fetching and verifying source") source_fpath = self._fetch_source() output_fpath = os.path.join( - self.tmp_processing_folder, os.path.basename(source_fpath) + self.tmp_processing_folder, + output_filename(self.metadata.name, self.metadata.version, self.boundary["name"], 'tif') ) # Crop Source - preserve Molleweide self.update_progress(50, "cropping source") self.log.debug("%s - cropping source", self.metadata.name) crop_success = crop_raster( - source_fpath, output_fpath, self.boundary, preserve_raster_crs=True + source_fpath, output_fpath, self.boundary, + creation_options=["COMPRESS=PACKBITS"] # This fails for jrc pop with higher compression ) self.log.debug( "%s %s - success: %s", @@ -170,14 +201,6 @@ def generate_documentation(self): ] = license_create self.log.debug("%s generated documentation on backend", self.metadata.name) - def _clean_tmp_processing(self): - """Remove the tmp processing folder and recreate""" - # Remove partial previous tmp results if they exist - if os.path.exists(self.tmp_processing_folder): - shutil.rmtree(self.tmp_processing_folder) - # Generate the tmp output directory - os.makedirs(self.tmp_processing_folder, exist_ok=True) - def _fetch_source(self) -> str: """ Fetch and unpack the required source data if required. 
diff --git a/dataproc/processors/core/natural_earth_raster/version_1.py b/dataproc/processors/core/natural_earth_raster/version_1.py index e1ed5e0..429be64 100644 --- a/dataproc/processors/core/natural_earth_raster/version_1.py +++ b/dataproc/processors/core/natural_earth_raster/version_1.py @@ -6,6 +6,7 @@ import inspect from dataproc import DataPackageLicense +from dataproc.exceptions import ProcessorDatasetExists from dataproc.processors.internal.base import BaseProcessorABC, BaseMetadataABC from dataproc.helpers import ( processor_name_from_file, @@ -19,6 +20,7 @@ generate_license_file, data_file_hash, data_file_size, + output_filename ) @@ -38,12 +40,17 @@ class Metadata(BaseMetadataABC): ) # Version of the Processor dataset_name = "natural_earth_raster" # The dataset this processor targets data_author = "Natural Earth Data" + data_title = "" + data_title_long = "" + data_summary = "" + data_citation = "" data_license = DataPackageLicense( name="CC-BY-4.0", title="Creative Commons Attribution 4.0", path="https://creativecommons.org/licenses/by/4.0/", ) data_origin_url = "https://www.naturalearthdata.com/downloads/50m-natural-earth-2/50m-natural-earth-ii-with-shaded-relief/" + data_formats = ["GeoTIFF"] class Processor(BaseProcessorABC): @@ -62,28 +69,22 @@ def exists(self): self.boundary["name"], self.metadata.name, self.metadata.version, - f"{self.boundary['name']}.tif", + output_filename(self.metadata.name, self.metadata.version, self.boundary["name"], 'tif') ) def generate(self): """Generate files for a given processor""" if self.exists() is True: - self.provenance_log[self.metadata.name] = "exists" - return self.provenance_log + raise ProcessorDatasetExists() # Check if the source TIFF exists and fetch it if not self.update_progress(10,"fetching and verifying source") geotiff_fpath = self._fetch_source() # Crop to given boundary self.update_progress(50,"cropping source") - output_folder = self.paths_helper.build_absolute_path( - self.boundary["name"], - "natural_earth_raster", - self.metadata.version, - "outputs", + output_fpath = os.path.join( + self.tmp_processing_folder, + output_filename(self.metadata.name, self.metadata.version, self.boundary["name"], 'tif') ) - os.makedirs(output_folder, exist_ok=True) - - output_fpath = os.path.join(output_folder, f"{self.boundary['name']}.tif") self.log.debug("Natural earth raster - cropping geotiff") crop_success = crop_raster(geotiff_fpath, output_fpath, self.boundary) self.provenance_log[f"{self.metadata.name} - crop success"] = crop_success diff --git a/dataproc/processors/core/natural_earth_vector/version_1.py b/dataproc/processors/core/natural_earth_vector/version_1.py index 85bcfc5..8ff5188 100644 --- a/dataproc/processors/core/natural_earth_vector/version_1.py +++ b/dataproc/processors/core/natural_earth_vector/version_1.py @@ -8,6 +8,7 @@ import sqlalchemy as sa from dataproc import DataPackageLicense +from dataproc.exceptions import ProcessorDatasetExists from dataproc.processors.internal.base import BaseProcessorABC, BaseMetadataABC from dataproc.helpers import ( processor_name_from_file, @@ -20,7 +21,8 @@ generate_datapackage, generate_index_file, data_file_hash, - data_file_size + data_file_size, + output_filename ) from config import ( get_db_uri_ogr, @@ -45,7 +47,10 @@ class Metadata(BaseMetadataABC): "natural_earth_vector_roads" # The dataset this processor targets ) data_author = "Natural Earth Data" - data_license = "" + data_title = "" + data_title_long = "" + data_summary = "" + data_citation = "" data_license = 
DataPackageLicense( name="Natural Earth", title="Natural Earth", @@ -54,6 +59,7 @@ class Metadata(BaseMetadataABC): data_origin_url = ( "https://www.naturalearthdata.com/downloads/10m-cultural-vectors/roads/" ) + data_formats = ["Geopackage"] class Processor(BaseProcessorABC): @@ -81,23 +87,21 @@ def exists(self): self.boundary["name"], self.metadata.name, self.metadata.version, - f"{self.boundary['name']}.gpkg", + output_filename(self.metadata.name, self.metadata.version, self.boundary["name"], 'gpkg'), ) def generate(self): """Generate files for a given processor""" if self.exists() is True: - self.provenance_log[self.metadata.name] = "exists" - return self.provenance_log + raise ProcessorDatasetExists() # Check if the source exists and fetch it if not self.update_progress(10, "fetching and verifying source") pg_table_name = self._fetch_source() # Crop to given boundary - output_folder = self.paths_helper.build_absolute_path( - self.boundary["name"], self.metadata.name, self.metadata.version, "outputs" + output_fpath = os.path.join( + self.tmp_processing_folder, + output_filename(self.metadata.name, self.metadata.version, self.boundary["name"], 'gpkg') ) - os.makedirs(output_folder, exist_ok=True) - output_fpath = os.path.join(output_folder, f"{self.boundary['name']}.gpkg") self.update_progress(20, "cropping source") self.log.debug("Natural earth vector - cropping Roads to geopkg") diff --git a/dataproc/processors/core/storm/global_mosaics_version_1.py b/dataproc/processors/core/storm/global_mosaics_version_1.py index a2a3cf2..fb1ba66 100644 --- a/dataproc/processors/core/storm/global_mosaics_version_1.py +++ b/dataproc/processors/core/storm/global_mosaics_version_1.py @@ -5,6 +5,7 @@ import os import inspect from typing import List +from dataproc.exceptions import ProcessorDatasetExists from dataproc.processors.internal.base import ( BaseProcessorABC, @@ -23,8 +24,8 @@ generate_license_file, fetch_zenodo_doi, tiffs_in_folder, + output_filename ) -from dataproc.exceptions import FolderNotFoundException class Metadata(BaseMetadataABC): @@ -41,12 +42,72 @@ class Metadata(BaseMetadataABC): ) # Version of the Processor dataset_name = "STORM Global Mosaics 10.5281/zenodo.7438145" # The dataset this processor targets data_author = "University of Oxford" + data_title = "STORM tropical cyclone wind speed maps" + data_title_long = "STORM tropical cyclone wind speed return period maps as global GeoTIFFs" + data_summary = """ +Global tropical cyclone wind speed return period maps. + +This dataset is derived with minimal processing from the following datasets +created by Bloemendaal et al, which are released with a CC0 license: + +[1] Bloemendaal, Nadia; de Moel, H. (Hans); Muis, S; Haigh, I.D. (Ivan); Aerts, +J.C.J.H. (Jeroen) (2020): STORM tropical cyclone wind speed return periods. +4TU.ResearchData. Dataset. https://doi.org/10.4121/12705164.v3 + +[2] Bloemendaal, Nadia; de Moel, Hans; Dullaart, Job; Haarsma, R.J. (Reindert); +Haigh, I.D. (Ivan); Martinez, Andrew B.; et al. (2022): STORM climate change +tropical cyclone wind speed return periods. 4TU.ResearchData. Dataset. +https://doi.org/10.4121/14510817.v3 + +Datasets containing tropical cyclone maximum wind speed (in m/s) return periods, +generated using the STORM datasets (see +https://www.nature.com/articles/s41597-020-0381-2) and STORM climate change +datasets (see https://figshare.com/s/397aff8631a7da2843fc). Return periods were +empirically calculated using Weibull's plotting formula. 
The +STORM_FIXED_RETURN_PERIOD dataset contains maximum wind speeds for a fixed set +of return periods at 10 km resolution in every basin and for every climate model +used here (see below). + +The GeoTIFFs provided in the datasets linked above have been mosaicked into +single files with global extent for each climate model/return period using the +following code: + +https://github.com/nismod/open-gira/blob/219315e57cba54bb18f033844cff5e48dd5979d7/workflow/rules/download/storm-ibtracs.smk#L126-L151 + +Files are named on the pattern: +STORM_FIXED_RETURN_PERIODS_{STORM_MODEL}_{STORM_RP}_YR_RP.tif + +STORM_MODEL is be one of constant, CMCC-CM2-VHR4, CNRM-CM6-1-HR, EC-Earth3P-HR +or HadGEM3-GC31-HM. The "constant" files are for the present day, baseline +climate scenario as explained in dataset [1]. The other files are for 2050, +RCP8.5 under different models as explained in the paper linked from dataset [2]. + +STORM_RP is one of 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, +600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000 or +10000. +""" + data_citation = """ +Russell, Tom. (2022). STORM tropical cyclone wind speed return periods as global +GeoTIFFs (1.0.0) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.7438145 + +Derived from: + +[1] Bloemendaal, Nadia; de Moel, H. (Hans); Muis, S; Haigh, I.D. (Ivan); Aerts, +J.C.J.H. (Jeroen) (2020): STORM tropical cyclone wind speed return periods. +4TU.ResearchData. Dataset. https://doi.org/10.4121/12705164.v3 + +[2] Bloemendaal, Nadia; de Moel, Hans; Dullaart, Job; Haarsma, R.J. (Reindert); +Haigh, I.D. (Ivan); Martinez, Andrew B.; et al. (2022): STORM climate change +tropical cyclone wind speed return periods. 4TU.ResearchData. Dataset. +https://doi.org/10.4121/14510817.v3 + """ data_license = DataPackageLicense( name="CC0", title="CC0", path="https://creativecommons.org/share-your-work/public-domain/cc0/", ) - data_origin_url = "https://zenodo.org/record/7438145#.Y-S6cS-l30o" + data_origin_url = "https://doi.org/10.5281/zenodo.7438145" + data_formats = ["GeoTIFF"] class Processor(BaseProcessorABC): @@ -66,15 +127,14 @@ def exists(self): self.metadata.version, datafile_ext=".tif", ) - except FolderNotFoundException: + except FileNotFoundError: return False return count_on_backend == self.total_expected_files def generate(self): """Generate files for a given processor""" if self.exists() is True: - self.provenance_log[self.metadata.name] = "exists" - return self.provenance_log + raise ProcessorDatasetExists() else: # Ensure we start with a blank output folder on the storage backend try: @@ -83,7 +143,7 @@ def generate(self): self.metadata.name, self.metadata.version, ) - except FolderNotFoundException: + except FileNotFoundError: pass # Check if the source TIFF exists and fetch it if not self.update_progress(10, "fetching and verifying source") @@ -95,8 +155,16 @@ def generate(self): self.update_progress( 10 + int(idx * (80 / len(source_fpaths))), "cropping source" ) + subfilename = os.path.splitext(os.path.basename(source_fpath))[0] output_fpath = os.path.join( - self.tmp_processing_folder, os.path.basename(source_fpath) + self.tmp_processing_folder, + output_filename( + self.metadata.name, + self.metadata.version, + self.boundary["name"], + 'tif', + dataset_subfilename=subfilename + ) ) crop_success = crop_raster(source_fpath, output_fpath, self.boundary) self.log.debug( diff --git a/dataproc/processors/core/test_fail_processor/__init__.py b/dataproc/processors/core/test_fail_processor/__init__.py new file 
mode 100644 index 0000000..e69de29 diff --git a/dataproc/processors/core/test_fail_processor/version_1.py b/dataproc/processors/core/test_fail_processor/version_1.py new file mode 100644 index 0000000..ed31050 --- /dev/null +++ b/dataproc/processors/core/test_fail_processor/version_1.py @@ -0,0 +1,66 @@ +""" +Test Failing Processor +""" + +from time import sleep +import os +import inspect + +from dataproc import DataPackageLicense +from dataproc.processors.internal.base import ( + BaseProcessorABC, + BaseMetadataABC, +) +from dataproc.helpers import ( + version_name_from_file, + create_test_file, + data_file_hash, + datapackage_resource, + processor_name_from_file, +) + + +class Metadata(BaseMetadataABC): + """Processor metadata""" + + name = processor_name_from_file( + inspect.stack()[1].filename + ) # this must follow snakecase formatting, without special chars + description = "A test processor that fails" # Longer processor description + version = version_name_from_file( + inspect.stack()[1].filename + ) # Version of the Processor + dataset_name = "" # The dataset this processor targets + data_author = "" + data_title = "" + data_title_long = "" + data_summary = "" + data_citation = "" + data_license = DataPackageLicense( + name="CC-BY-4.0", + title="Creative Commons Attribution 4.0", + path="https://creativecommons.org/licenses/by/4.0/", + ) + data_origin_url = "http://url" + data_formats = ["GeoTIFF"] + + +class Processor(BaseProcessorABC): + """A Test Failing Processor""" + + def generate(self): + """Generate files for a given processor""" + # Pause to allow inspection + sleep(1) + self.update_progress(30,"waiting") + assert(0==1), "test-fail-processor failed as expected" + return self.provenance_log + + def exists(self): + """Whether all files for a given processor exist on the FS on not""" + return self.storage_backend.processor_file_exists( + self.boundary["name"], + self.metadata.name, + self.metadata.version, + f"{self.boundary['name']}_test.tif", + ) diff --git a/dataproc/processors/core/test_processor/version_1.py b/dataproc/processors/core/test_processor/version_1.py index 1b8eba9..e711feb 100644 --- a/dataproc/processors/core/test_processor/version_1.py +++ b/dataproc/processors/core/test_processor/version_1.py @@ -7,6 +7,7 @@ import inspect from dataproc import DataPackageLicense +from dataproc.exceptions import ProcessorDatasetExists from dataproc.processors.internal.base import ( BaseProcessorABC, BaseMetadataABC, @@ -32,12 +33,17 @@ class Metadata(BaseMetadataABC): ) # Version of the Processor dataset_name = "nightlights" # The dataset this processor targets data_author = "Nightlights Author" + data_title = "" + data_title_long = "" + data_summary = "" + data_citation = "" data_license = DataPackageLicense( name="CC-BY-4.0", title="Creative Commons Attribution 4.0", path="https://creativecommons.org/licenses/by/4.0/", ) data_origin_url = "http://url" + data_formats = ["GeoTIFF"] class Processor(BaseProcessorABC): @@ -53,8 +59,7 @@ def generate(self): ) output_fpath = os.path.join(output_folder, f"{self.boundary['name']}_test.tif") if self.exists() is True: - self.update_progress(100,"waiting") - self.provenance_log[f"{self.metadata.name}"] = "exists" + raise ProcessorDatasetExists() else: # Generate a blank tests dataset create_test_file(output_fpath) diff --git a/dataproc/processors/core/wri_aqueduct/version_2.py b/dataproc/processors/core/wri_aqueduct/version_2.py index 11540b5..8952053 100644 --- a/dataproc/processors/core/wri_aqueduct/version_2.py +++ 
b/dataproc/processors/core/wri_aqueduct/version_2.py @@ -7,6 +7,7 @@ import shutil from celery.app import task +from dataproc.exceptions import ProcessorDatasetExists from dataproc.processors.internal.base import ( BaseProcessorABC, @@ -24,9 +25,9 @@ generate_license_file, generate_datapackage, generate_index_file, + output_filename ) from dataproc.processors.core.wri_aqueduct.helpers import HazardAqueduct -from dataproc.exceptions import FolderNotFoundException class Metadata(BaseMetadataABC): @@ -42,7 +43,18 @@ class Metadata(BaseMetadataABC): inspect.stack()[1].filename ) # Version of the Processor dataset_name = "wri_aqueduct" # The dataset this processor targets - data_author = "WRI" + data_title = "Aqueduct Flood Hazard Maps" + data_title_long = "World Resource Institute - Aqueduct Flood Hazard Maps (Version 2, updated October 20, 2020)" + data_author = "Ward, P.J., H.C. Winsemius, S. Kuzma, M.F.P. Bierkens, A. Bouwman, H. de Moel, A. Díaz Loaiza, et al." + data_summary = """World Resource Institute - Aqueduct Flood Hazard Maps (Version 2 (updated +October 20, 2020)). Inundation depth in meters for coastal and riverine +floods over 1km grid squares. 1 in 2 to 1 in 1000 year return periods. +Baseline, RCP 4.5 & 8.5 emission scenarios. Current and future maps in 2030, +2050 and 2080.""" + data_citation = """ +Ward, P.J., H.C. Winsemius, S. Kuzma, M.F.P. Bierkens, A. Bouwman, H. de Moel, A. Díaz Loaiza, et al. 2020. +Aqueduct Floods Methodology. Technical Note. Washington, D.C.: World Resources Institute. Available online at: +www.wri.org/publication/aqueduct-floods-methodology.""" data_license = DataPackageLicense( name="CC-BY-4.0", title="Creative Commons Attribution 4.0", @@ -51,6 +63,7 @@ class Metadata(BaseMetadataABC): data_origin_url = ( "http://wri-projects.s3.amazonaws.com/AqueductFloodTool/download/v2/index.html" ) + data_formats = ["GeoTIFF"] class Processor(BaseProcessorABC): @@ -73,15 +86,14 @@ def exists(self): self.metadata.version, datafile_ext=".tif", ) - except FolderNotFoundException: + except FileNotFoundError: return False return count_on_backend == self.total_expected_files def generate(self): """Generate files for a given processor""" if self.exists() is True: - self.provenance_log[self.metadata.name] = "exists" - return self.provenance_log + raise ProcessorDatasetExists() else: # Ensure we start with a blank output folder on the storage backend try: @@ -90,18 +102,12 @@ def generate(self): self.metadata.name, self.metadata.version, ) - except FolderNotFoundException: + except FileNotFoundError: pass # Check if the source TIFF exists and fetch it if not self.update_progress(10, "fetching and verifying source") self._fetch_source() - # Remove partial previous tmp results if they exist - if os.path.exists(self.tmp_processing_folder): - shutil.rmtree(self.tmp_processing_folder) - # Generate the tmp output directory - os.makedirs(self.tmp_processing_folder, exist_ok=True) - self.log.debug("WRI Aqueduct - cropping geotiffs") results_fpaths = [] for idx, fileinfo in enumerate(os.scandir(self.source_folder)): @@ -114,7 +120,19 @@ def generate(self): 10 + int(idx * (80 / self.total_expected_files)), "cropping source" ) geotiff_fpath = os.path.join(self.source_folder, fileinfo.name) - output_fpath = os.path.join(self.tmp_processing_folder, fileinfo.name) + + subfilename = os.path.splitext(fileinfo.name)[0] + output_fpath = os.path.join( + self.tmp_processing_folder, + output_filename( + self.metadata.name, + self.metadata.version, + self.boundary["name"], + 'tif', + 
dataset_subfilename=subfilename + ) + ) + assert_geotiff(geotiff_fpath) crop_success = crop_raster(geotiff_fpath, output_fpath, self.boundary) self.log.debug( diff --git a/dataproc/processors/core/wri_powerplants/version_130.py b/dataproc/processors/core/wri_powerplants/version_130.py index d60de02..ccbe0d6 100644 --- a/dataproc/processors/core/wri_powerplants/version_130.py +++ b/dataproc/processors/core/wri_powerplants/version_130.py @@ -6,6 +6,7 @@ import inspect from dataproc import DataPackageLicense +from dataproc.exceptions import ProcessorDatasetExists from dataproc.processors.internal.base import BaseProcessorABC, BaseMetadataABC from dataproc.helpers import ( version_name_from_file, @@ -18,8 +19,9 @@ generate_datapackage, unpack_zip, csv_to_gpkg, - gp_crop_file_to_geopkg, + fiona_crop_file_to_geopkg, assert_vector_file, + output_filename ) @@ -33,12 +35,24 @@ class Metadata(BaseMetadataABC): version = version_name_from_file(inspect.stack()[1].filename) dataset_name = "wri_powerplants" data_author = "World Resources Institute" + data_title = "WRI Global Power Plant Database" + data_title_long = "World Resources Institute Global Power Plant Database" + data_summary = """The Global Power Plant Database is a comprehensive, open source database of power plants around the world. It +centralizes power plant data to make it easier to navigate, compare and draw insights for one’s own analysis. +The database covers approximately 35,000 power plants from 167 countries and includes thermal plants (e.g. coal, +gas, oil, nuclear, biomass, waste, geothermal) and renewables (e.g. hydro, wind, solar). Each power plant is +geolocated and entries contain information on plant capacity, generation, ownership, and fuel type. It will be +continuously updated as data becomes available.""" + data_citation = """Global Energy Observatory, Google, KTH Royal Institute of Technology in Stockholm, Enipedia, World Resources +Institute. 2018. Global Power Plant Database. 
Published on Resource Watch and Google Earth Engine; +http://resourcewatch.org/ https://earthengine.google.com/""" data_license = DataPackageLicense( name="CC-BY-4.0", title="Creative Commons Attribution 4.0", path="https://creativecommons.org/licenses/by/4.0/", ) data_origin_url = "https://datasets.wri.org/dataset/globalpowerplantdatabase" + data_formats = ["Geopackage"] class Processor(BaseProcessorABC): @@ -50,6 +64,47 @@ class Processor(BaseProcessorABC): source_zip_url = "https://wri-dataportal-prod.s3.amazonaws.com/manual/global_power_plant_database_v_1_3.zip" expected_zip_hash = "083f11452efc1ed0e8fb1494f0ce49e5c37718e2" source_file = "global_power_plant_database.gpkg" + output_schema = { + "properties": { + "country": "str", + "country_long": "str", + "name": "str", + "gppd_idnr": "str", + "capacity_mw": "float", + "latitude": "float", + "longitude": "float", + "primary_fuel": "str", + "other_fuel1": "str", + "other_fuel2": "str", + "other_fuel3": "str", + "commissioning_year": "float", + "owner": "str", + "source": "str", + "url": "str", + "geolocation_source": "str", + "wepp_id": "str", + "year_of_capacity_data": "float", + "generation_gwh_2013": "float", + "generation_gwh_2014": "float", + "generation_gwh_2015": "float", + "generation_gwh_2016": "float", + "generation_gwh_2017": "float", + "generation_gwh_2018": "float", + "generation_gwh_2019": "float", + "generation_data_source": "str", + "estimated_generation_gwh_2013": "float", + "estimated_generation_gwh_2014": "float", + "estimated_generation_gwh_2015": "float", + "estimated_generation_gwh_2016": "float", + "estimated_generation_gwh_2017": "float", + "estimated_generation_note_2013": "str", + "estimated_generation_note_2014": "str", + "estimated_generation_note_2015": "str", + "estimated_generation_note_2016": "str", + "estimated_generation_note_2017": "str", + }, + "geometry": "Point", + } expected_source_gpkg_shape = (34936, 37) def exists(self): @@ -58,20 +113,18 @@ def exists(self): self.boundary["name"], self.metadata.name, self.metadata.version, - f"{self.boundary['name']}.gpkg", + output_filename(self.metadata.name, self.metadata.version, self.boundary["name"], 'gpkg') ) def generate(self): """Generate files for a given processor""" if self.exists() is True: - self.provenance_log[self.metadata.name] = "exists" - return self.provenance_log - # Setup output path in the processing backend - output_folder = self.paths_helper.build_absolute_path( - self.boundary["name"], self.metadata.name, self.metadata.version, "outputs" + raise ProcessorDatasetExists() + + output_fpath = os.path.join( + self.tmp_processing_folder, + output_filename(self.metadata.name, self.metadata.version, self.boundary["name"], 'gpkg') ) - os.makedirs(output_folder, exist_ok=True) - output_fpath = os.path.join(output_folder, f"{self.boundary['name']}.gpkg") # Fetch source as required self.update_progress(10, "fetching and verifying source") @@ -80,8 +133,11 @@ def generate(self): # Crop to given boundary self.update_progress(50, "cropping source") self.log.debug("%s - cropping to geopkg", self.metadata.name) - crop_result = gp_crop_file_to_geopkg( - source_gpkg_fpath, self.boundary, output_fpath, mask_type="boundary" + crop_result = fiona_crop_file_to_geopkg( + source_gpkg_fpath, + self.boundary, + output_fpath, + self.output_schema ) self.provenance_log[f"{self.metadata.name} - crop completed"] = crop_result @@ -107,7 +163,9 @@ def generate(self): self.metadata, [result_uri], "GEOPKG", sizes, hashes ) self.provenance_log["datapackage"] = datapkg - 
self.log.debug("%s generated datapackage in log: %s", self.metadata.name, datapkg) + self.log.debug( + "%s generated datapackage in log: %s", self.metadata.name, datapkg + ) # Cleanup as required return self.provenance_log @@ -151,14 +209,17 @@ def _fetch_source(self) -> str: os.makedirs(self.source_folder, exist_ok=True) if self._all_source_exists(): self.log.debug( - "%s - all source files appear to exist and are valid", self.metadata.name + "%s - all source files appear to exist and are valid", + self.metadata.name, ) return os.path.join(self.source_folder, self.source_file) # Fetch the source zip self.log.debug("%s - fetching zip", self.metadata.name) local_zip_fpath = self._fetch_zip() self.log.debug("%s - fetched zip to %s", self.metadata.name, local_zip_fpath) - self.provenance_log[f"{self.metadata.name} - zip download path"] = local_zip_fpath + self.provenance_log[ + f"{self.metadata.name} - zip download path" + ] = local_zip_fpath # Unpack self.log.debug("%s - unpacking zip", self.metadata.name) unpack_zip(local_zip_fpath, self.tmp_processing_folder) @@ -177,7 +238,9 @@ def _fetch_source(self) -> str: longitude_col="longitude", ) self.log.info( - "%s - CSV conversion to source GPKG success: %s", self.metadata.name, converted + "%s - CSV conversion to source GPKG success: %s", + self.metadata.name, + converted, ) return gpkg_fpath diff --git a/dataproc/processors/internal/base.py b/dataproc/processors/internal/base.py index 6846ba1..48c4be2 100644 --- a/dataproc/processors/internal/base.py +++ b/dataproc/processors/internal/base.py @@ -17,7 +17,11 @@ class BaseMetadataABC(ABC): description: str = "" # Longer processor description version: str = "" # Version of the Processor dataset_name: str = "" # The dataset this processor targets + data_title: str = "" # Short one-liner title for dataset, ~30 characters is good + data_title_long: str = "" # Long title for dataset data_author: str = "" + data_summary: str = "" # 1-3 paragraph prose summary of the dataset + data_citation: str = "" # Suggested citation, e.g. "Nicholas, C (2023) irv-autopkg. 
[Software] Available at: https://github.com/nismod/irv-autopkg" data_license: DataPackageLicense = None data_origin_url: str = "" @@ -45,7 +49,7 @@ def __init__( self.source_folder = self.paths_helper.build_absolute_path("source_data") os.makedirs(self.source_folder, exist_ok=True) # Tmp Processing data will be cleaned between processor runs - self.tmp_processing_folder = self.paths_helper.build_absolute_path("tmp") + self.tmp_processing_folder = self.paths_helper.build_absolute_path("tmp", self.boundary['name']) os.makedirs(self.tmp_processing_folder, exist_ok=True) def __enter__(self): @@ -60,7 +64,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): exc_tb, ) try: - shutil.rmtree(self.tmp_processing_folder) + shutil.rmtree(self.tmp_processing_folder, ignore_errors=True) except FileNotFoundError: pass diff --git a/dataproc/tasks.py b/dataproc/tasks.py index c9c79f6..874653f 100644 --- a/dataproc/tasks.py +++ b/dataproc/tasks.py @@ -5,7 +5,7 @@ from contextlib import contextmanager import logging -from celery import signals +from celery import signals, states from celery.utils.log import get_task_logger from redis import Redis @@ -14,7 +14,6 @@ CELERY_APP, TASK_LOCK_TIMEOUT, STORAGE_BACKEND, - LOCALFS_STORAGE_BACKEND_ROOT, LOCALFS_PROCESSING_BACKEND_ROOT, REDIS_HOST, ) @@ -24,22 +23,25 @@ BoundaryProcessor, ProvenanceProcessor, ) -from dataproc.exceptions import ProcessorAlreadyExecutingException +from dataproc.exceptions import ProcessorAlreadyExecutingException, ProcessorDatasetExists, ProcessorExecutionFailed from dataproc.backends.storage import init_storage_backend # Setup Configured Storage Backend -storage_backend = init_storage_backend(STORAGE_BACKEND)(LOCALFS_STORAGE_BACKEND_ROOT) +storage_backend = init_storage_backend(STORAGE_BACKEND) # Used for guarding against parallel execution of duplicate tasks redis_client = Redis(host=REDIS_HOST) +def task_sig_exists(task_sig) -> bool: + """Check a task signature in Redis""" + return redis_client.exists(task_sig) != 0 @contextmanager def redis_lock(task_sig: str): """ Manage Task execution lock within redis """ - if redis_client.exists(task_sig): + if task_sig_exists(task_sig) is True: raise ProcessorAlreadyExecutingException() yield redis_client.setex(task_sig, TASK_LOCK_TIMEOUT, value="") @@ -114,8 +116,24 @@ def processor_task( ::param sink Any Sink for result of previous processor in the group """ + retry_countdown = 5 logger = get_task_logger(__name__) task_sig = task_signature(boundary["name"], processor_name_version) + # There can be cases where two dup tasks are submitted - one runs the boundary processors and the other ends up running the actual processing + # In this case there is a chance the boundary processor does not complete before the processor runs (as it ends up running in parallel). 
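The duplicate-task guard above reduces to one Redis key per task signature. A minimal sketch of the intended behaviour, kept to the names visible in this diff (task signature string, TASK_LOCK_TIMEOUT, ProcessorAlreadyExecutingException); the standalone client configuration and the example signature are assumptions:

from contextlib import contextmanager
from redis import Redis

redis_client = Redis(host="redis")  # assumption: matches AUTOPKG_REDIS_HOST
TASK_LOCK_TIMEOUT = 600  # seconds, mirrors AUTOPKG_TASK_LOCK_TIMEOUT

class ProcessorAlreadyExecutingException(Exception):
    """A task with the same signature already holds the lock."""

@contextmanager
def redis_lock(task_sig: str):
    # Refuse to run if another worker has already written this signature
    if redis_client.exists(task_sig) != 0:
        raise ProcessorAlreadyExecutingException()
    # Acquire with an expiry so a crashed worker cannot hold the lock forever
    yield redis_client.setex(task_sig, TASK_LOCK_TIMEOUT, value="")

task_sig = "gambia.test_processor.version_1"  # hypothetical signature
try:
    with redis_lock(task_sig):
        pass  # boundary setup / processor execution happens here
finally:
    # Release the lock whether the run succeeded, failed or was skipped
    redis_client.getdel(task_sig)

A second submission with the same signature sees the key, raises ProcessorAlreadyExecutingException and is retried with a countdown, which is how the duplicate-submission tests later in this diff expect one SUCCESS with the remainder reported as SKIPPED.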
+ # So here we ensure the boundary step is complete for external tasks before continuing + # NOTE: This is the ONLY retry condition for a Dataset Processor + boundary_task_sig = task_signature(boundary["name"], "boundary_setup") + try: + if task_sig_exists(boundary_task_sig) is True: + raise ProcessorAlreadyExecutingException("boundary setup for this processor executing") + except ProcessorAlreadyExecutingException as err: + logger.warning( + "boundary task with signature %s is currently executing for processor %s - will retry processor in %s secs", + boundary_task_sig, task_sig, retry_countdown + ) + raise self.retry(exc=err, countdown=retry_countdown) + # Run the processor try: with redis_lock(task_sig) as acquired: if acquired: @@ -132,13 +150,17 @@ def processor_task( result = proc.generate() # Update sink for this processor sink[processor_name_version] = result + return sink + except ProcessorDatasetExists: + sink[processor_name_version] = {"skipped": f"{task_sig} exists"} + return sink except Exception as err: logger.exception("") # Update sink for this processor - sink[processor_name_version] = {"failed": type(err).__name__} + sink[processor_name_version] = {"failed": f"{type(err).__name__} - {err}"} + return sink finally: _ = redis_client.getdel(task_sig) - return sink else: raise ProcessorAlreadyExecutingException() except ProcessorAlreadyExecutingException: @@ -168,18 +190,20 @@ def generate_provenance(self, sink: Any, boundary: Boundary): if isinstance(sink, dict): sink = [sink] proc = ProvenanceProcessor(boundary, storage_backend) - res = proc.generate(sink) + return proc.generate(sink) except Exception as err: logger.exception("") # Update sink for this processor - sink["generate_provenance"] = {"failed": type(err).__name__} + if isinstance(sink, dict): + sink["generate_provenance"] = {"failed": type(err).__name__} + else: + sink.append({"generate_provenance failed": type(err).__name__}) finally: _ = redis_client.getdel(task_sig) else: raise ProcessorAlreadyExecutingException() - except ProcessorAlreadyExecutingException: + except ProcessorAlreadyExecutingException as err: logger.warning( "task with signature %s skipped because it was already executing", task_sig ) - self.retry(countdown=5) - return res + raise self.retry(exc=err, countdown=5) diff --git a/docker-compose.yaml b/docker-compose.yaml index af10aec..2b351de 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -21,13 +21,18 @@ services: # - ./tests/data/packages:/www/data/packages # Testing redis: - image: redis:6.2-alpine + image: ghcr.io/nismod/irv-autopkg-redis:0.1 + build: ./redis restart: always + user: autopkg ports: - 6379:6379 command: redis-server --save 20 1 --loglevel debug volumes: - ./data/redis:/data + cpus: "0.25" + mem_reservation: "50M" + mem_limit: "250M" flower: image: mher/flower @@ -42,14 +47,13 @@ services: CELERY_RESULT_BACKEND: redis://redis dataproc: - image: ghcr.io/nismod/irv-autopkg:0.2.4-dev + image: ghcr.io/nismod/irv-autopkg:0.2.7-dev user: autopkg build: . volumes: - ./data/packages:/data/packages - ./data/processing:/data/processing - - ./tests/data/packages:/usr/src/app/tests/data/packages - - ./tests/data/tmp:/usr/src/app/tests/data/tmp + - ./tests:/usr/src/app/tests env_file: - envs/.api_and_dataproc.env command: celery --app dataproc.tasks worker @@ -58,35 +62,38 @@ services: mem_limit: "1G" api: - image: ghcr.io/nismod/irv-autopkg:0.2.4-dev + image: ghcr.io/nismod/irv-autopkg:0.2.7-dev build: . 
volumes: - ./data/packages:/data/packages - ./data/processing:/data/processing - - ./tests/data/packages:/usr/src/app/tests/data/packages - - ./tests/data/tmp:/usr/src/app/tests/data/tmp + - ./tests:/usr/src/app/tests ports: - 8000:8000 env_file: - envs/.api_and_dataproc.env - command: uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload + command: uvicorn api.main:app --host 0.0.0.0 --port 8000 + cpus: "0.25" + mem_reservation: "50M" + mem_limit: "250M" - # These test-harness containers require API and Dataproc to be running + # API test-harness requires API and Dataproc to be running # WARNING - THESE TESTS WILL WIPE THE CONFIGURED TEST HARNESS DB from anything in MODELS # To run tests change AUTOPKG_DEPLOYMENT_ENV=test in .api_and_dataproc.env, then reboot api and dataproc services test-api: - image: ghcr.io/nismod/irv-autopkg:0.2.4-dev + image: ghcr.io/nismod/irv-autopkg:0.2.7-dev volumes: - - ./tests/data:/usr/src/app/tests/data + - ./tests:/usr/src/app/tests env_file: - envs/.api_and_dataproc.env - command: python3 -m unittest discover /usr/src/app/tests/api + command: python -m unittest discover /usr/src/app/tests/api + # Dataproc test-harness only requires DB test-dataproc: - image: ghcr.io/nismod/irv-autopkg:0.2.4-dev + image: ghcr.io/nismod/irv-autopkg:0.2.7-dev volumes: - - ./tests/data:/usr/src/app/tests/data + - ./tests:/usr/src/app/tests env_file: - envs/.api_and_dataproc.env command: python -m unittest discover /usr/src/app/tests/dataproc diff --git a/docs/architecture.png b/docs/architecture.png new file mode 100644 index 0000000..5b9f599 Binary files /dev/null and b/docs/architecture.png differ diff --git a/docs/package_structure.png b/docs/package_structure.png new file mode 100644 index 0000000..36265db Binary files /dev/null and b/docs/package_structure.png differ diff --git a/redis/Dockerfile b/redis/Dockerfile new file mode 100644 index 0000000..5f28e98 --- /dev/null +++ b/redis/Dockerfile @@ -0,0 +1,5 @@ +FROM redis:6.2-alpine + +RUN addgroup -g 1002 autopkg && adduser -SHD autopkg -u 1002 -G autopkg + +USER autopkg \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 6e23d19..63f142f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,6 @@ shapely==2.0.0 pyproj==3.4.1 datapackage==1.15.2 zenodo_get==1.3.4 -geopandas==0.12.2 \ No newline at end of file +geopandas==0.12.2 +pyarrow==11.0.0 +fiona==1.9.1 \ No newline at end of file diff --git a/tests/api/integration/test_boundaries.py b/tests/api/integration/test_boundaries.py index a8a2953..f784f2c 100644 --- a/tests/api/integration/test_boundaries.py +++ b/tests/api/integration/test_boundaries.py @@ -100,8 +100,8 @@ def test_search_boundary_by_name(self): """ Retrieve boundaries by searching for a name """ - search_name = 'gh' - expected_names = ['ghana', 'guinea'] + search_name = 'mbi' + expected_names = ['mozambique', 'gambia', 'zambia'] route = build_route(f"{BOUNDARY_SEARCH_ROUTE}?name={search_name}") response = requests.get(route) self.assert_boundary_summary(response, expected_count=len(expected_names)) @@ -129,6 +129,16 @@ def test_search_boundary_by_coords(self): self.assert_boundary_summary(response, expected_count=len(expected_names)) self.assertCountEqual( [item["name"] for item in response.json()], expected_names) + + def test_search_boundary_by_coords_zero(self): + search_latitude = 28.2 + search_longitude = 0.0 + expected_names = ['algeria'] + route = build_route(f"{BOUNDARY_SEARCH_ROUTE}?latitude={search_latitude}&longitude={search_longitude}") + response 
= requests.get(route) + self.assert_boundary_summary(response, expected_count=len(expected_names)) + self.assertCountEqual( + [item["name"] for item in response.json()], expected_names) def test_search_boundary_by_coords_nothing_found(self): search_latitude = 128.2 diff --git a/tests/api/integration/test_jobs.py b/tests/api/integration/test_jobs.py index 21c619a..c22b67d 100644 --- a/tests/api/integration/test_jobs.py +++ b/tests/api/integration/test_jobs.py @@ -9,6 +9,7 @@ from uuid import uuid4 from time import time, sleep import json +import shutil import requests @@ -21,7 +22,19 @@ from tests.dataproc.integration.processors import ( LOCAL_FS_PACKAGE_DATA_TOP_DIR, ) -from config import PACKAGES_HOST_URL +from tests.helpers import ( + clean_packages, + assert_package_awss3, +) +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import AWSS3StorageBackend, S3Manager +from config import ( + STORAGE_BACKEND, + S3_BUCKET, + S3_REGION, + PACKAGES_HOST_URL, +) + JOB_SUBMIT_DATA_BOUNDARY_NOEXIST = { "boundary_name": "noexist", @@ -44,23 +57,29 @@ JOB_SUBMIT_DATA_GAMBIA_TEST_PROC = { "boundary_name": "gambia", "processors": ["test_processor.version_1"], -} # Awaits 5 secs +} + +JOB_SUBMIT_DATA_ZIMBABWE_TEST_PROC = { + "boundary_name": "zimbabwe", + "processors": ["test_fail_processor.version_1"], +} JOB_SUBMIT_DATA_ZAMBIA_TEST_PROC = { "boundary_name": "zambia", "processors": ["test_processor.version_1"], -} # Awaits 5 secs +} JOB_SUBMIT_DATA_GHANA_TEST_PROC = { "boundary_name": "ghana", "processors": ["test_processor.version_1"], -} # Awaits 5 secs +} JOB_SUBMIT_DATA_SSUDAN_NE_VECTOR_PROC = { "boundary_name": "ssudan", "processors": ["natural_earth_vector.version_1"], -} # Awaits 5 secs +} +PACAKGES_USED = ["gambia", "zambia", "ssudan", "zimbabwe"] class TestProcessingJobs(unittest.TestCase): @@ -68,8 +87,37 @@ class TestProcessingJobs(unittest.TestCase): These tests require API and Celery Worker to be running (with redis) """ + @classmethod + def setUpClass(cls): + cls.max_job_await = 20 # secs + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=PACAKGES_USED, + ) + + @classmethod + def tearDownClass(cls): + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=PACAKGES_USED, + ) + def setUp(self): - self.max_job_await = 6 # secs + clean_packages( + STORAGE_BACKEND, + self.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=PACAKGES_USED, + ) def test_get_job_no_exist(self): """""" @@ -115,13 +163,9 @@ def test_submit_job_duplicate_processor(self): response.json()["detail"][0]["msg"], "duplicate processors not allowed" ) - # __NOTE__: These submission tests use different bounaries - # so results do not overlap in the backend queue - def test_submit_job(self): """Simple submission and await completion of a job""" # Ensure the package tree is clean - remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=["zambia"]) expected_code = 202 route = build_route(JOBS_BASE_ROUTE) response = requests.post(route, json=JOB_SUBMIT_DATA_GAMBIA_TEST_PROC) @@ -134,29 +178,68 @@ def test_submit_job(self): route = build_route(JOB_STATUS_ROUTE.format(job_id=job_id)) response = requests.get(route) if response.json()["job_group_processors"]: - self.assertEqual(response.json()["job_group_processors"][0]["job_id"], job_id) + self.assertEqual( + 
response.json()["job_group_processors"][0]["job_id"], job_id + ) if not response.json()["job_group_status"] == "PENDING": + # Final await for any S3 refreshing backend + sleep(1.0) break - sleep(0.2) + sleep(1.0) if (time() - start) > self.max_job_await: self.fail("max await reached") self.assertEqual(response.json()["job_group_status"], "COMPLETE") # Assert the package integrity, including submitted processor - assert_package( - LOCAL_FS_PACKAGE_DATA_TOP_DIR, - "gambia", - ) - remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=["gambia"]) + if STORAGE_BACKEND == "localfs": + assert_package( + LOCAL_FS_PACKAGE_DATA_TOP_DIR, + "gambia", + ) + elif STORAGE_BACKEND == "awss3": + assert_package_awss3( + self.storage_backend, + "gambia", + expected_processor_versions=JOB_SUBMIT_DATA_GAMBIA_TEST_PROC["processors"], + ) - def test_submit_job_already_processing_using_test_processor(self): + def test_submit_failing_job(self): + """Submission of a job that fals""" + # Ensure the package tree is clean + expected_code = 202 + route = build_route(JOBS_BASE_ROUTE) + response = requests.post(route, json=JOB_SUBMIT_DATA_ZIMBABWE_TEST_PROC) + self.assertEqual(response.status_code, expected_code) + self.assertIn("job_id", response.json().keys()) + job_id = response.json()["job_id"] + # Await job completion + start = time() + while True: + route = build_route(JOB_STATUS_ROUTE.format(job_id=job_id)) + response = requests.get(route) + if response.json()["job_group_processors"]: + self.assertEqual( + response.json()["job_group_processors"][0]["job_id"], job_id + ) + if not response.json()["job_group_status"] == "PENDING": + # Final await for any S3 refreshing backend + sleep(1.0) + break + sleep(1.0) + if (time() - start) > self.max_job_await: + self.fail("max await reached") + self.assertEqual(response.json()["job_group_status"], "COMPLETE") + # Job Statuses show failed + self.assertEqual(response.json()["job_group_processors"][0]['job_status'], "FAILURE") + + + def test_submit_job_already_executing_using_test_processor(self): """ - Submission of a second job containing - the same boundary and processor while one is already executing + Submission of a multiple jobs containing the same boundary and + processor while one is already executing (test processor) """ - max_wait = 20 # secs + max_wait = 60 # secs dup_processors_to_submit = 8 expected_responses = [202 for i in range(dup_processors_to_submit)] - remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=["gambia"]) route = build_route(JOBS_BASE_ROUTE) responses = [] for _ in range(dup_processors_to_submit): @@ -191,6 +274,13 @@ def test_submit_job_already_processing_using_test_processor(self): sleep(0.5) # Jobs completed successfully self.assertEqual(statuses, ["COMPLETE" for i in range(dup_processors_to_submit)]) + # Job Statuses show skipped and success + expected_msgs = ["SKIPPED" for _ in range(dup_processors_to_submit-1)] + expected_msgs.append("SUCCESS") + self.assertCountEqual( + [i['job_status'] for i in results], + expected_msgs + ) test_proc_results = [] for result in results: test_proc_results.append(result["job_result"]) @@ -218,24 +308,28 @@ def test_submit_job_already_processing_using_test_processor(self): test_proc_results, ) # Processor success only reported once - self.assertTrue(len(set([json.dumps(i) for i in test_proc_results]))) + self.assertTrue(len(set([json.dumps(i) for i in test_proc_results])), 1) # Assert we only get a single package output - assert_package( - LOCAL_FS_PACKAGE_DATA_TOP_DIR, - "zambia", - ) - 
remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=["zambia"]) + if STORAGE_BACKEND == "localfs": + assert_package( + LOCAL_FS_PACKAGE_DATA_TOP_DIR, + "zambia", + ) + elif STORAGE_BACKEND == "awss3": + assert_package_awss3( + self.storage_backend, + "zambia", + expected_processor_versions=JOB_SUBMIT_DATA_ZAMBIA_TEST_PROC["processors"], + ) def test_submit_job_already_processing_using_ne_vector_processor(self): """ - Submission of a second job containing - the same boundary and processor while one is already executing + Submission of a second job containing the same boundary and processor while one is already executing (ne vector) """ - max_wait = 60 # secs - dup_processors_to_submit = 8 + max_wait = 30 # secs + dup_processors_to_submit = 2 expected_responses = [202 for i in range(dup_processors_to_submit)] - remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=["ssudan"]) route = build_route(JOBS_BASE_ROUTE) responses = [] for _ in range(dup_processors_to_submit): @@ -270,14 +364,28 @@ def test_submit_job_already_processing_using_ne_vector_processor(self): sleep(0.5) # Jobs completed successfully self.assertEqual(statuses, ["COMPLETE" for i in range(dup_processors_to_submit)]) + # Job Statuses show skipped and success + expected_msgs = ["SKIPPED" for _ in range(dup_processors_to_submit-1)] + expected_msgs.append("SUCCESS") + self.assertCountEqual( + [i['job_status'] for i in results], + expected_msgs + ) # Between the two sets of results there should be success for # both boundaries and test_processor test_proc_results = [] for result in results: test_proc_results.append(result["job_result"]) - # Correct total processing results - including 7 exists + # Correct total processing results self.assertEqual(len(test_proc_results), dup_processors_to_submit) - # Test Processor Success + # Should have only ran fully once - the rest should be exists + count_processed_e2e = 0 + count_processed_e2e_key = "natural_earth_vector - loaded NE Roads to PG" + for i in test_proc_results: + if count_processed_e2e_key in i.keys(): + count_processed_e2e+=1 + self.assertEqual(count_processed_e2e, 1) + # Test Processor Success keys all exist self.assertIn( sorted([ "natural_earth_vector - zip download path", @@ -294,9 +402,14 @@ def test_submit_job_already_processing_using_ne_vector_processor(self): # Processor success only reported once self.assertTrue(len(set([json.dumps(i) for i in test_proc_results]))) - # Assert we only get a single package output - assert_package( - LOCAL_FS_PACKAGE_DATA_TOP_DIR, - "ssudan", - ) - remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=["ssudan"]) + if STORAGE_BACKEND == "localfs": + assert_package( + LOCAL_FS_PACKAGE_DATA_TOP_DIR, + "ssudan", + ) + elif STORAGE_BACKEND == "awss3": + assert_package_awss3( + self.storage_backend, + "ssudan", + expected_processor_versions=JOB_SUBMIT_DATA_SSUDAN_NE_VECTOR_PROC["processors"], + ) diff --git a/tests/api/integration/test_packages.py b/tests/api/integration/test_packages.py index ddf96cf..425890c 100644 --- a/tests/api/integration/test_packages.py +++ b/tests/api/integration/test_packages.py @@ -6,19 +6,33 @@ import sys import inspect import unittest +import shutil import requests -from tests.helpers import build_route, create_tree, remove_tree, assert_datapackage_resource - current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) parent_dir = os.path.dirname(current_dir) sys.path.insert(0, parent_dir) from api.routes import PACKAGE_ROUTE, PACKAGES_BASE_ROUTE +from tests.helpers import ( + build_route, + 
create_tree, + remove_tree, + assert_datapackage_resource, + create_tree_awss3, + clean_packages, +) from tests.dataproc.integration.processors import ( LOCAL_FS_PACKAGE_DATA_TOP_DIR, ) +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from config import ( + STORAGE_BACKEND, + S3_BUCKET, + S3_REGION +) class TestPackages(unittest.TestCase): @@ -27,66 +41,101 @@ class TestPackages(unittest.TestCase): These tests require API and Celery Worker to be run ning (with redis) """ - def assert_package(self, response, expected_boundary_name: str, expected_dataset_names_versions: list): + @classmethod + def setUpClass(cls): + cls.backend = init_storage_backend(STORAGE_BACKEND) + + @classmethod + def tearDownClass(cls): + # Package data + clean_packages( + STORAGE_BACKEND, + cls.backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) + + def assert_package( + self, + response, + expected_boundary_name: str, + expected_dataset_names_versions: list, + ): """ Check the package repsonse is valid ::param expected_dataset_names_versions list ["natural_earth_raster.version_1", ...] """ self.assertEqual(response.status_code, 200) - self.assertEqual(response.json()['boundary_name'], expected_boundary_name) + self.assertEqual(response.json()["boundary_name"], expected_boundary_name) name_versions = [] # Check the processors for dataset in response.json()["processors"]: - for version in dataset['versions']: + for version in dataset["versions"]: name_versions.append(f'{dataset["name"]}.{version["version"]}') - self.assertListEqual( - name_versions, - expected_dataset_names_versions - ) + self.assertListEqual(name_versions, expected_dataset_names_versions) # Ensure we have a nested datapackage self.assertIn("datapackage", response.json().keys()) - for dp_resource in response.json()['datapackage']['resources']: + for dp_resource in response.json()["datapackage"]["resources"]: assert_datapackage_resource(dp_resource) def test_get_all_packages(self): """ Retrieve all packages """ - if not LOCAL_FS_PACKAGE_DATA_TOP_DIR: - raise Exception("localfs storage root not set in env") - create_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR) + if STORAGE_BACKEND == "localfs": + create_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR) + elif STORAGE_BACKEND == "awss3": + with S3Manager( + *self.backend._parse_env(), region=self.backend.s3_region + ) as s3_fs: + create_tree_awss3( + s3_fs, + S3_BUCKET, + ) route = build_route(PACKAGES_BASE_ROUTE) response = requests.get(route) # Ensure we can find at least the fake packages we created self.assertIn( - 'zambia', - [boundary['boundary_name'] for boundary in response.json()] + "zambia", [boundary["boundary_name"] for boundary in response.json()] ) self.assertIn( - 'gambia', - [boundary['boundary_name'] for boundary in response.json()] + "gambia", [boundary["boundary_name"] for boundary in response.json()] ) remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR) def test_get_package_by_name_not_found(self): """Attempt to retrieve details of a package which does not exist""" - route = build_route(PACKAGE_ROUTE.format(boundary_name='noexist')) + route = build_route(PACKAGE_ROUTE.format(boundary_name="noexist")) response = requests.get(route) self.assertEqual(response.status_code, 404) - self.assertDictEqual(response.json(), {'detail': 'Package noexist not found'}) + self.assertDictEqual(response.json(), {"detail": "Package noexist not found"}) def test_get_package_by_name_no_valid_datasets(self): """ - Attempt to Retrieve details of 
a package by boundary name, + Attempt to Retrieve details of a package by boundary name, where there are no datasets which have applicable processors """ - create_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=['gambia'], datasets=['noexist']) - route = build_route(PACKAGE_ROUTE.format(boundary_name='gambia')) + if STORAGE_BACKEND == "localfs": + create_tree( + LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=["gambia"], datasets=["noexist"] + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager( + *self.backend._parse_env(), region=self.backend.s3_region + ) as s3_fs: + create_tree_awss3( + s3_fs, S3_BUCKET, packages=["gambia"], datasets=["noexist"] + ) + route = build_route(PACKAGE_ROUTE.format(boundary_name="gambia")) response = requests.get(route) self.assertEqual(response.status_code, 404) - self.assertDictEqual(response.json(), {'detail': 'Package gambia has no existing or executing datasets'}) - remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=['gambia']) + self.assertDictEqual( + response.json(), + {"detail": "Package gambia has no existing or executing datasets"}, + ) + remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=["gambia"]) def test_get_package_by_name(self): """ @@ -94,9 +143,23 @@ def test_get_package_by_name(self): Package is created within the test, but the processor must exist and be valid (natural_earth_raster.version_1) """ - create_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=['gambia'], datasets=['natural_earth_raster']) - route = build_route(PACKAGE_ROUTE.format(boundary_name='gambia')) + if STORAGE_BACKEND == "localfs": + create_tree( + LOCAL_FS_PACKAGE_DATA_TOP_DIR, + packages=["gambia"], + datasets=["natural_earth_raster"], + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager( + *self.backend._parse_env(), region=self.backend.s3_region + ) as s3_fs: + create_tree_awss3( + s3_fs, + S3_BUCKET, + packages=["gambia"], + datasets=["natural_earth_raster"], + ) + route = build_route(PACKAGE_ROUTE.format(boundary_name="gambia")) response = requests.get(route) self.assert_package(response, "gambia", ["natural_earth_raster.version_1"]) - remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=['gambia']) - + remove_tree(LOCAL_FS_PACKAGE_DATA_TOP_DIR, packages=["gambia"]) diff --git a/tests/api/integration/test_processors.py b/tests/api/integration/test_processors.py new file mode 100644 index 0000000..4f4f2e0 --- /dev/null +++ b/tests/api/integration/test_processors.py @@ -0,0 +1,105 @@ +""" +Tests for Processor Endpoints +""" + +import os +import sys +import inspect +import unittest + +import requests + +current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +parent_dir = os.path.dirname(current_dir) +sys.path.insert(0, parent_dir) + +from api.routes import ( + PROCESSORS_BASE_ROUTE, + PROCESSORS_NAME_ROUTE, + PROCESSORS_VERSION_ROUTE, +) +from tests.helpers import build_route + +EXPECTED_PROCESSOR_VERSION = { + "name": "test_processor.version_1", + "description": "A test processor for nightlights", + "version": "version_1", + "data_author": "Nightlights Author", + "data_title": "", + "data_title_long": "", + "data_summary": "", + "data_citation": "", + "data_license": { + "name": "CC-BY-4.0", + "path": "https://creativecommons.org/licenses/by/4.0/", + "title": "Creative Commons Attribution 4.0", + }, + "data_origin_url": "http://url", + "data_formats" : ["GeoTIFF"] +} + + +class TestProcessors(unittest.TestCase): + + """ + These tests require API to be running + """ + + @classmethod + def setUpClass(cls): + pass + + @classmethod + def 
tearDownClass(cls): + pass + + def test_get_all_processors(self): + """ + Retrieve all Processors + """ + route = build_route(PROCESSORS_BASE_ROUTE) + response = requests.get(route) + self.assertEqual(response.status_code, 200) + self.assertTrue(len(response.json()) > 0) + self.assertIn("test_processor", [proc['name'] for proc in response.json()]) + + def test_get_processor_name_noexist(self): + """ + Retrieve a processor by name which does not exist + """ + route = build_route(PROCESSORS_NAME_ROUTE.format(name="noexist")) + response = requests.get(route) + self.assertEqual(response.status_code, 404) + + def test_get_processor_name_version_noexist(self): + """ + Retrieve a processor version which does not exist + """ + route = build_route( + PROCESSORS_VERSION_ROUTE.format(name="test_processor", version="noexist") + ) + response = requests.get(route) + self.assertEqual(response.status_code, 404) + + def test_get_processor_by_name(self): + """ + Retrieve a processor by name + """ + route = build_route(PROCESSORS_NAME_ROUTE.format(name="test_processor")) + response = requests.get(route) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json()["name"], "test_processor") + self.assertEqual(len(response.json()["versions"]), 1) + self.assertDictEqual(response.json()["versions"][0], EXPECTED_PROCESSOR_VERSION) + + def test_get_processor_version(self): + """ + Retrieve a processor version + """ + route = build_route( + PROCESSORS_VERSION_ROUTE.format( + name="test_processor", version=EXPECTED_PROCESSOR_VERSION["version"] + ) + ) + response = requests.get(route) + self.assertDictEqual(response.json(), EXPECTED_PROCESSOR_VERSION) diff --git a/tests/data/isimp_drought_v1/lange2020_clm45_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif b/tests/data/isimp_drought_v1/lange2020_clm45_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif new file mode 100644 index 0000000..5a312e8 Binary files /dev/null and b/tests/data/isimp_drought_v1/lange2020_clm45_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif differ diff --git a/tests/data/isimp_drought_v1/lange2020_clm45_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif b/tests/data/isimp_drought_v1/lange2020_clm45_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif new file mode 100644 index 0000000..6f75db0 Binary files /dev/null and b/tests/data/isimp_drought_v1/lange2020_clm45_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif differ diff --git a/tests/data/isimp_drought_v1/lange2020_lpjml_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif b/tests/data/isimp_drought_v1/lange2020_lpjml_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif new file mode 100644 index 0000000..fd444ce Binary files /dev/null and b/tests/data/isimp_drought_v1/lange2020_lpjml_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif differ diff --git a/tests/data/load_boundaries.py b/tests/data/load_boundaries.py index 20daa07..2d8ed0a 100644 --- a/tests/data/load_boundaries.py +++ b/tests/data/load_boundaries.py @@ -65,7 +65,8 @@ def load_boundaries( long_name_column="name_long", admin_level="0", wipe_table=True, - skip_names=['-99'] + skip_names=['-99'], + setup_tables=True ) -> Tuple[bool, List]: """ Load a geojson file of multipolygons into Boundaries table @@ -75,6 +76,8 @@ def load_boundaries( 
db_uri = get_db_uri_sync(API_POSTGRES_DB) # Init DB and Load via SA engine = sa.create_engine(db_uri, pool_pre_ping=True) + if setup_tables is True: + db.Base.metadata.create_all(engine) if wipe_table is True: for tbl in reversed(db.Base.metadata.sorted_tables): engine.execute(tbl.delete()) diff --git a/tests/data/notafile.gpkg b/tests/data/notafile.gpkg new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/tmp/.gitignore b/tests/data/tmp/.gitignore deleted file mode 100644 index 86d0cb2..0000000 --- a/tests/data/tmp/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore everything in this directory -* -# Except this file -!.gitignore \ No newline at end of file diff --git a/tests/dataproc/integration/processors/test_gri_osm.py b/tests/dataproc/integration/processors/test_gri_osm.py index ac26dfc..350b440 100644 --- a/tests/dataproc/integration/processors/test_gri_osm.py +++ b/tests/dataproc/integration/processors/test_gri_osm.py @@ -5,22 +5,31 @@ import unittest import shutil -from dataproc.backends.storage.localfs import LocalFSStorageBackend -from dataproc import Boundary -from dataproc.processors.core.gri_osm.roads_and_rail_version_1 import ( - Processor, - Metadata, -) from tests.helpers import ( + assert_exists_awss3, load_country_geojson, assert_datapackage_resource, + clean_packages ) from tests.dataproc.integration.processors import ( LOCAL_FS_PROCESSING_DATA_TOP_DIR, LOCAL_FS_PACKAGE_DATA_TOP_DIR, DummyTaskExecutor ) -from config import PACKAGES_HOST_URL, TEST_GRI_OSM +from dataproc import Boundary +from dataproc.processors.core.gri_osm.roads_and_rail_version_1 import ( + Processor, + Metadata, +) +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from config import ( + PACKAGES_HOST_URL, + S3_REGION, + STORAGE_BACKEND, + S3_BUCKET, + TEST_GRI_OSM +) class TestGRIOSMProcessor(unittest.TestCase): @@ -34,14 +43,31 @@ def setUpClass(cls): os.makedirs(cls.test_processing_data_dir, exist_ok=True) gambia_geojson, envelope_geojson = load_country_geojson("gambia") cls.boundary = Boundary("gambia", gambia_geojson, envelope_geojson) - cls.storage_backend = LocalFSStorageBackend(LOCAL_FS_PACKAGE_DATA_TOP_DIR) + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + # Ensure clean test-env + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) @classmethod def tearDownClass(cls): # Tmp and Source data - shutil.rmtree(cls.test_processing_data_dir, ignore_errors=True) + shutil.rmtree(cls.test_processing_data_dir) # Package data - shutil.rmtree(os.path.join(cls.storage_backend.top_level_folder_path, "gambia"), ignore_errors=True) + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) def setUp(self): self.task_executor = DummyTaskExecutor() @@ -96,20 +122,29 @@ def test_generate(self): # Remove the final package artifacts (but keep the test data artifacts if they exist) if TEST_GRI_OSM is False: self.skipTest(f"Skipping GRI OSM due to TEST_GRI_OSM == {TEST_GRI_OSM}") - try: - shutil.rmtree( - os.path.join(self.storage_backend.top_level_folder_path, "gambia") - ) - except FileNotFoundError: - pass + clean_packages( + STORAGE_BACKEND, + self.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) prov_log = self.proc.generate() # # Assert the 
log contains a succesful entries self.assertTrue(prov_log[f"{self.proc.metadata.name} - crop completed"]) self.assertTrue(prov_log[f"{self.proc.metadata.name} - move to storage success"]) # # Collect the URI for the final Raster final_uri = prov_log[f"{self.proc.metadata.name} - result URI"] - # Assert the file exists (replacing the uri for local FS) - self.assertTrue(os.path.exists(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR))) + if STORAGE_BACKEND == "localfs": + self.assertTrue(os.path.exists(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR))) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_exists_awss3( + s3_fs, + final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET), + ) + else: + pass # Check the datapackage thats included in the prov log self.assertIn("datapackage", prov_log.keys()) assert_datapackage_resource(prov_log["datapackage"]) diff --git a/tests/dataproc/integration/processors/test_gridfinder.py b/tests/dataproc/integration/processors/test_gridfinder.py index c92427c..d2b72d2 100644 --- a/tests/dataproc/integration/processors/test_gridfinder.py +++ b/tests/dataproc/integration/processors/test_gridfinder.py @@ -5,24 +5,32 @@ import unittest import shutil -from dataproc.backends import LocalFSStorageBackend -from dataproc import Boundary -from dataproc.processors.core.gridfinder.version_1 import ( - Processor, - Metadata, -) -from dataproc.helpers import assert_geotiff, assert_vector_file + from tests.helpers import ( load_country_geojson, - assert_raster_bounds_correct, + assert_vector_output, + assert_raster_output, assert_datapackage_resource, + clean_packages ) from tests.dataproc.integration.processors import ( LOCAL_FS_PROCESSING_DATA_TOP_DIR, LOCAL_FS_PACKAGE_DATA_TOP_DIR, DummyTaskExecutor, ) -from config import PACKAGES_HOST_URL +from dataproc import Boundary +from dataproc.processors.core.gridfinder.version_1 import ( + Processor, + Metadata, +) +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from config import ( + PACKAGES_HOST_URL, + S3_REGION, + STORAGE_BACKEND, + S3_BUCKET, +) TEST_DATA_DIR = os.path.join( os.path.dirname( @@ -44,16 +52,30 @@ def setUpClass(cls): os.makedirs(cls.test_processing_data_dir, exist_ok=True) gambia_geojson, envelope_geojson = load_country_geojson("gambia") cls.boundary = Boundary("gambia", gambia_geojson, envelope_geojson) - cls.storage_backend = LocalFSStorageBackend(LOCAL_FS_PACKAGE_DATA_TOP_DIR) + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + # Ensure clean test-env + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) @classmethod def tearDownClass(cls): # Tmp and Source data shutil.rmtree(cls.test_processing_data_dir, ignore_errors=True) # Package data - shutil.rmtree( - os.path.join(cls.storage_backend.top_level_folder_path, "gambia"), - ignore_errors=True, + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], ) def setUp(self): @@ -111,16 +133,21 @@ def test_generate(self): We are using locally sourced test-files. 
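The expected filenames in the gridfinder assertions below follow the output_filename convention adopted across the processors in this diff. A sketch of the apparent composition, inferred from the call sites and the expected names; the real helper lives in dataproc.helpers, so this standalone version and its exact signature are assumptions:

def output_filename(
    dataset_name: str,
    version: str,
    boundary_name: str,
    ext: str,
    dataset_subfilename: str = None,
) -> str:
    """Compose <dataset>-<version>[-<subfile>]-<boundary>.<ext>."""
    parts = [dataset_name, version]
    if dataset_subfilename is not None:
        parts.append(dataset_subfilename)
    parts.append(boundary_name)
    return f"{'-'.join(parts)}.{ext}"

# Consistent with the expectations used below:
assert output_filename("gridfinder", "version_1", "gambia", "tif", dataset_subfilename="targets") == "gridfinder-version_1-targets-gambia.tif"
assert output_filename("gridfinder", "version_1", "gambia", "gpkg", dataset_subfilename="grid") == "gridfinder-version_1-grid-gambia.gpkg"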
""" expected_crs = { - "grid.gpkg": "EPSG:4326", - "lv.tif": "ESRI:54009", - "targets.tif": "EPSG:4326", + "gridfinder-version_1-grid-gambia.gpkg": "EPSG:4326", + "gridfinder-version_1-lv-gambia.tif": "ESRI:54009", + "gridfinder-version_1-targets-gambia.tif": "EPSG:4326", } - try: - shutil.rmtree( - os.path.join(self.storage_backend.top_level_folder_path, "gambia") - ) - except FileNotFoundError: - pass + result_source_map = { + "gridfinder-version_1-lv-gambia.tif": "lv.tif", + "gridfinder-version_1-targets-gambia.tif": "targets.tif" + } + clean_packages( + STORAGE_BACKEND, + self.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) # Move test-data into the expected source folder for _file in os.scandir(TEST_DATA_DIR): shutil.copy( @@ -136,25 +163,46 @@ def test_generate(self): # Collect the URIs for the final Raster final_uris = prov_log[f"{self.proc.metadata.name} - result URIs"] self.assertEqual(len(final_uris.split(",")), self.proc.total_expected_files) + # Collect the original source fpaths for pixel assertion for final_uri in final_uris.split(","): fname = os.path.basename(final_uri) if os.path.splitext(fname)[1] == ".tif": - # # Assert the geotiffs are valid - assert_geotiff( - final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), - check_crs=expected_crs[fname], - ) - # # Assert the envelopes - assert_raster_bounds_correct( - final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), - self.boundary["envelope_geojson"], - ) + # Match original source raster for pixel assertion + if STORAGE_BACKEND == "localfs": + assert_raster_output( + self.boundary["envelope_geojson"], + final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), + check_crs=expected_crs[fname], + pixel_check_raster_fpath=os.path.join(self.proc.source_folder, result_source_map[fname]) + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_raster_output( + self.boundary["envelope_geojson"], + s3_fs=s3_fs, + s3_raster_fpath=final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET), + check_crs=expected_crs[fname], + pixel_check_raster_fpath=os.path.join(self.proc.source_folder, result_source_map[fname]) + ) + else: + pass else: - assert_vector_file( - final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), - (163, 2), - expected_crs=expected_crs[fname], - ) + if STORAGE_BACKEND == "localfs": + assert_vector_output( + (84, 2), + expected_crs[fname], + local_vector_fpath=final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_vector_output( + (84, 2), + expected_crs[fname], + s3_fs=s3_fs, + s3_vector_fpath=final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET), + ) + else: + pass # Check the datapackage thats included in the prov log self.assertIn("datapackage", prov_log.keys()) assert_datapackage_resource(prov_log["datapackage"]) diff --git a/tests/dataproc/integration/processors/test_isimp_drought.py b/tests/dataproc/integration/processors/test_isimp_drought.py new file mode 100644 index 0000000..d827661 --- /dev/null +++ b/tests/dataproc/integration/processors/test_isimp_drought.py @@ -0,0 +1,176 @@ +""" +Unit tests for ISIMP Drought +""" +import os +import unittest +import shutil + +from tests.helpers import ( + load_country_geojson, + assert_raster_bounds_correct, + assert_datapackage_resource, + clean_packages, + assert_raster_output 
+) +from tests.dataproc.integration.processors import ( + LOCAL_FS_PROCESSING_DATA_TOP_DIR, + LOCAL_FS_PACKAGE_DATA_TOP_DIR, + DummyTaskExecutor +) +from dataproc import Boundary +from dataproc.processors.core.isimp_drought.version_1 import ( + Processor, + Metadata, +) +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from config import ( + PACKAGES_HOST_URL, + S3_REGION, + STORAGE_BACKEND, + S3_BUCKET, +) + +TEST_VERSION_1_SOURCE_FILES = [ + "lange2020_clm45_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_lpjml_miroc5_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2080_occurrence.tif", + "lange2020_clm45_gfdl-esm2m_ewembi_rcp60_2005soc_co2_led_global_annual_2006_2099_2030_occurrence.tif" +] + +TEST_DATA_DIR = os.path.join( + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), + "data", + "isimp_drought_v1", +) + +class TestISIMPDroughtV1Processor(unittest.TestCase): + """ + """ + + @classmethod + def setUpClass(cls): + cls.test_processing_data_dir = os.path.join( + LOCAL_FS_PROCESSING_DATA_TOP_DIR, Metadata().name, Metadata().version + ) + os.makedirs(cls.test_processing_data_dir, exist_ok=True) + gambia_geojson, envelope_geojson = load_country_geojson("gambia") + cls.boundary = Boundary("gambia", gambia_geojson, envelope_geojson) + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + # Ensure clean test-env + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) + + @classmethod + def tearDownClass(cls): + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir, ignore_errors=True) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) + + def setUp(self): + self.task_executor = DummyTaskExecutor() + self.meta = Metadata() + self.proc = Processor( + self.meta, + self.boundary, + self.storage_backend, + self.task_executor, + LOCAL_FS_PROCESSING_DATA_TOP_DIR, + ) + + def test_processor_init(self): + """""" + self.assertIsInstance(self.proc, Processor) + + def test_context_manager(self): + """""" + with Processor( + self.meta, + self.boundary, + self.storage_backend, + self.task_executor, + self.test_processing_data_dir, + ) as proc: + self.assertIsInstance(proc, Processor) + + def test_context_manager_cleanup_on_error(self): + """""" + with Processor( + self.meta, + self.boundary, + self.storage_backend, + self.task_executor, + self.test_processing_data_dir, + ) as proc: + test_fpath = os.path.join(proc.tmp_processing_folder, "testfile") + # Add a file into the tmp processing backend + with open(test_fpath, "w") as fptr: + fptr.write("data") + self.assertFalse(os.path.exists(test_fpath)) + + def test_meta_init(self): + """""" + self.assertIsInstance(self.meta, Metadata) + self.assertNotEqual(self.meta.name, "") + self.assertNotEqual(self.meta.version, "") + self.assertNotEqual(self.meta.dataset_name, "") + + def test_generate(self): + """E2E generate test - fetch, crop, push""" + # Move test-data into the expected source folder + for _file in os.scandir(TEST_DATA_DIR): + shutil.copy( + os.path.join(TEST_DATA_DIR, _file.name), + os.path.join(self.proc.source_folder, _file.name), + ) + # Limit expected source files + self.proc.source_files = 
TEST_VERSION_1_SOURCE_FILES + self.proc.total_expected_files = len(TEST_VERSION_1_SOURCE_FILES) + prov_log = self.proc.generate() + # Assert the log contains successful entries + self.assertTrue(prov_log[f"{self.proc.metadata.name} - move to storage success"]) + # Collect the URIs for the final Rasters + final_uris = prov_log[f"{self.proc.metadata.name} - result URIs"] + self.assertEqual(len(final_uris.split(",")), self.proc.total_expected_files) + # Collect the original source fpaths for pixel assertion + source_fpaths = self.proc._fetch_source() + for idx, final_uri in enumerate(final_uris.split(",")): + if STORAGE_BACKEND == "localfs": + assert_raster_output( + self.boundary["envelope_geojson"], + final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), + check_crs="EPSG:4326", + tolerence=0.5, + pixel_check_raster_fpath=source_fpaths[idx] + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_raster_output( + self.boundary["envelope_geojson"], + s3_fs=s3_fs, + s3_raster_fpath=final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET), + check_crs="EPSG:4326", + tolerence=0.5, + pixel_check_raster_fpath=source_fpaths[idx] + ) + else: + pass + # Check the datapackage thats included in the prov log + self.assertIn("datapackage", prov_log.keys()) + assert_datapackage_resource(prov_log['datapackage']) diff --git a/tests/dataproc/integration/processors/test_jrc_ghsl_built_c.py b/tests/dataproc/integration/processors/test_jrc_ghsl_built_c.py index 99c0896..c19fc5e 100644 --- a/tests/dataproc/integration/processors/test_jrc_ghsl_built_c.py +++ b/tests/dataproc/integration/processors/test_jrc_ghsl_built_c.py @@ -5,24 +5,31 @@ import unittest import shutil -from dataproc.backends import LocalFSStorageBackend -from dataproc import Boundary -from dataproc.processors.core.jrc_ghsl_built_c.r2022_epoch2018_10m_mszfun import ( - Processor, - Metadata, -) -from dataproc.helpers import assert_geotiff from tests.helpers import ( load_country_geojson, assert_raster_bounds_correct, - assert_datapackage_resource + assert_datapackage_resource, + clean_packages, + assert_raster_output ) from tests.dataproc.integration.processors import ( LOCAL_FS_PROCESSING_DATA_TOP_DIR, LOCAL_FS_PACKAGE_DATA_TOP_DIR, DummyTaskExecutor ) -from config import PACKAGES_HOST_URL +from dataproc import Boundary +from dataproc.processors.core.jrc_ghsl_built_c.r2022_epoch2018_10m_mszfun import ( + Processor, + Metadata, +) +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from config import ( + PACKAGES_HOST_URL, + S3_REGION, + STORAGE_BACKEND, + S3_BUCKET, +) class TestJRCGHSLBuiltCR2022Processor(unittest.TestCase): @@ -36,14 +43,31 @@ def setUpClass(cls): os.makedirs(cls.test_processing_data_dir, exist_ok=True) gambia_geojson, envelope_geojson = load_country_geojson("gambia") cls.boundary = Boundary("gambia", gambia_geojson, envelope_geojson) - cls.storage_backend = LocalFSStorageBackend(LOCAL_FS_PACKAGE_DATA_TOP_DIR) + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + # Ensure clean test-env + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) @classmethod def tearDownClass(cls): # Tmp and Source data shutil.rmtree(cls.test_processing_data_dir, ignore_errors=True) # Package data - 
shutil.rmtree(os.path.join(cls.storage_backend.top_level_folder_path, "gambia"), ignore_errors=True) + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) def setUp(self): self.task_executor = DummyTaskExecutor() @@ -100,13 +124,6 @@ def test_meta_init(self): def test_generate(self): """E2E generate test - fetch, crop, push""" - self.skipTest(f"Skipping JRC BUILT-C due to WIP") - try: - shutil.rmtree( - os.path.join(self.storage_backend.top_level_folder_path, "gambia") - ) - except FileNotFoundError: - pass # Limit the files to be downloaded in the fetcher self.proc.total_expected_files = 2 prov_log = self.proc.generate() @@ -115,11 +132,29 @@ def test_generate(self): # Collect the URIs for the final Rasters final_uris = prov_log[f"{self.proc.metadata.name} - result URIs"] self.assertEqual(len(final_uris.split(",")), self.proc.total_expected_files) - for final_uri in final_uris.split(","): - # # Assert the geotiffs are valid - assert_geotiff(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), check_crs="ESRI:54009") - # # Assert the envelopes - assert_raster_bounds_correct(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), self.boundary["envelope_geojson"]) + # Collect the original source fpaths for pixel assertion + source_fpaths = self.proc._fetch_source() + for idx, final_uri in enumerate(final_uris.split(",")): + if STORAGE_BACKEND == "localfs": + assert_raster_output( + self.boundary["envelope_geojson"], + final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), + check_crs="ESRI:54009", + check_is_bigtiff=True, + pixel_check_raster_fpath=source_fpaths[idx] + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_raster_output( + self.boundary["envelope_geojson"], + s3_fs=s3_fs, + s3_raster_fpath=final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET), + check_crs="ESRI:54009", + check_is_bigtiff=True, + pixel_check_raster_fpath=source_fpaths[idx] + ) + else: + pass # Check the datapackage thats included in the prov log self.assertIn("datapackage", prov_log.keys()) assert_datapackage_resource(prov_log['datapackage']) diff --git a/tests/dataproc/integration/processors/test_jrc_ghsl_population.py b/tests/dataproc/integration/processors/test_jrc_ghsl_population.py index 5351e25..89455d8 100644 --- a/tests/dataproc/integration/processors/test_jrc_ghsl_population.py +++ b/tests/dataproc/integration/processors/test_jrc_ghsl_population.py @@ -5,24 +5,30 @@ import unittest import shutil -from dataproc.backends import LocalFSStorageBackend -from dataproc import Boundary -from dataproc.processors.core.jrc_ghsl_population.r2022_epoch2020_1km import ( - Processor, - Metadata, -) -from dataproc.helpers import assert_geotiff from tests.helpers import ( load_country_geojson, - assert_raster_bounds_correct, + assert_raster_output, assert_datapackage_resource, + clean_packages ) from tests.dataproc.integration.processors import ( LOCAL_FS_PROCESSING_DATA_TOP_DIR, LOCAL_FS_PACKAGE_DATA_TOP_DIR, DummyTaskExecutor, ) -from config import PACKAGES_HOST_URL +from dataproc import Boundary +from dataproc.processors.core.jrc_ghsl_population.r2022_epoch2020_1km import ( + Processor, + Metadata, +) +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from config import ( + PACKAGES_HOST_URL, + S3_REGION, + STORAGE_BACKEND, + S3_BUCKET, +) class 
TestJRCGHSLPopR2022E20201KMProcessor(unittest.TestCase): @@ -36,16 +42,30 @@ def setUpClass(cls): os.makedirs(cls.test_processing_data_dir, exist_ok=True) gambia_geojson, envelope_geojson = load_country_geojson("gambia") cls.boundary = Boundary("gambia", gambia_geojson, envelope_geojson) - cls.storage_backend = LocalFSStorageBackend(LOCAL_FS_PACKAGE_DATA_TOP_DIR) + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + # Ensure clean test-env + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) @classmethod def tearDownClass(cls): # Tmp and Source data shutil.rmtree(cls.test_processing_data_dir, ignore_errors=True) # Package data - shutil.rmtree( - os.path.join(cls.storage_backend.top_level_folder_path, "gambia"), - ignore_errors=True, + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], ) def setUp(self): @@ -103,12 +123,13 @@ def test_meta_init(self): def test_generate(self): """E2E generate test - fetch, crop, push""" - try: - shutil.rmtree( - os.path.join(self.storage_backend.top_level_folder_path, "gambia") - ) - except FileNotFoundError: - pass + clean_packages( + STORAGE_BACKEND, + self.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) # Limit the files to be downloaded in the fetcher self.proc.total_expected_files = 1 prov_log = self.proc.generate() @@ -118,16 +139,24 @@ def test_generate(self): ) # Collect the URIs for the final Raster final_uri = prov_log[f"{self.proc.metadata.name} - result URI"] - # # Assert the geotiffs are valid - assert_geotiff( - final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), - check_crs="ESRI:54009", - ) - # # Assert the envelopes - NOTE: this will assert the Molleweide bounds - assert_raster_bounds_correct( - final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), - self.boundary["envelope_geojson"], - ) + if STORAGE_BACKEND == "localfs": + assert_raster_output( + self.boundary["envelope_geojson"], + final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), + check_crs="ESRI:54009", + pixel_check_raster_fpath=self.proc._fetch_source() + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_raster_output( + self.boundary["envelope_geojson"], + s3_fs=s3_fs, + s3_raster_fpath=final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET), + check_crs="ESRI:54009", + pixel_check_raster_fpath=self.proc._fetch_source() + ) + else: + pass # Check the datapackage thats included in the prov log self.assertIn("datapackage", prov_log.keys()) assert_datapackage_resource(prov_log["datapackage"]) diff --git a/tests/dataproc/integration/processors/test_natural_earth_raster.py b/tests/dataproc/integration/processors/test_natural_earth_raster.py index 4b6cc23..4729e27 100644 --- a/tests/dataproc/integration/processors/test_natural_earth_raster.py +++ b/tests/dataproc/integration/processors/test_natural_earth_raster.py @@ -5,24 +5,32 @@ import unittest import shutil -from dataproc.backends.storage.localfs import LocalFSStorageBackend + from tests.helpers import ( load_country_geojson, - assert_raster_bounds_correct, + assert_raster_output, assert_datapackage_resource, + clean_packages +) +from tests.dataproc.integration.processors import ( + LOCAL_FS_PROCESSING_DATA_TOP_DIR, + 
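The class-level setup and teardown above is repeated by every processor test in this changeset: wipe the processing directory, then remove any previously generated packages from the configured backend. Below is a minimal sketch of that shared reset, assuming the clean_packages helper and init_storage_backend factory added in this diff; the reset_test_env function name is illustrative, not part of the changeset.

import shutil

from dataproc.backends.storage import init_storage_backend
from tests.helpers import clean_packages
from config import S3_BUCKET, S3_REGION, STORAGE_BACKEND


def reset_test_env(processing_dir, packages=("gambia",)):
    """Illustrative only: drop tmp/source data and previously generated packages."""
    storage_backend = init_storage_backend(STORAGE_BACKEND)
    # Tmp and source data
    shutil.rmtree(processing_dir, ignore_errors=True)
    # Package data, on whichever backend is configured
    clean_packages(
        STORAGE_BACKEND,
        storage_backend,
        s3_bucket=S3_BUCKET,
        s3_region=S3_REGION,
        packages=list(packages),
    )
    return storage_backend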
LOCAL_FS_PACKAGE_DATA_TOP_DIR, + DummyTaskExecutor, ) from dataproc import Boundary +from dataproc.helpers import sample_geotiff from dataproc.processors.core.natural_earth_raster.version_1 import ( Processor, Metadata, ) -from dataproc.helpers import assert_geotiff -from tests.dataproc.integration.processors import ( - LOCAL_FS_PROCESSING_DATA_TOP_DIR, - LOCAL_FS_PACKAGE_DATA_TOP_DIR, - DummyTaskExecutor, +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from config import ( + PACKAGES_HOST_URL, + S3_REGION, + STORAGE_BACKEND, + S3_BUCKET, ) -from config import PACKAGES_HOST_URL class TestNaturalEarthRasterProcessor(unittest.TestCase): @@ -36,14 +44,18 @@ def setUpClass(cls): os.makedirs(cls.test_processing_data_dir, exist_ok=True) gambia_geojson, envelope_geojson = load_country_geojson("gambia") cls.boundary = Boundary("gambia", gambia_geojson, envelope_geojson) - cls.storage_backend = LocalFSStorageBackend(LOCAL_FS_PACKAGE_DATA_TOP_DIR) - - @classmethod - def tearDownClass(cls): + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + # Ensure clean test-env # Tmp and Source data shutil.rmtree(cls.test_processing_data_dir) # Package data - shutil.rmtree(os.path.join(cls.storage_backend.top_level_folder_path, "gambia")) + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) def setUp(self): self.task_executor = DummyTaskExecutor() @@ -56,6 +68,19 @@ def setUp(self): self.test_processing_data_dir, ) + @classmethod + def tearDownClass(cls): + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) + def test_processor_init(self): """""" self.assertIsInstance(self.proc, Processor) @@ -100,22 +125,35 @@ def test_fetch_source(self): def test_generate(self): """E2E generate test - fetch, crop, push""" - try: - shutil.rmtree( - os.path.join(self.storage_backend.top_level_folder_path, "gambia") - ) - except FileNotFoundError: - pass + clean_packages( + STORAGE_BACKEND, + self.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) prov_log = self.proc.generate() # Assert the log contains a succesful entries self.assertTrue(prov_log[f"{self.proc.metadata.name} - crop success"]) - self.assertTrue(prov_log[f"{self.proc.metadata.name} - move to storage success"]) + self.assertTrue( + prov_log[f"{self.proc.metadata.name} - move to storage success"] + ) # Collect the URI for the final Raster final_uri = prov_log[f"{self.proc.metadata.name} - result URI"] - # Assert the geotiff is valid - assert_geotiff(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR)) - # Assert the envelope - assert_raster_bounds_correct(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), self.boundary["envelope_geojson"]) + if STORAGE_BACKEND == "localfs": + assert_raster_output( + self.boundary["envelope_geojson"], + final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), + pixel_check_raster_fpath=self.proc._fetch_source() + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_raster_output( + self.boundary["envelope_geojson"], + s3_fs=s3_fs, + s3_raster_fpath=final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET), + pixel_check_raster_fpath=self.proc._fetch_source() + ) # 
Check the datapackage thats included in the prov log self.assertIn("datapackage", prov_log.keys()) assert_datapackage_resource(prov_log["datapackage"]) diff --git a/tests/dataproc/integration/processors/test_natural_earth_vector.py b/tests/dataproc/integration/processors/test_natural_earth_vector.py index 5a8f914..b12e14f 100644 --- a/tests/dataproc/integration/processors/test_natural_earth_vector.py +++ b/tests/dataproc/integration/processors/test_natural_earth_vector.py @@ -5,25 +5,27 @@ import unittest import shutil -from dataproc.backends.storage.localfs import LocalFSStorageBackend -from dataproc import Boundary -from dataproc.processors.core.natural_earth_vector.version_1 import ( - Processor, - Metadata, -) -from config import get_db_uri_sync, API_POSTGRES_DB from tests.helpers import ( load_country_geojson, assert_table_in_pg, drop_natural_earth_roads_from_pg, + assert_exists_awss3, assert_datapackage_resource, + clean_packages ) from tests.dataproc.integration.processors import ( LOCAL_FS_PROCESSING_DATA_TOP_DIR, LOCAL_FS_PACKAGE_DATA_TOP_DIR, DummyTaskExecutor ) -from config import PACKAGES_HOST_URL +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from dataproc import Boundary +from dataproc.processors.core.natural_earth_vector.version_1 import ( + Processor, + Metadata, +) +from config import get_db_uri_sync, API_POSTGRES_DB, PACKAGES_HOST_URL, S3_REGION, STORAGE_BACKEND, S3_BUCKET class TestNaturalEarthVectorProcessor(unittest.TestCase): @@ -38,20 +40,32 @@ def setUpClass(cls): cls.test_data_dir = None gambia_geojson, envelope_geojson = load_country_geojson("gambia") cls.boundary = Boundary("gambia", gambia_geojson, envelope_geojson) - cls.storage_backend = LocalFSStorageBackend(LOCAL_FS_PACKAGE_DATA_TOP_DIR) + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + # Ensure clean test-env + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) @classmethod def tearDownClass(cls): # Cleans processing data - try: - # Tmp and Source data - shutil.rmtree(cls.test_processing_data_dir) - # Package data - shutil.rmtree( - os.path.join(cls.storage_backend.top_level_folder_path, "gambia") - ) - except FileNotFoundError: - print("Skipped removing test data tree for", cls.__name__) + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) try: drop_natural_earth_roads_from_pg() except: @@ -118,20 +132,29 @@ def test_fetch_source(self): def test_generate(self): """E2E generate test - fetch, crop, push""" # Remove the final package artifacts (but keep the test data artifacts if they exist) - try: - shutil.rmtree( - os.path.join(self.storage_backend.top_level_folder_path, "gambia") - ) - except FileNotFoundError: - pass + clean_packages( + STORAGE_BACKEND, + self.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) prov_log = self.proc.generate() # # Assert the log contains a succesful entries self.assertTrue(prov_log[f"{self.proc.metadata.name} - crop completed"]) self.assertTrue(prov_log[f"{self.proc.metadata.name} - move to storage success"]) # # Collect the URI for the final Raster final_uri = prov_log[f"{self.proc.metadata.name} - result URI"] - # Assert the 
file exists - self.assertTrue(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR)) + if STORAGE_BACKEND == "localfs": + self.assertTrue(os.path.exists(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR))) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_exists_awss3( + s3_fs, + final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET), + ) + else: + pass # Check the datapackage thats included in the prov log self.assertIn("datapackage", prov_log.keys()) assert_datapackage_resource(prov_log["datapackage"]) diff --git a/tests/dataproc/integration/processors/test_storm.py b/tests/dataproc/integration/processors/test_storm.py index 5a00318..4d8342d 100644 --- a/tests/dataproc/integration/processors/test_storm.py +++ b/tests/dataproc/integration/processors/test_storm.py @@ -5,24 +5,31 @@ import unittest import shutil -from dataproc.backends import LocalFSStorageBackend -from dataproc import Boundary -from dataproc.processors.core.storm.global_mosaics_version_1 import ( - Processor, - Metadata, -) -from dataproc.helpers import assert_geotiff, download_file from tests.helpers import ( load_country_geojson, - assert_raster_bounds_correct, + assert_raster_output, assert_datapackage_resource, + clean_packages ) from tests.dataproc.integration.processors import ( LOCAL_FS_PROCESSING_DATA_TOP_DIR, LOCAL_FS_PACKAGE_DATA_TOP_DIR, DummyTaskExecutor, ) -from config import PACKAGES_HOST_URL +from dataproc import Boundary +from dataproc.processors.core.storm.global_mosaics_version_1 import ( + Processor, + Metadata, +) +from dataproc.helpers import assert_geotiff, download_file +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from config import ( + PACKAGES_HOST_URL, + S3_REGION, + STORAGE_BACKEND, + S3_BUCKET, +) TEST_TIF_URL = "https://zenodo.org/record/7438145/files/STORM_FIXED_RETURN_PERIODS_CMCC-CM2-VHR4_10000_YR_RP.tif" @@ -38,14 +45,29 @@ def setUpClass(cls): os.makedirs(cls.test_processing_data_dir, exist_ok=True) gambia_geojson, envelope_geojson = load_country_geojson("gambia") cls.boundary = Boundary("gambia", gambia_geojson, envelope_geojson) - cls.storage_backend = LocalFSStorageBackend(LOCAL_FS_PACKAGE_DATA_TOP_DIR) + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + # Ensure clean test-env + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) @classmethod def tearDownClass(cls): - # Tmp and Source data - shutil.rmtree(cls.test_processing_data_dir) # Package data - shutil.rmtree(os.path.join(cls.storage_backend.top_level_folder_path, "gambia")) + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) def setUp(self): self.task_executor = DummyTaskExecutor() @@ -97,12 +119,13 @@ def test_meta_init(self): def test_generate(self): """E2E generate test - fetch, crop, push""" - try: - shutil.rmtree( - os.path.join(self.storage_backend.top_level_folder_path, "gambia") - ) - except FileNotFoundError: - pass + clean_packages( + STORAGE_BACKEND, + self.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) # Fetch a single file into the source folder then limit expected files _ = download_file( TEST_TIF_URL, @@ -116,16 +139,26 @@ def test_generate(self): # 
Collect the URIs for the final Raster final_uris = prov_log[f"{self.proc.metadata.name} - result URIs"] self.assertEqual(len(final_uris.split(",")), self.proc.total_expected_files) - for final_uri in final_uris.split(","): + # Collect the original source fpaths for pixel assertion + source_fpaths = self.proc._fetch_source() + for idx, final_uri in enumerate(final_uris.split(",")): # # Assert the geotiffs are valid - assert_geotiff( - final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR) - ) - # # Assert the envelopes - assert_raster_bounds_correct( - final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), - self.boundary["envelope_geojson"], - ) + if STORAGE_BACKEND == "localfs": + assert_raster_output( + self.boundary["envelope_geojson"], + final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), + pixel_check_raster_fpath=source_fpaths[idx] + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_raster_output( + self.boundary["envelope_geojson"], + s3_fs=s3_fs, + s3_raster_fpath=final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET), + pixel_check_raster_fpath=source_fpaths[idx] + ) + else: + pass # Check the datapackage thats included in the prov log self.assertIn("datapackage", prov_log.keys()) assert_datapackage_resource(prov_log["datapackage"]) diff --git a/tests/dataproc/integration/processors/test_wri_aqueduct.py b/tests/dataproc/integration/processors/test_wri_aqueduct.py index b17e5fd..cee7bc2 100644 --- a/tests/dataproc/integration/processors/test_wri_aqueduct.py +++ b/tests/dataproc/integration/processors/test_wri_aqueduct.py @@ -5,25 +5,33 @@ import unittest import shutil -from dataproc.backends import LocalFSStorageBackend -from dataproc import Boundary -from dataproc.processors.core.wri_aqueduct.version_2 import ( - Processor, - Metadata, -) -from dataproc.helpers import assert_geotiff from tests.helpers import ( load_country_geojson, assert_raster_bounds_correct, setup_test_data_paths, - assert_datapackage_resource + assert_raster_output, + assert_datapackage_resource, + clean_packages ) from tests.dataproc.integration.processors import ( LOCAL_FS_PROCESSING_DATA_TOP_DIR, LOCAL_FS_PACKAGE_DATA_TOP_DIR, DummyTaskExecutor ) -from config import PACKAGES_HOST_URL +from dataproc import Boundary +from dataproc.helpers import tiffs_in_folder +from dataproc.processors.core.wri_aqueduct.version_2 import ( + Processor, + Metadata, +) +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from config import ( + PACKAGES_HOST_URL, + S3_REGION, + STORAGE_BACKEND, + S3_BUCKET, +) class TestWRIAqueductProcessor(unittest.TestCase): @@ -37,14 +45,31 @@ def setUpClass(cls): os.makedirs(cls.test_processing_data_dir, exist_ok=True) gambia_geojson, envelope_geojson = load_country_geojson("gambia") cls.boundary = Boundary("gambia", gambia_geojson, envelope_geojson) - cls.storage_backend = LocalFSStorageBackend(LOCAL_FS_PACKAGE_DATA_TOP_DIR) + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + # Ensure clean test-env + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) @classmethod def tearDownClass(cls): # Tmp and Source data shutil.rmtree(cls.test_processing_data_dir) # Package data - shutil.rmtree(os.path.join(cls.storage_backend.top_level_folder_path, 
"gambia")) + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) def setUp(self): self.task_executor = DummyTaskExecutor() @@ -96,12 +121,13 @@ def test_meta_init(self): def test_generate(self): """E2E generate test - fetch, crop, push""" - try: - shutil.rmtree( - os.path.join(self.storage_backend.top_level_folder_path, "gambia") - ) - except FileNotFoundError: - pass + clean_packages( + STORAGE_BACKEND, + self.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) # Limit the files to be downloaded in the fetcher self.proc.total_expected_files = 1 prov_log = self.proc.generate() @@ -110,11 +136,25 @@ def test_generate(self): # Collect the URIs for the final Raster final_uris = prov_log[f"{self.proc.metadata.name} - result URIs"] self.assertEqual(len(final_uris.split(",")), self.proc.total_expected_files) - for final_uri in final_uris.split(","): - # # Assert the geotiffs are valid - assert_geotiff(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR)) - # # Assert the envelopes - assert_raster_bounds_correct(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), self.boundary["envelope_geojson"]) + # Collect the original source fpaths for pixel assertion + source_tiffs = tiffs_in_folder(self.proc.source_folder) + for idx, final_uri in enumerate(final_uris.split(",")): + if STORAGE_BACKEND == "localfs": + assert_raster_output( + self.boundary["envelope_geojson"], + final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), + pixel_check_raster_fpath=os.path.join(self.proc.source_folder, source_tiffs[idx]) + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_raster_output( + self.boundary["envelope_geojson"], + s3_fs=s3_fs, + s3_raster_fpath=final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET), + pixel_check_raster_fpath=os.path.join(self.proc.source_folder, source_tiffs[idx]) + ) + else: + pass # Check the datapackage thats included in the prov log self.assertIn("datapackage", prov_log.keys()) assert_datapackage_resource(prov_log['datapackage']) diff --git a/tests/dataproc/integration/processors/test_wri_powerplants.py b/tests/dataproc/integration/processors/test_wri_powerplants.py index a0e9511..9eb7e19 100644 --- a/tests/dataproc/integration/processors/test_wri_powerplants.py +++ b/tests/dataproc/integration/processors/test_wri_powerplants.py @@ -5,23 +5,32 @@ import unittest import shutil -from dataproc.backends import LocalFSStorageBackend -from dataproc import Boundary -from dataproc.processors.core.wri_powerplants.version_130 import ( - Processor, - Metadata, -) -from dataproc.helpers import assert_vector_file from tests.helpers import ( + assert_exists_awss3, load_country_geojson, assert_datapackage_resource, + clean_packages, + assert_vector_output ) from tests.dataproc.integration.processors import ( LOCAL_FS_PROCESSING_DATA_TOP_DIR, LOCAL_FS_PACKAGE_DATA_TOP_DIR, DummyTaskExecutor ) -from config import PACKAGES_HOST_URL +from dataproc import Boundary +from dataproc.processors.core.wri_powerplants.version_130 import ( + Processor, + Metadata, +) +from dataproc.backends.storage import init_storage_backend +from dataproc.backends.storage.awss3 import S3Manager +from config import ( + PACKAGES_HOST_URL, + S3_REGION, + STORAGE_BACKEND, + S3_BUCKET, + TEST_GRI_OSM +) class TestWRIPowerplantsProcessor(unittest.TestCase): @@ -35,14 +44,31 @@ def setUpClass(cls): 
os.makedirs(cls.test_processing_data_dir, exist_ok=True) gambia_geojson, envelope_geojson = load_country_geojson("gambia") cls.boundary = Boundary("gambia", gambia_geojson, envelope_geojson) - cls.storage_backend = LocalFSStorageBackend(LOCAL_FS_PACKAGE_DATA_TOP_DIR) + cls.storage_backend = init_storage_backend(STORAGE_BACKEND) + # Ensure clean test-env + # Tmp and Source data + shutil.rmtree(cls.test_processing_data_dir, ignore_errors=True) + # Package data + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) @classmethod def tearDownClass(cls): # Tmp and Source data - shutil.rmtree(cls.test_processing_data_dir) + shutil.rmtree(cls.test_processing_data_dir, ignore_errors=True) # Package data - shutil.rmtree(os.path.join(cls.storage_backend.top_level_folder_path, "gambia")) + clean_packages( + STORAGE_BACKEND, + cls.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) def setUp(self): self.task_executor = DummyTaskExecutor() @@ -94,12 +120,13 @@ def test_meta_init(self): def test_generate(self): """E2E generate test - fetch, crop, push""" - try: - shutil.rmtree( - os.path.join(self.storage_backend.top_level_folder_path, "gambia") - ) - except FileNotFoundError: - pass + clean_packages( + STORAGE_BACKEND, + self.storage_backend, + s3_bucket=S3_BUCKET, + s3_region=S3_REGION, + packages=["gambia"], + ) # Limit the files to be downloaded in the fetcher self.proc.total_expected_files = 1 prov_log = self.proc.generate() @@ -112,14 +139,23 @@ def test_generate(self): # # Collect the URI for the final Raster final_uri = prov_log[f"{self.proc.metadata.name} - result URI"] # Assert the file exists - self.assertTrue( - final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR) - ) - assert_vector_file( - final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), - expected_shape=(2, 37), - expected_crs="EPSG:4326", - ) + if STORAGE_BACKEND == "localfs": + self.assertTrue(os.path.exists(final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR))) + assert_vector_output( + expected_shape=(2, 37), + expected_crs="EPSG:4326", + local_vector_fpath=final_uri.replace(PACKAGES_HOST_URL, LOCAL_FS_PACKAGE_DATA_TOP_DIR), + ) + elif STORAGE_BACKEND == "awss3": + with S3Manager(*self.storage_backend._parse_env(), region=S3_REGION) as s3_fs: + assert_vector_output( + expected_shape=(2, 37), + expected_crs="EPSG:4326", + s3_fs=s3_fs, + s3_vector_fpath=final_uri.replace(PACKAGES_HOST_URL, S3_BUCKET) + ) + else: + pass # Check the datapackage thats included in the prov log self.assertIn("datapackage", prov_log.keys()) assert_datapackage_resource(prov_log["datapackage"]) diff --git a/tests/dataproc/unit/processors/test_env.py b/tests/dataproc/unit/processors/test_env.py index 249f8b9..c4c5ddd 100644 --- a/tests/dataproc/unit/processors/test_env.py +++ b/tests/dataproc/unit/processors/test_env.py @@ -1,7 +1,9 @@ """ Test Processor Python environment """ +import os import unittest +from subprocess import check_call class TestProcessorEnv(unittest.TestCase): """""" @@ -14,4 +16,12 @@ def test_imports(self): import celery import shapely import pyproj - import rasterio \ No newline at end of file + import rasterio + import pyarrow as pa + import geopandas as gp + import psycopg2 + + def test_commands(self): + """""" + self.assertEqual(check_call(['gdalwarp', '--version']), 0) + self.assertEqual(check_call(['openssl', 'sha1', f'{os.path.abspath(__file__)}']), 0) \ No newline at end 
of file diff --git a/tests/dataproc/unit/test_localfs_backend.py b/tests/dataproc/unit/test_localfs_backend.py deleted file mode 100644 index 16df41e..0000000 --- a/tests/dataproc/unit/test_localfs_backend.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Unit tests for Dataproc classes -""" -import os -import unittest - -from dataproc.backends.storage.localfs import LocalFSStorageBackend -from tests.helpers import create_tree, remove_tree - -LOCAL_FS_DATA_TOP_DIR = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), - "data", - "packages", -) - - -class TestLocalFSBackend(unittest.TestCase): - """""" - - def setUp(self): - self.backend = LocalFSStorageBackend(LOCAL_FS_DATA_TOP_DIR) - - - def expected_fs_structure(self): - """ - The expected initial FS structure - """ - return { - "gambia": { - "aqueduct": ["0.1"], - "biodiversity": ["version_1"], - "osm_roads": ["20221201"], - }, - "zambia": {"osm_roads": ["20230401"]}, - } - - def test_init(self): - """Initialisation of the backend and methods available""" - self.assertIsInstance(self.backend, LocalFSStorageBackend) - self.assertEqual(self.backend.top_level_folder_path, LOCAL_FS_DATA_TOP_DIR) - self.assertTrue( - hasattr(self.backend, "tree") and callable(getattr(self.backend, "tree")) - ) - - def test_tree(self): - """Test Generation of the package / dataset / version structure""" - create_tree(LOCAL_FS_DATA_TOP_DIR) - tree = self.backend.tree() - self.assertDictEqual(tree, self.expected_fs_structure()) - remove_tree(LOCAL_FS_DATA_TOP_DIR) diff --git a/tests/helpers.py b/tests/helpers.py index 9b4f25f..e0c42bb 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -7,15 +7,21 @@ import json from typing import Any, List, Tuple import shutil +from time import sleep, time import sqlalchemy as sa import rasterio import shapely from shapely.ops import transform import pyproj +from pyarrow import fs +from pyarrow.fs import S3FileSystem, LocalFileSystem +import numpy as np from config import get_db_uri_sync, API_POSTGRES_DB, INTEGRATION_TEST_ENDPOINT from api import db +from dataproc.helpers import assert_geotiff, assert_vector_file, sample_geotiff, sample_geotiff_coords +from dataproc.backends.storage.awss3 import S3Manager, AWSS3StorageBackend current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) parent_dir = os.path.dirname(os.path.dirname(current_dir)) @@ -28,11 +34,13 @@ engine = sa.create_engine(db_uri, pool_pre_ping=True) -def wipe_db(): +def wipe_db(setup_tables=True): """Wipe all SQLA Tables in the DB""" db_uri = get_db_uri_sync(API_POSTGRES_DB) # Init DB and Load via SA engine = sa.create_engine(db_uri, pool_pre_ping=True) + if setup_tables: + db.Base.metadata.create_all(engine) for tbl in reversed(db.Base.metadata.sorted_tables): engine.execute(tbl.delete()) @@ -113,10 +121,10 @@ def create_tree( top_level_path: str, packages: list = ["gambia", "zambia"], datasets: list = ["aqueduct", "biodiversity", "osm_roads"], - wipe_existing: bool = True + wipe_existing: bool = True, ): """ - Create a fake tree so we can check reading packages + Create a fake tree in local FS so we can check reading packages """ # Generate the datapackage.jsons for package in packages: @@ -177,12 +185,209 @@ def create_tree( def remove_tree(top_level_path: str, packages=["gambia", "zambia"]): """ - Cleanup the test tree + Cleanup the test tree from local FS """ for package in packages: shutil.rmtree(os.path.join(top_level_path, package), ignore_errors=True) +def create_tree_awss3( + s3_fs: 
S3FileSystem, + bucket: str, + packages: list = ["gambia", "zambia"], + datasets: list = ["aqueduct", "biodiversity", "osm_roads"], + wipe_existing: bool = True, +): + """ + Create a fake tree in local FS so we can check reading packages + """ + # Generate the datapackage.jsons + for package in packages: + if wipe_existing is True: + try: + s3_fs.delete_dir(os.path.join(bucket, package)) + except FileNotFoundError: + pass + s3_fs.create_dir(os.path.join(bucket, package)) + dp = gen_datapackage(package, datasets) + dp_fpath = os.path.join(bucket, package, "datapackage.json") + with s3_fs.open_output_stream(dp_fpath) as stream: + stream.write(json.dumps(dp).encode()) + + if "gambia" in packages: + if "noexist" in datasets: + # An invalid processor or dataset was placed in the tree + s3_fs.create_dir(os.path.join(bucket, "gambia", "datasets", "noexist")) + if "aqueduct" in datasets: + s3_fs.create_dir( + os.path.join(bucket, "gambia", "datasets", "aqueduct", "0.1") + ) + if "biodiversity" in datasets: + s3_fs.create_dir( + os.path.join(bucket, "gambia", "datasets", "biodiversity", "version_1") + ) + if "osm_roads" in datasets: + s3_fs.create_dir( + os.path.join(bucket, "gambia", "datasets", "osm_roads", "20221201") + ) + if "natural_earth_raster" in datasets: + s3_fs.create_dir( + os.path.join( + bucket, + "gambia", + "datasets", + "natural_earth_raster", + "version_1", + ) + ) + if "zambia" in packages: + if "osm_roads" in datasets: + s3_fs.create_dir( + os.path.join(bucket, "zambia", "datasets", "osm_roads", "20230401") + ) + + +def remove_tree_awss3( + s3_fs: S3FileSystem, bucket: str, packages: list = ["gambia", "zambia"] +): + """Remove a tree from aws s3 backend""" + for package in packages: + s3_fs.delete_dir(os.path.join(bucket, package)) + + +def clean_packages( + backend_type: str, + storage_backend: Any, + s3_bucket: str = None, + s3_region="eu-west-2", + packages=["gambia"] +): + """Remove packages used in a test""" + max_wait = 60 + start = time() + try: + if backend_type == "awss3": + with S3Manager(*storage_backend._parse_env(), region=s3_region) as s3_fs: + remove_tree_awss3(s3_fs, s3_bucket, packages=packages) + while True: + existing_packages = storage_backend.packages() + if any([True for i in existing_packages if i in packages]): + sleep(0.5) + else: + break + if (time()-start) > max_wait: + raise Exception("timed out waiting for packages to be deleted") + elif backend_type == "localfs": + remove_tree(storage_backend.top_level_folder_path, packages=packages) + else: + print("unknown backend type:", backend_type) + except FileNotFoundError: + pass + +def assert_vector_output( + expected_shape: tuple, + expected_crs: str, + local_vector_fpath: str=None, + s3_fs: S3FileSystem = None, + s3_vector_fpath: str = None, + tmp_folder: str = None, +): + """ + Wrapper for assert vector file with support for fetching from S3 + """ + if s3_fs and s3_vector_fpath: + if not tmp_folder: + local_vector_fpath = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "data", + "processing", + os.path.basename(s3_vector_fpath), + ) + else: + local_vector_fpath = os.path.join( + tmp_folder, os.path.basename(s3_vector_fpath) + ) + fs.copy_files( + s3_vector_fpath, + local_vector_fpath, + source_filesystem=s3_fs, + destination_filesystem=fs.LocalFileSystem(), + ) + assert_vector_file( + local_vector_fpath, + expected_shape, + expected_crs=expected_crs, + ) + +def assert_raster_output( + envelope: dict, + localfs_raster_fpath: str = None, + s3_fs: S3FileSystem = None, + s3_raster_fpath: str 
= None, + check_crs: str = "EPSG:4326", + check_compression=True, + tolerence: float = 0.1, + tmp_folder: str = None, + check_is_bigtiff: bool=False, + pixel_check_raster_fpath: str = None, + pixel_check_num_samples: int = 100 +): + """ + Wrapper for assert_geotiff and assert_raster_bounds_correct + which asserts either local or S3 source results + if localfs_raster_fpath is provided then local source will be assumed + + if s3_fs and s3_raster_fpath are provided then requested source + will be pulled locally before assertions. + + ::kwarg pixel_check_raster_fpath str + If this kwarg is set then pixels will be sampled from the raster at localfs_raster_fpath + and compared to pisels in the raster at pixel_check_raster_fpath + """ + try: + if s3_fs and s3_raster_fpath: + if not tmp_folder: + localfs_raster_fpath = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "data", + "processing", + os.path.basename(s3_raster_fpath), + ) + else: + localfs_raster_fpath = os.path.join( + tmp_folder, os.path.basename(s3_raster_fpath) + ) + fs.copy_files( + s3_raster_fpath, + localfs_raster_fpath, + source_filesystem=s3_fs, + destination_filesystem=fs.LocalFileSystem(), + ) + if pixel_check_raster_fpath is not None: + # Collect sample and coords from the first raster, then sample second raster + src_coords = sample_geotiff_coords(localfs_raster_fpath, pixel_check_num_samples) + _, expected_samples = sample_geotiff(pixel_check_raster_fpath, coords=src_coords) + else: + src_coords = None + expected_samples = None + assert_geotiff( + localfs_raster_fpath, + check_crs=check_crs, + check_compression=check_compression, + check_is_bigtiff=check_is_bigtiff, + check_pixel_coords=src_coords, + check_pixel_expected_samples=expected_samples + ) + assert_raster_bounds_correct( + localfs_raster_fpath, envelope, tolerence=tolerence + ) + finally: + # Clean local S3 artifacts + if s3_fs and s3_raster_fpath: + if os.path.exists(localfs_raster_fpath): + os.remove(localfs_raster_fpath) + + def assert_raster_bounds_correct( raster_fpath: str, envelope: dict, tolerence: float = 0.1 ): @@ -222,12 +427,20 @@ def assert_raster_bounds_correct( ), f"bounds {src.bounds.bottom} did not match expected {min(y_coords)} within tolerence {tolerence}" -def assert_package( - top_level_fpath: str, boundary_name: str -): +def assert_exists_awss3(s3_fs: S3FileSystem, s3_raster_fpath: str): + """ + Check if a given file exists on the s3 filessytem + """ + chk = s3_fs.get_file_info(s3_raster_fpath) + assert ( + chk.type != fs.FileType.NotFound + ), f"file was not found on S3 {s3_raster_fpath}" + + +def assert_package(top_level_fpath: str, boundary_name: str): """Assert integrity of a package and datasets contained within - This does not assert the integrity of actualy data files (raster/vector); - just the folder structure + This does not assert the integrity of actualy data files (raster/vector); + just the folder structure """ required_top_level_docs = [ "index.html", @@ -256,6 +469,34 @@ def assert_package( os.path.join(top_level_fpath, boundary_name, doc) ), f"top-level {doc} missing" +def assert_package_awss3(awss3_backend: AWSS3StorageBackend, boundary_name: str, expected_processor_versions: List=[]): + """Assert integrity of a package and datasets contained within (on S3) + This does not assert the integrity of actualy data files (raster/vector); + just the folder structure + """ + required_top_level_docs = [ + "index.html", + "license.html", + "version.html", + "provenance.json", + "datapackage.json", + ] + packages = 
awss3_backend._list_directories(awss3_backend._build_absolute_path(""))
+    assert (
+        boundary_name in packages
+    ), f"{boundary_name} missing in package S3 root: {packages}"
+
+    # Ensure the top-level index and other docs exist
+    for doc in required_top_level_docs:
+        assert awss3_backend.boundary_file_exists(
+            boundary_name, doc
+        ), f"package {boundary_name} is missing a top-level file: {doc}"
+
+    # Check we have folders for the expected processor versions
+    for proc_version in expected_processor_versions:
+        proc, version = proc_version.split('.')
+        s3_versions = awss3_backend.dataset_versions(boundary_name, proc)
+        assert version in s3_versions, f"{version} not found in dataset {s3_versions} for processor {proc}"
 
 def assert_table_in_pg(db_uri: str, tablename: str):
     """Check a given table exists in PG"""
@@ -284,10 +525,11 @@ def assert_datapackage_resource(dp_resource: dict):
     assert "name" in dp_resource.keys(), "datapackage missing name"
     assert isinstance(dp_resource["path"], list), "datapackage path not a list"
     assert isinstance(dp_resource["hashes"], list), "datapackage hashes not a list"
-    assert isinstance(dp_resource["bytes"], int), f"datapackage bytes {dp_resource['bytes']} not a int was {type(dp_resource['bytes'])}"
-    assert (
-        len(dp_resource["path"])
-        == len(dp_resource["hashes"])
+    assert isinstance(
+        dp_resource["bytes"], int
+    ), f"datapackage bytes {dp_resource['bytes']} not a int was {type(dp_resource['bytes'])}"
+    assert len(dp_resource["path"]) == len(
+        dp_resource["hashes"]
     ), f"datapackage path and hashes must be the same length {len(dp_resource['path'])}, {len(dp_resource['hashes'])}"
     assert isinstance(dp_resource["license"], dict), "datapackage license must be dict"
     assert (