deploy: Migrate GADM geometry data to PostGIS-enabled PostgreSQL DB.
- Switch Docker base image from postgres to postgis for spatial support.
- Refactor init-regions-table.py script to use OGR for reading GeoPackage and
  populating PostGIS geometries.
- Adjust the pinned GDAL version in requirements.txt (3.6.2 → 3.2.2) for compatibility.
- Update Makefile to conditionally build the DB only if 'regions' table doesn't
  exist.

This commit enables the migration of geometry data from the GADM GeoPackage
file to a PostGIS-enabled PostgreSQL database. It sets the stage for future
spatial operations, including aggregating region geometries.

Signed-off-by: Nikolay Martyanov <ohmspectator@gmail.com>
OhmSpectator committed Oct 24, 2023
1 parent 494aec9 commit 9e7927a
Showing 4 changed files with 85 additions and 75 deletions.
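Before the per-file diffs: the heart of the refactor is swapping the sqlite3/fiona reading path for OGR. A minimal sketch of the reading pattern the new script adopts (the file path is a placeholder, not a file shipped with this commit):

# Sketch of the OGR reading pattern used by the refactored init script.
# "gadm.gpkg" is a placeholder path.
from osgeo import ogr

ds = ogr.Open("gadm.gpkg")
if ds is None:
    raise SystemExit("Could not open GeoPackage")
lyr = ds.GetLayerByIndex(0)
for feature in lyr:
    uid = feature.GetField("UID")
    wkb = feature.GetGeometryRef().ExportToWkb()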
2 changes: 1 addition & 1 deletion deployment/Dockerfile
@@ -1,4 +1,4 @@
FROM postgres:16.0
FROM postgis/postgis:16-3.4

# Prepare apt-get
RUN apt-get update
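The postgis/postgis image enables the PostGIS extension in the default database during first-time initialization, so no explicit CREATE EXTENSION step is added here. A quick sanity check once the container is up could look like this sketch; the connection parameters are placeholders for the values read from the .env files:

# Sketch: confirm spatial support is active in the new base image.
# Host, database, user and password below are placeholders, not repo values.
import psycopg2

conn = psycopg2.connect(host="localhost", dbname="tyr", user="tyr", password="secret")
cur = conn.cursor()
cur.execute("SELECT PostGIS_Full_Version()")
print(cur.fetchone()[0])
cur.close()
conn.close()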
4 changes: 3 additions & 1 deletion deployment/Makefile
@@ -33,10 +33,12 @@ check-gadm-file:

# ---- Docker Database Commands ----
build-db: check-env
# Check if the database is already built
@$(DC) images | grep tyr-db >/dev/null || \
$(DC) build tyr-db

run-db:
$(DC) up tyr-db -d
$(DC) up -d tyr-db

init-db: check-gadm-file build-db run-db
@echo "Database initialized and GPKG file imported."
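The hunk above guards the build on an existing tyr-db image; the table-level guard the commit message describes could be sketched from Python as follows (the connection parameters are placeholders for the values read from the .env files):

# Hypothetical check for whether the 'regions' table already exists.
# Credentials are placeholders, not values from this repo.
import psycopg2

conn = psycopg2.connect(host="localhost", dbname="tyr", user="tyr", password="secret")
cur = conn.cursor()
cur.execute("SELECT to_regclass('public.regions')")
print("regions table exists:", cur.fetchone()[0] is not None)
cur.close()
conn.close()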
152 changes: 80 additions & 72 deletions deployment/init-db/init-regions-table.py
@@ -1,11 +1,10 @@
from datetime import datetime
import os
import sqlite3
import sys

from dotenv import load_dotenv
import fiona
import psycopg2
from osgeo import ogr

# Check that the GeoPackage file was provided
if len(sys.argv) < 2:
@@ -14,8 +13,11 @@

# Connect to the GeoPackage as an SQLite database
gadm_file = sys.argv[1]
conn_gpkg = sqlite3.connect(gadm_file)
cur_gpkg = conn_gpkg.cursor()

# Check that the GeoPackage file exists
if not os.path.exists(gadm_file):
    print(f"Error: GeoPackage file {gadm_file} does not exist")
    sys.exit(1)

# Read the DB credentials from .env files.
# The order of the files is important, as the variables are overwritten in the
@@ -48,114 +50,122 @@
name VARCHAR(255) NOT NULL,
parent_region_id INTEGER REFERENCES regions(id),
has_subregions BOOLEAN NOT NULL,
gadm_uid INTEGER
gadm_uid INTEGER,
geom GEOMETRY(MULTIPOLYGON, 4326)
)
""")

# Open the GeoPackage file
ds = ogr.Open(gadm_file)
# Check that the dataset could be opened
if ds is None:
    print("Could not open GeoPackage")
    sys.exit(1)

# Get the layer name from the GeoPackage
layers = fiona.listlayers(gadm_file)
layer_count = ds.GetLayerCount()
layers = [ds.GetLayerByIndex(i).GetName() for i in range(layer_count)]

# We expect only one layer in the GeoPackage, print a warning if there are more
if len(layers) != 1:
    print(f"Warning: Expected only one layer in GeoPackage, found {len(layers)}. Using first layer.")

layer_name = layers[0]

# Open the layer by name
lyr = ds.GetLayerByName(layer_name)

# Number of levels in the GADM hierarchy
num_levels = 6

# Update the columns list to include the predefined levels
# Update the properties list to include the predefined levels
predefined_levels = ['CONTINENT', 'SUBCONT', 'SOVEREIGN', 'COUNTRY', 'GOVERNEDBY', 'REGION']
columns = predefined_levels + [f'GID_{i}' for i in range(num_levels)] + [f'NAME_{i}' for i in range(num_levels)] + ['UID']

# Fetch the relevant columns from the GeoPackage
cur_gpkg.execute(f"SELECT {','.join(columns)} FROM {layer_name}")
rows = cur_gpkg.fetchall()
# List of all properties requested from the GeoPackage
properties = predefined_levels + [f'GID_{i}' for i in range(num_levels)] + [f'NAME_{i}' for i in range(num_levels)] + ['UID', 'geom']

# Now you can access the predefined levels in your row_dict as they are part of the columns list
# List of geographical levels
geo_levels = predefined_levels + [f'NAME_{i}' for i in range(num_levels)]

# Get the number of rows in the GeoPackage
num_rows = len(rows)
print(f"Processing {num_rows} rows...")
# Get the number of digits in the number of rows, to format the progress message
max_row_digits = len(str(num_rows))
# Prepare for looping through features
num_features = lyr.GetFeatureCount()

# Print a progress message every 1% of rows
rows_in_one_percent = int(num_rows / 100)
print(f"Processing {num_features} features...")
# Get the number of digits in the number of features, to format the progress message
max_feature_digits = len(str(num_features))

# Print a progress message every 1% of features
features_in_one_percent = int(num_features / 100)

timestamp = datetime.now()
timestamp_start = timestamp

last_valid_parent_region_id = None # Variable to remember the last valid parent ID


def find_next_non_empty_level(idx, row_dict, geo_levels):
# Function to find the next non-empty level in the hierarchy
def find_next_non_empty_level(idx, feature, geo_levels):
    for next_idx in range(idx + 1, len(geo_levels)):
        next_level_key = geo_levels[next_idx]
        if row_dict.get(next_level_key, '') != '':
        if feature.GetField(next_level_key):
            return next_level_key
    return None  # Return None if no non-empty level is found
    return None


existing_names = {} # Dictionary to track existing regions

# Loop through the rows from the SQLite cursor
for i, row in enumerate(rows):
    if i % rows_in_one_percent == 0 and i > 0:
        # Print a progress message every 1% of rows and timestamp, how long it took
for i, feature in enumerate(lyr):
    if i % features_in_one_percent == 0 and i > 0:
        # Print a progress message every 1% of features and timestamp, how long it took
        time_now = datetime.now()
        time_diff = (time_now - timestamp).total_seconds()
        total_time_diff = (time_now - timestamp_start).total_seconds()
        estimated_time_left = (total_time_diff / (i / num_rows)) - total_time_diff
        print(f"Handled {int(i / rows_in_one_percent):3d}% ({i:{max_row_digits}} rows) - last batch in {time_diff:.2f} seconds. Estimated time left: {estimated_time_left:.2f} seconds")
        estimated_time_left = (total_time_diff / (i / num_features)) - total_time_diff
        print(f"Handled {int(i / features_in_one_percent):3d}% ({i:{max_feature_digits}} features) - last batch in {time_diff:.2f} seconds. Estimated time left: {estimated_time_left:.2f} seconds")
        timestamp = datetime.now()

    row_dict = {}
    for column, value in zip(columns, row):
        row_dict[column] = value
    uid = feature.GetField('UID')

    # Reset parent_region_id for each new row
    # Reset parent_region_id for each new feature
    parent_region_id = None
    last_valid_parent_region_id = None  # Variable to remember the last valid parent ID
    path_parts = []  # List to build up the path for the current region

    # Process each geographical level for the current row
    # Process each geographical level for the current feature
    for idx, level in enumerate(geo_levels):
        if row_dict.get(level) is None:
        name = feature.GetField(level)
        if not name:
            continue
        name = row_dict[level]
        if name:
            path_parts.append(name)  # Add the name to the path_parts list if it's not empty
            key = "_".join(path_parts)  # Build the unique key from the path_parts list

            # Determine if the current region has subregions
            next_level = find_next_non_empty_level(idx, row_dict, geo_levels)  # Find the next non-empty level
            has_subregions = next_level is not None  # Check if a non-empty level was found

            # We assign uid to the region, if it is a real GADM region, not a region we created to fill the
            # hierarchy. Marker for this is that it's the last level, and it has no subregions. For the created
            # regions, we don't have a uid, so we set it to None
            if has_subregions:
                uid = None
            else:
                uid = row_dict.get('UID')
                if uid is None:
                    print("Warning: UID is None for region: ", key)

            if key not in existing_names:
                # If the region doesn't exist, create it
                # Use last_valid_parent_region_id as the parent_region_id for the current level
                query = "INSERT INTO regions (name, has_subregions, parent_region_id, gadm_uid) VALUES (%s, %s, %s, %s) RETURNING id"
                params = (name, has_subregions, last_valid_parent_region_id, uid)
                cur_pg.execute(query, params)
                region_id = cur_pg.fetchone()[0]
                existing_names[key] = region_id
            else:
                # If the region already exists, get its ID
                region_id = existing_names[key]

            # Update last_valid_parent_region_id for the next level
            last_valid_parent_region_id = region_id
        path_parts.append(name)  # Add the name to the path_parts list if it's not empty
        key = "_".join(path_parts)  # Build the unique key from the path_parts list

        # Determine if the current region has subregions
        next_level = find_next_non_empty_level(idx, feature, geo_levels)  # Find the next non-empty level
        has_subregions = next_level is not None  # Check if a non-empty level was found

        # We assign a uid and geometry to the region only if it is a real GADM region, not a region we
        # created to fill the hierarchy. The marker for this is that it is the last level, with no
        # subregions. For the created regions, we don't have a uid or geometry, so we set them to None.
        if has_subregions:
            uid = None
            geom = None
        else:
            uid = feature.GetField('UID')  # re-read: an earlier synthesized level may have set uid to None
            geom = feature.GetGeometryRef().ExportToWkb()

        if key not in existing_names:
            query = """
                INSERT INTO regions (name, has_subregions, parent_region_id, gadm_uid, geom)
                VALUES (%s, %s, %s, %s, ST_GeomFromWKB(%s, 4326))
                RETURNING id
            """
            params = (name, has_subregions, last_valid_parent_region_id, uid, geom)
            cur_pg.execute(query, params)
            region_id = cur_pg.fetchone()[0]
            existing_names[key] = region_id
        else:
            # If the region already exists, get its ID
            region_id = existing_names[key]

        # Update last_valid_parent_region_id for the next level
        last_valid_parent_region_id = region_id

print("Done, in total: ", datetime.now() - timestamp_start)

@@ -166,7 +176,5 @@ def find_next_non_empty_level(idx, row_dict, geo_levels):

# Commit the changes and close the connections
conn_pg.commit()
cur_gpkg.close()
conn_gpkg.close()
cur_pg.close()
conn_pg.close()
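With geometries stored on the leaf regions, the aggregation the commit message anticipates reduces to a SQL query. A sketch against the schema created above (the parent id is an example value, and the connection parameters are placeholders):

# Sketch: build a parent region's geometry as the union of its children's.
# Table and column names follow the script above; parent id 1 is an example.
import psycopg2

conn = psycopg2.connect(host="localhost", dbname="tyr", user="tyr", password="secret")
cur = conn.cursor()
cur.execute("""
    SELECT ST_AsText(ST_Multi(ST_Union(geom)))
    FROM regions
    WHERE parent_region_id = %s AND geom IS NOT NULL
""", (1,))
print(cur.fetchone()[0])
cur.close()
conn.close()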
2 changes: 1 addition & 1 deletion deployment/init-db/requirements.txt
@@ -1,4 +1,4 @@
psycopg2-binary==2.9.9
python-dotenv==1.0.0
GDAL==3.6.2
GDAL==3.2.2
Fiona==1.9.4.post1
