deploy: Migrate GADM geometry data to PostGIS-enabled PostgreSQL DB.
- Switch Docker base image from postgres to postgis for spatial support.
- Refactor init-regions-table.py script to use OGR for reading GeoPackage and
  populating PostGIS geometries.
- Adjust the pinned GDAL version in requirements.txt (3.6.2 → 3.2.2) for compatibility.
- Update Makefile to conditionally build the DB only if 'regions' table doesn't
  exist.

This commit enables the migration of geometry data from the GADM GeoPackage
file to a PostGIS-enabled PostgreSQL database. It sets the stage for future
spatial operations, including aggregating region geometries.

Signed-off-by: Nikolay Martyanov <ohmspectator@gmail.com>
OhmSpectator committed Oct 24, 2023
1 parent 494aec9 commit 9e7927a
Showing 4 changed files with 85 additions and 75 deletions.
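Before the per-file diffs: the heart of the refactor is swapping the sqlite3/fiona reading path for OGR. A minimal sketch of the reading pattern the new script adopts (the file path is a placeholder, not a file shipped with this commit):

# Sketch of the OGR reading pattern used by the refactored init script.
# "gadm.gpkg" is a placeholder path.
from osgeo import ogr

ds = ogr.Open("gadm.gpkg")
if ds is None:
    raise SystemExit("Could not open GeoPackage")
lyr = ds.GetLayerByIndex(0)
for feature in lyr:
    uid = feature.GetField("UID")
    wkb = feature.GetGeometryRef().ExportToWkb()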
2 changes: 1 addition & 1 deletion deployment/Dockerfile
@@ -1,4 +1,4 @@
FROM postgres:16.0
FROM postgis/postgis:16-3.4

# Prepare apt-get
RUN apt-get update
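The postgis/postgis image enables the PostGIS extension in the default database during first-time initialization, so no explicit CREATE EXTENSION step is added here. A quick sanity check once the container is up could look like this sketch; the connection parameters are placeholders for the values read from the .env files:

# Sketch: confirm spatial support is active in the new base image.
# Host, database, user and password below are placeholders, not repo values.
import psycopg2

conn = psycopg2.connect(host="localhost", dbname="tyr", user="tyr", password="secret")
cur = conn.cursor()
cur.execute("SELECT PostGIS_Full_Version()")
print(cur.fetchone()[0])
cur.close()
conn.close()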
4 changes: 3 additions & 1 deletion deployment/Makefile
@@ -33,10 +33,12 @@ check-gadm-file:

# ---- Docker Database Commands ----
build-db: check-env
# Check if the database is already built
@$(DC) images | grep tyr-db >/dev/null || \
$(DC) build tyr-db

run-db:
$(DC) up tyr-db -d
$(DC) up -d tyr-db

init-db: check-gadm-file build-db run-db
@echo "Database initialized and GPKG file imported."
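The hunk above guards the build on an existing tyr-db image; the table-level guard the commit message describes could be sketched from Python as follows (the connection parameters are placeholders for the values read from the .env files):

# Hypothetical check for whether the 'regions' table already exists.
# Credentials are placeholders, not values from this repo.
import psycopg2

conn = psycopg2.connect(host="localhost", dbname="tyr", user="tyr", password="secret")
cur = conn.cursor()
cur.execute("SELECT to_regclass('public.regions')")
print("regions table exists:", cur.fetchone()[0] is not None)
cur.close()
conn.close()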
152 changes: 80 additions & 72 deletions deployment/init-db/init-regions-table.py
@@ -1,11 +1,10 @@
from datetime import datetime
import os
import sqlite3
import sys

from dotenv import load_dotenv
import fiona
import psycopg2
from osgeo import ogr

# Check that the GeoPackage file was provided
if len(sys.argv) < 2:
@@ -14,8 +13,11 @@

# Connect to the GeoPackage as an SQLite database
gadm_file = sys.argv[1]
conn_gpkg = sqlite3.connect(gadm_file)
cur_gpkg = conn_gpkg.cursor()

# Check that the GeoPackage file exists
if not os.path.exists(gadm_file):
    print(f"Error: GeoPackage file {gadm_file} does not exist")
    sys.exit(1)

# Read the DB credentials from .env files.
# The order of the files is important, as the variables are overwritten in the
@@ -48,114 +50,122 @@
name VARCHAR(255) NOT NULL,
parent_region_id INTEGER REFERENCES regions(id),
has_subregions BOOLEAN NOT NULL,
gadm_uid INTEGER
gadm_uid INTEGER,
geom GEOMETRY(MULTIPOLYGON, 4326)
)
""")

# Open the GeoPackage file
ds = ogr.Open(gadm_file)
# Check that the dataset could be opened
if ds is None:
    print("Could not open GeoPackage")
    sys.exit(1)

# Get the layer name from the GeoPackage
layers = fiona.listlayers(gadm_file)
layer_count = ds.GetLayerCount()
layers = [ds.GetLayerByIndex(i).GetName() for i in range(layer_count)]

# We expect only one layer in the GeoPackage, print a warning if there are more
if len(layers) != 1:
    print(f"Warning: Expected only one layer in GeoPackage, found {len(layers)}. Using first layer.")

layer_name = layers[0]

# Open the layer by name
lyr = ds.GetLayerByName(layer_name)

# Number of levels in the GADM hierarchy
num_levels = 6

# Update the columns list to include the predefined levels
# Update the properties list to include the predefined levels
predefined_levels = ['CONTINENT', 'SUBCONT', 'SOVEREIGN', 'COUNTRY', 'GOVERNEDBY', 'REGION']
columns = predefined_levels + [f'GID_{i}' for i in range(num_levels)] + [f'NAME_{i}' for i in range(num_levels)] + ['UID']

# Fetch the relevant columns from the GeoPackage
cur_gpkg.execute(f"SELECT {','.join(columns)} FROM {layer_name}")
rows = cur_gpkg.fetchall()
# List of all properties requested from the GeoPackage
properties = predefined_levels + [f'GID_{i}' for i in range(num_levels)] + [f'NAME_{i}' for i in range(num_levels)] + ['UID', 'geom']

# Now you can access the predefined levels in your row_dict as they are part of the columns list
# List of geographical levels
geo_levels = predefined_levels + [f'NAME_{i}' for i in range(num_levels)]

# Get the number of rows in the GeoPackage
num_rows = len(rows)
print(f"Processing {num_rows} rows...")
# Get the number of digits in the number of rows, to format the progress message
max_row_digits = len(str(num_rows))
# Prepare for looping through features
num_features = lyr.GetFeatureCount()

# Print a progress message every 1% of rows
rows_in_one_percent = int(num_rows / 100)
print(f"Processing {num_features} features...")
# Get the number of digits in the number of features, to format the progress message
max_feature_digits = len(str(num_features))

# Print a progress message every 1% of features
features_in_one_percent = int(num_features / 100)

timestamp = datetime.now()
timestamp_start = timestamp

last_valid_parent_region_id = None # Variable to remember the last valid parent ID


def find_next_non_empty_level(idx, row_dict, geo_levels):
# Function to find the next non-empty level in the hierarchy
def find_next_non_empty_level(idx, feature, geo_levels):
    for next_idx in range(idx + 1, len(geo_levels)):
        next_level_key = geo_levels[next_idx]
        if row_dict.get(next_level_key, '') != '':
        if feature.GetField(next_level_key):
            return next_level_key
    return None  # Return None if no non-empty level is found
    return None


existing_names = {} # Dictionary to track existing regions

# Loop through the rows from the SQLite cursor
for i, row in enumerate(rows):
    if i % rows_in_one_percent == 0 and i > 0:
        # Print a progress message every 1% of rows and timestamp, how long it took
for i, feature in enumerate(lyr):
    if i % features_in_one_percent == 0 and i > 0:
        # Print a progress message every 1% of features and timestamp, how long it took
        time_now = datetime.now()
        time_diff = (time_now - timestamp).total_seconds()
        total_time_diff = (time_now - timestamp_start).total_seconds()
        estimated_time_left = (total_time_diff / (i / num_rows)) - total_time_diff
        print(f"Handled {int(i / rows_in_one_percent):3d}% ({i:{max_row_digits}} rows) - last batch in {time_diff:.2f} seconds. Estimated time left: {estimated_time_left:.2f} seconds")
        estimated_time_left = (total_time_diff / (i / num_features)) - total_time_diff
        print(f"Handled {int(i / features_in_one_percent):3d}% ({i:{max_feature_digits}} features) - last batch in {time_diff:.2f} seconds. Estimated time left: {estimated_time_left:.2f} seconds")
        timestamp = datetime.now()

    row_dict = {}
    for column, value in zip(columns, row):
        row_dict[column] = value
    uid = feature.GetField('UID')

    # Reset parent_region_id for each new row
    # Reset parent_region_id for each new feature
    parent_region_id = None
    last_valid_parent_region_id = None  # Variable to remember the last valid parent ID
    path_parts = []  # List to build up the path for the current region

    # Process each geographical level for the current row
    # Process each geographical level for the current feature
    for idx, level in enumerate(geo_levels):
        if row_dict.get(level) is None:
        name = feature.GetField(level)
        if not name:
            continue
        name = row_dict[level]
        if name:
            path_parts.append(name)  # Add the name to the path_parts list if it's not empty
            key = "_".join(path_parts)  # Build the unique key from the path_parts list

            # Determine if the current region has subregions
            next_level = find_next_non_empty_level(idx, row_dict, geo_levels)  # Find the next non-empty level
            has_subregions = next_level is not None  # Check if a non-empty level was found

            # We assign uid to the region, if it is a real GADM region, not a region we created to fill the
            # hierarchy. Marker for this is that it's the last level, and it has no subregions. For the created
            # regions, we don't have a uid, so we set it to None
            if has_subregions:
                uid = None
            else:
                uid = row_dict.get('UID')
                if uid is None:
                    print("Warning: UID is None for region: ", key)

            if key not in existing_names:
                # If the region doesn't exist, create it
                # Use last_valid_parent_region_id as the parent_region_id for the current level
                query = "INSERT INTO regions (name, has_subregions, parent_region_id, gadm_uid) VALUES (%s, %s, %s, %s) RETURNING id"
                params = (name, has_subregions, last_valid_parent_region_id, uid)
                cur_pg.execute(query, params)
                region_id = cur_pg.fetchone()[0]
                existing_names[key] = region_id
            else:
                # If the region already exists, get its ID
                region_id = existing_names[key]

            # Update last_valid_parent_region_id for the next level
            last_valid_parent_region_id = region_id
        path_parts.append(name)  # Add the name to the path_parts list if it's not empty
        key = "_".join(path_parts)  # Build the unique key from the path_parts list

        # Determine if the current region has subregions
        next_level = find_next_non_empty_level(idx, feature, geo_levels)  # Find the next non-empty level
        has_subregions = next_level is not None  # Check if a non-empty level was found

        # We assign a uid and geometry to the region only if it is a real GADM region, not a region we
        # created to fill the hierarchy. The marker for this is that it is the last level, with no
        # subregions. For the created regions, we don't have a uid or geometry, so we set them to None.
        if has_subregions:
            uid = None
            geom = None
        else:
            uid = feature.GetField('UID')  # re-read: an earlier synthesized level may have set uid to None
            geom = feature.GetGeometryRef().ExportToWkb()

        if key not in existing_names:
            query = """
                INSERT INTO regions (name, has_subregions, parent_region_id, gadm_uid, geom)
                VALUES (%s, %s, %s, %s, ST_GeomFromWKB(%s, 4326))
                RETURNING id
            """
            params = (name, has_subregions, last_valid_parent_region_id, uid, geom)
            cur_pg.execute(query, params)
            region_id = cur_pg.fetchone()[0]
            existing_names[key] = region_id
        else:
            # If the region already exists, get its ID
            region_id = existing_names[key]

        # Update last_valid_parent_region_id for the next level
        last_valid_parent_region_id = region_id

print("Done, in total: ", datetime.now() - timestamp_start)

@@ -166,7 +176,5 @@ def find_next_non_empty_level(idx, row_dict, geo_levels):

# Commit the changes and close the connections
conn_pg.commit()
cur_gpkg.close()
conn_gpkg.close()
cur_pg.close()
conn_pg.close()
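With geometries stored on the leaf regions, the aggregation the commit message anticipates reduces to a SQL query. A sketch against the schema created above (the parent id is an example value, and the connection parameters are placeholders):

# Sketch: build a parent region's geometry as the union of its children's.
# Table and column names follow the script above; parent id 1 is an example.
import psycopg2

conn = psycopg2.connect(host="localhost", dbname="tyr", user="tyr", password="secret")
cur = conn.cursor()
cur.execute("""
    SELECT ST_AsText(ST_Multi(ST_Union(geom)))
    FROM regions
    WHERE parent_region_id = %s AND geom IS NOT NULL
""", (1,))
print(cur.fetchone()[0])
cur.close()
conn.close()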
2 changes: 1 addition & 1 deletion deployment/init-db/requirements.txt
@@ -1,4 +1,4 @@
psycopg2-binary==2.9.9
python-dotenv==1.0.0
GDAL==3.6.2
GDAL==3.2.2
Fiona==1.9.4.post1
