Skip to content

Commit

Permalink
updated scripts for recalculation
Browse files Browse the repository at this point in the history
  • Loading branch information
hn437 committed Aug 23, 2024
1 parent 23040f7 commit 62f0010
Show file tree
Hide file tree
Showing 6 changed files with 1,723 additions and 602 deletions.
102 changes: 51 additions & 51 deletions scripts/V2024/create_full_info_uc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,73 +10,73 @@ def create_full_info_uc(inputfile_uc, layer_uc, inputfile_grid, layer_grid):
grid_sum = (
grid_df[
[
"ID_UC_G0",
"GHS_POP",
"wc_built_up_sqkm",
"wc_tree_cover_sqkm",
"wc_sparse_vegetation_sqkm",
"urban_center_id",
"ghs_pop_2023",
"worldcover_2021_built_up_sqkm",
"worldcover_2021_tree_cover_sqkm",
"worldcover_2021_sparse_vegetation_sqkm",
"selected_road_length_km",
"reference_building_area_sqkm",
"prediction_improved_sqkm",
"osm_building_area_sqkm_2008-01",
"osm_building_area_sqkm_2009-01",
"osm_building_area_sqkm_2010-01",
"osm_building_area_sqkm_2011-01",
"osm_building_area_sqkm_2012-01",
"osm_building_area_sqkm_2013-01",
"osm_building_area_sqkm_2014-01",
"osm_building_area_sqkm_2015-01",
"osm_building_area_sqkm_2016-01",
"osm_building_area_sqkm_2017-01",
"osm_building_area_sqkm_2018-01",
"osm_building_area_sqkm_2019-01",
"osm_building_area_sqkm_2020-01",
"osm_building_area_sqkm_2021-01",
"osm_building_area_sqkm_2022-01",
"osm_building_area_sqkm_2023-01",
"osm_building_area_sqkm_2024-01",
"osm_building_area_sqkm_2024-05",
"prediction",
"osm_building_area_sqkm_2008_01",
"osm_building_area_sqkm_2009_01",
"osm_building_area_sqkm_2010_01",
"osm_building_area_sqkm_2011_01",
"osm_building_area_sqkm_2012_01",
"osm_building_area_sqkm_2013_01",
"osm_building_area_sqkm_2014_01",
"osm_building_area_sqkm_2015_01",
"osm_building_area_sqkm_2016_01",
"osm_building_area_sqkm_2017_01",
"osm_building_area_sqkm_2018_01",
"osm_building_area_sqkm_2019_01",
"osm_building_area_sqkm_2020_01",
"osm_building_area_sqkm_2021_01",
"osm_building_area_sqkm_2022_01",
"osm_building_area_sqkm_2023_01",
"osm_building_area_sqkm_2024_01",
"osm_building_area_sqkm_2024_05",
]
]
.groupby("ID_UC_G0")
.groupby("urban_center_id")
.sum()
)

grid_avg = (
grid_df[
[
"ID_UC_G0",
"shdi",
"vnl_mean",
"osm_completeness_2008_01",
"osm_completeness_2009_01",
"osm_completeness_2010_01",
"osm_completeness_2011_01",
"osm_completeness_2012_01",
"osm_completeness_2013_01",
"osm_completeness_2014_01",
"osm_completeness_2015_01",
"osm_completeness_2016_01",
"osm_completeness_2017_01",
"osm_completeness_2018_01",
"osm_completeness_2019_01",
"osm_completeness_2020_01",
"osm_completeness_2021_01",
"osm_completeness_2022_01",
"osm_completeness_2023_01",
"osm_completeness_2024_01",
"osm_completeness_2024_05",
"urban_center_id",
"shdi_2021",
"vnl_2023",
"prediction_osm_completeness_2008_01",
"prediction_osm_completeness_2009_01",
"prediction_osm_completeness_2010_01",
"prediction_osm_completeness_2011_01",
"prediction_osm_completeness_2012_01",
"prediction_osm_completeness_2013_01",
"prediction_osm_completeness_2014_01",
"prediction_osm_completeness_2015_01",
"prediction_osm_completeness_2016_01",
"prediction_osm_completeness_2017_01",
"prediction_osm_completeness_2018_01",
"prediction_osm_completeness_2019_01",
"prediction_osm_completeness_2020_01",
"prediction_osm_completeness_2021_01",
"prediction_osm_completeness_2022_01",
"prediction_osm_completeness_2023_01",
"prediction_osm_completeness_2024_01",
"prediction_osm_completeness_2024_05",
]
]
.groupby("ID_UC_G0")
.groupby("urban_center_id")
.mean()
)
del grid_df

grid_sum = pd.merge(
grid_sum,
grid_avg,
on="ID_UC_G0",
on="urban_center_id",
how="left",
)
del grid_avg
Expand All @@ -85,12 +85,12 @@ def create_full_info_uc(inputfile_uc, layer_uc, inputfile_grid, layer_grid):
uc_df = pd.merge(
uc_df,
grid_sum,
on="ID_UC_G0",
on="urban_center_id",
how="left",
)
del grid_sum

uc_df.to_file("../full_info_uc.gpkg", layer="full_info_uc", driver="GPKG")
uc_df.to_file("../abgabe.gpkg", layer="uc_full_info_V2024", driver="GPKG")


if __name__ == "__main__":
Expand All @@ -99,7 +99,7 @@ def create_full_info_uc(inputfile_uc, layer_uc, inputfile_grid, layer_grid):
format="%(asctime)s - %(levelname)s - %(filename)s - %(funcName)s - %(message)s",
)

inputfile_uc = pathlib.Path("../jrc_uc_wgs84.gpkg")
inputfile_uc = pathlib.Path("../abgabe.gpkg")
layer_uc = "uc_2025"

inputfile_grid = pathlib.Path("../abgabe.gpkg")
Expand Down
40 changes: 20 additions & 20 deletions scripts/V2024/model_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def load_urban_centers_grid(input_file, layer_grid):
df["region_wb_cat"] = pd.Categorical(df["region_wb"])
df["region_code"] = df.region_wb_cat.cat.codes

df["shdi"].fillna((df["shdi"].mean()), inplace=True)
df["shdi_2021"].fillna((df["shdi_2021"].mean()), inplace=True)
df["selected_road_length_km"].fillna(
(df["selected_road_length_km"].mean()), inplace=True
)
Expand All @@ -25,7 +25,7 @@ def load_urban_centers_grid(input_file, layer_grid):
"external_reference_building_area_sqkm",
"microsoft_building_area_sqkm",
"reference_building_area_sqkm",
"reference_osm_completeness",
"reference_completeness",
"region_wb",
"region_wb_cat",
"region_code",
Expand All @@ -42,26 +42,26 @@ def get_urban_center_centroids(inputfile, layer_uc, grid_df):
"""Get the centroids of the urban centers."""
# NOTE: this emits a warning that the centroids are likely incorrect because the data is in a geographic CRS. Is reprojecting necessary?
copy_df = grid_df[
["ID_UC_G0", "osm_building_area_sqkm_2024-05", "reference_building_area_sqkm"]
["urban_center_id", "osm_building_area_sqkm_2024_05", "reference_building_area_sqkm"]
]
copy_df = copy_df.groupby("ID_UC_G0").sum()
copy_df["reference_osm_completeness"] = round(
copy_df["osm_building_area_sqkm_2024-05"]
copy_df = copy_df.groupby("urban_center_id").sum()
copy_df["reference_completeness"] = round(
copy_df["osm_building_area_sqkm_2024_05"]
/ copy_df["reference_building_area_sqkm"],
3,
)

uc_grid = gpd.read_file(inputfile, layer=layer_uc)
uc_grid = pd.merge(
uc_grid,
copy_df[["reference_building_area_sqkm", "reference_osm_completeness"]],
on="ID_UC_G0",
copy_df[["reference_building_area_sqkm", "reference_completeness"]],
on="urban_center_id",
how="left",
)

# filter out the rows (urban centers) where the (training) data might not be complete
df = uc_grid[
(uc_grid["reference_osm_completeness"] < 1.5)
(uc_grid["reference_completeness"] < 1.5)
& (uc_grid["reference_building_area_sqkm"].notnull())
]

Expand All @@ -72,7 +72,7 @@ def get_urban_center_centroids(inputfile, layer_uc, grid_df):

logging.info(f"got {len(df)} urban centers with centroid coordinates")

return df[["ID_UC_G0", "x", "y"]]
return df[["urban_center_id", "x", "y"]]


def spatial_train_test_split_cluster(df, cluster_label, n=0):
Expand Down Expand Up @@ -113,7 +113,7 @@ def estimate_model_performance(inputfile, layer_uc, layer_prediction, n_clusters
cluster_df, n_clusters = kmeans_cluster_urban_centers(
urban_centers_df, "x", "y", n_clusters
)
df = df.join(cluster_df.set_index("ID_UC_G0"), on="ID_UC_G0", how="inner")
df = df.join(cluster_df.set_index("urban_center_id"), on="urban_center_id", how="inner")
region_groups = list(range(0, n_clusters))

# df for model
Expand Down Expand Up @@ -165,7 +165,7 @@ def estimate_model_performance(inputfile, layer_uc, layer_prediction, n_clusters
# save predictions to Geopackage
df_export = df_test[
[
"ID_UC_G0",
"urban_center_id",
"identifier",
"region_wb",
"repeat",
Expand Down Expand Up @@ -202,18 +202,18 @@ def estimate_model_performance(inputfile, layer_uc, layer_prediction, n_clusters
format="%(asctime)s - %(levelname)s - %(filename)s - %(funcName)s - %(message)s",
)

inputfile = pathlib.Path("../jrc_uc_wgs84.gpkg")
inputfile = pathlib.Path("../abgabe.gpkg")
layer_uc = "uc_2025"
layer_grid = "uc_grid"
layer_grid = "grid_full_info_v2024"
layer_grid_prediction = "prediction_improved"

COVARIATE_COLUMNS = [
"wc_built_up_sqkm",
"wc_tree_cover_sqkm",
"wc_sparse_vegetation_sqkm",
"GHS_POP",
"vnl_mean",
"shdi",
"worldcover_2021_built_up_sqkm",
"worldcover_2021_tree_cover_sqkm",
"worldcover_2021_sparse_vegetation_sqkm",
"ghs_pop_2023",
"vnl_2023",
"shdi_2021",
"selected_road_length_km",
"region_code",
]
Expand Down
50 changes: 26 additions & 24 deletions scripts/V2024/run_prediction.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import pathlib
import sys

import geopandas as gpd
import pandas as pd
Expand All @@ -13,7 +14,7 @@ def load_urban_centers_grid(input_file, layer_grid):
df["region_wb_cat"] = pd.Categorical(df["region_wb"])
df["region_code"] = df.region_wb_cat.cat.codes

df["shdi"].fillna((df["shdi"].mean()), inplace=True)
df["shdi_2021"].fillna((df["shdi_2021"].mean()), inplace=True)
df["selected_road_length_km"].fillna(
(df["selected_road_length_km"].mean()), inplace=True
)
Expand All @@ -23,7 +24,7 @@ def load_urban_centers_grid(input_file, layer_grid):
"external_reference_building_area_sqkm",
"microsoft_building_area_sqkm",
"reference_building_area_sqkm",
"reference_osm_completeness",
"reference_completeness",
"region_wb",
"region_wb_cat",
"region_code",
Expand All @@ -37,14 +38,14 @@ def load_urban_centers_grid(input_file, layer_grid):


def get_outliers(df, uc_file, layer_UC, threshold=0.005):
copy_df = df[["ID_UC_G0", "osm_building_area_sqkm_2024-05", "prediction_sqkm"]]
copy_df = copy_df.groupby("ID_UC_G0").sum()
copy_df = df[["urban_center_id", "osm_building_area_sqkm_2024_05", "prediction_sqkm"]]
copy_df = copy_df.groupby("urban_center_id").sum()

uc_df = gpd.read_file(uc_file, layer=layer_UC)
uc_df = pd.merge(
uc_df,
copy_df[["osm_building_area_sqkm_2024-05", "prediction_sqkm"]],
on="ID_UC_G0",
copy_df[["osm_building_area_sqkm_2024_05", "prediction_sqkm"]],
on="urban_center_id",
how="left",
)

Expand All @@ -53,11 +54,11 @@ def get_outliers(df, uc_file, layer_UC, threshold=0.005):

# select all rows where the OSM building area exceeds the prediction by more than threshold * area
uc_df_subset = uc_df[
(uc_df["osm_building_area_sqkm_2024-05"] - uc_df["prediction_sqkm"])
(uc_df["osm_building_area_sqkm_2024_05"] - uc_df["prediction_sqkm"])
> uc_df["area"] * threshold
]

outliers = uc_df_subset["ID_UC_G0"].values
outliers = uc_df_subset["urban_center_id"].values
logging.info(
f"got {len(outliers)} urban center ids with prediction below threshold (th = {threshold})"
)
Expand All @@ -74,18 +75,18 @@ def run_prediction(training_data, uc_file, layer_grid, layer_UC):
urban_center_ids = get_outliers(df, uc_file, layer_UC, threshold=0.005)
df[f"reference_building_area_sqkm_initial"] = df[f"reference_building_area_sqkm"]
df.loc[
(df["ID_UC_G0"].isin(urban_center_ids)), "reference_building_area_sqkm"
] = df["osm_building_area_sqkm_2024-05"]
(df["urban_center_id"].isin(urban_center_ids)), "reference_building_area_sqkm"
] = df["osm_building_area_sqkm_2024_05"]

df["reference_completeness_area_sqkm"] = round(
df["osm_building_area_sqkm_2024-05"] / df["reference_building_area_sqkm"], 3
df["reference_completeness"] = round(
df["osm_building_area_sqkm_2024_05"] / df["reference_building_area_sqkm"], 3
)

df_train = df[
(df["reference_building_area_sqkm"] > 0)
&
# avoid urban centers for which training data might not be complete
(df["reference_osm_completeness"] < 1.5)
(df["reference_completeness"] < 1.5)
]
logging.info(f"training samples: {len(df_train)}")

Expand Down Expand Up @@ -126,9 +127,9 @@ def run_prediction(training_data, uc_file, layer_grid, layer_UC):
gdf_temp.to_file(uc_file, layer="prediction", driver="GPKG")
else:
gdf_temp["prediction_improved_sqkm"] = y_pred
gdf_temp["osm_completeness"] = (
gdf_temp["prediction_osm_completeness_2024_05"] = (
(
gdf_temp["osm_building_area_sqkm_2024-05"]
gdf_temp["osm_building_area_sqkm_2024_05"]
/ gdf_temp["prediction_improved_sqkm"]
)
* 100
Expand All @@ -143,18 +144,18 @@ def run_prediction(training_data, uc_file, layer_grid, layer_UC):
format="%(asctime)s - %(levelname)s - %(filename)s - %(funcName)s - %(message)s",
)

uc_file = pathlib.Path("../jrc_uc_wgs84.gpkg")
uc_file = pathlib.Path("../abgabe.gpkg")
layer_UC = "uc_2025"
layer_grid = "uc_grid"
layer_grid = "grid_full_info_v2024"
layer_grid_prediction = "prediction"

COVARIATE_COLUMNS = [
"wc_built_up_sqkm",
"wc_tree_cover_sqkm",
"wc_sparse_vegetation_sqkm",
"GHS_POP",
"vnl_mean",
"shdi",
"worldcover_2021_built_up_sqkm",
"worldcover_2021_tree_cover_sqkm",
"worldcover_2021_sparse_vegetation_sqkm",
"ghs_pop_2023",
"vnl_2023",
"shdi_2021",
"selected_road_length_km",
"region_code",
]
Expand All @@ -163,7 +164,8 @@ def run_prediction(training_data, uc_file, layer_grid, layer_UC):

"""python scripts/run_prediction.py reference_and_osm"""

# training_data = "reference"
# training_data = sys.argv[1]
#training_data = "reference"
training_data = "reference_and_osm"

if training_data == "reference":
Expand Down
Loading

0 comments on commit 62f0010

Please sign in to comment.