From 54e39dfbbe7eea26588825cf9334ea5dab85f6a7 Mon Sep 17 00:00:00 2001 From: Bobby Xiong <36541459+bobbyxng@users.noreply.github.com> Date: Sun, 10 Nov 2024 12:12:46 +0100 Subject: [PATCH] Major improvement to OSM-based electricity grid (e.g. using relations, preserving substation locations) (#1384) * Implemented line merge over virtual bus functionality. * Implemented: Aggregating identical lines, bus merging to stations, creating voltage-specific bus endings in stations. * Implemented: Mapping of lines to buses and extending lines to buses. * Finished implementing converters and links. * Finished implementation of entirely new build_osm_network.py script. * Updated configtables. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Temporarily disabled tqdm in retrieve. Line splitting disabled as well. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Implemented relations. Running, converging workflow. * Added updated simplify_network.py * Cleaned up code. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated doc * Reactivated line splitting. * Updated build_osm_network. * Cleaned up code. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Added 400 kV voltage level to config.default * Updated Zenodo link * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Cleaned up code based on PR comments * Added docstrings. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Fabian Neumann --- doc/release_notes.rst | 9 + rules/build_electricity.smk | 11 +- rules/retrieve.smk | 4 +- scripts/_helpers.py | 2 + scripts/base_network.py | 3 +- scripts/build_gdp_pop_non_nuts3.py | 4 +- scripts/build_osm_network.py | 2019 ++++++++++++++++-------- scripts/build_shapes.py | 1 + scripts/clean_osm_data.py | 505 +++--- scripts/prepare_osm_network_release.py | 10 + scripts/retrieve_osm_data.py | 5 +- 11 files changed, 1689 insertions(+), 884 deletions(-) diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 0465f900e..4d7aaa5c4 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -92,6 +92,15 @@ Upcoming Release * Bugfix: Bug when multiple DC links are connected to the same DC bus and the DC bus is connected to an AC bus via converter. In this case, the DC links were wrongly simplified, completely dropping the shared DC bus. Bug fixed by adding preceding converter removal. Other functionalities are not impacted. +* Major improvements to building the OSM based network. The code was rewritten to improve the speed, accuracy and to preserve the topology including original substation locations, wherever possible. Further features include: + - Aggregation of lines with identical geometries and voltages + - Lines overpassing virtual nodes (not actual substations), are merged, if they have the same voltage level and number of circuits + - Cleaner line geometries, especially at connection points to substations + - Substation interior point now based on Pole of Inaccessibility (doi.org/10.1080/14702540801897809) + - Substation radius sharpened to 500 meters + - Single transformers for each combination of voltage level per substation. Transformers now have a capacity s_nom based on connected lines + - Use of OSM relations where available and unambigious (Overwriting all lines that are members of the respective relation to avoid duplicates) + * Updated osm-prebuilt base network to version 0.5, for changelog, see https://zenodo.org/records/13981528 diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 10d253f2a..3fcbfed16 100755 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -757,8 +757,8 @@ if config["electricity"]["base_network"] == "osm-raw": "data/osm-raw/{country}/lines_way.json", country=config_provider("countries"), ), - links_relation=expand( - "data/osm-raw/{country}/links_relation.json", + routes_relation=expand( + "data/osm-raw/{country}/routes_relation.json", country=config_provider("countries"), ), substations_way=expand( @@ -774,6 +774,7 @@ if config["electricity"]["base_network"] == "osm-raw": output: substations=resources("osm-raw/clean/substations.geojson"), substations_polygon=resources("osm-raw/clean/substations_polygon.geojson"), + converters_polygon=resources("osm-raw/clean/converters_polygon.geojson"), lines=resources("osm-raw/clean/lines.geojson"), links=resources("osm-raw/clean/links.geojson"), log: @@ -792,8 +793,14 @@ if config["electricity"]["base_network"] == "osm-raw": if config["electricity"]["base_network"] == "osm-raw": rule build_osm_network: + params: + countries=config_provider("countries"), + voltages=config_provider("electricity", "voltages"), + line_types=config_provider("lines", "types"), input: substations=resources("osm-raw/clean/substations.geojson"), + substations_polygon=resources("osm-raw/clean/substations_polygon.geojson"), + converters_polygon=resources("osm-raw/clean/converters_polygon.geojson"), lines=resources("osm-raw/clean/lines.geojson"), links=resources("osm-raw/clean/links.geojson"), country_shapes=resources("country_shapes.geojson"), diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 843fb65fc..a935159b4 100755 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -593,7 +593,7 @@ if config["enable"]["retrieve"] and ( output: cables_way="data/osm-raw/{country}/cables_way.json", lines_way="data/osm-raw/{country}/lines_way.json", - links_relation="data/osm-raw/{country}/links_relation.json", + routes_relation="data/osm-raw/{country}/routes_relation.json", substations_way="data/osm-raw/{country}/substations_way.json", substations_relation="data/osm-raw/{country}/substations_relation.json", log: @@ -620,7 +620,7 @@ if config["enable"]["retrieve"] and ( country=config_provider("countries"), ), expand( - "data/osm-raw/{country}/links_relation.json", + "data/osm-raw/{country}/routes_relation.json", country=config_provider("countries"), ), expand( diff --git a/scripts/_helpers.py b/scripts/_helpers.py index 5e86b51c3..064488ec6 100644 --- a/scripts/_helpers.py +++ b/scripts/_helpers.py @@ -390,6 +390,8 @@ def aggregate_costs(n, flatten=False, opts=None, existing_only=False): def progress_retrieve(url, file, disable=False): headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"} + # Hotfix - Bug, tqdm not working with disable=False + disable = True if disable: response = requests.get(url, headers=headers, stream=True) diff --git a/scripts/base_network.py b/scripts/base_network.py index 71eefc5c3..9a9be8ee3 100644 --- a/scripts/base_network.py +++ b/scripts/base_network.py @@ -456,7 +456,8 @@ def _set_electrical_parameters_transformers(transformers, config): ## Add transformer parameters transformers["x"] = config.get("x", 0.1) - transformers["s_nom"] = config.get("s_nom", 2000) + if "s_nom" not in transformers: + transformers["s_nom"] = config.get("s_nom", 2000) transformers["type"] = config.get("type", "") return transformers diff --git a/scripts/build_gdp_pop_non_nuts3.py b/scripts/build_gdp_pop_non_nuts3.py index cd3f69f3b..99e04b0ee 100644 --- a/scripts/build_gdp_pop_non_nuts3.py +++ b/scripts/build_gdp_pop_non_nuts3.py @@ -123,7 +123,9 @@ def calc_gdp_pop(country, regions, gdp_non_nuts3, pop_non_nuts3): if "snakemake" not in globals(): from _helpers import mock_snakemake - snakemake = mock_snakemake("build_gdp_pop_non_nuts3") + snakemake = mock_snakemake( + "build_gdp_pop_non_nuts3", configfiles=["config/config.osm-raw.yaml"] + ) configure_logging(snakemake) set_scenario_config(snakemake) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 83461a98d..31884d461 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -3,16 +3,21 @@ # # SPDX-License-Identifier: MIT +import itertools import logging -import os import string import geopandas as gpd +import networkx as nx import numpy as np import pandas as pd +import pypsa from _helpers import configure_logging, set_scenario_config -from shapely.geometry import LineString, Point -from shapely.ops import linemerge, nearest_points, split +from pyproj import Transformer +from shapely import prepare +from shapely.algorithms.polylabel import polylabel +from shapely.geometry import LineString, MultiLineString, Point +from shapely.ops import linemerge, split from tqdm import tqdm logger = logging.getLogger(__name__) @@ -20,9 +25,19 @@ GEO_CRS = "EPSG:4326" DISTANCE_CRS = "EPSG:3035" -BUS_TOL = ( - 5000 # unit: meters, default 5000 - Buses within this distance are grouped together -) +BUS_TOL = 500 # meters +BUSES_COLUMNS = [ + "station_id", + "voltage", + "dc", + "symbol", + "under_construction", + "tags", + "x", + "y", + "country", + "geometry", +] LINES_COLUMNS = [ "bus0", "bus1", @@ -31,6 +46,7 @@ "length", "underground", "under_construction", + "tags", "geometry", ] LINKS_COLUMNS = [ @@ -41,6 +57,7 @@ "length", "underground", "under_construction", + "tags", "geometry", ] TRANSFORMERS_COLUMNS = [ @@ -48,808 +65,1531 @@ "bus1", "voltage_bus0", "voltage_bus1", - "country", + "s_nom", + "station_id", "geometry", ] +CONVERTERS_COLUMNS = [ + "bus0", + "bus1", + "voltage", + "p_nom", + "geometry", +] + + +def _merge_identical_lines(lines): + """ + Aggregates lines with identical geometries and voltage levels by merging them into a single line. + + Parameters: + - lines (pd.DataFrame): DataFrame containing line information with columns "geometry", "voltage", "line_id", and "circuits". + + Returns: + - pd.DataFrame: DataFrame with aggregated lines, where lines with identical geometries and voltage levels are merged. + """ + lines_all = lines.copy() + lines_to_drop = [] + + logger.info("Aggregating lines with identical geometries and voltage levels.") + for g_name, g_value in tqdm( + lines_all.groupby(["geometry", "voltage"]), + desc="Aggregating identical lines", + unit=" lines", + ): + line_ids = list(g_value["line_id"]) + + if len(line_ids) > 1: + # New aggregated line + lid_old = line_ids[0] + lid_agg = lid_old.split("-")[0] + circuits_agg = g_value["circuits"].sum() + + # Update line with aggregated parameters + lines_all.loc[lines_all["line_id"] == lid_old, "line_id"] = lid_agg + lines_all.loc[lines_all["line_id"] == lid_agg, "circuits"] = circuits_agg + + # Add other lines to lines to drop + lines_to_drop += line_ids[1:] + logger.info(f"In total {len(lines_to_drop)} lines aggregated.") -def line_endings_to_bus_conversion(lines): + # Drop lines + lines_all = lines_all[~lines_all["line_id"].isin(lines_to_drop)] + + # Update line ids to make them unique again + # Add voltage suffix to line_id + lines_all["line_id"] = ( + lines_all["line_id"] + + "-" + + lines_all["voltage"].div(1e3).astype(int).astype(str) + ) + + return lines_all + + +def _add_line_endings(buses, lines, add=0, name="line-end"): """ - Converts line endings to bus connections. + Adds virtual buses based on line endings to the set of buses. - This function takes a df of lines and converts the line endings to bus - connections. It performs the necessary operations to ensure that the line - endings are properly connected to the buses in the network. + This function creates virtual bus endpoints at the boundaries of the given lines' geometries. + It ensures that each unique combination of geometry and voltage is represented by a single bus endpoint. Parameters: - lines (DataFrame) + - buses (pd.DataFrame): DataFrame containing bus information. + - lines (pd.DataFrame): DataFrame containing line information, including 'voltage' and 'geometry' columns. + - add (int, optional): Offset to add to the bus index for generating unique bus IDs. Default is 0. + - name (str, optional): Name to assign to the 'contains' column for the virtual buses. Default is "line-end". Returns: - lines (DataFrame) + - pd.DataFrame: DataFrame containing the virtual bus endpoints with columns 'bus_id', 'voltage', 'geometry', and 'contains'. """ - # Assign to every line a start and end point + buses_all = buses.copy() - lines["bounds"] = lines["geometry"].boundary # create start and end point + endpoints0 = lines[["voltage", "geometry"]].copy() + endpoints0["geometry"] = endpoints0["geometry"].apply(lambda x: x.boundary.geoms[0]) - lines["bus_0_coors"] = lines["bounds"].map(lambda p: p.geoms[0]) - lines["bus_1_coors"] = lines["bounds"].map(lambda p: p.geoms[-1]) + endpoints1 = lines[["voltage", "geometry"]].copy() + endpoints1["geometry"] = endpoints1["geometry"].apply(lambda x: x.boundary.geoms[1]) - # splits into coordinates - lines["bus0_lon"] = lines["bus_0_coors"].x - lines["bus0_lat"] = lines["bus_0_coors"].y - lines["bus1_lon"] = lines["bus_1_coors"].x - lines["bus1_lat"] = lines["bus_1_coors"].y + endpoints = pd.concat([endpoints0, endpoints1], ignore_index=True) + endpoints.drop_duplicates(subset=["geometry", "voltage"], inplace=True) + endpoints.reset_index(drop=True, inplace=True) - return lines + endpoints["bus_id"] = endpoints.index + add + 1 + endpoints["bus_id"] = "virtual" + "-" + endpoints["bus_id"].astype(str) + + endpoints["contains"] = name + return endpoints[["bus_id", "voltage", "geometry", "contains"]] -# tol in m -def set_substations_ids(buses, distance_crs, tol=5000): + +def _split_linestring_by_point(linestring, points): """ - Function to set substations ids to buses, accounting for location - tolerance. + Splits a LineString geometry by multiple points. - The algorithm is as follows: + Parameters: + - linestring (LineString): The LineString geometry to be split. + - points (list of Point): A list of Point geometries where the LineString should be split. - 1. initialize all substation ids to -1 - 2. if the current substation has been already visited [substation_id < 0], then skip the calculation - 3. otherwise: - 1. identify the substations within the specified tolerance (tol) - 2. when all the substations in tolerance have substation_id < 0, then specify a new substation_id - 3. otherwise, if one of the substation in tolerance has a substation_id >= 0, then set that substation_id to all the others; - in case of multiple substations with substation_ids >= 0, the first value is picked for all + Returns: + - list of LineString: A list of LineString geometries resulting from the split. """ + list_linestrings = [linestring] - buses["station_id"] = -1 + for p in points: + # execute split to all lines and store results + temp_list = [split(l, p) for l in list_linestrings] + # nest all geometries + list_linestrings = [lstring for tval in temp_list for lstring in tval.geoms] - # create temporary series to execute distance calculations using m as reference distances - temp_bus_geom = buses.geometry.to_crs(distance_crs) + return list_linestrings - # set tqdm options for substation ids - tqdm_kwargs_substation_ids = dict( - ascii=False, - unit=" buses", - total=buses.shape[0], - desc="Set substation ids ", - ) - station_id = 0 - for i, row in tqdm(buses.iterrows(), **tqdm_kwargs_substation_ids): - if buses.loc[i, "station_id"] >= 0: - continue +# TODO: Last old function to improve, either vectorise or parallelise +def split_overpassing_lines(lines, buses, distance_crs=DISTANCE_CRS, tol=1): + """ + Split overpassing lines by splitting them at nodes within a given tolerance, + to include the buses being overpassed. - # get substations within tolerance - close_nodes = np.flatnonzero( - temp_bus_geom.distance(temp_bus_geom.loc[i]) <= tol - ) + Parameters: + - lines (GeoDataFrame): The lines to be split. + - buses (GeoDataFrame): The buses representing nodes. + - distance_crs (str): The coordinate reference system (CRS) for distance calculations. + - tol (float): The tolerance distance in meters for determining if a bus is within a line. - if len(close_nodes) == 1: - # if only one substation is in tolerance, then the substation is the current one iƬ - # Note that the node cannot be with substation_id >= 0, given the preliminary check - # at the beginning of the for loop - buses.loc[buses.index[i], "station_id"] = station_id - # update station id - station_id += 1 - else: - # several substations in tolerance - # get their ids - subset_substation_ids = buses.loc[buses.index[close_nodes], "station_id"] - # check if all substation_ids are negative (<0) - all_neg = subset_substation_ids.max() < 0 - # check if at least a substation_id is negative (<0) - some_neg = subset_substation_ids.min() < 0 - - if all_neg: - # when all substation_ids are negative, then this is a new substation id - # set the current station_id and increment the counter - buses.loc[buses.index[close_nodes], "station_id"] = station_id - station_id += 1 - elif some_neg: - # otherwise, when at least a substation_id is non-negative, then pick the first value - # and set it to all the other substations within tolerance - sub_id = -1 - for substation_id in subset_substation_ids: - if substation_id >= 0: - sub_id = substation_id - break - buses.loc[buses.index[close_nodes], "station_id"] = sub_id - - -def set_lines_ids(lines, buses, distance_crs): - """ - Function to set line buses ids to the closest bus in the list. - """ - # set tqdm options for set lines ids - tqdm_kwargs_line_ids = dict( + Returns: + - lines (GeoDataFrame): The split lines. + - buses (GeoDataFrame): The buses representing nodes. + """ + lines = lines.copy() + logger.info(f"Splitting lines over overpassing nodes (Tolerance {tol} m).") + lines_to_add = [] # list of lines to be added + lines_to_split = [] # list of lines that have been split + + lines_epsgmod = lines.to_crs(distance_crs) + buses_epsgmod = buses.to_crs(distance_crs) + + # set tqdm options for substation ids + tqdm_kwargs_substation_ids = dict( ascii=False, unit=" lines", total=lines.shape[0], - desc="Set line/link bus ids ", + desc="Splitting lines", ) - # initialization - lines["bus0"] = -1 - lines["bus1"] = -1 + for l in tqdm(lines.index, **tqdm_kwargs_substation_ids): + # bus indices being within tolerance from the line + bus_in_tol_epsg = buses_epsgmod[ + buses_epsgmod.geometry.distance(lines_epsgmod.geometry.loc[l]) <= tol + ] - busesepsg = buses.to_crs(distance_crs) - linesepsg = lines.to_crs(distance_crs) + # Get boundary points + endpoint0 = lines_epsgmod.geometry.loc[l].boundary.geoms[0] + endpoint1 = lines_epsgmod.geometry.loc[l].boundary.geoms[1] - for i, row in tqdm(linesepsg.iterrows(), **tqdm_kwargs_line_ids): - # select buses having the voltage level of the current line - buses_sel = busesepsg[ - (buses["voltage"] == row["voltage"]) & (buses["dc"] == row["dc"]) - ] + # Calculate distances + dist_to_ep0 = bus_in_tol_epsg.geometry.distance(endpoint0) + dist_to_ep1 = bus_in_tol_epsg.geometry.distance(endpoint1) - # find the closest node of the bus0 of the line - bus0_id = buses_sel.geometry.distance(row.geometry.boundary.geoms[0]).idxmin() - lines.loc[i, "bus0"] = buses.loc[bus0_id, "bus_id"] + # exclude endings of the lines + bus_in_tol_epsg = bus_in_tol_epsg[(dist_to_ep0 > tol) | (dist_to_ep1 > tol)] - # check if the line starts exactly in the node, otherwise modify the linestring - distance_bus0 = busesepsg.geometry.loc[bus0_id].distance( - row.geometry.boundary.geoms[0] - ) + if not bus_in_tol_epsg.empty: + # add index of line to split + lines_to_split.append(l) - if distance_bus0 > 0: - # the line does not start in the node, thus modify the linestring - line_start_point = lines.geometry.loc[i].boundary.geoms[0] - new_segment = LineString([buses.geometry.loc[bus0_id], line_start_point]) - modified_line = linemerge([new_segment, lines.geometry.loc[i]]) - lines.loc[i, "geometry"] = modified_line + buses_locs = buses.geometry.loc[bus_in_tol_epsg.index] - # find the closest node of the bus1 of the line - bus1_id = buses_sel.geometry.distance(row.geometry.boundary.geoms[1]).idxmin() - lines.loc[i, "bus1"] = buses.loc[bus1_id, "bus_id"] + # get new line geometries + new_geometries = _split_linestring_by_point(lines.geometry[l], buses_locs) + n_geoms = len(new_geometries) - # check if the line ends exactly in the node, otherwise modify the linestring - distance_bus1 = busesepsg.geometry.loc[bus1_id].distance( - row.geometry.boundary.geoms[1] - ) + # create temporary copies of the line + df_append = gpd.GeoDataFrame([lines.loc[l]] * n_geoms) + # update geometries + df_append["geometry"] = new_geometries + # update name of the line if there are multiple line segments + df_append["line_id"] = [ + str(df_append["line_id"].iloc[0]) + + (f"-{letter}" if n_geoms > 1 else "") + for letter in string.ascii_lowercase[:n_geoms] + ] - if distance_bus1 > 0: - # the line does not start in the node, thus modify the linestring - line_end_point = lines.geometry.loc[i].boundary.geoms[1] - new_segment = LineString([line_end_point, buses.geometry.loc[bus1_id]]) - modified_line = linemerge([lines.geometry.loc[i], new_segment]) - lines.loc[i, "geometry"] = modified_line + lines_to_add.append(df_append) - return lines, buses + if not lines_to_add: + return lines + df_to_add = gpd.GeoDataFrame(pd.concat(lines_to_add, ignore_index=True)) + df_to_add.set_crs(lines.crs, inplace=True) + df_to_add.set_index(lines.index[-1] + df_to_add.index, inplace=True) -def merge_stations_same_station_id( - buses, delta_lon=0.001, delta_lat=0.001, precision=4 -): + # remove original lines + lines.drop(lines_to_split, inplace=True) + lines = df_to_add if lines.empty else pd.concat([lines, df_to_add]) + lines = gpd.GeoDataFrame(lines.reset_index(drop=True), crs=lines.crs) + + return lines + + +def _create_merge_mapping(lines, buses, buses_polygon, geo_crs=GEO_CRS): """ - Function to merge buses with same voltage and station_id This function - iterates over all substation ids and creates a bus_id for every substation - and voltage level. - - Therefore, a substation with multiple voltage levels is represented - with different buses, one per voltage level - """ - # initialize list of cleaned buses - buses_clean = [] - - # initialize the number of buses - n_buses = 0 - - for g_name, g_value in buses.groupby(by="station_id"): - # average location of the buses having the same station_id - station_point_x = np.round(g_value.geometry.x.mean(), precision) - station_point_y = np.round(g_value.geometry.y.mean(), precision) - # is_dclink_boundary_point = any(g_value["is_dclink_boundary_point"]) - - # loop for every voltage level in the bus - # The location of the buses is averaged; in the case of multiple voltage levels for the same station_id, - # each bus corresponding to a voltage level and each polatity is located at a distance regulated by delta_lon/delta_lat - v_it = 0 - for v_name, bus_row in g_value.groupby(by=["voltage", "dc"]): - lon_bus = np.round(station_point_x + v_it * delta_lon, precision) - lat_bus = np.round(station_point_y + v_it * delta_lat, precision) - - bus_data = [ - n_buses, # "bus_id" - g_name, # "station_id" - v_name[0], # "voltage" - bus_row["dc"].all(), # "dc" - "|".join(bus_row["symbol"].unique()), # "symbol" - bus_row["under_construction"].any(), # "under_construction" - "|".join(bus_row["tag_substation"].unique()), # "tag_substation" - bus_row["tag_area"].sum(), # "tag_area" - lon_bus, # "lon" - lat_bus, # "lat" - bus_row["country"].iloc[0], # "country" - Point(lon_bus, lat_bus), # "geometry" - ] + Creates a mapping for merging lines with the same electric parameters over virtual buses. + + This function performs the following steps: + - Filters virtual buses that do not intersect with the given polygon. + - Spatially joins the filtered virtual buses with lines based on their geometries. + - Filters the joined data to keep only those with matching voltage levels. + - Identifies shared buses to remove using networkx. + - Creates a network graph of lines to be merged using networkx + - Identifies connected components in the graph and merges lines within each component. + - Note that only lines that unambigruosly can be merged are considered. + + Parameters: + - lines (GeoDataFrame): GeoDataFrame containing line data with columns ["line_id", "geometry", "voltage", "circuits"]. + - buses (GeoDataFrame): GeoDataFrame containing bus data with columns ["bus_id", "geometry"]. + - buses_polygon (GeoDataFrame): GeoDataFrame containing the polygon data to filter virtual buses. + - geo_crs (CRS, optional): Coordinate reference system for the geometries. Defaults to GEO_CRS. + + Returns: + - GeoDataFrame: A GeoDataFrame containing the merged lines with columns ["line_id", "circuits", "voltage", "geometry", "underground", "contains_lines", "contains_buses"]. + """ + logger.info( + "Creating mapping for merging lines with same electric parameters over virtual buses." + ) + lines = lines.copy() + buses_virtual = buses.copy() + + buses_virtual = buses_virtual[buses_virtual["bus_id"].str.startswith("virtual")] + # Drop buses_virtual that are inside buses_virtual_polygon + bool_intersects_buses_virtual_polygon = buses_virtual.intersects( + buses_polygon.union_all() + ) + buses_virtual = buses_virtual[~bool_intersects_buses_virtual_polygon] + + # sjoin with lines_filtered in column "connected_lines", that intersect with buses_virtual + buses_virtual = gpd.sjoin( + buses_virtual, + lines[["line_id", "geometry", "voltage", "circuits"]], + how="left", + predicate="touches", + ) + # Filtering, only keep where voltage_left == voltage_right + buses_virtual = buses_virtual[ + buses_virtual["voltage_left"] == buses_virtual["voltage_right"] + ] + # Drop voltage_right and rename voltage_left to voltage + buses_virtual = buses_virtual.drop(columns=["voltage_right"]) + buses_virtual = buses_virtual.rename(columns={"voltage_left": "voltage"}) + + # Group by bus_id, voltage, circuits and count the number of connected lines + buses_to_remove = ( + buses_virtual.groupby(["bus_id", "voltage"]).size().reset_index(name="count") + ) + buses_to_remove = buses_to_remove[buses_to_remove["count"] == 2] + + buses_to_remove = buses_virtual[ + buses_virtual["bus_id"].isin(buses_to_remove["bus_id"]) + ] + buses_to_remove = ( + buses_to_remove.groupby(["bus_id", "circuits", "voltage"]) + .size() + .reset_index(name="count") + ) + # Keep only where count == 2 + buses_to_remove = buses_to_remove[buses_to_remove["count"] == 2] + buses_to_remove = buses_virtual[ + buses_virtual["bus_id"].isin(buses_to_remove["bus_id"]) + ] - # add the bus - buses_clean.append(bus_data) - - # increase counters - v_it += 1 - n_buses += 1 - - # names of the columns - buses_clean_columns = [ - "bus_id", - "station_id", - "voltage", - "dc", - "symbol", - "under_construction", - "tag_substation", - "tag_area", - "x", - "y", - "country", - # "is_dclink_boundary_point", - "geometry", + # Group by bus_id, voltage, circuits and count the number of connected lines, add column "lines" which contains the list of lines + buses_to_remove = ( + buses_to_remove.groupby(["bus_id", "voltage", "circuits"]) + .agg( + { + "line_id": list, + } + ) + .reset_index() + ) + + unique_lines = pd.Series(itertools.chain(*buses_to_remove["line_id"])).unique() + lines_to_merge = lines.loc[ + lines["line_id"].isin(unique_lines), + ["line_id", "voltage", "circuits", "length", "geometry", "underground"], + ] + lines_to_merge_dict = [ + (node, row.to_dict()) + for node, row in lines_to_merge.set_index("line_id").iterrows() ] - gdf_buses_clean = gpd.GeoDataFrame( - buses_clean, columns=buses_clean_columns - ).set_crs(crs=buses.crs, inplace=True) + # Create networkx + G = nx.Graph() + + # Add nodes from + # lines_to_merge + G.add_nodes_from(lines_to_merge_dict) + + edges = [ + ( + row["line_id"][0], # from node + row["line_id"][1], # to node + { + "bus_id": row["bus_id"], # shared bus + }, + ) + for _, row in buses_to_remove.iterrows() + ] + + # Add all edges at once + G.add_edges_from(edges) + + connected_components = nx.connected_components(G) + + # Create empty + merged_lines = pd.DataFrame( + columns=[ + "line_id", + "circuits", + "voltage", + "geometry", + "underground", + "contains_lines", + "contains_buses", + ] + ) + + # Iterate over each connected component + for component in tqdm( + connected_components, desc="Merging lines", unit=" components" + ): + # Create a subgraph for the current component + subgraph = G.subgraph(component) + + # Extract the first node in the component + first_node = next(iter(component)) # Get the first node (can be arbitrary) + + # Extract the attributes for the first node + circuits = G.nodes[first_node].get( + "circuits", None + ) # Default to None if not present + voltage = G.nodes[first_node].get( + "voltage", None + ) # Default to None if not present + + # Extract the geometries for all nodes in the subgraph + geometry = [G.nodes[node].get("geometry", None) for node in subgraph.nodes()] + geometry = linemerge(geometry) # Merge the geometries + + # Contains lines + contains_lines = list(subgraph.nodes()) + number_of_lines = int(len(contains_lines)) + # Find the node with the highest "length" parameter + node_longest = max( + subgraph.nodes(), key=lambda node: G.nodes[node].get("length", 0) + ) + underground = G.nodes[node_longest].get("underground", None) + + # Extract the list of edges (lines) in the subgraph + contains_buses = list() + for edge in subgraph.edges(): + contains_buses.append(G.edges[edge].get("bus_id", None)) + + subgraph_data = { + "line_id": [ + "merged_" + str(node_longest) + "+" + str(number_of_lines - 1) + ], # line_id + "circuits": [circuits], + "voltage": [voltage], + "geometry": [geometry], + "underground": [underground], + "contains_lines": [contains_lines], + "contains_buses": [contains_buses], + } + + # Convert to DataFrame and append to the merged_lines DataFrame + merged_lines = pd.concat( + [merged_lines, pd.DataFrame(subgraph_data)], ignore_index=True + ) + merged_lines = gpd.GeoDataFrame(merged_lines, geometry="geometry", crs=geo_crs) + + # Drop all closed linestrings (circles) + merged_lines = merged_lines[ + merged_lines["geometry"].apply(lambda x: not x.is_closed) + ] - return gdf_buses_clean + return merged_lines -def get_ac_frequency(df, fr_col="tag_frequency"): +def _merge_lines_over_virtual_buses( + lines, buses, merged_lines_map, distance_crs=DISTANCE_CRS +): """ - # Function to define a default frequency value. + Merges lines over virtual buses and updates the lines and buses DataFrames accordingly. + + Parameters: + - lines (pd.DataFrame): DataFrame containing line information. + - buses (pd.DataFrame): DataFrame containing bus information. + - merged_lines_map (pd.DataFrame): DataFrame mapping virtual buses to the lines they contain. + - distance_crs (str, optional): Coordinate reference system for calculating distances. Defaults to DISTANCE_CRS. - Attempts to find the most usual non-zero frequency across the - dataframe; 50 Hz is assumed as a back-up value + Returns: + - tuple: A tuple containing the updated lines and buses DataFrames. """ + lines_merged = lines.copy() + buses_merged = buses.copy() - # Initialize a default frequency value - ac_freq_default = 50 + lines_to_remove = merged_lines_map["contains_lines"].explode().unique() + buses_to_remove = merged_lines_map["contains_buses"].explode().unique() - grid_freq_levels = df[fr_col].value_counts(sort=True, dropna=True) - if not grid_freq_levels.empty: - # AC lines frequency shouldn't be 0Hz - ac_freq_levels = grid_freq_levels.loc[ - grid_freq_levels.index.get_level_values(0) != "0" - ] - ac_freq_default = ac_freq_levels.index.get_level_values(0)[0] + logger.info( + f"Merging {len(lines_to_remove)} lines over virtual buses and dropping {len(buses_to_remove)} virtual buses." + ) + + # Remove lines and buses + lines_merged = lines_merged[~lines_merged["line_id"].isin(lines_to_remove)] + lines_merged["contains_lines"] = lines_merged["line_id"].apply(lambda x: [x]) + buses_merged = buses_merged[~buses_merged["bus_id"].isin(buses_to_remove)] + # Add new lines from merged_lines_map to lines_merged + + ### Update lines to add + lines_to_add = merged_lines_map.copy().reset_index(drop=True) - return ac_freq_default + # Update columns + lines_to_add["under_construction"] = False + lines_to_add["tag_frequency"] = 50 + lines_to_add["dc"] = False + lines_to_add["bus0"] = None + lines_to_add["bus1"] = None + lines_to_add["length"] = lines_to_add["geometry"].to_crs(distance_crs).length + # Reorder + lines_to_add = lines_to_add[lines_merged.columns] -def get_transformers(buses, lines): + # Concatenate + lines_merged = pd.concat([lines_merged, lines_to_add], ignore_index=True) + + return lines_merged, buses_merged + + +def _create_station_seeds( + buses, + buses_polygon, + country_shapes, + tol=BUS_TOL, + distance_crs=DISTANCE_CRS, + geo_crs=GEO_CRS, +): """ - Function to create fake transformer lines that connect buses of the same - station_id at different voltage. + Creates aggregated station seeds (candidates) based on substation polygons and updates their country information. + + Parameters: + - buses (GeoDataFrame): GeoDataFrame containing bus information with columns "bus_id" and "geometry". + - buses_polygon (GeoDataFrame): GeoDataFrame containing bus polygon information with columns "bus_id" and "geometry". + - country_shapes (GeoDataFrame): GeoDataFrame containing country shapes with a "name" column. + - tol (float, optional): Tolerance for buffering geometries. Default is BUS_TOL. + - distance_crs (CRS, optional): Coordinate reference system for distance calculations. Default is DISTANCE_CRS. + - geo_crs (CRS, optional): Coordinate reference system for geographic calculations. Default is GEO_CRS. + + Returns: + GeoDataFrame: Aggregated station seeds with updated country information and renamed virtual buses. """ + # Drop all buses that have bus_id starting with "way/" or "relation/" prefix + # They are present in polygon_buffer anyway + logger.info("Creating aggregated stations based on substation polygons") + columns = ["bus_id", "geometry"] + filtered_buses = buses[ + ~buses["bus_id"].str.startswith("way/") + & ~buses["bus_id"].str.startswith("relation/") + ] - ac_freq = get_ac_frequency(lines) - df_transformers = [] + buses_buffer = gpd.GeoDataFrame( + data=filtered_buses[columns], geometry="geometry" + ).set_index("bus_id") + buses_buffer["geometry"] = ( + buses_buffer["geometry"].to_crs(distance_crs).buffer(tol).to_crs(geo_crs) + ) + buses_buffer["area"] = buses_buffer.to_crs(distance_crs).area + + buses_polygon_buffer = gpd.GeoDataFrame( + data=buses_polygon[columns], geometry="geometry" + ).set_index("bus_id") + buses_polygon_buffer["geometry"] = ( + buses_polygon_buffer["geometry"] + .to_crs(distance_crs) + .buffer(tol) + .to_crs(geo_crs) + ) + buses_polygon_buffer["area"] = buses_polygon_buffer.to_crs(distance_crs).area + + # Find the pole of inaccessibility (PoI) for each polygon, the interior point most distant from a polygon's boundary + # Garcia-Castellanos & Lombardo 2007 + # doi.org/10.1080/14702540801897809 + buses_polygon_buffer["poi"] = ( + buses_polygon.set_index("bus_id")["geometry"] + .to_crs(distance_crs) + .apply(lambda polygon: polylabel(polygon, tolerance=tol / 2)) + .to_crs(geo_crs) + ) - # Transformers should be added between AC buses only - buses_ac = buses[buses["dc"] != True] + buses_all_buffer = pd.concat([buses_buffer, buses_polygon_buffer]) - for g_name, g_value in buses_ac.sort_values("voltage", ascending=True).groupby( - by="station_id" - ): - # note: by construction there cannot be more that two buses with the same station_id and same voltage - n_voltages = len(g_value) - - if n_voltages > 1: - for id in range(n_voltages - 1): - # when g_value has more than one node, it means that there are multiple voltages for the same bus - transformer_geometry = LineString( - [g_value.geometry.iloc[id], g_value.geometry.iloc[id + 1]] - ) + buses_all_agg = gpd.GeoDataFrame( + geometry=[poly for poly in buses_all_buffer.union_all().geoms], crs=geo_crs + ) - transformer_data = [ - f"transf_{g_name}_{id}", # "line_id" - g_value["bus_id"].iloc[id], # "bus0" - g_value["bus_id"].iloc[id + 1], # "bus1" - g_value.voltage.iloc[id], # "voltage_bus0" - g_value.voltage.iloc[id + 1], # "voltage_bus0" - g_value.country.iloc[id], # "country" - transformer_geometry, # "geometry" - ] - - df_transformers.append(transformer_data) - - # name of the columns - transformers_columns = [ - "transformer_id", - "bus0", - "bus1", - "voltage_bus0", - "voltage_bus1", - "country", - "geometry", + # full spatial join + buses_all_agg = gpd.sjoin( + buses_all_agg, buses_all_buffer, how="left", predicate="intersects" + ).reset_index() + max_area_idx = buses_all_agg.groupby("index")["area"].idxmax() + buses_all_agg = buses_all_agg.loc[max_area_idx] + buses_all_agg = buses_all_agg.drop(columns=["index"]) + buses_all_agg.set_index("bus_id", inplace=True) + + # Fill missing PoI with the PoI of the polygon + poi_missing = buses_all_agg["poi"].isna() + buses_all_agg.loc[poi_missing, "poi"] = ( + buses_all_agg.loc[poi_missing, "geometry"] + .to_crs(distance_crs) + .apply(lambda polygon: polylabel(polygon, tolerance=tol / 2)) + .to_crs(geo_crs) + ) + + # Update countries based on the PoI location: + updated_country_mapping = gpd.sjoin_nearest( + gpd.GeoDataFrame(buses_all_agg["poi"], geometry="poi", crs=geo_crs).to_crs( + distance_crs + ), + country_shapes.to_crs(distance_crs), + how="left", + )["name"] + updated_country_mapping.index.name = "country" + + buses_all_agg["country"] = updated_country_mapping + + # Rename rows virtual buses that are not actual substations + buses_to_rename = buses_all_agg.loc[ + ( + ~buses_all_agg.index.str.startswith("way/") + & ~buses_all_agg.index.str.startswith("relation/") + ), + ["country", "poi"], ] - df_transformers = gpd.GeoDataFrame(df_transformers, columns=transformers_columns) - if not df_transformers.empty: - init_index = 0 if lines.empty else lines.index[-1] + 1 - df_transformers.set_index(init_index + df_transformers.index, inplace=True) - # update line endings - df_transformers = line_endings_to_bus_conversion(df_transformers) - df_transformers.drop(columns=["bounds", "bus_0_coors", "bus_1_coors"], inplace=True) + buses_to_rename["lat"] = buses_to_rename["poi"].apply(lambda p: p.y) + buses_to_rename["lon"] = buses_to_rename["poi"].apply(lambda p: p.x) - gdf_transformers = gpd.GeoDataFrame(df_transformers) - gdf_transformers.crs = GEO_CRS + # Now sort by country, latitude (north to south), and longitude (west to east) + buses_to_rename = buses_to_rename.sort_values( + by=["country", "lat", "lon"], ascending=[True, False, True] + ) + buses_to_rename["bus_id"] = buses_to_rename.groupby("country").cumcount() + 1 + buses_to_rename["bus_id"] = buses_to_rename["country"] + buses_to_rename[ + "bus_id" + ].astype(str) + + # Dict to rename virtual buses + dict_rename = buses_to_rename["bus_id"].to_dict() + + # Rename virtual buses in buses_all_agg with dict + buses_all_agg.rename(index=dict_rename, inplace=True) + + # extract substring before - from index + buses_all_agg["osm_identifier"] = buses_all_agg.index.str.split("-").str[0] + buses_all_agg.reset_index(inplace=True) + # count how often each value in osm_identifier occurs in column + buses_all_agg["id_occurence"] = buses_all_agg.groupby("osm_identifier")[ + "osm_identifier" + ].transform("count") + + # For each row, if id_occurence =1 set bus_id = osm_identifier + buses_all_agg.loc[buses_all_agg["id_occurence"] == 1, "bus_id"] = buses_all_agg.loc[ + buses_all_agg["id_occurence"] == 1, "osm_identifier" + ] + buses_all_agg.set_index("bus_id", inplace=True) + # Rename index name to station_id + buses_all_agg.index.name = "station_id" + buses_all_agg.drop(columns=["area", "osm_identifier", "id_occurence"], inplace=True) - return gdf_transformers + buses_all_agg["poi_perimeter"] = ( + buses_all_agg["poi"].to_crs(distance_crs).buffer(tol / 2).to_crs(geo_crs) + ) + + buses_all_agg.reset_index(inplace=True) + + return buses_all_agg -def _find_closest_bus(row, buses, distance_crs, tol=5000): +def _merge_buses_to_stations( + buses, stations, distance_crs=DISTANCE_CRS, geo_crs=GEO_CRS +): """ - Find the closest bus to a given bus based on geographical distance and - country. + Merges buses with the same voltage level within the same station. Parameters: - - row: The bus_id of the bus to find the closest bus for. - - buses: A GeoDataFrame containing information about all the buses. - - distance_crs: The coordinate reference system to use for distance calculations. - - tol: The tolerance distance within which a bus is considered closest (default: 5000). + - buses (GeoDataFrame): GeoDataFrame containing bus data with geometries. + - stations (GeoDataFrame): GeoDataFrame containing station data with geometries. + - distance_crs (CRS, optional): Coordinate reference system for distance calculations. Defaults to DISTANCE_CRS. + - geo_crs (CRS, optional): Coordinate reference system for geographic coordinates. Defaults to GEO_CRS. + Returns: - - closest_bus_id: The bus_id of the closest bus, or None if no bus is found within the distance and same country. + - GeoDataFrame: Updated GeoDataFrame with merged buses and updated geometries. """ - gdf_buses = buses.copy() - gdf_buses = gdf_buses.to_crs(distance_crs) - # Get the geometry of the bus with bus_id = link_bus_id - bus = gdf_buses[gdf_buses["bus_id"] == row] - bus_geom = bus.geometry.values[0] + # Merge buses with same voltage and within tolerance + logger.info(f"Merging buses of the same substation.") + # bus types (AC != DC) + buses_all = buses.copy().reset_index(drop=True) + stations_all = stations.copy().set_index("station_id") + stations_all["polygon"] = stations_all["geometry"].copy() - gdf_buses_filtered = gdf_buses[gdf_buses["dc"] == False] + # Set station ids based on station seeds (polygons) + buses_all = gpd.sjoin(buses_all, stations_all, how="left", predicate="within") - # Find the closest point in the filtered buses - nearest_geom = nearest_points(bus_geom, gdf_buses_filtered.union_all())[1] + # TODO: For now also include DC buses in the merging process. In the future, try to keep original HVDC converter location within substation + buses_all = buses_all.drop_duplicates(subset=["station_id", "voltage"]) - # Get the bus_id of the closest bus - closest_bus = gdf_buses_filtered.loc[gdf_buses["geometry"] == nearest_geom] + # Offsetting geometries within same substations for each voltage level + offset = 15 # meters (radius) + geo_to_dist = Transformer.from_crs(geo_crs, distance_crs, always_xy=True) + dist_to_geo = Transformer.from_crs(distance_crs, geo_crs, always_xy=True) - # check if closest_bus_id is within the distance - within_distance = ( - closest_bus.to_crs(distance_crs).distance(bus.to_crs(distance_crs), align=False) - ).values[0] <= tol + for g_name, g_value in tqdm( + buses_all.groupby("station_id"), desc="Updating geometries", unit=" substations" + ): + voltages = sorted( + g_value["voltage"].unique(), reverse=True + ) # Sort voltags in descending order - in_same_country = closest_bus.country.values[0] == bus.country.values[0] + if len(voltages) > 1: + poi_x, poi_y = geo_to_dist.transform( + g_value["poi"].values[0].x, g_value["poi"].values[0].y + ) - if within_distance and in_same_country: - closest_bus_id = closest_bus.bus_id.values[0] - else: - closest_bus_id = None + for idx, v in enumerate(voltages): + poi_x_offset = poi_x + offset * np.sin( + np.pi / 4 + 2 * np.pi * idx / len(voltages) + ).round(4) + poi_y_offset = poi_y + offset * np.cos( + np.pi / 4 + 2 * np.pi * idx / len(voltages) + ).round(4) - return closest_bus_id + poi_offset = Point(dist_to_geo.transform(poi_x_offset, poi_y_offset)) + # Update bus_name + g_value.loc[g_value["voltage"] == v, "bus_id"] = ( + g_name + "-" + str(int(v / 1000)) + ) + + # Update geometry + g_value.loc[g_value["voltage"] == v, "geometry"] = poi_offset -def _get_converters(buses, links, distance_crs): + buses_all.loc[g_value.index, "bus_id"] = g_value["bus_id"] + buses_all.loc[g_value.index, "geometry"] = g_value["geometry"] + else: + v = voltages[0] + buses_all.loc[g_value.index, "bus_id"] = g_name + "-" + str(int(v / 1000)) + buses_all.loc[g_value.index, "geometry"] = g_value["poi"] + + return buses_all + + +def _remove_loops_from_multiline(multiline): """ - Get the converters for the given buses and links. Connecting link endings - to closest AC bus. + Removes closed loops from a MultiLineString geometry. + This function iteratively removes closed loops from a MultiLineString geometry + until no closed loops remain or a maximum of 5 iterations is reached. Parameters: - - buses (pandas.DataFrame): DataFrame containing information about buses. - - links (pandas.DataFrame): DataFrame containing information about links. + - multiline (shapely.geometry.MultiLineString or shapely.geometry.LineString): The input geometry which may contain closed loops. + Returns: - - gdf_converters (geopandas.GeoDataFrame): GeoDataFrame containing information about converters. + - shapely.geometry.MultiLineString or shapely.geometry.LineString: The geometry with closed loops removed. """ - converters = [] - for idx, row in links.iterrows(): - for conv in range(2): - link_end = row[f"bus{conv}"] - # HVDC Gotland is connected to 130 kV grid, closest HVAC bus is further away + elements_initial = ( + [line for line in multiline.geoms] + if multiline.geom_type == "MultiLineString" + else [multiline] + ) - closest_bus = _find_closest_bus(link_end, buses, distance_crs, tol=40000) + if not any([line.is_closed for line in elements_initial]): + return multiline - if closest_bus is None: - continue + elements = elements_initial + iteration_count = 0 + while any([line.is_closed for line in elements]) and iteration_count < 5: + elements = [line for line in elements if not line.is_closed] + geometry_updated = linemerge(elements) - converter_id = f"converter/{row['link_id']}_{conv}" - converter_geometry = LineString( - [ - buses[buses["bus_id"] == link_end].geometry.values[0], - buses[buses["bus_id"] == closest_bus].geometry.values[0], - ] - ) + elements = ( + [line for line in geometry_updated.geoms] + if geometry_updated.geom_type == "MultiLineString" + else [geometry_updated] + ) + iteration_count += 1 - logger.info( - f"Added converter #{conv+1}/2 for link {row['link_id']}:{converter_id}." - ) + if not any([line.is_closed for line in elements]): + break - converter_data = [ - converter_id, # "line_id" - link_end, # "bus0" - closest_bus, # "bus1" - row["voltage"], # "voltage" - row["p_nom"], # "p_nom" - False, # "underground" - False, # "under_construction" - buses[buses["bus_id"] == closest_bus].country.values[0], # "country" - converter_geometry, # "geometry" - ] + return geometry_updated - # Create the converter - converters.append(converter_data) - - conv_columns = [ - "converter_id", - "bus0", - "bus1", - "voltage", - "p_nom", - "underground", - "under_construction", - "country", - "geometry", - ] - gdf_converters = gpd.GeoDataFrame( - converters, columns=conv_columns, crs=GEO_CRS - ).reset_index() +def _identify_linestring_between_polygons( + multiline, polygon0, polygon1, geo_crs=GEO_CRS, distance_crs=DISTANCE_CRS +): + """ + Identifies a LineString from a MultiLineString that touches both given polygons. + This function takes a MultiLineString and two polygons, and identifies a LineString within the MultiLineString that touches both polygons. + If no such LineString is found, the original MultiLineString is returned. + + Parameters: + - multiline (shapely.geometry.MultiLineString or shapely.geometry.LineString): The input MultiLineString or LineString. + - polygon0 (shapely.geometry.Polygon): The first polygon. + - polygon1 (shapely.geometry.Polygon): The second polygon. + - geo_crs (str or pyproj.CRS, optional): The geographic coordinate reference system. Default is GEO_CRS. + - distance_crs (str or pyproj.CRS, optional): The distance coordinate reference system. Default is DISTANCE_CRS. + + Returns: + - shapely.geometry.LineString or shapely.geometry.MultiLineString: The identified LineString that touches both polygons, or the original MultiLineString if no such LineString is found. + """ + list_lines = ( + [line for line in multiline.geoms] + if multiline.geom_type == "MultiLineString" + else [multiline] + ) + prepare(polygon0) + prepare(polygon1) + gdf_polygon0 = gpd.GeoDataFrame(geometry=[polygon0], crs=geo_crs).to_crs( + distance_crs + ) + gdf_polygon1 = gpd.GeoDataFrame(geometry=[polygon1], crs=geo_crs).to_crs( + distance_crs + ) - return gdf_converters + idx = None + for line in list_lines: + prepare(line) + gdf_line = gpd.GeoDataFrame(geometry=[line], crs=geo_crs).to_crs(distance_crs) + touches = gdf_line.intersects(gdf_polygon0.buffer(1e-2)) & gdf_line.intersects( + gdf_polygon1.buffer(1e-2) + ) + if touches.any(): + idx = list_lines.index(line) + break + return list_lines[idx] if idx is not None else multiline -def connect_stations_same_station_id(lines, buses): + +def _map_endpoints_to_buses( + connection, + buses, + shape="station_polygon", + id_col="line_id", + sjoin="intersects", + distance_crs=DISTANCE_CRS, + geo_crs=GEO_CRS, +): """ - Function to create fake links between substations with the same - substation_id. + Maps the endpoints of lines to buses based on spatial relationships. + + Parameters: + - connection (GeoDataFrame): GeoDataFrame containing the line connections. + - buses (GeoDataFrame): GeoDataFrame containing the bus information. + - shape (str, optional): The shape type to use for mapping. Default is "station_polygon". + - id_col (str, optional): The column name for line IDs. Default is "line_id". + - sjoin (str, optional): The spatial join method to use ("intersects" or "nearest"). Default is "intersects". + - distance_crs (CRS, optional): Coordinate reference system for distance calculations. Default is DISTANCE_CRS. + - geo_crs (CRS, optional): Coordinate reference system for geographic data. Default is GEO_CRS. + + Returns: + - GeoDataFrame: Updated GeoDataFrame with mapped endpoints and filtered lines. """ - ac_freq = get_ac_frequency(lines) - station_id_list = buses.station_id.unique() + logger.info("Mapping endpoints of lines to buses.") + buses_all = buses.copy().set_index("bus_id") + buses_all["station_polygon"] = buses_all["polygon"].copy() + buses_all = gpd.GeoDataFrame(buses_all, geometry="polygon", crs=buses.crs) + + lines_all = connection.copy().set_index(id_col) + + for coord in range(2): + # Obtain endpoints + endpoints = lines_all[["voltage", "geometry"]].copy() + endpoints["geometry"] = endpoints["geometry"].apply( + lambda x: x.boundary.geoms[coord] + ) + if sjoin == "intersects": + endpoints = gpd.sjoin( + endpoints, buses_all, how="left", predicate="intersects" + ) + endpoints = endpoints[ + endpoints["voltage_left"] == endpoints["voltage_right"] + ] + if sjoin == "nearest": + endpoints = gpd.sjoin_nearest( + endpoints.to_crs(distance_crs), + buses_all.to_crs(distance_crs), + how="left", + ) + # Groupby index and keep the row with highest voltage_right + endpoints.reset_index(inplace=True) + endpoints = endpoints.loc[ + endpoints.groupby("link_id")["voltage_right"].idxmax() + ] + endpoints.set_index("link_id", inplace=True) - add_lines = [] - from shapely.geometry import LineString + # rename voltage_left + endpoints = endpoints.drop(columns=["voltage_right"]) + endpoints = endpoints.rename(columns={"voltage_left": "voltage"}) - for s_id in station_id_list: - buses_station_id = buses[buses.station_id == s_id] + lines_all[f"poi_perimeter{coord}"] = endpoints["poi_perimeter"] + lines_all[f"station_polygon{coord}"] = endpoints["station_polygon"] + lines_all[f"bus{coord}"] = endpoints["bus_id"] - if len(buses_station_id) > 1: - for b_it in range(1, len(buses_station_id)): - line_geometry = LineString( - [ - buses_station_id.geometry.iloc[0], - buses_station_id.geometry.iloc[b_it], - ] + lines_all["geometry"] = lines_all.apply( + lambda row: row["geometry"].difference(row[shape + "0"]), axis=1 + ) + lines_all["geometry"] = lines_all.apply( + lambda row: row["geometry"].difference(row[shape + "1"]), axis=1 + ) + + # Drop lines that have same bus0 and bus1 + lines_all = lines_all[lines_all["bus0"] != lines_all["bus1"]] + + contains_stubs = lines_all["geometry"].apply( + lambda x: isinstance(x, MultiLineString) + ) + if contains_stubs.any(): + lines_stubs = lines_all.loc[contains_stubs].copy() + lines_stubs["linestrings"] = lines_stubs["geometry"].apply( + lambda x: ( + [line for line in x.geoms] if x.geom_type == "MultiLineString" else [x] + ) + ) + + # Remove closed subgeometries (circles), if any + lines_stubs["geometry"] = lines_stubs["geometry"].apply( + _remove_loops_from_multiline + ) + + if shape == "station_polygon": + # Only keep linestrings that are actually connection shape of bus0 and bus1 + lines_stubs["geometry"] = lines_stubs.apply( + lambda row: _identify_linestring_between_polygons( + row["geometry"], + row[f"{shape}0"], + row[f"{shape}1"], + ), + axis=1, + ) + lines_all.loc[lines_stubs.index, "geometry"] = lines_stubs["geometry"] + + # If the cutton should be done at the poi_perimeters + if shape == "poi_perimeter": + for coord in range(2): + lines_stubs = lines_all.loc[contains_stubs].copy() + lines_stubs["linestrings"] = lines_stubs["geometry"].apply( + lambda x: ( + [line for line in x.geoms] + if x.geom_type == "MultiLineString" + else [x] + ) ) - line_bounds = line_geometry.bounds - - line_data = [ - f"link{buses_station_id}_{b_it}", # "line_id" - buses_station_id.index[0], # "bus0" - buses_station_id.index[b_it], # "bus1" - 400000, # "voltage" - 1, # "circuits" - 0.0, # "length" - False, # "underground" - False, # "under_construction" - "transmission", # "tag_type" - ac_freq, # "tag_frequency" - buses_station_id.country.iloc[0], # "country" - line_geometry, # "geometry" - line_bounds, # "bounds" - buses_station_id.geometry.iloc[0], # "bus_0_coors" - buses_station_id.geometry.iloc[b_it], # "bus_1_coors" - buses_station_id.lon.iloc[0], # "bus0_lon" - buses_station_id.lat.iloc[0], # "bus0_lat" - buses_station_id.lon.iloc[b_it], # "bus1_lon" - buses_station_id.lat.iloc[b_it], # "bus1_lat" - ] - - add_lines.append(line_data) - - # name of the columns - add_lines_columns = [ - "line_id", - "bus0", - "bus1", - "voltage", - "circuits", - "length", - "underground", - "under_construction", - "tag_type", - "tag_frequency", - "country", - "geometry", - "bounds", - "bus_0_coors", - "bus_1_coors", - "bus0_lon", - "bus0_lat", - "bus1_lon", - "bus1_lat", - ] - df_add_lines = gpd.GeoDataFrame(pd.concat(add_lines), columns=add_lines_columns) - lines = pd.concat([lines, df_add_lines], ignore_index=True) + # Check for each individual element in list of linestrings, if they are within the station_polygon, if yes delete + lines_stubs["linestrings"] = lines_stubs.apply( + lambda row: [ + line + for line in row["linestrings"] + if not row[f"station_polygon{coord}"].contains(line) + ], + axis=1, + ) - return lines + # Update geometry through linemerge + lines_stubs["geometry"] = lines_stubs["linestrings"].apply( + lambda x: linemerge(x) + ) + # Update lines_all + lines_all.loc[lines_stubs.index, "geometry"] = lines_stubs["geometry"] + + lines_all.reset_index(inplace=True) + lines_all.drop( + columns=[ + "poi_perimeter0", + "poi_perimeter1", + "station_polygon0", + "station_polygon1", + ], + inplace=True, + ) -def set_lv_substations(buses): + return lines_all + + +def _add_point_to_line(linestring, point): """ - Function to set what nodes are lv, thereby setting substation_lv The - current methodology is to set lv nodes to buses where multiple voltage - level are found, hence when the station_id is duplicated. + Adds the bus coordinate to a linestring by extending the linestring with a + new segment. + + Parameters: + - linestring (LineString): The original linestring to extend. + - point (Point): The shapely.Point of the bus. + + Returns: + - merged (LineString): The extended linestring with the new segment. """ - # initialize column substation_lv to true - buses["substation_lv"] = True + start = linestring.boundary.geoms[0] + end = linestring.boundary.geoms[1] - # For each station number with multiple buses make lowest voltage `substation_lv = TRUE` - bus_with_stations_duplicates = buses[ - buses.station_id.duplicated(keep=False) - ].sort_values(by=["station_id", "voltage"]) - lv_bus_at_station_duplicates = ( - buses[buses.station_id.duplicated(keep=False)] - .sort_values(by=["station_id", "voltage"]) - .drop_duplicates(subset=["station_id"]) - ) - # Set all buses with station duplicates "False" - buses.loc[bus_with_stations_duplicates.index, "substation_lv"] = False - # Set lv_buses with station duplicates "True" - buses.loc[lv_bus_at_station_duplicates.index, "substation_lv"] = True + dist_to_start = point.distance(start) + dist_to_end = point.distance(end) + + if dist_to_start < dist_to_end: + new_segment = LineString([point, start]) + else: + new_segment = LineString([point, end]) - return buses + merged = linemerge([linestring, new_segment]) + return merged -def merge_stations_lines_by_station_id_and_voltage( - lines, links, buses, distance_crs, tol=5000 -): + +def _extend_lines_to_buses(connection, buses): """ - Function to merge close stations and adapt the line datasets to adhere to - the merged dataset. + Extends the geometry of lines/links to include mapped bus points. + This function takes a DataFrame of connections (lines/links) and a DataFrame of buses, + and extends the geometry of each line/link to include the geometry of the bus points + at both ends (bus0 and bus1). The resulting DataFrame will have updated geometries + that include these bus points. + + Parameters: + - connection (pd.DataFrame): DataFrame containing the lines/links with their geometries. + - buses (pd.DataFrame): DataFrame containing the bus points with their geometries. + + Returns: + - pd.DataFrame: DataFrame with updated geometries for the lines/links, including the bus points. """ + lines_all = connection.copy() + buses_all = buses.copy() + + logger.info("Extending line/link geometry to mapped bus points.") + lines_all = lines_all.merge( + buses_all[["geometry", "bus_id"]], + left_on="bus0", + right_on="bus_id", + how="left", + suffixes=("", "_right"), + ) + lines_all.drop(columns=["bus_id"], inplace=True) + lines_all.rename(columns={"geometry_right": "bus0_point"}, inplace=True) + + lines_all = lines_all.merge( + buses_all[["geometry", "bus_id"]], + left_on="bus1", + right_on="bus_id", + how="left", + suffixes=("", "_right"), + ) + lines_all.drop(columns=["bus_id"], inplace=True) + lines_all.rename(columns={"geometry_right": "bus1_point"}, inplace=True) - logger.info(" - Setting substation ids with tolerance of %.2f m" % (tol)) + lines_all["geometry"] = lines_all.apply( + lambda row: _add_point_to_line(row["geometry"], row["bus0_point"]), axis=1 + ) + lines_all["geometry"] = lines_all.apply( + lambda row: _add_point_to_line(row["geometry"], row["bus1_point"]), axis=1 + ) - # bus types (AC != DC) - buses_ac = buses[buses["dc"] == False].reset_index() - buses_dc = buses[buses["dc"] == True].reset_index() + # Drop bus0_point and bus1_point + lines_all.drop(columns=["bus0_point", "bus1_point"], inplace=True) - # set_substations_ids(buses, distance_crs, tol=tol) - set_substations_ids(buses_ac, distance_crs, tol=tol) - set_substations_ids(buses_dc, distance_crs, tol=tol) + return lines_all - logger.info(" - Merging substations with the same id") - # merge buses with same station id and voltage - if not buses.empty: - buses_ac = merge_stations_same_station_id(buses_ac) - buses_dc = merge_stations_same_station_id(buses_dc) - buses_dc["bus_id"] = buses_ac["bus_id"].max() + buses_dc["bus_id"] + 1 - buses = pd.concat([buses_ac, buses_dc], ignore_index=True) - set_substations_ids(buses, distance_crs, tol=tol) +def _determine_bus_capacity(buses, lines, voltages, line_types): + """ + Determines the bus capacity based on the sum of connected line capacities. - logger.info(" - Specifying the bus ids of the line endings") + Parameters: + - buses (pd.DataFrame): DataFrame containing bus information. + - lines (pd.DataFrame): DataFrame containing line information. + - voltages (list): List of voltage levels based on config file. + - line_types (dict): Dictionary mapping voltage levels to line types based on config file. - # set the bus ids to the line dataset - lines, buses = set_lines_ids(lines, buses, distance_crs) - links, buses = set_lines_ids(links, buses, distance_crs) + Returns: + - buses_all (pd.DataFrame): Containing the updated bus information with calculated capacities. + """ + logger.info("Determining total capacity of connected lines for each bus.") + pypsa_line_types = pypsa.Network().line_types + buses_all = buses.copy().set_index("bus_id") + lines_all = lines.copy() - # drop lines starting and ending in the same node - lines.drop(lines[lines["bus0"] == lines["bus1"]].index, inplace=True) - links.drop(links[links["bus0"] == links["bus1"]].index, inplace=True) - # update line endings - lines = line_endings_to_bus_conversion(lines) - links = line_endings_to_bus_conversion(links) + lines_all["closest_voltage_kV"] = lines_all["voltage"].apply( + lambda x: _closest_voltage(x / 1000, voltages) + ) + lines_all["line_type"] = lines_all["closest_voltage_kV"].apply( + lambda x: line_types[x] + ) + lines_all["i_nom"] = lines_all.apply( + lambda row: pypsa_line_types.loc[row["line_type"]]["i_nom"], axis=1 + ) + lines_all["s_nom"] = lines_all.apply( + lambda row: np.sqrt(3) * row["i_nom"] * row["voltage"] / 1e3, axis=1 + ) - # set substation_lv - set_lv_substations(buses) + buses_all["s_nom_sum0"] = lines_all.groupby("bus0")["s_nom"].sum() + buses_all["s_nom_sum0"] = buses_all["s_nom_sum0"].fillna(0) + buses_all["s_nom_sum1"] = lines_all.groupby("bus1")["s_nom"].sum() + buses_all["s_nom_sum1"] = buses_all["s_nom_sum1"].fillna(0) + buses_all["s_nom_sum"] = buses_all["s_nom_sum0"] + buses_all["s_nom_sum1"] + buses_all.drop(columns=["s_nom_sum0", "s_nom_sum1"], inplace=True) + buses_all["s_nom_sum"] = np.ceil(buses_all["s_nom_sum"]).astype(int) - # reset index - lines.reset_index(drop=True, inplace=True) - links.reset_index(drop=True, inplace=True) + buses_all.reset_index(inplace=True) - return lines, links, buses + return buses_all -def _split_linestring_by_point(linestring, points): +def _add_transformers(buses, geo_crs=GEO_CRS): """ - Function to split a linestring geometry by multiple inner points. + Adds unique transformers between buses of different voltage levels at each station. - Parameters - ---------- - lstring : LineString - Linestring of the line to be split - points : list - List of points to split the linestring + The function performs the following steps: + - Groups the buses by 'station_id' and identifies stations with buses of different voltage levels. + - Creates unique combinations of buses within each station and calculates the geometry for each transformer. + - Assigns unique transformer IDs based on station ID and voltage levels. + - Calculates the capacity of transformers based on the maximum capacity of connected buses. - Return - ------ - list_lines : list - List of linestring to split the line + Parameters: + - buses (GeoDataFrame): A GeoDataFrame containing bus information with columns including 'bus_id', 'station_id', 'voltage', and 'geometry'. + - geo_crs (CRS, optional): Coordinate reference system for the GeoDataFrame. Defaults to GEO_CRS. + + Returns: + - GeoDataFrame: A GeoDataFrame containing the added transformers with columns including 'transformer_id' and TRANSFORMERS_COLUMNS. """ + buses_all = buses.copy().set_index("bus_id") + logger.info( + "Adding unique transformers between buses of different voltage levels at each station." + ) - list_linestrings = [linestring] + all_transformers = gpd.GeoDataFrame( + columns=TRANSFORMERS_COLUMNS + ["transformer_id"], crs=geo_crs + ) - for p in points: - # execute split to all lines and store results - temp_list = [split(l, p) for l in list_linestrings] - # nest all geometries - list_linestrings = [lstring for tval in temp_list for lstring in tval.geoms] + for g_name, g_value in buses_all.groupby("station_id"): + if g_value["voltage"].nunique() > 1: + combinations = list(itertools.combinations(sorted(g_value.index), 2)) - return list_linestrings + station_transformers = pd.DataFrame(combinations, columns=["bus0", "bus1"]) + station_transformers["voltage_bus0"] = station_transformers["bus0"].map( + g_value["voltage"] + ) + station_transformers["voltage_bus1"] = station_transformers["bus1"].map( + g_value["voltage"] + ) + station_transformers["geometry"] = station_transformers.apply( + lambda row: LineString( + [ + g_value.loc[row["bus0"]]["geometry"], + g_value.loc[row["bus1"]]["geometry"], + ] + ), + axis=1, + ) + station_transformers["station_id"] = g_name + station_transformers["transformer_id"] = ( + g_name + + "-" + + station_transformers["voltage_bus0"].div(1e3).astype(int).astype(str) + + "-" + + station_transformers["voltage_bus1"].div(1e3).astype(int).astype(str) + ) + station_transformers = gpd.GeoDataFrame(station_transformers, crs=geo_crs) + + all_transformers = pd.concat( + [all_transformers, station_transformers] + ).reset_index(drop=True) + # Calculate capacity of transformers based on maximum of connected lines + all_transformers["s_nom_sum0"] = all_transformers.apply( + lambda row: buses_all.loc[row["bus0"]]["s_nom_sum"], axis=1 + ) + all_transformers["s_nom_sum1"] = all_transformers.apply( + lambda row: buses_all.loc[row["bus1"]]["s_nom_sum"], axis=1 + ) + all_transformers["s_nom"] = all_transformers[["s_nom_sum0", "s_nom_sum1"]].max( + axis=1 + ) + all_transformers.drop(columns=["s_nom_sum0", "s_nom_sum1"], inplace=True) -def fix_overpassing_lines(lines, buses, distance_crs, tol=1): + logger.info( + f"Added {len(all_transformers)} transformers to {len(all_transformers['station_id'].unique())} stations." + ) + + return all_transformers[["transformer_id"] + TRANSFORMERS_COLUMNS] + + +def _add_dc_buses( + converters_polygon, + links, + buses, + country_shapes, + tol=BUS_TOL, + distance_crs=DISTANCE_CRS, + geo_crs=GEO_CRS, +): """ - Fix overpassing lines by splitting them at nodes within a given tolerance, - to include the buses being overpassed. + Adds DC buses to the network and mapping them to the nearest AC buses. Parameters: - - lines (GeoDataFrame): The lines to be fixed. - - buses (GeoDataFrame): The buses representing nodes. - - distance_crs (str): The coordinate reference system (CRS) for distance calculations. - - tol (float): The tolerance distance in meters for determining if a bus is within a line. + - converters_polygon (GeoDataFrame): GeoDataFrame containing the polygons of the DC converters. + - links (GeoDataFrame): GeoDataFrame containing the links in the network. + - buses (GeoDataFrame): GeoDataFrame containing the AC buses in the network. + - tol (float, optional): Tolerance value for determining the point of interest (PoI) within the DC bus polygon. Defaults to BUS_TOL. + - distance_crs (CRS, optional): Coordinate reference system for distance calculations. Defaults to DISTANCE_CRS. + - geo_crs (CRS, optional): Coordinate reference system for geographic calculations. Defaults to GEO_CRS. + Returns: - - lines (GeoDataFrame): The fixed lines. - - buses (GeoDataFrame): The buses representing nodes. + - GeoDataFrame: A GeoDataFrame containing the DC buses with their corresponding PoI and mapped to the nearest AC bus. """ + dc_buses = converters_polygon.copy() + max_distance = 50000 # m, Arbitrary, maximum distance between DC bus and AC bus (value needs to be high for Gotland HVDC to be connected) - lines_to_add = [] # list of lines to be added - lines_to_split = [] # list of lines that have been split + logger.info( + "Adding DC buses to the network and mapping them to the nearest AC buses." + ) + # Rough filtering: Only keep dc_buses if distance to links union is less than 5000 meters + links_closeby = ( + dc_buses.to_crs(distance_crs).distance(links.to_crs(distance_crs).union_all()) + < 5000 + ) + dc_buses = dc_buses[links_closeby] - lines_epsgmod = lines.to_crs(distance_crs) - buses_epsgmod = buses.to_crs(distance_crs) + dc_buses.rename(columns={"id": "bus_id", "geometry": "polygon"}, inplace=True) - # set tqdm options for substation ids - tqdm_kwargs_substation_ids = dict( - ascii=False, - unit=" lines", - total=lines.shape[0], - desc="Verify lines overpassing nodes ", + # Determine PoI for each dc bus + dc_buses["poi"] = ( + dc_buses["polygon"] + .to_crs(distance_crs) + .apply(lambda polygon: polylabel(polygon, tolerance=tol / 2)) + .to_crs(geo_crs) + ) + dc_buses["geometry"] = dc_buses["poi"] + + # Map dc_buses to stations + ac_buses = buses.copy().set_index("bus_id") + # Group ac_buses by station, keep row with highest voltage + ac_buses = ac_buses.loc[ac_buses.groupby("station_id")["voltage"].idxmax()] + ac_buses_polygon = gpd.GeoDataFrame( + ac_buses[["station_id", "polygon", "country"]], + geometry="polygon", + crs=ac_buses.crs, ) - for l in tqdm(lines.index, **tqdm_kwargs_substation_ids): - # bus indices being within tolerance from the line - bus_in_tol_epsg = buses_epsgmod[ - buses_epsgmod.geometry.distance(lines_epsgmod.geometry.loc[l]) <= tol - ] + dc_buses = gpd.sjoin_nearest( + dc_buses.to_crs(distance_crs), + ac_buses_polygon.to_crs(distance_crs), + how="left", + lsuffix=None, + rsuffix="ac", + max_distance=max_distance, + ).to_crs(geo_crs) + logger.info(f"Added {len(dc_buses)} DC buses to the network.") + + # Remaining DC buses that could not be mapped to an AC bus: + dc_buses_unmapped = dc_buses[dc_buses["bus_id_ac"].isna()].copy() + dc_buses_unmapped["station_id"] = dc_buses_unmapped["bus_id"] + + # Update countries based on the PoI location: + dc_buses_unmapped["country"] = gpd.sjoin_nearest( + dc_buses_unmapped.to_crs(distance_crs), + country_shapes.to_crs(distance_crs), + how="left", + )["name"] + + dc_buses.loc[dc_buses_unmapped.index, "country"] = dc_buses_unmapped["country"] + dc_buses.loc[dc_buses_unmapped.index, "station_id"] = dc_buses_unmapped[ + "station_id" + ] - # Get boundary points - endpoint0 = lines_epsgmod.geometry.loc[l].boundary.geoms[0] - endpoint1 = lines_epsgmod.geometry.loc[l].boundary.geoms[1] + return dc_buses - # Calculate distances - dist_to_ep0 = bus_in_tol_epsg.geometry.distance(endpoint0) - dist_to_ep1 = bus_in_tol_epsg.geometry.distance(endpoint1) - # exclude endings of the lines - bus_in_tol_epsg = bus_in_tol_epsg[(dist_to_ep0 > tol) | (dist_to_ep1 > tol)] +def _map_links_to_dc_buses(links, dc_buses, distance_crs=DISTANCE_CRS): + """ + Maps links to DC buses based on geographical proximity and updates DC bus attributes. - if not bus_in_tol_epsg.empty: - # add index of line to split - lines_to_split.append(l) + Parameters: + - links (GeoDataFrame): GeoDataFrame containing link geometries and attributes. + - dc_buses (GeoDataFrame): GeoDataFrame containing DC bus geometries and attributes. + - distance_crs (CRS, optional): Coordinate reference system to use for distance calculations. Defaults to DISTANCE_CRS. - buses_locs = buses.geometry.loc[bus_in_tol_epsg.index] + Returns: + - tuple: A tuple containing: + - links_all (GeoDataFrame): Updated GeoDataFrame of links with mapped DC buses. + - dc_buses_all (GeoDataFrame): Updated GeoDataFrame of DC buses with additional attributes. + """ + logger.info("Mapping links to DC buses based on geographical proximity.") + links_all = links.copy().set_index("link_id") + max_distance = 120000 # m, arbitrary maximum relevant for islands, and if second country is missing + dc_buses_all = dc_buses.copy().set_index("bus_id") + dc_buses_polygon = gpd.GeoDataFrame( + dc_buses_all["polygon"], geometry="polygon", crs=dc_buses.crs + ) - # get new line geometries - new_geometries = _split_linestring_by_point(lines.geometry[l], buses_locs) - n_geoms = len(new_geometries) + links_all["bus0"] = links_all["geometry"].apply(lambda x: x.boundary.geoms[0]) + links_all["bus1"] = links_all["geometry"].apply(lambda x: x.boundary.geoms[1]) + + links_all["bus0"] = gpd.sjoin_nearest( + gpd.GeoDataFrame(links_all["bus0"], geometry="bus0", crs=links_all.crs).to_crs( + distance_crs + ), + dc_buses_polygon.to_crs(distance_crs), + how="left", + max_distance=max_distance, + )["bus_id"] + + links_all["bus1"] = gpd.sjoin_nearest( + gpd.GeoDataFrame(links_all["bus1"], geometry="bus1", crs=links_all.crs).to_crs( + distance_crs + ), + dc_buses_polygon.to_crs(distance_crs), + how="left", + max_distance=max_distance, + )["bus_id"] + + # list of dc_buses that have been used + mapped_dc_buses = links_all[["bus0", "bus1"]].stack().unique() + # Drop remaining dc_buses + dc_buses_all = dc_buses_all[dc_buses_all.index.isin(mapped_dc_buses)] + + # Obtain voltage for DC buses + bus_voltages0 = links_all.set_index("bus0")["voltage"] + bus_voltages1 = links_all.set_index("bus1")["voltage"] + bus_voltages = pd.DataFrame(pd.concat([bus_voltages0, bus_voltages1], axis=0)) + bus_voltages = bus_voltages.groupby(bus_voltages.index).max() + dc_buses_all["voltage"] = bus_voltages + + # Obtain aggregated capacity connected to each DC bus + dc_buses_all["p_nom_sum0"] = links_all.groupby("bus0")["p_nom"].sum() + dc_buses_all["p_nom_sum0"] = dc_buses_all["p_nom_sum0"].fillna(0) + dc_buses_all["p_nom_sum1"] = links_all.groupby("bus1")["p_nom"].sum() + dc_buses_all["p_nom_sum1"] = dc_buses_all["p_nom_sum1"].fillna(0) + dc_buses_all["p_nom_sum"] = dc_buses_all["p_nom_sum0"] + dc_buses_all["p_nom_sum1"] + dc_buses_all.drop(columns=["p_nom_sum0", "p_nom_sum1"], inplace=True) + + # Reset index + links_all.reset_index(inplace=True) + dc_buses_all.reset_index(inplace=True) - # create temporary copies of the line - df_append = gpd.GeoDataFrame([lines.loc[l]] * n_geoms) - # update geometries - df_append["geometry"] = new_geometries - # update name of the line if there are multiple line segments - df_append["line_id"] = [ - str(df_append["line_id"].iloc[0]) - + (f"-{letter}" if n_geoms > 1 else "") - for letter in string.ascii_lowercase[:n_geoms] - ] + logger.info( + f"Mapped {len(links_all)} links to {len(dc_buses_all)} DC buses. Dropping {len(dc_buses)-len(dc_buses_all)} DC buses." + ) - lines_to_add.append(df_append) + return links_all, dc_buses_all - if not lines_to_add: - return lines, buses - df_to_add = gpd.GeoDataFrame(pd.concat(lines_to_add, ignore_index=True)) - df_to_add.set_crs(lines.crs, inplace=True) - df_to_add.set_index(lines.index[-1] + df_to_add.index, inplace=True) +def _add_converter_links(dc_buses, buses): + """ + Adds (PyPSA) converter links between DC buses and AC buses. + This function processes the provided DataFrames of DC buses and AC buses to create + links (converters) between them. It filters out DC buses that do not have an associated + AC bus, renames columns for clarity, and constructs geometries for the links. - # update length - df_to_add["length"] = df_to_add.to_crs(distance_crs).geometry.length + Parameters: + - dc_buses (pd.DataFrame): DataFrame containing DC bus information. + - buses (pd.DataFrame): DataFrame containing AC bus information. - # update line endings - df_to_add = line_endings_to_bus_conversion(df_to_add) + Returns: + - pd.DataFrame: DataFrame containing the converter links. + """ + logger.info("Adding converter links between DC buses and AC buses.") + converter_links = dc_buses.copy() + # Drop rows that do not have a bus_id_ac + converter_links = converter_links[~converter_links["bus_id_ac"].isna()] + + ac_buses_geometry = buses.copy().set_index("bus_id")[["geometry"]] + ac_buses_geometry = ac_buses_geometry[ + ac_buses_geometry.index.isin(converter_links["bus_id_ac"]) + ] - # remove original lines - lines.drop(lines_to_split, inplace=True) + converter_links.rename(columns={"bus_id": "converter_id"}, inplace=True) + converter_links["bus0"] = converter_links["converter_id"] + converter_links["bus1"] = converter_links["bus_id_ac"] + converter_links["ac_bus_geometry"] = converter_links["bus_id_ac"].apply( + lambda x: ac_buses_geometry.loc[x] + ) - lines = df_to_add if lines.empty else pd.concat([lines, df_to_add]) + converter_links["geometry"] = converter_links.apply( + lambda row: LineString([row["geometry"], row["ac_bus_geometry"]]), axis=1 + ) - lines = gpd.GeoDataFrame(lines.reset_index(drop=True), crs=lines.crs) + converter_links["converter_id"] = "conv-" + converter_links["converter_id"] + converter_links.rename(columns={"p_nom_sum": "p_nom"}, inplace=True) + + logger.info(f"Added {len(converter_links)} converter links to the network.") + + return converter_links[ + ["converter_id", "bus0", "bus1", "voltage", "p_nom", "geometry"] + ] + + +def _closest_voltage(voltage, voltage_list): + """ + Returns the closest voltage from a list of voltages to a given voltage. + + Parameters: + - voltage (float): The source voltage. + - voltage_list (list): List of voltages to compare against. - return lines, buses + Returns: + - float: The closest voltage to the source voltage + """ + return min(voltage_list, key=lambda x: abs(x - voltage)) -def build_network(inputs, outputs): +def _finalise_network(all_buses, converters, lines, links, transformers): + """ + Finalises network components and prepares for export. + Parameters: + - buses (pd.DataFrame): DataFrame containing bus information. + - converters (pd.DataFrame): DataFrame containing converter information. + - lines (pd.DataFrame): DataFrame containing line information. + - links (pd.DataFrame): DataFrame containing link information. + - transformers (pd.DataFrame): DataFrame containing transformer information. + + Returns: + - tuple: A tuple containing the updated DataFrames for buses, converters, lines, links, and transformers + """ + logger.info("Finalising network components and preparing for export.") + buses_all = all_buses.copy() + converters_all = converters.copy() + lines_all = lines.copy() + links_all = links.copy() + transformers_all = transformers.copy() + + # BUSES + logger.info("- buses") + buses_all["symbol"] = "Substation" + buses_all["under_construction"] = False + buses_all["tags"] = buses_all["bus_id"].str.split("-").str[0] + buses_all["voltage"] = buses_all["voltage"] / 1000 + buses_all["x"] = buses_all["geometry"].x + buses_all["y"] = buses_all["geometry"].y + buses_all = buses_all.replace({True: "t", False: "f"}) + # Move to tags + buses_all = buses_all[["bus_id"] + BUSES_COLUMNS] + buses_all.set_index("bus_id", inplace=True) + + # LINES + logger.info("- lines") + lines_all["voltage"] = lines_all["voltage"] / 1000 + lines_all["length"] = lines_all["length"].round(2) + lines_all["under_construction"] = False + lines_all["tags"] = lines_all["contains_lines"] + lines_all["underground"] = lines_all["underground"].replace({True: "t", False: "f"}) + lines_all["under_construction"] = lines_all["under_construction"].replace( + {True: "t", False: "f"} + ) + lines_all = lines_all[["line_id"] + LINES_COLUMNS] + lines_all.set_index("line_id", inplace=True) + + # TRANSFORMERS + logger.info("- transformers.") + transformers_all["voltage_bus0"] = transformers_all["voltage_bus0"] / 1000 + transformers_all["voltage_bus1"] = transformers_all["voltage_bus1"] / 1000 + transformers_all = transformers_all[["transformer_id"] + TRANSFORMERS_COLUMNS] + transformers_all.set_index("transformer_id", inplace=True) + + # LINKS + if not links_all.empty: + logger.info("- links") + links_all["voltage"] = links_all["voltage"] / 1000 + links_all["length"] = links_all["length"].round(2) + links_all["under_construction"] = False + links_all["tags"] = links_all["link_id"].str.split("-").str[0] + links_all = links_all.replace({True: "t", False: "f"}) + links_all = links_all[["link_id"] + LINKS_COLUMNS] + links_all.set_index("link_id", inplace=True) + + # CONVERTERS + if not converters_all.empty: + logger.info("- converters") + converters_all["voltage"] = converters_all["voltage"] / 1000 + converters_all = converters_all.replace({True: "t", False: "f"}) + converters_all = converters_all[["converter_id"] + CONVERTERS_COLUMNS] + converters_all.set_index("converter_id", inplace=True) + + return buses_all, converters_all, lines_all, links_all, transformers_all + + +def build_network( + inputs, + country_shapes, + voltages, + line_types, +): logger.info("Reading input data.") + + # Buses buses = gpd.read_file(inputs["substations"]) + buses.drop(columns=["country"], inplace=True) + buses_polygon = gpd.read_file(inputs["substations_polygon"]) + buses_polygon["bus_id"] = buses_polygon["bus_id"].apply(lambda x: x.split("-")[0]) + buses_polygon.drop_duplicates(subset=["bus_id", "geometry"], inplace=True) + buses_polygon.drop(columns=["voltage"], inplace=True) + + # Lines lines = gpd.read_file(inputs["lines"]) - links = gpd.read_file(inputs["links"]) + lines = _merge_identical_lines(lines) + + ### DATA PROCESSING (AC) + buses_line_endings = _add_line_endings(buses, lines) + buses = pd.concat([buses, buses_line_endings], ignore_index=True) - lines = line_endings_to_bus_conversion(lines) - links = line_endings_to_bus_conversion(links) + # Split lines overpassing nearby buses (tolerance 1 m) + lines = split_overpassing_lines(lines, buses) + # Update end points + bool_virtual_buses = buses["bus_id"].str.startswith("virtual") + buses = buses[~bool_virtual_buses] + buses_updated_line_endings = _add_line_endings(buses, lines) + buses = pd.concat([buses, buses_updated_line_endings], ignore_index=True) + + # Update length of lines + lines["length"] = lines.to_crs(DISTANCE_CRS).length + + # Merging lines over virtual buses (buses that are not designated as substations, e.g. junctions) + merged_lines_map = _create_merge_mapping(lines, buses, buses_polygon) + lines, buses = _merge_lines_over_virtual_buses(lines, buses, merged_lines_map) + + # Create station seeds + stations = _create_station_seeds(buses, buses_polygon, country_shapes) + buses = _merge_buses_to_stations(buses, stations) + + # Drop lines that are fully within stations.polygon + internal_lines = gpd.sjoin(lines, stations, how="inner", predicate="within").line_id logger.info( - "Fixing lines overpassing nodes: Connecting nodes and splittling lines." + f"Dropping {len(internal_lines)} lines that are fully within aggregated substations." + ) + lines = lines[~lines.line_id.isin(internal_lines)].reset_index(drop=True) + logger.info(f"Keeping {len(lines)} lines.") + + # Map AC lines to buses + lines = _map_endpoints_to_buses( + connection=lines, + buses=buses, + shape="station_polygon", + id_col="line_id", + sjoin="intersects", ) - lines, buses = fix_overpassing_lines(lines, buses, DISTANCE_CRS, tol=1) - # Merge buses with same voltage and within tolerance - logger.info(f"Aggregating close substations with a tolerance of {BUS_TOL} m") - - lines, links, buses = merge_stations_lines_by_station_id_and_voltage( - lines, links, buses, DISTANCE_CRS, BUS_TOL - ) - - # Recalculate lengths of lines - utm = lines.estimate_utm_crs(datum_name="WGS 84") - lines["length"] = lines.to_crs(utm).length - links["length"] = links.to_crs(utm).length - - transformers = get_transformers(buses, lines) - converters = _get_converters(buses, links, DISTANCE_CRS) - - logger.info("Saving outputs") - - ### Convert output to pypsa-eur friendly format - # Rename "substation" in buses["symbol"] to "Substation" - buses["symbol"] = buses["symbol"].replace({"substation": "Substation"}) - - # Drop unnecessary index column and set respective element ids as index - lines.set_index("line_id", inplace=True) - if not links.empty: - links.set_index("link_id", inplace=True) - converters.set_index("converter_id", inplace=True) - transformers.set_index("transformer_id", inplace=True) - buses.set_index("bus_id", inplace=True) - - # Convert voltages from V to kV - lines["voltage"] = lines["voltage"] / 1000 - if not links.empty: - links["voltage"] = links["voltage"] / 1000 - if not converters.empty: - converters["voltage"] = converters["voltage"] / 1000 - transformers["voltage_bus0"], transformers["voltage_bus1"] = ( - transformers["voltage_bus0"] / 1000, - transformers["voltage_bus1"] / 1000, - ) - buses["voltage"] = buses["voltage"] / 1000 - - # Convert 'true' and 'false' to 't' and 'f' - lines = lines.replace({True: "t", False: "f"}) - links = links.replace({True: "t", False: "f"}) - converters = converters.replace({True: "t", False: "f"}) - buses = buses.replace({True: "t", False: "f"}) - - # Change column orders - lines = lines[LINES_COLUMNS] - if not links.empty: - links = links[LINKS_COLUMNS] - else: - links = pd.DataFrame(columns=["link_id"] + LINKS_COLUMNS) - links.set_index("link_id", inplace=True) - transformers = transformers[TRANSFORMERS_COLUMNS] + # Extend line geometries + lines = _extend_lines_to_buses(lines, buses) - # Export to csv for base_network - buses.to_csv(outputs["substations"], quotechar="'") - lines.to_csv(outputs["lines"], quotechar="'") - links.to_csv(outputs["links"], quotechar="'") - converters.to_csv(outputs["converters"], quotechar="'") - transformers.to_csv(outputs["transformers"], quotechar="'") + # Drop buses that are not connected to any line + bool_not_connected = ~( + buses["bus_id"].isin(lines["bus0"]) | buses["bus_id"].isin(lines["bus1"]) + ) + logger.info( + f"Dropping {bool_not_connected.sum()} buses that are not connected to any line." + ) + buses = buses[~bool_not_connected].reset_index(drop=True) + + # Obtain connected line capacities for each bus + buses = _determine_bus_capacity( + buses, + lines, + voltages, + line_types, + ) - # Export to GeoJSON for quick validations - buses.to_file(outputs["substations_geojson"]) - lines.to_file(outputs["lines_geojson"]) - links.to_file(outputs["links_geojson"]) - converters.to_file(outputs["converters_geojson"]) - transformers.to_file(outputs["transformers_geojson"]) + # Add transformers + transformers = _add_transformers(buses) - return None + ### DATA PROCESSING (DC) + links = gpd.read_file(inputs["links"]) + converters_polygon = gpd.read_file(inputs["converters_polygon"]) + + # Create DC buses + dc_buses = _add_dc_buses(converters_polygon, links, buses, country_shapes) + links, dc_buses = _map_links_to_dc_buses(links, dc_buses) + + # Concatenate AC and DC buses + buses["dc"] = False + dc_buses["dc"] = True + all_buses = pd.concat([buses, dc_buses], ignore_index=True) + + # Add suffix + links["link_id"] = ( + links["link_id"] + + "-" + + links["voltage"].div(1e3).astype(int).astype(str) + + "-DC" + ) + + # Extend DC links to DC buses + links = _extend_lines_to_buses(links, dc_buses) + + # Add PyPSA converter links between DC buses and AC buses + converters = _add_converter_links(dc_buses, buses) + + # Update all lengths + lines["length"] = lines.to_crs(DISTANCE_CRS).length + links["length"] = links.to_crs(DISTANCE_CRS).length + + ### Saving outputs to PyPSA-compatible format + buses_final, converters_final, lines_final, links_final, transformers_final = ( + _finalise_network(all_buses, converters, lines, links, transformers) + ) + + return buses_final, converters_final, lines_final, links_final, transformers_final if __name__ == "__main__": - # Detect running outside of snakemake and mock snakemake for testing if "snakemake" not in globals(): from _helpers import mock_snakemake @@ -858,6 +1598,29 @@ def build_network(inputs, outputs): configure_logging(snakemake) set_scenario_config(snakemake) - countries = snakemake.config["countries"] + # Parameters + voltages = snakemake.params.voltages + line_types = snakemake.params.line_types + country_shapes = gpd.read_file(snakemake.input["country_shapes"]).set_index("name") + + # Build network + buses, converters, lines, links, transformers = build_network( + snakemake.input, + country_shapes, + voltages, + line_types, + ) + + # Export to csv for base_network + buses.to_csv(snakemake.output["substations"], quotechar="'") + lines.drop(columns=["tags"]).to_csv(snakemake.output["lines"], quotechar="'") + links.to_csv(snakemake.output["links"], quotechar="'") + converters.to_csv(snakemake.output["converters"], quotechar="'") + transformers.to_csv(snakemake.output["transformers"], quotechar="'") - build_network(snakemake.input, snakemake.output) + # Export to GeoJSON for quick validations + buses.to_file(snakemake.output["substations_geojson"]) + lines.to_file(snakemake.output["lines_geojson"]) + links.to_file(snakemake.output["links_geojson"]) + converters.to_file(snakemake.output["converters_geojson"]) + transformers.to_file(snakemake.output["transformers_geojson"]) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 29eb91478..47e1c255d 100755 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -249,6 +249,7 @@ def nuts3(country_shapes, nuts3, nuts3pop, nuts3gdp, ch_cantons, ch_popgdp): set_scenario_config(snakemake) country_shapes = countries(snakemake.input.naturalearth, snakemake.params.countries) + country_shapes.reset_index().to_file(snakemake.output.country_shapes) offshore_shapes = eez(snakemake.input.eez, snakemake.params.countries) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 8669d0af5..1eff5d2a6 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -13,6 +13,7 @@ - Adding line endings to substations based on line data """ +import itertools import json import logging import os @@ -22,12 +23,20 @@ import numpy as np import pandas as pd from _helpers import configure_logging, set_scenario_config +from shapely.algorithms.polylabel import polylabel from shapely.geometry import LineString, MultiLineString, Point, Polygon from shapely.ops import linemerge, unary_union logger = logging.getLogger(__name__) +GEO_CRS = "EPSG:4326" +DISTANCE_CRS = "EPSG:3035" +BUS_TOL = ( + 500 # unit: meters, default 5000 - Buses within this distance are grouped together +) + + def _create_linestring(row): """ Create a LineString object from the given row. @@ -66,29 +75,6 @@ def _create_polygon(row): return polygon -def _find_closest_polygon(gdf, point): - """ - Find the closest polygon in a GeoDataFrame to a given point. - - Parameters: - gdf (GeoDataFrame): A GeoDataFrame containing polygons. - point (Point): A Point object representing the target point. - - Returns: - int: The index of the closest polygon in the GeoDataFrame. - """ - # Compute the distance to each polygon - gdf["distance"] = gdf["geometry"].apply(lambda geom: point.distance(geom)) - - # Find the index of the closest polygon - closest_idx = gdf["distance"].idxmin() - - # Get the closest polygon's row - closest_polygon = gdf.loc[closest_idx] - - return closest_idx - - def _clean_voltage(column): """ Function to clean the raw voltage column: manual fixing and drop nan values @@ -393,120 +379,6 @@ def _distribute_to_circuits(row): return single_circuit -def _add_line_endings_to_substations( - df_substations, - gdf_lines, - path_country_shapes, - path_offshore_shapes, - prefix, -): - """ - Add line endings to substations. - - This function takes two pandas DataFrames, `substations` and `lines`, and - adds line endings to the substations based on the information from the - lines DataFrame. - - Parameters: - - substations (pandas DataFrame): DataFrame containing information about - substations. - - lines (pandas DataFrame): DataFrame containing information about lines. - - Returns: - - buses (pandas DataFrame): DataFrame containing the updated information - about substations with line endings. - """ - if gdf_lines.empty: - return df_substations - - logger.info("Adding line endings to substations") - # extract columns from df_substations - bus_s = pd.DataFrame(columns=df_substations.columns) - bus_e = pd.DataFrame(columns=df_substations.columns) - - # TODO pypsa-eur: fix country code to contain single country code - # Read information from gdf_lines - bus_s[["voltage", "country"]] = gdf_lines[["voltage", "country"]] - bus_s.loc[:, "geometry"] = gdf_lines.geometry.boundary.map( - lambda p: p.geoms[0] if len(p.geoms) >= 2 else None - ) - bus_s.loc[:, "lon"] = bus_s["geometry"].map(lambda p: p.x if p != None else None) - bus_s.loc[:, "lat"] = bus_s["geometry"].map(lambda p: p.y if p != None else None) - bus_s.loc[:, "dc"] = gdf_lines["dc"] - - bus_e[["voltage", "country"]] = gdf_lines[["voltage", "country"]] - bus_e.loc[:, "geometry"] = gdf_lines.geometry.boundary.map( - lambda p: p.geoms[1] if len(p.geoms) >= 2 else None - ) - bus_e.loc[:, "lon"] = bus_e["geometry"].map(lambda p: p.x if p != None else None) - bus_e.loc[:, "lat"] = bus_e["geometry"].map(lambda p: p.y if p != None else None) - bus_e.loc[:, "dc"] = gdf_lines["dc"] - - bus_all = pd.concat([bus_s, bus_e], ignore_index=True) - - # Group gdf_substations by voltage and and geometry (dropping duplicates) - bus_all = bus_all.groupby(["voltage", "lon", "lat", "dc"]).first().reset_index() - bus_all = bus_all[df_substations.columns] - bus_all.loc[:, "bus_id"] = bus_all.apply( - lambda row: f"{prefix}/{row.name + 1}", axis=1 - ) - - # Initialize default values - bus_all["station_id"] = None - # Assuming substations completed for installed lines - bus_all["under_construction"] = False - bus_all["tag_area"] = None - bus_all["symbol"] = "substation" - # TODO: this tag may be improved, maybe depending on voltage levels - bus_all["tag_substation"] = "transmission" - bus_all["tag_source"] = prefix - - buses = pd.concat([df_substations, bus_all], ignore_index=True) - buses.set_index("bus_id", inplace=True) - - # Fix country codes - # TODO pypsa-eur: Temporary solution as long as the shapes have a low, - # incomplete resolution (cf. 2500 meters for buffering) - bool_multiple_countries = buses["country"].str.contains(";") - gdf_offshore = gpd.read_file(path_offshore_shapes).set_index("name")["geometry"] - gdf_offshore = gpd.GeoDataFrame( - gdf_offshore, geometry=gdf_offshore, crs=gdf_offshore.crs - ) - gdf_countries = gpd.read_file(path_country_shapes).set_index("name")["geometry"] - # reproject to enable buffer - gdf_countries = gpd.GeoDataFrame(geometry=gdf_countries, crs=gdf_countries.crs) - gdf_union = gdf_countries.merge( - gdf_offshore, how="outer", left_index=True, right_index=True - ) - gdf_union["geometry"] = gdf_union.apply( - lambda row: gpd.GeoSeries([row["geometry_x"], row["geometry_y"]]).union_all(), - axis=1, - ) - gdf_union = gpd.GeoDataFrame(geometry=gdf_union["geometry"], crs=crs) - gdf_buses_tofix = gpd.GeoDataFrame( - buses[bool_multiple_countries], geometry="geometry", crs=crs - ) - joined = gpd.sjoin( - gdf_buses_tofix, gdf_union.reset_index(), how="left", predicate="within" - ) - - # For all remaining rows where the country/index_right column is NaN, find - # find the closest polygon index - joined.loc[joined["name"].isna(), "name"] = joined.loc[ - joined["name"].isna(), "geometry" - ].apply(lambda x: _find_closest_polygon(gdf_union, x)) - - joined.reset_index(inplace=True) - joined = joined.drop_duplicates(subset="bus_id") - joined.set_index("bus_id", inplace=True) - - buses.loc[bool_multiple_countries, "country"] = joined.loc[ - bool_multiple_countries, "name" - ] - - return buses.reset_index() - - def _import_lines_and_cables(path_lines): """ Import lines and cables from the given input paths. @@ -584,19 +456,14 @@ def _import_lines_and_cables(path_lines): continue logger.info("---") - return df_lines + # Append prefix "way/" + df_lines["id"] = "way/" + df_lines["id"] + return df_lines -def _import_links(path_links): - """ - Import links from the given input paths. - - Parameters: - - path_links (dict): A dictionary containing the input paths for links. - Returns: - - df_links (DataFrame): A DataFrame containing the imported links data. - """ +def _import_routes_relation(path_relation): + """ """ columns = [ "id", "bounds", @@ -604,23 +471,23 @@ def _import_links(path_links): "geometry", "country", "circuits", + "cables", "frequency", - "rating", "voltage", ] - df_links = pd.DataFrame(columns=columns) + df_relation = pd.DataFrame(columns=columns) - logger.info("Importing links") - for key in path_links: + logger.info("Importing power route relations (lines, cables, links)") + for key in path_relation: logger.info(f"Processing {key}...") - for idx, ip in enumerate(path_links[key]): + for idx, ip in enumerate(path_relation[key]): if ( os.path.exists(ip) and os.path.getsize(ip) > 400 ): # unpopulated OSM json is about 51 bytes - country = os.path.basename(os.path.dirname(path_links[key][idx])) + country = os.path.basename(os.path.dirname(path_relation[key][idx])) logger.info( - f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(path_links[key])).zfill(2)}: {ip}" + f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(path_relation[key])).zfill(2)}: {ip}" ) with open(ip, "r") as f: data = json.load(f) @@ -632,9 +499,10 @@ def _import_links(path_links): col_tags = [ "circuits", + "cables", "frequency", - "rating", "voltage", + "rating", ] tags = pd.json_normalize(df["tags"]).map( @@ -650,24 +518,15 @@ def _import_links(path_links): df = pd.concat([df, tags], axis="columns") df.drop(columns=["type", "tags"], inplace=True) - df_links = pd.concat([df_links, df], axis="rows") + df_relation = pd.concat([df_relation, df], axis="rows") else: logger.info( - f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(path_links[key])).zfill(2)} (empty): {ip}" + f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(path_relation[key])).zfill(2)} (empty): {ip}" ) continue - logger.info("---") - logger.info("Dropping lines without rating.") - len_before = len(df_links) - df_links = df_links.dropna(subset=["rating"]) - len_after = len(df_links) - logger.info( - f"Dropped {len_before-len_after} elements without rating. " - + f"Imported {len_after} elements." - ) - return df_links + return df_relation def _create_single_link(row): @@ -720,6 +579,40 @@ def _create_single_link(row): return single_link +def _extract_members(row): + df = pd.json_normalize(row["members"]) + df["ref"] = df["ref"].astype(str) + df["ways"] = "way/" + df["ref"] + member_ids = df["ways"].values.tolist() + + return member_ids + + +def _create_line(row): + """ + Create a line from multiple rows. Drops closed geometries (substations). + + Parameters: + - row: A row of OSM data containing information about the relation/line. + + Returns: + - line: LineString/MultiLineString representing the relation/line. + """ + df = pd.json_normalize(row["members"]) + df["ref"] = df["ref"].astype(str) + df["ways"] = "way/" + df["ref"] + # Drop NAs + df = df.dropna(subset=["geometry"]) + df.loc[:, "geometry"] = df.apply(_create_linestring, axis=1) + # Drop closed geometries (substations) + closed_geom = df["geometry"].apply(lambda x: x.is_closed) + + line = linemerge(df[~closed_geom]["geometry"].values.tolist()) + members = df[~closed_geom]["ways"].values.tolist() + + return line, members + + def _drop_duplicate_lines(df_lines): """ Drop duplicate lines from the given dataframe. Duplicates are usually lines @@ -765,14 +658,14 @@ def _drop_duplicate_lines(df_lines): return df_lines -def _filter_by_voltage(df, min_voltage=200000): +def _filter_by_voltage(df, min_voltage=220000): """ Filter rows in the DataFrame based on the voltage in V. Parameters: - df (pandas.DataFrame): The DataFrame containing the substations or lines data. - min_voltage (int, optional): The minimum voltage value to filter the - rows. Defaults to 200000 [unit: V]. + rows. Defaults to 220000 [unit: V]. Returns: - filtered df (pandas.DataFrame): The filtered DataFrame containing @@ -1062,29 +955,29 @@ def _create_substations_geometry(df_substations): logger.info("Creating substations geometry.") df_substations = df_substations.copy() - # Create centroids from geometries and keep the original polygons + # Create PoI from geometries and keep the original polygons df_substations.loc[:, "polygon"] = df_substations["geometry"] return df_substations -def _create_substations_centroid(df_substations): +def _create_substations_poi(df_substations, tol=BUS_TOL / 2): """ - Creates centroids from geometries and keeps the original polygons. + Creates Pole of Inaccessibility (PoI) from geometries and keeps the original polygons. Parameters: df_substations (DataFrame): The input DataFrame containing the substations data. Returns: - df_substations (DataFrame): A new DataFrame with the centroids ["geometry"] + df_substations (DataFrame): A new DataFrame with the PoI ["geometry"] and polygons ["polygon"] of the substations geometries. """ logger.info("Creating substations geometry.") df_substations = df_substations.copy() df_substations.loc[:, "geometry"] = df_substations["polygon"].apply( - lambda x: x.centroid + lambda polygon: polylabel(polygon, tol) ) df_substations.loc[:, "lon"] = df_substations["geometry"].apply(lambda x: x.x) @@ -1120,14 +1013,14 @@ def _create_lines_geometry(df_lines): return df_lines -def _add_bus_centroid_to_line(linestring, point): +def _add_bus_poi_to_line(linestring, point): """ - Adds the centroid of a substation to a linestring by extending the + Adds the PoI of a substation to a linestring by extending the linestring with a new segment. Parameters: linestring (LineString): The original linestring to extend. - point (Point): The centroid of the bus. + point (Point): The PoI of the bus. Returns: merged (LineString): The extended linestring with the new segment. @@ -1173,32 +1066,23 @@ def _finalise_substations(df_substations): ) # Initiate new columns for subsequent build_osm_network step - df_substations.loc[:, "symbol"] = "substation" - df_substations.loc[:, "tag_substation"] = "transmission" - df_substations.loc[:, "dc"] = False - df_substations.loc[df_substations["frequency"] == "0", "dc"] = True - df_substations.loc[:, "under_construction"] = False - df_substations.loc[:, "station_id"] = None - df_substations.loc[:, "tag_area"] = None - df_substations.loc[:, "tag_source"] = df_substations["bus_id"] + df_substations.loc[:, "contains"] = df_substations["bus_id"].apply( + lambda x: x.split("-")[0] + ) + + # Initialise x_node column (if the bus is part of an interconnector) to False, will be set later + df_substations.loc[:, "x_node"] = False # Only included needed columns df_substations = df_substations[ [ "bus_id", - "symbol", - "tag_substation", "voltage", - "lon", - "lat", - "dc", - "under_construction", - "station_id", - "tag_area", "country", + "x_node", "geometry", "polygon", - "tag_source", + "contains", ] ] @@ -1238,33 +1122,23 @@ def _finalise_lines(df_lines): df_lines.loc[:, "underground"] = False df_lines.loc[df_lines["tag_type"] == "line", "underground"] = False df_lines.loc[df_lines["tag_type"] == "cable", "underground"] = True - df_lines.loc[:, "under_construction"] = False - df_lines.loc[:, "dc"] = False - df_lines.loc[df_lines["tag_frequency"] == "50", "dc"] = False - df_lines.loc[df_lines["tag_frequency"] == "0", "dc"] = True # Only include needed columns df_lines = df_lines[ [ "line_id", "circuits", - "tag_type", "voltage", - "tag_frequency", "bus0", "bus1", "length", "underground", - "under_construction", - "dc", - "country", "geometry", ] ] df_lines["circuits"] = df_lines["circuits"].astype(int) df_lines["voltage"] = df_lines["voltage"].astype(int) - df_lines["tag_frequency"] = df_lines["tag_frequency"].astype(int) return df_lines @@ -1296,8 +1170,6 @@ def _finalise_links(df_links): df_links["bus1"] = None df_links["length"] = None df_links["underground"] = True - df_links["under_construction"] = False - df_links["dc"] = True # Only include needed columns df_links = df_links[ @@ -1309,9 +1181,6 @@ def _finalise_links(df_links): "bus1", "length", "underground", - "under_construction", - "dc", - "country", "geometry", ] ] @@ -1537,7 +1406,7 @@ def _merge_touching_polygons(df): return gdf -def _add_endpoints_to_line(linestring, polygon_dict): +def _add_endpoints_to_line(linestring, polygon_dict, tol=BUS_TOL / 2): """ Adds endpoints to a line by removing any overlapping areas with polygons. @@ -1550,10 +1419,10 @@ def _add_endpoints_to_line(linestring, polygon_dict): """ if not polygon_dict: return linestring - polygon_centroids = { - bus_id: polygon.centroid for bus_id, polygon in polygon_dict.items() + polygon_pois = { + bus_id: polylabel(polygon, tol) for bus_id, polygon in polygon_dict.items() } - polygon_unary = polygons = unary_union(list(polygon_dict.values())) + polygon_unary = unary_union(list(polygon_dict.values())) # difference with polygon linestring_new = linestring.difference(polygon_unary) @@ -1562,8 +1431,8 @@ def _add_endpoints_to_line(linestring, polygon_dict): # keep the longest line in the multilinestring linestring_new = max(linestring_new.geoms, key=lambda x: x.length) - for p in polygon_centroids: - linestring_new = _add_bus_centroid_to_line(linestring_new, polygon_centroids[p]) + for p in polygon_pois: + linestring_new = _add_bus_poi_to_line(linestring_new, polygon_pois[p]) return linestring_new @@ -1593,9 +1462,9 @@ def _get_polygons_at_endpoints(linestring, polygon_dict): return bus_id_polygon_dict -def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): +def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon, tol=BUS_TOL / 2): """ - Extends the lines in the given GeoDataFrame `gdf_lines` to the centroid of + Extends the lines in the given GeoDataFrame `gdf_lines` to the Pole of Inaccessibility (PoI) of the nearest substations represented by the polygons in the `gdf_substations_polygon` GeoDataFrame. @@ -1606,6 +1475,9 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): Returns: GeoDataFrame: A new GeoDataFrame with the lines extended to the substations. """ + logger.info( + "Extending lines ending in substations to interior 'Pole of Inaccessibility'" + ) gdf = gpd.sjoin( gdf_lines, gdf_substations_polygon.drop_duplicates(subset="polygon", inplace=False), @@ -1642,7 +1514,9 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): ) gdf.loc[:, "line_geometry_new"] = gdf.apply( - lambda row: _add_endpoints_to_line(row["line_geometry"], row["bus_endpoints"]), + lambda row: _add_endpoints_to_line( + row["line_geometry"], row["bus_endpoints"], tol + ), axis=1, ) @@ -1654,7 +1528,9 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): return gdf_lines -# Function to bridge gaps between all lines +def _check_if_ways_in_multi(list, longer_list): + # Check if any of the elements in list are in longer_list + return any([x in longer_list for x in list]) if __name__ == "__main__": @@ -1668,7 +1544,7 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): # Parameters crs = "EPSG:4326" # Correct crs for OSM data - min_voltage_ac = 200000 # [unit: V] Minimum voltage value to filter AC lines. + min_voltage_ac = 220000 # [unit: V] Minimum voltage value to filter AC lines. min_voltage_dc = 150000 # [unit: V] Minimum voltage value to filter DC links. logger.info("---") @@ -1682,6 +1558,15 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): # Cleaning process df_substations = _import_substations(path_substations) df_substations["voltage"] = _clean_voltage(df_substations["voltage"]) + + # Extract converter subset + df_substations.reset_index(drop=True, inplace=True) + converter_candidates = ( + df_substations["substation"].str.contains("converter").dropna() + ) + converter_candidates = converter_candidates[converter_candidates].index + df_converters = df_substations.loc[converter_candidates] + df_substations, list_voltages = _filter_by_voltage( df_substations, min_voltage=min_voltage_ac ) @@ -1691,7 +1576,7 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): # Merge touching polygons df_substations = _merge_touching_polygons(df_substations) - df_substations = _create_substations_centroid(df_substations) + df_substations = _create_substations_poi(df_substations) df_substations = _finalise_substations(df_substations) # Create polygon GeoDataFrame to remove lines within substations @@ -1703,6 +1588,108 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): gdf_substations_polygon["geometry"] = gdf_substations_polygon.polygon.copy() + # Continue cleaning of converters + logger.info("---") + logger.info("CONVERTERS") + logger.info(f"Extracting {len(df_converters)} converters as subset of substations.") + df_converters, list_converter_voltages = _filter_by_voltage( + df_converters, min_voltage=min_voltage_dc + ) + df_converters.reset_index(drop=True, inplace=True) + gdf_converters = gpd.GeoDataFrame( + df_converters[["id", "geometry"]], geometry="geometry", crs=crs + ) + + ### Lines/Cables relations + logger.info("---") + logger.info("AC LINES/CABLES RELATIONS") + path_routes_relation = { + "routes_relation": snakemake.input.routes_relation, + } + + df_routes_relation = _import_routes_relation(path_routes_relation) + + df_lines_cables_relation = df_routes_relation.copy() + df_lines_cables_relation = _drop_duplicate_lines(df_lines_cables_relation) + df_lines_cables_relation.loc[:, "voltage"] = _clean_voltage( + df_lines_cables_relation["voltage"] + ) + df_lines_cables_relation, list_voltages = _filter_by_voltage( + df_lines_cables_relation, min_voltage=min_voltage_ac + ) + df_lines_cables_relation.loc[:, "frequency"] = _clean_frequency( + df_lines_cables_relation["frequency"] + ) + df_lines_cables_relation = df_lines_cables_relation[ + df_lines_cables_relation["frequency"] != "0" + ] + df_lines_cables_relation["frequency"] = "50" + df_lines_cables_relation.loc[:, "circuits"] = _clean_circuits( + df_lines_cables_relation["circuits"] + ) + df_lines_cables_relation.loc[:, "cables"] = _clean_cables( + df_lines_cables_relation["cables"] + ) + df_lines_cables_relation = _clean_lines(df_lines_cables_relation, list_voltages) + df_lines_cables_relation.drop( + columns=["voltage_original", "cleaned", "circuits_original", "split_elements"], + inplace=True, + ) + + # Create geometries + df_lines_cables_relation["components"] = df_lines_cables_relation.apply( + _create_line, axis=1 + ) + df_lines_cables_relation["geometry"] = df_lines_cables_relation["components"].apply( + lambda x: x[0] + ) + df_lines_cables_relation["contains"] = df_lines_cables_relation["components"].apply( + lambda x: x[1] + ) + + # Show lines that are multilinestring + subset_multi_linestrings = df_lines_cables_relation[ + df_lines_cables_relation["geometry"].apply( + lambda x: x.geom_type == "MultiLineString" + ) + ].copy() + ways_in_multi_linestring = pd.Series( + itertools.chain(*subset_multi_linestrings["contains"]) + ).unique() + + # Mark rows that have ways members in subset_multi_linestrings (These cannot be used) + # TODO: Improve this in the future, to make this constraint less tight (if it is in another multi_linstring), reduce the underlying way/circuit accordingly. + # TODO: Then, the relation can still be added. + df_lines_cables_relation["contains_ways_in_multi"] = df_lines_cables_relation[ + "contains" + ].apply(lambda x: _check_if_ways_in_multi(x, ways_in_multi_linestring)) + + # Only keep rows that have contains_ways_in_multi == False + df_lines_cables_relation = df_lines_cables_relation[ + ~df_lines_cables_relation["contains_ways_in_multi"] + ] + # Only keep rows if instance of Linestring + df_lines_cables_relation = df_lines_cables_relation[ + df_lines_cables_relation["geometry"].apply( + lambda x: x.geom_type == "LineString" + ) + ] + ways_to_replace = pd.Series( + itertools.chain(*df_lines_cables_relation["contains"]) + ).unique() + + df_lines_cables_relation.rename(columns={"id": "line_id"}, inplace=True) + df_lines_cables_relation = df_lines_cables_relation[ + ["line_id", "circuits", "voltage", "geometry", "contains"] + ] + df_lines_cables_relation["circuits"] = df_lines_cables_relation["circuits"].astype( + int + ) + df_lines_cables_relation["voltage"] = df_lines_cables_relation["voltage"].astype( + int + ) + + # Lines and cables logger.info("---") logger.info("LINES AND CABLES") path_lines = { @@ -1710,15 +1697,29 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): "cables": snakemake.input.cables_way, } - # Cleaning process + # Import and replace with relations, if relations unique linestrings and line is a member df_lines = _import_lines_and_cables(path_lines) df_lines = _drop_duplicate_lines(df_lines) + + # Dropping + len_before = len(df_lines) + connection_type = ( + df_lines[df_lines["id"].isin(ways_to_replace)].set_index("id")["power"].copy() + ) + df_lines = df_lines[~df_lines["id"].isin(ways_to_replace)] + len_after = len(df_lines) + logger.info( + f"Dropping {len_before - len_after} OSM ways (AC lines and cables) for their parent OSM relations." + ) + + # Cleaning process df_lines.loc[:, "voltage"] = _clean_voltage(df_lines["voltage"]) df_lines, list_voltages = _filter_by_voltage(df_lines, min_voltage=min_voltage_ac) df_lines.loc[:, "circuits"] = _clean_circuits(df_lines["circuits"]) df_lines.loc[:, "cables"] = _clean_cables(df_lines["cables"]) df_lines.loc[:, "frequency"] = _clean_frequency(df_lines["frequency"]) df_lines.loc[:, "wires"] = _clean_wires(df_lines["wires"]) + df_lines = _clean_lines(df_lines, list_voltages) # Drop DC lines, will be added through relations later @@ -1728,22 +1729,47 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): logger.info( f"Dropped {len_before - len_after} DC lines. Keeping {len_after} AC lines." ) - df_lines = _create_lines_geometry(df_lines) df_lines = _finalise_lines(df_lines) + df_lines["contains"] = df_lines["line_id"].apply(lambda x: [x.split("-")[0]]) + + # Merge + # Map element in list of column ways to connection_type + df_lines_cables_relation["underground"] = df_lines_cables_relation[ + "contains" + ].apply(lambda x: [connection_type[way] for way in x if way in connection_type]) + df_lines_cables_relation["underground"] = df_lines_cables_relation[ + "underground" + ].apply(lambda x: list(set(x))) + df_lines_cables_relation["underground"] = df_lines_cables_relation[ + "underground" + ].apply(lambda x: True if x == ["cable"] else False) + + df_lines = pd.concat( + [df_lines, df_lines_cables_relation], axis=0, ignore_index=True + ) # Create GeoDataFrame gdf_lines = gpd.GeoDataFrame(df_lines, geometry="geometry", crs=crs) gdf_lines = _remove_lines_within_substations(gdf_lines, gdf_substations_polygon) - gdf_lines = _extend_lines_to_substations(gdf_lines, gdf_substations_polygon) + + gdf_lines = _extend_lines_to_substations( + gdf_lines, gdf_substations_polygon, tol=BUS_TOL / 2 + ) logger.info("---") logger.info("HVDC LINKS") - path_links = { - "links": snakemake.input.links_relation, - } - df_links = _import_links(path_links) + df_links = df_routes_relation.copy() + + logger.info("Dropping lines without rating.") + len_before = len(df_links) + df_links = df_links.dropna(subset=["rating"]) + len_after = len(df_links) + logger.info( + f"Dropped {len_before-len_after} elements without rating. " + + f"Imported {len_after} elements." + ) df_links = _drop_duplicate_lines(df_links) df_links.loc[:, "voltage"] = _clean_voltage(df_links["voltage"]) @@ -1761,26 +1787,6 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): "link_id" ) - # Add line endings to substations - path_country_shapes = snakemake.input.country_shapes - path_offshore_shapes = snakemake.input.offshore_shapes - - df_substations = _add_line_endings_to_substations( - df_substations, - gdf_lines, - path_country_shapes, - path_offshore_shapes, - prefix="line-end", - ) - - df_substations = _add_line_endings_to_substations( - df_substations, - gdf_links, - path_country_shapes, - path_offshore_shapes, - prefix="link-end", - ) - # Drop polygons and create GDF gdf_substations = gpd.GeoDataFrame( df_substations.drop(columns=["polygon"]), geometry="geometry", crs=crs @@ -1788,6 +1794,7 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): output_substations_polygon = snakemake.output["substations_polygon"] output_substations = snakemake.output["substations"] + output_converters_polygon = snakemake.output["converters_polygon"] output_lines = snakemake.output["lines"] output_links = snakemake.output["links"] @@ -1799,6 +1806,8 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): ) logger.info(f"Exporting clean substations to {output_substations}") gdf_substations.to_file(output_substations, driver="GeoJSON") + logger.info(f"Exporting converter polygons to {output_converters_polygon}") + gdf_converters.to_file(output_converters_polygon, driver="GeoJSON") logger.info(f"Exporting clean lines to {output_lines}") gdf_lines.to_file(output_lines, driver="GeoJSON") logger.info(f"Exporting clean links to {output_links}") diff --git a/scripts/prepare_osm_network_release.py b/scripts/prepare_osm_network_release.py index ac6b25354..b66d851f7 100644 --- a/scripts/prepare_osm_network_release.py +++ b/scripts/prepare_osm_network_release.py @@ -19,6 +19,7 @@ "dc", "symbol", "under_construction", + "tags", "x", "y", "country", @@ -44,6 +45,7 @@ "length", "underground", "under_construction", + "tags", "geometry", ] TRANSFORMERS_COLUMNS = [ @@ -52,6 +54,7 @@ "bus1", "voltage_bus0", "voltage_bus1", + "s_nom", "geometry", ] CONVERTERS_COLUMNS = [ @@ -59,6 +62,7 @@ "bus0", "bus1", "voltage", + "p_nom", "geometry", ] @@ -109,6 +113,12 @@ def export_clean_csv(df, columns, output_file): network.lines.length = network.lines.length * 1e3 network.links.length = network.links.length * 1e3 + # Sort alphabetically + network.buses.sort_index(inplace=True) + network.transformers.sort_index(inplace=True) + network.lines.sort_index(inplace=True) + network.links.sort_index(inplace=True) + # Export to clean csv for release logger.info(f"Exporting {len(network.buses)} buses to %s", snakemake.output.buses) export_clean_csv(network.buses, BUSES_COLUMNS, snakemake.output.buses) diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index 745533cff..d51d871f5 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -31,7 +31,7 @@ def retrieve_osm_data( features=[ "cables_way", "lines_way", - "links_relation", + "routes_relation", "substations_way", "substations_relation", ], @@ -51,6 +51,7 @@ def retrieve_osm_data( A list of OSM features to retrieve. The default is [ "cables_way", "lines_way", + "routes_relation", "substations_way", "substations_relation", ]. @@ -61,7 +62,7 @@ def retrieve_osm_data( features_dict = { "cables_way": 'way["power"="cable"]', "lines_way": 'way["power"="line"]', - "links_relation": 'relation["route"="power"]["frequency"="0"]', + "routes_relation": 'relation["route"="power"]', "substations_way": 'way["power"="substation"]', "substations_relation": 'relation["power"="substation"]', }