Skip to content

Commit

Permalink
assign all facility locations
Browse files Browse the repository at this point in the history
  • Loading branch information
Hussein-Mahfouz committed Aug 28, 2024
1 parent 1d97b59 commit d4da468
Showing 1 changed file with 344 additions and 0 deletions.
344 changes: 344 additions & 0 deletions scripts/3.3_assign_facility_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,344 @@
import geopandas as gpd
import pandas as pd
from libpysal.weights import Queen

import acbm
from acbm.assigning.plots import plot_desire_lines, plot_scatter_actual_reported
from acbm.assigning.select_facility import map_activity_locations, select_facility
from acbm.logger_config import assigning_primary_locations_logger as logger

# --- Load data: activity chains
# Read the legs table and discard the two columns this script does not use.
# "Unnamed: 0" is the name pandas gives to a CSV column that has no header.
logger.info("Loading activity chains")

legs_path = acbm.root_path / "data/processed/activities_pam/legs.csv"
activity_chains = pd.read_csv(legs_path).drop(columns=["Unnamed: 0", "freq"])

# --- Preprocess: Split activity chains by activity purpose
# Home, work and education legs each get their own table; every remaining
# destination activity is grouped together as a secondary activity.
logger.info("Splitting activity chains by activity purpose")

destination_activity = activity_chains["destination activity"]

activity_chains_home = activity_chains[destination_activity == "home"]
activity_chains_work = activity_chains[destination_activity == "work"]
activity_chains_edu = activity_chains[destination_activity == "education"]

# secondary activities: anything that is not one of the three primary purposes
activities_to_exclude = ["home", "work", "education"]
activity_chains_other = activity_chains[
    ~destination_activity.isin(activities_to_exclude)
]

# --- Load data: POI locations
logger.info("Loading facility data")

facilities_path = (
    acbm.root_path / "data/external/boundaries/west-yorkshire_epsg_4326.parquet"
)
osm_data_gdf = gpd.read_parquet(facilities_path)

# --- Load data: Boundaries
logger.info("Loading boundaries data")

# Only the output areas whose MSOA name contains "Leeds" are read from the file
where_clause = "MSOA21NM LIKE '%Leeds%'"
boundaries_path = acbm.root_path / "data/external/boundaries/oa_england.geojson"

boundaries = gpd.read_file(boundaries_path, where=where_clause).to_crs(epsg=4326)

# --- Preprocess: add zone column to POI data
logger.info("Adding zone column to POI data")

# Reproject the POIs to the CRS of the boundaries, then attach the OA21CD code
# of the polygon each POI lies within. The inner join discards POIs that fall
# inside none of the selected zones.
zone_polygons = boundaries[["OA21CD", "geometry"]]
osm_data_gdf = gpd.sjoin(
    osm_data_gdf.to_crs(boundaries.crs),
    zone_polygons,
    how="inner",
    predicate="within",
)


# --- Analysis: SELECTING FACILITIES
logger.info("Selecting facilities")

# Get neighboring zones
logger.info("1. Calculating neighboring zones")

# Queen contiguity weights over the zone polygons, identified by OA21CD code.
# Only the neighbor lookup is kept; it is passed to every select_facility call.
queen_weights = Queen.from_dataframe(boundaries, idVariable="OA21CD")
zone_neighbors = queen_weights.neighbors

# - HOME LOCATIONS
logger.info("2. Selecting HOME locations")

# One row per household, keeping only the household id, the activity type and
# the destination zone (the columns named in the select_facility call below).
home_columns = ["hid", "destination activity", "dzone"]
activity_chains_home_hh = activity_chains_home.drop_duplicates(subset=["hid"])[
    home_columns
]

# Choose a facility per household.
# NOTE(review): the sampling and fallback behaviour lives in select_facility,
# which is not visible in this file.
activity_locations_home = select_facility(
    df=activity_chains_home_hh,
    unique_id_col="hid",
    row_destination_zone_col="dzone",
    row_activity_type_col="destination activity",
    facilities_gdf=osm_data_gdf,
    gdf_facility_zone_col="OA21CD",
    gdf_facility_type_col="activities",
    gdf_sample_col="floor_area",
    neighboring_zones=zone_neighbors,
)

# Attach the selected home location to every home leg, keyed by household id
activity_chains_home = map_activity_locations(
    activity_chains_df=activity_chains_home,
    activity_locations_dict=activity_locations_home,
    id_col="hid",
)

# - WORK LOCATIONS
logger.info("3. Selecting WORK locations")

# Choose a facility for the work legs, keyed by person id.
# NOTE(review): unlike the home step, the rows are not deduplicated on the id
# column first; confirm how select_facility treats a pid that appears on
# several work legs.
activity_locations_work = select_facility(
    df=activity_chains_work,
    unique_id_col="pid",
    row_destination_zone_col="dzone",
    row_activity_type_col="destination activity",
    facilities_gdf=osm_data_gdf,
    gdf_facility_zone_col="OA21CD",
    gdf_facility_type_col="activities",
    gdf_sample_col="floor_area",
    neighboring_zones=zone_neighbors,
)

# Attach the selected work location to every work leg, keyed by person id
activity_chains_work = map_activity_locations(
    activity_chains_df=activity_chains_work,
    activity_locations_dict=activity_locations_work,
    id_col="pid",
)


# - EDUCATION LOCATIONS
logger.info("4. Selecting EDUCATION locations")

logger.info("a. Adding eduction type as fallback")
# Load the matched SPC/NTS table (it is reused further down for the plots)
spc_with_nts = pd.read_parquet(
    acbm.root_path / "data/interim/matching/spc_with_nts_trips.parquet"
)
# One education_type per person id
spc_with_nts_edu = spc_with_nts[["id", "education_type"]].drop_duplicates(subset="id")

# Left-merge the education type onto the education legs, then drop the
# duplicated key column that the merge brings in
activity_chains_edu = activity_chains_edu.merge(
    spc_with_nts_edu, left_on="pid", right_on="id", how="left"
)
activity_chains_edu = activity_chains_edu.drop(columns=["id"])

logger.info("b. Selecting education locations")

# The activity type is read from education_type rather than from
# "destination activity"; "education" is passed as the fallback type.
# NOTE(review): presumably the fallback is used when no facility matches the
# person's education_type - confirm against select_facility.
activity_locations_edu = select_facility(
    df=activity_chains_edu,
    unique_id_col="pid",
    row_destination_zone_col="dzone",
    row_activity_type_col="education_type",
    facilities_gdf=osm_data_gdf,
    gdf_facility_zone_col="OA21CD",
    gdf_facility_type_col="activities",
    gdf_sample_col="floor_area",
    neighboring_zones=zone_neighbors,
    fallback_type="education",
)

# Attach the selected education location to every education leg, keyed by
# person id
activity_chains_edu = map_activity_locations(
    activity_chains_df=activity_chains_edu,
    activity_locations_dict=activity_locations_edu,
    id_col="pid",
)


# - SECONDARY LOCATIONS
logger.info("5. Selecting SECONDARY locations")

logger.info("a. creating unique_id column")
# pid and hid are not unique id columns, as there can be many different secondary
# activities done by the same person.
# We create a unique identifier that can be mapped back to the original data.

# Unique id column: "<pid>_<seq>".
# assign() returns a new DataFrame, so the column is not written onto a
# filtered slice of activity_chains (which triggers pandas'
# SettingWithCopyWarning).
activity_chains_other = activity_chains_other.assign(
    act_id=activity_chains_other["pid"].astype(str)
    + "_"
    + activity_chains_other["seq"].astype(str)
)

logger.info("b. Selecting secondary locations")

# Choose a facility per secondary activity. The activity type is read from
# "purp" here, and fallback_to_random is switched on.
# NOTE(review): presumably a random facility is drawn when none matches the
# activity type - confirm against select_facility.
activity_locations_other = select_facility(
    df=activity_chains_other,
    unique_id_col="act_id",
    facilities_gdf=osm_data_gdf,
    row_destination_zone_col="dzone",
    row_activity_type_col="purp",
    gdf_facility_zone_col="OA21CD",
    gdf_facility_type_col="activities",
    gdf_sample_col="floor_area",
    neighboring_zones=zone_neighbors,
    fallback_to_random=True,
)

# Attach the selected location to every secondary leg, keyed by act_id
activity_chains_other = map_activity_locations(
    activity_chains_df=activity_chains_other,
    activity_locations_dict=activity_locations_other,
    id_col="act_id",
)


# --- Analysis: Merging data
logger.info("Merging all activity chains")

# Stack the four per-purpose tables back into one.
# ignore_index=True gives the result a fresh, unique index: the education
# table was re-indexed by its merge while the others were sliced from the
# original legs table, so their row labels can collide. A unique index keeps
# the index-aligned assignments that follow unambiguous.
activity_chains_all = pd.concat(
    [
        activity_chains_home,
        activity_chains_work,
        activity_chains_edu,
        activity_chains_other,
    ],
    ignore_index=True,
)

# Restore chronological order within each household and person
activity_chains_all = activity_chains_all.sort_values(by=["hid", "pid", "seq"])


# --- Analysis: Create start_location_id and start_location_geometry column
logger.info("Creating start_location_id and start_location_geometry columns")

# (start column, end column it is derived from)
location_columns = (
    ("start_location_id", "end_location_id"),
    ("start_location_geometry", "end_location_geometry"),
)

# A leg starts where the same person's previous leg ended: shift the end
# location down by one row within each 'pid'
for start_col, end_col in location_columns:
    activity_chains_all[start_col] = activity_chains_all.groupby("pid")[
        end_col
    ].shift(1)

logger.info("Fill rows where seq = 1 with home location")

# Legs with seq == 1 have no previous leg to inherit from; they start at home.
mask = activity_chains_all["seq"] == 1
# One row per household (first value per column)
activity_chains_home_agg = activity_chains_home.groupby("hid").first().reset_index()
home_by_hid = activity_chains_home_agg.set_index("hid")
first_leg_hid = activity_chains_all.loc[mask, "hid"]

# Look up the household's home location for each seq == 1 leg
for start_col, end_col in location_columns:
    activity_chains_all.loc[mask, start_col] = first_leg_hid.map(home_by_hid[end_col])


# --- Save data

# Columns written to disk: leg attributes plus the start and end locations
output_columns = [
    "pid",
    "hid",
    "ozone",
    "dzone",
    "purp",
    "origin activity",
    "destination activity",
    "mode",
    "seq",
    "tst",
    "tet",
    "duration",
    "start_location_id",
    "start_location_geometry",
    "end_location_id",
    "end_location_geometry",
]
activity_chains_all = activity_chains_all[output_columns]

# save as parquet
# NOTE(review): the two geometry columns are written through to_parquet as
# they are; confirm their values serialise as intended.
output_path = (
    acbm.root_path / "data/processed/activities_pam/legs_with_locations.parquet"
)
activity_chains_all.to_parquet(output_path)


# --- Plots

logger.info("Creating plots")

# Bring in the reported (NTS) trip time and distance for each leg, matched on
# person id and sequence number; the duplicated key column is dropped again
nts_trip_values = spc_with_nts[["id", "seq", "TripTotalTime", "TripDisIncSW"]]
activity_chains_all = activity_chains_all.merge(
    nts_trip_values,
    left_on=["pid", "seq"],
    right_on=["id", "seq"],
    how="left",
)
activity_chains_all = activity_chains_all.drop(columns=["id"])

# Activity types to plot, taken from the 'purp' column.
# NOTE(review): the plots below filter on "destination activity", not 'purp';
# confirm the two columns share the same set of values.
unique_activity_types = activity_chains_all["purp"].unique()

# Plot 1: Euclidian travel distance vs reported (NTS) travel DISTANCE
logger.info("Plotting Euclidian travel distance vs reported (NTS) travel DISTANCE")

# One scatter plot per activity type.
# NOTE(review): "length" is not among the columns kept before saving;
# presumably plot_scatter_actual_reported derives it - confirm.
for activity_type in unique_activity_types:
    plot_scatter_actual_reported(
        activities=activity_chains_all,
        activity_type_col="destination activity",
        activity_type=activity_type,
        x_col="TripDisIncSW",
        x_label="Reported Travel Distance (km)",
        y_col="length",
        y_label="Actual Distance - Euclidian (km)",
        title_prefix=f"Scatter plot of TripDisIncSW vs. Length for {activity_type}",
        save_dir=acbm.root_path / "data/processed/plots/assigning/",
    )

# Plot 2: Euclidian travel distance vs reported (NTS) travel TIME
logger.info("Plotting Euclidian travel distance vs reported (NTS) travel TIME")

# Disabled conversion of `duration` to whole minutes, kept for reference:
# # convert duration to numeric
# activity_chains_all["duration"] = pd.to_timedelta(activity_chains_all["duration"], errors="coerce")
# activity_chains_all['duration'] = activity_chains_all['duration'].apply(lambda x: x + pd.Timedelta(days=1) if x.days < 0 else x)
# activity_chains_all["duration"] = activity_chains_all["duration"].dt.total_seconds() / 60
# activity_chains_all["duration"] = activity_chains_all["duration"].astype(int)


# One scatter plot per activity type. The title names the activity type, as
# the distance plots do, so plots for different activity types can be told
# apart instead of all sharing one title.
for activity_type in unique_activity_types:
    plot_scatter_actual_reported(
        activities=activity_chains_all,
        activity_type=activity_type,
        activity_type_col="destination activity",
        x_col="TripTotalTime",
        y_col="length",
        x_label="Reported Travel TIme (min)",
        y_label="Actual Distance - Euclidian (km)",
        title_prefix=f"Scatter plot of TripTotalTime vs. Length for {activity_type}",
        save_dir=acbm.root_path / "data/processed/plots/assigning/",
    )

# ....

# Plot 3: Desire lines between start and end locations
logger.info("Plotting desire lines between start and end locations")

# One desire-line map per activity type, drawn over the zone boundaries
desire_lines_dir = acbm.root_path / "data/processed/plots/assigning/"

for activity_type in unique_activity_types:
    plot_desire_lines(
        activities=activity_chains_all,
        activity_type=activity_type,
        activity_type_col="destination activity",
        boundaries=boundaries,
        bin_size=5000,
        sample_size=1000,
        save_dir=desire_lines_dir,
    )

0 comments on commit d4da468

Please sign in to comment.