Skip to content

Commit

Permalink
Merge pull request #67 from Urban-Analytics-Technology-Platform/16-fi…
Browse files Browse the repository at this point in the history
…lter-nts-data-to-study-area-to-avoid-unrepresentative-travel-distances-or-mode-share

add filter_by_region fn and migrate region, years, and travday to config
  • Loading branch information
Hussein-Mahfouz authored Nov 19, 2024
2 parents 8a5281b + 93f81d2 commit 2b470b2
Show file tree
Hide file tree
Showing 14 changed files with 409 additions and 283 deletions.
33 changes: 26 additions & 7 deletions config/base.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
[parameters]
seed = 0
region = "leeds"
number_of_households = 5000
zone_id = "OA21CD"
travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography
region = "leeds" # this is used to query poi data from osm and to load in SPC data
number_of_households = 5000 # how many people from the SPC do we want to run the model for? Comment out if you want to run the analysis on the entire SPC population
zone_id = "OA21CD" # "OA21CD": OA level, "MSOA11CD": MSOA level
travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography
boundary_geography = "OA"
# NTS years to use
nts_years = [2019, 2021, 2022]
# NTS regions to use
nts_regions = [
'Yorkshire and the Humber',
'North West',
'North East',
'East Midlands',
'West Midlands',
'East of England',
'South East',
'South West']
# NTS day of the week to use
# 1: Monday, 2: Tuesday, 3: Wednesday, 4: Thursday, 5: Friday, 6: Saturday, 7: Sunday
nts_day_of_week = 3

[matching]
# for optional and required columns, see the [iterative_match_categorical](https://github.com/Urban-Analytics-Technology-Platform/acbm/blob/ca181c54d7484ebe44706ff4b43c26286b22aceb/src/acbm/matching.py#L110) function
# Do not add any column not listed below. You can only move a column from optional to required (or vice versa)
required_columns = ["number_adults", "number_children"]
optional_columns = [
"number_cars",
Expand All @@ -15,10 +32,12 @@ optional_columns = [
"employment_status",
"tenure_status",
]
n_matches = 10
n_matches = 10 # What is the maximum number of NTS matches we want for each SPC household?

[work_assignment]
use_percentages = true
commute_level = "MSOA"
use_percentages = true # if true, optimization problem will try to minimize percentage difference at OD level (not absolute numbers). Recommended to set it to true
# weights to add for each objective in the optimization problem
weight_max_dev = 0.2
weight_total_dev = 0.8
max_zones = 8
max_zones = 8 # maximum number of feasible zones to include in the optimization problem (fewer zones make the problem smaller — so faster — but at the cost of solution quality)
114 changes: 100 additions & 14 deletions notebooks/2_match_households_and_individuals.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -34,7 +34,7 @@
"from acbm.matching import MatcherExact, match_individuals\n",
"from acbm.preprocessing import (\n",
" count_per_group,\n",
" #nts_filter_by_region,\n",
" nts_filter_by_region,\n",
" nts_filter_by_year,\n",
" num_adult_child_hh,\n",
" transform_by_group,\n",
Expand Down Expand Up @@ -470,7 +470,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -487,7 +487,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -539,7 +539,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -587,7 +587,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -635,18 +635,40 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"years = [2019, 2021, 2022]\n",
"years = [2018, 2019, 2021, 2022]\n",
"\n",
"nts_individuals = nts_filter_by_year(nts_individuals, psu, years)\n",
"nts_households = nts_filter_by_year(nts_households, psu, years)\n",
"nts_trips = nts_filter_by_year(nts_trips, psu, years)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of rows in individuals: 52351\n",
"Number of rows in households: 22545\n",
"Number of rows in trips: 630600\n"
]
}
],
"source": [
"# get number of rows in each table\n",
"print(f\"Number of rows in individuals: {nts_individuals.shape[0]}\")\n",
"print(f\"Number of rows in households: {nts_households.shape[0]}\")\n",
"print(f\"Number of rows in trips: {nts_trips.shape[0]}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -658,15 +680,79 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[0;31mSignature:\u001b[0m\n",
"\u001b[0mnts_filter_by_region\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mpsu\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mregions\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m\n",
"Filter the NTS dataframe based on the chosen region(s)\n",
"\n",
"data: pandas DataFrame\n",
" The NTS data to be filtered\n",
"psu: pandas DataFrame\n",
" The Primary Sampling Unit table in the NTS. It has the region assigned to each sample\n",
"regions: list\n",
" The chosen region(s)\n",
"\u001b[0;31mFile:\u001b[0m ~/Documents/GitHub/acbm/src/acbm/preprocessing.py\n",
"\u001b[0;31mType:\u001b[0m function"
]
}
],
"source": [
"?nts_filter_by_region"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# regions = ['Yorkshire and the Humber', 'North West']\n",
"\n",
"# nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)\n",
"# nts_households = nts_filter_by_region(nts_households, psu, regions)\n",
"# nts_trips = nts_filter_by_region(nts_trips, psu, regions)\n"
"# Keep all regions except for London (as it is very different from the rest of the country)\n",
"regions = [\n",
" 'Yorkshire and the Humber', \n",
" 'North West',\n",
" 'North East',\n",
" 'East Midlands',\n",
" 'West Midlands',\n",
" 'East of England',\n",
" 'South East',\n",
" 'South West']\n",
"\n",
"nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)\n",
"nts_households = nts_filter_by_region(nts_households, psu, regions)\n",
"nts_trips = nts_filter_by_region(nts_trips, psu, regions)\n"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of rows in individuals: 27003\n",
"Number of rows in households: 11581\n",
"Number of rows in trips: 369196\n"
]
}
],
"source": [
"# get number of rows in each table\n",
"print(f\"Number of rows in individuals: {nts_individuals.shape[0]}\")\n",
"print(f\"Number of rows in households: {nts_households.shape[0]}\")\n",
"print(f\"Number of rows in trips: {nts_trips.shape[0]}\")"
]
},
{
Expand Down
14 changes: 6 additions & 8 deletions scripts/2_match_households_and_individuals.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from acbm.matching import MatcherExact, match_individuals
from acbm.preprocessing import (
count_per_group,
# nts_filter_by_region,
nts_filter_by_region,
nts_filter_by_year,
num_adult_child_hh,
transform_by_group,
Expand Down Expand Up @@ -244,22 +244,20 @@ def get_interim_path(

logger.info("Filtering NTS data by specified year(s)")

years = [2019, 2021, 2022]
years = config.parameters.nts_years

nts_individuals = nts_filter_by_year(nts_individuals, psu, years)
nts_households = nts_filter_by_year(nts_households, psu, years)
nts_trips = nts_filter_by_year(nts_trips, psu, years)

# #### Filter by geography
#
# I will not do this for categorical matching, as it reduces the sample significantly,
# and leads to more spc households not being matched

# regions = ['Yorkshire and the Humber', 'North West']
regions = config.parameters.nts_regions

# nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)
# nts_households = nts_filter_by_region(nts_households, psu, regions)
# nts_trips = nts_filter_by_region(nts_trips, psu, regions)
nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)
nts_households = nts_filter_by_region(nts_households, psu, regions)
nts_trips = nts_filter_by_region(nts_trips, psu, regions)

# Create dictionaries of key value pairs

Expand Down
8 changes: 6 additions & 2 deletions scripts/3.1_assign_primary_feasible_zones.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def main(config_file):

# Filter to a specific day of the week
logger.info("Filtering activity chains to a specific day of the week")
activity_chains = activity_chains[activity_chains["TravDay"] == 3] # Wednesday
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]

# --- Study area boundaries

Expand Down Expand Up @@ -94,7 +96,9 @@ def main(config_file):
# If travel_times is not true or loading failed, create a new travel time matrix
logger.info("No travel time matrix found. Creating a new travel time matrix.")
# Create a new travel time matrix based on distances between zones
travel_times = zones_to_time_matrix(zones=boundaries, id_col=config.zone_id)
travel_times = zones_to_time_matrix(
zones=boundaries, id_col=config.zone_id, time_units="m"
)
logger.info("Travel time estimates created")
# save travel_times as parquet

Expand Down
4 changes: 3 additions & 1 deletion scripts/3.2.1_assign_primary_zone_edu.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ def main(config_file):
logger.info("Loading activity chains")

activity_chains = activity_chains_for_assignment(columns=cols_for_assignment_edu())
activity_chains = activity_chains[activity_chains["TravDay"] == 3] # Wednesday
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]

logger.info("Filtering activity chains for trip purpose: education")
activity_chains_edu = activity_chains[activity_chains["dact"] == "education"]
Expand Down
4 changes: 3 additions & 1 deletion scripts/3.2.2_assign_primary_zone_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ def main(config_file):
# --- Activity chains
activity_chains = activity_chains_for_assignment(cols_for_assignment_work())
activity_chains = add_locations_to_activity_chains(activity_chains)
activity_chains = activity_chains[activity_chains["TravDay"] == 3] # Wednesday
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]

activity_chains_work = activity_chains[activity_chains["dact"] == "work"]

Expand Down
4 changes: 3 additions & 1 deletion scripts/3.2.3_assign_secondary_zone.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ def main(config_file):
logger.info("Loading: activity chains")

activity_chains = activity_chains_for_assignment()
activity_chains = activity_chains[activity_chains["TravDay"] == 3] # Wednesday
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]

# --- Add OA21CD to the data

Expand Down
Loading

0 comments on commit 2b470b2

Please sign in to comment.