Skip to content

Commit

Permalink
Merge pull request #67 from Urban-Analytics-Technology-Platform/16-fi…
Browse files Browse the repository at this point in the history
…lter-nts-data-to-study-area-to-avoid-unrepresentative-travel-distances-or-mode-share

add filter_by_region fn and migrate region, years, and travday to config
  • Loading branch information
Hussein-Mahfouz authored Nov 19, 2024
2 parents 8a5281b + 93f81d2 commit 2b470b2
Show file tree
Hide file tree
Showing 14 changed files with 409 additions and 283 deletions.
33 changes: 26 additions & 7 deletions config/base.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
[parameters]
seed = 0
region = "leeds"
number_of_households = 5000
zone_id = "OA21CD"
travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography
region = "leeds" # this is used to query poi data from osm and to load in SPC data
number_of_households = 5000 # how many people from the SPC do we want to run the model for? Comment out if you want to run the analysis on the entire SPC population
zone_id = "OA21CD" # "OA21CD": OA level, "MSOA11CD": MSOA level
travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography
boundary_geography = "OA"
# NTS years to use
nts_years = [2019, 2021, 2022]
# NTS regions to use
nts_regions = [
'Yorkshire and the Humber',
'North West',
'North East',
'East Midlands',
'West Midlands',
'East of England',
'South East',
'South West']
# NTS day of the week to use
# 1: Monday, 2: Tuesday, 3: Wednesday, 4: Thursday, 5: Friday, 6: Saturday, 7: Sunday
nts_day_of_week = 3

[matching]
# for optional and required columns, see the [iterative_match_categorical](https://github.com/Urban-Analytics-Technology-Platform/acbm/blob/ca181c54d7484ebe44706ff4b43c26286b22aceb/src/acbm/matching.py#L110) function
# Do not add any column not listed below. You can only move a column from optional to required (or vice versa)
required_columns = ["number_adults", "number_children"]
optional_columns = [
"number_cars",
Expand All @@ -15,10 +32,12 @@ optional_columns = [
"employment_status",
"tenure_status",
]
n_matches = 10
n_matches = 10 # What is the maximum number of NTS matches we want for each SPC household?

[work_assignment]
use_percentages = true
commute_level = "MSOA"
use_percentages = true # if true, optimization problem will try to minimize percentage difference at OD level (not absolute numbers). Recommended to set it to true
# weights to add for each objective in the optimization problem
weight_max_dev = 0.2
weight_total_dev = 0.8
max_zones = 8
max_zones = 8 # maximum number of feasible zones to include in the optimization problem (fewer zones make the problem smaller — so faster — but at the cost of solution quality)
114 changes: 100 additions & 14 deletions notebooks/2_match_households_and_individuals.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -34,7 +34,7 @@
"from acbm.matching import MatcherExact, match_individuals\n",
"from acbm.preprocessing import (\n",
" count_per_group,\n",
" #nts_filter_by_region,\n",
" nts_filter_by_region,\n",
" nts_filter_by_year,\n",
" num_adult_child_hh,\n",
" transform_by_group,\n",
Expand Down Expand Up @@ -470,7 +470,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -487,7 +487,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -539,7 +539,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -587,7 +587,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -635,18 +635,40 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"years = [2019, 2021, 2022]\n",
"years = [2018, 2019, 2021, 2022]\n",
"\n",
"nts_individuals = nts_filter_by_year(nts_individuals, psu, years)\n",
"nts_households = nts_filter_by_year(nts_households, psu, years)\n",
"nts_trips = nts_filter_by_year(nts_trips, psu, years)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of rows in individuals: 52351\n",
"Number of rows in households: 22545\n",
"Number of rows in trips: 630600\n"
]
}
],
"source": [
"# get number of rows in each table\n",
"print(f\"Number of rows in individuals: {nts_individuals.shape[0]}\")\n",
"print(f\"Number of rows in households: {nts_households.shape[0]}\")\n",
"print(f\"Number of rows in trips: {nts_trips.shape[0]}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -658,15 +680,79 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[0;31mSignature:\u001b[0m\n",
"\u001b[0mnts_filter_by_region\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mpsu\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mregions\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m\n",
"Filter the NTS dataframe based on the chosen region(s)\n",
"\n",
"data: pandas DataFrame\n",
" The NTS data to be filtered\n",
"psu: pandas DataFrame\n",
" The Primary Sampling Unit table in the NTS. It has the region assigned to each sample\n",
"regions: list\n",
" The chosen region(s)\n",
"\u001b[0;31mFile:\u001b[0m ~/Documents/GitHub/acbm/src/acbm/preprocessing.py\n",
"\u001b[0;31mType:\u001b[0m function"
]
}
],
"source": [
"?nts_filter_by_region"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# regions = ['Yorkshire and the Humber', 'North West']\n",
"\n",
"# nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)\n",
"# nts_households = nts_filter_by_region(nts_households, psu, regions)\n",
"# nts_trips = nts_filter_by_region(nts_trips, psu, regions)\n"
"# Keep all regions except for London (as it is very different from the rest of the country)\n",
"regions = [\n",
" 'Yorkshire and the Humber', \n",
" 'North West',\n",
" 'North East',\n",
" 'East Midlands',\n",
" 'West Midlands',\n",
" 'East of England',\n",
" 'South East',\n",
" 'South West']\n",
"\n",
"nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)\n",
"nts_households = nts_filter_by_region(nts_households, psu, regions)\n",
"nts_trips = nts_filter_by_region(nts_trips, psu, regions)\n"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of rows in individuals: 27003\n",
"Number of rows in households: 11581\n",
"Number of rows in trips: 369196\n"
]
}
],
"source": [
"# get number of rows in each table\n",
"print(f\"Number of rows in individuals: {nts_individuals.shape[0]}\")\n",
"print(f\"Number of rows in households: {nts_households.shape[0]}\")\n",
"print(f\"Number of rows in trips: {nts_trips.shape[0]}\")"
]
},
{
Expand Down
14 changes: 6 additions & 8 deletions scripts/2_match_households_and_individuals.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from acbm.matching import MatcherExact, match_individuals
from acbm.preprocessing import (
count_per_group,
# nts_filter_by_region,
nts_filter_by_region,
nts_filter_by_year,
num_adult_child_hh,
transform_by_group,
Expand Down Expand Up @@ -244,22 +244,20 @@ def get_interim_path(

logger.info("Filtering NTS data by specified year(s)")

years = [2019, 2021, 2022]
years = config.parameters.nts_years

nts_individuals = nts_filter_by_year(nts_individuals, psu, years)
nts_households = nts_filter_by_year(nts_households, psu, years)
nts_trips = nts_filter_by_year(nts_trips, psu, years)

# #### Filter by geography
#
# I will not do this for categorical matching, as it reduces the sample significantly,
# and leads to more spc households not being matched

# regions = ['Yorkshire and the Humber', 'North West']
regions = config.parameters.nts_regions

# nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)
# nts_households = nts_filter_by_region(nts_households, psu, regions)
# nts_trips = nts_filter_by_region(nts_trips, psu, regions)
nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)
nts_households = nts_filter_by_region(nts_households, psu, regions)
nts_trips = nts_filter_by_region(nts_trips, psu, regions)

# Create dictionaries of key value pairs

Expand Down
8 changes: 6 additions & 2 deletions scripts/3.1_assign_primary_feasible_zones.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def main(config_file):

# Filter to a specific day of the week
logger.info("Filtering activity chains to a specific day of the week")
activity_chains = activity_chains[activity_chains["TravDay"] == 3] # Wednesday
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]

# --- Study area boundaries

Expand Down Expand Up @@ -94,7 +96,9 @@ def main(config_file):
# If travel_times is not true or loading failed, create a new travel time matrix
logger.info("No travel time matrix found. Creating a new travel time matrix.")
# Create a new travel time matrix based on distances between zones
travel_times = zones_to_time_matrix(zones=boundaries, id_col=config.zone_id)
travel_times = zones_to_time_matrix(
zones=boundaries, id_col=config.zone_id, time_units="m"
)
logger.info("Travel time estimates created")
# save travel_times as parquet

Expand Down
4 changes: 3 additions & 1 deletion scripts/3.2.1_assign_primary_zone_edu.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ def main(config_file):
logger.info("Loading activity chains")

activity_chains = activity_chains_for_assignment(columns=cols_for_assignment_edu())
activity_chains = activity_chains[activity_chains["TravDay"] == 3] # Wednesday
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]

logger.info("Filtering activity chains for trip purpose: education")
activity_chains_edu = activity_chains[activity_chains["dact"] == "education"]
Expand Down
4 changes: 3 additions & 1 deletion scripts/3.2.2_assign_primary_zone_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ def main(config_file):
# --- Activity chains
activity_chains = activity_chains_for_assignment(cols_for_assignment_work())
activity_chains = add_locations_to_activity_chains(activity_chains)
activity_chains = activity_chains[activity_chains["TravDay"] == 3] # Wednesday
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]

activity_chains_work = activity_chains[activity_chains["dact"] == "work"]

Expand Down
4 changes: 3 additions & 1 deletion scripts/3.2.3_assign_secondary_zone.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ def main(config_file):
logger.info("Loading: activity chains")

activity_chains = activity_chains_for_assignment()
activity_chains = activity_chains[activity_chains["TravDay"] == 3] # Wednesday
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]

# --- Add OA21CD to the data

Expand Down
Loading

0 comments on commit 2b470b2

Please sign in to comment.