chore: cleanup, automate validator year
jacobdadams committed Aug 9, 2024
1 parent 6cf933a commit 6a3d1a4
Showing 3 changed files with 19 additions and 74 deletions.
README.md: 17 additions & 2 deletions
@@ -2,9 +2,9 @@

[![Push Events](https://github.com/agrc/wmrc-skid/actions/workflows/push.yml/badge.svg)](https://github.com/agrc/wmrc-skid/actions/workflows/push.yml)

- An automated updater for updating the hosted feature services behind the Department of Waste Management and Radiation Control (WMRC)'s recycling facilities [map](https://deq.utah.gov/waste-management-and-radiation-control/statewide-recycling-data-initiative) and dashboard.
+ An automated updater for updating the hosted feature services behind the Department of Waste Management and Radiation Control (WMRC)'s recycling facilities [map](https://deq.utah.gov/waste-management-and-radiation-control/statewide-recycling-data-initiative) and dashboard, and for running validation analyses three times a year.

- ## Overview
+ ## Facilities Update

The map is an Experience Builder app that lives in DEQ's AGOL org. The dashboard is currently being built in their org as well. This skid updates five hosted feature services in their org:

@@ -17,3 +17,18 @@
It pulls data from a Google Sheet (facility ids, used oil collection center [UOCC] locations and amounts) and DEQ's Salesforce organization.

It is designed to run weekly as a Google Cloud Function.

## Validation Script

The validation script compares year-over-year changes for different metrics at the facility, county, and state levels to help WMRC staff identify potential typos, missing information, or other problems with the data.
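
As a rough standalone sketch of the idea (the real logic lives in `src/wmrc/validate.py` and is more involved), with hypothetical data and column names:

```python
import pandas as pd

#: Hypothetical long-format metrics: tons recycled per facility per year.
metrics = pd.DataFrame({
    "facility": ["A", "A", "B", "B"],
    "year": [2022, 2023, 2022, 2023],
    "tons": [100.0, 250.0, 80.0, 82.0],
})

base_year = 2023  #: the reporting year being validated
wide = metrics.pivot(index="facility", columns="year", values="tons")

report = pd.DataFrame({
    "tons_pct_change": (wide[base_year] - wide[base_year - 1]) / wide[base_year - 1] * 100,
    f"tons_{base_year}": wide[base_year],
    f"tons_{base_year - 1}": wide[base_year - 1],
    "tons_diff": wide[base_year] - wide[base_year - 1],
})

#: Facility A's 150% jump stands out as something for staff to double-check.
print(report)
```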

It runs on the following schedule:

- April 1 of each year: First check
- May 1 of each year: Check for go-live
- May 20 of each year: Data from previous year live on map (validation script doesn't run, but reminder for us to change the year value in config.py and any needed filters on the map/dashboard)
- June 1 of each year: Final check

## Multiple Schedules, One Function

This skid is deployed as a single gen2 Cloud Function with two different schedules. Each Cloud Schedule should contain a different message-body: `'facility updates'` to trigger the feature service updating, and `'validate'` to trigger the validator script.
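
A sketch of how the two schedules might be wired up (the job names, topic, and cron strings below are assumptions; the actual deployment values aren't part of this commit). The first job covers the weekly facility updates, the second the April/May/June validation runs:

```sh
# Hypothetical job names, topic, and cron strings; only the message body
# actually distinguishes the two triggers.
gcloud scheduler jobs create pubsub wmrc-facility-update \
    --schedule="0 3 * * 1" \
    --topic="wmrc-skid" \
    --message-body="facility updates"

gcloud scheduler jobs create pubsub wmrc-validate \
    --schedule="0 3 1 4,5,6 *" \
    --topic="wmrc-skid" \
    --message-body="validate"
```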
src/wmrc/main.py: 2 additions & 34 deletions
@@ -419,44 +419,13 @@ def _load_salesforce_data(self) -> helpers.SalesForceRecords:
        return salesforce_records


# def main(event, context): # pylint: disable=unused-argument
# """Entry point for Google Cloud Function triggered by pub/sub event

# Args:
# event (dict): The dictionary with data specific to this type of
# event. The `@type` field maps to
# `type.googleapis.com/google.pubsub.v1.PubsubMessage`.
# The `data` field maps to the PubsubMessage data
# in a base64-encoded string. The `attributes` field maps
# to the PubsubMessage attributes if any is present.
# context (google.cloud.functions.Context): Metadata of triggering event
# including `event_id` which maps to the PubsubMessage
# messageId, `timestamp` which maps to the PubsubMessage
# publishTime, `event_type` which maps to
# `google.pubsub.topic.publish`, and `resource` which is
# a dictionary that describes the service API endpoint
# pubsub.googleapis.com, the triggering topic's name, and
# the triggering event type
# `type.googleapis.com/google.pubsub.v1.PubsubMessage`.
# Returns:
# None. The output is written to Cloud Logging.
# """

# #: This function must be called 'main' to act as the Google Cloud Function entry point. It must accept the two
# #: arguments listed, but doesn't have to do anything with them (I haven't used them in anything yet).

# #: Call process() and any other functions you want to be run as part of the skid here.
# wmrc_skid = Skid()
# wmrc_skid.process()


def run_validation():

    start = datetime.now()

    wmrc_skid = Skid()

-    base_year = 2023
+    base_year = date.today().year - 1
    report_path = wmrc_skid.tempdir_path / f"validation_{date.today()}.csv"

    wmrc_skid.skid_logger.debug("Loading salesforce data...")
@@ -466,8 +435,6 @@ def run_validation():
    county_summary_df = summarize.counties(records)

    wmrc_skid.skid_logger.debug("Year-over-year changes...")
-
-    #: Calc year-over-year changes
    facility_changes = validate.facility_year_over_year(facility_summary_df, records.df, base_year)
    county_changes = validate.county_year_over_year(county_summary_df, base_year)
    state_changes = validate.state_year_over_year(county_summary_df, base_year)
@@ -533,6 +500,7 @@ def subscribe(cloud_event: CloudEvent) -> None:
    #: This function must be called 'subscribe' to act as the Google Cloud Function entry point. It must accept the
    #: CloudEvent object as the only argument.

+    #: Use the message-body value from the pub/sub event to figure out which process to run.
    if base64.b64decode(cloud_event.data["message"]["data"]).decode() == "facility updates":
        wmrc_skid = Skid()
        wmrc_skid.process()
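
The diff folds the rest of the function, but per the README's description of the two message bodies, the full dispatch presumably looks something like this sketch (a reconstruction, not the verbatim source):

```python
import base64

from cloudevents.http import CloudEvent  #: assumed import; not shown in this diff


def subscribe_sketch(cloud_event: CloudEvent) -> None:
    #: Route on the message body published by the matching Cloud Scheduler job.
    message = base64.b64decode(cloud_event.data["message"]["data"]).decode()
    if message == "facility updates":
        Skid().process()
    elif message == "validate":
        run_validation()
```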
src/wmrc/validate.py: 0 additions & 38 deletions
@@ -132,41 +132,3 @@ def _year_over_year_changes(metrics_df: pd.DataFrame, current_year: int) -> pd.DataFrame:
    return everything[
        list(interleave([pct_change.columns, values_current_year.columns, values_previous_year.columns, diffs.columns]))
    ]
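
For reference, the `interleave` call above round-robins across the four column groups, so each metric's pct-change, current-year, previous-year, and diff columns end up adjacent in the report. A tiny illustration, assuming it is `toolz.interleave` (the import isn't visible in this diff):

```python
from toolz import interleave  #: assumed source of the interleave used above

pct = ["rate_pct_change", "tons_pct_change"]
current = ["rate_2023", "tons_2023"]
previous = ["rate_2022", "tons_2022"]
diff = ["rate_diff", "tons_diff"]

print(list(interleave([pct, current, previous, diff])))
#: ['rate_pct_change', 'rate_2023', 'rate_2022', 'rate_diff',
#:  'tons_pct_change', 'tons_2023', 'tons_2022', 'tons_diff']
```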


# def run_validations():

# base_year = 2023
# report_path = r"c:\gis\projects\wmrc\data\from_sf\validation_2.csv"

# #: Get records from salesforce, run summary methods
# wmrc_skid = Skid()
# records = wmrc_skid._load_salesforce_data()
# _ = records.deduplicate_records_on_facility_id()
# facility_summary_df = summarize.facility_metrics(records)
# county_summary_df = summarize.counties(records)

# #: Calc year-over-year changes
# facility_changes = facility_year_over_year(facility_summary_df, records.df, base_year)
# county_changes = county_year_over_year(county_summary_df, base_year)
# state_changes = state_year_over_year(county_summary_df, base_year)

# #: Remove county-wide and statewide prefixes so we can concat the different change dfs by row
# county_changes.rename(
# columns={col: col.replace("county_wide_", "") for col in county_changes.columns}, inplace=True
# )
# state_changes.rename(columns={col: col.replace("statewide_", "") for col in state_changes.columns}, inplace=True)

# all_changes = pd.concat([facility_changes, county_changes, state_changes], axis=0)

# #: Move the msw_recycling_rate columns to the front, write to csv
# index_a = all_changes.columns.get_loc("msw_recycling_rate_pct_change")
# slice_b = all_changes.columns.slice_indexer("msw_recycling_rate_pct_change", "msw_recycling_rate_diff")
# index_c = all_changes.columns.get_loc("msw_recycling_rate_diff") + 1
# new_index = all_changes.columns[slice_b].append([all_changes.columns[:index_a], all_changes.columns[index_c:]])

# all_changes.reindex(columns=new_index).to_csv(report_path)


# if __name__ == "__main__":
# run_validations()
