From 978b857293d0a7e06be33ea2bc6ae2ea4a371e74 Mon Sep 17 00:00:00 2001
From: Jake Adams
Date: Mon, 3 Jun 2024 15:48:26 -0600
Subject: [PATCH] feat: deduplicate facility records per calendar year

---
 src/wmrc/helpers.py   |  40 ++++++++++++++--
 src/wmrc/main.py      |   4 ++
 tests/test_helpers.py | 104 ++++++++++++++++++++++++++++++++++++++++++
 tests/test_wmrc.py    |   1 -
 4 files changed, 143 insertions(+), 6 deletions(-)

diff --git a/src/wmrc/helpers.py b/src/wmrc/helpers.py
index f8fb410..52e5b33 100644
--- a/src/wmrc/helpers.py
+++ b/src/wmrc/helpers.py
@@ -68,12 +68,15 @@ def _build_columns_string(self) -> str:
         Returns:
             str: A comma-delimited string of needed columns for the SOQL query
         """
+        additional_fields = [
+            "RecordTypeId",
+            "Classifications__c",
+            "RecordType.Name",
+            "Facility__r.Solid_Waste_Facility_ID_Number__c",
+            "LastModifiedDate",
+        ]
 
-        fields_string = ",".join(self.field_mapping.values())
-        fields_string += (
-            ",RecordTypeId,Classifications__c,RecordType.Name,Facility__r.Solid_Waste_Facility_ID_Number__c"
-        )
-        fields_string += "," + ",".join(self.county_fields)
+        fields_string = ",".join(list(self.field_mapping.values()) + additional_fields + self.county_fields)
 
         return fields_string
 
@@ -165,3 +168,30 @@ def _build_field_mapping(self):
 
         if missing_fields:
             raise ValueError(f"Missing fields: {missing_fields}")
+
+    def deduplicate_records_on_facility_id(self) -> Mapping[str, str]:
+        """Deduplicate all facilities' records, dropping all but the latest modified record per Calendar_Year__c.
+
+        Returns:
+            Mapping[str, str]: Dictionary of facility ids: calendar years that had duplicate records - {"SW0123":
+                "2022, 2023", etc}
+        """
+
+        #: {"SW0123": "2022, 2023", etc}
+        duplicated_facility_ids = {
+            facility_id: ", ".join(str(year) for year in years)
+            for facility_id, years in self.df[
+                self.df.duplicated(subset=["facility_id", "Calendar_Year__c"], keep=False)
+            ]
+            .groupby("facility_id")["Calendar_Year__c"]
+            .unique()
+            .items()
+        }
+
+        #: Stable sort with undated rows first so the most recently modified record is the one kept
+        self.df["LastModifiedDate"] = pd.to_datetime(self.df["LastModifiedDate"])
+        self.df = self.df.sort_values("LastModifiedDate", kind="stable", na_position="first").drop_duplicates(
+            subset=["facility_id", "Calendar_Year__c"], keep="last"
+        )
+
+        return duplicated_facility_ids
diff --git a/src/wmrc/main.py b/src/wmrc/main.py
index 791405c..60e4fc2 100644
--- a/src/wmrc/main.py
+++ b/src/wmrc/main.py
@@ -143,6 +143,7 @@ def process(self):
         #: Load data from Salesforce and generate analyses using Summarize methods
         self.skid_logger.info("Loading records from Salesforce...")
         records = self._load_salesforce_data()
+        duplicate_facility_ids = records.deduplicate_records_on_facility_id()
         facility_summary_df = summarize.facilities(records).query("data_year == @config.YEAR")
         county_summary_df = summarize.counties(records)
         materials_recycled_df = summarize.materials_recycled(records)
@@ -198,6 +199,9 @@ def process(self):
             f"Materials composted rows loaded: {composting_count}",
             f"Statewide metrics rows loaded: {statewide_count}",
         ]
+        if duplicate_facility_ids:
+            summary_rows.insert(7, "Duplicate facility IDs per calendar year:")
+            summary_rows.insert(8, "\t" + "\n\t".join(f"{k}: {v}" for k, v in duplicate_facility_ids.items()))
 
         summary_message.message = "\n".join(summary_rows)
         summary_message.attachments = self.tempdir_path / self.log_name
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 9795967..5756ea3 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -1,4 +1,5 @@
 import pandas as pd
+
 from wmrc import helpers
 
 
@@ -15,3 +16,106 @@ def test_add_bogus_geometries_happy_path(self):
         result_df = helpers.add_bogus_geometries(input_df)
 
         assert result_df.spatial.validate()
+
+
+class TestSalesForceRecords:
+
+    def test_build_columns_string_happy_path(self, mocker):
+        salesforce_records = mocker.Mock()
+
+        salesforce_records.field_mapping = {
+            "a": "b",
+            "c": "d",
+        }
+        salesforce_records.county_fields = ["foo", "bar"]
+
+        result = helpers.SalesForceRecords._build_columns_string(salesforce_records)
+
+        assert (
+            result
+            == "b,d,RecordTypeId,Classifications__c,RecordType.Name,Facility__r.Solid_Waste_Facility_ID_Number__c,LastModifiedDate,foo,bar"
+        )
+
+    def test_deduplicate_records_on_facility_id_single_year(self, mocker):
+        salesforce_records = mocker.Mock()
+        salesforce_records.df = pd.DataFrame(
+            {
+                "facility_id": ["1", "2", "1"],
+                "LastModifiedDate": ["2022-01-01", "2022-01-02", "2022-01-03"],
+                "a": [1, 2, 3],
+                "Calendar_Year__c": "2022",
+            }
+        )
+
+        duplicate_ids = helpers.SalesForceRecords.deduplicate_records_on_facility_id(salesforce_records)
+
+        expected_df = pd.DataFrame(
+            {
+                "facility_id": ["2", "1"],
+                "LastModifiedDate": ["2022-01-02", "2022-01-03"],
+                "a": [2, 3],
+                "Calendar_Year__c": "2022",
+            },
+            index=[1, 2],
+        )
+        expected_df["LastModifiedDate"] = pd.to_datetime(expected_df["LastModifiedDate"])
+
+        pd.testing.assert_frame_equal(salesforce_records.df, expected_df)
+        assert duplicate_ids == {"1": "2022"}
+
+    def test_deduplicate_records_on_facility_id_keeps_multiple_years(self, mocker):
+        salesforce_records = mocker.Mock()
+        salesforce_records.df = pd.DataFrame(
+            {
+                "facility_id": ["1", "2", "1", "2"],
+                "LastModifiedDate": ["2022-01-01", "2022-01-02", "2023-01-03", "2023-12-02"],
+                "a": [1, 2, 3, 4],
+                "Calendar_Year__c": ["2022", "2022", "2022", "2023"],
+            }
+        )
+
+        duplicate_ids = helpers.SalesForceRecords.deduplicate_records_on_facility_id(salesforce_records)
+
+        expected_df = pd.DataFrame(
+            {
+                "facility_id": ["2", "1", "2"],
+                "LastModifiedDate": ["2022-01-02", "2023-01-03", "2023-12-02"],
+                "a": [2, 3, 4],
+                "Calendar_Year__c": ["2022", "2022", "2023"],
+            },
+            index=[1, 2, 3],
+        )
+        expected_df["LastModifiedDate"] = pd.to_datetime(expected_df["LastModifiedDate"])
+
+        pd.testing.assert_frame_equal(salesforce_records.df, expected_df)
+        assert duplicate_ids == {"1": "2022"}
+
+    def test_deduplicate_records_on_facility_id_keeps_modified_date_later_than_calendar_year(self, mocker):
+        salesforce_records = mocker.Mock()
+        salesforce_records.df = pd.DataFrame(
+            {
+                "facility_id": ["1", "1", "1"],
+                "LastModifiedDate": ["2022-01-01", "2023-01-02", "2024-01-03"],
+                "a": [1, 2, 3],
+                "Calendar_Year__c": ["2022", "2023", "2023"],
+            }
+        )
+
+        duplicate_ids = helpers.SalesForceRecords.deduplicate_records_on_facility_id(salesforce_records)
+
+        expected_df = pd.DataFrame(
+            {
+                "facility_id": [
+                    "1",
+                    "1",
+                ],
+                "LastModifiedDate": ["2022-01-01", "2024-01-03"],
+                "a": [1, 3],
+                "Calendar_Year__c": ["2022", "2023"],
+            },
+            index=[0, 2],
+        )
+        expected_df["LastModifiedDate"] = pd.to_datetime(expected_df["LastModifiedDate"])
+
+        pd.testing.assert_frame_equal(salesforce_records.df, expected_df)
+        assert duplicate_ids == {"1": "2023"}
diff --git a/tests/test_wmrc.py b/tests/test_wmrc.py
index 6b226ec..fa59f82 100644
--- a/tests/test_wmrc.py
+++ b/tests/test_wmrc.py
@@ -1,5 +1,4 @@
 import pandas as pd
-
 from wmrc import main
 
 