feat: deduplicate facility records per calendar year
jacobdadams committed Jun 3, 2024
1 parent 0de1158 commit 978b857
Showing 4 changed files with 143 additions and 6 deletions.
40 changes: 35 additions & 5 deletions src/wmrc/helpers.py
@@ -68,12 +68,15 @@ def _build_columns_string(self) -> str:
Returns:
str: A comma-delimited string of needed columns for the SOQL query
"""
additional_fields = [
"RecordTypeId",
"Classifications__c",
"RecordType.Name",
"Facility__r.Solid_Waste_Facility_ID_Number__c",
"LastModifiedDate",
]

fields_string = ",".join(self.field_mapping.values())
fields_string += (
",RecordTypeId,Classifications__c,RecordType.Name,Facility__r.Solid_Waste_Facility_ID_Number__c"
)
fields_string += "," + ",".join(self.county_fields)
fields_string = ",".join(list(self.field_mapping.values()) + additional_fields + self.county_fields)

return fields_string
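
For illustration, a minimal sketch of how the refactored column string might feed a SOQL SELECT. The field-mapping values, county columns, and the Facility_Report__c object name below are assumptions for the example, not taken from this commit:

additional_fields = [
    "RecordTypeId",
    "Classifications__c",
    "RecordType.Name",
    "Facility__r.Solid_Waste_Facility_ID_Number__c",
    "LastModifiedDate",
]
field_mapping = {"facility_name": "Facility_Name__c"}  #: hypothetical mapping values
county_fields = ["Box_Elder__c", "Cache__c"]  #: hypothetical county columns

fields_string = ",".join(list(field_mapping.values()) + additional_fields + county_fields)
query = f"SELECT {fields_string} FROM Facility_Report__c"  #: object name is an assumption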

@@ -165,3 +168,30 @@ def _build_field_mapping(self):

if missing_fields:
raise ValueError(f"Missing fields: {missing_fields}")

def deduplicate_records_on_facility_id(self) -> Mapping[str, str]:
"""Deduplicate all facilities' records, dropping all but the latest modified record per Calendar_Year__c.
Returns:
Mapping[str, str]: Dictionary of facility ids: calendar years that had duplicate records - {"SW0123":
"2022, 2023", etc}
"""

#: {"SW0123": "2022, 2023", etc}
duplicated_facility_ids = {
facility_id: ", ".join(years)
for facility_id, years in self.df[
self.df.duplicated(subset=["facility_id", "Calendar_Year__c"], keep=False)
]
.groupby("facility_id")["Calendar_Year__c"]
.unique()
.items()
}

#: Sort by last updated time and keep the most recent record
self.df["LastModifiedDate"] = pd.to_datetime(self.df["LastModifiedDate"])
self.df = self.df.sort_values("LastModifiedDate").drop_duplicates(
subset=["facility_id", "Calendar_Year__c"], keep="last"
)

return duplicated_facility_ids
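
As a rough, self-contained sketch of the behavior (mirroring the logic above on toy data; the column values are made up):

import pandas as pd

df = pd.DataFrame(
    {
        "facility_id": ["SW01", "SW01", "SW02"],
        "Calendar_Year__c": ["2022", "2022", "2022"],
        "LastModifiedDate": ["2022-01-01", "2022-06-01", "2022-03-01"],
        "tons": [10, 12, 5],
    }
)

#: Same duplicate detection as above: yields {"SW01": "2022"}
duplicates = {
    facility_id: ", ".join(years)
    for facility_id, years in df[df.duplicated(subset=["facility_id", "Calendar_Year__c"], keep=False)]
    .groupby("facility_id")["Calendar_Year__c"]
    .unique()
    .items()
}

#: Only the most recently modified SW01 row (tons == 12) survives alongside SW02
df["LastModifiedDate"] = pd.to_datetime(df["LastModifiedDate"])
deduped = df.sort_values("LastModifiedDate").drop_duplicates(
    subset=["facility_id", "Calendar_Year__c"], keep="last"
)
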
4 changes: 4 additions & 0 deletions src/wmrc/main.py
@@ -143,6 +143,7 @@ def process(self):
#: Load data from Salesforce and generate analyses using Summarize methods
self.skid_logger.info("Loading records from Salesforce...")
records = self._load_salesforce_data()
duplicate_facility_ids = records.deduplicate_records_on_facility_id()
facility_summary_df = summarize.facilities(records).query("data_year == @config.YEAR")
county_summary_df = summarize.counties(records)
materials_recycled_df = summarize.materials_recycled(records)
@@ -198,6 +199,9 @@
f"Materials composted rows loaded: {composting_count}",
f"Statewide metrics rows loaded: {statewide_count}",
]
if duplicate_facility_ids:
summary_rows.insert(7, "Duplicate facility IDs per calendar year:")
summary_rows.insert(8, "\t" + "\n\t".join(f"{k}: {v}" for k, v in duplicate_facility_ids.items()))

summary_message.message = "\n".join(summary_rows)
summary_message.attachments = self.tempdir_path / self.log_name
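
A small sketch of how the duplicate report reads once folded into the summary message; the duplicate dictionary and surrounding status rows are hypothetical stand-ins:

duplicate_facility_ids = {"SW0123": "2022, 2023", "SW0456": "2023"}  #: hypothetical values
summary_rows = [f"status row {i}" for i in range(9)]  #: stand-ins for the rows built above

if duplicate_facility_ids:
    summary_rows.insert(7, "Duplicate facility IDs per calendar year:")
    summary_rows.insert(8, "\t" + "\n\t".join(f"{k}: {v}" for k, v in duplicate_facility_ids.items()))

print("\n".join(summary_rows))
#: prints status rows 0-6, then:
#: Duplicate facility IDs per calendar year:
#:     SW0123: 2022, 2023
#:     SW0456: 2023
#: followed by the remaining status rows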
104 changes: 104 additions & 0 deletions tests/test_helpers.py
@@ -1,4 +1,5 @@
import pandas as pd

from wmrc import helpers


@@ -15,3 +16,106 @@ def test_add_bogus_geometries_happy_path(self):
result_df = helpers.add_bogus_geometries(input_df)

assert result_df.spatial.validate()


class TestSalesForceRecords:

def test_build_columns_string_happy_path(self, mocker):
salesforce_records = mocker.Mock()

salesforce_records.field_mapping = {
"a": "b",
"c": "d",
}
salesforce_records.county_fields = ["foo", "bar"]

result = helpers.SalesForceRecords._build_columns_string(salesforce_records)

assert (
result
== "b,d,RecordTypeId,Classifications__c,RecordType.Name,Facility__r.Solid_Waste_Facility_ID_Number__c,LastModifiedDate,foo,bar"
)

def test_deduplicate_records_on_facility_id_single_year(self, mocker):
salesforce_records = mocker.Mock()
salesforce_records.df = pd.DataFrame(
{
"facility_id": ["1", "2", "1"],
"LastModifiedDate": ["2022-01-01", "2022-01-02", "2022-01-03"],
"a": [1, 2, 3],
"Calendar_Year__c": "2022",
}
)

duplicate_ids = helpers.SalesForceRecords.deduplicate_records_on_facility_id(salesforce_records)

expected_df = pd.DataFrame(
{
"facility_id": ["2", "1"],
"LastModifiedDate": ["2022-01-02", "2022-01-03"],
"a": [2, 3],
"Calendar_Year__c": "2022",
},
index=[1, 2],
)
expected_df["LastModifiedDate"] = pd.to_datetime(expected_df["LastModifiedDate"])

pd.testing.assert_frame_equal(salesforce_records.df, expected_df)
assert duplicate_ids == {"1": "2022"}

def test_deduplicate_records_on_facility_id_keeps_multiple_years(self, mocker):
salesforce_records = mocker.Mock()
salesforce_records.df = pd.DataFrame(
{
"facility_id": ["1", "2", "1", "2"],
"LastModifiedDate": ["2022-01-01", "2022-01-02", "2023-01-03", "2023-12-02"],
"a": [1, 2, 3, 4],
"Calendar_Year__c": ["2022", "2022", "2022", "2023"],
}
)

duplicate_ids = helpers.SalesForceRecords.deduplicate_records_on_facility_id(salesforce_records)

expected_df = pd.DataFrame(
{
"facility_id": ["2", "1", "2"],
"LastModifiedDate": ["2022-01-02", "2023-01-03", "2023-12-02"],
"a": [2, 3, 4],
"Calendar_Year__c": ["2022", "2022", "2023"],
},
index=[1, 2, 3],
)
expected_df["LastModifiedDate"] = pd.to_datetime(expected_df["LastModifiedDate"])

pd.testing.assert_frame_equal(salesforce_records.df, expected_df)
assert duplicate_ids == {"1": "2022"}

def test_deduplicate_records_on_facility_id_keeps_modified_date_later_than_calendar_year(self, mocker):
salesforce_records = mocker.Mock()
salesforce_records.df = pd.DataFrame(
{
"facility_id": ["1", "1", "1"],
"LastModifiedDate": ["2022-01-01", "2023-01-02", "2024-01-03"],
"a": [1, 2, 3],
"Calendar_Year__c": ["2022", "2023", "2023"],
}
)

duplicate_ids = helpers.SalesForceRecords.deduplicate_records_on_facility_id(salesforce_records)

expected_df = pd.DataFrame(
{
"facility_id": [
"1",
"1",
],
"LastModifiedDate": ["2022-01-01", "2024-01-03"],
"a": [1, 3],
"Calendar_Year__c": ["2022", "2023"],
},
index=[0, 2],
)
expected_df["LastModifiedDate"] = pd.to_datetime(expected_df["LastModifiedDate"])

pd.testing.assert_frame_equal(salesforce_records.df, expected_df)
assert duplicate_ids == {"1": "2023"}
1 change: 0 additions & 1 deletion tests/test_wmrc.py
@@ -1,5 +1,4 @@
import pandas as pd

from wmrc import main


