diff --git a/src/cleaner/__init__.py b/src/cleaner/__init__.py
index 7e8ce99..5a4c168 100644
--- a/src/cleaner/__init__.py
+++ b/src/cleaner/__init__.py
@@ -1,5 +1,4 @@
 import json, argparse, os, datetime as dt, xxhash
-from azure.cosmos import CosmosClient, exceptions
 from dotenv import load_dotenv
 
 class Cleaner:
@@ -7,6 +6,12 @@ class Cleaner:
     def __init__(self, county):
         self.county = county.lower()
 
+    def redact_cause_number(self, input_dict: dict, out_file: dict) -> dict:
+        #This will hash and redact the cause number and then add it to the output file.
+        cause_number_hash = xxhash.xxh64(str(input_dict['code'])).hexdigest()
+        out_file["cause_number_redacted"] = cause_number_hash
+        return out_file
+
     def clean(self):
 
         case_json_folder_path = os.path.join(
@@ -26,7 +31,7 @@ def clean(self):
         list_case_json_files = os.listdir(case_json_folder_path)
         for case_json in list_case_json_files:
             print(case_json)
-            # List of motions identified as evidenciary
+            # List of motions identified as evidentiary. TODO: These should be moved to a separate JSON in resources
             good_motions = [
                 "Motion To Suppress",
                 "Motion to Reduce Bond",
@@ -61,7 +66,6 @@ def clean(self):
             charge_name_to_umich = charge_name_to_umich_dict
             # Cleaned Case Primary format
             out_file = {}
-            out_file["case_number"] = input_dict["code"] #Note: This may be closed to personally identifying information of the defendant.
             out_file["attorney_type"] = input_dict["party information"]["appointed or retained"]
             #Adding the county and hash values into the final version.
             out_file["county"] = input_dict["county"]
@@ -112,6 +116,8 @@ def contains_good_motion(motion, event):
             def_atty_hash = xxhash.xxh64(str(def_atty_unique_str)).hexdigest()
             out_file["defense attorney"] = def_atty_hash
 
+            out_file = self.redact_cause_number(input_dict, out_file)
+
             # Original Format
             out_filepath = os.path.join(
                 os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned",case_json
@@ -119,4 +125,3 @@ def contains_good_motion(motion, event):
 
             with open(out_filepath, "w") as f:
                 json.dump(out_file, f)
-