
Commit

Merge branch 'dev' into main
IsaacMilarky authored Sep 10, 2024
2 parents 1b3a52b + 8da4c12 commit b93f8fc
Showing 13 changed files with 813 additions and 1,102 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/checks.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.11"]
python-version: ["3.10", "3.11","3.12"]
steps:
- uses: actions/checkout@v4
- name: Install poetry
140 changes: 98 additions & 42 deletions cli/deduplifhirLib/settings.py
@@ -23,7 +23,7 @@


dir_path = os.path.dirname(os.path.realpath(__file__))
with open(dir_path + '/splink_settings.json',"r",encoding="utf-8") as f:
with open(os.path.join(dir_path,'splink_settings.json'),"r",encoding="utf-8") as f:
splink_settings_dict = json.load(f)


@@ -32,38 +32,96 @@
#blocking_rules = list(
# map(block_on,blocking_rules))

blocking_rules = []
for rule in BLOCKING_RULE_STRINGS:
if isinstance(rule, list):
blocking_rules.append(block_on(*rule))
else:
blocking_rules.append(block_on(rule))
def get_additional_comparison_rules(parsed_data_df):
"""
This function generates appropriate comparison rules based on pandas column names
Arguments:
parsed_data_df: The dataframe that was parsed from the user that we want to
find duplicates in
Returns:
A generator collection object full of splink comparison objects
"""

parsed_data_columns = parsed_data_df.columns

for col in parsed_data_columns:
if 'street_address' in col:
yield cl.ExactMatch(col)
elif 'postal_code' in col:
yield cl.PostcodeComparison(col)

def create_blocking_rules():
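    """
    Build the splink blocking rules from BLOCKING_RULE_STRINGS: list entries
    become composite block_on(...) rules, plain strings become single-column
    rules.
    """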
blocking_rules = []
for rule in BLOCKING_RULE_STRINGS:
if isinstance(rule, list):
blocking_rules.append(block_on(*rule))
else:
blocking_rules.append(block_on(rule))

return blocking_rules
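
# Illustrative note (not part of the commit): with the rules shipped in
# splink_settings.json this yields block_on("birth_date"),
# block_on("ssn", "birth_date") and block_on("phone"), i.e. list entries
# block on the combination of columns rather than on each column alone.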


def create_settings(parsed_data_df):
"""
This function generates a Splink SettingsCreator object based on the parsed
input data's columns and the blocking settings in splink_settings.json
Arguments:
parsed_data_df: The dataframe that was parsed from the user that we want to
find duplicates in
Returns:
A splink SettingsCreator object to be used with a splink linker object
"""

blocking_rules = create_blocking_rules()

    comparison_rules = list(get_additional_comparison_rules(parsed_data_df))
comparison_rules.extend([
cl.ExactMatch("phone").configure(
term_frequency_adjustments=True
),
cl.NameComparison("given_name").configure(
term_frequency_adjustments=True
),
cl.NameComparison("family_name").configure(
term_frequency_adjustments=True
),
cl.DateOfBirthComparison("birth_date",input_is_string=True)]
)

comparison_rules = [
cl.ExactMatch("street_address").configure(
term_frequency_adjustments=True
),
cl.ExactMatch("phone").configure(
term_frequency_adjustments=True
),
cl.NameComparison("given_name").configure(
term_frequency_adjustments=True
),
cl.NameComparison("family_name").configure(
term_frequency_adjustments=True
),
cl.DateOfBirthComparison("birth_date",input_is_string=True),
cl.PostcodeComparison("postal_code")
]

return SettingsCreator(
link_type=splink_settings_dict["link_type"],
blocking_rules_to_generate_predictions=blocking_rules,
comparisons=comparison_rules,
max_iterations=splink_settings_dict["max_iterations"],
em_convergence=splink_settings_dict["em_convergence"])

SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE = SettingsCreator(
link_type=splink_settings_dict["link_type"],
blocking_rules_to_generate_predictions=blocking_rules,
comparisons=comparison_rules,
max_iterations=splink_settings_dict["max_iterations"],
em_convergence=splink_settings_dict["em_convergence"])


def parse_fhir_dates(fhir_json_obj):
"""
A generator function that parses the address portion of a FHIR file
into a dictionary object that can be added to the overall patient record
Arguments:
fhir_json_obj: The object that has been parsed from the FHIR data
Returns:
A generator containing dictionaries of address data.
"""
addresses = fhir_json_obj['entry'][0]['resource']['address']

    for n, addr in enumerate(addresses):
yield {
f"street_address{n}": [normalize_addr_text(''.join(addr['line']))],
f"city{n}": [normalize_addr_text(addr['city'])],
f"state{n}": [normalize_addr_text(addr['state'])],
f"postal_code{n}": [normalize_addr_text(addr['postalCode'])]
}
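
# For illustration (not part of the commit): a patient with two addresses
# yields columns street_address0, city0, state0, postal_code0,
# street_address1, city1, ..., which get_additional_comparison_rules above
# picks up by substring match to add per-address comparisons.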



@@ -103,21 +161,19 @@ def read_fhir_data(patient_record_path):
patient_json_record['entry'][0]['resource']['birthDate']
),
"phone": [patient_json_record['entry'][0]['resource']['telecom'][0]['value']],
"street_address": [
normalize_addr_text(
patient_json_record['entry'][0]['resource']['address'][0]['line'][0]
)
],
"city": [
normalize_addr_text(patient_json_record['entry'][0]['resource']['address'][0]['city'])
],
"state": [
normalize_addr_text(patient_json_record['entry'][0]['resource']['address'][0]['state'])
],
"postal_code": [patient_json_record['entry'][0]['resource']['address'][0]['postalCode']],
"ssn": [patient_json_record['entry'][0]['resource']['identifier'][1]['value']],
"path": patient_record_path
}
#print(patient_dict)

try:
patient_dict["middle_name"] = [
            normalize_name_text(patient_json_record['entry'][0]['resource']['name'][0]['given'][1])
]
except IndexError:
patient_dict["middle_name"] = [""]
print("no middle name found!")

for date in parse_fhir_dates(patient_json_record):
patient_dict.update(date)

return pd.DataFrame(patient_dict)
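
A minimal usage sketch of the new create_settings factory (not part of the commit; the Linker and DuckDBAPI imports follow Splink 4's public API, and the input path below is hypothetical):

import pandas as pd
from splink import DuckDBAPI, Linker

from deduplifhirLib.settings import create_settings

# hypothetical parsed patient data with columns such as given_name,
# family_name, birth_date, phone, street_address0, postal_code0, ...
df = pd.read_csv("patients.csv")

settings = create_settings(df)  # SettingsCreator built from the columns present
linker = Linker(df, settings, db_api=DuckDBAPI())
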
1 change: 0 additions & 1 deletion cli/deduplifhirLib/splink_settings.json
@@ -3,7 +3,6 @@
"blocking_rules_to_generate_predictions": [
"birth_date",
["ssn", "birth_date"],
["ssn", "street_address"],
"phone"
],
"max_iterations": 20,
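
For context, the settings file that settings.py loads would now look roughly like the dict below (a sketch: the blocking rules and max_iterations are confirmed by this diff; the link_type and em_convergence values are assumptions):

# splink_settings.json expressed as the dict json.load() returns;
# values marked "assumed" are not shown in this diff
splink_settings_dict = {
    "link_type": "dedupe_only",  # assumed
    "blocking_rules_to_generate_predictions": [
        "birth_date",
        ["ssn", "birth_date"],
        "phone"
    ],
    "max_iterations": 20,
    "em_convergence": 0.01  # assumed
}
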
6 changes: 3 additions & 3 deletions cli/deduplifhirLib/tests/duplicate_data_generator.py
@@ -85,7 +85,7 @@ def generate_temp_files(config, fake_gen):
fake_gen: Faker generation object
"""

tmp_dir = './temp'
tmp_dir = os.path.join(os.getcwd(), 'temp')

with Pool(config['cpus']) as pool:

@@ -116,7 +116,7 @@ def combine_temp_files(tmp_dir, output_file):
if os.path.isfile(output_file):
os.remove(output_file)
with open(output_file, 'wb') as outfile:
for filename in glob.glob(tmp_dir + '/*'):
for filename in glob.glob(os.path.join(tmp_dir, '*')):
with open(filename, 'rb') as readfile:
shutil.copyfileobj(readfile, outfile)

@@ -144,7 +144,7 @@ def create_fake_data_file(config, fake_gen, tmp_dir, batch_size, remaining_rows)
try:
fake_data = get_fake_data(
num_of_initial_rows, num_duplicated_rows, config['columns'], fake_gen)
temp_file_name = tmp_dir + '/' + str(uuid.uuid4())
temp_file_name = os.path.join(tmp_dir, str(uuid.uuid4()))
print(f"Writing {rows_to_process} rows to file")
fake_data.to_csv(temp_file_name, header=False)
except Exception as e:
25 changes: 14 additions & 11 deletions cli/deduplifhirLib/tests/main_test.py
@@ -23,8 +23,12 @@ def generate_mock_data_fixture(request):
"""
with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
temp_file.close()
generate_dup_data('deduplifhirLib/tests/test_data_columns.json',
temp_file.name, rows=request.param, duprate=0.2)
generate_dup_data(
os.path.join(
'deduplifhirLib','tests','test_data_columns.json'
),
temp_file.name, rows=request.param, duprate=0.2
)
sample_df = pd.read_csv(temp_file.name)
assert sample_df.shape[0] == request.param, f"Expected {request.param} deduplicated records"
print(sample_df)
@@ -45,7 +49,7 @@ def test_dedupe_data_with_csv_output(cli_runner):
"""

# Prepare test data paths
bad_data_path = 'deduplifhirLib/tests/test_data.csv'
bad_data_path = os.path.join('deduplifhirLib','tests','test_data.csv')
output_path = 'output.csv'
print(os.getcwd())
# Simulate CLI command execution
@@ -66,7 +70,7 @@ def test_dedupe_data_with_specific_csv(cli_runner):
Test dedupe_data function with specific CSV data to verify deduplication.
"""
# Prepare test data
test_data_csv = """id,truth_value,family_name,given_name,gender,birth_date,phone,street_address,city,state,postal_code,SSN
test_data_csv = """id,truth_value,family_name,given_name,gender,birth_date,phone,street_address0,city0,state0,postal_code0,SSN
8,9b0b0b7c-e05e-4c89-991d-268eab2483f7,Obrien,Curtis,M,07/02/1996,,300 Amy Corners Suite 735,Rileytown,Alaska,60281,480-21-0833
342,9b0b0b7c-e05e-4c89-991d-268eab2483f7,Orbien,Cutris,M,07/02/1996,,300 Amy oCrenrs Suite 735,Rileytown,Alaska,60281,480-210-833
502,9b0b0b7c-e05e-4c89-991d-268eab2483f7,bOrien,Curtsi,M,07/02/1996,,300 AmyCo rners Suite 735,Rileytown,Alaska,60281,480-21-8033
@@ -78,8 +82,7 @@ def test_dedupe_data_with_specific_csv(cli_runner):
273,04584982-ae7a-44a1-b4f0-e927a8bab0e1,Russlel,Lnidsay,F,02/05/1977,,2110 Kimbelry Vilalges Apt. 639,New David,Wyoming,52082,211-52-6989
311,04584982-ae7a-44a1-b4f0-e927a8bab0e1,Russlel,Lindasy,F,02/05/1977,,2110 Kimbelry Villgaes Apt. 639,New David,Wyoming,52082,211-52-9698
652,04584982-ae7a-44a1-b4f0-e927a8bab0e1,uRssell,Lidnsay,F,02/05/1977,,2110 Kimberly Vlilagse Apt. 639,New David,Wyoming,52082,121-52-6998
726,04584982-ae7a-44a1-b4f0-e927a8bab0e1,uRssell,Lindasy,F,02/05/1977,,2110 Kmiberly Vilalges Apt. 639,New David,Wyoming,52082,2115-2-6
"""
726,04584982-ae7a-44a1-b4f0-e927a8bab0e1,uRssell,Lindasy,F,02/05/1977,,2110 Kmiberly Vilalges Apt. 639,New David,Wyoming,52082,2115-2-6S"""

# Write test data to specific.csv
with open('specific.csv', 'w',encoding='utf-8') as f:
@@ -108,7 +111,7 @@ def test_dedupe_data_with_json_output(cli_runner):
"""

# Prepare test data paths
bad_data_path = 'deduplifhirLib/tests/test_data.csv'
bad_data_path = os.path.join('deduplifhirLib','tests','test_data.csv')
output_path = 'output.json'

# Simulate CLI command execution
@@ -130,7 +133,7 @@ def test_dedupe_data_with_invalid_format(cli_runner):
"""

# Prepare invalid test data paths
bad_data_path = 'deduplifhirLib/tests/test_data_invalid.txt'
bad_data_path = os.path.join('deduplifhirLib','tests','test_data_invalid.txt')
output_path = 'output.csv'

# Write some invalid content to the test file
@@ -153,11 +156,11 @@ def test_dedupe_accuracy(cli_runner):
Test dedupe_data function for deduplication accuracy using a dataset with known duplicates.
"""
# Prepare test data
test_data_csv = """id,truth_value,family_name,given_name,gender,birth_date,phone,street_address,city,state,postal_code,SSN
test_data_csv = """id,truth_value,family_name,given_name,gender,birth_date,phone,street_address0,city0,state0,postal_code0,SSN
1,duplicate,Smith,John,M,01/01/1990,,123 Elm St,Springfield,IL,62701,123-45-6789
2,duplicate,Smyth,John,M,01/01/1990,,123 Elm St.,Springfield,IL,62701,123-45-6789
3,unique,Doe,Jane,F,02/02/1992,,456 Oak St,Springfield,IL,62702,987-65-4321
"""
3,unique,Doe,Jane,F,02/02/1992,,456 Oak St,Springfield,IL,62702,987-65-4321"""

with open('accuracy.csv', 'w',encoding='utf-8') as f:
f.write(test_data_csv)

