forked from cliffordlab/DEID-2023
-
Notifications
You must be signed in to change notification settings - Fork 0
/
deid-SHANIAH.py
92 lines (75 loc) · 4.46 KB
/
deid-SHANIAH.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Import necessary libraries
import re # Import the regular expressions library
import sys # Import the system-specific library
# Regular expression for a name introduced by a title (Mr., Ms. or Mrs.).
# The title's first letter may be upper or lower case. The name itself must be a
# single word of at least four letters: any letter, a lower-case letter, any
# letter, then one or more lower-case letters.
# NOTE(review): names shorter than four letters (e.g. "Mr. Lee") are not matched
# by this pattern - confirm that this is intended.
name_pattern = r'(?:[Mm]r\.|[Mm]s\.|[Mm]rs\.) [A-Za-z][a-z][A-Za-z][a-z]+'

# Compile the pattern once so it can be reused for every note.
# It is compiled before the preview below so that a problem while previewing can
# never leave 'name_reg' undefined.
# NOTE(review): compiled without re.I, whereas the preview below passes re.I -
# the two therefore do not match exactly the same strings.
name_reg = re.compile(name_pattern)

# Best-effort preview: scan the 'PTName.phi' file and print the names matched on
# each line. The path is specific to one machine, so a missing or unreadable
# file must not prevent this module from being imported or run.
try:
    file = open('C:/Users/Shaniah Reece/OneDrive/Documents/BMI500/DEID-2023/python/PTName.phi')
except OSError:
    # Preview file not available here; skip the preview entirely.
    file = None

if file is not None:
    with file:
        for line in file:
            patient_note = line.strip()  # Remove leading and trailing whitespace
            # 're.I' makes this preview search case insensitive
            matched_names = re.findall(name_pattern, patient_note, re.I)
            # Print the matched names so they can be checked by eye
            if matched_names:
                print(f"Matched names in '{patient_note}': {matched_names}")
def check_for_name(patient, note, chunk, output_handle):
    """
    Find title-prefixed names in one note and record where they occur.

    A "Patient X<TAB>Note Y" header line is always written first. Then, for
    every match of the module-level 'name_reg' in 'chunk', one line of the form
    'start start end' is written, and the same positions plus the matched text
    are echoed to the screen.

    Inputs:
        - patient: Patient identifier, written in the header line and echoed on screen.
        - note: Note number, written in the header line and echoed on screen.
        - chunk: Text of one whole patient record.
        - output_handle: An opened, writable file handle that receives the results.
    """
    # Fixed amount subtracted from every reported position
    # (the original author's adjustment for differences from the Perl code).
    shift = 27

    # Header line for this note.
    output_handle.write(f'Patient {patient}\tNote {note}\n')

    for hit in name_reg.finditer(chunk):
        begin = hit.start() - shift
        finish = hit.end() - shift

        # Screen-only debug output: patient, note, positions and the matched text.
        print(f'{patient} {note}', begin, finish, hit.group())

        # One result per line in the output file: 'start start end'.
        output_handle.write(f'{begin} {begin} {finish}\n')
def deid_name(
    text_path="C:/Users/Shaniah Reece/OneDrive/Documents/BMI500/DEID-2023/python/id.text",
    output_path='C:/Users/Shaniah Reece/OneDrive/Documents/BMI500/DEID-2023/python/PTName.phi',
):
    """
    Scan a file of patient records for names and write their positions to a file.

    The input is read line by line. A line matching the start-of-record pattern
    supplies the patient and note numbers; every line read (the header line
    included) is accumulated into the current chunk until a line ending with the
    end-of-record marker is seen. The stripped chunk is then handed to
    check_for_name() together with the patient and note numbers, and a new chunk
    is started.

    Inputs:
        - text_path: Path to the file containing patient records.
        - output_path: Path to the output file (created or truncated).

    Raises:
        - ValueError: if an end-of-record marker is reached before any
          start-of-record header has been seen.
        - OSError: if either file cannot be opened.
    """
    # Raw strings keep '\d' and '\|' as regex escapes instead of (invalid)
    # Python string escapes. Both patterns are compiled once, not once per line.
    start_of_record = re.compile(r'^start_of_record=(\d+)\|\|\|\|(\d+)\|\|\|\|$', re.IGNORECASE)
    end_of_record = re.compile(r'\|\|\|\|END_OF_RECORD$', re.IGNORECASE)

    # Patient/note numbers of the most recent header; None until one is seen.
    patient = note = None
    # Lines of the record currently being collected.
    chunk_lines = []

    # Open the output file just once to save time on the time-intensive I/O.
    with open(output_path, 'w') as output_file, open(text_path) as text:
        for line in text:
            header = start_of_record.search(line)
            if header:
                patient, note = header.groups()

            chunk_lines.append(line)

            # Check to see if we have reached the end of one note.
            if end_of_record.search(line):
                if patient is None:
                    raise ValueError(
                        f'end-of-record marker found before any start-of-record header in {text_path!r}'
                    )
                # A full note is now available along with its patient and note
                # numbers; search it for names.
                check_for_name(patient, note, ''.join(chunk_lines).strip(), output_file)
                # Start collecting the next note.
                chunk_lines = []
if __name__ == "__main__":
    # Debug aid: show how many command-line arguments were received.
    print(len(sys.argv))
    # Forward up to two positional arguments (input path, output path).
    # Any argument that is not supplied falls back to deid_name's default, so
    # passing only the input path no longer fails with an IndexError.
    deid_name(*sys.argv[1:3])