-
Notifications
You must be signed in to change notification settings - Fork 0
/
matcher.py
238 lines (205 loc) · 10 KB
/
matcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
"OCL Concepts Matcher to find existing concepts in OCL based on provided Excel metadata"
import json
import math
import openpyxl
from rapidfuzz import process, fuzz
import pandas as pd
# Ignore potential warnings related to opening large Excel files
# (openpyxl emits UserWarnings for workbook features it does not support).
openpyxl.reader.excel.warnings.simplefilter(action='ignore')
# Excel Metadata filepath — read and then updated in place by the main loop below
METADATA_FILEPATH = "LIME EMR - Iraq Metadata - Release 1 (18).xlsx"
# OCL URL prefix used to build =HYPERLINK() formulas for suggested concepts
OCL_URL = "https://app.openconceptlab.org/#"
# Matching threshold for fuzzy string matching (0-100, rapidfuzz WRatio score)
FUZZY_THRESHOLD = 95
# Counter of concepts considered for matching (incremented once per row per source)
CONCEPTS_TO_MATCH = 0
# Counter of matches found above the threshold
# NOTE(review): never incremented in this file — TOTAL_MATCHES_FOUND is used
# instead; confirm whether this counter can be removed.
MATCHES_FOUND = 0
# Define the sheets to read
#excel_sheets = [ 'FXX-Test_Only', 'F06-PHQ-9']
excel_sheets = [
    'F01-MHPSS Baseline',
    'F02-MHPSS Follow-up',
    'F03-mhGAP Baseline',
    'F04-mhGAP Follow-up',
    'F05-MH Closure',
    'F06-PHQ-9',
    'F07-ITFC form',
    'F08-ATFC form',
    'OptionSets'
]
# Define OCL sources and their respective columns in the Excel document:
# each entry maps a source name to its exported JSON concept dump
# ("source_filepath") and to the worksheet column headers (row 2) where that
# source's suggestions are written back.
automatch_references = {
    "MSF": {
        "source_filepath": "MSF_Source_20240712_163433.json",
        "suggestion_column": "MSF Source Suggestion",
        "external_id_column": "MSF Source External ID",
        "description_column": "MSF Source Description",
        "datatype_column": "MSF Source Datatype",
        "dataclass_column": "MSF Source Class",
        "score_column": "MSF Source Score"
    },
    "CIEL": {
        "source_filepath": "CIEL_Source_20240708_153712.json",
        "suggestion_column": "CIEL Source Suggestion",
        "external_id_column": "CIEL Source External ID",
        "description_column": "CIEL Source Description",
        "datatype_column": "CIEL Source Datatype",
        "dataclass_column": "CIEL Source Class",
        "score_column": "CIEL Source Score"
    },
    "PIH": {
        "source_filepath": "PIH_Source_20240708_145352.json",
        "suggestion_column": "PIH Source Suggestion",
        "external_id_column": "PIH Source External ID",
        "description_column": "PIH Source Description",
        "datatype_column": "PIH Source Datatype",
        "dataclass_column": "PIH Source Class",
        "score_column": "PIH Source Score"
    }
}
# Find the best matches for each primary and secondary label in the metadata spreadsheet
def find_best_matches(primary, secondary, data, threshold=FUZZY_THRESHOLD, limit=5):
"""
Find the best matches for a primary and secondary value from a list of data.
:param primary: The primary value to search for
:param secondary: The secondary value to search for
:param data: List of dictionaries with 'id' (string) and 'display_name'
:param limit: The maximum number of matches to return
:return: List of tuples containing the id, match, score, and definition
"""
# Combine the primary and secondary values into a single query string
query = f"{primary} {secondary}"
# Extract display_names from data for comparison
display_names = [item['display_name'] for item in data]
# Use rapidfuzz's process.extract to find the best matches with threshold
matches = process.extract(
query, display_names, scorer=fuzz.WRatio, score_cutoff=threshold, limit=limit
)
# Map the matches back to their corresponding IDs and definitions
results = []
for _, score, match_index in matches:
if score >= FUZZY_THRESHOLD:
results.append((
data[match_index]['id'],
data[match_index]['external_id'],
data[match_index]['display_name'],
data[match_index]['description'],
data[match_index]['datatype'],
data[match_index]['concept_class'],
data[match_index]['url'],
score
))
# Return the list of results
return results
# Open the metadata Excel file and find the column indices for the required columns
def find_column_index(worksheet, column_name):
"""
Find the column index for the specified column name in the second row.
Args:
worksheet (openpyxl.worksheet.worksheet): The worksheet object.
column_name (str): The name of the column to find the index for.
Returns:
int: The index of the specified column name in the second row.
"""
for idx, cell in enumerate(worksheet[2], 1): # assuming the second row contains headers
if cell.value == column_name:
return idx
return -1 # Return -1 if the column name is not found
# Initialize the counter for total matches found across all sheets and sources
TOTAL_MATCHES_FOUND = 0
# Main loop: for every configured sheet and every OCL source, compute fuzzy
# suggestions and write them back into the metadata workbook in place.
for sheet_name in excel_sheets:
    # Iterate through each OCL source and look for suggestions for the primary and secondary lookups
    for source_name, source_config in automatch_references.items():
        print(f"Looking for suggestions in {source_name} source...")
        SOURCE_FILEPATH = source_config['source_filepath']
        # NOTE(review): each source JSON is re-parsed once per sheet; loading
        # the sources once before the sheet loop would avoid repeated file I/O.
        with open(SOURCE_FILEPATH, 'r', encoding='UTF-8') as f:
            json_data = json.load(f)
        # Extract only the ID, display names, external IDs, datatype,
        # concept_class, and extras > definitions from the JSON data
        source_data = []
        for item in json_data:
            concept_id = item.get('id', '')
            external_id = item.get('external_id')
            display_name = item.get('display_name', '')
            # 'definition' lives under the optional 'extras' mapping
            definition = (
                item.get('extras', {}).get('definition', '')
                if item.get('extras', {}) else ''
            )
            # Use the first entry of the 'descriptions' list when present
            description = (
                item.get('descriptions', [])[0].get('description', '')
                if item.get('descriptions', []) else ''
            )
            datatype = item.get('datatype')
            concept_class = item.get('concept_class')
            concept_url = item.get('url')
            # If no description is found, use the definition as the description
            if description == '':
                description = definition
            # Add the concept details to the source_data list
            source_data.append({
                'id': concept_id,
                'display_name': display_name,
                'definition': definition,
                'description': description,
                'datatype': datatype,
                'concept_class': concept_class,
                'external_id': external_id,
                'url': concept_url
            })
        # Load the Excel file, considering the header on row 2
        df = pd.read_excel(METADATA_FILEPATH, sheet_name=sheet_name, header=1)
        print(f"Processing sheet: {sheet_name}")
        # Load the existing Excel file to append the suggestions
        # NOTE(review): 'book' is never used afterwards (the cells are written
        # via writer.book below) — confirm this load can be removed.
        book = openpyxl.load_workbook(METADATA_FILEPATH)
        # Using Excel Writer, append or update the details in the original existing
        # Excel sheet and cells in the specified columns depending on the source name
        with pd.ExcelWriter(METADATA_FILEPATH, engine='openpyxl', mode='a') as writer:
            workbook = writer.book
            ws = workbook[sheet_name]
            # Get the column indices for the suggestion,
            # external ID, description, datatype, concept class, and extras
            # (-1 is returned when a header is missing — not guarded below)
            suggestion_column = find_column_index(ws, source_config['suggestion_column'])
            external_id_column = find_column_index(ws, source_config['external_id_column'])
            description_column = find_column_index(ws, source_config['description_column'])
            datatype_column = find_column_index(ws, source_config['datatype_column'])
            dataclass_column = find_column_index(ws, source_config['dataclass_column'])
            score_column = find_column_index(ws, source_config['score_column'])
            # Iterate through each row in the sheet and get
            # the primary and secondary labels to match
            for index, row in df.iterrows():
                CONCEPTS_TO_MATCH += 1
                # NOTE(review): pandas yields NaN (which is truthy) for blank
                # cells, so `or None` does not filter empty values here —
                # confirm blank labels behave as intended.
                primary_lookup = row.get('Label if different') or None
                secondary_lookup = row.get('Question') or row.get('Answers') or None
                # Get suggestions from each OCL source using closest match
                # with RapidFuzz using the primary lookup and secondary lookup
                best_matches = find_best_matches(primary_lookup, secondary_lookup, source_data)
                TOTAL_MATCHES_FOUND += len(best_matches)
                # Add the suggestions to the Excel sheet
                # NOTE(review): all matches write to the same row's cells, so
                # only the last (lowest-ranked kept) match survives — confirm
                # whether only the single best match should be written.
                for m in best_matches:
                    # Write the suggestion details to the Excel sheet.
                    # Excel row = pandas index + 3 (headers on row 2, data from row 3).
                    # Add URL concatenated with OCL_URL in Excel cell using =HYPERLINK() formula
                    ws.cell(row=index+3, column=suggestion_column).value = (
                        f"=HYPERLINK(\"{OCL_URL}{m[6]}\", \"{m[2]}\")"
                    )
                    ws.cell(row=index+3, column=external_id_column).value = m[1]
                    ws.cell(row=index+3, column=description_column).value = m[3]
                    ws.cell(row=index+3, column=datatype_column).value = m[4]
                    ws.cell(row=index+3, column=dataclass_column).value = m[5]
                    ws.cell(row=index+3, column=score_column).value = math.ceil(m[7])
                    print(
                        f"Added suggestion: {m[2]} - {m[1]} with score of {m[7]}"
                    )
            # Close the Excel file writer
            # NOTE(review): saving while the ExcelWriter context is still open
            # means the writer saves again on __exit__ — confirm the double
            # save is intentional.
            workbook.save(METADATA_FILEPATH)
# Count of sources used
print(f"\nSources used: {len(automatch_references)}")
# Show the total number of concepts processed
# (divided by the source count because each row was counted once per source)
CONCEPTS_TO_MATCH = math.ceil(CONCEPTS_TO_MATCH / len(automatch_references))
print(f"\nTotal concept processed: {CONCEPTS_TO_MATCH}")
# Show the total number of matches found above the threshold
percentage_found = (TOTAL_MATCHES_FOUND / CONCEPTS_TO_MATCH) * 100
rounded_percentage_found = math.ceil(percentage_found)
print(f"\nTotal matches found: {TOTAL_MATCHES_FOUND} ({rounded_percentage_found}%)")