Commit

Added U/I priority (unidentified), ran with new dataset using auto-populating UUID.

Added lambdas/ and layers/ folders for AWS Lambda function integration, as well as template.yaml
jhou98 committed Jul 15, 2020
1 parent d205b17 commit d25e482
Showing 16 changed files with 66,078 additions and 17,663 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -4,7 +4,7 @@
 - [preprocess_data.py](/preprocess/preprocess_data.py): Main python script to preprocess MRI data
 - [preprocess_helper.py](/preprocess/preprocess_helper.py): Helper functions
 - [sample_output.json](sample_output.json): Sample JSON output for first 100 rows of data of sample.csv
-- To create a sample_output.json, in the terminal run `python preprocess_data.py`
+- To create a sample_output.json, in the terminal run `python .\preprocess\preprocess_data.py`
 
 ## Generating P-Value
 - [init_db.sql](init_db.sql): Script to re-initialize database (wipes out any previous data!)
@@ -13,7 +13,7 @@
 - [config.py](/rule_processing/config.py): Connecting to the postgres database with a custom database.ini file
 - To initialize the database, run the init_db.sql script in the terminal using `psql -U {username} -h {host} -f init_db.sql`. _Note this does assume you have a database named rules._
 - To update the weighted rule tokens, run `python update_weights.py` in the terminal
-- To apply the rules and obtain a Rule ID + P-Value, run `python rules.py` in the terminal
+- To apply the rules and obtain a Rule ID + P-Value, run `python .\rule_processing\rules.py` in the terminal
 
 ## Rule Database Analysis
-- [mri_sample_results.xlsx](/csv/mri_sample_results.xlsx): Form to P-Value Data
+- [Sample Result](/csv/mri_sample_results_0.2.xlsx): Form to P-Value Data (most recent)
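
For context on the config.py / database.ini pattern mentioned above, a minimal sketch of the usual ConfigParser approach (section and key names here are assumptions, not taken from the repository):

# Hypothetical sketch of a database.ini reader (names assumed, not verbatim from config.py)
from configparser import ConfigParser

def config(filename='database.ini', section='postgresql'):
    # read host/dbname/user/password from the given section
    parser = ConfigParser()
    parser.read(filename)
    if not parser.has_section(section):
        raise Exception(f'Section {section} not found in {filename}')
    return dict(parser.items(section))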
Binary file added csv/mri_sample_results_0.2.xlsx
Binary file not shown.
3,489 changes: 3,489 additions & 0 deletions csv/req_data_june.csv

Large diffs are not rendered by default.

File renamed without changes.
174 changes: 87 additions & 87 deletions init_db.sql

Large diffs are not rendered by default.

49 changes: 40 additions & 9 deletions mri_preprocess_lambda.py → lambdas/preprocess/index.py
@@ -6,13 +6,18 @@
 import string
 import string
 import logging
+import uuid
 from datetime import datetime, date
 from spellchecker import SpellChecker
 
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 compr = boto3.client(service_name='comprehend')
 compr_m = boto3.client(service_name='comprehendmedical')
+lambda_client = boto3.client('lambda')
+
+RuleProcessingLambdaName = os.getenv('RULE_PROCESSING_LAMBDA')
+
 spell = SpellChecker()
 
 def convert2CM(height):
@@ -217,18 +222,29 @@ def lambda_handler(event, context):
         logger.error( 'Missing parameters')
         return {'result': False, 'msg': 'Missing parameters' }
 
-    data_df = event['body']
+    #data_df = json.loads(event['body']) # use for postman tests
+    data_df = event['body'] # use for console tests
     logger.info(data_df)
 
+    if 'ReqCIO' not in data_df or not data_df['ReqCIO']:
+        data_df['CIO_ID'] = str(uuid.uuid4())
+    else:
+        data_df['CIO_ID'] = (data_df['ReqCIO'])
+
+    if data_df['Radiologist Priority'].lower() == 'unidentified':
+        data_df['priority'] = 'U/I'
+    else:
+        data_df['priority'] = data_df['Radiologist Priority']
+
     # Format columns that don't need comprehend medical and preprocess the text
-    data_df['CIO_ID'] = int(data_df['Req # CIO'])
-    data_df['age'] = dob2age(data_df['DOB \r\n(yyyy-mm-dd)'])
-    data_df['height'] = data_df['Height \r\n(eg: ft.in)'] + \
-        ' ' + data_df['INCH - CM']
-    data_df['weight'] = data_df['Weight'] + ' ' + data_df['KG - LBS']
-    data_df['priority'] = data_df['Radiologist Priority']
+    data_df['age'] = dob2age(data_df['DOB'])
+    data_df['height'] = data_df['Height'] + \
+        ' ' + data_df['inch-cm']
+    data_df['weight'] = data_df['Weight'] + ' ' + data_df['kg-lbs']
     data_df['height'] = convert2CM(data_df['height'])
     data_df['weight'] = convert2KG(data_df['weight'])
-    data_df['Exam Requested'] = preProcessText(data_df['Exam Requested (Free Text)'])
-    data_df['Reason for Exam/Relevant Clinical History'] = preProcessText(data_df['Reason for Exam/Relevant Clinical History (Free Text)'])
+    data_df['Exam Requested'] = preProcessText(data_df['Exam Requested'])
+    data_df['Reason for Exam/Relevant Clinical History'] = preProcessText(data_df['Reason for Exam'])
     # data_df['Spine'] = preProcessText(data_df['Appropriateness Checklist - Spine'])
     # data_df['Hip & Knee'] = preProcessText(data_df['Appropriateness Checklist - Hip & Knee'])

@@ -273,3 +289,18 @@
 
     #formatted_df.to_json('sample_output.json', orient='index')
     print("output is: ", formatted_df)
+
+    response = lambda_client.invoke(
+        FunctionName=RuleProcessingLambdaName,
+        InvocationType='RequestResponse',
+        Payload=json.dumps(formatted_df)
+    )
+
+    data = json.loads(response['Payload'].read())
+
+    response = {
+        'result': data,
+        'context': formatted_df
+    }
+
+    return response
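
For a quick console test of the handler above, an event shaped like the following exercises both new branches (field names are taken from this diff; the values are illustrative, and any required fields not visible in the hunks are omitted):

# Illustrative console test event (made-up values; field names match the hunks above)
event = {
    'body': {
        'ReqCIO': '',                            # empty -> CIO_ID auto-populated via uuid.uuid4()
        'Radiologist Priority': 'Unidentified',  # -> mapped to 'U/I'
        'DOB': '1980-01-01',
        'Height': '5.9',
        'inch-cm': 'INCH',
        'Weight': '160',
        'kg-lbs': 'LBS',
        'Exam Requested': 'mri brain',
        'Reason for Exam': 'chronic headache, rule out lesion',
    }
}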
109 changes: 109 additions & 0 deletions lambdas/rule_processing/index.py
@@ -0,0 +1,109 @@
import json
import time
from configparser import ConfigParser
import psycopg2
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def config():
    db = {
        "host": "mri-dev.cluster-cfp0qt1gjcdb.ca-central-1.rds.amazonaws.com",
        "dbname": "rules",
        "user": "postgres",
        "password": ".Medicine01"
    }
    return db

insert_cmd = """
INSERT INTO data_results(id, info, init_priority) VALUES
(%s, %s, %s)
ON CONFLICT (id) DO UPDATE
SET info = excluded.info,
init_priority = excluded.init_priority;
"""

update_sys_priority = """
UPDATE data_results
SET sys_priority = %s
WHERE id = %s;
"""

update_cmd = """
UPDATE data_results
SET rules_id = r.id , sys_priority = r.priority
FROM mri_rules r WHERE r.id = (
SELECT id
FROM mri_rules, to_tsquery('%s') query
WHERE info_weighted_tk @@ query
"""

update_cmd_end = """
ORDER BY ts_rank_cd('{0.1, 0.2, 0.4, 1.0}',info_weighted_tk, query, 1) DESC LIMIT 1)
AND data_results.id = '%s'
RETURNING r.id, r.priority;
"""

def searchAnatomy(data):
    anatomy_list = []
    for val in data:
        # for any values that have multiple words (tsquery limitation)
        val = val.replace(' ', ' | ')
        anatomy_list.append(val)

    body_parts = ' | '.join(anatomy_list)
    anatomy_cmd = 'AND bp_tk @@ to_tsquery(\'' + body_parts + '\')'
    return anatomy_cmd

def searchText(data, *data_keys):
    value_set = set()
    for key in data_keys:
        for val in data[key]:
            for word in val.split():
                value_set.add(word)
    return ' | '.join(value_set)

""" Connect to the PostgreSQL database server """
try:
# read the connection parameters
params = config()
logger.info("Trying to connect to postgresql")
logger.info(params)
# connect to the PostgreSQL server
conn = psycopg2.connect(**params)
logger.info("Success, connected to PostgreSQL!")
except (Exception, psycopg2.DatabaseError) as error:
logger.info(error)

def lambda_handler(event, context):
    logger.info(event)
    v = event
    ret = []  # rule match result; stays empty if no rule applies
    with conn.cursor() as cur:
        # insert into data_results one by one
        if "anatomy" not in v.keys():
            # No anatomy found => sys_priority = P99
            try:
                logger.info("No anatomy found for CIO ID: %s", v["CIO_ID"])
                cur.execute(insert_cmd, (v["CIO_ID"], json.dumps(v), v["priority"]))
                cur.execute(update_sys_priority, ('P99', v["CIO_ID"]))
            except psycopg2.IntegrityError as err:
                logger.info("Exception: %s", err)
        else:
            anatomy_str = searchAnatomy(v["anatomy"])
            info_str = searchText(v, "anatomy", "medical_condition", "diagnosis", "symptoms", "phrases", "other_info")
            command = (update_cmd % info_str) + anatomy_str + (update_cmd_end % v["CIO_ID"])
            try:
                cur.execute(insert_cmd, (v["CIO_ID"], json.dumps(v), v["priority"]))
                cur.execute(command)
                ret = cur.fetchall()
                if not ret:
                    # no rule matched => sys_priority = P98
                    cur.execute(update_sys_priority, ('P98', v["CIO_ID"]))
            except psycopg2.IntegrityError as err:
                logger.info("Exception: %s", err)
        # commit the changes
        conn.commit()
    if not ret:
        return {"rule_id": None, "priority": None}
    return {"rule_id": ret[0][0], "priority": ret[0][1]}



Binary file added layers/psycopg2.zip
Binary file not shown.
Binary file added layers/spellchecker.zip
Binary file not shown.
23 changes: 14 additions & 9 deletions preprocess/preprocess_data.py
@@ -6,9 +6,9 @@
 
 start = time.time()
 
-data_df = pd.read_csv('../csv/requisition_data.csv', skip_blank_lines=True)
+data_df = pd.read_csv('./csv/req_data_june.csv', skip_blank_lines=True).fillna({'Req # CIO': '-1'})
 # Format columns that don't need comprehend medical and preprocess the text
-data_df['CIO_ID'] = data_df['Req # CIO'].astype(int)
+data_df['CIO_ID'] = data_df['Req # CIO']
 data_df['age'] = data_df['DOB \r\n(yyyy-mm-dd)'].apply(dob2age)
 data_df['height'] = data_df['Height \r\n(eg: ft.in)'] + \
     ' ' + data_df['INCH - CM']
@@ -23,12 +23,12 @@
 
 # New Dataframe with
 formatted_df = data_df[['CIO_ID', 'height', 'weight', 'Sex','age', 'Preferred MRI Site', 'priority']]
-formatted_df['medical_condition'] = ''
-formatted_df['diagnosis'] = ''
-formatted_df['anatomy'] = ''
-formatted_df['symptoms'] = ''
-formatted_df['phrases'] = ''
-formatted_df['other_info'] = ''
+formatted_df.loc[:,'medical_condition'] = ''
+formatted_df.loc[:,'diagnosis'] = ''
+formatted_df.loc[:,'anatomy'] = ''
+formatted_df.loc[:,'symptoms'] = ''
+formatted_df.loc[:,'phrases'] = ''
+formatted_df.loc[:,'other_info'] = ''
 
 for row in range(len(formatted_df.index)):
     print("row is :", row)
@@ -39,6 +39,11 @@
     key_phrases = []
     other_info = []
 
+    # Create a UUID for CIO ID if there isn't one already
+    formatted_df.loc[row,'CIO_ID'] = findId(formatted_df['CIO_ID'][row])
+    # Change Unidentified priority to code U/I
+    formatted_df.loc[row,'priority'] = findUnidentified(formatted_df['priority'][row])
+
     # Parse the Exam Requested Column into Comprehend Medical to find Anatomy Entities
     anatomy_json = find_all_entities(anatomySpelling(f'{data_df["Exam Requested"][row]}'))
     preprocessed_text = preProcessAnatomy(f'{data_df["Reason for Exam/Relevant Clinical History"][row]}')
@@ -64,6 +69,6 @@
     formatted_df['phrases'][row] = key_phrases
     formatted_df['other_info'][row] = other_info
 
-formatted_df.to_json('../sample_output.json', orient='index')
+formatted_df.to_json('sample_output.json', orient='index')
 
 print( f'---{time.time()-start}---')
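
A note on the .loc change above: formatted_df is built by selecting columns from data_df, so plain column assignment (formatted_df['col'] = '') can trigger pandas' SettingWithCopyWarning, while .loc makes the assignment target explicit. A toy illustration (not project data):

# Toy illustration of the .loc pattern used above
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
sub = df[['a']]            # column selection returns a new frame
sub.loc[:, 'c'] = ''       # explicit column assignment, as in the diff above
sub.loc[0, 'c'] = 'x'      # row-wise write, mirroring formatted_df.loc[row, ...]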
13 changes: 12 additions & 1 deletion preprocess/preprocess_helper.py
@@ -4,12 +4,23 @@
 import string
 import pandas as pd
 from spellchecker import SpellChecker
+import uuid
 
 compr = boto3.client(service_name='comprehend')
 compr_m = boto3.client(service_name='comprehendmedical')
 
 spell = SpellChecker()
-spell.word_frequency.load_text_file('./wordbank.txt')
+spell.word_frequency.load_text_file('./preprocess/wordbank.txt')
+
+def findId(val):
+    if val == '-1':
+        return str(uuid.uuid4())
+    return val
+
+def findUnidentified(val):
+    if val.lower() == 'unidentified':
+        return 'U/I'
+    return val
 
 def convert2CM(height):
     if not isinstance(height, str):
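
For reference, the expected behaviour of the two new helpers (the UUID shown is made up):

findId('-1')                      # -> a fresh UUID string, e.g. '3f2b8c61-...'
findId('1234567')                 # -> '1234567' (existing CIO IDs pass through)
findUnidentified('Unidentified')  # -> 'U/I'
findUnidentified('P4')            # -> 'P4' (other priorities unchanged)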
