Commit

Added U/I priority (unidentified), ran with new dataset using auto-populating UUID.

Added lambdas/ and layers/ folders for AWS Lambda function integration, as well as template.yaml
jhou98 committed Jul 15, 2020
1 parent d205b17 commit d25e482
Showing 16 changed files with 66,078 additions and 17,663 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -4,7 +4,7 @@
 - [preprocess_data.py](/preprocess/preprocess_data.py): Main python script to preprocess MRI data
 - [preprocess_helper.py](/preprocess/preprocess_helper.py): Helper functions
 - [sample_output.json](sample_output.json): Sample JSON output for first 100 rows of data of sample.csv
-- To create a sample_output.json, in the terminal run `python preprocess_data.py`
+- To create a sample_output.json, in the terminal run `python .\preprocess\preprocess_data.py`
 
 ## Generating P-Value
 - [init_db.sql](init_db.sql): Script to re-initialize database (wipes out any previous data!)
@@ -13,7 +13,7 @@
 - [config.py](/rule_processing/config.py): Connecting to the postgres database with a custom database.ini file
 - To initialize the database, run the init_db.sql script in the terminal using `psql -U {username} -h {host} -f init_db.sql`. _Note this does assume you have a database named rules._
 - To update the weighted rule tokens, run `python update_weights.py` in the terminal
-- To apply the rules and obtain a Rule ID + P-Value, run `python rules.py` in the terminal
+- To apply the rules and obtain a Rule ID + P-Value, run `python .\rule_processing\rules.py` in the terminal
 
 ## Rule Database Analysis
-- [mri_sample_results.xlsx](/csv/mri_sample_results.xlsx): Form to P-Value Data
+- [Sample Result](/csv/mri_sample_results_0.2.xlsx): Form to P-Value Data (most recent)
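
For context on the config.py / database.ini pattern mentioned above, a minimal sketch of the usual ConfigParser approach (section and key names here are assumptions, not taken from the repository):

# Hypothetical sketch of a database.ini reader (names assumed, not verbatim from config.py)
from configparser import ConfigParser

def config(filename='database.ini', section='postgresql'):
    # read host/dbname/user/password from the given section
    parser = ConfigParser()
    parser.read(filename)
    if not parser.has_section(section):
        raise Exception(f'Section {section} not found in {filename}')
    return dict(parser.items(section))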
Binary file added csv/mri_sample_results_0.2.xlsx
Binary file not shown.
3,489 changes: 3,489 additions & 0 deletions csv/req_data_june.csv

Large diffs are not rendered by default.

File renamed without changes.
174 changes: 87 additions & 87 deletions init_db.sql

Large diffs are not rendered by default.

49 changes: 40 additions & 9 deletions mri_preprocess_lambda.py → lambdas/preprocess/index.py
@@ -6,13 +6,18 @@
 import string
 import string
 import logging
+import uuid
 from datetime import datetime, date
 from spellchecker import SpellChecker
 
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 compr = boto3.client(service_name='comprehend')
 compr_m = boto3.client(service_name='comprehendmedical')
+lambda_client = boto3.client('lambda')
+
+RuleProcessingLambdaName = os.getenv('RULE_PROCESSING_LAMBDA')
+
 spell = SpellChecker()
 
 def convert2CM(height):
@@ -217,18 +222,29 @@ def lambda_handler(event, context):
         logger.error( 'Missing parameters')
         return {'result': False, 'msg': 'Missing parameters' }
 
-    data_df = event['body']
+    #data_df = json.loads(event['body']) # use for postman tests
+    data_df = event['body'] # use for console tests
     logger.info(data_df)
 
+    if 'ReqCIO' not in data_df or not data_df['ReqCIO']:
+        data_df['CIO_ID'] = str(uuid.uuid4())
+    else:
+        data_df['CIO_ID'] = (data_df['ReqCIO'])
+
+    if data_df['Radiologist Priority'].lower() == 'unidentified':
+        data_df['priority'] = 'U/I'
+    else:
+        data_df['priority'] = data_df['Radiologist Priority']
+
     # Format columns that don't need comprehend medical and preprocess the text
-    data_df['CIO_ID'] = int(data_df['Req # CIO'])
-    data_df['age'] = dob2age(data_df['DOB \r\n(yyyy-mm-dd)'])
-    data_df['height'] = data_df['Height \r\n(eg: ft.in)'] + \
-        ' ' + data_df['INCH - CM']
-    data_df['weight'] = data_df['Weight'] + ' ' + data_df['KG - LBS']
-    data_df['priority'] = data_df['Radiologist Priority']
+    data_df['age'] = dob2age(data_df['DOB'])
+    data_df['height'] = data_df['Height'] + \
+        ' ' + data_df['inch-cm']
+    data_df['weight'] = data_df['Weight'] + ' ' + data_df['kg-lbs']
     data_df['height'] = convert2CM(data_df['height'])
     data_df['weight'] = convert2KG(data_df['weight'])
-    data_df['Exam Requested'] = preProcessText(data_df['Exam Requested (Free Text)'])
-    data_df['Reason for Exam/Relevant Clinical History'] = preProcessText(data_df['Reason for Exam/Relevant Clinical History (Free Text)'])
+    data_df['Exam Requested'] = preProcessText(data_df['Exam Requested'])
+    data_df['Reason for Exam/Relevant Clinical History'] = preProcessText(data_df['Reason for Exam'])
     # data_df['Spine'] = preProcessText(data_df['Appropriateness Checklist - Spine'])
     # data_df['Hip & Knee'] = preProcessText(data_df['Appropriateness Checklist - Hip & Knee'])

@@ -273,3 +289,18 @@
 
     #formatted_df.to_json('sample_output.json', orient='index')
     print("output is: ", formatted_df)
+
+    response = lambda_client.invoke(
+        FunctionName=RuleProcessingLambdaName,
+        InvocationType='RequestResponse',
+        Payload=json.dumps(formatted_df)
+    )
+
+    data = json.loads(response['Payload'].read())
+
+    response = {
+        'result': data,
+        'context': formatted_df
+    }
+
+    return response
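
For a quick console test of the handler above, an event shaped like the following exercises both new branches (field names are taken from this diff; the values are illustrative, and any required fields not visible in the hunks are omitted):

# Illustrative console test event (made-up values; field names match the hunks above)
event = {
    'body': {
        'ReqCIO': '',                            # empty -> CIO_ID auto-populated via uuid.uuid4()
        'Radiologist Priority': 'Unidentified',  # -> mapped to 'U/I'
        'DOB': '1980-01-01',
        'Height': '5.9',
        'inch-cm': 'INCH',
        'Weight': '160',
        'kg-lbs': 'LBS',
        'Exam Requested': 'mri brain',
        'Reason for Exam': 'chronic headache, rule out lesion',
    }
}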
109 changes: 109 additions & 0 deletions lambdas/rule_processing/index.py
@@ -0,0 +1,109 @@
import json
import time
from configparser import ConfigParser
import psycopg2
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def config():
    db = {
        "host": "mri-dev.cluster-cfp0qt1gjcdb.ca-central-1.rds.amazonaws.com",
        "dbname": "rules",
        "user": "postgres",
        "password": ".Medicine01"
    }
    return db

insert_cmd = """
INSERT INTO data_results(id, info, init_priority) VALUES
(%s, %s, %s)
ON CONFLICT (id) DO UPDATE
SET info = excluded.info,
init_priority = excluded.init_priority;
"""

update_sys_priority = """
UPDATE data_results
SET sys_priority = %s
WHERE id = %s;
"""

update_cmd = """
UPDATE data_results
SET rules_id = r.id , sys_priority = r.priority
FROM mri_rules r WHERE r.id = (
SELECT id
FROM mri_rules, to_tsquery('%s') query
WHERE info_weighted_tk @@ query
"""

update_cmd_end = """
ORDER BY ts_rank_cd('{0.1, 0.2, 0.4, 1.0}',info_weighted_tk, query, 1) DESC LIMIT 1)
AND data_results.id = '%s'
RETURNING r.id, r.priority;
"""

def searchAnatomy(data):
    anatomy_list = []
    for val in data:
        # for any values that have multiple words (tsquery limitation)
        val = val.replace(' ', ' | ')
        anatomy_list.append(val)

    body_parts = ' | '.join(anatomy_list)
    anatomy_cmd = 'AND bp_tk @@ to_tsquery(\'' + body_parts + '\')'
    return anatomy_cmd

def searchText(data, *data_keys):
    value_set = set()
    for key in data_keys:
        for val in data[key]:
            for word in val.split():
                value_set.add(word)
    return ' | '.join(value_set)

""" Connect to the PostgreSQL database server """
try:
# read the connection parameters
params = config()
logger.info("Trying to connect to postgresql")
logger.info(params)
# connect to the PostgreSQL server
conn = psycopg2.connect(**params)
logger.info("Success, connected to PostgreSQL!")
except (Exception, psycopg2.DatabaseError) as error:
logger.info(error)

def lambda_handler(event, context):
    logger.info(event)
    v = event
    ret = []  # rule match result; stays empty if no rule applies
    with conn.cursor() as cur:
        # insert into data_results one by one
        if "anatomy" not in v.keys():
            # No anatomy found => sys_priority = P99
            try:
                logger.info("No anatomy found for CIO ID: %s", v["CIO_ID"])
                cur.execute(insert_cmd, (v["CIO_ID"], json.dumps(v), v["priority"]))
                cur.execute(update_sys_priority, ('P99', v["CIO_ID"]))
            except psycopg2.IntegrityError as err:
                logger.info("Exception: %s", err)
        else:
            anatomy_str = searchAnatomy(v["anatomy"])
            info_str = searchText(v, "anatomy", "medical_condition", "diagnosis", "symptoms", "phrases", "other_info")
            command = (update_cmd % info_str) + anatomy_str + (update_cmd_end % v["CIO_ID"])
            try:
                cur.execute(insert_cmd, (v["CIO_ID"], json.dumps(v), v["priority"]))
                cur.execute(command)
                ret = cur.fetchall()
                if not ret:
                    # no rule matched => sys_priority = P98
                    cur.execute(update_sys_priority, ('P98', v["CIO_ID"]))
            except psycopg2.IntegrityError as err:
                logger.info("Exception: %s", err)
        # commit the changes
        conn.commit()
    if not ret:
        return {"rule_id": None, "priority": None}
    return {"rule_id": ret[0][0], "priority": ret[0][1]}



Binary file added layers/psycopg2.zip
Binary file not shown.
Binary file added layers/spellchecker.zip
Binary file not shown.
23 changes: 14 additions & 9 deletions preprocess/preprocess_data.py
@@ -6,9 +6,9 @@
 
 start = time.time()
 
-data_df = pd.read_csv('../csv/requisition_data.csv', skip_blank_lines=True)
+data_df = pd.read_csv('./csv/req_data_june.csv', skip_blank_lines=True).fillna({'Req # CIO': '-1'})
 # Format columns that don't need comprehend medical and preprocess the text
-data_df['CIO_ID'] = data_df['Req # CIO'].astype(int)
+data_df['CIO_ID'] = data_df['Req # CIO']
 data_df['age'] = data_df['DOB \r\n(yyyy-mm-dd)'].apply(dob2age)
 data_df['height'] = data_df['Height \r\n(eg: ft.in)'] + \
     ' ' + data_df['INCH - CM']
@@ -23,12 +23,12 @@
 
 # New Dataframe with
 formatted_df = data_df[['CIO_ID', 'height', 'weight', 'Sex','age', 'Preferred MRI Site', 'priority']]
-formatted_df['medical_condition'] = ''
-formatted_df['diagnosis'] = ''
-formatted_df['anatomy'] = ''
-formatted_df['symptoms'] = ''
-formatted_df['phrases'] = ''
-formatted_df['other_info'] = ''
+formatted_df.loc[:,'medical_condition'] = ''
+formatted_df.loc[:,'diagnosis'] = ''
+formatted_df.loc[:,'anatomy'] = ''
+formatted_df.loc[:,'symptoms'] = ''
+formatted_df.loc[:,'phrases'] = ''
+formatted_df.loc[:,'other_info'] = ''
 
 for row in range(len(formatted_df.index)):
     print("row is :", row)
@@ -39,6 +39,11 @@
     key_phrases = []
     other_info = []
 
+    # Create a UUID for CIO ID if there isn't one already
+    formatted_df.loc[row,'CIO_ID'] = findId(formatted_df['CIO_ID'][row])
+    # Change Unidentified priority to code U/I
+    formatted_df.loc[row,'priority'] = findUnidentified(formatted_df['priority'][row])
+
     # Parse the Exam Requested Column into Comprehend Medical to find Anatomy Entities
     anatomy_json = find_all_entities(anatomySpelling(f'{data_df["Exam Requested"][row]}'))
     preprocessed_text = preProcessAnatomy(f'{data_df["Reason for Exam/Relevant Clinical History"][row]}')
@@ -64,6 +69,6 @@
     formatted_df['phrases'][row] = key_phrases
     formatted_df['other_info'][row] = other_info
 
-formatted_df.to_json('../sample_output.json', orient='index')
+formatted_df.to_json('sample_output.json', orient='index')
 
 print( f'---{time.time()-start}---')
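
A note on the .loc change above: formatted_df is built by selecting columns from data_df, so plain column assignment (formatted_df['col'] = '') can trigger pandas' SettingWithCopyWarning, while .loc makes the assignment target explicit. A toy illustration (not project data):

# Toy illustration of the .loc pattern used above
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
sub = df[['a']]            # column selection returns a new frame
sub.loc[:, 'c'] = ''       # explicit column assignment, as in the diff above
sub.loc[0, 'c'] = 'x'      # row-wise write, mirroring formatted_df.loc[row, ...]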
13 changes: 12 additions & 1 deletion preprocess/preprocess_helper.py
@@ -4,12 +4,23 @@
 import string
 import pandas as pd
 from spellchecker import SpellChecker
+import uuid
 
 compr = boto3.client(service_name='comprehend')
 compr_m = boto3.client(service_name='comprehendmedical')
 
 spell = SpellChecker()
-spell.word_frequency.load_text_file('./wordbank.txt')
+spell.word_frequency.load_text_file('./preprocess/wordbank.txt')
+
+def findId(val):
+    if val == '-1':
+        return str(uuid.uuid4())
+    return val
+
+def findUnidentified(val):
+    if val.lower() == 'unidentified':
+        return 'U/I'
+    return val
 
 def convert2CM(height):
     if not isinstance(height, str):
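
For reference, the expected behaviour of the two new helpers (the UUID shown is made up):

findId('-1')                      # -> a fresh UUID string, e.g. '3f2b8c61-...'
findId('1234567')                 # -> '1234567' (existing CIO IDs pass through)
findUnidentified('Unidentified')  # -> 'U/I'
findUnidentified('P4')            # -> 'P4' (other priorities unchanged)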
