-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathbaselines.py
130 lines (114 loc) · 5.05 KB
/
baselines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""Classical baselines (logistic regression, random forest) for the
mortality / readmission / llos prediction tasks, trained on combinations
of note vectors, temporal features and demographics."""
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from gensim.models.doc2vec import Doc2Vec
import argparse
import json
import os
import time
import warnings
from utils import cal_metric, get_ids, text2words
# Suppress library warnings (presumably sklearn/gensim deprecation
# notices) to keep the console output readable during grid search.
warnings.filterwarnings('ignore')
def parse_args(argv=None):
    """Parse command-line options for the baseline experiments.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            keeps the original behavior of reading ``sys.argv[1:]``;
            passing a list makes the function testable.

    Returns:
        argparse.Namespace with ``task``, ``model`` and ``inputs``.
    """
    parser = argparse.ArgumentParser()
    # Prediction target: mortality, readmit, or llos.
    parser.add_argument('--task', type=str, default='mortality')
    # Which baseline(s) to run: all, lr, or rf.
    parser.add_argument('--model', type=str, default='all')
    # 3-bit mask over input sources: 3: T + S, 4: U, 7: U + T + S.
    parser.add_argument('--inputs', type=int, default=4)
    args = parser.parse_args(argv)
    return args
def train_test_base(X_train, X_test, y_train, y_test, name):
    """Grid-search, fit and evaluate one baseline classifier.

    Args:
        X_train, X_test: 2-D feature matrices.
        y_train, y_test: label matrices of shape (n, k); k > 1 means a
            multi-label task, k == 1 a plain binary task.
        name: 'lr' for logistic regression, anything else (i.e. 'rf')
            for random forest.

    Returns:
        The list of per-label metrics (multi-label) or the single metric
        (binary), as produced by ``cal_metric``.
    """
    mtl = 1 if y_test.shape[1] > 1 else 0  # multi-label task?
    if name == 'lr':
        print('Start training Logistic Regression:')
        # liblinear supports both penalties searched below; the modern
        # sklearn default (lbfgs) rejects penalty='l1' and would make
        # the grid search fail.
        model = LogisticRegression(solver='liblinear')
        param_grid = {
            'penalty': ['l1', 'l2']
        }
    else:
        print('Start training Random Forest:')
        model = RandomForestClassifier()
        param_grid = {
            'n_estimators': [x for x in range(20, 40, 5)],
            'max_depth': [None, 20, 40, 60, 80, 100]
        }
    if mtl:
        # One binary classifier per label.  The wrapper exposes the
        # inner model's hyper-parameters under the `estimator__` prefix,
        # so the grid keys must be renamed or GridSearchCV raises
        # "Invalid parameter".
        model = OneVsRestClassifier(model)
        param_grid = {'estimator__' + k: v for k, v in param_grid.items()}
    else:
        # Binary task: collapse the (n, 1) label matrices to vectors.
        y_train, y_test = y_train[:, 0], y_test[:, 0]
    t0 = time.time()
    gridsearch = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)
    gridsearch.fit(X_train, y_train)
    model = gridsearch.best_estimator_
    t1 = time.time()
    print('Running time:', t1 - t0)
    probs = model.predict_proba(X_test)
    if mtl:
        # Renamed from `metrics` to stop shadowing the module-level
        # `from sklearn import metrics` import.
        label_metrics = []
        for idx in range(y_test.shape[1]):
            metric = cal_metric(y_test[:, idx], probs[:, idx])
            print(idx + 1, metric)
            label_metrics.append(metric)
        print('Avg', np.mean(label_metrics, axis=0).tolist())
        return label_metrics
    # Binary task: score the positive-class probability column.
    metric = cal_metric(y_test, probs[:, 1])
    print(metric)
    return metric
if __name__ == '__main__':
    args = parse_args()
    task = args.task
    model = args.model
    inputs = args.inputs
    print('Running task %s using inputs %d...' % (task, inputs))
    # Admission ids for the train/test split, restricted to admissions
    # that actually have labels for this task.
    train_ids, _, test_ids = get_ids('data/processed/files/splits.json')
    df = pd.read_csv('data/processed/%s.csv' % task).sort_values('hadm_id')
    train_ids = np.intersect1d(train_ids, df['hadm_id'].tolist())
    test_ids = np.intersect1d(test_ids, df['hadm_id'].tolist())
    # Three-bit mask over feature groups: bit 2 = notes (U),
    # bit 1 = temporal (T), bit 0 = demographics (S).
    choices = '{0:b}'.format(inputs).rjust(3, '0')
    X_train, X_test = [], []
    if choices[0] == '1':
        print('Loading notes...')
        # Average each admission's note vectors into one feature vector.
        vector_dict = json.load(open('data/processed/files/vector_dict.json'))
        X_train.append([np.mean(vector_dict.get(hid, []), axis=0) for hid in train_ids])
        X_test.append([np.mean(vector_dict.get(hid, []), axis=0) for hid in test_ids])
    if choices[1] == '1':
        print('Loading temporal data...')
        df_temporal = pd.read_csv('data/processed/features.csv').drop('charttime', axis=1)
        temporal_mm_dict = json.load(open('data/processed/files/feature_mm_dict.json'))
        # Min-max normalise each feature column with its stored bounds.
        for col in df_temporal.columns[1:]:
            lo, hi = temporal_mm_dict[col]
            df_temporal[col] = (df_temporal[col] - lo) / (hi - lo)
        # Collapse each admission's time series into summary statistics.
        df_temporal = df_temporal.groupby('hadm_id').agg(['mean', 'count', 'max', 'min', 'std'])
        df_temporal.columns = ['_'.join(col).strip() for col in df_temporal.columns.values]
        df_temporal.fillna(0, inplace=True)
        df_temporal = df_temporal.reset_index().sort_values('hadm_id')
        feat_cols = df_temporal.columns[1:]
        in_train = df_temporal['hadm_id'].isin(train_ids)
        in_test = df_temporal['hadm_id'].isin(test_ids)
        X_train.append(df_temporal[in_train][feat_cols].to_numpy())
        X_test.append(df_temporal[in_test][feat_cols].to_numpy())
    if choices[2] == '1':
        print('Loading demographics...')
        demo_json = json.load(open('data/processed/files/demo_dict.json'))
        df_demo = pd.DataFrame(demo_json.items(), columns=['hadm_id', 'demos']).sort_values('hadm_id')
        X_train.append(df_demo[df_demo['hadm_id'].isin(train_ids)][['demos']].to_numpy())
        X_test.append(df_demo[df_demo['hadm_id'].isin(test_ids)][['demos']].to_numpy())
    print('Done.')
    # Labels: every column after hadm_id (one column = binary task,
    # several columns = multi-label task).
    df_cols = df.columns[1:]
    X_train = np.concatenate(X_train, axis=1)
    X_test = np.concatenate(X_test, axis=1)
    y_train = df[df['hadm_id'].isin(train_ids)][df_cols].to_numpy()
    y_test = df[df['hadm_id'].isin(test_ids)][df_cols].to_numpy()
    # 'all' runs both baselines, otherwise only the requested one.
    for clf_name in (['lr', 'rf'] if model == 'all' else [model]):
        train_test_base(X_train, X_test, y_train, y_test, clf_name)