-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHeart_Attack_Predictions.py
278 lines (226 loc) · 9.06 KB
/
Heart_Attack_Predictions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# -*- coding: utf-8 -*-
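"""
Heart attack prediction training script.

Loads ``Dataset/heart.csv``, cleans and imputes the data, selects features,
compares several scaler/classifier pipelines, tunes a logistic-regression
pipeline with GridSearchCV and pickles the best estimator to
``Models/HAP_App_model.pkl``.
"""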
#%% Module
import os
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
#%%
def cramers_corrected_stat(cmx):
    """Return the bias-corrected Cramér's V for a contingency table.

    Measures categorical-categorical association using the correction from
    Bergsma and Wicher, Journal of the Korean Statistical Society 42 (2013):
    323-328.

    Parameters
    ----------
    cmx : array-like of shape (r, k)
        Contingency table of observed counts, e.g. the result of
        ``pd.crosstab(a, b).to_numpy()``.

    Returns
    -------
    float
        The corrected Cramér's V. ``0.0`` is returned for degenerate tables
        (fewer than two observations, or a corrected row/column count that
        leaves no degrees of freedom, such as a single row or column), for
        which the statistic would otherwise be a division by zero.
    """
    cmx = np.asarray(cmx)
    n = cmx.sum()
    r, k = cmx.shape

    # (n - 1) is a divisor in every correction term below.
    if n <= 1:
        return 0.0

    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min(kcorr - 1, rcorr - 1)

    # A one-row / one-column table (or a very sparse one) gives a
    # non-positive denominator; report "no association" instead of NaN.
    if denominator <= 0:
        return 0.0

    chi2 = ss.chi2_contingency(cmx)[0]
    phi2 = chi2 / n
    # Clamp at zero: the bias correction can push phi2 slightly negative.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    return np.sqrt(phi2corr / denominator)
#%% Constant
# Location of the raw dataset. The path is built from the current working
# directory, so the script must be started from the folder that contains
# the `Dataset` directory.
CSV_PATH = os.path.join(
    os.getcwd(),
    'Dataset',
    'heart.csv',
)

#%% Step 1) Data Loading
# Read the whole CSV into a DataFrame; every later step works on `df`.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
#%% Step 2) Data Inspection
df.info()
des = df.describe().T  # transposed summary statistics, one row per column
print(df.columns)

# Columns treated as categorical (this list includes the target 'output');
# every remaining column is treated as continuous.
cat_col = ['sex','cp','fbs','restecg','exng','slp','caa','thall','output']
con_col = df.drop(labels=cat_col, axis=1).columns

# Categorical Visualization
for col in cat_col:
    print(col)
    plt.figure()
    # Pass the series as `x=`: a positional argument is interpreted as
    # `data` by recent seaborn versions.
    sns.countplot(x=df[col])
    plt.show()

# Continuous Visualization
for col in con_col:
    # `displot` is figure-level and creates its own figure, so no
    # `plt.figure()` call is made here (it would leave an empty figure).
    sns.displot(df[col])
    plt.show()

df.boxplot(figsize=(20, 5))
plt.show()

# To check NaNs
print(df.isna().sum())
# - In the author's run this reported no NaNs. Missing values in this dataset
#   are encoded as sentinel codes instead and are handled in Step 3.
#%% Step 3) Data Cleaning
# NaNs
# Missing values are masked with sentinel codes: 4 in `caa` and 0 in `thall`.
# Assign the result back instead of calling `replace(..., inplace=True)` on
# the column selection, which is a chained in-place update and is not
# guaranteed to modify `df` (pandas Copy-on-Write).
df['caa'] = df['caa'].replace(4, np.nan)
df['thall'] = df['thall'].replace(0, np.nan)

# To check the NaNs
print(df.isna().sum())

# KNN Imputation
# NOTE(review): the imputer fills a gap with an average over neighbouring
# rows, so imputed `caa` / `thall` values can be fractional even though both
# columns are treated as categorical later. The target column `output` also
# takes part in the neighbour search. Confirm this is intended.
columns_names = df.columns
knn_i = KNNImputer()
# fit_transform returns a numpy array, so rebuild the DataFrame with the
# original column names.
df = pd.DataFrame(knn_i.fit_transform(df), columns=columns_names)

# To check whether there is any duplicated row in the dataset
# (the author's run reported 1 duplicated row).
print(df.duplicated().sum())

# Removing duplicates
df = df.drop_duplicates()
#%% Step 4) Features selection
y = df['output']
selected_features = []

# Continuous feature vs categorical target: fit a one-feature logistic
# regression and keep the feature when its accuracy is at least 0.5.
# NOTE(review): for a two-class target 0.5 is a very permissive threshold;
# confirm it is the intended cut-off.
for col in con_col:
    lr = LogisticRegression()
    feature = np.expand_dims(df[col], axis=-1)  # shape (n_samples, 1)
    lr.fit(feature, y)
    score = lr.score(feature, y)  # computed once, reused below
    print(col)
    print(score)
    if score >= 0.5:
        selected_features.append(col)
print(selected_features)
# In the author's run the continuous features that passed were
# age, trtbps, chol, thalachh, oldpeak.

# Categorical feature vs categorical target: keep the feature when its
# corrected Cramér's V against the target is at least 0.4.
for col in cat_col:
    if col == 'output':
        # The target is not a feature; comparing it with itself would always
        # pass the threshold and put it in the feature list.
        continue
    cmx = pd.crosstab(df[col], y).to_numpy()
    cramers_v = cramers_corrected_stat(cmx)
    print(col)
    print(cramers_v)
    if cramers_v >= 0.4:
        selected_features.append(col)
print(selected_features)
# In the author's run the categorical features that passed were
# cp, exng, slp, caa, thall.

# To visualize the correlation using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), cmap='RdBu')
plt.show()

#%% Step 5) Data Preprocessing
# Keep only the features selected in Step 4 (in the author's run: age,
# trtbps, chol, thalachh, oldpeak, cp, exng, slp, caa, thall) plus the target.
# NOTE(review): the features were selected on the full dataset, before the
# train/test split below.
df = df.loc[:, selected_features + ['output']]
X = df.drop(labels='output', axis=1)
y = df['output']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=123)
#%% Model Development
# One pipeline per (scaler, classifier) pair: each of the five classifiers is
# combined with MinMaxScaler ("mms") and with StandardScaler ("ss").

# KNN
pipeline_mms_knn = Pipeline(steps=[
    ('Min_Max_Scalar', MinMaxScaler()),
    ('KNN_Classifier', KNeighborsClassifier()),
])
pipeline_ss_knn = Pipeline(steps=[
    ('Standard_Scaler', StandardScaler()),
    ('KNN_Classifier', KNeighborsClassifier()),
])

# RandomForest
pipeline_mms_rf = Pipeline(steps=[
    ('Min_Max_Scalar', MinMaxScaler()),
    ('Forest_Classifier', RandomForestClassifier()),
])
pipeline_ss_rf = Pipeline(steps=[
    ('Standard_Scaler', StandardScaler()),
    ('Forest_Classifier', RandomForestClassifier()),
])

# Logistic Regression
pipeline_mms_lr = Pipeline(steps=[
    ('Min_Max_Scalar', MinMaxScaler()),
    ('Logistic_Classifier', LogisticRegression()),
])
pipeline_ss_lr = Pipeline(steps=[
    ('Standard_Scaler', StandardScaler()),
    ('Logistic_Classifier', LogisticRegression()),
])

# Decision Tree
pipeline_mms_dt = Pipeline(steps=[
    ('Min_Max_Scalar', MinMaxScaler()),
    ('Tree_Classifier', DecisionTreeClassifier()),
])
pipeline_ss_dt = Pipeline(steps=[
    ('Standard_Scaler', StandardScaler()),
    ('Tree_Classifier', DecisionTreeClassifier()),
])

# SVC
pipeline_mms_svc = Pipeline(steps=[
    ('Min_Max_Scalar', MinMaxScaler()),
    ('SVC_Classifier', SVC()),
])
pipeline_ss_svc = Pipeline(steps=[
    ('Standard_Scaler', StandardScaler()),
    ('SVC_Classifier', SVC()),
])

# All candidates, in the order in which they are fitted and scored.
pipelines = [
    pipeline_mms_knn, pipeline_ss_knn,
    pipeline_mms_rf, pipeline_ss_rf,
    pipeline_mms_lr, pipeline_ss_lr,
    pipeline_mms_dt, pipeline_ss_dt,
    pipeline_mms_svc, pipeline_ss_svc,
]
# Fit every candidate pipeline on the training split.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Score each pipeline exactly once on the test split and print the accuracy.
scores = [pipe.score(X_test, y_test) for pipe in pipelines]
for accuracy in scores:
    print(accuracy)

# `max` returns the first maximal element, so ties go to the pipeline that
# appears first in `pipelines`; `best_pipeline` is always bound.
# NOTE(review): the winner is picked using test-set accuracy, and the random
# forest / decision tree pipelines are built without a fixed random_state,
# so the ranking may vary between runs.
best_accuracy, best_pipeline = max(zip(scores, pipelines),
                                   key=lambda pair: pair[0])

print('The best scaler and classifier for HAP app is {},with accuracy of {}'.
      format(best_pipeline.steps, best_accuracy))
# In the author's run the best scaler and classifier for HAP app was
# [('Standard_Scaler', StandardScaler()), ('Logistic_Classifier', LogisticRegression())]
# with accuracy of 0.8461538461538461
#%% GridSearchCV
# To check the best parameters of the best model (StandardScaler +
# LogisticRegression in the author's run).
# A dedicated name is used here so that `pipeline_mms_lr`, the MinMaxScaler
# variant, is not rebound to a StandardScaler pipeline.
pipeline_ss_lr_grid = Pipeline([
    ('Standard_Scaler', StandardScaler()),
    ('Logistic_Classifier', LogisticRegression())
])

# Parameter names use the `<step name>__<parameter>` form so they are routed
# to the LogisticRegression step of the pipeline.
# NOTE(review): the `tol` candidates (1-5) are far larger than scikit-learn's
# default stopping tolerance for LogisticRegression; confirm these values
# are intended.
grid_param = [{'Logistic_Classifier__random_state': [None, 10, 15],
               'Logistic_Classifier__tol': [1, 2, 3, 5],
               'Logistic_Classifier__C': [1.0, 3.0, 5.0],
               'Logistic_Classifier__solver': ['newton-cg', 'lbfgs',
                                               'liblinear', 'sag', 'saga'],
               'Logistic_Classifier__intercept_scaling': [1, 2, 3]
               }]

grid_search = GridSearchCV(pipeline_ss_lr_grid, param_grid=grid_param, cv=5,
                           verbose=1, n_jobs=-1)
grid = grid_search.fit(X_train, y_train)

print(grid.best_score_)
# In the author's run the best score was 0.8529346622369879
print(grid.best_params_)
# In the author's run the best params were
# {'Logistic_Classifier__C': 5.0,
#  'Logistic_Classifier__intercept_scaling': 1,
#  'Logistic_Classifier__random_state': None,
#  'Logistic_Classifier__solver': 'saga',
#  'Logistic_Classifier__tol': 5}
print(grid.best_estimator_)
# In the author's run the best estimator was
# Pipeline(steps=[('Standard_Scaler', StandardScaler()),
#                 ('Logistic_Classifier',
#                  LogisticRegression(C=5.0,
#                                     solver='saga',
#                                     tol=5))])

# To check the accuracy of the tuned model on the held-out test split
y_pred = grid.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

# Plotting the confusion matrix
labels = ['0', '1']
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap=plt.cm.Blues)
plt.show()

# Print classification report
print(cr)
#%% Model Saving
# The tuned pipeline is pickled under `Models/` in the current working
# directory.
BEST_ESTIMATOR_SAVE_PATH = os.path.join(os.getcwd(),'Models',
                                        'HAP_App_model.pkl')

# Create the target directory first: opening the file for writing fails
# with FileNotFoundError when `Models/` does not exist yet.
os.makedirs(os.path.dirname(BEST_ESTIMATOR_SAVE_PATH), exist_ok=True)

with open(BEST_ESTIMATOR_SAVE_PATH, 'wb') as file:
    pickle.dump(grid.best_estimator_, file)