-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3-KNN.py
128 lines (92 loc) · 3.43 KB
/
3-KNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
################################################
# KNN
################################################
# KNN method --> "Tell me who your friends are and I'll tell you who you are."
# 1. Exploratory Data Analysis
# 2. Data Preprocessing & Feature Engineering
# 3. Modeling & Prediction
# 4. Model Evaluation
# 5. Hyperparameter Optimization
# 6. Final Model
# pandas for data handling; scikit-learn for the model, metrics and tuning.
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# Show every column and widen console output when inspecting DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
################################################
# 1. Exploratory Data Analysis
################################################
# Load the diabetes dataset; "Outcome" is the binary target
# (presumably 1 = diabetic — verify against the dataset docs).
# NOTE(review): relative path — assumes the script runs from the project root.
df = pd.read_csv("datasets/diabetes.csv")
df.head()  # first rows (REPL/notebook-style inspection; no visible effect as a plain script)
df.shape  # (rows, columns)
df.describe().T  # transposed summary statistics per feature
df["Outcome"].value_counts()  # class balance of the target
################################################
# 2. Data Preprocessing & Feature Engineering
################################################
# Separate the target vector from the feature matrix, then standardize the
# features — KNN is distance-based, so unscaled features would dominate.
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
################################################
# 3. Modeling & Prediction
################################################
# Fit KNN with its default hyperparameters on the full (scaled) dataset.
knn_model = KNeighborsClassifier().fit(X, y)
# Take one fixed observation (seeded for reproducibility) and predict its class.
random_user = X.sample(1, random_state=45)
knn_model.predict(random_user)
################################################
# 4. Model Evaluation
################################################
# NOTE(review): the scores directly below are computed on the TRAINING data,
# so they are optimistic; the cross-validation further down is the honest one.
# y_pred for the confusion matrix / classification report:
y_pred = knn_model.predict(X)
# y_prob: predicted probability of the positive class, needed for AUC:
y_prob = knn_model.predict_proba(X)[:, 1]
print(classification_report(y, y_pred))
# acc 0.83
# f1 0.74
# AUC
roc_auc_score(y, y_prob)
# 0.90
# 5-fold cross-validation for unbiased estimates of the same three metrics.
cv_results = cross_validate(knn_model, X, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()
cv_results['test_f1'].mean()
cv_results['test_roc_auc'].mean()
# 0.73
# 0.59
# 0.78
# Ways to improve these scores:
# 1. The sample size could be increased.
# 2. Data preprocessing
# 3. Feature engineering
# 4. Optimizations (hyperparameter tuning) for the algorithm in question.
# Inspect the current (default) hyperparameters before tuning.
knn_model.get_params()
################################################
# 5. Hyperparameter Optimization
################################################
# Fresh, unfitted estimator for the grid search.
knn_model = KNeighborsClassifier()
knn_model.get_params()
# Search n_neighbors over 2..49 — the only hyperparameter tuned here.
knn_params = {"n_neighbors": range(2, 50)}
knn_gs_best = GridSearchCV(knn_model,
knn_params,
cv=5,
n_jobs=-1,
verbose=1).fit(X, y)
# n_jobs=-1 uses all available CPU cores for the search.
knn_gs_best.best_params_  # default n_neighbors was 5; the search finds a better value (17) — we will refit with it
################################################
# 6. Final Model
################################################
# Refit KNN with the best n_neighbors found by the grid search; the interface
# (set_params) keeps the same estimator object, now properly tuned.
knn_final = knn_model.set_params(**knn_gs_best.best_params_).fit(X, y)

# Re-run 5-fold cross-validation on the tuned model for comparable scores.
cv_results = cross_validate(knn_final,
                            X,
                            y,
                            cv=5,
                            scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()
cv_results['test_f1'].mean()
cv_results['test_roc_auc'].mean()
# As the results show, the CV scores improved over the default model.
# Item 4 from the improvement list (algorithm-specific optimization) is now done.

# Predict for a single observation. Fix: seed the sample (random_state=45) so
# the result is reproducible and consistent with the sampling in step 3;
# the original `X.sample(1)` drew a different row on every run.
random_user = X.sample(1, random_state=45)
knn_final.predict(random_user)