-
Notifications
You must be signed in to change notification settings - Fork 0
/
experiment_fix_ratio.py
139 lines (115 loc) · 4.41 KB
/
experiment_fix_ratio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
from collections import defaultdict
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import safe_indexing
from datasets import load_adult
from datasets import load_cover_type
from datasets import load_diabetes
from datasets import load_mammography
from datasets import load_oil
from datasets import load_phoneme
from datasets import load_satimage
def _fit_score(pipe, param_grid, X, y, train_idx, test_idx, cv_idx):
    """Evaluate every parameter setting of a pipeline on one CV fold.

    For each entry of ``param_grid`` a fresh clone of ``pipe`` is
    configured, fitted on the training split and scored on both the
    training and the testing split.

    Parameters
    ----------
    pipe : Estimator
        A scikit-learn compatible pipeline. It is cloned for every
        parameter setting, so the object passed in is never fitted.

    param_grid : ParameterGrid
        Iterable of parameter dictionaries to apply with ``set_params``.

    X : ndarray, shape (n_samples, n_features)
        The full dataset.

    y : ndarray, shape (n_samples,)
        The associated target.

    train_idx : ndarray, (n_train_samples,)
        The training indexes.

    test_idx : ndarray, (n_test_samples,)
        The testing indexes.

    cv_idx : int
        The index of the fold; stored alongside each score.

    Returns
    -------
    cv_results : defaultdict of list
        One list per column: ``'auc_train_score'``, ``'auc_test_score'``,
        ``'bacc_train_score'``, ``'bacc_test_score'``, ``'cv_idx'`` and one
        column per parameter name. Each successfully fitted parameter
        setting contributes exactly one entry to every list.

    """
    X_train = safe_indexing(X, train_idx)
    y_train = y[train_idx]
    X_test = safe_indexing(X, test_idx)
    y_test = y[test_idx]

    fold_scores = defaultdict(list)
    for candidate in param_grid:
        model = clone(pipe).set_params(**candidate)
        try:
            model.fit(X_train, y_train)
        except ValueError:
            # A setting that cannot be fitted is skipped and leaves no row
            # in the results (presumably a sampling ratio the resampler
            # rejects for this fold -- TODO confirm).
            continue

        proba_train = model.predict_proba(X_train)
        proba_test = model.predict_proba(X_test)
        labels_train = model.predict(X_train)
        labels_test = model.predict(X_test)

        # Column 1 of the probability output is used for the AUC
        # (presumably the positive class of a binary problem).
        fold_metrics = (
            ('auc_train_score',
             roc_auc_score(y_train, proba_train[:, 1])),
            ('auc_test_score',
             roc_auc_score(y_test, proba_test[:, 1])),
            ('bacc_train_score',
             balanced_accuracy_score(y_train, labels_train)),
            ('bacc_test_score',
             balanced_accuracy_score(y_test, labels_test)),
            ('cv_idx', cv_idx),
        )
        for column, value in fold_metrics:
            fold_scores[column].append(value)
        # Record which parameter values produced the scores above.
        for param_name, param_value in candidate.items():
            fold_scores[param_name].append(param_value)

    return fold_scores
def _merge_dicts(d1, d2):
"""Merge two dictionaries."""
for k in d1.keys():
d1[k] += d2[k]
return d1
# Create the output directory before any computation starts: a location
# that cannot be written to then fails immediately instead of after a full
# repeated cross-validation run. ``exist_ok=True`` avoids the race between
# checking for the directory and creating it.
os.makedirs('results', exist_ok=True)

# For every dataset and every classifier, score a
# StandardScaler -> SMOTE -> classifier pipeline for each fixed SMOTE
# sampling ratio on a repeated stratified 10-fold split, then dump all
# per-fold scores to one CSV file per (dataset, classifier) pair.
for name, func_dataset in [
        ('adult', load_adult),
        ('cover_type', load_cover_type),
        ('diabetes', load_diabetes),
        ('mammography', load_mammography),
        ('oil', load_oil),
        ('phoneme', load_phoneme),
        ('satimage', load_satimage)]:
    X, y = func_dataset()
    for clf in [DecisionTreeClassifier(random_state=42),
                LogisticRegression(random_state=42)]:
        pipe = Pipeline(steps=[
            ('standardscaler', StandardScaler()),
            ('smote', SMOTE(random_state=42)),
            ('clf', clf)
        ])
        # Sampling ratios from 0.1 up to, but excluding, 1 in steps of 0.05.
        param_grid = ParameterGrid(
            {'smote__sampling_strategy': np.arange(0.1, 1, 0.05)}
        )
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=50,
                                     random_state=42)
        # One task per fold; each task evaluates the whole parameter grid.
        results = Parallel(n_jobs=-1)(
            delayed(_fit_score)(pipe, param_grid, X, y,
                                train_idx, test_idx, cv_idx)
            for cv_idx, (train_idx, test_idx) in enumerate(cv.split(X, y)))
        # Concatenate the per-fold results into a single dictionary. The
        # last fold is used as the accumulator, so its rows come first in
        # the output; the 'cv_idx' column identifies the fold of each row.
        cv_results = results[-1]
        for res in results[:-1]:
            cv_results = _merge_dicts(cv_results, res)
        cv_results = pd.DataFrame(cv_results)
        cv_results.to_csv(os.path.join('results',
                                       name + '_' + clf.__class__.__name__ +
                                       '_fix_ratio.csv'))