different_expansions_mean_iterations.py (121 lines, 88 loc, 5.33 KB)
import math
import numpy as np
import data_config
import copy
import new_classifiers5
import itertools
import mean_combine
import greedy
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import GridSearchCV  #sklearn.grid_search was removed in scikit-learn 0.20
'''
Input:
    full_training_set- Full training set.
    TR_set_used- Dict of the full training set split into 3 parts:
                 TR1 + TR1_outcome, TR2 + TR2_outcome, TR3 + TR3_outcome
                 (6 different entries).
    labels- List of all the class labels in the data.
    binary- Boolean; true if there are only 2 class labels, false otherwise.
    TS- The testing set data without the outcomes (the real test data).
    TS_outcome- The testing set outcomes (the real test outcomes).
    training_set3- Whether the training set is split into 3 parts or 2.
    new_features_only- Boolean; true if only the classifiers' predictions are
                       included as features, false otherwise.
    greedy_find_best- Boolean; true if a greedy approach is used to determine
                      the best subset of classifiers, false otherwise.
Description:
    Entry point for the 4 different expansions of the algorithm:
        1) Expand best, Combine all
        2) Expand best, Combine best
        3) Expand all, Combine best
        4) Expand all, Combine all
Output:
    Returns a tuple (results, best_strings_first). results is a list of
    dicts, one per classifier (the ensemble is also included as one type);
    each dict contains:
        type- type of classifier
        prediction- the prediction on TS
'''
#NOTE: shadows the built-in all().
def all(full_training_set, TR_set_used, labels, binary, TS, TS_outcome, training_set3, new_features_only, greedy_find_best, ensemble_methods, weight, lb):
    #The training split is used to determine the best subset of classifiers.
    keeper = expand_best(TR_set_used, labels, binary, new_features_only, training_set3, ensemble_methods, lb, weight)
    best_classifiers = keeper[0]
    best_strings_first = keeper[1]
    hold3 = expand_all_combine_all(full_training_set, TR_set_used, TS, TS_outcome, best_classifiers, labels, binary, new_features_only, best_strings_first, ensemble_methods, weight, lb)
    return (hold3, best_strings_first)
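#A minimal usage sketch (illustrative only: the array shapes, the layout of
#full_training_set, and the ensemble_methods/weight/lb values below are
#hypothetical stand-ins, not taken from this project).
def _example_usage():
    import numpy as np
    rng = np.random.RandomState(0)
    X, y = rng.rand(120, 5), rng.randint(0, 2, 120)
    TR_set_used = {
        'TR': X[:60], 'TR_outcome': y[:60],        #TR1 + TR2 pooled
        'TR1': X[:30], 'TR1_outcome': y[:30],
        'TR2': X[30:60], 'TR2_outcome': y[30:60],
        'TR3': X[60:90], 'TR3_outcome': y[60:90],  #held out to rank classifiers
    }
    return all(full_training_set=(X[:90], y[:90]), TR_set_used=TR_set_used,
               labels=[0, 1], binary=True, TS=X[90:], TS_outcome=y[90:],
               training_set3=True, new_features_only=False,
               greedy_find_best=True, ensemble_methods=True, weight=None, lb=None)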
'''
Input-
    TR_set_used- Contains dict of TR, TR_outcome, TR1, TR1_outcome, TR2, TR2_outcome, TR3, and TR3_outcome.
Description:
    1) Trains each classifier on TR and scores it on TR3 (ROC AUC), then
       greedily selects the best subset of classifiers; stores that subset in
       best_strings_first as a list of strings (SVM, Random Forest, ...etc.).
    2) Creates best_classifiers- array of dicts, one per classifier retrained
       on the expanded set (built from the selected classifiers' predictions).
       Each dict contains: mean_score, model2, type, coefficient, and prediction.
Return-
    best_classifiers- described above.
    best_strings_first- the selected classifiers as a list of strings.
'''
def expand_best(TR_set_used, labels, binary, new_features_only, training_set3, ensemble_methods, lb, weight):
    #For each classifier: train on TR, then evaluate on TR3.
    list_classifiers = []
    for each_classifier in new_classifiers5.create_classifiers(ensemble_methods):
        model = None
        if(each_classifier['tuned_parameters'] != []):
            model = GridSearchCV(each_classifier['model'], each_classifier['tuned_parameters'], cv=10, scoring="accuracy").fit(TR_set_used['TR'], TR_set_used['TR_outcome'])
        else:
            model = each_classifier['model'].fit(TR_set_used['TR'], TR_set_used['TR_outcome'])
        type_hold = each_classifier['type']
        predictions = model.predict(TR_set_used['TR3'])
        roc_score = None
        if(binary):
            #roc_auc_score expects (y_true, y_score), in that order.
            roc_score = roc_auc_score(TR_set_used['TR3_outcome'], predictions)
        else:
            roc_score = new_classifiers5.multi_class_roc(weight, lb, predictions, TR_set_used['TR3_outcome'], labels)
        #Each entry: (type, prediction, roc_auc_score, model)
        hold_tup = (type_hold, predictions, roc_score, model)
        list_classifiers.append(hold_tup)
    best_strings_first = greedy.find_best(list_classifiers, TR_set_used['TR3'], TR_set_used['TR3_outcome'], labels, TR_set_used['TR'], TR_set_used['TR_outcome'], weight, lb, binary)
    best_strings_second = new_classifiers5.names_all_classifiers(ensemble_methods)
    best_classifiers = new_classifiers5.one_iteration(TR_set_used, training_set3, new_features_only, labels, binary, best_strings_first, best_strings_second, ensemble_methods, lb, weight)
    return (best_classifiers, best_strings_first)
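#A sketch of the feature-expansion idea behind one_iteration (an assumption
#about its behavior, not the actual implementation, which lives in
#new_classifiers5): each selected classifier's predictions on a split are
#appended to that split's original features as new columns, and models are
#then retrained on the wider matrix.
def _expand_features_sketch(X, fitted_models):
    import numpy as np
    new_columns = [m.predict(X) for m in fitted_models]  #one column per classifier
    return np.column_stack([X] + new_columns)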
'''
Input-
    TR_set_used- Contains dict of TR1, TR1_outcome, TR2, TR2_outcome, TR3, and TR3_outcome.
Description:
    Different from expand_best: does not find a subset of classifiers to
    create the expanded set, but instead uses all classifiers.
    1) Stores in best_strings_first- all the classifiers as a list of strings (SVM, Random Forest, ...etc.).
    2) Creates best_classifiers- array of dicts, one per classifier, with its
       prediction on the expanded set (created from all of the classifiers'
       predictions). Each dict contains: mean_score, model2, type, coefficient,
       and prediction.
Return-
    best_classifiers- described above.
    best_strings_first- all the classifiers as a list of strings.
'''
def expand_all(TR_set_used, labels, binary, new_features_only, training_set3, ensemble_methods, lb, weight):
    best_strings_first = new_classifiers5.names_all_classifiers(ensemble_methods)
    best_strings_second = best_strings_first
    best_classifiers = new_classifiers5.one_iteration(TR_set_used, training_set3, new_features_only, labels, binary, best_strings_first, best_strings_second, ensemble_methods, lb, weight)
    return (best_classifiers, best_strings_first)
def expand_all_combine_all(full_training_set, TR_set_used, TS, TS_outcome, best_classifiers, labels, binary, new_features_only, best_strings_first, ensemble_methods, weight, lb):
    #"Combine all": every classifier name is passed as the combine set.
    best_strings_second = new_classifiers5.names_all_classifiers(ensemble_methods)
    return mean_combine.combine(full_training_set, TR_set_used, TS, TS_outcome, best_classifiers, labels, binary, new_features_only, best_strings_first, best_strings_second, ensemble_methods, weight, lb)
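#A sketch of the "mean" combine step (an assumption about what
#mean_combine.combine does, based on its name; the real implementation is in
#the mean_combine module): the ensemble score for each test sample is the
#average of the individual classifiers' scores.
def _mean_combine_sketch(prediction_matrix):
    import numpy as np
    #prediction_matrix: shape (n_classifiers, n_samples)
    return np.asarray(prediction_matrix).mean(axis=0)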