-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommon_functions.py
60 lines (46 loc) · 1.79 KB
/
common_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd
import operator
from tabulate import tabulate
def label_encoding(data_set, variable):
    """Build the recoding dictionary for one column of *data_set*.

    A ``LabelEncoder`` is fitted on ``data_set[variable]`` and each of its
    ``classes_`` is mapped to its position in that array.

    Only the dictionary is returned; the encoded array is deliberately not
    produced here, to keep the helper simple.
    """
    encoder = LabelEncoder()
    encoder.fit(data_set[variable])
    categories = encoder.classes_
    return dict(zip(categories, range(len(categories))))
def one_hot_encoding(data_set, variable):
    """Recode every category of one column as its own binary variable.

    The column is first translated to integer codes with the dictionary
    from ``label_encoding`` and then expanded with ``OneHotEncoder``.

    Parameters:
        data_set: frame containing the column to recode; it is not modified.
        variable: name of the column to recode.

    Returns:
        A new ``DataFrame`` with one column per category, named
        ``"<variable>_<category>"``. It is built from the dense encoder
        output without passing an index, so it does not carry the index of
        *data_set*.
    """
    # Work on an explicit copy. Assigning into the bare
    # ``data_set[[variable]]`` selection is chained assignment, which makes
    # pandas emit SettingWithCopyWarning and leaves it unclear whether the
    # caller's frame is being written to.
    x_0 = data_set[[variable]].copy()
    value_map = label_encoding(x_0, variable)
    x_0[variable] = x_0[variable].map(value_map)

    enc = OneHotEncoder()
    x_one_hot = enc.fit_transform(x_0).toarray()

    # Order the names by integer code rather than by dict iteration order so
    # that each name is guaranteed to line up with the encoder's output
    # column for that code.
    ordered_categories = sorted(value_map, key=value_map.get)
    names = ["%s_%s" % (variable, category) for category in ordered_categories]
    return pd.DataFrame(x_one_hot, columns=names)
def recode_categorical_variables(data_set, variables):
    """One-hot encode the selected columns so machine learning tools can use them.

    ``data_set[variables]`` is copied, every column of that copy is passed
    through ``one_hot_encoding``, and the resulting frames are concatenated
    side by side (``axis=1``) into the returned frame.
    """
    selected = data_set[variables].copy()
    encoded_frames = []
    for column in selected.columns.values:
        encoded_frames.append(one_hot_encoding(selected, column))
    return pd.concat(encoded_frames, axis=1)
def dict_to_table(result_dict):
    """Format *result_dict* as a ``tabulate`` table, largest values first.

    The ``(key, value)`` pairs are sorted by value in descending order and
    handed to ``tabulate``; its output is returned unchanged.
    """
    ranked_items = sorted(
        result_dict.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return tabulate(ranked_items)
def parse_feature_importance_table(table):
    """Clean up the text of a feature importance table.

    A fixed list of substring substitutions is applied to *table* one after
    another, then every run of consecutive ``*`` characters is collapsed to
    a single ``*``. The cleaned string is returned.
    """
    # Order matters: each rule operates on the output of the ones before it,
    # and the generic "_" rule must run after the more specific ones.
    substitutions = (
        ("'", ""),
        ("\n\n ", "\n"),
        (" ", "*"),
        # NOTE(review): no spaces remain after the previous rule, so this one
        # looks like a no-op as written -- confirm the intended literals.
        ("* ", "*"),
        ("event_", "event "),
        ("climate_", "climate "),
        ("West_North_Central", "West North Central"),
        ("Upper_Midwest", "Upper Midwest"),
        ("_lg", " lg"),
        ("_", ": "),
    )
    cleaned = table
    for pattern, replacement in substitutions:
        cleaned = cleaned.replace(pattern, replacement)

    # Keep halving the star runs until a pass no longer changes anything.
    while True:
        collapsed = cleaned.replace("**", "*")
        if collapsed == cleaned:
            return collapsed
        cleaned = collapsed