#!/usr/bin/env python
# coding: utf-8
from __future__ import print_function
import pickle
import sys
sys.path.append("../tools/")
import numpy as np
import pandas as pd
# Imputer and GridSearchCV's iid flag target the legacy scikit-learn API
# (Imputer was removed in 0.22; newer versions use sklearn.impute.SimpleImputer)
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from tester import dump_classifier_and_data, test_classifier
# Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)
# Replace 'NaN' strings with None in the dataset
for outer_key, inner_dict in data_dict.items():
    for k, v in inner_dict.items():
        if v == 'NaN':
            data_dict[outer_key][k] = None
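# Optional sanity check: confirm the load worked before converting to a frame
print("Records in dataset: %d" % len(data_dict))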
df = pd.DataFrame.from_dict(data_dict,
                            orient='index'  # use outer dict keys as the row index
                            )
# Replace None with np.nan so pandas missing-value handling works
# (this also covers the string-valued email_address field)
df.fillna(value=np.nan, inplace=True)
# The spreadsheet 'TOTAL' row shows up as the top salary outlier
TOTAL = df.sort_values(by=['salary'], ascending=False, na_position='last').head(1)
# Drop the computed 'TOTAL' observation
df.drop(index='TOTAL', inplace=True)
# Drop records with fewer than two non-NaN values
# (rows where every input besides 'poi' is missing)
df.dropna(thresh=2, inplace=True)
# 'THE TRAVEL AGENCY IN THE PARK' is not a person; drop it
df.drop(index='THE TRAVEL AGENCY IN THE PARK', inplace=True)
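# Optional: confirm how many records survive the cleaning steps
print("Records after cleaning: %d" % len(df))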
# New feature: has_email flags whether a record has an email address
df['has_email'] = df.email_address.notna()
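# Optional: a quick look at whether has_email separates POIs at all
print(pd.crosstab(df['poi'], df['has_email']))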
# create copy of dataframe
X = df.copy()
# Dropping email_address field
X.drop(['email_address'], axis=1, inplace=True)
# Popping poi field
y = X.pop('poi')
# Impute missing values column-wise with each feature's most frequent value
X_imputed = Imputer(
    strategy="most_frequent",
    axis=0).fit_transform(X)
imp_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)
imp_df['poi'] = y
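# Optional sanity check: imputation should leave no missing values
assert not np.isnan(X_imputed).any()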
# Scale values
X_imp_std = StandardScaler().fit_transform(X_imputed)
# Bring the scaled values back into a DataFrame
imp_std_df = pd.DataFrame(X_imp_std, columns=X.columns, index=X.index)
imp_std_df['poi'] = y
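# Optional check: each scaled column should have ~0 mean and ~1 std
print(imp_std_df.drop('poi', axis=1).describe().loc[['mean', 'std']])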
# Convert the dataset back to a dictionary of dicts for tester.py
dict_for_tester = imp_std_df.to_dict('index')
print(imp_std_df.head())
# Support vector machine testing
# -----------------------------
# Warning suppression
np.seterr(divide='ignore', invalid='ignore')
import warnings
warnings.filterwarnings('ignore')
# Build the pipeline: univariate feature selection feeding an SVM
pipeline = Pipeline([
    ("features", SelectKBest(f_classif)),
    ("svm", SVC())])
# Parameter grid for GridSearchCV
param_grid = dict(features__k=np.arange(1, X.shape[1]),  # k = 1 .. n_features - 1
                  svm__C=[0.1, 1., 10.],
                  svm__kernel=['linear', 'rbf', 'sigmoid']
                  )
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='f1',
                           cv=5,
                           iid=True)
grid_search.fit(X_imp_std, y)
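# Report the winning SVM configuration and its cross-validated F1 score
print("Best SVM parameters:", grid_search.best_params_)
print("Best SVM CV F1: %.3f" % grid_search.best_score_)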
# Extract the fitted classifier step from the best pipeline
clf = grid_search.best_estimator_.steps[1][1]
# Indices of the features kept by SelectKBest; tester.py expects 'poi' first
cols = grid_search.best_estimator_.steps[0][1].get_support(indices=True)
feature_list = ['poi'] + list(X.columns[cols])
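# Optional: list the selected features with their ANOVA F-scores, highest first
selector = grid_search.best_estimator_.steps[0][1]
for name, score in sorted(zip(X.columns[cols], selector.scores_[cols]),
                          key=lambda pair: pair[1], reverse=True):
    print("%-30s %.2f" % (name, score))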
# Run test_classifier from tester.py
test_classifier(clf,
                dataset=dict_for_tester,
                feature_list=feature_list)
# Gaussian Naive Bayes testing
# -----------------------------
# Build the pipeline: univariate feature selection feeding Gaussian NB
pipeline = Pipeline([
    ("features", SelectKBest(f_classif)),
    ("gnb", GaussianNB())])
# Parameter grid for GridSearchCV
param_grid = dict(features__k=np.arange(1, X.shape[1]))  # k = 1 .. n_features - 1
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           cv=5,
                           iid=True,
                           scoring='f1'
                           )
grid_search.fit(X_imp_std, y)
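# Report the winning GNB configuration and its cross-validated F1 score
print("Best GNB parameters:", grid_search.best_params_)
print("Best GNB CV F1: %.3f" % grid_search.best_score_)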
# Extract the fitted classifier step from the best pipeline
clf = grid_search.best_estimator_.steps[1][1]
# Indices of the features kept by SelectKBest; tester.py expects 'poi' first
cols = grid_search.best_estimator_.steps[0][1].get_support(indices=True)
feature_list = ['poi'] + list(X.columns[cols])
# Run test_classifier from tester.py
test_classifier(clf,
                dataset=dict_for_tester,
                feature_list=feature_list)
# Dump required objects
dump_classifier_and_data(clf,
                         dataset=dict_for_tester,
                         feature_list=feature_list)