-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathclassifier.py
92 lines (73 loc) · 3.3 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This is to give us a bit more compatibility with Python 3.
from __future__ import print_function
# Here we import a bunch of utility functions that have little to do with
# the machine learning at hand, but are useful for parsing our features from
# the JSON format we wrote them in and evaluating the resulting machine
# learning model that we've created.
from json_utils import read_data
from model_utils import evaluate_model
# Here we import DecisionTreeClassifier, which is the machine learning
# algorithm we'll be using to create our model.
from sklearn.tree import DecisionTreeClassifier
# We'll use pickle (even though it's often considered bad practice)
# to serialize our model for later recall.
import pickle
def train_model(output_file='model.bin', training_percentage=0.85):
    """Train a decision tree on the leading part of the data and pickle it.

    The data set is obtained from ``read_data()``. The first
    ``int(len(features) * training_percentage)`` samples are used to fit a
    ``DecisionTreeClassifier``; the fitted model is pickled to
    ``output_file`` and then scored against those same training samples.

    Args:
        output_file: Path of the file the pickled model is written to.
        training_percentage: Fraction of the samples, counted from the start
            of the data set, to train on. Must lie in ``[0, 1]``. The
            default of 0.85 is the split this function always used before
            the value became configurable.

    Returns:
        The fitted ``DecisionTreeClassifier``.

    Raises:
        ValueError: If ``training_percentage`` is outside ``[0, 1]``.
    """
    if not 0.0 <= training_percentage <= 1.0:
        raise ValueError(
            "training_percentage must be between 0 and 1, got %r"
            % (training_percentage,))

    # First, let's read all of the features that we got from feature_extract.
    # Fun fact: you could do ./feature_extract.py | ./classifier.py to execute
    # both the feature extraction and classification steps at once, without
    # writing the results to JSON first. Very handy for iterating on features.
    features, classes, sample_names, feature_names, class_names = read_data()

    # We only train on the leading share of the data and keep the rest for
    # testing. Why not just train on all the data? That would result in a
    # model that is overfitted: overly good at the data it has seen and poor
    # with data it hasn't seen.
    num_training_samples = int(len(features) * training_percentage)

    # Here we separate all of our features and classes into just the ones
    # we want to train on...
    training_features = features[:num_training_samples]
    training_classes = classes[:num_training_samples]

    # ...and we do the training, which creates our model!
    # vvv MACHINE LEARNING HAPPENS ON THIS LINE BELOW vvv
    model = DecisionTreeClassifier(random_state=2).fit(
        training_features, training_classes)
    # ^^^ MACHINE LEARNING HAPPENS ON THIS LINE ABOVE ^^^

    # Persist the fitted model so it can be reloaded later without retraining.
    with open(output_file, 'wb') as out:
        pickle.dump(model, out)

    # These two lines write out a .pdf file of the model's decision tree.
    # It's useful if you want to explain the model, but requires
    # you to have Graphviz installed, so I've left it commented out.
    # from model_utils import explain_model
    # explain_model(model, feature_names, class_names)

    print("Evaluating training accuracy...")
    evaluate_model(
        model,
        training_features,
        training_classes,
        sample_names[:num_training_samples],
        class_names,
        output=False
    )
    return model
def evaluate(model, training_percentage=0.85):
    """Score ``model`` on the samples that were not used for training.

    The data set is re-read via ``read_data()`` and everything from index
    ``int(len(features) * training_percentage)`` onwards is treated as the
    test set. This is the same split index ``train_model`` computes, so the
    training and test portions never overlap and no sample is dropped.

    The previous implementation took the last ``int(0.15 * len(features))``
    samples instead. When that count rounded down to 0, the slice
    ``features[-0:]`` selected the *entire* data set, so the model was
    "tested" on its own training data; in other cases the two rounded
    counts left one sample in neither portion.

    Args:
        model: The fitted model, passed through to ``evaluate_model``.
        training_percentage: Fraction of the samples (from the start of the
            data set) that was used for training. Must lie in ``[0, 1]`` and
            should match the value the model was trained with.

    Returns:
        None.

    Raises:
        ValueError: If ``training_percentage`` is outside ``[0, 1]``.
    """
    if not 0.0 <= training_percentage <= 1.0:
        raise ValueError(
            "training_percentage must be between 0 and 1, got %r"
            % (training_percentage,))

    features, classes, sample_names, _, class_names = read_data()

    # Now, here we take the other portion of our input data and use
    # that to test the model and ensure it performs well on data it
    # hasn't seen before.
    split_index = int(len(features) * training_percentage)
    test_features = features[split_index:]
    test_classes = classes[split_index:]
    test_sample_names = sample_names[split_index:]

    # len() rather than truthiness: the container type returned by
    # read_data() is not visible here and may not support bool().
    if len(test_features) == 0:
        print("No held-out samples available; skipping test evaluation.")
        return

    print("Evaluating test accuracy...")
    evaluate_model(
        model,
        test_features,
        test_classes,
        test_sample_names,
        class_names
    )
if __name__ == "__main__":
    # Script entry point: fit and pickle the model, then immediately score
    # the freshly trained model on the held-out portion of the data.
    evaluate(train_model())