-
Notifications
You must be signed in to change notification settings - Fork 1
/
naivebayes.py
218 lines (190 loc) · 9.15 KB
/
naivebayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# naivebayes.py
# -----------------
# Main file for the Naive Bayes algorithm methods.
# Thanks to the Berkeley AI projects for general code structure.
#
# Chet Aldrich, Laura Biester
from loadFeatures import *
from naiveBayesProbs import getSavedProbs
from progressBar import ProgressBar
import util
import math
import copy
import time
class NaiveBayes:
"""
A Naive Bayes classifier.
"""
def __init__(self, legalLabels):
"""
Initializes the information required for Naive Bayes.
Keyword Arguments:
legalLabels -- the possible labels in the dataset
"""
self.legalLabels = legalLabels
self.k = 1 # this is the smoothing parameter
self.automaticTuning = False # Flat for automatic tuning of the parameters
self.probLabel = util.Counter()
self.probFeature = util.Counter()
self.features = None
def printToFile(self):
"""
printToFile() prints trained probabilities to a separate file. Simply place this method after the training sequence if you want to save the values for later use.
"""
file = open("naiveBayesProbs.py", "w+")
print >>file, "def getSavedProbs():\n\treturn {0}, {1}".format(self.probLabel, self.probFeature)
file.close()
def useTrainedProbs(self, features):
"""
useTrainedProbs() gets a dictionary of previously trained probabilities from
the naiveBayesProbs.py file.
Keyword Arguments:
featurs -- features needed for the saved data
"""
self.probLabel, self.probFeature = getSavedProbs()
self.features = features
def train(self, trainingData, trainingLabels, validationData, validationLabels, allFeatures, tune):
"""
train() acts as an outside shell to call the method.
Keyword Arguments:
trainingData -- training data for the perceptron
trainingLabels -- labels for the associated training data
validationData -- validation data for the perceptron tuning function
validationLabels -- labels for the associated validation data
allFeatures -- a list of all of the possible features in the dataset
tune -- a boolean indicating whether to tune over iterations.
"""
self.features = allFeatures # list with all valid features
self.automaticTuning = tune
if (self.automaticTuning):
kgrid = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 20, 50]
else:
kgrid = [self.k]
return self.trainAndTune(trainingData, trainingLabels, validationData, validationLabels, kgrid)
def trainAndTune(self, trainingData, trainingLabels, validationData, validationLabels, kgrid):
"""
trainAndTune() trains the classifier by collecting counts over the training data and choosing the smoothing parameter among the choices in kgrid by
using the validation data. This method should store the right parameters
as a side-effect and should return the best smoothing parameters.
Keyword Arguments:
trainingData -- training data for the perceptron
trainingLabels -- labels for the associated training data
validationData -- validation data for the perceptron tuning function
validationLabels -- labels for the associated validation data
kgrid -- a list of possible k values to try for smoothing
"""
# We begin by creating the prior probabilities for each of the labels
# and the features based on the counts in the training data.
countLabel = util.Counter() # in form k = label, v = numOfL
countFeature = util.Counter()
# We begin looking over the training data here.
progressBar = ProgressBar(100, len(trainingData), "Counting Data")
for i in range(len(trainingData)):
# update our progress bar
progressBar.update(i)
label = trainingLabels[i]
# Labels are counted at each point they are seen here.
countLabel[label] += 1
# Then, if we haven't seen the label, we add it to the feature counter.
if label not in countFeature:
countFeature[label] = util.Counter()
# Finally, we loop over the features for each datum and add each feature once
# for each occurrence.
for feature in trainingData[i]:
countFeature[label][feature] += trainingData[i][feature]
progressBar.clear()
self.probLabel = copy.deepcopy(countLabel)
self.probLabel.normalize()
# At this point we have the counts, and we want to see what level of smoothing
# increases our accuracy the most over the training set. Essentially, we just
# create all of the probabilities from the feature counts while adding the smoothing
# and classify the training data each time and pick whatever was most accurate.
kClassifications = util.Counter()
probForK = util.Counter()
numCorrectK = util.Counter()
print "Validation Accuracy"
print "==================="
for k in kgrid:
# make counter for probabilities for each k
probForK[k] = util.Counter()
# make counters for probabilities for each label
for label in self.legalLabels:
probForK[k][label] = util.Counter()
# find probability of each feature given each label
progressBar = ProgressBar(100, len(self.features), "Getting Probabilities for Features, Label {0}".format(label))
for index, feature in enumerate(self.features):
progressBar.update(index)
if countFeature[label] != 0:
probForK[k][label][feature] = float(countFeature[label][feature] + k) / (countLabel[label] + k * len(self.features))
progressBar.clear()
# set probabilities for features and classify validation data
self.probFeature = probForK[k]
classificationLabels = self.classify(validationData)
# check how much of the data was classified correctly
correct = 0
for i in range(len(classificationLabels)):
if classificationLabels[i] == validationLabels[i]:
correct += 1
# print accuracy for each k
print "k = {0}, number of correct classifications = {1}".format(k, correct)
# store the number of correct classifications for k value
numCorrectK[k] = correct
# pick k from our list of possible k values
self.k = None
for k in numCorrectK:
# find k with the highest number of correct classifications
# if there is a tie, use a lower k value
if (self.k == None or numCorrectK[self.k] < numCorrectK[k] or
(numCorrectK[self.k] == numCorrectK[k] and k < self.k)):
self.k = k
self.probFeature = probForK[self.k]
# print final choice for k
print "K chosen = {0}".format(self.k)
return self.k
def classify(self, testData):
"""
classify() classifies each data item in the input by finding the best prototype vector.
Keyword Arguments:
testData -- the test data to classify
"""
guesses = []
self.posteriors = [] # Log posteriors are stored for later data analysis.
counter = 0
size = len(testData)
progressBar = ProgressBar(100, len(testData), "Classifying Data")
for index, datum in enumerate(testData):
progressBar.update(index)
posterior = self.calculateLogJointProbabilities(datum)
guesses.append(posterior.argMax())
self.posteriors.append(posterior)
progressBar.clear()
return guesses
def calculateLogJointProbabilities(self, datum):
"""
calculateLogJointProbabilities() returns the log-joint distribution over
legal labels and the datum.
Keyword Arguments:
datum -- a dictionary with 0 or 1 values depending on whether a value is
colored or not.
"""
logJoint = util.Counter()
for label in self.legalLabels:
logJoint[label] += math.log(self.probLabel[label])
for feature in self.features:
# probability is the stored probability if the feature is on
# otherwise, it is 1 - that probability, because the distribution
# sums to 0
if datum[feature] != 0:
probability = self.probFeature[label][feature]
else:
probability = 1 - self.probFeature[label][feature]
# if probability is 0, break to avoid log being undefined
# any probability in the joint being 0 will lead to an
# overall probability of 0
if probability == 0:
logJoint[label] = 0
break
# otherwise, sum up the logs to get values proportional
# to multiplying the probabilities
logJoint[label] += math.log(probability)
return logJoint