-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgaussian_naive_bayes.py
151 lines (124 loc) · 5.45 KB
/
gaussian_naive_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import numpy as np
from utils.preprocessing import compute_average, compute_variance
class GaussianNaiveBayes(object):
    """ A Bayes classifier is a simple probabilistic classifier based on
    applying Bayes' theorem. The naive feature model makes a strong
    independence assumption: given the class, the value of each feature is
    independent of every other feature. Each per-class, per-feature
    likelihood is modelled as a univariate Gaussian.

    Attributes:
        epsilon: float
            Small constant added to the variance to prevent division by
            zero (protects both the exponent and the normalizing factor).
        n_classes: int
            Number of distinct classes seen during fit.
        labels: numpy.array
            Sorted array of the distinct class labels. Class i in the
            arrays below corresponds to labels[i]; labels need NOT be the
            integers 0..n_classes-1.
        priors: numpy.array (n_classes, 1)
            Empirical probability P(Y = labels[i]) of each class.
        mean: numpy.array (n_classes, n_features)
            Per-class, per-feature mean of the training data.
        variance: numpy.array (n_classes, n_features)
            Per-class, per-feature variance of the training data
            (population variance, ddof=0 — the standard choice for
            Gaussian Naive Bayes).
    """

    def __init__(self):
        self.epsilon = 1e-8
        self.n_classes = None
        self.labels = None
        self.priors = None
        self.mean = None
        self.variance = None

    def __extract_labels(self, Y):
        """ Extracts the sorted distinct class labels and their count.

        Args:
            Y: numpy.array (n_samples)
                The target array, where n_samples is the number of samples.
        """
        self.labels = np.unique(Y)
        self.n_classes = self.labels.shape[0]

    def __prior(self, X, Y):
        """ Prior P(Y): the empirical frequency of each class.

        FIX: the original indexed ``self.priors`` by the label *value*
        (``self.priors[l]``), which crashes or mis-assigns whenever the
        labels are not exactly 0..n_classes-1. Classes are now addressed
        by their position in ``self.labels``.

        Args:
            X: numpy.array (n_samples, n_features)
                Training vectors, where n_samples is the number of samples
                and n_features is the number of features.
            Y: numpy.array (n_samples)
                Target array.
        """
        self.priors = np.zeros((self.n_classes, 1))
        for idx, label in enumerate(self.labels):
            self.priors[idx] = np.count_nonzero(Y == label) / X.shape[0]

    def __gaussian(self, X, Y):
        """ Learns the Gaussian components (mean, variance) of the
        likelihood P(X|Y), one row per class in ``self.labels`` order.

        Uses numpy's per-feature population statistics (axis=0, ddof=0),
        matching the contract documented on the class attributes.

        Args:
            X: numpy.array (n_samples, n_features)
                Training vectors, where n_samples is the number of samples
                and n_features is the number of features.
            Y: numpy.array (n_samples)
                Target array.
        """
        mean = []
        variance = []
        for label in self.labels:
            subset = X[Y == label]
            mean.append(np.mean(subset, axis=0))
            variance.append(np.var(subset, axis=0))
        self.mean = np.array(mean)
        self.variance = np.array(variance)

    def __likelihood(self, X, idx):
        """ Likelihood P(X|Y=labels[idx]) under the naive assumption:
        the product over features of the per-feature Gaussian pdf.

        FIX: the original only guarded the exponent against zero variance;
        the normalizer ``sqrt(2*pi*variance)`` could still divide by zero.
        epsilon is now added to the variance once, protecting both terms.
        The per-row loop is also replaced by broadcasting over samples.

        Args:
            X: numpy.array (n_samples, n_features)
                Data points to evaluate.
            idx: int
                Positional index of the class in ``self.labels``.

        Returns:
            numpy.array (n_samples):
                Likelihood of each data point for this class.
        """
        var = self.variance[idx] + self.epsilon
        pdf = np.exp(-np.power(X - self.mean[idx], 2) / (2 * var))
        pdf /= np.sqrt(2 * np.pi * var)
        return np.prod(pdf, axis=1)

    def __posterior(self, X, idx):
        """ Unnormalized posterior P(Y=labels[idx]|X): likelihood * prior.
        (The shared evidence term P(X) is omitted — it does not affect
        the argmax taken in ``predict``.)

        Args:
            X: numpy.array (n_samples, n_features)
                Data points to evaluate.
            idx: int
                Positional index of the class in ``self.labels``.

        Returns:
            posteriors: numpy.array (n_samples):
                Unnormalized posterior probability of this class.
        """
        return self.__likelihood(X, idx) * self.priors[idx]

    def fit(self, X, Y):
        """ Fit Gaussian Naive Bayes according to X, Y.

        Args:
            X: numpy.array (n_samples, n_features)
                Training vectors, where n_samples is the number of samples
                and n_features is the number of features.
            Y: numpy.array (n_samples)
                Target array.
        """
        self.__extract_labels(Y)
        self.__prior(X, Y)
        self.__gaussian(X, Y)

    def predict(self, X):
        """ Perform classification.

        FIX: the original indexed the posterior matrix by label value and
        returned the raw argmax column index — both only correct when the
        labels happen to be 0..n_classes-1. Columns are now positional and
        the argmax is mapped back through ``self.labels``, so arbitrary
        (e.g. non-contiguous or string) labels work; for 0..n-1 integer
        labels the output is identical to before.

        Args:
            X: numpy.array (n_samples, n_features)
                Test vectors, where n_samples is the number of samples and
                n_features is the number of features.

        Returns:
            numpy.array (n_samples):
                Predicted class label for each data point.
        """
        posteriors = np.zeros((X.shape[0], self.n_classes))
        for idx in range(self.n_classes):
            posteriors[:, idx] = self.__posterior(X, idx)
        return self.labels[np.argmax(posteriors, axis=1)]