1
2
3
4
5
6 """This provides code for a general Naive Bayes learner.
7
8 Naive Bayes is a supervised classification algorithm that uses Bayes
9 rule to compute the fit between a new observation and some previously
10 observed data. The observations are discrete feature vectors, with
11 the Bayes assumption that the features are independent. Although this
12 is hardly ever true, the classifier works well enough in practice.
13
14 Glossary:
15 observation A feature vector of discrete data.
16 class A possible classification for an observation.
17
18
19 Classes:
20 NaiveBayes Holds information for a naive Bayes classifier.
21
22 Functions:
23 train Train a new naive Bayes classifier.
24 calculate Calculate the probabilities of each class, given an observation.
25 classify Classify an observation into a class.
26
27 """
28
29 from __future__ import print_function
30
31 import numpy
32
33
34 -def _contents(items):
35 term = 1.0/len(items)
36 counts = {}
37 for item in items:
38 counts[item] = counts.get(item, 0) + term
39 return counts
40
41
43 """Holds information for a NaiveBayes classifier.
44
45 Members:
46 classes List of the possible classes of data.
47 p_conditional CLASS x DIM array of dicts of value -> P(value|class,dim)
48 p_prior List of the prior probabilities for every class.
49 dimensionality Dimensionality of the data.
50
51 """
53 self.classes = []
54 self.p_conditional = None
55 self.p_prior = []
56 self.dimensionality = None
57
58
60 """calculate(nb, observation[, scale]) -> probability dict
61
62 Calculate log P(class|observation) for each class. nb is a NaiveBayes
63 classifier that has been trained. observation is a list representing
64 the observed data. scale is whether the probability should be
65 scaled by P(observation). By default, no scaling is done. The return
66 value is a dictionary where the keys is the class and the value is the
67 log probability of the class.
68
69 """
70
71
72
73
74
75 if len(observation) != nb.dimensionality:
76 raise ValueError("observation in %d dimension, but classifier in %d"
77 % (len(observation), nb.dimensionality))
78
79
80 n = len(nb.classes)
81 lp_observation_class = numpy.zeros(n)
82 for i in range(n):
83
84 probs = [None] * len(observation)
85 for j in range(len(observation)):
86 probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
87 lprobs = numpy.log(numpy.clip(probs, 1.e-300, 1.e+300))
88 lp_observation_class[i] = sum(lprobs)
89
90
91 lp_prior = numpy.log(nb.p_prior)
92
93
94 lp_observation = 0.0
95 if scale:
96
97 obs = numpy.exp(numpy.clip(lp_prior+lp_observation_class, -700, +700))
98 lp_observation = numpy.log(sum(obs))
99
100
101 lp_class_observation = {}
102 for i in range(len(nb.classes)):
103 lp_class_observation[nb.classes[i]] = \
104 lp_observation_class[i] + lp_prior[i] - lp_observation
105
106 return lp_class_observation
107
108
110 """classify(nb, observation) -> class
111
112 Classify an observation into a class.
113
114 """
115
116 probs = calculate(nb, observation, scale=0)
117 max_prob = max_class = None
118 for klass in nb.classes:
119 if max_prob is None or probs[klass] > max_prob:
120 max_prob, max_class = probs[klass], klass
121 return max_class
122
123
124 -def train(training_set, results, priors=None, typecode=None):
125 """train(training_set, results[, priors]) -> NaiveBayes
126
127 Train a naive bayes classifier on a training set. training_set is a
128 list of observations. results is a list of the class assignments
129 for each observation. Thus, training_set and results must be the same
130 length. priors is an optional dictionary specifying the prior
131 probabilities for each type of result. If not specified, the priors
132 will be estimated from the training results.
133
134 """
135 if not len(training_set):
136 raise ValueError("No data in the training set.")
137 if len(training_set) != len(results):
138 raise ValueError("training_set and results should be parallel lists.")
139
140
141
142
143
144
145
146
147 dimensions = [len(x) for x in training_set]
148 if min(dimensions) != max(dimensions):
149 raise ValueError("observations have different dimensionality")
150
151 nb = NaiveBayes()
152 nb.dimensionality = dimensions[0]
153
154
155
156 if priors is not None:
157 percs = priors
158 nb.classes = list(set(results))
159 else:
160 class_freq = _contents(results)
161 nb.classes = list(class_freq.keys())
162 percs = class_freq
163 nb.classes.sort()
164
165 nb.p_prior = numpy.zeros(len(nb.classes))
166 for i in range(len(nb.classes)):
167 nb.p_prior[i] = percs[nb.classes[i]]
168
169
170
171
172
173
174
175 c2i = {}
176 for index, key in enumerate(nb.classes):
177 c2i[key] = index
178 observations = [[] for c in nb.classes]
179 for i in range(len(results)):
180 klass, obs = results[i], training_set[i]
181 observations[c2i[klass]].append(obs)
182
183 for i in range(len(observations)):
184
185 observations[i] = numpy.asarray(observations[i], typecode)
186
187
188
189 nb.p_conditional = []
190 for i in range(len(nb.classes)):
191 class_observations = observations[i]
192 nb.p_conditional.append([None] * nb.dimensionality)
193 for j in range(nb.dimensionality):
194
195 values = class_observations[:, j]
196
197
198
199
200
201 nb.p_conditional[i][j] = _contents(values)
202 return nb
203
204 if __name__ == "__main__":
205
206
207 xcar = [
208 ['Red', 'Sports', 'Domestic'],
209 ['Red', 'Sports', 'Domestic'],
210 ['Red', 'Sports', 'Domestic'],
211 ['Yellow', 'Sports', 'Domestic'],
212 ['Yellow', 'Sports', 'Imported'],
213 ['Yellow', 'SUV', 'Imported'],
214 ['Yellow', 'SUV', 'Imported'],
215 ['Yellow', 'SUV', 'Domestic'],
216 ['Red', 'SUV', 'Imported'],
217 ['Red', 'Sports', 'Imported']
218 ]
219
220 ycar = [
221 'Yes',
222 'No',
223 'Yes',
224 'No',
225 'Yes',
226 'No',
227 'Yes',
228 'No',
229 'No',
230 'Yes'
231 ]
232
233 carmodel = train(xcar, ycar)
234 carresult = classify(carmodel, ['Red', 'Sports', 'Domestic'])
235 print('Is Yes? %s' % carresult)
236 carresult = classify(carmodel, ['Red', 'SUV', 'Domestic'])
237 print('Is No? %s' % carresult)
238