1
2
3
4
5 """
6 This module provides code for doing k-nearest-neighbors classification.
7
8 k Nearest Neighbors is a supervised learning algorithm that classifies
9 a new observation based the classes in its surrounding neighborhood.
10
11 Glossary:
12
13 - distance The distance between two points in the feature space.
14 - weight The importance given to each point for classification.
15
16
17 Classes:
18
19 - kNN Holds information for a nearest neighbors classifier.
20
21
22 Functions:
23
24 - train Train a new kNN classifier.
25 - calculate Calculate the probabilities of each class, given an observation.
26 - classify Classify an observation into a class.
27
28 Weighting Functions:
29
30 - equal_weight Every example is given a weight of 1.
31
32 """
33
34 import numpy
35
36 __docformat__ = "restructuredtext en"
37
38
40 """Holds information necessary to do nearest neighbors classification.
41
42 Members:
43
44 - classes Set of the possible classes.
45 - xs List of the neighbors.
46 - ys List of the classes that the neighbors belong to.
47 - k Number of neighbors to look at.
48
49 """
51 """kNN()"""
52 self.classes = set()
53 self.xs = []
54 self.ys = []
55 self.k = None
56
57
59 """equal_weight(x, y) -> 1"""
60
61 return 1
62
63
64 -def train(xs, ys, k, typecode=None):
65 """train(xs, ys, k) -> kNN
66
67 Train a k nearest neighbors classifier on a training set. xs is a
68 list of observations and ys is a list of the class assignments.
69 Thus, xs and ys should contain the same number of elements. k is
70 the number of neighbors that should be examined when doing the
71 classification.
72 """
73 knn = kNN()
74 knn.classes = set(ys)
75 knn.xs = numpy.asarray(xs, typecode)
76 knn.ys = ys
77 knn.k = k
78 return knn
79
80
82 """calculate(knn, x[, weight_fn][, distance_fn]) -> weight dict
83
84 Calculate the probability for each class. knn is a kNN object. x
85 is the observed data. weight_fn is an optional function that
86 takes x and a training example, and returns a weight. distance_fn
87 is an optional function that takes two points and returns the
88 distance between them. If distance_fn is None (the default), the
89 Euclidean distance is used. Returns a dictionary of the class to
90 the weight given to the class.
91 """
92 x = numpy.asarray(x)
93
94 order = []
95 if distance_fn:
96 for i in range(len(knn.xs)):
97 dist = distance_fn(x, knn.xs[i])
98 order.append((dist, i))
99 else:
100
101 temp = numpy.zeros(len(x))
102
103
104 for i in range(len(knn.xs)):
105 temp[:] = x - knn.xs[i]
106 dist = numpy.sqrt(numpy.dot(temp, temp))
107 order.append((dist, i))
108 order.sort()
109
110
111 weights = {}
112 for k in knn.classes:
113 weights[k] = 0.0
114 for dist, i in order[:knn.k]:
115 klass = knn.ys[i]
116 weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
117
118 return weights
119
120
122 """classify(knn, x[, weight_fn][, distance_fn]) -> class
123
124 Classify an observation into a class. If not specified, weight_fn will
125 give all neighbors equal weight. distance_fn is an optional function
126 that takes two points and returns the distance between them. If
127 distance_fn is None (the default), the Euclidean distance is used.
128 """
129 weights = calculate(
130 knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
131
132 most_class = None
133 most_weight = None
134 for klass, weight in weights.items():
135 if most_class is None or weight > most_weight:
136 most_class = klass
137 most_weight = weight
138 return most_class
139