1
2
3
4
5 """
6 This module provides code for doing k-nearest-neighbors classification.
7
8 k Nearest Neighbors is a supervised learning algorithm that classifies
9 a new observation based the classes in its surrounding neighborhood.
10
11 Glossary:
12 distance The distance between two points in the feature space.
13 weight The importance given to each point for classification.
14
15
16 Classes:
17 kNN Holds information for a nearest neighbors classifier.
18
19
20 Functions:
21 train Train a new kNN classifier.
22 calculate Calculate the probabilities of each class, given an observation.
23 classify Classify an observation into a class.
24
25 Weighting Functions:
26 equal_weight Every example is given a weight of 1.
27
28 """
29
30 import numpy
31
32
34 """Holds information necessary to do nearest neighbors classification.
35
36 Members:
37 classes Set of the possible classes.
38 xs List of the neighbors.
39 ys List of the classes that the neighbors belong to.
40 k Number of neighbors to look at.
41
42 """
44 """kNN()"""
45 self.classes = set()
46 self.xs = []
47 self.ys = []
48 self.k = None
49
50
52 """equal_weight(x, y) -> 1"""
53
54 return 1
55
56
57 -def train(xs, ys, k, typecode=None):
58 """train(xs, ys, k) -> kNN
59
60 Train a k nearest neighbors classifier on a training set. xs is a
61 list of observations and ys is a list of the class assignments.
62 Thus, xs and ys should contain the same number of elements. k is
63 the number of neighbors that should be examined when doing the
64 classification.
65 """
66 knn = kNN()
67 knn.classes = set(ys)
68 knn.xs = numpy.asarray(xs, typecode)
69 knn.ys = ys
70 knn.k = k
71 return knn
72
73
75 """calculate(knn, x[, weight_fn][, distance_fn]) -> weight dict
76
77 Calculate the probability for each class. knn is a kNN object. x
78 is the observed data. weight_fn is an optional function that
79 takes x and a training example, and returns a weight. distance_fn
80 is an optional function that takes two points and returns the
81 distance between them. If distance_fn is None (the default), the
82 Euclidean distance is used. Returns a dictionary of the class to
83 the weight given to the class.
84 """
85 x = numpy.asarray(x)
86
87 order = []
88 if distance_fn:
89 for i in range(len(knn.xs)):
90 dist = distance_fn(x, knn.xs[i])
91 order.append((dist, i))
92 else:
93
94 temp = numpy.zeros(len(x))
95
96
97 for i in range(len(knn.xs)):
98 temp[:] = x - knn.xs[i]
99 dist = numpy.sqrt(numpy.dot(temp, temp))
100 order.append((dist, i))
101 order.sort()
102
103
104 weights = {}
105 for k in knn.classes:
106 weights[k] = 0.0
107 for dist, i in order[:knn.k]:
108 klass = knn.ys[i]
109 weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
110
111 return weights
112
113
115 """classify(knn, x[, weight_fn][, distance_fn]) -> class
116
117 Classify an observation into a class. If not specified, weight_fn will
118 give all neighbors equal weight. distance_fn is an optional function
119 that takes two points and returns the distance between them. If
120 distance_fn is None (the default), the Euclidean distance is used.
121 """
122 weights = calculate(
123 knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
124
125 most_class = None
126 most_weight = None
127 for klass, weight in weights.items():
128 if most_class is None or weight > most_weight:
129 most_class = klass
130 most_weight = weight
131 return most_class
132