jpayne@69
|
1 # Copyright 2002 by Jeffrey Chang.
|
jpayne@69
|
2 # All rights reserved.
|
jpayne@69
|
3 #
|
jpayne@69
|
4 # This file is part of the Biopython distribution and governed by your
|
jpayne@69
|
5 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
jpayne@69
|
6 # Please see the LICENSE file that should have been included as part of this
|
jpayne@69
|
7 # package.
|
jpayne@69
|
8 """Code for doing k-nearest-neighbors classification (DEPRECATED).
|
jpayne@69
|
9
|
jpayne@69
|
10 k Nearest Neighbors is a supervised learning algorithm that classifies
|
jpayne@69
|
11 a new observation based the classes in its surrounding neighborhood.
|
jpayne@69
|
12
|
jpayne@69
|
13 Glossary:
|
jpayne@69
|
14 - distance The distance between two points in the feature space.
|
jpayne@69
|
15 - weight The importance given to each point for classification.
|
jpayne@69
|
16
|
jpayne@69
|
17 Classes:
|
jpayne@69
|
18 - kNN Holds information for a nearest neighbors classifier.
|
jpayne@69
|
19
|
jpayne@69
|
20
|
jpayne@69
|
21 Functions:
|
jpayne@69
|
22 - train Train a new kNN classifier.
|
jpayne@69
|
23 - calculate Calculate the probabilities of each class, given an observation.
|
jpayne@69
|
24 - classify Classify an observation into a class.
|
jpayne@69
|
25
|
jpayne@69
|
26 Weighting Functions:
|
jpayne@69
|
27 - equal_weight Every example is given a weight of 1.
|
jpayne@69
|
28
|
jpayne@69
|
29 This module has been deprecated, please consider an alternative like scikit-learn
|
jpayne@69
|
30 insead.
|
jpayne@69
|
31 """
|
jpayne@69
|
32
|
jpayne@69
|
33 import warnings
|
jpayne@69
|
34
|
jpayne@69
|
35 try:
|
jpayne@69
|
36 import numpy as np
|
jpayne@69
|
37 except ImportError:
|
jpayne@69
|
38 from Bio import MissingPythonDependencyError
|
jpayne@69
|
39
|
jpayne@69
|
40 raise MissingPythonDependencyError(
|
jpayne@69
|
41 "Please install NumPy if you want to use Bio.kNN. See http://www.numpy.org/"
|
jpayne@69
|
42 ) from None
|
jpayne@69
|
43
|
jpayne@69
|
44 from Bio import BiopythonDeprecationWarning
|
jpayne@69
|
45
|
jpayne@69
|
46 warnings.warn(
|
jpayne@69
|
47 "The 'Bio.kNN' module is deprecated and will be removed in a future "
|
jpayne@69
|
48 "release of Biopython. Consider using scikit-learn instead.",
|
jpayne@69
|
49 BiopythonDeprecationWarning,
|
jpayne@69
|
50 )
|
jpayne@69
|
51
|
jpayne@69
|
52
|
jpayne@69
|
53 class kNN:
|
jpayne@69
|
54 """Holds information necessary to do nearest neighbors classification.
|
jpayne@69
|
55
|
jpayne@69
|
56 Attributes:
|
jpayne@69
|
57 - classes Set of the possible classes.
|
jpayne@69
|
58 - xs List of the neighbors.
|
jpayne@69
|
59 - ys List of the classes that the neighbors belong to.
|
jpayne@69
|
60 - k Number of neighbors to look at.
|
jpayne@69
|
61 """
|
jpayne@69
|
62
|
jpayne@69
|
63 def __init__(self):
|
jpayne@69
|
64 """Initialize the class."""
|
jpayne@69
|
65 self.classes = set()
|
jpayne@69
|
66 self.xs = []
|
jpayne@69
|
67 self.ys = []
|
jpayne@69
|
68 self.k = None
|
jpayne@69
|
69
|
jpayne@69
|
70
|
jpayne@69
|
71 def equal_weight(x, y):
|
jpayne@69
|
72 """Return integer one (dummy method for equally weighting)."""
|
jpayne@69
|
73 # everything gets 1 vote
|
jpayne@69
|
74 return 1
|
jpayne@69
|
75
|
jpayne@69
|
76
|
jpayne@69
|
77 def train(xs, ys, k, typecode=None):
|
jpayne@69
|
78 """Train a k nearest neighbors classifier on a training set.
|
jpayne@69
|
79
|
jpayne@69
|
80 xs is a list of observations and ys is a list of the class assignments.
|
jpayne@69
|
81 Thus, xs and ys should contain the same number of elements. k is
|
jpayne@69
|
82 the number of neighbors that should be examined when doing the
|
jpayne@69
|
83 classification.
|
jpayne@69
|
84 """
|
jpayne@69
|
85 knn = kNN()
|
jpayne@69
|
86 knn.classes = set(ys)
|
jpayne@69
|
87 knn.xs = np.asarray(xs, typecode)
|
jpayne@69
|
88 knn.ys = ys
|
jpayne@69
|
89 knn.k = k
|
jpayne@69
|
90 return knn
|
jpayne@69
|
91
|
jpayne@69
|
92
|
jpayne@69
|
93 def calculate(knn, x, weight_fn=None, distance_fn=None):
|
jpayne@69
|
94 """Calculate the probability for each class.
|
jpayne@69
|
95
|
jpayne@69
|
96 Arguments:
|
jpayne@69
|
97 - x is the observed data.
|
jpayne@69
|
98 - weight_fn is an optional function that takes x and a training
|
jpayne@69
|
99 example, and returns a weight.
|
jpayne@69
|
100 - distance_fn is an optional function that takes two points and
|
jpayne@69
|
101 returns the distance between them. If distance_fn is None (the
|
jpayne@69
|
102 default), the Euclidean distance is used.
|
jpayne@69
|
103
|
jpayne@69
|
104 Returns a dictionary of the class to the weight given to the class.
|
jpayne@69
|
105 """
|
jpayne@69
|
106 if weight_fn is None:
|
jpayne@69
|
107 weight_fn = equal_weight
|
jpayne@69
|
108
|
jpayne@69
|
109 x = np.asarray(x)
|
jpayne@69
|
110
|
jpayne@69
|
111 order = [] # list of (distance, index)
|
jpayne@69
|
112 if distance_fn:
|
jpayne@69
|
113 for i in range(len(knn.xs)):
|
jpayne@69
|
114 dist = distance_fn(x, knn.xs[i])
|
jpayne@69
|
115 order.append((dist, i))
|
jpayne@69
|
116 else:
|
jpayne@69
|
117 # Default: Use a fast implementation of the Euclidean distance
|
jpayne@69
|
118 temp = np.zeros(len(x))
|
jpayne@69
|
119 # Predefining temp allows reuse of this array, making this
|
jpayne@69
|
120 # function about twice as fast.
|
jpayne@69
|
121 for i in range(len(knn.xs)):
|
jpayne@69
|
122 temp[:] = x - knn.xs[i]
|
jpayne@69
|
123 dist = np.sqrt(np.dot(temp, temp))
|
jpayne@69
|
124 order.append((dist, i))
|
jpayne@69
|
125 order.sort()
|
jpayne@69
|
126
|
jpayne@69
|
127 # first 'k' are the ones I want.
|
jpayne@69
|
128 weights = {} # class -> number of votes
|
jpayne@69
|
129 for k in knn.classes:
|
jpayne@69
|
130 weights[k] = 0.0
|
jpayne@69
|
131 for dist, i in order[: knn.k]:
|
jpayne@69
|
132 klass = knn.ys[i]
|
jpayne@69
|
133 weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
|
jpayne@69
|
134
|
jpayne@69
|
135 return weights
|
jpayne@69
|
136
|
jpayne@69
|
137
|
jpayne@69
|
138 def classify(knn, x, weight_fn=None, distance_fn=None):
|
jpayne@69
|
139 """Classify an observation into a class.
|
jpayne@69
|
140
|
jpayne@69
|
141 If not specified, weight_fn will give all neighbors equal weight.
|
jpayne@69
|
142 distance_fn is an optional function that takes two points and returns
|
jpayne@69
|
143 the distance between them. If distance_fn is None (the default),
|
jpayne@69
|
144 the Euclidean distance is used.
|
jpayne@69
|
145 """
|
jpayne@69
|
146 if weight_fn is None:
|
jpayne@69
|
147 weight_fn = equal_weight
|
jpayne@69
|
148
|
jpayne@69
|
149 weights = calculate(knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
|
jpayne@69
|
150
|
jpayne@69
|
151 most_class = None
|
jpayne@69
|
152 most_weight = None
|
jpayne@69
|
153 for klass, weight in weights.items():
|
jpayne@69
|
154 if most_class is None or weight > most_weight:
|
jpayne@69
|
155 most_class = klass
|
jpayne@69
|
156 most_weight = weight
|
jpayne@69
|
157 return most_class
|