comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/kNN.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 # Copyright 2002 by Jeffrey Chang.
2 # All rights reserved.
3 #
4 # This file is part of the Biopython distribution and governed by your
5 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
6 # Please see the LICENSE file that should have been included as part of this
7 # package.
8 """Code for doing k-nearest-neighbors classification (DEPRECATED).
9
10 k Nearest Neighbors is a supervised learning algorithm that classifies
11 a new observation based on the classes in its surrounding neighborhood.
12
13 Glossary:
14 - distance The distance between two points in the feature space.
15 - weight The importance given to each point for classification.
16
17 Classes:
18 - kNN Holds information for a nearest neighbors classifier.
19
20
21 Functions:
22 - train Train a new kNN classifier.
23 - calculate Calculate the probabilities of each class, given an observation.
24 - classify Classify an observation into a class.
25
26 Weighting Functions:
27 - equal_weight Every example is given a weight of 1.
28
29 This module has been deprecated, please consider an alternative like scikit-learn
30 instead.
31 """
32
33 import warnings
34
35 try:
36 import numpy as np
37 except ImportError:
38 from Bio import MissingPythonDependencyError
39
40 raise MissingPythonDependencyError(
41 "Please install NumPy if you want to use Bio.kNN. See http://www.numpy.org/"
42 ) from None
43
44 from Bio import BiopythonDeprecationWarning
45
# Module-level statement: runs when Bio.kNN is imported and issues a
# BiopythonDeprecationWarning telling users the module is going away.
46 warnings.warn(
47 "The 'Bio.kNN' module is deprecated and will be removed in a future "
48 "release of Biopython. Consider using scikit-learn instead.",
49 BiopythonDeprecationWarning,
50 )
51
52
class kNN:
    """Container for the state of a k-nearest-neighbors classifier.

    A newly created instance is empty; its attributes are expected to be
    filled in afterwards.

    Attributes:
     - classes  Set of the possible classes (initially an empty set).
     - xs       The neighbors, i.e. the known observations (initially []).
     - ys       The classes that the neighbors belong to (initially []).
     - k        Number of neighbors to look at (initially None).

    """

    def __init__(self):
        """Create an empty classifier holding no data and no k."""
        self.k = None
        self.xs, self.ys = [], []
        self.classes = set()
69
70
def equal_weight(x, y):
    """Return the constant weight 1, whatever the two points are.

    Both arguments are accepted only so that the function has the same
    two-argument call shape as any other weighting function; neither is
    inspected. The effect is that every neighbor casts exactly one vote.
    """
    return 1
75
76
def train(xs, ys, k, typecode=None):
    """Build a k-nearest-neighbors classifier from a training set.

    Arguments:
     - xs        The training observations; stored as a NumPy array.
     - ys        The class assignment of each observation; stored as given.
                 xs and ys should contain the same number of elements.
     - k         Number of neighbors to examine when classifying.
     - typecode  Optional dtype handed to numpy.asarray when converting xs.

    Returns a kNN object whose classes attribute is the set of distinct
    values found in ys.
    """
    model = kNN()
    # "Training" is just remembering the data: collect the distinct labels
    # first, then convert the observations to an array.
    model.classes = set(ys)
    model.xs = np.asarray(xs, dtype=typecode)
    model.ys, model.k = ys, k
    return model
91
92
def calculate(knn, x, weight_fn=None, distance_fn=None):
    """Compute the weight (vote total) of every class for an observation.

    Arguments:
     - knn is the classifier; its xs, ys, classes and k attributes are read.
     - x is the observed data.
     - weight_fn is an optional function that takes x and a training
       example, and returns a weight. If it is None (the default), every
       neighbor gets a weight of 1.
     - distance_fn is an optional function that takes two points and
       returns the distance between them. If distance_fn is None (the
       default), the Euclidean distance is used.

    Returns a dictionary mapping each class in knn.classes to the summed
    weight of the nearest knn.k training examples belonging to it; classes
    with no example among those neighbors map to 0.0.
    """
    if weight_fn is None:
        weight_fn = equal_weight

    x = np.asarray(x)

    # Build (distance, index) pairs, one per training example.
    if distance_fn:
        ranked = [(distance_fn(x, point), idx) for idx, point in enumerate(knn.xs)]
    else:
        # Default: Euclidean distance. One preallocated float buffer is
        # reused for every difference vector instead of allocating anew.
        scratch = np.zeros(len(x))
        ranked = []
        for idx, point in enumerate(knn.xs):
            scratch[:] = x - point
            ranked.append((np.sqrt(np.dot(scratch, scratch)), idx))
    # Tuples sort by distance first; equal distances fall back to the index.
    ranked.sort()

    # Every known class starts with zero votes, then the closest knn.k
    # examples each add their weight to their own class.
    weights = dict.fromkeys(knn.classes, 0.0)
    for _, idx in ranked[: knn.k]:
        label = knn.ys[idx]
        weights[label] = weights[label] + weight_fn(x, knn.xs[idx])

    return weights
136
137
def classify(knn, x, weight_fn=None, distance_fn=None):
    """Classify an observation into a class.

    Arguments:
     - knn is the classifier to use.
     - x is the observed data.
     - weight_fn is an optional function that takes x and a training
       example, and returns a weight. If not specified, all neighbors are
       given equal weight (the default is applied by calculate).
     - distance_fn is an optional function that takes two points and
       returns the distance between them. If distance_fn is None (the
       default), the Euclidean distance is used.

    Returns the class with the largest total weight as computed by
    calculate. When several classes share the largest weight, the one that
    comes first in the dictionary returned by calculate wins. Returns None
    if that dictionary is empty (the classifier knows no classes).
    """
    weights = calculate(knn, x, weight_fn=weight_fn, distance_fn=distance_fn)

    # max() keeps the first maximal entry and never looks at the label to
    # decide whether a winner has been chosen yet, so a class whose label
    # happens to be None is ranked by its weight like any other class.
    return max(weights, key=weights.get, default=None)