Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/kNN.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 # Copyright 2002 by Jeffrey Chang. | |
2 # All rights reserved. | |
3 # | |
4 # This file is part of the Biopython distribution and governed by your | |
5 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
6 # Please see the LICENSE file that should have been included as part of this | |
7 # package. | |
8 """Code for doing k-nearest-neighbors classification (DEPRECATED). | |
9 | |
10 k Nearest Neighbors is a supervised learning algorithm that classifies | |
11 a new observation based on the classes in its surrounding neighborhood. | |
12 | |
13 Glossary: | |
14 - distance The distance between two points in the feature space. | |
15 - weight The importance given to each point for classification. | |
16 | |
17 Classes: | |
18 - kNN Holds information for a nearest neighbors classifier. | |
19 | |
20 | |
21 Functions: | |
22 - train Train a new kNN classifier. | |
23 - calculate Calculate the probabilities of each class, given an observation. | |
24 - classify Classify an observation into a class. | |
25 | |
26 Weighting Functions: | |
27 - equal_weight Every example is given a weight of 1. | |
28 | |
29 This module has been deprecated, please consider an alternative like scikit-learn | |
30 instead. | |
31 """ | |
32 | |
33 import warnings | |
34 | |
35 try: | |
36 import numpy as np | |
37 except ImportError: | |
38 from Bio import MissingPythonDependencyError | |
39 | |
40 raise MissingPythonDependencyError( | |
41 "Please install NumPy if you want to use Bio.kNN. See http://www.numpy.org/" | |
42 ) from None | |
43 | |
44 from Bio import BiopythonDeprecationWarning | |
45 | |
46 warnings.warn( | |
47 "The 'Bio.kNN' module is deprecated and will be removed in a future " | |
48 "release of Biopython. Consider using scikit-learn instead.", | |
49 BiopythonDeprecationWarning, | |
50 ) | |
51 | |
52 | |
class kNN:
    """Container for the state of a k-nearest-neighbors classifier.

    Attributes:
     - classes  Set of the possible classes.
     - xs       List of the neighbors.
     - ys       List of the classes that the neighbors belong to.
     - k        Number of neighbors to look at.

    """

    def __init__(self):
        """Create an empty classifier with no neighbors and no k set."""
        self.classes = set()
        # Two distinct empty lists: the neighbors and their class labels.
        self.xs, self.ys = [], []
        self.k = None
69 | |
70 | |
def equal_weight(x, y):
    """Weighting function that gives every neighbor exactly one vote.

    Both arguments are ignored; the integer 1 is always returned.
    """
    return 1
75 | |
76 | |
def train(xs, ys, k, typecode=None):
    """Build a k nearest neighbors classifier from a training set.

    xs is a list of observations and ys is the list of class assignments
    for those observations, so both should have the same number of
    elements. k is the number of neighbors that will be examined when an
    observation is classified. typecode is passed to NumPy as the dtype
    used when xs is converted to an array.

    Returns the populated kNN object.
    """
    # Collect the labels before converting xs, matching the order in which
    # the two inputs are consumed.
    labels = set(ys)
    points = np.asarray(xs, dtype=typecode)

    model = kNN()
    model.classes = labels
    model.xs = points
    model.ys = ys
    model.k = k
    return model
91 | |
92 | |
def calculate(knn, x, weight_fn=None, distance_fn=None):
    """Calculate the weight given to each class for an observation.

    Arguments:
     - knn is the classifier; its xs, ys, k and classes attributes are read.
     - x is the observed data.
     - weight_fn is an optional function that takes x and a training
       example, and returns a weight. If weight_fn is None (the default),
       every neighbor gets a weight of 1.
     - distance_fn is an optional function that takes two points and
       returns the distance between them. If distance_fn is None (the
       default), the Euclidean distance is used.

    Returns a dictionary of the class to the weight given to the class.
    Every class in knn.classes is present, starting from 0.0; only the
    knn.k nearest training examples contribute weight.
    """
    if weight_fn is None:
        weight_fn = equal_weight

    x = np.asarray(x)

    # Build a list of (distance, index) pairs, one per training example.
    if distance_fn is not None:
        # Test against None explicitly so that any supplied callable is
        # honored, even one whose truth value happens to be false.
        order = [(distance_fn(x, point), i) for i, point in enumerate(knn.xs)]
    else:
        # Default: Use a fast implementation of the Euclidean distance.
        # Predefining temp allows reuse of this array instead of
        # allocating a new difference vector for every training example.
        temp = np.zeros(len(x))
        order = []
        for i, point in enumerate(knn.xs):
            temp[:] = x - point
            order.append((np.sqrt(np.dot(temp, temp)), i))
    # Tuples sort by distance first; equal distances fall back to the index.
    order.sort()

    # The first 'k' entries are the nearest neighbors; tally their votes.
    weights = dict.fromkeys(knn.classes, 0.0)  # class -> accumulated weight
    for _, i in order[: knn.k]:
        klass = knn.ys[i]
        weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])

    return weights
136 | |
137 | |
def classify(knn, x, weight_fn=None, distance_fn=None):
    """Classify an observation into a class.

    Arguments:
     - knn is the classifier to use.
     - x is the observed data.
     - weight_fn is an optional function that takes x and a training
       example, and returns a weight. If not specified, all neighbors
       are given equal weight.
     - distance_fn is an optional function that takes two points and
       returns the distance between them. If distance_fn is None (the
       default), the Euclidean distance is used.

    Returns the class with the largest total weight. When several classes
    share the largest weight, the one encountered first while iterating
    over the weights wins. Returns None if there are no classes at all.
    """
    # weight_fn is passed through unchanged: calculate() applies the
    # equal-weight default itself when it receives None.
    weights = calculate(knn, x, weight_fn=weight_fn, distance_fn=distance_fn)

    # max() keeps the first of several equally weighted classes. Using it
    # instead of a "None means nothing chosen yet" marker keeps the result
    # correct when None is itself one of the class labels.
    return max(weights, key=weights.get, default=None)