annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/kNN.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # Copyright 2002 by Jeffrey Chang.
jpayne@68 2 # All rights reserved.
jpayne@68 3 #
jpayne@68 4 # This file is part of the Biopython distribution and governed by your
jpayne@68 5 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@68 6 # Please see the LICENSE file that should have been included as part of this
jpayne@68 7 # package.
jpayne@68 8 """Code for doing k-nearest-neighbors classification (DEPRECATED).
jpayne@68 9
jpayne@68 10 k Nearest Neighbors is a supervised learning algorithm that classifies
jpayne@68 11 a new observation based on the classes in its surrounding neighborhood.
jpayne@68 12
jpayne@68 13 Glossary:
jpayne@68 14 - distance The distance between two points in the feature space.
jpayne@68 15 - weight The importance given to each point for classification.
jpayne@68 16
jpayne@68 17 Classes:
jpayne@68 18 - kNN Holds information for a nearest neighbors classifier.
jpayne@68 19
jpayne@68 20
jpayne@68 21 Functions:
jpayne@68 22 - train Train a new kNN classifier.
jpayne@68 23 - calculate Calculate the probabilities of each class, given an observation.
jpayne@68 24 - classify Classify an observation into a class.
jpayne@68 25
jpayne@68 26 Weighting Functions:
jpayne@68 27 - equal_weight Every example is given a weight of 1.
jpayne@68 28
jpayne@68 29 This module has been deprecated; please consider an alternative such as
jpayne@68 30 scikit-learn instead.
jpayne@68 31 """
jpayne@68 32
jpayne@68 33 import warnings
jpayne@68 34
jpayne@68 35 try:
jpayne@68 36 import numpy as np
jpayne@68 37 except ImportError:
jpayne@68 38 from Bio import MissingPythonDependencyError
jpayne@68 39
jpayne@68 40 raise MissingPythonDependencyError(
jpayne@68 41 "Please install NumPy if you want to use Bio.kNN. See http://www.numpy.org/"
jpayne@68 42 ) from None
jpayne@68 43
jpayne@68 44 from Bio import BiopythonDeprecationWarning
jpayne@68 45
# Importing this module is enough to trigger the deprecation notice.
warnings.warn(
    message=(
        "The 'Bio.kNN' module is deprecated and will be removed in a future "
        "release of Biopython. Consider using scikit-learn instead."
    ),
    category=BiopythonDeprecationWarning,
)
jpayne@68 51
jpayne@68 52
class kNN:
    """Container for the state of a nearest neighbors classifier.

    Attributes:
     - classes  Set of the possible classes.
     - xs       The neighbors (training observations).
     - ys       The class of each neighbor, in the same order as xs.
     - k        Number of neighbors to look at when classifying.
    """

    def __init__(self):
        """Create an empty classifier with no neighbors and no k."""
        self.k = None
        # Two distinct lists: xs and ys must never alias each other.
        self.xs, self.ys = [], []
        self.classes = set()
jpayne@68 69
jpayne@68 70
def equal_weight(x, y):
    """Return 1 so that every neighbor counts as exactly one vote.

    Both arguments are ignored; they are accepted only so that this function
    can be called the same way as any other weighting function.
    """
    unit_vote = 1
    return unit_vote
jpayne@68 75
jpayne@68 76
def train(xs, ys, k, typecode=None):
    """Train a k nearest neighbors classifier on a training set.

    Arguments:
     - xs is a list of observations; it is converted to a NumPy array.
     - ys is a list of the class assignments, one per observation, so it
       must contain the same number of elements as xs.
     - k is the number of neighbors that should be examined when doing
       the classification.
     - typecode is an optional dtype handed to numpy.asarray when xs is
       converted; with None (the default) NumPy infers it.

    Returns a kNN object holding the training data.

    Raises ValueError if xs and ys do not have the same length.
    """
    observations = np.asarray(xs, typecode)
    # Fail here rather than later inside calculate(), where a mismatch shows
    # up as an IndexError or as labels that are silently never used.
    if len(observations) != len(ys):
        raise ValueError(
            "xs and ys must contain the same number of elements "
            f"(got {len(observations)} and {len(ys)})"
        )

    knn = kNN()
    knn.classes = set(ys)
    knn.xs = observations
    knn.ys = ys
    knn.k = k
    return knn
jpayne@68 91
jpayne@68 92
def calculate(knn, x, weight_fn=None, distance_fn=None):
    """Calculate the total neighbor weight for each class.

    The knn.k training examples closest to x each add their weight to the
    class they belong to. The totals are not normalized, so they are vote
    weights rather than probabilities.

    Arguments:
     - knn is the classifier; its classes, xs, ys and k attributes are read.
     - x is the observed data.
     - weight_fn is an optional function that takes x and a training
       example, and returns a weight. If weight_fn is None (the default),
       every neighbor gets the weight given by equal_weight.
     - distance_fn is an optional function that takes two points and
       returns the distance between them. If distance_fn is None (the
       default), the Euclidean distance is used.

    Returns a dictionary of the class to the weight given to the class.
    Every class in knn.classes is present, with 0.0 if it received no votes.
    """
    if weight_fn is None:
        weight_fn = equal_weight

    x = np.asarray(x)

    # Build a list of (distance, index) pairs, one per training example.
    if distance_fn:
        order = [
            (distance_fn(x, neighbor), i) for i, neighbor in enumerate(knn.xs)
        ]
    else:
        # Default: Use a fast implementation of the Euclidean distance.
        # Reusing one preallocated float buffer for the difference vector
        # avoids allocating a new array for every training example.
        scratch = np.zeros(len(x))
        order = []
        for i, neighbor in enumerate(knn.xs):
            scratch[:] = x - neighbor
            order.append((np.sqrt(np.dot(scratch, scratch)), i))
    # Tuples sort by distance first; equal distances fall back to the index.
    order.sort()

    # Only the first knn.k entries (the nearest neighbors) get to vote.
    weights = dict.fromkeys(knn.classes, 0.0)  # class -> accumulated weight
    for _, i in order[: knn.k]:
        klass = knn.ys[i]
        weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])

    return weights
jpayne@68 136
jpayne@68 137
def classify(knn, x, weight_fn=None, distance_fn=None):
    """Classify an observation into a class.

    Arguments:
     - knn is the classifier to use.
     - x is the observed data.
     - weight_fn is an optional function that takes x and a training
       example, and returns a weight. If not specified, all neighbors are
       given equal weight.
     - distance_fn is an optional function that takes two points and
       returns the distance between them. If distance_fn is None (the
       default), the Euclidean distance is used.

    Returns the class with the largest total weight. When several classes
    share that weight, the one that comes first in the weights dictionary is
    returned. Returns None if there are no classes at all.
    """
    # calculate() applies the equal-weight default itself when weight_fn is
    # None, so the arguments are simply passed through.
    weights = calculate(knn, x, weight_fn=weight_fn, distance_fn=distance_fn)

    # max() keeps the first maximal key, and, unlike a None sentinel, works
    # correctly even when None is itself one of the class labels.
    return max(weights, key=weights.get, default=None)