annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/kNN.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 # Copyright 2002 by Jeffrey Chang.
jpayne@69 2 # All rights reserved.
jpayne@69 3 #
jpayne@69 4 # This file is part of the Biopython distribution and governed by your
jpayne@69 5 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@69 6 # Please see the LICENSE file that should have been included as part of this
jpayne@69 7 # package.
jpayne@69 8 """Code for doing k-nearest-neighbors classification (DEPRECATED).
jpayne@69 9
jpayne@69 10 k Nearest Neighbors is a supervised learning algorithm that classifies
jpayne@69 11 a new observation based the classes in its surrounding neighborhood.
jpayne@69 12
jpayne@69 13 Glossary:
jpayne@69 14 - distance The distance between two points in the feature space.
jpayne@69 15 - weight The importance given to each point for classification.
jpayne@69 16
jpayne@69 17 Classes:
jpayne@69 18 - kNN Holds information for a nearest neighbors classifier.
jpayne@69 19
jpayne@69 20
jpayne@69 21 Functions:
jpayne@69 22 - train Train a new kNN classifier.
jpayne@69 23 - calculate Calculate the probabilities of each class, given an observation.
jpayne@69 24 - classify Classify an observation into a class.
jpayne@69 25
jpayne@69 26 Weighting Functions:
jpayne@69 27 - equal_weight Every example is given a weight of 1.
jpayne@69 28
jpayne@69 29 This module has been deprecated, please consider an alternative like scikit-learn
jpayne@69 30 insead.
jpayne@69 31 """
jpayne@69 32
jpayne@69 33 import warnings
jpayne@69 34
jpayne@69 35 try:
jpayne@69 36 import numpy as np
jpayne@69 37 except ImportError:
jpayne@69 38 from Bio import MissingPythonDependencyError
jpayne@69 39
jpayne@69 40 raise MissingPythonDependencyError(
jpayne@69 41 "Please install NumPy if you want to use Bio.kNN. See http://www.numpy.org/"
jpayne@69 42 ) from None
jpayne@69 43
jpayne@69 44 from Bio import BiopythonDeprecationWarning
jpayne@69 45
jpayne@69 46 warnings.warn(
jpayne@69 47 "The 'Bio.kNN' module is deprecated and will be removed in a future "
jpayne@69 48 "release of Biopython. Consider using scikit-learn instead.",
jpayne@69 49 BiopythonDeprecationWarning,
jpayne@69 50 )
jpayne@69 51
jpayne@69 52
jpayne@69 53 class kNN:
jpayne@69 54 """Holds information necessary to do nearest neighbors classification.
jpayne@69 55
jpayne@69 56 Attributes:
jpayne@69 57 - classes Set of the possible classes.
jpayne@69 58 - xs List of the neighbors.
jpayne@69 59 - ys List of the classes that the neighbors belong to.
jpayne@69 60 - k Number of neighbors to look at.
jpayne@69 61 """
jpayne@69 62
jpayne@69 63 def __init__(self):
jpayne@69 64 """Initialize the class."""
jpayne@69 65 self.classes = set()
jpayne@69 66 self.xs = []
jpayne@69 67 self.ys = []
jpayne@69 68 self.k = None
jpayne@69 69
jpayne@69 70
jpayne@69 71 def equal_weight(x, y):
jpayne@69 72 """Return integer one (dummy method for equally weighting)."""
jpayne@69 73 # everything gets 1 vote
jpayne@69 74 return 1
jpayne@69 75
jpayne@69 76
jpayne@69 77 def train(xs, ys, k, typecode=None):
jpayne@69 78 """Train a k nearest neighbors classifier on a training set.
jpayne@69 79
jpayne@69 80 xs is a list of observations and ys is a list of the class assignments.
jpayne@69 81 Thus, xs and ys should contain the same number of elements. k is
jpayne@69 82 the number of neighbors that should be examined when doing the
jpayne@69 83 classification.
jpayne@69 84 """
jpayne@69 85 knn = kNN()
jpayne@69 86 knn.classes = set(ys)
jpayne@69 87 knn.xs = np.asarray(xs, typecode)
jpayne@69 88 knn.ys = ys
jpayne@69 89 knn.k = k
jpayne@69 90 return knn
jpayne@69 91
jpayne@69 92
jpayne@69 93 def calculate(knn, x, weight_fn=None, distance_fn=None):
jpayne@69 94 """Calculate the probability for each class.
jpayne@69 95
jpayne@69 96 Arguments:
jpayne@69 97 - x is the observed data.
jpayne@69 98 - weight_fn is an optional function that takes x and a training
jpayne@69 99 example, and returns a weight.
jpayne@69 100 - distance_fn is an optional function that takes two points and
jpayne@69 101 returns the distance between them. If distance_fn is None (the
jpayne@69 102 default), the Euclidean distance is used.
jpayne@69 103
jpayne@69 104 Returns a dictionary of the class to the weight given to the class.
jpayne@69 105 """
jpayne@69 106 if weight_fn is None:
jpayne@69 107 weight_fn = equal_weight
jpayne@69 108
jpayne@69 109 x = np.asarray(x)
jpayne@69 110
jpayne@69 111 order = [] # list of (distance, index)
jpayne@69 112 if distance_fn:
jpayne@69 113 for i in range(len(knn.xs)):
jpayne@69 114 dist = distance_fn(x, knn.xs[i])
jpayne@69 115 order.append((dist, i))
jpayne@69 116 else:
jpayne@69 117 # Default: Use a fast implementation of the Euclidean distance
jpayne@69 118 temp = np.zeros(len(x))
jpayne@69 119 # Predefining temp allows reuse of this array, making this
jpayne@69 120 # function about twice as fast.
jpayne@69 121 for i in range(len(knn.xs)):
jpayne@69 122 temp[:] = x - knn.xs[i]
jpayne@69 123 dist = np.sqrt(np.dot(temp, temp))
jpayne@69 124 order.append((dist, i))
jpayne@69 125 order.sort()
jpayne@69 126
jpayne@69 127 # first 'k' are the ones I want.
jpayne@69 128 weights = {} # class -> number of votes
jpayne@69 129 for k in knn.classes:
jpayne@69 130 weights[k] = 0.0
jpayne@69 131 for dist, i in order[: knn.k]:
jpayne@69 132 klass = knn.ys[i]
jpayne@69 133 weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
jpayne@69 134
jpayne@69 135 return weights
jpayne@69 136
jpayne@69 137
jpayne@69 138 def classify(knn, x, weight_fn=None, distance_fn=None):
jpayne@69 139 """Classify an observation into a class.
jpayne@69 140
jpayne@69 141 If not specified, weight_fn will give all neighbors equal weight.
jpayne@69 142 distance_fn is an optional function that takes two points and returns
jpayne@69 143 the distance between them. If distance_fn is None (the default),
jpayne@69 144 the Euclidean distance is used.
jpayne@69 145 """
jpayne@69 146 if weight_fn is None:
jpayne@69 147 weight_fn = equal_weight
jpayne@69 148
jpayne@69 149 weights = calculate(knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
jpayne@69 150
jpayne@69 151 most_class = None
jpayne@69 152 most_weight = None
jpayne@69 153 for klass, weight in weights.items():
jpayne@69 154 if most_class is None or weight > most_weight:
jpayne@69 155 most_class = klass
jpayne@69 156 most_weight = weight
jpayne@69 157 return most_class