Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/kNN.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line source
# Copyright 2002 by Jeffrey Chang. # All rights reserved. # # This file is part of the Biopython distribution and governed by your # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". # Please see the LICENSE file that should have been included as part of this # package. """Code for doing k-nearest-neighbors classification (DEPRECATED). k Nearest Neighbors is a supervised learning algorithm that classifies a new observation based the classes in its surrounding neighborhood. Glossary: - distance The distance between two points in the feature space. - weight The importance given to each point for classification. Classes: - kNN Holds information for a nearest neighbors classifier. Functions: - train Train a new kNN classifier. - calculate Calculate the probabilities of each class, given an observation. - classify Classify an observation into a class. Weighting Functions: - equal_weight Every example is given a weight of 1. This module has been deprecated, please consider an alternative like scikit-learn insead. """ import warnings try: import numpy as np except ImportError: from Bio import MissingPythonDependencyError raise MissingPythonDependencyError( "Please install NumPy if you want to use Bio.kNN. See http://www.numpy.org/" ) from None from Bio import BiopythonDeprecationWarning warnings.warn( "The 'Bio.kNN' module is deprecated and will be removed in a future " "release of Biopython. Consider using scikit-learn instead.", BiopythonDeprecationWarning, ) class kNN: """Holds information necessary to do nearest neighbors classification. Attributes: - classes Set of the possible classes. - xs List of the neighbors. - ys List of the classes that the neighbors belong to. - k Number of neighbors to look at. """ def __init__(self): """Initialize the class.""" self.classes = set() self.xs = [] self.ys = [] self.k = None def equal_weight(x, y): """Return integer one (dummy method for equally weighting).""" # everything gets 1 vote return 1 def train(xs, ys, k, typecode=None): """Train a k nearest neighbors classifier on a training set. xs is a list of observations and ys is a list of the class assignments. Thus, xs and ys should contain the same number of elements. k is the number of neighbors that should be examined when doing the classification. """ knn = kNN() knn.classes = set(ys) knn.xs = np.asarray(xs, typecode) knn.ys = ys knn.k = k return knn def calculate(knn, x, weight_fn=None, distance_fn=None): """Calculate the probability for each class. Arguments: - x is the observed data. - weight_fn is an optional function that takes x and a training example, and returns a weight. - distance_fn is an optional function that takes two points and returns the distance between them. If distance_fn is None (the default), the Euclidean distance is used. Returns a dictionary of the class to the weight given to the class. """ if weight_fn is None: weight_fn = equal_weight x = np.asarray(x) order = [] # list of (distance, index) if distance_fn: for i in range(len(knn.xs)): dist = distance_fn(x, knn.xs[i]) order.append((dist, i)) else: # Default: Use a fast implementation of the Euclidean distance temp = np.zeros(len(x)) # Predefining temp allows reuse of this array, making this # function about twice as fast. for i in range(len(knn.xs)): temp[:] = x - knn.xs[i] dist = np.sqrt(np.dot(temp, temp)) order.append((dist, i)) order.sort() # first 'k' are the ones I want. weights = {} # class -> number of votes for k in knn.classes: weights[k] = 0.0 for dist, i in order[: knn.k]: klass = knn.ys[i] weights[klass] = weights[klass] + weight_fn(x, knn.xs[i]) return weights def classify(knn, x, weight_fn=None, distance_fn=None): """Classify an observation into a class. If not specified, weight_fn will give all neighbors equal weight. distance_fn is an optional function that takes two points and returns the distance between them. If distance_fn is None (the default), the Euclidean distance is used. """ if weight_fn is None: weight_fn = equal_weight weights = calculate(knn, x, weight_fn=weight_fn, distance_fn=distance_fn) most_class = None most_weight = None for klass, weight in weights.items(): if most_class is None or weight > most_weight: most_class = klass most_weight = weight return most_class