# Copyright 2002 by Jeffrey Chang.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for doing k-nearest-neighbors classification (DEPRECATED).

k Nearest Neighbors is a supervised learning algorithm that classifies
a new observation based on the classes in its surrounding neighborhood.

Glossary:
 - distance   The distance between two points in the feature space.
 - weight     The importance given to each point for classification.

Classes:
 - kNN        Holds information for a nearest neighbors classifier.

Functions:
 - train      Train a new kNN classifier.
 - calculate  Calculate the probabilities of each class, given an observation.
 - classify   Classify an observation into a class.

Weighting Functions:
 - equal_weight    Every example is given a weight of 1.

This module has been deprecated; please consider an alternative like
scikit-learn instead.
"""

import warnings

try:
    import numpy as np
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.kNN. See http://www.numpy.org/"
    ) from None

from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.kNN' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)


class kNN:
    """Holds information necessary to do nearest neighbors classification.

    Attributes:
     - classes  Set of the possible classes.
     - xs       List of the neighbors (training observations).
     - ys       List of the classes that the neighbors belong to.
     - k        Number of neighbors to look at.
    """

    def __init__(self):
        """Initialize the class."""
        self.classes = set()
        self.xs = []
        self.ys = []
        self.k = None


def equal_weight(x, y):
    """Return integer one (dummy method for equally weighting).

    Every training example gets exactly one vote, regardless of the
    observation ``x`` or the neighbor ``y``.
    """
    return 1


def train(xs, ys, k, typecode=None):
    """Train a k nearest neighbors classifier on a training set.

    Arguments:
     - xs is a list of observations and ys is a list of the class
       assignments, so xs and ys should contain the same number of elements.
     - k is the number of neighbors that should be examined when doing the
       classification.
     - typecode is an optional dtype for the internal numpy array of
       observations.

    Returns a trained kNN object.
    """
    knn = kNN()
    knn.classes = set(ys)
    knn.xs = np.asarray(xs, typecode)
    knn.ys = ys
    knn.k = k
    return knn


def calculate(knn, x, weight_fn=None, distance_fn=None):
    """Calculate the probability for each class.

    Arguments:
     - x is the observed data.
     - weight_fn is an optional function that takes x and a training
       example, and returns a weight.  Defaults to equal_weight.
     - distance_fn is an optional function that takes two points and
       returns the distance between them.  If distance_fn is None (the
       default), the Euclidean distance is used.

    Returns a dictionary of the class to the weight given to the class.
    """
    if weight_fn is None:
        weight_fn = equal_weight

    x = np.asarray(x)

    order = []  # list of (distance, index)
    if distance_fn:
        for i in range(len(knn.xs)):
            dist = distance_fn(x, knn.xs[i])
            order.append((dist, i))
    else:
        # Default: Use a fast implementation of the Euclidean distance.
        # Predefining temp allows reuse of this array, making this
        # function about twice as fast.
        temp = np.zeros(len(x))
        for i in range(len(knn.xs)):
            temp[:] = x - knn.xs[i]
            dist = np.sqrt(np.dot(temp, temp))
            order.append((dist, i))
    # Sorting on (distance, index) breaks distance ties by training order.
    order.sort()

    # Only the first k neighbors contribute votes.
    weights = dict.fromkeys(knn.classes, 0.0)  # class -> number of votes
    for dist, i in order[: knn.k]:
        klass = knn.ys[i]
        weights[klass] += weight_fn(x, knn.xs[i])

    return weights


def classify(knn, x, weight_fn=None, distance_fn=None):
    """Classify an observation into a class.

    If not specified, weight_fn will give all neighbors equal weight.
    distance_fn is an optional function that takes two points and returns
    the distance between them.  If distance_fn is None (the default),
    the Euclidean distance is used.

    Returns the class with the highest total weight (ties resolved in
    favor of the class encountered first), or None if there are no classes.
    """
    if weight_fn is None:
        weight_fn = equal_weight

    weights = calculate(knn, x, weight_fn=weight_fn, distance_fn=distance_fn)

    # max() returns the first key attaining the maximum weight, matching
    # the original first-strictly-greater scan; default covers no classes.
    return max(weights, key=weights.get, default=None)