# Copyright 2002 by Jeffrey Chang.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for doing k-nearest-neighbors classification (DEPRECATED).

k Nearest Neighbors is a supervised learning algorithm that classifies
a new observation based on the classes in its surrounding neighborhood.

Glossary:
 - distance  The distance between two points in the feature space.
 - weight    The importance given to each point for classification.

Classes:
 - kNN           Holds information for a nearest neighbors classifier.


Functions:
 - train        Train a new kNN classifier.
 - calculate    Calculate the probabilities of each class, given an observation.
 - classify     Classify an observation into a class.

Weighting Functions:
 - equal_weight    Every example is given a weight of 1.

This module has been deprecated, please consider an alternative like scikit-learn
instead.
"""

import warnings

try:
    import numpy as np
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.kNN. See http://www.numpy.org/"
    ) from None

from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.kNN' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)


class kNN:
    """Holds information necessary to do nearest neighbors classification.

    Attributes:
     - classes  Set of the possible classes.
     - xs       List of the neighbors.
     - ys       List of the classes that the neighbors belong to.
     - k        Number of neighbors to look at.
    """

    def __init__(self):
        """Initialize the class."""
        # Populated by train(); empty/None until then.
        self.classes = set()
        self.xs = []
        self.ys = []
        self.k = None


def equal_weight(x, y):
    """Return integer one (dummy method for equally weighting)."""
    # everything gets 1 vote
    return 1


def train(xs, ys, k, typecode=None):
    """Train a k nearest neighbors classifier on a training set.

    xs is a list of observations and ys is a list of the class assignments.
    Thus, xs and ys should contain the same number of elements.  k is
    the number of neighbors that should be examined when doing the
    classification.

    typecode, if given, is forwarded to numpy.asarray as the dtype for
    the stored observation matrix.
    """
    knn = kNN()
    knn.classes = set(ys)
    knn.xs = np.asarray(xs, typecode)
    knn.ys = ys
    knn.k = k
    return knn


def calculate(knn, x, weight_fn=None, distance_fn=None):
    """Calculate the probability for each class.

    Arguments:
     - x is the observed data.
     - weight_fn is an optional function that takes x and a training
       example, and returns a weight.  Defaults to equal_weight.
     - distance_fn is an optional function that takes two points and
       returns the distance between them.  If distance_fn is None (the
       default), the Euclidean distance is used.

    Returns a dictionary of the class to the weight given to the class.
    """
    if weight_fn is None:
        weight_fn = equal_weight

    x = np.asarray(x)

    order = []  # list of (distance, index)
    if distance_fn:
        for i, neighbor in enumerate(knn.xs):
            order.append((distance_fn(x, neighbor), i))
    else:
        # Default: Use a fast implementation of the Euclidean distance.
        # Predefining temp allows reuse of this array, making this
        # function about twice as fast.
        temp = np.zeros(len(x))
        for i, neighbor in enumerate(knn.xs):
            temp[:] = x - neighbor
            order.append((np.sqrt(np.dot(temp, temp)), i))
    order.sort()

    # Tally votes from the k closest training examples only.
    weights = dict.fromkeys(knn.classes, 0.0)  # class -> number of votes
    for dist, i in order[: knn.k]:
        klass = knn.ys[i]
        weights[klass] += weight_fn(x, knn.xs[i])

    return weights


def classify(knn, x, weight_fn=None, distance_fn=None):
    """Classify an observation into a class.

    If not specified, weight_fn will give all neighbors equal weight.
    distance_fn is an optional function that takes two points and returns
    the distance between them.  If distance_fn is None (the default),
    the Euclidean distance is used.

    Returns the class with the greatest total weight among the k nearest
    neighbors (ties broken by dictionary iteration order).
    """
    if weight_fn is None:
        weight_fn = equal_weight

    weights = calculate(knn, x, weight_fn=weight_fn, distance_fn=distance_fn)

    most_class = None
    most_weight = None
    for klass, weight in weights.items():
        if most_class is None or weight > most_weight:
            most_class = klass
            most_weight = weight
    return most_class