diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/kNN.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/kNN.py	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,157 @@
+# Copyright 2002 by Jeffrey Chang.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code for doing k-nearest-neighbors classification (DEPRECATED).
+
+k Nearest Neighbors is a supervised learning algorithm that classifies
+a new observation based on the classes in its surrounding neighborhood.
+
+Glossary:
+ - distance   The distance between two points in the feature space.
+ - weight     The importance given to each point for classification.
+
+Classes:
+ - kNN           Holds information for a nearest neighbors classifier.
+
+
+Functions:
+ - train        Train a new kNN classifier.
+ - calculate    Calculate the probabilities of each class, given an observation.
+ - classify     Classify an observation into a class.
+
+Weighting Functions:
+ - equal_weight    Every example is given a weight of 1.
+
+This module has been deprecated; please consider an alternative such as
+scikit-learn instead.
+"""
+
+import warnings
+
+try:
+    import numpy as np
+except ImportError:
+    from Bio import MissingPythonDependencyError
+
+    raise MissingPythonDependencyError(
+        "Please install NumPy if you want to use Bio.kNN. See http://www.numpy.org/"
+    ) from None
+
+from Bio import BiopythonDeprecationWarning
+
+warnings.warn(
+    "The 'Bio.kNN' module is deprecated and will be removed in a future "
+    "release of Biopython. Consider using scikit-learn instead.",
+    BiopythonDeprecationWarning,
+)
+
+
+class kNN:
+    """Holds information necessary to do nearest neighbors classification.
+
+    Attributes:
+     - classes  Set of the possible classes.
+     - xs       List of the neighbors.
+     - ys       List of the classes that the neighbors belong to.
+     - k        Number of neighbors to look at.
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.classes = set()
+        self.xs = []
+        self.ys = []
+        self.k = None
+
+
+def equal_weight(x, y):
+    """Return integer one (dummy method for equally weighting)."""
+    # everything gets 1 vote
+    return 1
+
+
+def train(xs, ys, k, typecode=None):
+    """Train a k nearest neighbors classifier on a training set.
+
+    xs is a list of observations and ys is a list of the class assignments.
+    Thus, xs and ys should contain the same number of elements.  k is
+    the number of neighbors that should be examined when doing the
+    classification.
+    """
+    knn = kNN()
+    knn.classes = set(ys)
+    knn.xs = np.asarray(xs, typecode)
+    knn.ys = ys
+    knn.k = k
+    return knn
+
+
+def calculate(knn, x, weight_fn=None, distance_fn=None):
+    """Calculate the probability for each class.
+
+    Arguments:
+     - x is the observed data.
+     - weight_fn is an optional function that takes x and a training
+       example, and returns a weight.
+     - distance_fn is an optional function that takes two points and
+       returns the distance between them.  If distance_fn is None (the
+       default), the Euclidean distance is used.
+
+    Returns a dictionary of the class to the weight given to the class.
+    """
+    if weight_fn is None:
+        weight_fn = equal_weight
+
+    x = np.asarray(x)
+
+    order = []  # list of (distance, index)
+    if distance_fn:
+        for i in range(len(knn.xs)):
+            dist = distance_fn(x, knn.xs[i])
+            order.append((dist, i))
+    else:
+        # Default: Use a fast implementation of the Euclidean distance
+        temp = np.zeros(len(x))
+        # Predefining temp allows reuse of this array, making this
+        # function about twice as fast.
+        for i in range(len(knn.xs)):
+            temp[:] = x - knn.xs[i]
+            dist = np.sqrt(np.dot(temp, temp))
+            order.append((dist, i))
+    order.sort()
+
+    # The first 'k' entries in 'order' are the nearest neighbors.
+    weights = {}  # class -> number of votes
+    for k in knn.classes:
+        weights[k] = 0.0
+    for dist, i in order[: knn.k]:
+        klass = knn.ys[i]
+        weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
+
+    return weights
+
+
+def classify(knn, x, weight_fn=None, distance_fn=None):
+    """Classify an observation into a class.
+
+    If not specified, weight_fn will give all neighbors equal weight.
+    distance_fn is an optional function that takes two points and returns
+    the distance between them.  If distance_fn is None (the default),
+    the Euclidean distance is used.
+    """
+    if weight_fn is None:
+        weight_fn = equal_weight
+
+    weights = calculate(knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
+
+    most_class = None
+    most_weight = None
+    for klass, weight in weights.items():
+        if most_class is None or weight > most_weight:
+            most_class = klass
+            most_weight = weight
+    return most_class
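
A minimal usage sketch of the module added above, based only on its docstrings: train a classifier on a toy data set, then query the per-class weights and the predicted class for a new observation. The variable names and toy values are illustrative, not part of the Biopython distribution, and importing Bio.kNN will emit the BiopythonDeprecationWarning defined in the module.

    from Bio import kNN

    # Toy training data: two features per observation, two classes (0 and 1).
    xs = [[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [0.9, 1.1]]
    ys = [0, 0, 1, 1]

    # Train a classifier that looks at the 3 nearest neighbors.
    model = kNN.train(xs, ys, k=3)

    # Per-class vote weights for a new observation (equal_weight by default),
    # e.g. {0: 2.0, 1: 1.0} for this toy data.
    print(kNN.calculate(model, [0.2, 0.1]))

    # The class with the largest weight, here 0.
    print(kNN.classify(model, [0.2, 0.1]))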
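Since the deprecation notice points to scikit-learn, a rough equivalent of the same workflow with sklearn.neighbors.KNeighborsClassifier is sketched below. Note one design difference: predict_proba returns normalized class probabilities (vote fractions), whereas calculate above returns raw vote weights.

    from sklearn.neighbors import KNeighborsClassifier

    xs = [[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [0.9, 1.1]]
    ys = [0, 0, 1, 1]

    # Uniform weights and Euclidean distance are the defaults, matching
    # equal_weight and the default distance_fn in Bio.kNN.
    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(xs, ys)

    print(clf.predict_proba([[0.2, 0.1]]))  # class probabilities
    print(clf.predict([[0.2, 0.1]]))        # predicted class, here array([0])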