diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/NaiveBayes.py @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
| author | jpayne |
|---|---|
| date | Tue, 18 Mar 2025 16:23:26 -0400 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/NaiveBayes.py	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,225 @@
+# Copyright 2000 by Jeffrey Chang. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""General Naive Bayes learner (DEPRECATED).
+
+Naive Bayes is a supervised classification algorithm that uses Bayes
+rule to compute the fit between a new observation and some previously
+observed data. The observations are discrete feature vectors, with
+the Bayes assumption that the features are independent. Although this
+is hardly ever true, the classifier works well enough in practice.
+
+Glossary:
+ - observation - A feature vector of discrete data.
+ - class - A possible classification for an observation.
+
+Classes:
+ - NaiveBayes - Holds information for a naive Bayes classifier.
+
+Functions:
+ - train - Train a new naive Bayes classifier.
+ - calculate - Calculate the probabilities of each class,
+   given an observation.
+ - classify - Classify an observation into a class.
+
+"""
+
+
+import warnings
+from Bio import BiopythonDeprecationWarning
+
+warnings.warn(
+    "The 'Bio.NaiveBayes' module is deprecated and will be removed in a future "
+    "release of Biopython. Consider using scikit-learn instead.",
+    BiopythonDeprecationWarning,
+)
+
+
+try:
+    import numpy as np
+except ImportError:
+    from Bio import MissingPythonDependencyError
+
+    raise MissingPythonDependencyError(
+        "Please install NumPy if you want to use Bio.NaiveBayes. "
+        "See http://www.numpy.org/"
+    ) from None
+
+
+def _contents(items):
+    """Return a dictionary where the key is the item and the value is the associated probability (PRIVATE)."""
+    term = 1.0 / len(items)
+    counts = {}
+    for item in items:
+        counts[item] = counts.get(item, 0) + term
+    return counts
+
+
+class NaiveBayes:
+    """Hold information for a NaiveBayes classifier.
+
+    Attributes:
+     - classes - List of the possible classes of data.
+     - p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
+     - p_prior - List of the prior probabilities for every class.
+     - dimensionality - Dimensionality of the data.
+
+    """
+
+    def __init__(self):
+        """Initialize the class."""
+        self.classes = []
+        self.p_conditional = None
+        self.p_prior = []
+        self.dimensionality = None
+
+
+def calculate(nb, observation, scale=False):
+    """Calculate the logarithmic conditional probability for each class.
+
+    Arguments:
+     - nb - A NaiveBayes classifier that has been trained.
+     - observation - A list representing the observed data.
+     - scale - Boolean to indicate whether the probability should be
+       scaled by ``P(observation)``. By default, no scaling is done.
+
+    A dictionary is returned where the key is the class and the value is
+    the log probability of the class.
+    """
+    # P(class|observation) = P(observation|class)*P(class)/P(observation)
+    # Taking the log:
+    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)
+
+    # Make sure the observation has the right dimensionality.
+    if len(observation) != nb.dimensionality:
+        raise ValueError(
+            f"observation in {len(observation)} dimensions,"
+            f" but classifier in {nb.dimensionality}"
+        )
+
+    # Calculate log P(observation|class) for every class.
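+    # (Illustrative numbers, not from the module: for an observation
+    # ("Red", "Sports") with P("Red"|class,0) = 0.5 and P("Sports"|class,1) = 0.2,
+    # the independence assumption gives
+    # log P(observation|class) = log(0.5) + log(0.2) ~= -2.30.)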
+    n = len(nb.classes)
+    lp_observation_class = np.zeros(n)  # array of log P(observation|class)
+    for i in range(n):
+        # log P(observation|class) = SUM_i log P(observation_i|class)
+        probs = [None] * len(observation)
+        for j in range(len(observation)):
+            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
+        lprobs = np.log(np.clip(probs, 1.0e-300, 1.0e300))
+        lp_observation_class[i] = sum(lprobs)
+
+    # Calculate log P(class).
+    lp_prior = np.log(nb.p_prior)
+
+    # Calculate log P(observation).
+    lp_observation = 0.0  # P(observation)
+    if scale:  # Only calculate this if requested.
+        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
+        obs = np.exp(np.clip(lp_prior + lp_observation_class, -700, +700))
+        lp_observation = np.log(sum(obs))
+
+    # Calculate log P(class|observation).
+    lp_class_observation = {}  # Dict of class : log P(class|observation)
+    for i in range(len(nb.classes)):
+        lp_class_observation[nb.classes[i]] = (
+            lp_observation_class[i] + lp_prior[i] - lp_observation
+        )
+
+    return lp_class_observation
+
+
+def classify(nb, observation):
+    """Classify an observation into a class."""
+    # The class is the one with the highest probability.
+    probs = calculate(nb, observation, scale=False)
+    max_prob = max_class = None
+    for klass in nb.classes:
+        if max_prob is None or probs[klass] > max_prob:
+            max_prob, max_class = probs[klass], klass
+    return max_class
+
+
+def train(training_set, results, priors=None, typecode=None):
+    """Train a NaiveBayes classifier on a training set.
+
+    Arguments:
+     - training_set - List of observations.
+     - results - List of the class assignments for each observation.
+       Thus, training_set and results must be the same length.
+     - priors - Optional dictionary specifying the prior probabilities
+       for each type of result. If not specified, the priors will
+       be estimated from the training results.
+     - typecode - Optional type code for the arrays of observations
+       (passed through to ``numpy.asarray``).
+
+    """
+    if not len(training_set):
+        raise ValueError("No data in the training set.")
+    if len(training_set) != len(results):
+        raise ValueError("training_set and results should be parallel lists.")
+
+    # If no typecode is specified, try to pick a reasonable one. If
+    # training_set is a Numeric array, then use that typecode.
+    # Otherwise, choose a reasonable default.
+    # XXX NOT IMPLEMENTED
+
+    # Check to make sure each vector in the training set has the same
+    # dimensionality.
+    dimensions = [len(x) for x in training_set]
+    if min(dimensions) != max(dimensions):
+        raise ValueError("observations have different dimensionality")
+
+    nb = NaiveBayes()
+    nb.dimensionality = dimensions[0]
+
+    # Get a list of all the classes, and
+    # estimate the prior probabilities for the classes.
+    if priors is not None:
+        percs = priors
+        nb.classes = list(set(results))
+    else:
+        class_freq = _contents(results)
+        nb.classes = list(class_freq.keys())
+        percs = class_freq
+    nb.classes.sort()  # keep it tidy
+
+    nb.p_prior = np.zeros(len(nb.classes))
+    for i in range(len(nb.classes)):
+        nb.p_prior[i] = percs[nb.classes[i]]
+
+    # Collect all the observations by class. For each class, make a
+    # matrix of training instances versus dimensions. I might be able
+    # to optimize this with Numeric, if the training_set parameter
+    # were guaranteed to be a matrix. However, this may not be the
+    # case, because the client may be hacking up a sparse matrix or
+    # something.
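+    # (Hypothetical labels for illustration: with nb.classes == ["No", "Yes"],
+    # c2i below becomes {"No": 0, "Yes": 1}, and observations[0] collects every
+    # training vector whose result is "No".)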
+    c2i = {}  # class to index of class
+    for index, key in enumerate(nb.classes):
+        c2i[key] = index
+    observations = [[] for c in nb.classes]  # separate observations by class
+    for i in range(len(results)):
+        klass, obs = results[i], training_set[i]
+        observations[c2i[klass]].append(obs)
+    # Now make the observations Numeric matrix.
+    for i in range(len(observations)):
+        # XXX typecode must be specified!
+        observations[i] = np.asarray(observations[i], typecode)
+
+    # Calculate P(value|class,dim) for every class.
+    # This is a good loop to optimize.
+    nb.p_conditional = []
+    for i in range(len(nb.classes)):
+        class_observations = observations[i]  # observations for this class
+        nb.p_conditional.append([None] * nb.dimensionality)
+        for j in range(nb.dimensionality):
+            # Collect all the values in this dimension.
+            values = class_observations[:, j]
+
+            # Add pseudocounts here. This needs to be parameterized.
+            # values = list(values) + range(len(nb.classes))  # XXX add 1
+
+            # Estimate P(value|class,dim)
+            nb.p_conditional[i][j] = _contents(values)
+    return nb
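For context, a minimal usage sketch of the module added above (the feature values and class labels are made up for illustration, and this assumes a Biopython release that still ships Bio.NaiveBayes):

    from Bio import NaiveBayes

    # Hypothetical training data: two discrete features per observation.
    xcar = [
        ["Red", "Sports"],
        ["Red", "Sports"],
        ["Yellow", "Sports"],
        ["Yellow", "SUV"],
    ]
    ycar = ["Yes", "Yes", "No", "No"]

    model = NaiveBayes.train(xcar, ycar)
    print(NaiveBayes.classify(model, ["Red", "Sports"]))  # predicts "Yes"
    print(NaiveBayes.calculate(model, ["Red", "Sports"], scale=True))  # class -> log P

And one possible replacement along the lines the deprecation warning suggests (assumes scikit-learn >= 0.22 for CategoricalNB; note that its default Laplace smoothing differs from the unsmoothed estimates used in this module):

    from sklearn.naive_bayes import CategoricalNB
    from sklearn.preprocessing import OrdinalEncoder

    enc = OrdinalEncoder()
    X = enc.fit_transform(xcar)         # encode string features as integer codes
    clf = CategoricalNB().fit(X, ycar)  # Laplace smoothing (alpha=1.0) by default
    print(clf.predict(enc.transform([["Red", "Sports"]])))  # predicts "Yes"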