# Copyright 2000 by Jeffrey Chang. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""General Naive Bayes learner (DEPRECATED).

Naive Bayes is a supervised classification algorithm that uses Bayes
rule to compute the fit between a new observation and some previously
observed data. The observations are discrete feature vectors, with
the Bayes assumption that the features are independent. Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
 - observation - A feature vector of discrete data.
 - class - A possible classification for an observation.

Classes:
 - NaiveBayes - Holds information for a naive Bayes classifier.

Functions:
 - train - Train a new naive Bayes classifier.
 - calculate - Calculate the probabilities of each class,
   given an observation.
 - classify - Classify an observation into a class.

"""


import warnings
from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.NaiveBayes' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)


try:
    import numpy as np
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.NaiveBayes. "
        "See http://www.numpy.org/"
    ) from None


def _contents(items):
    """Return a dictionary where the key is the item and the value is the associated probability (PRIVATE)."""
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts


class NaiveBayes:
    """Hold information for a NaiveBayes classifier.

    Attributes:
     - classes - List of the possible classes of data.
     - p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
     - p_prior - List of the prior probabilities for every class.
     - dimensionality - Dimensionality of the data.

    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None
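

# Illustrative note, not part of the original module: for a trained classifier
# over two classes and two-dimensional observations, the attributes filled in
# by train() below might look like this hypothetical instance:
#
#     nb.classes = ["A", "B"]
#     nb.p_prior = np.array([0.5, 0.5])                # P("A"), P("B")
#     nb.dimensionality = 2
#     nb.p_conditional[0][1] = {"x": 0.75, "y": 0.25}  # P(value | class "A", dim 1)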


def calculate(nb, observation, scale=False):
    """Calculate the logarithmic conditional probability for each class.

    Arguments:
     - nb - A NaiveBayes classifier that has been trained.
     - observation - A list representing the observed data.
     - scale - Boolean to indicate whether the probability should be
       scaled by ``P(observation)``. By default, no scaling is done.

    A dictionary is returned where the key is the class and the value is
    the log probability of the class.
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError(
            f"observation in {len(observation)} dimensions,"
            f" but classifier in {nb.dimensionality}"
        )

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = np.zeros(n)  # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        lprobs = np.log(np.clip(probs, 1.0e-300, 1.0e300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = np.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0  # log P(observation)
    if scale:  # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        obs = np.exp(np.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = np.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}  # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = (
            lp_observation_class[i] + lp_prior[i] - lp_observation
        )

    return lp_class_observation


def classify(nb, observation):
    """Classify an observation into a class."""
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class
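

# A minimal sketch, not part of the original API: calculate() returns log
# probabilities, which are awkward to compare by eye. This hypothetical helper
# (the name _posteriors is introduced here purely for illustration) renormalizes
# them into ordinary probabilities that sum to one, assuming only the
# calculate() function above and NumPy.
def _posteriors(nb, observation):
    """Return a dict of class -> posterior probability (PRIVATE, illustrative)."""
    lp = calculate(nb, observation, scale=False)
    classes = list(lp)
    logs = np.array([lp[klass] for klass in classes])
    logs -= logs.max()  # shift by the maximum so np.exp cannot overflow
    probs = np.exp(logs)
    probs /= probs.sum()  # the unknown P(observation) cancels in the normalization
    return dict(zip(classes, probs))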


def train(training_set, results, priors=None, typecode=None):
    """Train a NaiveBayes classifier on a training set.

    Arguments:
     - training_set - List of observations.
     - results - List of the class assignments for each observation.
       Thus, training_set and results must be the same length.
     - priors - Optional dictionary specifying the prior probabilities
       for each type of result. If not specified, the priors will
       be estimated from the training results.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one. If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()  # keep it tidy

    nb.p_prior = np.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in class. For each class, make a
    # matrix of training instances versus dimensions. I might be able
    # to optimize this with Numeric, if the training_set parameter
    # were guaranteed to be a matrix. However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}  # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations Numeric matrix.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = np.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here. This needs to be parameterized.
            # values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb
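

# A minimal usage sketch, not taken from Biopython's documentation: the toy
# two-feature data below is invented purely to illustrate the train(),
# calculate() and classify() functions defined above.
if __name__ == "__main__":
    _observations = [
        ["sunny", "hot"],
        ["sunny", "mild"],
        ["rainy", "mild"],
        ["rainy", "cool"],
    ]
    _labels = ["dry", "dry", "wet", "wet"]
    _model = train(_observations, _labels)
    # Log probabilities for each class; "dry" should be near log(1) = 0 here.
    print(calculate(_model, ["sunny", "mild"], scale=True))
    print(classify(_model, ["sunny", "mild"]))  # expected output: dry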