# Copyright 2000 by Jeffrey Chang. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""General Naive Bayes learner (DEPRECATED).

Naive Bayes is a supervised classification algorithm that uses Bayes
rule to compute the fit between a new observation and some previously
observed data. The observations are discrete feature vectors, with
the Bayes assumption that the features are independent. Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
 - observation - A feature vector of discrete data.
 - class - A possible classification for an observation.

Classes:
 - NaiveBayes - Holds information for a naive Bayes classifier.

Functions:
 - train - Train a new naive Bayes classifier.
 - calculate - Calculate the probabilities of each class,
   given an observation.
 - classify - Classify an observation into a class.

"""


import warnings
from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.NaiveBayes' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)


try:
    import numpy as np
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.NaiveBayes. "
        "See http://www.numpy.org/"
    ) from None


def _contents(items):
    """Return a dictionary where the key is the item and the value is the associated probability (PRIVATE)."""
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts


class NaiveBayes:
    """Hold information for a NaiveBayes classifier.

    Attributes:
     - classes - List of the possible classes of data.
     - p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
     - p_prior - List of the prior probabilities for every class.
     - dimensionality - Dimensionality of the data.

    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None


def calculate(nb, observation, scale=False):
    """Calculate the logarithmic conditional probability for each class.

    Arguments:
     - nb - A NaiveBayes classifier that has been trained.
     - observation - A list representing the observed data.
     - scale - Boolean to indicate whether the probability should be
       scaled by ``P(observation)``. By default, no scaling is done.

    A dictionary is returned where the key is the class and the value is
    the log probability of the class.
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError(
            f"observation in {len(observation)} dimension,"
            f" but classifier in {nb.dimensionality}"
        )

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = np.zeros(n)  # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        lprobs = np.log(np.clip(probs, 1.0e-300, 1.0e300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = np.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0  # log P(observation)
    if scale:  # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        obs = np.exp(np.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = np.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}  # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = (
            lp_observation_class[i] + lp_prior[i] - lp_observation
        )

    return lp_class_observation


def classify(nb, observation):
    """Classify an observation into a class."""
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class


def train(training_set, results, priors=None, typecode=None):
    """Train a NaiveBayes classifier on a training set.

    Arguments:
     - training_set - List of observations.
     - results - List of the class assignments for each observation.
       Thus, training_set and results must be the same length.
     - priors - Optional dictionary specifying the prior probabilities
       for each type of result. If not specified, the priors will
       be estimated from the training results.
     - typecode - Optional type code (NumPy dtype) used when converting
       the observations for each class into a NumPy array. If not
       specified, the type is inferred from the data.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one. If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()  # keep it tidy

    nb.p_prior = np.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in each class. For each class, make a
    # matrix of training instances versus dimensions. I might be able
    # to optimize this with Numeric, if the training_set parameter
    # were guaranteed to be a matrix. However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}  # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations into a NumPy matrix.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = np.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here. This needs to be parameterized.
            # values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb
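

# A minimal usage sketch: it trains a classifier on a small, hypothetical
# fruit data set of discrete (color, shape) feature vectors, classifies a
# new observation, and prints the scaled log probabilities. The variable
# names and data below are made up purely for illustration.
if __name__ == "__main__":
    toy_training_set = [
        ["red", "round"],
        ["red", "round"],
        ["yellow", "long"],
        ["red", "long"],
        ["yellow", "round"],
    ]
    toy_results = ["apple", "apple", "banana", "banana", "lemon"]

    model = train(toy_training_set, toy_results)
    print("Classes:", model.classes)
    # classify() returns the most probable class for a new observation.
    print("['red', 'round'] ->", classify(model, ["red", "round"]))
    # calculate() returns log P(class|observation) for every class; with
    # scale=True the values are normalized by log P(observation).
    print(calculate(model, ["red", "round"], scale=True))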