# Copyright 2000 by Jeffrey Chang. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""General Naive Bayes learner (DEPRECATED).

Naive Bayes is a supervised classification algorithm that uses Bayes
rule to compute the fit between a new observation and some previously
observed data. The observations are discrete feature vectors, with
the Bayes assumption that the features are independent. Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
 - observation - A feature vector of discrete data.
 - class - A possible classification for an observation.

Classes:
 - NaiveBayes - Holds information for a naive Bayes classifier.

Functions:
 - train - Train a new naive Bayes classifier.
 - calculate - Calculate the probabilities of each class,
   given an observation.
 - classify - Classify an observation into a class.

"""


import warnings
from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.NaiveBayes' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)


try:
    import numpy as np
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.NaiveBayes. "
        "See http://www.numpy.org/"
    ) from None


def _contents(items):
    """Return a dictionary where the key is the item and the value is the associated probability (PRIVATE)."""
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts


class NaiveBayes:
    """Hold information for a NaiveBayes classifier.

    Attributes:
     - classes - List of the possible classes of data.
     - p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
     - p_prior - List of the prior probabilities for every class.
     - dimensionality - Dimensionality of the data.

    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None


def calculate(nb, observation, scale=False):
    """Calculate the logarithmic conditional probability for each class.

    Arguments:
     - nb - A NaiveBayes classifier that has been trained.
     - observation - A list representing the observed data.
     - scale - Boolean to indicate whether the probability should be
       scaled by ``P(observation)``. By default, no scaling is done.

    A dictionary is returned where the key is the class and the value is
    the log probability of the class.
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError(
            f"observation in {len(observation)} dimension,"
            f" but classifier in {nb.dimensionality}"
        )

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = np.zeros(n)  # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        lprobs = np.log(np.clip(probs, 1.0e-300, 1.0e300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = np.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0  # log P(observation)
    if scale:  # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        obs = np.exp(np.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = np.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}  # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = (
            lp_observation_class[i] + lp_prior[i] - lp_observation
        )

    return lp_class_observation


def classify(nb, observation):
    """Classify an observation into a class."""
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class


def train(training_set, results, priors=None, typecode=None):
    """Train a NaiveBayes classifier on a training set.

    Arguments:
     - training_set - List of observations.
     - results - List of the class assignments for each observation.
       Thus, training_set and results must be the same length.
     - priors - Optional dictionary specifying the prior probabilities
       for each type of result. If not specified, the priors will
       be estimated from the training results.
     - typecode - Optional type code (NumPy dtype) used when converting
       the observations for each class into a NumPy array. If not
       specified, the type is inferred from the data.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one. If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()  # keep it tidy

    nb.p_prior = np.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in each class. For each class, make a
    # matrix of training instances versus dimensions. I might be able
    # to optimize this with Numeric, if the training_set parameter
    # were guaranteed to be a matrix. However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}  # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations into a NumPy matrix.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = np.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here. This needs to be parameterized.
            # values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb
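

# A minimal usage sketch: it trains a classifier on a small, hypothetical
# fruit data set of discrete (color, shape) feature vectors, classifies a
# new observation, and prints the scaled log probabilities. The variable
# names and data below are made up purely for illustration.
if __name__ == "__main__":
    toy_training_set = [
        ["red", "round"],
        ["red", "round"],
        ["yellow", "long"],
        ["red", "long"],
        ["yellow", "round"],
    ]
    toy_results = ["apple", "apple", "banana", "banana", "lemon"]

    model = train(toy_training_set, toy_results)
    print("Classes:", model.classes)
    # classify() returns the most probable class for a new observation.
    print("['red', 'round'] ->", classify(model, ["red", "round"]))
    # calculate() returns log P(class|observation) for every class; with
    # scale=True the values are normalized by log P(observation).
    print(calculate(model, ["red", "round"], scale=True))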