# Copyright 2000 by Jeffrey Chang. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""General Naive Bayes learner (DEPRECATED).

Naive Bayes is a supervised classification algorithm that uses Bayes
rule to compute the fit between a new observation and some previously
observed data. The observations are discrete feature vectors, with
the Bayes assumption that the features are independent. Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
 - observation - A feature vector of discrete data.
 - class - A possible classification for an observation.

Classes:
 - NaiveBayes - Holds information for a naive Bayes classifier.

Functions:
 - train - Train a new naive Bayes classifier.
 - calculate - Calculate the probabilities of each class,
   given an observation.
 - classify - Classify an observation into a class.

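A minimal usage sketch (the feature vectors and class labels below are
illustrative, not from any real data set):

>>> xs = [[0, 1], [0, 0], [1, 1], [1, 0]]
>>> ys = ["a", "a", "b", "b"]
>>> model = train(xs, ys)
>>> classify(model, [0, 1])
'a'
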
"""


import warnings
from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.NaiveBayes' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)
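
# Since this module is deprecated, new code may prefer scikit-learn. A rough
# equivalent is sketched below (a sketch only, assuming integer-coded
# categorical features and that scikit-learn is installed; it is not a
# drop-in replacement for this module):
#
#     from sklearn.naive_bayes import CategoricalNB
#
#     clf = CategoricalNB()
#     clf.fit(xs, ys)
#     clf.predict([[0, 1]])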


try:
    import numpy as np
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.NaiveBayes. "
        "See http://www.numpy.org/"
    ) from None


def _contents(items):
    """Return a dictionary where the key is the item and the value is the associated probability (PRIVATE)."""
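    # Illustrative example (hypothetical input, approximate floats):
    #     _contents(["a", "a", "b"]) -> {"a": 0.666..., "b": 0.333...}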
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts


class NaiveBayes:
    """Hold information for a NaiveBayes classifier.

    Attributes:
     - classes - List of the possible classes of data.
     - p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
     - p_prior - List of the prior probabilities for every class.
     - dimensionality - Dimensionality of the data.

    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None


def calculate(nb, observation, scale=False):
    """Calculate the logarithmic conditional probability for each class.

    Arguments:
     - nb - A NaiveBayes classifier that has been trained.
     - observation - A list representing the observed data.
     - scale - Boolean to indicate whether the probability should be
       scaled by ``P(observation)``. By default, no scaling is done.

    A dictionary is returned where the key is the class and the value is
    the log probability of the class.
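
    For example (illustrative values only), a two-class classifier might
    return ``{"a": -0.7, "b": -4.2}``, meaning class ``"a"`` fits the
    observation better.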
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError(
            f"observation in {len(observation)} dimensions,"
            f" but classifier in {nb.dimensionality}"
        )

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = np.zeros(n)  # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
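        # Clip to avoid log(0) for feature values never seen in training
        # (the probability dict returns 0 for unknown values).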
        lprobs = np.log(np.clip(probs, 1.0e-300, 1.0e300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = np.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0  # P(observation)
    if scale:  # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
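        # Clip the exponents so np.exp() stays within double-precision range
        # (exp overflows for arguments above roughly 709).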
        obs = np.exp(np.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = np.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}  # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = (
            lp_observation_class[i] + lp_prior[i] - lp_observation
        )

    return lp_class_observation


def classify(nb, observation):
    """Classify an observation into a class."""
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class


def train(training_set, results, priors=None, typecode=None):
    """Train a NaiveBayes classifier on a training set.

    Arguments:
     - training_set - List of observations.
     - results - List of the class assignments for each observation.
       Thus, training_set and results must be the same length.
     - priors - Optional dictionary specifying the prior probabilities
       for each type of result. If not specified, the priors will
       be estimated from the training results.
     - typecode - Optional dtype passed to ``np.asarray`` when converting
       the observations for each class into a NumPy array.
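
    For example (illustrative), passing ``priors={"a": 0.7, "b": 0.3}``
    would fix the class priors instead of estimating them from ``results``.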

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one. If
    # training_set is a NumPy array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()  # keep it tidy

    nb.p_prior = np.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in each class. For each class, make a
    # matrix of training instances versus dimensions. I might be able
    # to optimize this with NumPy, if the training_set parameter
    # were guaranteed to be a matrix. However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}  # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations into a NumPy matrix.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = np.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here. This needs to be parameterized.
            # values = list(values) + list(range(len(nb.classes)))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb