CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/NaiveBayes.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author: jpayne
date: Tue, 18 Mar 2025 16:23:26 -0400
# Copyright 2000 by Jeffrey Chang. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""General Naive Bayes learner (DEPRECATED).

Naive Bayes is a supervised classification algorithm that uses Bayes'
rule to compute the fit between a new observation and some previously
observed data. The observations are discrete feature vectors, with
the Bayes assumption that the features are independent. Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
 - observation - A feature vector of discrete data.
 - class - A possible classification for an observation.

Classes:
 - NaiveBayes - Holds information for a naive Bayes classifier.

Functions:
 - train - Train a new naive Bayes classifier.
 - calculate - Calculate the probabilities of each class, given an
   observation.
 - classify - Classify an observation into a class.

"""

import warnings
from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.NaiveBayes' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)


try:
    import numpy as np
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.NaiveBayes. "
        "See http://www.numpy.org/"
    ) from None


def _contents(items):
    """Return a dictionary where the key is the item and the value is its probability (PRIVATE)."""
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts
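
# For illustration (hypothetical values): each distinct item maps to its
# relative frequency, and the values sum to 1.0.
#     _contents(["a", "a", "b", "b"])  # -> {"a": 0.5, "b": 0.5}
#     _contents(["x", "y", "y", "y"])  # -> {"x": 0.25, "y": 0.75}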


class NaiveBayes:
    """Hold information for a NaiveBayes classifier.

    Attributes:
     - classes - List of the possible classes of data.
     - p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
     - p_prior - List of the prior probabilities for every class.
     - dimensionality - Dimensionality of the data.

    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None
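
# Orientation sketch (hypothetical values): for a trained classifier with
# classes ["fast", "slow"] and 2-dimensional data, nb.p_conditional[0][1]
# is the dict mapping each value seen in dimension 1 of the "fast"
# observations to P(value|"fast"), e.g. {4: 0.5, 2: 0.5}.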


def calculate(nb, observation, scale=False):
    """Calculate the logarithmic conditional probability for each class.

    Arguments:
     - nb - A NaiveBayes classifier that has been trained.
     - observation - A list representing the observed data.
     - scale - Boolean to indicate whether the probability should be
       scaled by ``P(observation)``. By default, no scaling is done.

    A dictionary is returned where the key is the class and the value is
    the log probability of the class.
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError(
            f"observation has {len(observation)} dimensions,"
            f" but the classifier expects {nb.dimensionality}"
        )

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = np.zeros(n)  # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_j log P(observation_j|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        # Clip to avoid log(0) for values never seen in training.
        lprobs = np.log(np.clip(probs, 1.0e-300, 1.0e300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = np.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0  # P(observation)
    if scale:  # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        obs = np.exp(np.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = np.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}  # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = (
            lp_observation_class[i] + lp_prior[i] - lp_observation
        )

    return lp_class_observation
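
# Worked example of the log-space arithmetic above (hypothetical numbers):
# with P(observation|A) = 0.02, P(A) = 0.5, and scale=False, the value
# returned for class A is log(0.02) + log(0.5) = log(0.01), about -4.605.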


def classify(nb, observation):
    """Classify an observation into a class."""
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class
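
# Note: classify() can leave scale=False because dividing by P(observation)
# shifts every class's log probability by the same constant and cannot
# change which class attains the maximum.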


def train(training_set, results, priors=None, typecode=None):
    """Train a NaiveBayes classifier on a training set.

    Arguments:
     - training_set - List of observations.
     - results - List of the class assignments for each observation.
       Thus, training_set and results must be the same length.
     - priors - Optional dictionary specifying the prior probabilities
       for each type of result. If not specified, the priors will
       be estimated from the training results.
     - typecode - Optional type (dtype) passed to NumPy when building the
       per-class arrays of observations.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one. If
    # training_set is a NumPy array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()  # keep it tidy

    nb.p_prior = np.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in class. For each class, make a
    # matrix of training instances versus dimensions. This could be
    # optimized with NumPy if the training_set parameter were guaranteed
    # to be a matrix. However, this may not be the case, because the
    # client may be passing in a sparse matrix or something similar.
    c2i = {}  # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations into NumPy matrices.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = np.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here. This needs to be parameterized.
            # values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb
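
# Minimal end-to-end sketch (hypothetical toy data): train on 2-dimensional
# discrete observations, then classify a new one.
#
#     xcar = [[5, 4], [5, 4], [5, 2], [4, 4]]  # feature vectors
#     ycar = ["fast", "fast", "slow", "slow"]  # class labels
#     model = train(xcar, ycar)
#     classify(model, [5, 4])                  # -> "fast"
#     calculate(model, [5, 4], scale=True)     # dict of log P(class|obs)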