CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/NaiveBayes.py @ changeset 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author: jpayne
date: Tue, 18 Mar 2025 16:23:26 -0400
# Copyright 2000 by Jeffrey Chang. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""General Naive Bayes learner (DEPRECATED).

Naive Bayes is a supervised classification algorithm that uses Bayes
rule to compute the fit between a new observation and some previously
observed data. The observations are discrete feature vectors, with
the Bayes assumption that the features are independent. Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
 - observation - A feature vector of discrete data.
 - class - A possible classification for an observation.

Classes:
 - NaiveBayes - Holds information for a naive Bayes classifier.

Functions:
 - train - Train a new naive Bayes classifier.
 - calculate - Calculate the probabilities of each class,
   given an observation.
 - classify - Classify an observation into a class.

"""

import warnings
from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.NaiveBayes' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)
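# The deprecation notice above points at scikit-learn. As a rough, hedged
# sketch (not part of the original module), the closest scikit-learn analogue
# for discrete feature vectors is sklearn.naive_bayes.CategoricalNB; feature
# values must first be encoded as non-negative integers, e.g. with
# OrdinalEncoder. Illustrative only, hence commented out:
#
#     from sklearn.naive_bayes import CategoricalNB
#     from sklearn.preprocessing import OrdinalEncoder
#
#     enc = OrdinalEncoder()
#     X = enc.fit_transform(training_set)  # discrete features -> integer codes
#     model = CategoricalNB().fit(X, results)
#     predicted = model.predict(enc.transform([observation]))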

try:
    import numpy as np
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.NaiveBayes. "
        "See http://www.numpy.org/"
    ) from None

def _contents(items):
    """Return a dictionary where the key is the item and the value is the associated probability (PRIVATE)."""
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts
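
# For example (an illustrative comment, not in the original source):
# _contents(["a", "a", "b"]) returns {"a": 0.6666..., "b": 0.3333...},
# i.e. each key maps to count(item) / len(items), so the values sum to 1.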

class NaiveBayes:
    """Hold information for a NaiveBayes classifier.

    Attributes:
     - classes - List of the possible classes of data.
     - p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
     - p_prior - List of the prior probabilities for every class.
     - dimensionality - Dimensionality of the data.

    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None

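# To make the attribute layout concrete (an illustrative comment, with
# made-up numbers): after training on two classes over two dimensions,
# one might see
#
#     nb.classes == ["apple", "orange"]
#     nb.p_prior == array([0.5, 0.5])
#     nb.p_conditional[0][1] == {"red": 0.75, "green": 0.25}
#
# i.e. P(value "red" | class "apple", dimension 1) is 0.75.
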
def calculate(nb, observation, scale=False):
    """Calculate the logarithmic conditional probability for each class.

    Arguments:
     - nb - A NaiveBayes classifier that has been trained.
     - observation - A list representing the observed data.
     - scale - Boolean to indicate whether the probability should be
       scaled by ``P(observation)``. By default, no scaling is done.

    A dictionary is returned where the key is the class and the value is
    the log probability of the class.
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError(
            f"observation in {len(observation)} dimensions,"
            f" but classifier in {nb.dimensionality}"
        )

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = np.zeros(n)  # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        lprobs = np.log(np.clip(probs, 1.0e-300, 1.0e300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = np.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0  # log P(observation); stays 0 (= log 1) unless scaling
    if scale:  # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        obs = np.exp(np.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = np.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}  # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = (
            lp_observation_class[i] + lp_prior[i] - lp_observation
        )

    return lp_class_observation
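
# Usage note (illustrative, not in the original source): with scale=True the
# returned values are log posteriors, so exponentiating them gives
# probabilities that sum to 1 across classes. A hedged sketch, assuming a
# trained classifier `nb` and a matching observation `obs`:
#
#     log_posteriors = calculate(nb, obs, scale=True)
#     posteriors = {k: np.exp(v) for k, v in log_posteriors.items()}
#     # sum(posteriors.values()) is ~1.0, up to the clipping done above.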

def classify(nb, observation):
    """Classify an observation into a class."""
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class
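
# Equivalently (an illustrative comment): classify() is an argmax over the
# unscaled log posteriors, matching
#
#     probs = calculate(nb, observation)
#     max(probs, key=probs.get)
#
# scale=False suffices here because dividing by P(observation) shifts every
# class's log probability by the same constant and cannot change the argmax.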

def train(training_set, results, priors=None, typecode=None):
    """Train a NaiveBayes classifier on a training set.

    Arguments:
     - training_set - List of observations.
     - results - List of the class assignments for each observation.
       Thus, training_set and results must be the same length.
     - priors - Optional dictionary specifying the prior probabilities
       for each type of result. If not specified, the priors will
       be estimated from the training results.
     - typecode - Optional NumPy dtype used when converting the
       per-class observation lists to arrays.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one. If
    # training_set is a NumPy array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()  # keep it tidy

    nb.p_prior = np.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in class. For each class, make a
    # matrix of training instances versus dimensions. I might be able
    # to optimize this with NumPy, if the training_set parameter
    # were guaranteed to be a matrix. However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}  # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations into NumPy matrices.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = np.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here. This needs to be parameterized.
            # values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb
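

# --- Illustrative usage (not part of the original module) -------------------
# A minimal, hedged sketch of the train/classify workflow on made-up data:
# the feature values and class labels below are invented for demonstration.
if __name__ == "__main__":
    # Each observation is a discrete feature vector: (color, size).
    xcar = [
        ["red", "big"],
        ["red", "big"],
        ["green", "small"],
        ["green", "big"],
    ]
    ycar = ["sports", "sports", "economy", "economy"]

    carmodel = train(xcar, ycar)
    print(classify(carmodel, ["red", "big"]))  # expected: "sports"
    print(calculate(carmodel, ["green", "small"], scale=True))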