# Copyright 2000 by Jeffrey Chang. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""General Naive Bayes learner (DEPRECATED).

Naive Bayes is a supervised classification algorithm that uses Bayes
rule to compute the fit between a new observation and some previously
observed data. The observations are discrete feature vectors, with
the Bayes assumption that the features are independent. Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
 - observation - A feature vector of discrete data.
 - class - A possible classification for an observation.

Classes:
 - NaiveBayes - Holds information for a naive Bayes classifier.

Functions:
 - train - Train a new naive Bayes classifier.
 - calculate - Calculate the probabilities of each class,
   given an observation.
 - classify - Classify an observation into a class.

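A minimal usage sketch (the feature vectors and class labels below are
illustrative, not from any real data set):

>>> xs = [[0, 1], [0, 0], [1, 1], [1, 0]]
>>> ys = ["a", "a", "b", "b"]
>>> model = train(xs, ys)
>>> classify(model, [0, 1])
'a'
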
"""


import warnings
from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.NaiveBayes' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)
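
# Since this module is deprecated, new code may prefer scikit-learn. A rough
# equivalent is sketched below (a sketch only, assuming integer-coded
# categorical features and that scikit-learn is installed; it is not a
# drop-in replacement for this module):
#
#     from sklearn.naive_bayes import CategoricalNB
#
#     clf = CategoricalNB()
#     clf.fit(xs, ys)
#     clf.predict([[0, 1]])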


try:
    import numpy as np
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.NaiveBayes. "
        "See http://www.numpy.org/"
    ) from None


def _contents(items):
    """Return a dictionary where the key is the item and the value is the associated probability (PRIVATE)."""
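    # Illustrative example (hypothetical input, approximate floats):
    #     _contents(["a", "a", "b"]) -> {"a": 0.666..., "b": 0.333...}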
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts


class NaiveBayes:
    """Hold information for a NaiveBayes classifier.

    Attributes:
     - classes - List of the possible classes of data.
     - p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
     - p_prior - List of the prior probabilities for every class.
     - dimensionality - Dimensionality of the data.

    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None


def calculate(nb, observation, scale=False):
    """Calculate the logarithmic conditional probability for each class.

    Arguments:
     - nb - A NaiveBayes classifier that has been trained.
     - observation - A list representing the observed data.
     - scale - Boolean to indicate whether the probability should be
       scaled by ``P(observation)``. By default, no scaling is done.

    A dictionary is returned where the key is the class and the value is
    the log probability of the class.
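
    For example (illustrative values only), a two-class classifier might
    return ``{"a": -0.7, "b": -4.2}``, meaning class ``"a"`` fits the
    observation better.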
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError(
            f"observation in {len(observation)} dimensions,"
            f" but classifier in {nb.dimensionality}"
        )

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = np.zeros(n)  # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
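        # Clip to avoid log(0) for feature values never seen in training
        # (the probability dict returns 0 for unknown values).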
        lprobs = np.log(np.clip(probs, 1.0e-300, 1.0e300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = np.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0  # P(observation)
    if scale:  # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
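        # Clip the exponents so np.exp() stays within double-precision range
        # (exp overflows for arguments above roughly 709).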
        obs = np.exp(np.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = np.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}  # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = (
            lp_observation_class[i] + lp_prior[i] - lp_observation
        )

    return lp_class_observation


def classify(nb, observation):
    """Classify an observation into a class."""
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class


def train(training_set, results, priors=None, typecode=None):
    """Train a NaiveBayes classifier on a training set.

    Arguments:
     - training_set - List of observations.
     - results - List of the class assignments for each observation.
       Thus, training_set and results must be the same length.
     - priors - Optional dictionary specifying the prior probabilities
       for each type of result. If not specified, the priors will
       be estimated from the training results.
     - typecode - Optional dtype passed to ``np.asarray`` when converting
       the observations for each class into a NumPy array.
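
    For example (illustrative), passing ``priors={"a": 0.7, "b": 0.3}``
    would fix the class priors instead of estimating them from ``results``.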

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one. If
    # training_set is a NumPy array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()  # keep it tidy

    nb.p_prior = np.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in each class. For each class, make a
    # matrix of training instances versus dimensions. I might be able
    # to optimize this with NumPy, if the training_set parameter
    # were guaranteed to be a matrix. However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}  # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations into a NumPy matrix.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = np.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here. This needs to be parameterized.
            # values = list(values) + list(range(len(nb.classes)))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb