# Copyright 2002 by Jeffrey Chang.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for doing logistic regressions (DEPRECATED).

Classes:
 - LogisticRegression    Holds information for a LogisticRegression classifier.

Functions:
 - train        Train a new classifier.
 - calculate    Calculate the probabilities of each class, given an observation.
 - classify     Classify an observation into a class.

This module has been deprecated; please consider an alternative such as
scikit-learn instead.
"""

import warnings
from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.LogisticRegression' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)

try:
    import numpy as np
    import numpy.linalg
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.LogisticRegression. "
        "See http://www.numpy.org/"
    ) from None


class LogisticRegression:
    """Holds information necessary to do logistic regression classification.

    Attributes:
     - beta - List of the weights for each dimension.

    """

    def __init__(self):
        """Initialize the class."""
        self.beta = []


def train(xs, ys, update_fn=None, typecode=None):
    """Train a logistic regression classifier on a training set.

    Argument xs is a list of observations and ys is a list of the class
    assignments, which should be 0 or 1. xs and ys should contain the
    same number of elements. update_fn is an optional callback function
    that takes as parameters the iteration number and log likelihood.
    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    classes = set(ys)
    if classes != {0, 1}:
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = "d"

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1
    if N == 0 or ndims == 1:
        raise ValueError("No observations or observation of 0 dimension.")

    # Make an X array, with a constant first dimension.
    X = np.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = np.transpose(X)
    y = np.asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = np.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
    i = 0
    old_beta = old_llik = None
    while i < MAX_ITERATIONS:
        # Calculate the probabilities. p = e^(beta X) / (1+e^(beta X))
        ebetaX = np.exp(np.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * np.log(p) + (1 - y) * np.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(i, llik)
        if old_llik is not None:
            # Check to see if the likelihood decreased. If it did, then
            # restore the old beta parameters and halve the step size.
            if llik < old_llik:
                stepsize /= 2.0
                beta = old_beta
            # If I've converged, then stop.
            if np.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        i += 1

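        # The next few lines take one (optionally damped) Newton-Raphson step:
        # solve (Xt W X) delta = Xt (y - p) for delta, where W is a diagonal
        # weight matrix built from the current probabilities p, then move beta
        # along delta, scaled by the current step size if it has been halved.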
        W = np.identity(N) * p
        Xtyp = np.dot(Xt, y - p)  # Calculate the first derivative.
        XtWX = np.dot(np.dot(Xt, W), X)  # Calculate the second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if np.fabs(stepsize - 1.0) > 0.001:
            delta *= stepsize
        beta += delta  # Update beta.
    else:
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = list(beta)
    return lr


def calculate(lr, x):
    """Calculate the probability for each class.

    Arguments:
     - lr is a LogisticRegression object.
     - x is the observed data.

    Returns a list of the probability that it fits each class.
    """
    # Insert a constant term for x.
    x = np.asarray([1.0] + x)
    # Calculate the probability. p = e^(beta X) / (1+e^(beta X))
    ebetaX = np.exp(np.dot(lr.beta, x))
    p = ebetaX / (1 + ebetaX)
    return [1 - p, p]


def classify(lr, x):
    """Classify an observation into a class."""
    probs = calculate(lr, x)
    if probs[0] > probs[1]:
        return 0
    return 1
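

# A minimal usage sketch (not part of the original module's API): it exercises
# train(), calculate() and classify() on a small, made-up data set when the
# file is run directly. The observations and class labels below are purely
# illustrative.
if __name__ == "__main__":
    # Six one-dimensional observations with overlapping class labels.
    example_xs = [[-2.0], [-1.0], [-0.5], [0.5], [1.0], [2.0]]
    example_ys = [0, 0, 1, 0, 1, 1]
    model = train(example_xs, example_ys)
    print("beta:", model.beta)
    print("P(class 0), P(class 1) for x=0.5:", calculate(model, [0.5]))
    print("predicted class for x=1.5:", classify(model, [1.5]))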