# Copyright 2002 by Jeffrey Chang.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for doing logistic regressions (DEPRECATED).

Classes:
 - LogisticRegression    Holds information for a LogisticRegression classifier.

Functions:
 - train        Train a new classifier.
 - calculate    Calculate the probabilities of each class, given an observation.
 - classify     Classify an observation into a class.

This module has been deprecated; please consider an alternative such as
scikit-learn instead.
"""

import warnings

from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.LogisticRegression' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)

try:
    import numpy as np
    import numpy.linalg
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.LogisticRegression. "
        "See http://www.numpy.org/"
    ) from None


class LogisticRegression:
    """Holds information necessary to do logistic regression classification.

    Attributes:
     - beta - List of the weights for each dimension.

    """

    def __init__(self):
        """Initialize the class."""
        self.beta = []


def train(xs, ys, update_fn=None, typecode=None):
    """Train a logistic regression classifier on a training set.

    Argument xs is a list of observations and ys is a list of the class
    assignments, which should be 0 or 1. xs and ys should contain the
    same number of elements. update_fn is an optional callback function
    that takes as parameters the iteration number and log likelihood.
    typecode is an optional NumPy typecode for the arrays (defaults to "d").
    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    classes = set(ys)
    if classes != {0, 1}:
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = "d"

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1
    if N == 0 or ndims == 1:
        raise ValueError("No observations or observation of 0 dimension.")

    # Make an X array, with a constant first dimension.
    X = np.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = np.transpose(X)
    y = np.asarray(ys, typecode)
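    # A sketch of the update the Newton-Raphson loop below implements:
    #
    #   p      = exp(X beta) / (1 + exp(X beta))
    #   grad   = Xt (y - p)               first derivative of the log-likelihood
    #   H      = Xt W X                   W diagonal (see the note on W below)
    #   beta  <- beta + stepsize * H^-1 grad
    #
    # The step size is halved whenever the log-likelihood decreases.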
    # Initialize the beta parameter to 0.
    beta = np.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
    i = 0
    old_beta = old_llik = None
    while i < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
        ebetaX = np.exp(np.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * np.log(p) + (1 - y) * np.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            # Pass the iteration number (not the builtin iter, as the
            # original code did) along with the log likelihood.
            update_fn(i, llik)
        if old_llik is not None:
            # Check to see if the likelihood decreased.  If it did, then
            # restore the old beta parameters and halve the step size.
            if llik < old_llik:
                stepsize /= 2.0
                beta = old_beta
            # If I've converged, then stop.
            if np.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        i += 1

        # W is a diagonal weight matrix.  Note that standard IRLS would
        # use diag(p * (1 - p)) here; this legacy code uses diag(p).
        W = np.identity(N) * p
        Xtyp = np.dot(Xt, y - p)  # Calculate the first derivative.
        XtWX = np.dot(np.dot(Xt, W), X)  # Calculate the second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if np.fabs(stepsize - 1.0) > 0.001:
            delta *= stepsize
        # Update beta with a new array rather than in place, so that
        # old_beta still holds the previous value if we need to restore it.
        beta = beta + delta
    else:
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = list(beta)
    return lr


def calculate(lr, x):
    """Calculate the probability for each class.

    Arguments:
     - lr is a LogisticRegression object.
     - x is the observed data.

    Returns a list of the probability that it fits each class.
    """
    # Insert a constant term for x.
    x = np.asarray([1.0] + x)
    # Calculate the probability.  p = e^(beta X) / (1+e^(beta X))
    ebetaX = np.exp(np.dot(lr.beta, x))
    p = ebetaX / (1 + ebetaX)
    return [1 - p, p]


def classify(lr, x):
    """Classify an observation into a class."""
    probs = calculate(lr, x)
    if probs[0] > probs[1]:
        return 0
    return 1
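

if __name__ == "__main__":
    # Minimal usage sketch of the train/calculate/classify API.  The toy
    # one-dimensional observations below are hypothetical (not part of the
    # original module); the overlapping labels keep the maximum-likelihood
    # estimate finite so Newton-Raphson converges.
    xs = [[-2.0], [-1.5], [-1.0], [-0.5], [0.5], [1.0], [1.5], [2.0]]
    ys = [0, 0, 0, 1, 0, 1, 1, 1]

    def show_progress(iteration, loglik):
        # Optional callback invoked once per Newton-Raphson iteration.
        print(f"iteration {iteration}: log-likelihood {loglik:.4f}")

    model = train(xs, ys, update_fn=show_progress)
    print("beta:", model.beta)
    print("probabilities for x=[1.0]:", calculate(model, [1.0]))
    print("class for x=[1.0]:", classify(model, [1.0]))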