# Copyright 2002 by Jeffrey Chang.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for doing logistic regressions (DEPRECATED).

Classes:
 - LogisticRegression    Holds information for a LogisticRegression classifier.

Functions:
 - train        Train a new classifier.
 - calculate    Calculate the probabilities of each class, given an observation.
 - classify     Classify an observation into a class.

This module has been deprecated; please consider an alternative such as
scikit-learn instead.
"""

import warnings

from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.LogisticRegression' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)

try:
    import numpy as np
    import numpy.linalg
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.LogisticRegression. "
        "See http://www.numpy.org/"
    ) from None


class LogisticRegression:
    """Holds information necessary to do logistic regression classification.

    Attributes:
     - beta - List of the weights for each dimension.

    """

    def __init__(self):
        """Initialize the class."""
        self.beta = []


def train(xs, ys, update_fn=None, typecode=None):
    """Train a logistic regression classifier on a training set.

    Argument xs is a list of observations and ys is a list of the class
    assignments, which should be 0 or 1. xs and ys should contain the
    same number of elements. update_fn is an optional callback function
    that takes as parameters the iteration number and log likelihood.
    typecode is an optional NumPy typecode for the arrays (defaults to "d").
    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    classes = set(ys)
    if classes != {0, 1}:
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = "d"

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1
    if N == 0 or ndims == 1:
        raise ValueError("No observations or observation of 0 dimension.")

    # Make an X array, with a constant first dimension.
    X = np.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = np.transpose(X)
    y = np.asarray(ys, typecode)
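    # A sketch of the update the Newton-Raphson loop below implements:
    #
    #   p      = exp(X beta) / (1 + exp(X beta))
    #   grad   = Xt (y - p)               first derivative of the log-likelihood
    #   H      = Xt W X                   W diagonal (see the note on W below)
    #   beta  <- beta + stepsize * H^-1 grad
    #
    # The step size is halved whenever the log-likelihood decreases.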
    # Initialize the beta parameter to 0.
    beta = np.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
    i = 0
    old_beta = old_llik = None
    while i < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
        ebetaX = np.exp(np.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * np.log(p) + (1 - y) * np.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            # Pass the iteration number (not the builtin iter, as the
            # original code did) along with the log likelihood.
            update_fn(i, llik)
        if old_llik is not None:
            # Check to see if the likelihood decreased.  If it did, then
            # restore the old beta parameters and halve the step size.
            if llik < old_llik:
                stepsize /= 2.0
                beta = old_beta
            # If I've converged, then stop.
            if np.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        i += 1

        # W is a diagonal weight matrix.  Note that standard IRLS would
        # use diag(p * (1 - p)) here; this legacy code uses diag(p).
        W = np.identity(N) * p
        Xtyp = np.dot(Xt, y - p)  # Calculate the first derivative.
        XtWX = np.dot(np.dot(Xt, W), X)  # Calculate the second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if np.fabs(stepsize - 1.0) > 0.001:
            delta *= stepsize
        # Update beta with a new array rather than in place, so that
        # old_beta still holds the previous value if we need to restore it.
        beta = beta + delta
    else:
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = list(beta)
    return lr


def calculate(lr, x):
    """Calculate the probability for each class.

    Arguments:
     - lr is a LogisticRegression object.
     - x is the observed data.

    Returns a list of the probability that it fits each class.
    """
    # Insert a constant term for x.
    x = np.asarray([1.0] + x)
    # Calculate the probability.  p = e^(beta X) / (1+e^(beta X))
    ebetaX = np.exp(np.dot(lr.beta, x))
    p = ebetaX / (1 + ebetaX)
    return [1 - p, p]


def classify(lr, x):
    """Classify an observation into a class."""
    probs = calculate(lr, x)
    if probs[0] > probs[1]:
        return 0
    return 1
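

if __name__ == "__main__":
    # Minimal usage sketch of the train/calculate/classify API.  The toy
    # one-dimensional observations below are hypothetical (not part of the
    # original module); the overlapping labels keep the maximum-likelihood
    # estimate finite so Newton-Raphson converges.
    xs = [[-2.0], [-1.5], [-1.0], [-0.5], [0.5], [1.0], [1.5], [2.0]]
    ys = [0, 0, 0, 1, 0, 1, 1, 1]

    def show_progress(iteration, loglik):
        # Optional callback invoked once per Newton-Raphson iteration.
        print(f"iteration {iteration}: log-likelihood {loglik:.4f}")

    model = train(xs, ys, update_fn=show_progress)
    print("beta:", model.beta)
    print("probabilities for x=[1.0]:", calculate(model, [1.0]))
    print("class for x=[1.0]:", classify(model, [1.0]))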