# Copyright 2002 by Jeffrey Chang.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for doing logistic regressions (DEPRECATED).

Classes:
 - LogisticRegression    Holds information for a LogisticRegression classifier.

Functions:
 - train        Train a new classifier.
 - calculate    Calculate the probabilities of each class, given an observation.
 - classify     Classify an observation into a class.

This module has been deprecated; please consider an alternative such as
scikit-learn instead.
"""

import warnings
from Bio import BiopythonDeprecationWarning

warnings.warn(
    "The 'Bio.LogisticRegression' module is deprecated and will be removed in a future "
    "release of Biopython. Consider using scikit-learn instead.",
    BiopythonDeprecationWarning,
)

try:
    import numpy as np
    import numpy.linalg
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install NumPy if you want to use Bio.LogisticRegression. "
        "See http://www.numpy.org/"
    ) from None


class LogisticRegression:
    """Holds information necessary to do logistic regression classification.

    Attributes:
     - beta - List of the weights for each dimension.

    """

    def __init__(self):
        """Initialize the class."""
        self.beta = []


def train(xs, ys, update_fn=None, typecode=None):
    """Train a logistic regression classifier on a training set.

    Argument xs is a list of observations and ys is a list of the class
    assignments, which should be 0 or 1.  xs and ys should contain the
    same number of elements.  update_fn is an optional callback function
    that takes as parameters the iteration number and log likelihood.
    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    classes = set(ys)
    if classes != {0, 1}:
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = "d"

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1
    if N == 0 or ndims == 1:
        raise ValueError("No observations or observation of 0 dimension.")

    # Make an X array, with a constant first dimension.
    X = np.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = np.transpose(X)
    y = np.asarray(ys, typecode)

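    # The loop below fits beta by damped Newton-Raphson ascent on the
    # log-likelihood: at each iteration it computes p = sigmoid(X beta),
    # the gradient Xt (y - p), and the matrix Xt W X with W built from p,
    # then updates beta by stepsize * (Xt W X)^-1 Xt (y - p), halving the
    # step size whenever the log-likelihood decreases.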
    # Initialize the beta parameter to 0.
    beta = np.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
    i = 0
    old_beta = old_llik = None
    while i < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
        ebetaX = np.exp(np.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * np.log(p) + (1 - y) * np.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(i, llik)
        if old_llik is not None:
            # Check to see if the likelihood decreased.  If it did, then
            # restore the old beta parameters and halve the step size.
            if llik < old_llik:
                stepsize /= 2.0
                beta = old_beta
            # If I've converged, then stop.
            if np.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        i += 1

        W = np.identity(N) * p
        Xtyp = np.dot(Xt, y - p)  # Calculate the first derivative.
        XtWX = np.dot(np.dot(Xt, W), X)  # Calculate the second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if np.fabs(stepsize - 1.0) > 0.001:
            delta *= stepsize
        # Update beta out of place, so old_beta keeps the previous values
        # and can actually be restored if the likelihood decreases.
        beta = beta + delta
    else:
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = list(beta)
    return lr


def calculate(lr, x):
    """Calculate the probability for each class.

    Arguments:
     - lr is a LogisticRegression object.
     - x is the observed data.

    Returns a list of the probability that it fits each class.
    """
    # Insert a constant term for x.
    x = np.asarray([1.0] + x)
    # Calculate the probability.  p = e^(beta X) / (1+e^(beta X))
    ebetaX = np.exp(np.dot(lr.beta, x))
    p = ebetaX / (1 + ebetaX)
    return [1 - p, p]


def classify(lr, x):
    """Classify an observation into a class."""
    probs = calculate(lr, x)
    if probs[0] > probs[1]:
        return 0
    return 1
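

# Minimal usage sketch of the train/calculate/classify API.  The toy
# one-dimensional observations and class labels below are invented purely
# for illustration; they are not from any real dataset, and per the
# deprecation notice above, new code should prefer scikit-learn.
if __name__ == "__main__":

    def show_progress(iteration, loglikelihood):
        """Report progress of the Newton-Raphson iterations."""
        print(f"iteration {iteration}: log-likelihood {loglikelihood:.3f}")

    # Toy training data: one feature per observation, labels 0 or 1,
    # with overlapping classes so the maximum-likelihood fit is finite.
    xs = [[0.5], [1.0], [2.0], [2.5], [3.0], [4.0], [4.5], [5.0]]
    ys = [0, 0, 1, 0, 1, 0, 1, 1]

    model = train(xs, ys, update_fn=show_progress)
    print("beta:", model.beta)

    observation = [2.8]
    print("probabilities [P(0), P(1)]:", calculate(model, observation))
    print("predicted class:", classify(model, observation))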