jpayne@68: # Copyright 2000 Andrew Dalke.
jpayne@68: # Copyright 2000-2002 Brad Chapman.
jpayne@68: # Copyright 2004-2005, 2010 by M de Hoon.
jpayne@68: # Copyright 2007-2023 by Peter Cock.
jpayne@68: # All rights reserved.
jpayne@68: #
jpayne@68: # This file is part of the Biopython distribution and governed by your
jpayne@68: # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@68: # Please see the LICENSE file that should have been included as part of this
jpayne@68: # package.
jpayne@68: """Provide objects to represent biological sequences.
jpayne@68: 
jpayne@68: See also the Seq_ wiki and the chapter in our tutorial:
jpayne@68:  - `HTML Tutorial`_
jpayne@68:  - `PDF Tutorial`_
jpayne@68: 
jpayne@68: .. _Seq: http://biopython.org/wiki/Seq
jpayne@68: .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
jpayne@68: .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
jpayne@68: 
jpayne@68: """
jpayne@68: import array
jpayne@68: import collections
jpayne@68: import numbers
jpayne@68: import warnings
jpayne@68: 
jpayne@68: from abc import ABC
jpayne@68: from abc import abstractmethod
jpayne@68: from typing import overload, Optional, Union, Dict
jpayne@68: 
jpayne@68: from Bio import BiopythonWarning
jpayne@68: from Bio.Data import CodonTable
jpayne@68: from Bio.Data import IUPACData
jpayne@68: 
jpayne@68: 
jpayne@68: def _maketrans(complement_mapping):
jpayne@68:     """Make a python string translation table (PRIVATE).
jpayne@68: 
jpayne@68:     Arguments:
jpayne@68:      - complement_mapping - a dictionary such as ambiguous_dna_complement
jpayne@68:        and ambiguous_rna_complement from Data.IUPACData.
jpayne@68: 
jpayne@68:     Returns a translation table (a bytes object of length 256) for use with
jpayne@68:     the python string's translate method to use in a (reverse) complement.
jpayne@68: 
jpayne@68:     Compatible with lower case and upper case sequences.
jpayne@68: 
jpayne@68:     For internal use only.
jpayne@68:     """
jpayne@68:     keys = "".join(complement_mapping.keys()).encode("ASCII")
jpayne@68:     values = "".join(complement_mapping.values()).encode("ASCII")
jpayne@68:     return bytes.maketrans(keys + keys.lower(), values + values.lower())
jpayne@68: 
jpayne@68: 
# Translation tables used for (reverse) complementing.
# The DNA table additionally maps U like T, and the RNA table additionally
# maps T like U, so either letter is complemented in either context.
# The extended mappings are built inline, so no temporary module-level names
# are left behind.
_dna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_dna_complement,
        "U": IUPACData.ambiguous_dna_complement["T"],
    }
)
_rna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_rna_complement,
        "T": IUPACData.ambiguous_rna_complement["U"],
    }
)
jpayne@68: 
jpayne@68: 
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

    * ``__len__`` must return the sequence length;
    * ``__getitem__`` must return

      * a ``bytes`` object for the requested region; or
      * a new instance of the subclass for the requested region; or
      * raise an ``UndefinedSequenceError``.

      Calling ``__getitem__`` for a sequence region of size zero should always
      return an empty ``bytes`` object.
      Calling ``__getitem__`` for the full sequence (as in data[:]) should
      either return a ``bytes`` object with the full sequence, or raise an
      ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    All other methods defined here materialize the full sequence with
    ``bytes(self)`` and delegate to the corresponding ``bytes`` method.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        # A zero-length slice must compare equal to an empty bytes object.
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        """Return the sequence length (to be implemented by subclasses)."""
        pass

    @abstractmethod
    def __getitem__(self, key):
        """Return the data for the requested region (to be implemented by subclasses)."""
        pass

    def __bytes__(self):
        """Return the result of slicing the full sequence, ``self[:]``."""
        return self[:]

    def __hash__(self):
        """Return the hash of the full sequence as a ``bytes`` object."""
        return hash(bytes(self))

    def __eq__(self, other):
        """Compare the full sequence, as ``bytes``, for equality with other."""
        return bytes(self) == other

    def __lt__(self, other):
        """Implement the less-than operator on the full sequence as ``bytes``."""
        return bytes(self) < other

    def __le__(self, other):
        """Implement the less-than-or-equal operator on the full sequence as ``bytes``."""
        return bytes(self) <= other

    def __gt__(self, other):
        """Implement the greater-than operator on the full sequence as ``bytes``."""
        return bytes(self) > other

    def __ge__(self, other):
        """Implement the greater-than-or-equal operator on the full sequence as ``bytes``."""
        return bytes(self) >= other

    def __add__(self, other):
        """Return the concatenation of both operands as a ``bytes`` object.

        Returns ``NotImplemented`` if converting either operand to ``bytes``
        raises an ``UndefinedSequenceError``.
        """
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            return NotImplemented
            # will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__

    def __radd__(self, other):
        """Return ``other`` followed by the full sequence as ``bytes``."""
        return other + bytes(self)

    def __mul__(self, other):
        """Return the full sequence as ``bytes``, repeated ``other`` times."""
        return other * bytes(self)

    def __contains__(self, item):
        """Return True if item is contained in the full sequence as ``bytes``."""
        return bytes(self).__contains__(item)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present.

        Returns a ``bytes`` object; the data is returned unchanged if it does
        not start with prefix.
        """
        data = bytes(self)
        try:
            return data.removeprefix(prefix)
        except AttributeError:
            # bytes.removeprefix requires Python 3.9+; emulate it otherwise.
            if data.startswith(prefix):
                return data[len(prefix) :]
            return data

    def removesuffix(self, suffix):
        """Remove the suffix if present.

        Returns a ``bytes`` object; the data is returned unchanged if it does
        not end with suffix, or if suffix is empty.
        """
        data = bytes(self)
        try:
            return data.removesuffix(suffix)
        except AttributeError:
            # bytes.removesuffix requires Python 3.9+; emulate it otherwise.
            # The suffix must be non-empty, as data[:-0] would wrongly
            # evaluate to an empty bytes object.
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.
        This base implementation always returns True.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        This base implementation reports the whole sequence as one range, or
        an empty tuple for a zero-length sequence.
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        else:
            return ()
jpayne@68: 
jpayne@68: 
jpayne@68: class _SeqAbstractBaseClass(ABC):
jpayne@68:     """Abstract base class for the Seq and MutableSeq classes (PRIVATE).
jpayne@68: 
jpayne@68:     Most users will not need to use this class. It is used internally as an
jpayne@68:     abstract base class for Seq and MutableSeq, as most of their methods are
jpayne@68:     identical.
jpayne@68:     """
jpayne@68: 
jpayne@68:     __slots__ = ("_data",)
jpayne@68:     __array_ufunc__ = None  # turn off numpy Ufuncs
jpayne@68: 
jpayne@68:     @abstractmethod
jpayne@68:     def __init__(self):
jpayne@68:         pass
jpayne@68: 
jpayne@68:     def __bytes__(self):
jpayne@68:         return bytes(self._data)
jpayne@68: 
jpayne@68:     def __repr__(self):
jpayne@68:         """Return (truncated) representation of the sequence."""
jpayne@68:         data = self._data
jpayne@68:         if isinstance(data, _UndefinedSequenceData):
jpayne@68:             return f"Seq(None, length={len(self)})"
jpayne@68:         if isinstance(data, _PartiallyDefinedSequenceData):
jpayne@68:             d = {}
jpayne@68:             for position, seq in data._data.items():
jpayne@68:                 if len(seq) > 60:
jpayne@68:                     start = seq[:54].decode("ASCII")
jpayne@68:                     end = seq[-3:].decode("ASCII")
jpayne@68:                     seq = f"{start}...{end}"
jpayne@68:                 else:
jpayne@68:                     seq = seq.decode("ASCII")
jpayne@68:                 d[position] = seq
jpayne@68:             return "Seq(%r, length=%d)" % (d, len(self))
jpayne@68:         if len(data) > 60:
jpayne@68:             # Shows the last three letters as it is often useful to see if
jpayne@68:             # there is a stop codon at the end of a sequence.
jpayne@68:             # Note total length is 54+3+3=60
jpayne@68:             start = data[:54].decode("ASCII")
jpayne@68:             end = data[-3:].decode("ASCII")
jpayne@68:             return f"{self.__class__.__name__}('{start}...{end}')"
jpayne@68:         else:
jpayne@68:             data = data.decode("ASCII")
jpayne@68:             return f"{self.__class__.__name__}('{data}')"
jpayne@68: 
jpayne@68:     def __str__(self):
jpayne@68:         """Return the full sequence as a python string."""
jpayne@68:         return self._data.decode("ASCII")
jpayne@68: 
jpayne@68:     def __eq__(self, other):
jpayne@68:         """Compare the sequence to another sequence or a string.
jpayne@68: 
jpayne@68:         Sequences are equal to each other if their sequence contents is
jpayne@68:         identical:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq, MutableSeq
jpayne@68:         >>> seq1 = Seq("ACGT")
jpayne@68:         >>> seq2 = Seq("ACGT")
jpayne@68:         >>> mutable_seq = MutableSeq("ACGT")
jpayne@68:         >>> seq1 == seq2
jpayne@68:         True
jpayne@68:         >>> seq1 == mutable_seq
jpayne@68:         True
jpayne@68:         >>> seq1 == "ACGT"
jpayne@68:         True
jpayne@68: 
jpayne@68:         Note that the sequence objects themselves are not identical to each
jpayne@68:         other:
jpayne@68: 
jpayne@68:         >>> id(seq1) == id(seq2)
jpayne@68:         False
jpayne@68:         >>> seq1 is seq2
jpayne@68:         False
jpayne@68: 
jpayne@68:         Sequences can also be compared to strings, ``bytes``, and ``bytearray``
jpayne@68:         objects:
jpayne@68: 
jpayne@68:         >>> seq1 == "ACGT"
jpayne@68:         True
jpayne@68:         >>> seq1 == b"ACGT"
jpayne@68:         True
jpayne@68:         >>> seq1 == bytearray(b"ACGT")
jpayne@68:         True
jpayne@68:         """
jpayne@68:         if isinstance(other, _SeqAbstractBaseClass):
jpayne@68:             return self._data == other._data
jpayne@68:         elif isinstance(other, str):
jpayne@68:             return self._data == other.encode("ASCII")
jpayne@68:         else:
jpayne@68:             return self._data == other
jpayne@68: 
jpayne@68:     def __lt__(self, other):
jpayne@68:         """Implement the less-than operand."""
jpayne@68:         if isinstance(other, _SeqAbstractBaseClass):
jpayne@68:             return self._data < other._data
jpayne@68:         elif isinstance(other, str):
jpayne@68:             return self._data < other.encode("ASCII")
jpayne@68:         else:
jpayne@68:             return self._data < other
jpayne@68: 
jpayne@68:     def __le__(self, other):
jpayne@68:         """Implement the less-than or equal operand."""
jpayne@68:         if isinstance(other, _SeqAbstractBaseClass):
jpayne@68:             return self._data <= other._data
jpayne@68:         elif isinstance(other, str):
jpayne@68:             return self._data <= other.encode("ASCII")
jpayne@68:         else:
jpayne@68:             return self._data <= other
jpayne@68: 
jpayne@68:     def __gt__(self, other):
jpayne@68:         """Implement the greater-than operand."""
jpayne@68:         if isinstance(other, _SeqAbstractBaseClass):
jpayne@68:             return self._data > other._data
jpayne@68:         elif isinstance(other, str):
jpayne@68:             return self._data > other.encode("ASCII")
jpayne@68:         else:
jpayne@68:             return self._data > other
jpayne@68: 
jpayne@68:     def __ge__(self, other):
jpayne@68:         """Implement the greater-than or equal operand."""
jpayne@68:         if isinstance(other, _SeqAbstractBaseClass):
jpayne@68:             return self._data >= other._data
jpayne@68:         elif isinstance(other, str):
jpayne@68:             return self._data >= other.encode("ASCII")
jpayne@68:         else:
jpayne@68:             return self._data >= other
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         """Return the length of the sequence."""
jpayne@68:         return len(self._data)
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         """Return an iterable of the sequence."""
jpayne@68:         return self._data.decode("ASCII").__iter__()
jpayne@68: 
jpayne@68:     @overload
jpayne@68:     def __getitem__(self, index: int) -> str:
jpayne@68:         ...
jpayne@68: 
jpayne@68:     @overload
jpayne@68:     def __getitem__(self, index: slice) -> "Seq":
jpayne@68:         ...
jpayne@68: 
jpayne@68:     def __getitem__(self, index):
jpayne@68:         """Return a subsequence as a single letter or as a sequence object.
jpayne@68: 
jpayne@68:         If the index is an integer, a single letter is returned as a Python
jpayne@68:         string:
jpayne@68: 
jpayne@68:         >>> seq = Seq('ACTCGACGTCG')
jpayne@68:         >>> seq[5]
jpayne@68:         'A'
jpayne@68: 
jpayne@68:         Otherwise, a new sequence object of the same class is returned:
jpayne@68: 
jpayne@68:         >>> seq[5:8]
jpayne@68:         Seq('ACG')
jpayne@68:         >>> mutable_seq = MutableSeq('ACTCGACGTCG')
jpayne@68:         >>> mutable_seq[5:8]
jpayne@68:         MutableSeq('ACG')
jpayne@68:         """
jpayne@68:         if isinstance(index, numbers.Integral):
jpayne@68:             # Return a single letter as a string
jpayne@68:             return chr(self._data[index])
jpayne@68:         else:
jpayne@68:             # Return the (sub)sequence as another Seq/MutableSeq object
jpayne@68:             return self.__class__(self._data[index])
jpayne@68: 
jpayne@68:     def __add__(self, other):
jpayne@68:         """Add a sequence or string to this sequence.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq, MutableSeq
jpayne@68:         >>> Seq("MELKI") + "LV"
jpayne@68:         Seq('MELKILV')
jpayne@68:         >>> MutableSeq("MELKI") + "LV"
jpayne@68:         MutableSeq('MELKILV')
jpayne@68:         """
jpayne@68:         if isinstance(other, _SeqAbstractBaseClass):
jpayne@68:             return self.__class__(self._data + other._data)
jpayne@68:         elif isinstance(other, str):
jpayne@68:             return self.__class__(self._data + other.encode("ASCII"))
jpayne@68:         else:
jpayne@68:             # If other is a SeqRecord, then SeqRecord's __radd__ will handle
jpayne@68:             # this. If not, returning NotImplemented will trigger a TypeError.
jpayne@68:             return NotImplemented
jpayne@68: 
jpayne@68:     def __radd__(self, other):
jpayne@68:         """Add a sequence string on the left.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq, MutableSeq
jpayne@68:         >>> "LV" + Seq("MELKI")
jpayne@68:         Seq('LVMELKI')
jpayne@68:         >>> "LV" + MutableSeq("MELKI")
jpayne@68:         MutableSeq('LVMELKI')
jpayne@68: 
jpayne@68:         Adding two sequence objects is handled via the __add__ method.
jpayne@68:         """
jpayne@68:         if isinstance(other, str):
jpayne@68:             return self.__class__(other.encode("ASCII") + self._data)
jpayne@68:         else:
jpayne@68:             return NotImplemented
jpayne@68: 
jpayne@68:     def __mul__(self, other):
jpayne@68:         """Multiply sequence by integer.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq, MutableSeq
jpayne@68:         >>> Seq('ATG') * 2
jpayne@68:         Seq('ATGATG')
jpayne@68:         >>> MutableSeq('ATG') * 2
jpayne@68:         MutableSeq('ATGATG')
jpayne@68:         """
jpayne@68:         if not isinstance(other, numbers.Integral):
jpayne@68:             raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@68:         # we would like to simply write
jpayne@68:         # data = self._data * other
jpayne@68:         # here, but currently that causes a bug on PyPy if self._data is a
jpayne@68:         # bytearray and other is a numpy integer. Using this workaround:
jpayne@68:         data = self._data.__mul__(other)
jpayne@68:         return self.__class__(data)
jpayne@68: 
jpayne@68:     def __rmul__(self, other):
jpayne@68:         """Multiply integer by sequence.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> 2 * Seq('ATG')
jpayne@68:         Seq('ATGATG')
jpayne@68:         """
jpayne@68:         if not isinstance(other, numbers.Integral):
jpayne@68:             raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@68:         # we would like to simply write
jpayne@68:         # data = self._data * other
jpayne@68:         # here, but currently that causes a bug on PyPy if self._data is a
jpayne@68:         # bytearray and other is a numpy integer. Using this workaround:
jpayne@68:         data = self._data.__mul__(other)
jpayne@68:         return self.__class__(data)
jpayne@68: 
jpayne@68:     def __imul__(self, other):
jpayne@68:         """Multiply the sequence object by other and assign.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> seq = Seq('ATG')
jpayne@68:         >>> seq *= 2
jpayne@68:         >>> seq
jpayne@68:         Seq('ATGATG')
jpayne@68: 
jpayne@68:         Note that this is different from in-place multiplication. The ``seq``
jpayne@68:         variable is reassigned to the multiplication result, but any variable
jpayne@68:         pointing to ``seq`` will remain unchanged:
jpayne@68: 
jpayne@68:         >>> seq = Seq('ATG')
jpayne@68:         >>> seq2 = seq
jpayne@68:         >>> id(seq) == id(seq2)
jpayne@68:         True
jpayne@68:         >>> seq *= 2
jpayne@68:         >>> seq
jpayne@68:         Seq('ATGATG')
jpayne@68:         >>> seq2
jpayne@68:         Seq('ATG')
jpayne@68:         >>> id(seq) == id(seq2)
jpayne@68:         False
jpayne@68:         """
jpayne@68:         if not isinstance(other, numbers.Integral):
jpayne@68:             raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@68:         # we would like to simply write
jpayne@68:         # data = self._data * other
jpayne@68:         # here, but currently that causes a bug on PyPy if self._data is a
jpayne@68:         # bytearray and other is a numpy integer. Using this workaround:
jpayne@68:         data = self._data.__mul__(other)
jpayne@68:         return self.__class__(data)
jpayne@68: 
jpayne@68:     def count(self, sub, start=None, end=None):
jpayne@68:         """Return a non-overlapping count, like that of a python string.
jpayne@68: 
jpayne@68:         The number of occurrences of substring argument sub in the
jpayne@68:         (sub)sequence given by [start:end] is returned as an integer.
jpayne@68:         Optional arguments start and end are interpreted as in slice
jpayne@68:         notation.
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - sub - a string or another Seq object to look for
jpayne@68:          - start - optional integer, slice start
jpayne@68:          - end - optional integer, slice end
jpayne@68: 
jpayne@68:         e.g.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_seq = Seq("AAAATGA")
jpayne@68:         >>> print(my_seq.count("A"))
jpayne@68:         5
jpayne@68:         >>> print(my_seq.count("ATG"))
jpayne@68:         1
jpayne@68:         >>> print(my_seq.count(Seq("AT")))
jpayne@68:         1
jpayne@68:         >>> print(my_seq.count("AT", 2, -1))
jpayne@68:         1
jpayne@68: 
jpayne@68:         HOWEVER, please note because the ``count`` method of Seq and MutableSeq
jpayne@68:         objects, like that of Python strings, do a non-overlapping search, this
jpayne@68:         may not give the answer you expect:
jpayne@68: 
jpayne@68:         >>> "AAAA".count("AA")
jpayne@68:         2
jpayne@68:         >>> print(Seq("AAAA").count("AA"))
jpayne@68:         2
jpayne@68: 
jpayne@68:         For an overlapping search, use the ``count_overlap`` method:
jpayne@68: 
jpayne@68:         >>> print(Seq("AAAA").count_overlap("AA"))
jpayne@68:         3
jpayne@68:         """
jpayne@68:         if isinstance(sub, MutableSeq):
jpayne@68:             sub = sub._data
jpayne@68:         elif isinstance(sub, Seq):
jpayne@68:             sub = bytes(sub)
jpayne@68:         elif isinstance(sub, str):
jpayne@68:             sub = sub.encode("ASCII")
jpayne@68:         elif not isinstance(sub, (bytes, bytearray)):
jpayne@68:             raise TypeError(
jpayne@68:                 "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
jpayne@68:                 % type(sub)
jpayne@68:             )
jpayne@68:         return self._data.count(sub, start, end)
jpayne@68: 
jpayne@68:     def count_overlap(self, sub, start=None, end=None):
jpayne@68:         """Return an overlapping count.
jpayne@68: 
jpayne@68:         Returns an integer, the number of occurrences of substring
jpayne@68:         argument sub in the (sub)sequence given by [start:end].
jpayne@68:         Optional arguments start and end are interpreted as in slice
jpayne@68:         notation.
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - sub - a string or another Seq object to look for
jpayne@68:          - start - optional integer, slice start
jpayne@68:          - end - optional integer, slice end
jpayne@68: 
jpayne@68:         e.g.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> print(Seq("AAAA").count_overlap("AA"))
jpayne@68:         3
jpayne@68:         >>> print(Seq("ATATATATA").count_overlap("ATA"))
jpayne@68:         4
jpayne@68:         >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
jpayne@68:         1
jpayne@68: 
jpayne@68:         For a non-overlapping search, use the ``count`` method:
jpayne@68: 
jpayne@68:         >>> print(Seq("AAAA").count("AA"))
jpayne@68:         2
jpayne@68: 
jpayne@68:         Where substrings do not overlap, ``count_overlap`` behaves the same as
jpayne@68:         the ``count`` method:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_seq = Seq("AAAATGA")
jpayne@68:         >>> print(my_seq.count_overlap("A"))
jpayne@68:         5
jpayne@68:         >>> my_seq.count_overlap("A") == my_seq.count("A")
jpayne@68:         True
jpayne@68:         >>> print(my_seq.count_overlap("ATG"))
jpayne@68:         1
jpayne@68:         >>> my_seq.count_overlap("ATG") == my_seq.count("ATG")
jpayne@68:         True
jpayne@68:         >>> print(my_seq.count_overlap(Seq("AT")))
jpayne@68:         1
jpayne@68:         >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
jpayne@68:         True
jpayne@68:         >>> print(my_seq.count_overlap("AT", 2, -1))
jpayne@68:         1
jpayne@68:         >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
jpayne@68:         True
jpayne@68: 
jpayne@68:         HOWEVER, do not use this method for such cases because the
jpayne@68:         count() method is much for efficient.
jpayne@68:         """
jpayne@68:         if isinstance(sub, MutableSeq):
jpayne@68:             sub = sub._data
jpayne@68:         elif isinstance(sub, Seq):
jpayne@68:             sub = bytes(sub)
jpayne@68:         elif isinstance(sub, str):
jpayne@68:             sub = sub.encode("ASCII")
jpayne@68:         elif not isinstance(sub, (bytes, bytearray)):
jpayne@68:             raise TypeError(
jpayne@68:                 "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
jpayne@68:                 % type(sub)
jpayne@68:             )
jpayne@68:         data = self._data
jpayne@68:         overlap_count = 0
jpayne@68:         while True:
jpayne@68:             start = data.find(sub, start, end) + 1
jpayne@68:             if start != 0:
jpayne@68:                 overlap_count += 1
jpayne@68:             else:
jpayne@68:                 return overlap_count
jpayne@68: 
jpayne@68:     def __contains__(self, item):
jpayne@68:         """Return True if item is a subsequence of the sequence, and False otherwise.
jpayne@68: 
jpayne@68:         e.g.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq, MutableSeq
jpayne@68:         >>> my_dna = Seq("ATATGAAATTTGAAAA")
jpayne@68:         >>> "AAA" in my_dna
jpayne@68:         True
jpayne@68:         >>> Seq("AAA") in my_dna
jpayne@68:         True
jpayne@68:         >>> MutableSeq("AAA") in my_dna
jpayne@68:         True
jpayne@68:         """
jpayne@68:         if isinstance(item, _SeqAbstractBaseClass):
jpayne@68:             item = bytes(item)
jpayne@68:         elif isinstance(item, str):
jpayne@68:             item = item.encode("ASCII")
jpayne@68:         return item in self._data
jpayne@68: 
jpayne@68:     def find(self, sub, start=None, end=None):
jpayne@68:         """Return the lowest index in the sequence where subsequence sub is found.
jpayne@68: 
jpayne@68:         With optional arguments start and end, return the lowest index in the
jpayne@68:         sequence such that the subsequence sub is contained within the sequence
jpayne@68:         region [start:end].
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - sub - a string or another Seq or MutableSeq object to search for
jpayne@68:          - start - optional integer, slice start
jpayne@68:          - end - optional integer, slice end
jpayne@68: 
jpayne@68:         Returns -1 if the subsequence is NOT found.
jpayne@68: 
jpayne@68:         e.g. Locating the first typical start codon, AUG, in an RNA sequence:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
jpayne@68:         >>> my_rna.find("AUG")
jpayne@68:         3
jpayne@68: 
jpayne@68:         The next typical start codon can then be found by starting the search
jpayne@68:         at position 4:
jpayne@68: 
jpayne@68:         >>> my_rna.find("AUG", 4)
jpayne@68:         15
jpayne@68: 
jpayne@68:         See the ``search`` method to find the locations of multiple subsequences
jpayne@68:         at the same time.
jpayne@68:         """
jpayne@68:         if isinstance(sub, _SeqAbstractBaseClass):
jpayne@68:             sub = bytes(sub)
jpayne@68:         elif isinstance(sub, str):
jpayne@68:             sub = sub.encode("ASCII")
jpayne@68:         elif not isinstance(sub, (bytes, bytearray)):
jpayne@68:             raise TypeError(
jpayne@68:                 "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
jpayne@68:                 % type(sub)
jpayne@68:             )
jpayne@68:         return self._data.find(sub, start, end)
jpayne@68: 
jpayne@68:     def rfind(self, sub, start=None, end=None):
jpayne@68:         """Return the highest index in the sequence where subsequence sub is found.
jpayne@68: 
jpayne@68:         With optional arguments start and end, return the highest index in the
jpayne@68:         sequence such that the subsequence sub is contained within the sequence
jpayne@68:         region [start:end].
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - sub - a string or another Seq or MutableSeq object to search for
jpayne@68:          - start - optional integer, slice start
jpayne@68:          - end - optional integer, slice end
jpayne@68: 
jpayne@68:         Returns -1 if the subsequence is NOT found.
jpayne@68: 
jpayne@68:         e.g. Locating the last typical start codon, AUG, in an RNA sequence:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
jpayne@68:         >>> my_rna.rfind("AUG")
jpayne@68:         15
jpayne@68: 
jpayne@68:         The location of the typical start codon before that can be found by
jpayne@68:         ending the search at position 15:
jpayne@68: 
jpayne@68:         >>> my_rna.rfind("AUG", end=15)
jpayne@68:         3
jpayne@68: 
jpayne@68:         See the ``search`` method to find the locations of multiple subsequences
jpayne@68:         at the same time.
jpayne@68:         """
jpayne@68:         if isinstance(sub, _SeqAbstractBaseClass):
jpayne@68:             sub = bytes(sub)
jpayne@68:         elif isinstance(sub, str):
jpayne@68:             sub = sub.encode("ASCII")
jpayne@68:         elif not isinstance(sub, (bytes, bytearray)):
jpayne@68:             raise TypeError(
jpayne@68:                 "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
jpayne@68:                 % type(sub)
jpayne@68:             )
jpayne@68:         return self._data.rfind(sub, start, end)
jpayne@68: 
jpayne@68:     def index(self, sub, start=None, end=None):
jpayne@68:         """Return the lowest index in the sequence where subsequence sub is found.
jpayne@68: 
jpayne@68:         With optional arguments start and end, return the lowest index in the
jpayne@68:         sequence such that the subsequence sub is contained within the sequence
jpayne@68:         region [start:end].
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - sub - a string or another Seq or MutableSeq object to search for
jpayne@68:          - start - optional integer, slice start
jpayne@68:          - end - optional integer, slice end
jpayne@68: 
jpayne@68:         Raises a ValueError if the subsequence is NOT found.
jpayne@68: 
jpayne@68:         e.g. Locating the first typical start codon, AUG, in an RNA sequence:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
jpayne@68:         >>> my_rna.index("AUG")
jpayne@68:         3
jpayne@68: 
jpayne@68:         The next typical start codon can then be found by starting the search
jpayne@68:         at position 4:
jpayne@68: 
jpayne@68:         >>> my_rna.index("AUG", 4)
jpayne@68:         15
jpayne@68: 
jpayne@68:         This method performs the same search as the ``find`` method.  However,
jpayne@68:         if the subsequence is not found, ``find`` returns -1 while ``index``
jpayne@68:         raises a ValueError:
jpayne@68: 
jpayne@68:         >>> my_rna.index("T")
jpayne@68:         Traceback (most recent call last):
jpayne@68:                    ...
jpayne@68:         ValueError: ...
jpayne@68:         >>> my_rna.find("T")
jpayne@68:         -1
jpayne@68: 
jpayne@68:         See the ``search`` method to find the locations of multiple subsequences
jpayne@68:         at the same time.
jpayne@68:         """
jpayne@68:         if isinstance(sub, MutableSeq):
jpayne@68:             sub = sub._data
jpayne@68:         elif isinstance(sub, Seq):
jpayne@68:             sub = bytes(sub)
jpayne@68:         elif isinstance(sub, str):
jpayne@68:             sub = sub.encode("ASCII")
jpayne@68:         elif not isinstance(sub, (bytes, bytearray)):
jpayne@68:             raise TypeError(
jpayne@68:                 "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
jpayne@68:                 % type(sub)
jpayne@68:             )
jpayne@68:         return self._data.index(sub, start, end)
jpayne@68: 
jpayne@68:     def rindex(self, sub, start=None, end=None):
jpayne@68:         """Return the highest index in the sequence where subsequence sub is found.
jpayne@68: 
jpayne@68:         With optional arguments start and end, return the highest index in the
jpayne@68:         sequence such that the subsequence sub is contained within the sequence
jpayne@68:         region [start:end].
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - sub - a string or another Seq or MutableSeq object to search for
jpayne@68:          - start - optional integer, slice start
jpayne@68:          - end - optional integer, slice end
jpayne@68: 
jpayne@68:         Returns -1 if the subsequence is NOT found.
jpayne@68: 
jpayne@68:         e.g. Locating the last typical start codon, AUG, in an RNA sequence:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
jpayne@68:         >>> my_rna.rindex("AUG")
jpayne@68:         15
jpayne@68: 
jpayne@68:         The location of the typical start codon before that can be found by
jpayne@68:         ending the search at position 15:
jpayne@68: 
jpayne@68:         >>> my_rna.rindex("AUG", end=15)
jpayne@68:         3
jpayne@68: 
jpayne@68:         This method performs the same search as the ``rfind`` method.  However,
jpayne@68:         if the subsequence is not found, ``rfind`` returns -1 which ``rindex``
jpayne@68:         raises a ValueError:
jpayne@68: 
jpayne@68:         >>> my_rna.rindex("T")
jpayne@68:         Traceback (most recent call last):
jpayne@68:                    ...
jpayne@68:         ValueError: ...
jpayne@68:         >>> my_rna.rfind("T")
jpayne@68:         -1
jpayne@68: 
jpayne@68:         See the ``search`` method to find the locations of multiple subsequences
jpayne@68:         at the same time.
jpayne@68:         """
jpayne@68:         if isinstance(sub, MutableSeq):
jpayne@68:             sub = sub._data
jpayne@68:         elif isinstance(sub, Seq):
jpayne@68:             sub = bytes(sub)
jpayne@68:         elif isinstance(sub, str):
jpayne@68:             sub = sub.encode("ASCII")
jpayne@68:         elif not isinstance(sub, (bytes, bytearray)):
jpayne@68:             raise TypeError(
jpayne@68:                 "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
jpayne@68:                 % type(sub)
jpayne@68:             )
jpayne@68:         return self._data.rindex(sub, start, end)
jpayne@68: 
jpayne@68:     def search(self, subs):
jpayne@68:         """Search the substrings subs in self and yield the index and substring found.
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
jpayne@68:            objects containing the substrings to search for.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
jpayne@68:         >>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
jpayne@68:         >>> for index, substring in matches:
jpayne@68:         ...     print(index, substring)
jpayne@68:         ...
jpayne@68:         7 CC
jpayne@68:         9 ATTG
jpayne@68:         20 CC
jpayne@68:         34 CC
jpayne@68:         34 CCC
jpayne@68:         35 CC
jpayne@68:         """
jpayne@68:         subdict = collections.defaultdict(set)
jpayne@68:         for index, sub in enumerate(subs):
jpayne@68:             if isinstance(sub, (_SeqAbstractBaseClass, bytearray)):
jpayne@68:                 sub = bytes(sub)
jpayne@68:             elif isinstance(sub, str):
jpayne@68:                 sub = sub.encode("ASCII")
jpayne@68:             elif not isinstance(sub, bytes):
jpayne@68:                 raise TypeError(
jpayne@68:                     "subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
jpayne@68:                     % (index, type(sub))
jpayne@68:                 )
jpayne@68:             length = len(sub)
jpayne@68:             subdict[length].add(sub)
jpayne@68:         for start in range(len(self) - 1):
jpayne@68:             for length, subs in subdict.items():
jpayne@68:                 stop = start + length
jpayne@68:                 for sub in subs:
jpayne@68:                     if self._data[start:stop] == sub:
jpayne@68:                         yield (start, sub.decode())
jpayne@68:                         break
jpayne@68: 
jpayne@68:     def startswith(self, prefix, start=None, end=None):
jpayne@68:         """Return True if the sequence starts with the given prefix, False otherwise.
jpayne@68: 
jpayne@68:         Return True if the sequence starts with the specified prefix
jpayne@68:         (a string or another Seq object), False otherwise.
jpayne@68:         With optional start, test sequence beginning at that position.
jpayne@68:         With optional end, stop comparing sequence at that position.
jpayne@68:         prefix can also be a tuple of strings to try.  e.g.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
jpayne@68:         >>> my_rna.startswith("GUC")
jpayne@68:         True
jpayne@68:         >>> my_rna.startswith("AUG")
jpayne@68:         False
jpayne@68:         >>> my_rna.startswith("AUG", 3)
jpayne@68:         True
jpayne@68:         >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
jpayne@68:         True
jpayne@68:         """
jpayne@68:         if isinstance(prefix, tuple):
jpayne@68:             prefix = tuple(
jpayne@68:                 bytes(p) if isinstance(p, _SeqAbstractBaseClass) else p.encode("ASCII")
jpayne@68:                 for p in prefix
jpayne@68:             )
jpayne@68:         elif isinstance(prefix, _SeqAbstractBaseClass):
jpayne@68:             prefix = bytes(prefix)
jpayne@68:         elif isinstance(prefix, str):
jpayne@68:             prefix = prefix.encode("ASCII")
jpayne@68:         return self._data.startswith(prefix, start, end)
jpayne@68: 
jpayne@68:     def endswith(self, suffix, start=None, end=None):
jpayne@68:         """Return True if the sequence ends with the given suffix, False otherwise.
jpayne@68: 
jpayne@68:         Return True if the sequence ends with the specified suffix
jpayne@68:         (a string or another Seq object), False otherwise.
jpayne@68:         With optional start, test sequence beginning at that position.
jpayne@68:         With optional end, stop comparing sequence at that position.
jpayne@68:         suffix can also be a tuple of strings to try.  e.g.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
jpayne@68:         >>> my_rna.endswith("UUG")
jpayne@68:         True
jpayne@68:         >>> my_rna.endswith("AUG")
jpayne@68:         False
jpayne@68:         >>> my_rna.endswith("AUG", 0, 18)
jpayne@68:         True
jpayne@68:         >>> my_rna.endswith(("UCC", "UCA", "UUG"))
jpayne@68:         True
jpayne@68:         """
jpayne@68:         if isinstance(suffix, tuple):
jpayne@68:             suffix = tuple(
jpayne@68:                 bytes(p) if isinstance(p, _SeqAbstractBaseClass) else p.encode("ASCII")
jpayne@68:                 for p in suffix
jpayne@68:             )
jpayne@68:         elif isinstance(suffix, _SeqAbstractBaseClass):
jpayne@68:             suffix = bytes(suffix)
jpayne@68:         elif isinstance(suffix, str):
jpayne@68:             suffix = suffix.encode("ASCII")
jpayne@68:         return self._data.endswith(suffix, start, end)
jpayne@68: 
jpayne@68:     def split(self, sep=None, maxsplit=-1):
jpayne@68:         """Return a list of subsequences when splitting the sequence by separator sep.
jpayne@68: 
jpayne@68:         Return a list of the subsequences in the sequence (as Seq objects),
jpayne@68:         using sep as the delimiter string.  If maxsplit is given, at
jpayne@68:         most maxsplit splits are done.  If maxsplit is omitted, all
jpayne@68:         splits are made.
jpayne@68: 
jpayne@68:         For consistency with the ``split`` method of Python strings, any
jpayne@68:         whitespace (tabs, spaces, newlines) is a separator if sep is None, the
jpayne@68:         default value
jpayne@68: 
jpayne@68:         e.g.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
jpayne@68:         >>> my_aa = my_rna.translate()
jpayne@68:         >>> my_aa
jpayne@68:         Seq('VMAIVMGR*KGAR*L')
jpayne@68:         >>> for pep in my_aa.split("*"):
jpayne@68:         ...     pep
jpayne@68:         Seq('VMAIVMGR')
jpayne@68:         Seq('KGAR')
jpayne@68:         Seq('L')
jpayne@68:         >>> for pep in my_aa.split("*", 1):
jpayne@68:         ...     pep
jpayne@68:         Seq('VMAIVMGR')
jpayne@68:         Seq('KGAR*L')
jpayne@68: 
jpayne@68:         See also the rsplit method, which splits the sequence starting from the
jpayne@68:         end:
jpayne@68: 
jpayne@68:         >>> for pep in my_aa.rsplit("*", 1):
jpayne@68:         ...     pep
jpayne@68:         Seq('VMAIVMGR*KGAR')
jpayne@68:         Seq('L')
jpayne@68:         """
jpayne@68:         if isinstance(sep, _SeqAbstractBaseClass):
jpayne@68:             sep = bytes(sep)
jpayne@68:         elif isinstance(sep, str):
jpayne@68:             sep = sep.encode("ASCII")
jpayne@68:         return [Seq(part) for part in self._data.split(sep, maxsplit)]
jpayne@68: 
jpayne@68:     def rsplit(self, sep=None, maxsplit=-1):
jpayne@68:         """Return a list of subsequences by splitting the sequence from the right.
jpayne@68: 
jpayne@68:         Return a list of the subsequences in the sequence (as Seq objects),
jpayne@68:         using sep as the delimiter string.  If maxsplit is given, at
jpayne@68:         most maxsplit splits are done.  If maxsplit is omitted, all
jpayne@68:         splits are made.
jpayne@68: 
jpayne@68:         For consistency with the ``rsplit`` method of Python strings, any
jpayne@68:         whitespace (tabs, spaces, newlines) is a separator if sep is None, the
jpayne@68:         default value
jpayne@68: 
jpayne@68:         e.g.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
jpayne@68:         >>> my_aa = my_rna.translate()
jpayne@68:         >>> my_aa
jpayne@68:         Seq('VMAIVMGR*KGAR*L')
jpayne@68:         >>> for pep in my_aa.rsplit("*"):
jpayne@68:         ...     pep
jpayne@68:         Seq('VMAIVMGR')
jpayne@68:         Seq('KGAR')
jpayne@68:         Seq('L')
jpayne@68:         >>> for pep in my_aa.rsplit("*", 1):
jpayne@68:         ...     pep
jpayne@68:         Seq('VMAIVMGR*KGAR')
jpayne@68:         Seq('L')
jpayne@68: 
jpayne@68:         See also the split method, which splits the sequence starting from the
jpayne@68:         beginning:
jpayne@68: 
jpayne@68:         >>> for pep in my_aa.split("*", 1):
jpayne@68:         ...     pep
jpayne@68:         Seq('VMAIVMGR')
jpayne@68:         Seq('KGAR*L')
jpayne@68:         """
jpayne@68:         if isinstance(sep, _SeqAbstractBaseClass):
jpayne@68:             sep = bytes(sep)
jpayne@68:         elif isinstance(sep, str):
jpayne@68:             sep = sep.encode("ASCII")
jpayne@68:         return [Seq(part) for part in self._data.rsplit(sep, maxsplit)]
jpayne@68: 
jpayne@68:     def strip(self, chars=None, inplace=False):
jpayne@68:         """Return a sequence object with leading and trailing ends stripped.
jpayne@68: 
jpayne@68:         With default arguments, leading and trailing whitespace is removed:
jpayne@68: 
jpayne@68:         >>> seq = Seq(" ACGT ")
jpayne@68:         >>> seq.strip()
jpayne@68:         Seq('ACGT')
jpayne@68:         >>> seq
jpayne@68:         Seq(' ACGT ')
jpayne@68: 
jpayne@68:         If ``chars`` is given and not ``None``, remove characters in ``chars``
jpayne@68:         instead.  The order of the characters to be removed is not important:
jpayne@68: 
jpayne@68:         >>> Seq("ACGTACGT").strip("TGCA")
jpayne@68:         Seq('')
jpayne@68: 
jpayne@68:         A copy of the sequence is returned if ``inplace`` is ``False`` (the
jpayne@68:         default value).  If ``inplace`` is ``True``, the sequence is stripped
jpayne@68:         in-place and returned.
jpayne@68: 
jpayne@68:         >>> seq = MutableSeq(" ACGT ")
jpayne@68:         >>> seq.strip()
jpayne@68:         MutableSeq('ACGT')
jpayne@68:         >>> seq
jpayne@68:         MutableSeq(' ACGT ')
jpayne@68:         >>> seq.strip(inplace=True)
jpayne@68:         MutableSeq('ACGT')
jpayne@68:         >>> seq
jpayne@68:         MutableSeq('ACGT')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
jpayne@68:         is called on a ``Seq`` object with ``inplace=True``.
jpayne@68: 
jpayne@68:         See also the lstrip and rstrip methods.
jpayne@68:         """
jpayne@68:         if isinstance(chars, _SeqAbstractBaseClass):
jpayne@68:             chars = bytes(chars)
jpayne@68:         elif isinstance(chars, str):
jpayne@68:             chars = chars.encode("ASCII")
jpayne@68:         try:
jpayne@68:             data = self._data.strip(chars)
jpayne@68:         except TypeError:
jpayne@68:             raise TypeError(
jpayne@68:                 "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
jpayne@68:             ) from None
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         else:
jpayne@68:             return self.__class__(data)
jpayne@68: 
jpayne@68:     def lstrip(self, chars=None, inplace=False):
jpayne@68:         """Return a sequence object with leading and trailing ends stripped.
jpayne@68: 
jpayne@68:         With default arguments, leading whitespace is removed:
jpayne@68: 
jpayne@68:         >>> seq = Seq(" ACGT ")
jpayne@68:         >>> seq.lstrip()
jpayne@68:         Seq('ACGT ')
jpayne@68:         >>> seq
jpayne@68:         Seq(' ACGT ')
jpayne@68: 
jpayne@68:         If ``chars`` is given and not ``None``, remove characters in ``chars``
jpayne@68:         from the leading end instead.  The order of the characters to be removed
jpayne@68:         is not important:
jpayne@68: 
jpayne@68:         >>> Seq("ACGACGTTACG").lstrip("GCA")
jpayne@68:         Seq('TTACG')
jpayne@68: 
jpayne@68:         A copy of the sequence is returned if ``inplace`` is ``False`` (the
jpayne@68:         default value).  If ``inplace`` is ``True``, the sequence is stripped
jpayne@68:         in-place and returned.
jpayne@68: 
jpayne@68:         >>> seq = MutableSeq(" ACGT ")
jpayne@68:         >>> seq.lstrip()
jpayne@68:         MutableSeq('ACGT ')
jpayne@68:         >>> seq
jpayne@68:         MutableSeq(' ACGT ')
jpayne@68:         >>> seq.lstrip(inplace=True)
jpayne@68:         MutableSeq('ACGT ')
jpayne@68:         >>> seq
jpayne@68:         MutableSeq('ACGT ')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``lstrip`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68: 
jpayne@68:         See also the strip and rstrip methods.
jpayne@68:         """
jpayne@68:         if isinstance(chars, _SeqAbstractBaseClass):
jpayne@68:             chars = bytes(chars)
jpayne@68:         elif isinstance(chars, str):
jpayne@68:             chars = chars.encode("ASCII")
jpayne@68:         try:
jpayne@68:             data = self._data.lstrip(chars)
jpayne@68:         except TypeError:
jpayne@68:             raise TypeError(
jpayne@68:                 "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
jpayne@68:             ) from None
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         else:
jpayne@68:             return self.__class__(data)
jpayne@68: 
jpayne@68:     def rstrip(self, chars=None, inplace=False):
jpayne@68:         """Return a sequence object with trailing ends stripped.
jpayne@68: 
jpayne@68:         With default arguments, trailing whitespace is removed:
jpayne@68: 
jpayne@68:         >>> seq = Seq(" ACGT ")
jpayne@68:         >>> seq.rstrip()
jpayne@68:         Seq(' ACGT')
jpayne@68:         >>> seq
jpayne@68:         Seq(' ACGT ')
jpayne@68: 
jpayne@68:         If ``chars`` is given and not ``None``, remove characters in ``chars``
jpayne@68:         from the trailing end instead.  The order of the characters to be
jpayne@68:         removed is not important:
jpayne@68: 
jpayne@68:         >>> Seq("ACGACGTTACG").rstrip("GCA")
jpayne@68:         Seq('ACGACGTT')
jpayne@68: 
jpayne@68:         A copy of the sequence is returned if ``inplace`` is ``False`` (the
jpayne@68:         default value).  If ``inplace`` is ``True``, the sequence is stripped
jpayne@68:         in-place and returned.
jpayne@68: 
jpayne@68:         >>> seq = MutableSeq(" ACGT ")
jpayne@68:         >>> seq.rstrip()
jpayne@68:         MutableSeq(' ACGT')
jpayne@68:         >>> seq
jpayne@68:         MutableSeq(' ACGT ')
jpayne@68:         >>> seq.rstrip(inplace=True)
jpayne@68:         MutableSeq(' ACGT')
jpayne@68:         >>> seq
jpayne@68:         MutableSeq(' ACGT')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``rstrip`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68: 
jpayne@68:         See also the strip and lstrip methods.
jpayne@68:         """
jpayne@68:         if isinstance(chars, _SeqAbstractBaseClass):
jpayne@68:             chars = bytes(chars)
jpayne@68:         elif isinstance(chars, str):
jpayne@68:             chars = chars.encode("ASCII")
jpayne@68:         try:
jpayne@68:             data = self._data.rstrip(chars)
jpayne@68:         except TypeError:
jpayne@68:             raise TypeError(
jpayne@68:                 "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
jpayne@68:             ) from None
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         else:
jpayne@68:             return self.__class__(data)
jpayne@68: 
jpayne@68:     def removeprefix(self, prefix, inplace=False):
jpayne@68:         """Return a new Seq object with prefix (left) removed.
jpayne@68: 
jpayne@68:         This behaves like the python string method of the same name.
jpayne@68: 
jpayne@68:         e.g. Removing a start Codon:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_seq = Seq("ATGGTGTGTGT")
jpayne@68:         >>> my_seq
jpayne@68:         Seq('ATGGTGTGTGT')
jpayne@68:         >>> my_seq.removeprefix('ATG')
jpayne@68:         Seq('GTGTGTGT')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68: 
jpayne@68:         See also the removesuffix method.
jpayne@68:         """
jpayne@68:         if isinstance(prefix, _SeqAbstractBaseClass):
jpayne@68:             prefix = bytes(prefix)
jpayne@68:         elif isinstance(prefix, str):
jpayne@68:             prefix = prefix.encode("ASCII")
jpayne@68:         try:
jpayne@68:             data = self._data.removeprefix(prefix)
jpayne@68:         except TypeError:
jpayne@68:             raise TypeError(
jpayne@68:                 "argument must be a string, Seq, MutableSeq, or bytes-like object"
jpayne@68:             ) from None
jpayne@68:         except AttributeError:
jpayne@68:             # Fall back for pre-Python 3.9
jpayne@68:             data = self._data
jpayne@68:             if data.startswith(prefix):
jpayne@68:                 data = data[len(prefix) :]
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         else:
jpayne@68:             return self.__class__(data)
jpayne@68: 
jpayne@68:     def removesuffix(self, suffix, inplace=False):
jpayne@68:         """Return a new Seq object with suffix (right) removed.
jpayne@68: 
jpayne@68:         This behaves like the python string method of the same name.
jpayne@68: 
jpayne@68:         e.g. Removing a stop codon:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_seq = Seq("GTGTGTGTTAG")
jpayne@68:         >>> my_seq
jpayne@68:         Seq('GTGTGTGTTAG')
jpayne@68:         >>> stop_codon = Seq("TAG")
jpayne@68:         >>> my_seq.removesuffix(stop_codon)
jpayne@68:         Seq('GTGTGTGT')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68: 
jpayne@68:         See also the removeprefix method.
jpayne@68:         """
jpayne@68:         if isinstance(suffix, _SeqAbstractBaseClass):
jpayne@68:             suffix = bytes(suffix)
jpayne@68:         elif isinstance(suffix, str):
jpayne@68:             suffix = suffix.encode("ASCII")
jpayne@68:         try:
jpayne@68:             data = self._data.removesuffix(suffix)
jpayne@68:         except TypeError:
jpayne@68:             raise TypeError(
jpayne@68:                 "argument must be a string, Seq, MutableSeq, or bytes-like object"
jpayne@68:             ) from None
jpayne@68:         except AttributeError:
jpayne@68:             # Fall back for pre-Python 3.9
jpayne@68:             data = self._data
jpayne@68:             if data.endswith(suffix):
jpayne@68:                 data = data[: -len(suffix)]
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         else:
jpayne@68:             return self.__class__(data)
jpayne@68: 
jpayne@68:     def upper(self, inplace=False):
jpayne@68:         """Return the sequence in upper case.
jpayne@68: 
jpayne@68:         An upper-case copy of the sequence is returned if inplace is False,
jpayne@68:         the default value:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq, MutableSeq
jpayne@68:         >>> my_seq = Seq("VHLTPeeK*")
jpayne@68:         >>> my_seq
jpayne@68:         Seq('VHLTPeeK*')
jpayne@68:         >>> my_seq.lower()
jpayne@68:         Seq('vhltpeek*')
jpayne@68:         >>> my_seq.upper()
jpayne@68:         Seq('VHLTPEEK*')
jpayne@68:         >>> my_seq
jpayne@68:         Seq('VHLTPeeK*')
jpayne@68: 
jpayne@68:         The sequence is modified in-place and returned if inplace is True:
jpayne@68: 
jpayne@68:         >>> my_seq = MutableSeq("VHLTPeeK*")
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('VHLTPeeK*')
jpayne@68:         >>> my_seq.lower()
jpayne@68:         MutableSeq('vhltpeek*')
jpayne@68:         >>> my_seq.upper()
jpayne@68:         MutableSeq('VHLTPEEK*')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('VHLTPeeK*')
jpayne@68: 
jpayne@68:         >>> my_seq.lower(inplace=True)
jpayne@68:         MutableSeq('vhltpeek*')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('vhltpeek*')
jpayne@68:         >>> my_seq.upper(inplace=True)
jpayne@68:         MutableSeq('VHLTPEEK*')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('VHLTPEEK*')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``upper`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68: 
jpayne@68:         See also the ``lower`` method.
jpayne@68:         """
jpayne@68:         data = self._data.upper()
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         else:
jpayne@68:             return self.__class__(data)
jpayne@68: 
jpayne@68:     def lower(self, inplace=False):
jpayne@68:         """Return the sequence in lower case.
jpayne@68: 
jpayne@68:         An lower-case copy of the sequence is returned if inplace is False,
jpayne@68:         the default value:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq, MutableSeq
jpayne@68:         >>> my_seq = Seq("VHLTPeeK*")
jpayne@68:         >>> my_seq
jpayne@68:         Seq('VHLTPeeK*')
jpayne@68:         >>> my_seq.lower()
jpayne@68:         Seq('vhltpeek*')
jpayne@68:         >>> my_seq.upper()
jpayne@68:         Seq('VHLTPEEK*')
jpayne@68:         >>> my_seq
jpayne@68:         Seq('VHLTPeeK*')
jpayne@68: 
jpayne@68:         The sequence is modified in-place and returned if inplace is True:
jpayne@68: 
jpayne@68:         >>> my_seq = MutableSeq("VHLTPeeK*")
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('VHLTPeeK*')
jpayne@68:         >>> my_seq.lower()
jpayne@68:         MutableSeq('vhltpeek*')
jpayne@68:         >>> my_seq.upper()
jpayne@68:         MutableSeq('VHLTPEEK*')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('VHLTPeeK*')
jpayne@68: 
jpayne@68:         >>> my_seq.lower(inplace=True)
jpayne@68:         MutableSeq('vhltpeek*')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('vhltpeek*')
jpayne@68:         >>> my_seq.upper(inplace=True)
jpayne@68:         MutableSeq('VHLTPEEK*')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('VHLTPEEK*')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``lower`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68: 
jpayne@68:         See also the ``upper`` method.
jpayne@68:         """
jpayne@68:         data = self._data.lower()
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         else:
jpayne@68:             return self.__class__(data)
jpayne@68: 
jpayne@68:     def isupper(self):
jpayne@68:         """Return True if all ASCII characters in data are uppercase.
jpayne@68: 
jpayne@68:         If there are no cased characters, the method returns False.
jpayne@68:         """
jpayne@68:         return self._data.isupper()
jpayne@68: 
jpayne@68:     def islower(self):
jpayne@68:         """Return True if all ASCII characters in data are lowercase.
jpayne@68: 
jpayne@68:         If there are no cased characters, the method returns False.
jpayne@68:         """
jpayne@68:         return self._data.islower()
jpayne@68: 
jpayne@68:     def translate(
jpayne@68:         self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
jpayne@68:     ):
jpayne@68:         """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.
jpayne@68: 
jpayne@68:         This method will translate DNA or RNA sequences. It should not
jpayne@68:         be used on protein sequences as any result will be biologically
jpayne@68:         meaningless.
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - table - Which codon table to use?  This can be either a name
jpayne@68:            (string), an NCBI identifier (integer), or a CodonTable
jpayne@68:            object (useful for non-standard genetic codes).  This
jpayne@68:            defaults to the "Standard" table.
jpayne@68:          - stop_symbol - Single character string, what to use for
jpayne@68:            terminators.  This defaults to the asterisk, "*".
jpayne@68:          - to_stop - Boolean, defaults to False meaning do a full
jpayne@68:            translation continuing on past any stop codons (translated as the
jpayne@68:            specified stop_symbol).  If True, translation is terminated at
jpayne@68:            the first in frame stop codon (and the stop_symbol is not
jpayne@68:            appended to the returned protein sequence).
jpayne@68:          - cds - Boolean, indicates this is a complete CDS.  If True,
jpayne@68:            this checks the sequence starts with a valid alternative start
jpayne@68:            codon (which will be translated as methionine, M), that the
jpayne@68:            sequence length is a multiple of three, and that there is a
jpayne@68:            single in frame stop codon at the end (this will be excluded
jpayne@68:            from the protein sequence, regardless of the to_stop option).
jpayne@68:            If these tests fail, an exception is raised.
jpayne@68:          - gap - Single character string to denote symbol used for gaps.
jpayne@68:            Defaults to the minus sign.
jpayne@68: 
jpayne@68:         A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
jpayne@68:         object; a ``MutableSeq`` object is returned if ``translate`` is called
jpayne@68:         pn a ``MutableSeq`` object.
jpayne@68: 
jpayne@68:         e.g. Using the standard table:
jpayne@68: 
jpayne@68:         >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
jpayne@68:         >>> coding_dna.translate()
jpayne@68:         Seq('VAIVMGR*KGAR*')
jpayne@68:         >>> coding_dna.translate(stop_symbol="@")
jpayne@68:         Seq('VAIVMGR@KGAR@')
jpayne@68:         >>> coding_dna.translate(to_stop=True)
jpayne@68:         Seq('VAIVMGR')
jpayne@68: 
jpayne@68:         Now using NCBI table 2, where TGA is not a stop codon:
jpayne@68: 
jpayne@68:         >>> coding_dna.translate(table=2)
jpayne@68:         Seq('VAIVMGRWKGAR*')
jpayne@68:         >>> coding_dna.translate(table=2, to_stop=True)
jpayne@68:         Seq('VAIVMGRWKGAR')
jpayne@68: 
jpayne@68:         In fact, GTG is an alternative start codon under NCBI table 2, meaning
jpayne@68:         this sequence could be a complete CDS:
jpayne@68: 
jpayne@68:         >>> coding_dna.translate(table=2, cds=True)
jpayne@68:         Seq('MAIVMGRWKGAR')
jpayne@68: 
jpayne@68:         It isn't a valid CDS under NCBI table 1, due to both the start codon
jpayne@68:         and also the in frame stop codons:
jpayne@68: 
jpayne@68:         >>> coding_dna.translate(table=1, cds=True)
jpayne@68:         Traceback (most recent call last):
jpayne@68:             ...
jpayne@68:         Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon
jpayne@68: 
jpayne@68:         If the sequence has no in-frame stop codon, then the to_stop argument
jpayne@68:         has no effect:
jpayne@68: 
jpayne@68:         >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
jpayne@68:         >>> coding_dna2.translate()
jpayne@68:         Seq('LAIVMGR')
jpayne@68:         >>> coding_dna2.translate(to_stop=True)
jpayne@68:         Seq('LAIVMGR')
jpayne@68: 
jpayne@68:         NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
jpayne@68:         or a stop codon.  These are translated as "X".  Any invalid codon
jpayne@68:         (e.g. "TA?" or "T-A") will throw a TranslationError.
jpayne@68: 
jpayne@68:         NOTE - This does NOT behave like the python string's translate
jpayne@68:         method.  For that use str(my_seq).translate(...) instead
jpayne@68:         """
jpayne@68:         try:
jpayne@68:             data = str(self)
jpayne@68:         except UndefinedSequenceError:
jpayne@68:             # translating an undefined sequence yields an undefined
jpayne@68:             # sequence with the length divided by 3
jpayne@68:             n = len(self)
jpayne@68:             if n % 3 != 0:
jpayne@68:                 warnings.warn(
jpayne@68:                     "Partial codon, len(sequence) not a multiple of three. "
jpayne@68:                     "This may become an error in future.",
jpayne@68:                     BiopythonWarning,
jpayne@68:                 )
jpayne@68:             return Seq(None, n // 3)
jpayne@68: 
jpayne@68:         return self.__class__(
jpayne@68:             _translate_str(str(self), table, stop_symbol, to_stop, cds, gap=gap)
jpayne@68:         )
jpayne@68: 
jpayne@68:     def complement(self, inplace=False):
jpayne@68:         """Return the complement as a DNA sequence.
jpayne@68: 
jpayne@68:         >>> Seq("CGA").complement()
jpayne@68:         Seq('GCT')
jpayne@68: 
jpayne@68:         Any U in the sequence is treated as a T:
jpayne@68: 
jpayne@68:         >>> Seq("CGAUT").complement()
jpayne@68:         Seq('GCTAA')
jpayne@68: 
jpayne@68:         In contrast, ``complement_rna`` returns an RNA sequence:
jpayne@68: 
jpayne@68:         >>> Seq("CGAUT").complement_rna()
jpayne@68:         Seq('GCUAA')
jpayne@68: 
jpayne@68:         The sequence is modified in-place and returned if inplace is True:
jpayne@68: 
jpayne@68:         >>> my_seq = MutableSeq("CGA")
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('CGA')
jpayne@68:         >>> my_seq.complement()
jpayne@68:         MutableSeq('GCT')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('CGA')
jpayne@68: 
jpayne@68:         >>> my_seq.complement(inplace=True)
jpayne@68:         MutableSeq('GCT')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('GCT')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68:         """
jpayne@68:         ttable = _dna_complement_table
jpayne@68:         try:
jpayne@68:             data = self._data.translate(ttable)
jpayne@68:         except UndefinedSequenceError:
jpayne@68:             # complement of an undefined sequence is an undefined sequence
jpayne@68:             # of the same length
jpayne@68:             return self
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         return self.__class__(data)
jpayne@68: 
jpayne@68:     def complement_rna(self, inplace=False):
jpayne@68:         """Return the complement as an RNA sequence.
jpayne@68: 
jpayne@68:         >>> Seq("CGA").complement_rna()
jpayne@68:         Seq('GCU')
jpayne@68: 
jpayne@68:         Any T in the sequence is treated as a U:
jpayne@68: 
jpayne@68:         >>> Seq("CGAUT").complement_rna()
jpayne@68:         Seq('GCUAA')
jpayne@68: 
jpayne@68:         In contrast, ``complement`` returns a DNA sequence by default:
jpayne@68: 
jpayne@68:         >>> Seq("CGA").complement()
jpayne@68:         Seq('GCT')
jpayne@68: 
jpayne@68:         The sequence is modified in-place and returned if inplace is True:
jpayne@68: 
jpayne@68:         >>> my_seq = MutableSeq("CGA")
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('CGA')
jpayne@68:         >>> my_seq.complement_rna()
jpayne@68:         MutableSeq('GCU')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('CGA')
jpayne@68: 
jpayne@68:         >>> my_seq.complement_rna(inplace=True)
jpayne@68:         MutableSeq('GCU')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('GCU')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68:         """
jpayne@68:         try:
jpayne@68:             data = self._data.translate(_rna_complement_table)
jpayne@68:         except UndefinedSequenceError:
jpayne@68:             # complement of an undefined sequence is an undefined sequence
jpayne@68:             # of the same length
jpayne@68:             return self
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         return self.__class__(data)
jpayne@68: 
jpayne@68:     def reverse_complement(self, inplace=False):
jpayne@68:         """Return the reverse complement as a DNA sequence.
jpayne@68: 
jpayne@68:         >>> Seq("CGA").reverse_complement()
jpayne@68:         Seq('TCG')
jpayne@68: 
jpayne@68:         Any U in the sequence is treated as a T:
jpayne@68: 
jpayne@68:         >>> Seq("CGAUT").reverse_complement()
jpayne@68:         Seq('AATCG')
jpayne@68: 
jpayne@68:         In contrast, ``reverse_complement_rna`` returns an RNA sequence:
jpayne@68: 
jpayne@68:         >>> Seq("CGA").reverse_complement_rna()
jpayne@68:         Seq('UCG')
jpayne@68: 
jpayne@68:         The sequence is modified in-place and returned if inplace is True:
jpayne@68: 
jpayne@68:         >>> my_seq = MutableSeq("CGA")
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('CGA')
jpayne@68:         >>> my_seq.reverse_complement()
jpayne@68:         MutableSeq('TCG')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('CGA')
jpayne@68: 
jpayne@68:         >>> my_seq.reverse_complement(inplace=True)
jpayne@68:         MutableSeq('TCG')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('TCG')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``reverse_complement`` is called on a ``Seq`` object with
jpayne@68:         ``inplace=True``.
jpayne@68:         """
jpayne@68:         try:
jpayne@68:             data = self._data.translate(_dna_complement_table)
jpayne@68:         except UndefinedSequenceError:
jpayne@68:             # reverse complement of an undefined sequence is an undefined sequence
jpayne@68:             # of the same length
jpayne@68:             return self
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[::-1] = data
jpayne@68:             return self
jpayne@68:         return self.__class__(data[::-1])
jpayne@68: 
jpayne@68:     def reverse_complement_rna(self, inplace=False):
jpayne@68:         """Return the reverse complement as an RNA sequence.
jpayne@68: 
jpayne@68:         >>> Seq("CGA").reverse_complement_rna()
jpayne@68:         Seq('UCG')
jpayne@68: 
jpayne@68:         Any T in the sequence is treated as a U:
jpayne@68: 
jpayne@68:         >>> Seq("CGAUT").reverse_complement_rna()
jpayne@68:         Seq('AAUCG')
jpayne@68: 
jpayne@68:         In contrast, ``reverse_complement`` returns a DNA sequence:
jpayne@68: 
jpayne@68:         >>> Seq("CGA").reverse_complement()
jpayne@68:         Seq('TCG')
jpayne@68: 
jpayne@68:         The sequence is modified in-place and returned if inplace is True:
jpayne@68: 
jpayne@68:         >>> my_seq = MutableSeq("CGA")
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('CGA')
jpayne@68:         >>> my_seq.reverse_complement_rna()
jpayne@68:         MutableSeq('UCG')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('CGA')
jpayne@68: 
jpayne@68:         >>> my_seq.reverse_complement_rna(inplace=True)
jpayne@68:         MutableSeq('UCG')
jpayne@68:         >>> my_seq
jpayne@68:         MutableSeq('UCG')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``reverse_complement_rna`` is called on a ``Seq`` object with
jpayne@68:         ``inplace=True``.
jpayne@68:         """
jpayne@68:         try:
jpayne@68:             data = self._data.translate(_rna_complement_table)
jpayne@68:         except UndefinedSequenceError:
jpayne@68:             # reverse complement of an undefined sequence is an undefined sequence
jpayne@68:             # of the same length
jpayne@68:             return self
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[::-1] = data
jpayne@68:             return self
jpayne@68:         return self.__class__(data[::-1])
jpayne@68: 
jpayne@68:     def transcribe(self, inplace=False):
jpayne@68:         """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object.
jpayne@68: 
jpayne@68:         Following the usual convention, the sequence is interpreted as the
jpayne@68:         coding strand of the DNA double helix, not the template strand. This
jpayne@68:         means we can get the RNA sequence just by switching T to U.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
jpayne@68:         >>> coding_dna
jpayne@68:         Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
jpayne@68:         >>> coding_dna.transcribe()
jpayne@68:         Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
jpayne@68: 
jpayne@68:         The sequence is modified in-place and returned if inplace is True:
jpayne@68: 
jpayne@68:         >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
jpayne@68:         >>> sequence
jpayne@68:         MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
jpayne@68:         >>> sequence.transcribe()
jpayne@68:         MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
jpayne@68:         >>> sequence
jpayne@68:         MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
jpayne@68: 
jpayne@68:         >>> sequence.transcribe(inplace=True)
jpayne@68:         MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
jpayne@68:         >>> sequence
jpayne@68:         MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68: 
jpayne@68:         Trying to transcribe an RNA sequence has no effect.
jpayne@68:         If you have a nucleotide sequence which might be DNA or RNA
jpayne@68:         (or even a mixture), calling the transcribe method will ensure
jpayne@68:         any T becomes U.
jpayne@68: 
jpayne@68:         Trying to transcribe a protein sequence will replace any
jpayne@68:         T for Threonine with U for Selenocysteine, which has no
jpayne@68:         biologically plausible rational.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_protein = Seq("MAIVMGRT")
jpayne@68:         >>> my_protein.transcribe()
jpayne@68:         Seq('MAIVMGRU')
jpayne@68:         """
jpayne@68:         data = self._data.replace(b"T", b"U").replace(b"t", b"u")
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         return self.__class__(data)
jpayne@68: 
jpayne@68:     def back_transcribe(self, inplace=False):
jpayne@68:         """Return the DNA sequence from an RNA sequence by creating a new Seq object.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
jpayne@68:         >>> messenger_rna
jpayne@68:         Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
jpayne@68:         >>> messenger_rna.back_transcribe()
jpayne@68:         Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
jpayne@68: 
jpayne@68:         The sequence is modified in-place and returned if inplace is True:
jpayne@68: 
jpayne@68:         >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
jpayne@68:         >>> sequence
jpayne@68:         MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
jpayne@68:         >>> sequence.back_transcribe()
jpayne@68:         MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
jpayne@68:         >>> sequence
jpayne@68:         MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
jpayne@68: 
jpayne@68:         >>> sequence.back_transcribe(inplace=True)
jpayne@68:         MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
jpayne@68:         >>> sequence
jpayne@68:         MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68: 
jpayne@68:         Trying to back-transcribe DNA has no effect, If you have a nucleotide
jpayne@68:         sequence which might be DNA or RNA (or even a mixture), calling the
jpayne@68:         back-transcribe method will ensure any U becomes T.
jpayne@68: 
jpayne@68:         Trying to back-transcribe a protein sequence will replace any U for
jpayne@68:         Selenocysteine with T for Threonine, which is biologically meaningless.
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_protein = Seq("MAIVMGRU")
jpayne@68:         >>> my_protein.back_transcribe()
jpayne@68:         Seq('MAIVMGRT')
jpayne@68:         """
jpayne@68:         data = self._data.replace(b"U", b"T").replace(b"u", b"t")
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         return self.__class__(data)
jpayne@68: 
jpayne@68:     def join(self, other):
jpayne@68:         """Return a merge of the sequences in other, spaced by the sequence from self.
jpayne@68: 
jpayne@68:         Accepts a Seq object, MutableSeq object, or string (and iterates over
jpayne@68:         the letters), or an iterable containing Seq, MutableSeq, or string
jpayne@68:         objects. These arguments will be concatenated with the calling sequence
jpayne@68:         as the spacer:
jpayne@68: 
jpayne@68:         >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
jpayne@68:         >>> concatenated
jpayne@68:         Seq('AAANNNNNTTTNNNNNPPP')
jpayne@68: 
jpayne@68:         Joining the letters of a single sequence:
jpayne@68: 
jpayne@68:         >>> Seq('NNNNN').join(Seq("ACGT"))
jpayne@68:         Seq('ANNNNNCNNNNNGNNNNNT')
jpayne@68:         >>> Seq('NNNNN').join("ACGT")
jpayne@68:         Seq('ANNNNNCNNNNNGNNNNNT')
jpayne@68:         """
jpayne@68:         if isinstance(other, _SeqAbstractBaseClass):
jpayne@68:             return self.__class__(str(self).join(str(other)))
jpayne@68:         elif isinstance(other, str):
jpayne@68:             return self.__class__(str(self).join(other))
jpayne@68: 
jpayne@68:         from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports
jpayne@68: 
jpayne@68:         if isinstance(other, SeqRecord):
jpayne@68:             raise TypeError("Iterable cannot be a SeqRecord")
jpayne@68: 
jpayne@68:         for c in other:
jpayne@68:             if isinstance(c, SeqRecord):
jpayne@68:                 raise TypeError("Iterable cannot contain SeqRecords")
jpayne@68:             elif not isinstance(c, (str, _SeqAbstractBaseClass)):
jpayne@68:                 raise TypeError(
jpayne@68:                     "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
jpayne@68:                 )
jpayne@68:         return self.__class__(str(self).join([str(_) for _ in other]))
jpayne@68: 
jpayne@68:     def replace(self, old, new, inplace=False):
jpayne@68:         """Return a copy with all occurrences of subsequence old replaced by new.
jpayne@68: 
jpayne@68:         >>> s = Seq("ACGTAACCGGTT")
jpayne@68:         >>> t = s.replace("AC", "XYZ")
jpayne@68:         >>> s
jpayne@68:         Seq('ACGTAACCGGTT')
jpayne@68:         >>> t
jpayne@68:         Seq('XYZGTAXYZCGGTT')
jpayne@68: 
jpayne@68:         For mutable sequences, passing inplace=True will modify the sequence in place:
jpayne@68: 
jpayne@68:         >>> m = MutableSeq("ACGTAACCGGTT")
jpayne@68:         >>> t = m.replace("AC", "XYZ")
jpayne@68:         >>> m
jpayne@68:         MutableSeq('ACGTAACCGGTT')
jpayne@68:         >>> t
jpayne@68:         MutableSeq('XYZGTAXYZCGGTT')
jpayne@68: 
jpayne@68:         >>> m = MutableSeq("ACGTAACCGGTT")
jpayne@68:         >>> t = m.replace("AC", "XYZ", inplace=True)
jpayne@68:         >>> m
jpayne@68:         MutableSeq('XYZGTAXYZCGGTT')
jpayne@68:         >>> t
jpayne@68:         MutableSeq('XYZGTAXYZCGGTT')
jpayne@68: 
jpayne@68:         As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@68:         ``replace`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@68:         """
jpayne@68:         if isinstance(old, _SeqAbstractBaseClass):
jpayne@68:             old = bytes(old)
jpayne@68:         elif isinstance(old, str):
jpayne@68:             old = old.encode("ASCII")
jpayne@68:         if isinstance(new, _SeqAbstractBaseClass):
jpayne@68:             new = bytes(new)
jpayne@68:         elif isinstance(new, str):
jpayne@68:             new = new.encode("ASCII")
jpayne@68:         data = self._data.replace(old, new)
jpayne@68:         if inplace:
jpayne@68:             if not isinstance(self._data, bytearray):
jpayne@68:                 raise TypeError("Sequence is immutable")
jpayne@68:             self._data[:] = data
jpayne@68:             return self
jpayne@68:         return self.__class__(data)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def defined(self):
jpayne@68:         """Return True if the sequence is defined, False if undefined or partially defined.
jpayne@68: 
jpayne@68:         Zero-length sequences are always considered to be defined.
jpayne@68:         """
jpayne@68:         if isinstance(self._data, (bytes, bytearray)):
jpayne@68:             return True
jpayne@68:         else:
jpayne@68:             return self._data.defined
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def defined_ranges(self):
jpayne@68:         """Return a tuple of the ranges where the sequence contents is defined.
jpayne@68: 
jpayne@68:         The return value has the format ((start1, end1), (start2, end2), ...).
jpayne@68:         """
jpayne@68:         if isinstance(self._data, (bytes, bytearray)):
jpayne@68:             length = len(self)
jpayne@68:             if length > 0:
jpayne@68:                 return ((0, length),)
jpayne@68:             else:
jpayne@68:                 return ()
jpayne@68:         else:
jpayne@68:             return self._data.defined_ranges
jpayne@68: 
jpayne@68: 
jpayne@68: class Seq(_SeqAbstractBaseClass):
jpayne@68:     """Read-only sequence object (essentially a string with biological methods).
jpayne@68: 
jpayne@68:     Like normal python strings, our basic sequence object is immutable.
jpayne@68:     This prevents you from doing my_seq[5] = "A" for example, but does allow
jpayne@68:     Seq objects to be used as dictionary keys.
jpayne@68: 
jpayne@68:     The Seq object provides a number of string like methods (such as count,
jpayne@68:     find, split and strip).
jpayne@68: 
jpayne@68:     The Seq object also provides some biological methods, such as complement,
jpayne@68:     reverse_complement, transcribe, back_transcribe and translate (which are
jpayne@68:     not applicable to protein sequences).
jpayne@68:     """
jpayne@68: 
jpayne@68:     _data: Union[bytes, SequenceDataAbstractBaseClass]
jpayne@68: 
jpayne@68:     def __init__(
jpayne@68:         self,
jpayne@68:         data: Union[
jpayne@68:             str,
jpayne@68:             bytes,
jpayne@68:             bytearray,
jpayne@68:             _SeqAbstractBaseClass,
jpayne@68:             SequenceDataAbstractBaseClass,
jpayne@68:             dict,
jpayne@68:             None,
jpayne@68:         ],
jpayne@68:         length: Optional[int] = None,
jpayne@68:     ):
jpayne@68:         """Create a Seq object.
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - data - Sequence, required (string)
jpayne@68:          - length - Sequence length, used only if data is None or a dictionary (integer)
jpayne@68: 
jpayne@68:         You will typically use Bio.SeqIO to read in sequences from files as
jpayne@68:         SeqRecord objects, whose sequence will be exposed as a Seq object via
jpayne@68:         the seq property.
jpayne@68: 
jpayne@68:         However, you can also create a Seq object directly:
jpayne@68: 
jpayne@68:         >>> from Bio.Seq import Seq
jpayne@68:         >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
jpayne@68:         >>> my_seq
jpayne@68:         Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
jpayne@68:         >>> print(my_seq)
jpayne@68:         MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
jpayne@68: 
jpayne@68:         To create a Seq object with for a sequence of known length but
jpayne@68:         unknown sequence contents, use None for the data argument and pass
jpayne@68:         the sequence length for the length argument. Trying to access the
jpayne@68:         sequence contents of a Seq object created in this way will raise
jpayne@68:         an UndefinedSequenceError:
jpayne@68: 
jpayne@68:         >>> my_undefined_sequence = Seq(None, 20)
jpayne@68:         >>> my_undefined_sequence
jpayne@68:         Seq(None, length=20)
jpayne@68:         >>> len(my_undefined_sequence)
jpayne@68:         20
jpayne@68:         >>> print(my_undefined_sequence)
jpayne@68:         Traceback (most recent call last):
jpayne@68:         ...
jpayne@68:         Bio.Seq.UndefinedSequenceError: Sequence content is undefined
jpayne@68: 
jpayne@68:         If the sequence contents is known for parts of the sequence only, use
jpayne@68:         a dictionary for the data argument to pass the known sequence segments:
jpayne@68: 
jpayne@68:         >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
jpayne@68:         >>> my_partially_defined_sequence
jpayne@68:         Seq({3: 'ACGT'}, length=10)
jpayne@68:         >>> len(my_partially_defined_sequence)
jpayne@68:         10
jpayne@68:         >>> print(my_partially_defined_sequence)
jpayne@68:         Traceback (most recent call last):
jpayne@68:         ...
jpayne@68:         Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
jpayne@68:         >>> my_partially_defined_sequence[3:7]
jpayne@68:         Seq('ACGT')
jpayne@68:         >>> print(my_partially_defined_sequence[3:7])
jpayne@68:         ACGT
jpayne@68:         """
jpayne@68:         if data is None:
jpayne@68:             if length is None:
jpayne@68:                 raise ValueError("length must not be None if data is None")
jpayne@68:             elif length == 0:
jpayne@68:                 self._data = b""
jpayne@68:             elif length < 0:
jpayne@68:                 raise ValueError("length must not be negative.")
jpayne@68:             else:
jpayne@68:                 self._data = _UndefinedSequenceData(length)
jpayne@68:         elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
jpayne@68:             self._data = data
jpayne@68:         elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
jpayne@68:             self._data = bytes(data)
jpayne@68:         elif isinstance(data, str):
jpayne@68:             self._data = bytes(data, encoding="ASCII")
jpayne@68:         elif isinstance(data, dict):
jpayne@68:             if length is None:
jpayne@68:                 raise ValueError("length must not be None if data is a dictionary")
jpayne@68:             elif length == 0:
jpayne@68:                 self._data = b""
jpayne@68:             elif length < 0:
jpayne@68:                 raise ValueError("length must not be negative.")
jpayne@68:             else:
jpayne@68:                 current = 0  # not needed here, but it keeps mypy happy
jpayne@68:                 end = -1
jpayne@68:                 starts = sorted(data.keys())
jpayne@68:                 _data: Dict[int, bytes] = {}
jpayne@68:                 for start in starts:
jpayne@68:                     seq = data[start]
jpayne@68:                     if isinstance(seq, str):
jpayne@68:                         seq = bytes(seq, encoding="ASCII")
jpayne@68:                     else:
jpayne@68:                         try:
jpayne@68:                             seq = bytes(seq)
jpayne@68:                         except Exception:
jpayne@68:                             raise ValueError("Expected bytes-like objects or strings")
jpayne@68:                     if start < end:
jpayne@68:                         raise ValueError("Sequence data are overlapping.")
jpayne@68:                     elif start == end:
jpayne@68:                         _data[current] += seq  # noqa: F821
jpayne@68:                     else:
jpayne@68:                         _data[start] = seq
jpayne@68:                         current = start
jpayne@68:                     end = start + len(seq)
jpayne@68:                 if end > length:
jpayne@68:                     raise ValueError(
jpayne@68:                         "Provided sequence data extend beyond sequence length."
jpayne@68:                     )
jpayne@68:                 elif end == length and current == 0:
jpayne@68:                     # sequence is fully defined
jpayne@68:                     self._data = _data[current]
jpayne@68:                 else:
jpayne@68:                     self._data = _PartiallyDefinedSequenceData(length, _data)
jpayne@68:         else:
jpayne@68:             raise TypeError(
jpayne@68:                 "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
jpayne@68:             )
jpayne@68: 
jpayne@68:     def __hash__(self):
jpayne@68:         """Hash of the sequence as a string for comparison.
jpayne@68: 
jpayne@68:         See Seq object comparison documentation (method ``__eq__`` in
jpayne@68:         particular) as this has changed in Biopython 1.65. Older versions
jpayne@68:         would hash on object identity.
jpayne@68:         """
jpayne@68:         return hash(self._data)
jpayne@68: 
jpayne@68: 
class MutableSeq(_SeqAbstractBaseClass):
    """A sequence object whose letters can be edited in place.

    Normal python strings and our basic sequence object (the Seq class) are
    immutable; the MutableSeq, in contrast, lets you change the sequence in
    place. The price is that a MutableSeq object cannot be used as a
    dictionary key.

    >>> from Bio.Seq import MutableSeq
    >>> my_seq = MutableSeq("ACTCGTCGTCG")
    >>> my_seq
    MutableSeq('ACTCGTCGTCG')
    >>> my_seq[5]
    'T'
    >>> my_seq[5] = "A"
    >>> my_seq
    MutableSeq('ACTCGACGTCG')
    >>> my_seq[5]
    'A'
    >>> my_seq[5:8] = "NNN"
    >>> my_seq
    MutableSeq('ACTCGNNNTCG')
    >>> len(my_seq)
    11

    Note that the MutableSeq object does not support as many string-like
    or biological methods as the Seq object.
    """

    def __init__(self, data):
        """Create a MutableSeq object.

        Arguments:
         - data - a bytearray (stored as given, without copying), a bytes
           object, an ASCII string, a Seq object, or another MutableSeq
           object (whose data are copied).

        Raises a TypeError for any other type.
        """
        if isinstance(data, bytearray):
            buffer = data  # stored as-is; the caller's bytearray is shared
        elif isinstance(data, bytes):
            buffer = bytearray(data)
        elif isinstance(data, str):
            buffer = bytearray(data, "ASCII")
        elif isinstance(data, MutableSeq):
            buffer = data._data[:]  # slicing yields an independent copy
        elif isinstance(data, Seq):
            # Go through bytes() so that nothing is assumed about how the
            # Seq (sub)class stores its data internally
            buffer = bytearray(bytes(data))
        else:
            raise TypeError(
                "data should be a string, bytearray object, Seq object, or a "
                "MutableSeq object"
            )
        self._data = buffer

    def __setitem__(self, index, value):
        """Replace a single letter or a subsequence by value.

        With an integer index, value must be a single letter. With any other
        index (such as a slice), value must be a string, a Seq object, or a
        MutableSeq object; other types raise a TypeError.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq[0] = 'T'
        >>> my_seq
        MutableSeq('TCTCGACGTCG')
        """
        if isinstance(index, numbers.Integral):
            # Single position: store the code point of the new letter
            self._data[index] = ord(value)
            return
        # Sub-sequence replacement: convert value to a bytes-like object
        if isinstance(value, MutableSeq):
            replacement = value._data
        elif isinstance(value, Seq):
            replacement = bytes(value)
        elif isinstance(value, str):
            replacement = value.encode("ASCII")
        else:
            raise TypeError(f"received unexpected type '{type(value).__name__}'")
        self._data[index] = replacement

    def __delitem__(self, index):
        """Delete a single letter or a subsequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> del my_seq[0]
        >>> my_seq
        MutableSeq('CTCGACGTCG')
        """
        # index may be an integer (one letter) or a slice (a subsequence)
        del self._data[index]

    def append(self, c):
        """Add a single letter to the end of the mutable sequence object.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.append('A')
        >>> my_seq
        MutableSeq('ACTCGACGTCGA')

        No return value.
        """
        encoded = c.encode("ASCII")
        self._data.append(ord(encoded))

    def insert(self, i, c):
        """Insert a single letter into the mutable sequence object at index i.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.insert(0,'A')
        >>> my_seq
        MutableSeq('AACTCGACGTCG')
        >>> my_seq.insert(8,'G')
        >>> my_seq
        MutableSeq('AACTCGACGGTCG')

        No return value.
        """
        encoded = c.encode("ASCII")
        self._data.insert(i, ord(encoded))

    def pop(self, i=(-1)):
        """Remove the single letter at the given index and return it.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.pop()
        'G'
        >>> my_seq
        MutableSeq('ACTCGACGTC')
        >>> my_seq.pop()
        'C'
        >>> my_seq
        MutableSeq('ACTCGACGT')

        Returns the removed letter; by default this is the last letter of
        the sequence.
        """
        letter = self._data[i]
        del self._data[i]
        return chr(letter)

    def remove(self, item):
        """Remove the first occurrence of a single letter from the sequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.remove('C')
        >>> my_seq
        MutableSeq('ATCGACGTCG')
        >>> my_seq.remove('A')
        >>> my_seq
        MutableSeq('TCGACGTCG')

        No return value. Raises a ValueError if the letter is not found.
        """
        codepoint = ord(item)
        try:
            self._data.remove(codepoint)
        except ValueError:
            # Re-raise with a message that refers to MutableSeq
            raise ValueError("value not found in MutableSeq") from None

    def reverse(self):
        """Reverse the mutable sequence in place.

        No return value.
        """
        self._data.reverse()

    def extend(self, other):
        """Add a sequence to the end of the mutable sequence object.

        The argument must be a string, a Seq object, or a MutableSeq object;
        other types raise a TypeError.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.extend('A')
        >>> my_seq
        MutableSeq('ACTCGACGTCGA')
        >>> my_seq.extend('TTT')
        >>> my_seq
        MutableSeq('ACTCGACGTCGATTT')

        No return value.
        """
        if isinstance(other, MutableSeq):
            addition = other._data
        elif isinstance(other, Seq):
            addition = bytes(other)
        elif isinstance(other, str):
            addition = other.encode("ASCII")
        else:
            raise TypeError("expected a string, Seq or MutableSeq")
        self._data.extend(addition)
jpayne@68: 
jpayne@68: 
class UndefinedSequenceError(ValueError):
    """Raised when sequence contents are requested but are not defined.

    This is a subclass of ValueError.
    """
jpayne@68: 
jpayne@68: 
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores the length of a sequence with an undefined sequence contents (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but an unknown sequence contents.
    Calling __len__ returns the sequence length, calling __getitem__ raises an
    UndefinedSequenceError except for requests of zero size, for which it
    returns an empty bytes object.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Initialize the object with the sequence length.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[bytes, "_UndefinedSequenceData"]:
        """Return the undefined data for a slice; raise for a single position.

        A slice selecting no positions returns an empty bytes object; any
        other slice returns an _UndefinedSequenceData object of the selected
        size. Any key that is not a slice raises an UndefinedSequenceError.
        """
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            return _UndefinedSequenceData(size)
        else:
            raise UndefinedSequenceError("Sequence content is undefined")

    def __len__(self):
        """Return the sequence length."""
        return self._length

    def __bytes__(self):
        """Raise an UndefinedSequenceError, as there are no contents to return."""
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        """Return the concatenation of this undefined sequence and other.

        The result is an _UndefinedSequenceData object if other is undefined
        as well or is empty, and a _PartiallyDefinedSequenceData object if
        other is a non-empty defined sequence. NotImplemented is returned if
        converting other to bytes raises an UndefinedSequenceError for any
        other kind of object.
        """
        length = len(self) + len(other)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                return _UndefinedSequenceData(length)
            else:
                return NotImplemented
                # _PartiallyDefinedSequenceData.__radd__ will handle this
        else:
            if not other:
                # Appending an empty sequence defines nothing; do not store
                # an empty segment located at the very end of the sequence.
                return _UndefinedSequenceData(length)
            data = {len(self): other}
            return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        """Return the concatenation of other and this undefined sequence.

        The result is a _PartiallyDefinedSequenceData object in which other
        is the defined segment at position 0, or an _UndefinedSequenceData
        object if other is empty.
        """
        prefix = bytes(other)
        length = len(other) + len(self)
        if not prefix:
            # Prepending an empty sequence defines nothing
            return _UndefinedSequenceData(length)
        return _PartiallyDefinedSequenceData(length, {0: prefix})

    def upper(self):
        """Return an upper case copy of the sequence."""
        # An upper case copy of an undefined sequence is an undefined
        # sequence of the same length
        return _UndefinedSequenceData(self._length)

    def lower(self):
        """Return a lower case copy of the sequence."""
        # A lower case copy of an undefined sequence is an undefined
        # sequence of the same length
        return _UndefinedSequenceData(self._length)

    def isupper(self):
        """Raise an UndefinedSequenceError.

        The character case cannot be determined for an undefined sequence.
        """
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Raise an UndefinedSequenceError.

        The character case cannot be determined for an undefined sequence.
        """
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new.

        Raises an UndefinedSequenceError if old and new differ in length.
        """
        # Replacing substring old by new in an undefined sequence will result
        # in an undefined sequence of the same length, if old and new have the
        # same number of characters.
        if len(old) != len(new):
            raise UndefinedSequenceError("Sequence content is undefined")
        return _UndefinedSequenceData(self._length)

    @property
    def defined(self):
        """Return False, as the sequence is not defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        As the sequence contents of an _UndefinedSequenceData object is fully
        undefined, the return value is always an empty tuple.
        """
        return ()
jpayne@68: 
jpayne@68: 
class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores a sequence of known length with partially defined contents (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but with a sequence contents that is only
    partially known.
    Calling __len__ returns the sequence length, calling __getitem__ returns
    the sequence contents if known, otherwise an UndefinedSequenceError is
    raised.

    The defined segments are stored in the dictionary ``_data``, which maps
    the start position of each segment to a bytes-like object with its
    contents.
    """

    __slots__ = ("_length", "_data")

    def __init__(self, length, data):
        """Initialize with the sequence length and defined sequence segments.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        self._data = data
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[int, bytes, SequenceDataAbstractBaseClass]:
        """Return the sequence contents selected by key.

        For a slice, the return value is an empty bytes object if no position
        is selected, an _UndefinedSequenceData object if none of the selected
        positions is defined, a bytes-like object if all of them are defined,
        and a _PartiallyDefinedSequenceData object otherwise.

        For an integer, the item stored at that position is returned if it
        is defined, and an UndefinedSequenceError is raised if it is not.
        Negative integers count from the end of the sequence; an IndexError
        is raised if the position falls outside of the sequence.
        """
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            data = {}
            for s, d in self._data.items():
                # the positions selected by key, expressed relative to the
                # start s of this segment
                indices = range(-s, -s + self._length)[key]
                e: Optional[int] = indices.stop
                assert e is not None
                if step > 0:
                    if e <= 0:
                        continue
                    if indices.start < 0:
                        s = indices.start % step
                    else:
                        s = indices.start
                else:  # step < 0
                    if e < 0:
                        e = None
                    end = len(d) - 1
                    if indices.start > end:
                        s = end + (indices.start - end) % step
                    else:
                        s = indices.start
                    if s < 0:
                        continue
                start = (s - indices.start) // step
                d = d[s:e:step]
                if d:
                    data[start] = d
            if len(data) == 0:  # Fully undefined sequence
                return _UndefinedSequenceData(size)
            # merge adjacent sequence segments
            end = -1
            previous = 0  # not needed here, but it keeps flake happy
            items = data.items()
            data = {}
            for start, seq in items:
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
            if len(data) == 1:
                seq = data.get(0)
                if seq is not None and len(seq) == size:
                    return seq  # Fully defined sequence; return bytes
            if step < 0:
                # use this after we drop Python 3.7:
                # data = {start: data[start] for start in reversed(data)}
                # use this as long as we support Python 3.7:
                data = {start: data[start] for start in reversed(list(data.keys()))}
            return _PartiallyDefinedSequenceData(size, data)
        # Integer index; negative values count from the end of the sequence
        index = key + self._length if key < 0 else key
        if index < 0 or self._length <= index:
            raise IndexError("sequence index out of range")
        for start, seq in self._data.items():
            if start <= index and index < start + len(seq):
                return seq[index - start]
        raise UndefinedSequenceError("Sequence at position %d is undefined" % index)

    def __len__(self):
        """Return the sequence length."""
        return self._length

    def __bytes__(self):
        """Raise an UndefinedSequenceError, as the contents are incomplete."""
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def __add__(self, other):
        """Return the concatenation of this sequence and other.

        A defined segment at the start of other is merged into the last
        segment of this sequence if that segment extends to the end of this
        sequence. An empty other adds no segment.
        """
        length = len(self) + len(other)
        data = dict(self._data)
        items = list(self._data.items())
        if items:
            start, seq = items[-1]
            end = start + len(seq)
        else:
            # no defined segment at all, so there is nothing to merge with
            start = end = -1
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                pass
            elif isinstance(other, _PartiallyDefinedSequenceData):
                other_items = list(other._data.items())
                if end == len(self) and other_items:
                    other_start, other_seq = other_items.pop(0)
                    if other_start == 0:
                        data[start] += other_seq
                    else:
                        data[len(self) + other_start] = other_seq
                for other_start, other_seq in other_items:
                    data[len(self) + other_start] = other_seq
        else:
            if not other:
                pass  # an empty sequence does not add a defined segment
            elif end == len(self):
                data[start] += other
            else:
                data[len(self)] = other
        return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        """Return the concatenation of other and this sequence.

        If other is undefined, the segments of this sequence are shifted by
        the length of other. If other is defined and not empty, it becomes
        the segment at position 0, merged with a segment of this sequence
        that starts at position 0.
        """
        length = len(other) + len(self)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            data = {len(other) + start: seq for start, seq in self._data.items()}
        else:
            offset = len(other)
            # an empty prefix does not add a defined segment
            data = {0: other} if other else {}
            for start, seq in self._data.items():
                if start == 0 and other:
                    data[0] += seq
                else:
                    data[offset + start] = seq
        return _PartiallyDefinedSequenceData(length, data)

    def __mul__(self, other):
        """Return the sequence repeated other times.

        A segment at the start of one copy is merged into the segment that
        ends the previous copy if the two are adjacent. As for bytes objects,
        multiplying by zero or by a negative number gives an empty bytes
        object.
        """
        if other <= 0:
            return b""
        length = self._length
        data = {}
        end = -1
        previous = 0  # not needed here, but it keeps flake happy
        for i in range(other):
            offset = i * length
            for start, seq in self._data.items():
                start += offset
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
        return _PartiallyDefinedSequenceData(length * other, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        data = {start: seq.upper() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def lower(self):
        """Return a lower case copy of the sequence."""
        data = {start: seq.lower() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def isupper(self):
        """Raise an UndefinedSequenceError.

        The character case cannot be determined as the sequence contents is
        only partially defined.
        """
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def islower(self):
        """Raise an UndefinedSequenceError.

        The character case cannot be determined as the sequence contents is
        only partially defined.
        """
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        # NOTE(review): each segment is translated separately and keeps its
        # start position; the stored length is not adjusted for characters
        # removed through the delete argument.
        items = self._data.items()
        data = {start: seq.translate(table, delete) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new.

        Raises an UndefinedSequenceError if old and new differ in length.
        """
        # Replacing substring old by new in the undefined sequence segments
        # will result in an undefined sequence segment of the same length, if
        # old and new have the same number of characters. If not, an error is
        # raised, as the correct start positions cannot be calculated reliably.
        if len(old) != len(new):
            raise UndefinedSequenceError(
                "Sequence content is only partially defined; substring \n"
                "replacement cannot be performed reliably"
            )
        items = self._data.items()
        data = {start: seq.replace(old, new) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    @property
    def defined(self):
        """Return False, as the sequence is not fully defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        return tuple((start, start + len(seq)) for start, seq in self._data.items())
jpayne@68: 
jpayne@68: 
# The transcribe, back_transcribe, and translate functions are
# user-friendly versions of the corresponding Seq/MutableSeq methods.
# The functions work both on Seq objects, and on strings.
jpayne@68: 
jpayne@68: 
def transcribe(dna):
    """Transcribe a DNA sequence into RNA.

    By the usual convention the sequence is taken to be the coding strand
    of the DNA double helix rather than the template strand, so the RNA
    sequence is obtained simply by switching T to U.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> transcribe("ACTGN")
    'ACUGN'
    """
    if isinstance(dna, Seq):
        return dna.transcribe()
    if isinstance(dna, MutableSeq):
        # Convert to a Seq first, so that a new Seq object is returned
        return Seq(dna).transcribe()
    # Anything else is handled as a string: switch both letter cases
    upper_done = dna.replace("T", "U")
    return upper_done.replace("t", "u")
jpayne@68: 
jpayne@68: 
def back_transcribe(rna):
    """Back-transcribe an RNA sequence into DNA.

    The DNA sequence is obtained by switching U to T.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> back_transcribe("ACUGN")
    'ACTGN'
    """
    if isinstance(rna, Seq):
        return rna.back_transcribe()
    if isinstance(rna, MutableSeq):
        # Convert to a Seq first, so that a new Seq object is returned
        return Seq(rna).back_transcribe()
    # Anything else is handled as a string: switch both letter cases
    upper_done = rna.replace("U", "T")
    return upper_done.replace("u", "t")
jpayne@68: 
jpayne@68: 
jpayne@68: def _translate_str(
jpayne@68:     sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
jpayne@68: ):
jpayne@68:     """Translate nucleotide string into a protein string (PRIVATE).
jpayne@68: 
jpayne@68:     Arguments:
jpayne@68:      - sequence - a string
jpayne@68:      - table - Which codon table to use?  This can be either a name (string),
jpayne@68:        an NCBI identifier (integer), or a CodonTable object (useful for
jpayne@68:        non-standard genetic codes).  This defaults to the "Standard" table.
jpayne@68:      - stop_symbol - a single character string, what to use for terminators.
jpayne@68:      - to_stop - boolean, should translation terminate at the first
jpayne@68:        in frame stop codon?  If there is no in-frame stop codon
jpayne@68:        then translation continues to the end.
jpayne@68:      - pos_stop - a single character string for a possible stop codon
jpayne@68:        (e.g. TAN or NNN)
jpayne@68:      - cds - Boolean, indicates this is a complete CDS.  If True, this
jpayne@68:        checks the sequence starts with a valid alternative start
jpayne@68:        codon (which will be translated as methionine, M), that the
jpayne@68:        sequence length is a multiple of three, and that there is a
jpayne@68:        single in frame stop codon at the end (this will be excluded
jpayne@68:        from the protein sequence, regardless of the to_stop option).
jpayne@68:        If these tests fail, an exception is raised.
jpayne@68:      - gap - Single character string to denote symbol used for gaps.
jpayne@68:        Defaults to None.
jpayne@68: 
jpayne@68:     Returns a string.
jpayne@68: 
jpayne@68:     e.g.
jpayne@68: 
jpayne@68:     >>> from Bio.Data import CodonTable
jpayne@68:     >>> table = CodonTable.ambiguous_dna_by_id[1]
jpayne@68:     >>> _translate_str("AAA", table)
jpayne@68:     'K'
jpayne@68:     >>> _translate_str("TAR", table)
jpayne@68:     '*'
jpayne@68:     >>> _translate_str("TAN", table)
jpayne@68:     'X'
jpayne@68:     >>> _translate_str("TAN", table, pos_stop="@")
jpayne@68:     '@'
jpayne@68:     >>> _translate_str("TA?", table)
jpayne@68:     Traceback (most recent call last):
jpayne@68:        ...
jpayne@68:     Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid
jpayne@68: 
jpayne@68:     In a change to older versions of Biopython, partial codons are now
jpayne@68:     always regarded as an error (previously only checked if cds=True)
jpayne@68:     and will trigger a warning (likely to become an exception in a
jpayne@68:     future release).
jpayne@68: 
jpayne@68:     If **cds=True**, the start and stop codons are checked, and the start
jpayne@68:     codon will be translated as methionine. The sequence must be a
jpayne@68:     whole number of codons.
jpayne@68: 
jpayne@68:     >>> _translate_str("ATGCCCTAG", table, cds=True)
jpayne@68:     'MP'
jpayne@68:     >>> _translate_str("AAACCCTAG", table, cds=True)
jpayne@68:     Traceback (most recent call last):
jpayne@68:        ...
jpayne@68:     Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
jpayne@68:     >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
jpayne@68:     Traceback (most recent call last):
jpayne@68:        ...
jpayne@68:     Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found.
jpayne@68:     """
jpayne@68:     try:
jpayne@68:         table_id = int(table)
jpayne@68:     except ValueError:
jpayne@68:         # Assume it's a table name
jpayne@68:         # The same table can be used for RNA or DNA
jpayne@68:         try:
jpayne@68:             codon_table = CodonTable.ambiguous_generic_by_name[table]
jpayne@68:         except KeyError:
jpayne@68:             if isinstance(table, str):
jpayne@68:                 raise ValueError(
jpayne@68:                     "The Bio.Seq translate methods and function DO NOT "
jpayne@68:                     "take a character string mapping table like the python "
jpayne@68:                     "string object's translate method. "
jpayne@68:                     "Use str(my_seq).translate(...) instead."
jpayne@68:                 ) from None
jpayne@68:             else:
jpayne@68:                 raise TypeError("table argument must be integer or string") from None
jpayne@68:     except (AttributeError, TypeError):
jpayne@68:         # Assume it's a CodonTable object
jpayne@68:         if isinstance(table, CodonTable.CodonTable):
jpayne@68:             codon_table = table
jpayne@68:         else:
jpayne@68:             raise ValueError("Bad table argument") from None
jpayne@68:     else:
jpayne@68:         # Assume it's a table ID
jpayne@68:         # The same table can be used for RNA or DNA
jpayne@68:         codon_table = CodonTable.ambiguous_generic_by_id[table_id]
jpayne@68:     sequence = sequence.upper()
jpayne@68:     amino_acids = []
jpayne@68:     forward_table = codon_table.forward_table
jpayne@68:     stop_codons = codon_table.stop_codons
jpayne@68:     if codon_table.nucleotide_alphabet is not None:
jpayne@68:         valid_letters = set(codon_table.nucleotide_alphabet.upper())
jpayne@68:     else:
jpayne@68:         # Assume the worst case, ambiguous DNA or RNA:
jpayne@68:         valid_letters = set(
jpayne@68:             IUPACData.ambiguous_dna_letters.upper()
jpayne@68:             + IUPACData.ambiguous_rna_letters.upper()
jpayne@68:         )
jpayne@68:     n = len(sequence)
jpayne@68: 
jpayne@68:     # Check for tables with 'ambiguous' (dual-coding) stop codons:
jpayne@68:     dual_coding = [c for c in stop_codons if c in forward_table]
jpayne@68:     if dual_coding:
jpayne@68:         c = dual_coding[0]
jpayne@68:         if to_stop:
jpayne@68:             raise ValueError(
jpayne@68:                 "You cannot use 'to_stop=True' with this table as it contains"
jpayne@68:                 f" {len(dual_coding)} codon(s) which can be both STOP and an"
jpayne@68:                 f" amino acid (e.g. '{c}' -> '{forward_table[c]}' or STOP)."
jpayne@68:             )
jpayne@68:         warnings.warn(
jpayne@68:             f"This table contains {len(dual_coding)} codon(s) which code(s) for"
jpayne@68:             f" both STOP and an amino acid (e.g. '{c}' -> '{forward_table[c]}'"
jpayne@68:             " or STOP). Such codons will be translated as amino acid.",
jpayne@68:             BiopythonWarning,
jpayne@68:         )
jpayne@68: 
jpayne@68:     if cds:
jpayne@68:         if str(sequence[:3]).upper() not in codon_table.start_codons:
jpayne@68:             raise CodonTable.TranslationError(
jpayne@68:                 f"First codon '{sequence[:3]}' is not a start codon"
jpayne@68:             )
jpayne@68:         if n % 3 != 0:
jpayne@68:             raise CodonTable.TranslationError(
jpayne@68:                 f"Sequence length {n} is not a multiple of three"
jpayne@68:             )
jpayne@68:         if str(sequence[-3:]).upper() not in stop_codons:
jpayne@68:             raise CodonTable.TranslationError(
jpayne@68:                 f"Final codon '{sequence[-3:]}' is not a stop codon"
jpayne@68:             )
jpayne@68:         # Don't translate the stop symbol, and manually translate the M
jpayne@68:         sequence = sequence[3:-3]
jpayne@68:         n -= 6
jpayne@68:         amino_acids = ["M"]
jpayne@68:     elif n % 3 != 0:
jpayne@68:         warnings.warn(
jpayne@68:             "Partial codon, len(sequence) not a multiple of three. "
jpayne@68:             "Explicitly trim the sequence or add trailing N before "
jpayne@68:             "translation. This may become an error in future.",
jpayne@68:             BiopythonWarning,
jpayne@68:         )
jpayne@68:     if gap is not None:
jpayne@68:         if not isinstance(gap, str):
jpayne@68:             raise TypeError("Gap character should be a single character string.")
jpayne@68:         elif len(gap) > 1:
jpayne@68:             raise ValueError("Gap character should be a single character string.")
jpayne@68: 
jpayne@68:     for i in range(0, n - n % 3, 3):
jpayne@68:         codon = sequence[i : i + 3]
jpayne@68:         try:
jpayne@68:             amino_acids.append(forward_table[codon])
jpayne@68:         except (KeyError, CodonTable.TranslationError):
jpayne@68:             if codon in codon_table.stop_codons:
jpayne@68:                 if cds:
jpayne@68:                     raise CodonTable.TranslationError(
jpayne@68:                         f"Extra in frame stop codon '{codon}' found."
jpayne@68:                     ) from None
jpayne@68:                 if to_stop:
jpayne@68:                     break
jpayne@68:                 amino_acids.append(stop_symbol)
jpayne@68:             elif valid_letters.issuperset(set(codon)):
jpayne@68:                 # Possible stop codon (e.g. NNN or TAN)
jpayne@68:                 amino_acids.append(pos_stop)
jpayne@68:             elif gap is not None and codon == gap * 3:
jpayne@68:                 # Gapped translation
jpayne@68:                 amino_acids.append(gap)
jpayne@68:             else:
jpayne@68:                 raise CodonTable.TranslationError(
jpayne@68:                     f"Codon '{codon}' is invalid"
jpayne@68:                 ) from None
jpayne@68:     return "".join(amino_acids)
jpayne@68: 
jpayne@68: 
def translate(
    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object. Given a Seq or
    MutableSeq, returns a Seq object.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable object
       (useful for non-standard genetic codes).  Defaults to the "Standard"
       table.
     - stop_symbol - Single character string, what to use for any
       terminators, defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons
       (translated as the specified stop_symbol).  If
       True, translation is terminated at the first in
       frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.  An explicitly given value is forwarded for
       strings, Seq and MutableSeq objects alike.  If left as None, a
       Seq or MutableSeq is translated with the gap handling that the
       ``translate`` method itself defaults to.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    It will however translate either DNA or RNA.

    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
    stop codons'. These are stop codons with unambiguous sequence but which
    have a context dependent coding as STOP or as amino acid. With these tables
    'to_stop' must be False (otherwise a ValueError is raised). The dual
    coding codons will always be translated as amino acid, except for
    'cds=True', where the last codon will be translated as STOP.

    >>> coding_dna3 = "ATGGCACGGAAGTGA"
    >>> translate(coding_dna3)
    'MARK*'

    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
    'MARKW'

    It will however raise a BiopythonWarning (not shown).

    >>> translate(coding_dna3, table=27, cds=True)
    'MARK'

    >>> translate(coding_dna3, table=27, to_stop=True)
    Traceback (most recent call last):
       ...
    ValueError: You cannot use 'to_stop=True' with this table ...
    """
    if isinstance(sequence, Seq):
        seq_obj = sequence
    elif isinstance(sequence, MutableSeq):
        # The result is always a Seq object, so convert before translating.
        seq_obj = Seq(sequence)
    else:
        # Assume it's a string, return a string
        return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)

    if gap is None:
        # No gap requested by the caller: keep the historical call so the
        # method's own default gap handling is left untouched.
        return seq_obj.translate(table, stop_symbol, to_stop, cds)
    # Previously an explicit gap was silently dropped for Seq/MutableSeq input.
    # NOTE(review): relies on Seq.translate accepting a ``gap`` keyword - confirm.
    return seq_obj.translate(table, stop_symbol, to_stop, cds, gap=gap)
jpayne@68: 
jpayne@68: 
def reverse_complement(sequence, inplace=False):
    """Return the reverse complement as a DNA sequence.

    The return type follows the input type: a string gives a new string,
    a Seq gives a new Seq, a MutableSeq gives a new MutableSeq and a
    SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement(my_seq)
    'TCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement(my_seq)
    Seq('TCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> reverse_complement(Seq("CGAUT"))
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement("ACGTUacgtuXYZxyz")
    'zrxZRXaacgtAACGT'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq, inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement`` is called on one of them with
    ``inplace=True``.  The same applies to ``SeqRecord`` objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Sequence objects provide their own method; inplace is passed through.
        return sequence.reverse_complement(inplace)

    if not isinstance(sequence, SeqRecord):
        # Anything that is not a Seq, MutableSeq or SeqRecord is assumed
        # to be a plain string.
        if inplace:
            raise TypeError("strings are immutable")
        complemented = sequence.encode("ASCII").translate(_dna_complement_table)
        return complemented.decode("ASCII")[::-1]

    if inplace:
        raise TypeError("SeqRecords are immutable")
    return sequence.reverse_complement()
jpayne@68: 
jpayne@68: 
def reverse_complement_rna(sequence, inplace=False):
    """Return the reverse complement as an RNA sequence.

    The return type follows the input type: a string gives a new string,
    a Seq gives a new Seq, a MutableSeq gives a new MutableSeq and a
    SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement_rna(my_seq)
    'UCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement_rna(my_seq)
    Seq('UCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> reverse_complement(Seq("CGAUT"), inplace=False)
    Seq('AATCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement_rna("ACGTUacgtuXYZxyz")
    'zrxZRXaacguAACGU'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq, inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement_rna`` is called on one of them with
    ``inplace=True``.  The same applies to ``SeqRecord`` objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Sequence objects provide their own method; inplace is passed through.
        return sequence.reverse_complement_rna(inplace)

    if isinstance(sequence, SeqRecord):
        if not inplace:
            return sequence.reverse_complement_rna()
        raise TypeError("SeqRecords are immutable")

    # Anything else is assumed to be a plain string.
    if inplace:
        raise TypeError("strings are immutable")
    as_bytes = sequence.encode("ASCII")
    rna_text = as_bytes.translate(_rna_complement_table).decode("ASCII")
    return rna_text[::-1]
jpayne@68: 
jpayne@68: 
def complement(sequence, inplace=False):
    """Return the complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement(my_seq)
    'GCT'
    >>> my_seq = Seq("CGA")
    >>> complement(my_seq)
    Seq('GCT')
    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement("ACGTUacgtuXYZxyz")
    'TGCAAtgcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq, inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement`` is called on one of them with
    ``inplace=True``.  The same applies to ``SeqRecord`` objects.  For
    strings and ``SeqRecord`` objects any true value of ``inplace``
    triggers the error, not only the literal ``True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement()
    # Assume it's a string.
    # Test truthiness (not identity with True) so that this agrees with
    # reverse_complement, reverse_complement_rna and complement_rna; a
    # truthy request such as inplace=1 must not be silently ignored.
    if inplace:
        raise TypeError("strings are immutable")
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_dna_complement_table)
    return sequence.decode("ASCII")
jpayne@68: 
jpayne@68: 
def complement_rna(sequence, inplace=False):
    """Return the complement as an RNA sequence.

    The return type follows the input type: a string gives a new string,
    a Seq gives a new Seq, a MutableSeq gives a new MutableSeq and a
    SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> complement_rna(my_seq)
    'GCU'
    >>> my_seq = Seq("CGA")
    >>> complement_rna(my_seq)
    Seq('GCU')
    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement_rna("ACGTUacgtuXYZxyz")
    'UGCAAugcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq, inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement_rna`` is called on one of them with
    ``inplace=True``.  The same applies to ``SeqRecord`` objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Sequence objects provide their own method; inplace is passed through.
        return sequence.complement_rna(inplace)

    if not isinstance(sequence, SeqRecord):
        # Anything that is not a Seq, MutableSeq or SeqRecord is assumed
        # to be a plain string.
        if inplace:
            raise TypeError("strings are immutable")
        return sequence.encode("ASCII").translate(_rna_complement_table).decode("ASCII")

    if inplace:
        raise TypeError("SeqRecords are immutable")
    return sequence.complement_rna()
jpayne@68: 
jpayne@68: 
def _test():
    """Run the Bio.Seq module's doctests (PRIVATE).

    Prints a short progress message before and after the run.  Doctests
    are executed with ``IGNORE_EXCEPTION_DETAIL`` set.
    """
    print("Running doctests...")
    from doctest import IGNORE_EXCEPTION_DETAIL, testmod

    # testmod is called without a module argument, as in the original.
    testmod(optionflags=IGNORE_EXCEPTION_DETAIL)
    print("Done")


# Run the doctests only when this file is executed as a script.
if __name__ == "__main__":
    _test()