jpayne@68: # Copyright 2000 Andrew Dalke. jpayne@68: # Copyright 2000-2002 Brad Chapman. jpayne@68: # Copyright 2004-2005, 2010 by M de Hoon. jpayne@68: # Copyright 2007-2023 by Peter Cock. jpayne@68: # All rights reserved. jpayne@68: # jpayne@68: # This file is part of the Biopython distribution and governed by your jpayne@68: # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". jpayne@68: # Please see the LICENSE file that should have been included as part of this jpayne@68: # package. jpayne@68: """Provide objects to represent biological sequences. jpayne@68: jpayne@68: See also the Seq_ wiki and the chapter in our tutorial: jpayne@68: - `HTML Tutorial`_ jpayne@68: - `PDF Tutorial`_ jpayne@68: jpayne@68: .. _Seq: http://biopython.org/wiki/Seq jpayne@68: .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html jpayne@68: .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf jpayne@68: jpayne@68: """ jpayne@68: import array jpayne@68: import collections jpayne@68: import numbers jpayne@68: import warnings jpayne@68: jpayne@68: from abc import ABC jpayne@68: from abc import abstractmethod jpayne@68: from typing import overload, Optional, Union, Dict jpayne@68: jpayne@68: from Bio import BiopythonWarning jpayne@68: from Bio.Data import CodonTable jpayne@68: from Bio.Data import IUPACData jpayne@68: jpayne@68: jpayne@68: def _maketrans(complement_mapping): jpayne@68: """Make a python string translation table (PRIVATE). jpayne@68: jpayne@68: Arguments: jpayne@68: - complement_mapping - a dictionary such as ambiguous_dna_complement jpayne@68: and ambiguous_rna_complement from Data.IUPACData. jpayne@68: jpayne@68: Returns a translation table (a bytes object of length 256) for use with jpayne@68: the python string's translate method to use in a (reverse) complement. jpayne@68: jpayne@68: Compatible with lower case and upper case sequences. jpayne@68: jpayne@68: For internal use only. 
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

    * ``__len__`` must return the sequence length;
    * ``__getitem__`` must return

      * a ``bytes`` object for the requested region; or
      * a new instance of the subclass for the requested region; or
      * raise an ``UndefinedSequenceError``.

      Calling ``__getitem__`` for a sequence region of size zero should always
      return an empty ``bytes`` object.
      Calling ``__getitem__`` for the full sequence (as in data[:]) should
      either return a ``bytes`` object with the full sequence, or raise an
      ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    All other methods defined here first convert the data to a ``bytes``
    object via ``bytes(self)`` (i.e. ``self[:]``) and then delegate to the
    corresponding ``bytes`` method.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        # Sanity check for subclass implementers: a zero-length slice must
        # compare equal to an empty bytes object.
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def __getitem__(self, key):
        pass

    def __bytes__(self):
        # The full slice is the subclass's way of providing all the data.
        return self[:]

    def __hash__(self):
        return hash(bytes(self))

    def __eq__(self, other):
        return bytes(self) == other

    def __lt__(self, other):
        return bytes(self) < other

    def __le__(self, other):
        return bytes(self) <= other

    def __gt__(self, other):
        return bytes(self) > other

    def __ge__(self, other):
        return bytes(self) >= other

    def __add__(self, other):
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            # Will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__
            return NotImplemented

    def __radd__(self, other):
        return other + bytes(self)

    def __mul__(self, other):
        return other * bytes(self)

    def __contains__(self, item):
        return bytes(self).__contains__(item)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present.

        Returns a ``bytes`` object; the data is returned unchanged if it does
        not start with prefix.
        """
        data = bytes(self)
        try:
            return data.removeprefix(prefix)
        except AttributeError:
            # bytes.removeprefix needs Python 3.9+; emulate it on older versions.
            if data.startswith(prefix):
                return data[len(prefix) :]
            return data

    def removesuffix(self, suffix):
        """Remove the suffix if present.

        Returns a ``bytes`` object; the data is returned unchanged if it does
        not end with suffix, or if suffix is empty.
        """
        data = bytes(self)
        try:
            return data.removesuffix(suffix)
        except AttributeError:
            # bytes.removesuffix needs Python 3.9+; emulate it on older versions.
            # The empty-suffix guard is required because data[:-0] is b"".
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

        table
          Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.

        This base implementation always returns True.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).

        This base implementation reports the whole sequence as one range, or
        an empty tuple for a zero-length sequence.
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        return ()
jpayne@68: """ jpayne@68: jpayne@68: __slots__ = ("_data",) jpayne@68: __array_ufunc__ = None # turn off numpy Ufuncs jpayne@68: jpayne@68: @abstractmethod jpayne@68: def __init__(self): jpayne@68: pass jpayne@68: jpayne@68: def __bytes__(self): jpayne@68: return bytes(self._data) jpayne@68: jpayne@68: def __repr__(self): jpayne@68: """Return (truncated) representation of the sequence.""" jpayne@68: data = self._data jpayne@68: if isinstance(data, _UndefinedSequenceData): jpayne@68: return f"Seq(None, length={len(self)})" jpayne@68: if isinstance(data, _PartiallyDefinedSequenceData): jpayne@68: d = {} jpayne@68: for position, seq in data._data.items(): jpayne@68: if len(seq) > 60: jpayne@68: start = seq[:54].decode("ASCII") jpayne@68: end = seq[-3:].decode("ASCII") jpayne@68: seq = f"{start}...{end}" jpayne@68: else: jpayne@68: seq = seq.decode("ASCII") jpayne@68: d[position] = seq jpayne@68: return "Seq(%r, length=%d)" % (d, len(self)) jpayne@68: if len(data) > 60: jpayne@68: # Shows the last three letters as it is often useful to see if jpayne@68: # there is a stop codon at the end of a sequence. jpayne@68: # Note total length is 54+3+3=60 jpayne@68: start = data[:54].decode("ASCII") jpayne@68: end = data[-3:].decode("ASCII") jpayne@68: return f"{self.__class__.__name__}('{start}...{end}')" jpayne@68: else: jpayne@68: data = data.decode("ASCII") jpayne@68: return f"{self.__class__.__name__}('{data}')" jpayne@68: jpayne@68: def __str__(self): jpayne@68: """Return the full sequence as a python string.""" jpayne@68: return self._data.decode("ASCII") jpayne@68: jpayne@68: def __eq__(self, other): jpayne@68: """Compare the sequence to another sequence or a string. 
jpayne@68: jpayne@68: Sequences are equal to each other if their sequence contents is jpayne@68: identical: jpayne@68: jpayne@68: >>> from Bio.Seq import Seq, MutableSeq jpayne@68: >>> seq1 = Seq("ACGT") jpayne@68: >>> seq2 = Seq("ACGT") jpayne@68: >>> mutable_seq = MutableSeq("ACGT") jpayne@68: >>> seq1 == seq2 jpayne@68: True jpayne@68: >>> seq1 == mutable_seq jpayne@68: True jpayne@68: >>> seq1 == "ACGT" jpayne@68: True jpayne@68: jpayne@68: Note that the sequence objects themselves are not identical to each jpayne@68: other: jpayne@68: jpayne@68: >>> id(seq1) == id(seq2) jpayne@68: False jpayne@68: >>> seq1 is seq2 jpayne@68: False jpayne@68: jpayne@68: Sequences can also be compared to strings, ``bytes``, and ``bytearray`` jpayne@68: objects: jpayne@68: jpayne@68: >>> seq1 == "ACGT" jpayne@68: True jpayne@68: >>> seq1 == b"ACGT" jpayne@68: True jpayne@68: >>> seq1 == bytearray(b"ACGT") jpayne@68: True jpayne@68: """ jpayne@68: if isinstance(other, _SeqAbstractBaseClass): jpayne@68: return self._data == other._data jpayne@68: elif isinstance(other, str): jpayne@68: return self._data == other.encode("ASCII") jpayne@68: else: jpayne@68: return self._data == other jpayne@68: jpayne@68: def __lt__(self, other): jpayne@68: """Implement the less-than operand.""" jpayne@68: if isinstance(other, _SeqAbstractBaseClass): jpayne@68: return self._data < other._data jpayne@68: elif isinstance(other, str): jpayne@68: return self._data < other.encode("ASCII") jpayne@68: else: jpayne@68: return self._data < other jpayne@68: jpayne@68: def __le__(self, other): jpayne@68: """Implement the less-than or equal operand.""" jpayne@68: if isinstance(other, _SeqAbstractBaseClass): jpayne@68: return self._data <= other._data jpayne@68: elif isinstance(other, str): jpayne@68: return self._data <= other.encode("ASCII") jpayne@68: else: jpayne@68: return self._data <= other jpayne@68: jpayne@68: def __gt__(self, other): jpayne@68: """Implement the greater-than operand.""" jpayne@68: if 
isinstance(other, _SeqAbstractBaseClass): jpayne@68: return self._data > other._data jpayne@68: elif isinstance(other, str): jpayne@68: return self._data > other.encode("ASCII") jpayne@68: else: jpayne@68: return self._data > other jpayne@68: jpayne@68: def __ge__(self, other): jpayne@68: """Implement the greater-than or equal operand.""" jpayne@68: if isinstance(other, _SeqAbstractBaseClass): jpayne@68: return self._data >= other._data jpayne@68: elif isinstance(other, str): jpayne@68: return self._data >= other.encode("ASCII") jpayne@68: else: jpayne@68: return self._data >= other jpayne@68: jpayne@68: def __len__(self): jpayne@68: """Return the length of the sequence.""" jpayne@68: return len(self._data) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: """Return an iterable of the sequence.""" jpayne@68: return self._data.decode("ASCII").__iter__() jpayne@68: jpayne@68: @overload jpayne@68: def __getitem__(self, index: int) -> str: jpayne@68: ... jpayne@68: jpayne@68: @overload jpayne@68: def __getitem__(self, index: slice) -> "Seq": jpayne@68: ... jpayne@68: jpayne@68: def __getitem__(self, index): jpayne@68: """Return a subsequence as a single letter or as a sequence object. 
jpayne@68: jpayne@68: If the index is an integer, a single letter is returned as a Python jpayne@68: string: jpayne@68: jpayne@68: >>> seq = Seq('ACTCGACGTCG') jpayne@68: >>> seq[5] jpayne@68: 'A' jpayne@68: jpayne@68: Otherwise, a new sequence object of the same class is returned: jpayne@68: jpayne@68: >>> seq[5:8] jpayne@68: Seq('ACG') jpayne@68: >>> mutable_seq = MutableSeq('ACTCGACGTCG') jpayne@68: >>> mutable_seq[5:8] jpayne@68: MutableSeq('ACG') jpayne@68: """ jpayne@68: if isinstance(index, numbers.Integral): jpayne@68: # Return a single letter as a string jpayne@68: return chr(self._data[index]) jpayne@68: else: jpayne@68: # Return the (sub)sequence as another Seq/MutableSeq object jpayne@68: return self.__class__(self._data[index]) jpayne@68: jpayne@68: def __add__(self, other): jpayne@68: """Add a sequence or string to this sequence. jpayne@68: jpayne@68: >>> from Bio.Seq import Seq, MutableSeq jpayne@68: >>> Seq("MELKI") + "LV" jpayne@68: Seq('MELKILV') jpayne@68: >>> MutableSeq("MELKI") + "LV" jpayne@68: MutableSeq('MELKILV') jpayne@68: """ jpayne@68: if isinstance(other, _SeqAbstractBaseClass): jpayne@68: return self.__class__(self._data + other._data) jpayne@68: elif isinstance(other, str): jpayne@68: return self.__class__(self._data + other.encode("ASCII")) jpayne@68: else: jpayne@68: # If other is a SeqRecord, then SeqRecord's __radd__ will handle jpayne@68: # this. If not, returning NotImplemented will trigger a TypeError. jpayne@68: return NotImplemented jpayne@68: jpayne@68: def __radd__(self, other): jpayne@68: """Add a sequence string on the left. jpayne@68: jpayne@68: >>> from Bio.Seq import Seq, MutableSeq jpayne@68: >>> "LV" + Seq("MELKI") jpayne@68: Seq('LVMELKI') jpayne@68: >>> "LV" + MutableSeq("MELKI") jpayne@68: MutableSeq('LVMELKI') jpayne@68: jpayne@68: Adding two sequence objects is handled via the __add__ method. 
jpayne@68: """ jpayne@68: if isinstance(other, str): jpayne@68: return self.__class__(other.encode("ASCII") + self._data) jpayne@68: else: jpayne@68: return NotImplemented jpayne@68: jpayne@68: def __mul__(self, other): jpayne@68: """Multiply sequence by integer. jpayne@68: jpayne@68: >>> from Bio.Seq import Seq, MutableSeq jpayne@68: >>> Seq('ATG') * 2 jpayne@68: Seq('ATGATG') jpayne@68: >>> MutableSeq('ATG') * 2 jpayne@68: MutableSeq('ATGATG') jpayne@68: """ jpayne@68: if not isinstance(other, numbers.Integral): jpayne@68: raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type") jpayne@68: # we would like to simply write jpayne@68: # data = self._data * other jpayne@68: # here, but currently that causes a bug on PyPy if self._data is a jpayne@68: # bytearray and other is a numpy integer. Using this workaround: jpayne@68: data = self._data.__mul__(other) jpayne@68: return self.__class__(data) jpayne@68: jpayne@68: def __rmul__(self, other): jpayne@68: """Multiply integer by sequence. jpayne@68: jpayne@68: >>> from Bio.Seq import Seq jpayne@68: >>> 2 * Seq('ATG') jpayne@68: Seq('ATGATG') jpayne@68: """ jpayne@68: if not isinstance(other, numbers.Integral): jpayne@68: raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type") jpayne@68: # we would like to simply write jpayne@68: # data = self._data * other jpayne@68: # here, but currently that causes a bug on PyPy if self._data is a jpayne@68: # bytearray and other is a numpy integer. Using this workaround: jpayne@68: data = self._data.__mul__(other) jpayne@68: return self.__class__(data) jpayne@68: jpayne@68: def __imul__(self, other): jpayne@68: """Multiply the sequence object by other and assign. jpayne@68: jpayne@68: >>> from Bio.Seq import Seq jpayne@68: >>> seq = Seq('ATG') jpayne@68: >>> seq *= 2 jpayne@68: >>> seq jpayne@68: Seq('ATGATG') jpayne@68: jpayne@68: Note that this is different from in-place multiplication. 
The ``seq`` jpayne@68: variable is reassigned to the multiplication result, but any variable jpayne@68: pointing to ``seq`` will remain unchanged: jpayne@68: jpayne@68: >>> seq = Seq('ATG') jpayne@68: >>> seq2 = seq jpayne@68: >>> id(seq) == id(seq2) jpayne@68: True jpayne@68: >>> seq *= 2 jpayne@68: >>> seq jpayne@68: Seq('ATGATG') jpayne@68: >>> seq2 jpayne@68: Seq('ATG') jpayne@68: >>> id(seq) == id(seq2) jpayne@68: False jpayne@68: """ jpayne@68: if not isinstance(other, numbers.Integral): jpayne@68: raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type") jpayne@68: # we would like to simply write jpayne@68: # data = self._data * other jpayne@68: # here, but currently that causes a bug on PyPy if self._data is a jpayne@68: # bytearray and other is a numpy integer. Using this workaround: jpayne@68: data = self._data.__mul__(other) jpayne@68: return self.__class__(data) jpayne@68: jpayne@68: def count(self, sub, start=None, end=None): jpayne@68: """Return a non-overlapping count, like that of a python string. jpayne@68: jpayne@68: The number of occurrences of substring argument sub in the jpayne@68: (sub)sequence given by [start:end] is returned as an integer. jpayne@68: Optional arguments start and end are interpreted as in slice jpayne@68: notation. jpayne@68: jpayne@68: Arguments: jpayne@68: - sub - a string or another Seq object to look for jpayne@68: - start - optional integer, slice start jpayne@68: - end - optional integer, slice end jpayne@68: jpayne@68: e.g. 
jpayne@68: jpayne@68: >>> from Bio.Seq import Seq jpayne@68: >>> my_seq = Seq("AAAATGA") jpayne@68: >>> print(my_seq.count("A")) jpayne@68: 5 jpayne@68: >>> print(my_seq.count("ATG")) jpayne@68: 1 jpayne@68: >>> print(my_seq.count(Seq("AT"))) jpayne@68: 1 jpayne@68: >>> print(my_seq.count("AT", 2, -1)) jpayne@68: 1 jpayne@68: jpayne@68: HOWEVER, please note because the ``count`` method of Seq and MutableSeq jpayne@68: objects, like that of Python strings, do a non-overlapping search, this jpayne@68: may not give the answer you expect: jpayne@68: jpayne@68: >>> "AAAA".count("AA") jpayne@68: 2 jpayne@68: >>> print(Seq("AAAA").count("AA")) jpayne@68: 2 jpayne@68: jpayne@68: For an overlapping search, use the ``count_overlap`` method: jpayne@68: jpayne@68: >>> print(Seq("AAAA").count_overlap("AA")) jpayne@68: 3 jpayne@68: """ jpayne@68: if isinstance(sub, MutableSeq): jpayne@68: sub = sub._data jpayne@68: elif isinstance(sub, Seq): jpayne@68: sub = bytes(sub) jpayne@68: elif isinstance(sub, str): jpayne@68: sub = sub.encode("ASCII") jpayne@68: elif not isinstance(sub, (bytes, bytearray)): jpayne@68: raise TypeError( jpayne@68: "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'" jpayne@68: % type(sub) jpayne@68: ) jpayne@68: return self._data.count(sub, start, end) jpayne@68: jpayne@68: def count_overlap(self, sub, start=None, end=None): jpayne@68: """Return an overlapping count. jpayne@68: jpayne@68: Returns an integer, the number of occurrences of substring jpayne@68: argument sub in the (sub)sequence given by [start:end]. jpayne@68: Optional arguments start and end are interpreted as in slice jpayne@68: notation. jpayne@68: jpayne@68: Arguments: jpayne@68: - sub - a string or another Seq object to look for jpayne@68: - start - optional integer, slice start jpayne@68: - end - optional integer, slice end jpayne@68: jpayne@68: e.g. 
jpayne@68: jpayne@68: >>> from Bio.Seq import Seq jpayne@68: >>> print(Seq("AAAA").count_overlap("AA")) jpayne@68: 3 jpayne@68: >>> print(Seq("ATATATATA").count_overlap("ATA")) jpayne@68: 4 jpayne@68: >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1)) jpayne@68: 1 jpayne@68: jpayne@68: For a non-overlapping search, use the ``count`` method: jpayne@68: jpayne@68: >>> print(Seq("AAAA").count("AA")) jpayne@68: 2 jpayne@68: jpayne@68: Where substrings do not overlap, ``count_overlap`` behaves the same as jpayne@68: the ``count`` method: jpayne@68: jpayne@68: >>> from Bio.Seq import Seq jpayne@68: >>> my_seq = Seq("AAAATGA") jpayne@68: >>> print(my_seq.count_overlap("A")) jpayne@68: 5 jpayne@68: >>> my_seq.count_overlap("A") == my_seq.count("A") jpayne@68: True jpayne@68: >>> print(my_seq.count_overlap("ATG")) jpayne@68: 1 jpayne@68: >>> my_seq.count_overlap("ATG") == my_seq.count("ATG") jpayne@68: True jpayne@68: >>> print(my_seq.count_overlap(Seq("AT"))) jpayne@68: 1 jpayne@68: >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT")) jpayne@68: True jpayne@68: >>> print(my_seq.count_overlap("AT", 2, -1)) jpayne@68: 1 jpayne@68: >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1) jpayne@68: True jpayne@68: jpayne@68: HOWEVER, do not use this method for such cases because the jpayne@68: count() method is much more efficient. 
jpayne@68: """ jpayne@68: if isinstance(sub, MutableSeq): jpayne@68: sub = sub._data jpayne@68: elif isinstance(sub, Seq): jpayne@68: sub = bytes(sub) jpayne@68: elif isinstance(sub, str): jpayne@68: sub = sub.encode("ASCII") jpayne@68: elif not isinstance(sub, (bytes, bytearray)): jpayne@68: raise TypeError( jpayne@68: "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'" jpayne@68: % type(sub) jpayne@68: ) jpayne@68: data = self._data jpayne@68: overlap_count = 0 jpayne@68: while True: jpayne@68: start = data.find(sub, start, end) + 1 jpayne@68: if start != 0: jpayne@68: overlap_count += 1 jpayne@68: else: jpayne@68: return overlap_count jpayne@68: jpayne@68: def __contains__(self, item): jpayne@68: """Return True if item is a subsequence of the sequence, and False otherwise. jpayne@68: jpayne@68: e.g. jpayne@68: jpayne@68: >>> from Bio.Seq import Seq, MutableSeq jpayne@68: >>> my_dna = Seq("ATATGAAATTTGAAAA") jpayne@68: >>> "AAA" in my_dna jpayne@68: True jpayne@68: >>> Seq("AAA") in my_dna jpayne@68: True jpayne@68: >>> MutableSeq("AAA") in my_dna jpayne@68: True jpayne@68: """ jpayne@68: if isinstance(item, _SeqAbstractBaseClass): jpayne@68: item = bytes(item) jpayne@68: elif isinstance(item, str): jpayne@68: item = item.encode("ASCII") jpayne@68: return item in self._data jpayne@68: jpayne@68: def find(self, sub, start=None, end=None): jpayne@68: """Return the lowest index in the sequence where subsequence sub is found. jpayne@68: jpayne@68: With optional arguments start and end, return the lowest index in the jpayne@68: sequence such that the subsequence sub is contained within the sequence jpayne@68: region [start:end]. jpayne@68: jpayne@68: Arguments: jpayne@68: - sub - a string or another Seq or MutableSeq object to search for jpayne@68: - start - optional integer, slice start jpayne@68: - end - optional integer, slice end jpayne@68: jpayne@68: Returns -1 if the subsequence is NOT found. jpayne@68: jpayne@68: e.g. 
Locating the first typical start codon, AUG, in an RNA sequence: jpayne@68: jpayne@68: >>> from Bio.Seq import Seq jpayne@68: >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") jpayne@68: >>> my_rna.find("AUG") jpayne@68: 3 jpayne@68: jpayne@68: The next typical start codon can then be found by starting the search jpayne@68: at position 4: jpayne@68: jpayne@68: >>> my_rna.find("AUG", 4) jpayne@68: 15 jpayne@68: jpayne@68: See the ``search`` method to find the locations of multiple subsequences jpayne@68: at the same time. jpayne@68: """ jpayne@68: if isinstance(sub, _SeqAbstractBaseClass): jpayne@68: sub = bytes(sub) jpayne@68: elif isinstance(sub, str): jpayne@68: sub = sub.encode("ASCII") jpayne@68: elif not isinstance(sub, (bytes, bytearray)): jpayne@68: raise TypeError( jpayne@68: "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'" jpayne@68: % type(sub) jpayne@68: ) jpayne@68: return self._data.find(sub, start, end) jpayne@68: jpayne@68: def rfind(self, sub, start=None, end=None): jpayne@68: """Return the highest index in the sequence where subsequence sub is found. jpayne@68: jpayne@68: With optional arguments start and end, return the highest index in the jpayne@68: sequence such that the subsequence sub is contained within the sequence jpayne@68: region [start:end]. jpayne@68: jpayne@68: Arguments: jpayne@68: - sub - a string or another Seq or MutableSeq object to search for jpayne@68: - start - optional integer, slice start jpayne@68: - end - optional integer, slice end jpayne@68: jpayne@68: Returns -1 if the subsequence is NOT found. jpayne@68: jpayne@68: e.g. 
def index(self, sub, start=None, end=None):
    """Return the lowest index at which subsequence sub occurs.

    When the optional arguments start and end are given, only the region
    [start:end] of the sequence is searched, and the index returned is the
    lowest one for which sub lies entirely within that region.

    Arguments:
     - sub - the subsequence to look for: a string, a Seq or MutableSeq
       object, or a bytes or bytearray object
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError
    if sub is of any other type.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.index("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.index("AUG", 4)
    15

    This method performs the same search as the ``find`` method. However,
    if the subsequence is not found, ``find`` returns -1 while ``index``
    raises a ValueError:

    >>> my_rna.index("T")
    Traceback (most recent call last):
    ...
    ValueError: ...
    >>> my_rna.find("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, MutableSeq):
        # The data of a MutableSeq are used directly, without copying.
        pattern = sub._data
    elif isinstance(sub, Seq):
        pattern = bytes(sub)
    elif isinstance(sub, str):
        pattern = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pattern = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.index(pattern, start, end)
def search(self, subs):
    """Search the substrings subs in self and yield the index and substring found.

    Arguments:
     - subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
       objects containing the substrings to search for.

    This is a generator: ``(index, substring)`` tuples are yielded in order
    of increasing index, with the substring given as a plain string. A
    TypeError is raised, once iteration starts, if an element of subs is
    of an unsupported type.

    >>> from Bio.Seq import Seq
    >>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
    >>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
    >>> for index, substring in matches:
    ...     print(index, substring)
    ...
    7 CC
    9 ATTG
    20 CC
    34 CC
    34 CCC
    35 CC

    Matches that end at the last letter of the sequence are reported too:

    >>> list(Seq("ACGT").search(["T", "GT"]))
    [(2, 'GT'), (3, 'T')]
    """
    # Group the (deduplicated) substrings by length, so that each position
    # needs only one slice of the sequence per distinct length.
    by_length = collections.defaultdict(set)
    for index, sub in enumerate(subs):
        if isinstance(sub, str):
            sub = sub.encode("ASCII")
        elif isinstance(sub, bytes):
            pass  # already in the form used for the comparison below
        elif isinstance(sub, (bytearray, _SeqAbstractBaseClass)):
            sub = bytes(sub)
        else:
            raise TypeError(
                "subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
                % (index, type(sub))
            )
        by_length[len(sub)].add(sub)
    data = self._data
    # A match can start at any position, including the very last one (for
    # single-letter substrings), so every index must be visited. A slice
    # running past the end is shorter than the substring and never matches.
    for start in range(len(self)):
        for length, candidates in by_length.items():
            window = data[start : start + length]
            for candidate in candidates:
                if window == candidate:
                    yield (start, candidate.decode())
                    # Substrings of equal length are distinct, so no other
                    # candidate of this length can match here.
                    break
def endswith(self, suffix, start=None, end=None):
    """Return True if the sequence ends with the given suffix, False otherwise.

    The suffix can be a string, a Seq or MutableSeq object, or a bytes or
    bytearray object. It can also be a tuple of such objects, in which
    case True is returned if the sequence ends with any one of them.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.endswith("UUG")
    True
    >>> my_rna.endswith("AUG")
    False
    >>> my_rna.endswith("AUG", 0, 18)
    True
    >>> my_rna.endswith(("UCC", "UCA", "UUG"))
    True
    """
    is_tuple = isinstance(suffix, tuple)
    converted = []
    # Each candidate is converted by the same rule, whether it was passed
    # on its own or inside a tuple, so bytes and bytearray objects are
    # accepted in both cases.
    for item in suffix if is_tuple else (suffix,):
        if isinstance(item, str):
            item = item.encode("ASCII")
        elif isinstance(item, (bytes, bytearray)):
            pass  # can be compared against the sequence data as is
        elif isinstance(item, _SeqAbstractBaseClass):
            item = bytes(item)
        # Anything else is passed on unchanged, so that the endswith method
        # of the sequence data reports the error.
        converted.append(item)
    suffix = tuple(converted) if is_tuple else converted[0]
    return self._data.endswith(suffix, start, end)
def rsplit(self, sep=None, maxsplit=-1):
    """Return a list of subsequences by splitting the sequence from the right.

    The sequence is cut at each occurrence of the delimiter sep, and the
    pieces are returned as a list of Seq objects. If maxsplit is given,
    at most maxsplit splits are done, counting from the end of the
    sequence. If maxsplit is omitted, all splits are made.

    For consistency with the ``rsplit`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.rsplit("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')

    See also the split method, which splits the sequence starting from the
    beginning:

    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')
    """
    if isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    elif isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    else:
        # None and bytes-like delimiters are passed on unchanged.
        delimiter = sep
    pieces = self._data.rsplit(delimiter, maxsplit)
    return list(map(Seq, pieces))
def lstrip(self, chars=None, inplace=False):
    """Return a sequence object with the leading end stripped.

    With default arguments, leading whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.lstrip()
    Seq('ACGT ')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, remove characters in ``chars``
    from the leading end instead. The order of the characters to be removed
    is not important:

    >>> Seq("ACGACGTTACG").lstrip("GCA")
    Seq('TTACG')

    A copy of the sequence is returned if ``inplace`` is ``False`` (the
    default value). If ``inplace`` is ``True``, the sequence is stripped
    in-place and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.lstrip()
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.lstrip(inplace=True)
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq('ACGT ')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and rstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        unwanted = bytes(chars)
    elif isinstance(chars, str):
        unwanted = chars.encode("ASCII")
    else:
        unwanted = chars
    try:
        stripped = self._data.lstrip(unwanted)
    except TypeError:
        # Replace the message of the underlying method by one that lists
        # the argument types accepted here.
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only sequences whose data are stored in a bytearray can be modified.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@68: jpayne@68: >>> seq = MutableSeq(" ACGT ") jpayne@68: >>> seq.rstrip() jpayne@68: MutableSeq(' ACGT') jpayne@68: >>> seq jpayne@68: MutableSeq(' ACGT ') jpayne@68: >>> seq.rstrip(inplace=True) jpayne@68: MutableSeq(' ACGT') jpayne@68: >>> seq jpayne@68: MutableSeq(' ACGT') jpayne@68: jpayne@68: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@68: ``rstrip`` is called on a ``Seq`` object with ``inplace=True``. jpayne@68: jpayne@68: See also the strip and lstrip methods. jpayne@68: """ jpayne@68: if isinstance(chars, _SeqAbstractBaseClass): jpayne@68: chars = bytes(chars) jpayne@68: elif isinstance(chars, str): jpayne@68: chars = chars.encode("ASCII") jpayne@68: try: jpayne@68: data = self._data.rstrip(chars) jpayne@68: except TypeError: jpayne@68: raise TypeError( jpayne@68: "argument must be None or a string, Seq, MutableSeq, or bytes-like object" jpayne@68: ) from None jpayne@68: if inplace: jpayne@68: if not isinstance(self._data, bytearray): jpayne@68: raise TypeError("Sequence is immutable") jpayne@68: self._data[:] = data jpayne@68: return self jpayne@68: else: jpayne@68: return self.__class__(data) jpayne@68: jpayne@68: def removeprefix(self, prefix, inplace=False): jpayne@68: """Return a new Seq object with prefix (left) removed. jpayne@68: jpayne@68: This behaves like the python string method of the same name. jpayne@68: jpayne@68: e.g. Removing a start Codon: jpayne@68: jpayne@68: >>> from Bio.Seq import Seq jpayne@68: >>> my_seq = Seq("ATGGTGTGTGT") jpayne@68: >>> my_seq jpayne@68: Seq('ATGGTGTGTGT') jpayne@68: >>> my_seq.removeprefix('ATG') jpayne@68: Seq('GTGTGTGT') jpayne@68: jpayne@68: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@68: ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``. jpayne@68: jpayne@68: See also the removesuffix method. 
def removesuffix(self, suffix, inplace=False):
    """Return a new Seq object with suffix (right) removed.

    This behaves like the python string method of the same name: if the
    sequence ends with suffix, the sequence without that suffix is
    returned; otherwise the sequence is returned unchanged. An empty
    suffix never removes anything.

    The suffix can be a string, a Seq or MutableSeq object, or a
    bytes-like object; a TypeError is raised for other types.

    e.g. Removing a stop codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("GTGTGTGTTAG")
    >>> my_seq
    Seq('GTGTGTGTTAG')
    >>> stop_codon = Seq("TAG")
    >>> my_seq.removesuffix(stop_codon)
    Seq('GTGTGTGT')

    A copy of the sequence is returned if ``inplace`` is ``False`` (the
    default value). If ``inplace`` is ``True``, the suffix is removed
    in-place and the sequence itself is returned.

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``.

    See also the removeprefix method.
    """
    if isinstance(suffix, str):
        suffix = suffix.encode("ASCII")
    elif isinstance(suffix, _SeqAbstractBaseClass):
        suffix = bytes(suffix)
    try:
        data = self._data.removesuffix(suffix)
    except TypeError:
        raise TypeError(
            "argument must be a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    except AttributeError:
        # Fall back for pre-Python 3.9
        data = self._data
        if data.endswith(suffix):
            # Slice with an explicit end index: data[: -len(suffix)] would
            # return an empty sequence for an empty suffix, as -0 == 0.
            data = data[: len(data) - len(suffix)]
    if inplace:
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    return self.__class__(data)
jpayne@68: >>> my_seq jpayne@68: MutableSeq('VHLTPEEK*') jpayne@68: jpayne@68: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@68: ``upper`` is called on a ``Seq`` object with ``inplace=True``. jpayne@68: jpayne@68: See also the ``lower`` method. jpayne@68: """ jpayne@68: data = self._data.upper() jpayne@68: if inplace: jpayne@68: if not isinstance(self._data, bytearray): jpayne@68: raise TypeError("Sequence is immutable") jpayne@68: self._data[:] = data jpayne@68: return self jpayne@68: else: jpayne@68: return self.__class__(data) jpayne@68: jpayne@68: def lower(self, inplace=False): jpayne@68: """Return the sequence in lower case. jpayne@68: jpayne@68: An lower-case copy of the sequence is returned if inplace is False, jpayne@68: the default value: jpayne@68: jpayne@68: >>> from Bio.Seq import Seq, MutableSeq jpayne@68: >>> my_seq = Seq("VHLTPeeK*") jpayne@68: >>> my_seq jpayne@68: Seq('VHLTPeeK*') jpayne@68: >>> my_seq.lower() jpayne@68: Seq('vhltpeek*') jpayne@68: >>> my_seq.upper() jpayne@68: Seq('VHLTPEEK*') jpayne@68: >>> my_seq jpayne@68: Seq('VHLTPeeK*') jpayne@68: jpayne@68: The sequence is modified in-place and returned if inplace is True: jpayne@68: jpayne@68: >>> my_seq = MutableSeq("VHLTPeeK*") jpayne@68: >>> my_seq jpayne@68: MutableSeq('VHLTPeeK*') jpayne@68: >>> my_seq.lower() jpayne@68: MutableSeq('vhltpeek*') jpayne@68: >>> my_seq.upper() jpayne@68: MutableSeq('VHLTPEEK*') jpayne@68: >>> my_seq jpayne@68: MutableSeq('VHLTPeeK*') jpayne@68: jpayne@68: >>> my_seq.lower(inplace=True) jpayne@68: MutableSeq('vhltpeek*') jpayne@68: >>> my_seq jpayne@68: MutableSeq('vhltpeek*') jpayne@68: >>> my_seq.upper(inplace=True) jpayne@68: MutableSeq('VHLTPEEK*') jpayne@68: >>> my_seq jpayne@68: MutableSeq('VHLTPEEK*') jpayne@68: jpayne@68: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@68: ``lower`` is called on a ``Seq`` object with ``inplace=True``. jpayne@68: jpayne@68: See also the ``upper`` method. 
def translate(
    self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
):
    """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.

    This method will translate DNA or RNA sequences. It should not
    be used on protein sequences as any result will be biologically
    meaningless.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable
       object (useful for non-standard genetic codes).  This
       defaults to the "Standard" table.
     - stop_symbol - Single character string, what to use for
       terminators.  This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons (translated as the
       specified stop_symbol).  If True, translation is terminated at
       the first in frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
       this checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to the minus sign.

    A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
    object; a ``MutableSeq`` object is returned if ``translate`` is called
    on a ``MutableSeq`` object.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*')
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@')
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR')

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*')
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR')

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR')

    It isn't a valid CDS under NCBI table 1, due to both the start codon
    and also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR')
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR')

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead
    """
    try:
        data = str(self)
    except UndefinedSequenceError:
        # translating an undefined sequence yields an undefined
        # sequence with the length divided by 3
        n = len(self)
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "This may become an error in future.",
                BiopythonWarning,
            )
        return Seq(None, n // 3)

    # Reuse the string obtained above instead of converting the sequence
    # a second time.
    return self.__class__(
        _translate_str(data, table, stop_symbol, to_stop, cds, gap=gap)
    )
def complement_rna(self, inplace=False):
    """Return the complement as an RNA sequence.

    >>> Seq("CGA").complement_rna()
    Seq('GCU')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence by default:

    >>> Seq("CGA").complement()
    Seq('GCT')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement_rna()
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement_rna(inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented)
    # Only sequences whose data are stored in a bytearray can be modified.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
jpayne@68: jpayne@68: >>> Seq("CGA").reverse_complement() jpayne@68: Seq('TCG') jpayne@68: jpayne@68: Any U in the sequence is treated as a T: jpayne@68: jpayne@68: >>> Seq("CGAUT").reverse_complement() jpayne@68: Seq('AATCG') jpayne@68: jpayne@68: In contrast, ``reverse_complement_rna`` returns an RNA sequence: jpayne@68: jpayne@68: >>> Seq("CGA").reverse_complement_rna() jpayne@68: Seq('UCG') jpayne@68: jpayne@68: The sequence is modified in-place and returned if inplace is True: jpayne@68: jpayne@68: >>> my_seq = MutableSeq("CGA") jpayne@68: >>> my_seq jpayne@68: MutableSeq('CGA') jpayne@68: >>> my_seq.reverse_complement() jpayne@68: MutableSeq('TCG') jpayne@68: >>> my_seq jpayne@68: MutableSeq('CGA') jpayne@68: jpayne@68: >>> my_seq.reverse_complement(inplace=True) jpayne@68: MutableSeq('TCG') jpayne@68: >>> my_seq jpayne@68: MutableSeq('TCG') jpayne@68: jpayne@68: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@68: ``reverse_complement`` is called on a ``Seq`` object with jpayne@68: ``inplace=True``. jpayne@68: """ jpayne@68: try: jpayne@68: data = self._data.translate(_dna_complement_table) jpayne@68: except UndefinedSequenceError: jpayne@68: # reverse complement of an undefined sequence is an undefined sequence jpayne@68: # of the same length jpayne@68: return self jpayne@68: if inplace: jpayne@68: if not isinstance(self._data, bytearray): jpayne@68: raise TypeError("Sequence is immutable") jpayne@68: self._data[::-1] = data jpayne@68: return self jpayne@68: return self.__class__(data[::-1]) jpayne@68: jpayne@68: def reverse_complement_rna(self, inplace=False): jpayne@68: """Return the reverse complement as an RNA sequence. 
def transcribe(self, inplace=False):
    """Transcribe a DNA sequence into RNA by replacing every T with U.

    The sequence is taken to be the coding strand of the DNA double helix
    (the usual convention), not the template strand, so the RNA is obtained
    simply by switching T to U. Lower case t becomes lower case u.

    >>> from Bio.Seq import Seq
    >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> coding_dna.transcribe()
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    By default a new object is returned and the original is left alone;
    with ``inplace=True`` the sequence itself is updated and returned:

    >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence.transcribe()
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    >>> sequence.transcribe(inplace=True)
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    ``Seq`` objects are immutable, so calling ``transcribe`` on a ``Seq``
    object with ``inplace=True`` raises a ``TypeError``.

    Transcribing an RNA sequence has no effect. For a nucleotide sequence
    that might be DNA or RNA (or even a mixture), calling this method
    ensures that any T becomes U.

    Transcribing a protein sequence replaces any T for Threonine with U
    for Selenocysteine, which has no biologically plausible rationale.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRT")
    >>> my_protein.transcribe()
    Seq('MAIVMGRU')
    """
    # Upper and lower case are handled separately so the case is preserved.
    rna = self._data.replace(b"T", b"U")
    rna = rna.replace(b"t", b"u")
    if not inplace:
        return self.__class__(rna)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = rna
    return self
def join(self, other):
    """Return a merge of the sequences in other, spaced by the sequence from self.

    Accepts a Seq object, MutableSeq object, or string (and iterates over
    the letters), or an iterable containing Seq, MutableSeq, or string
    objects. These arguments will be concatenated with the calling sequence
    as the spacer:

    >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
    >>> concatenated
    Seq('AAANNNNNTTTNNNNNPPP')

    Joining the letters of a single sequence:

    >>> Seq('NNNNN').join(Seq("ACGT"))
    Seq('ANNNNNCNNNNNGNNNNNT')
    >>> Seq('NNNNN').join("ACGT")
    Seq('ANNNNNCNNNNNGNNNNNT')

    One-shot iterables such as generators are supported; the items are
    collected in a single pass before they are joined.

    Raises a ``TypeError`` if ``other`` is a SeqRecord, or if it yields a
    SeqRecord or anything that is not a string, Seq or MutableSeq object.

    Returns a new object of the same class as the calling sequence.
    """
    if isinstance(other, _SeqAbstractBaseClass):
        return self.__class__(str(self).join(str(other)))
    elif isinstance(other, str):
        return self.__class__(str(self).join(other))

    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(other, SeqRecord):
        raise TypeError("Iterable cannot be a SeqRecord")

    # Validate and collect in one pass. Iterating ``other`` a second time
    # would find an already exhausted iterator when it is a generator, and
    # the result would silently be an empty sequence.
    items = []
    for c in other:
        if isinstance(c, SeqRecord):
            raise TypeError("Iterable cannot contain SeqRecords")
        elif not isinstance(c, (str, _SeqAbstractBaseClass)):
            raise TypeError(
                "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
            )
        items.append(c)
    # Convert only after every item has passed validation, so that a type
    # error is reported before any error raised by str() on an item.
    return self.__class__(str(self).join([str(item) for item in items]))
jpayne@68: jpayne@68: >>> s = Seq("ACGTAACCGGTT") jpayne@68: >>> t = s.replace("AC", "XYZ") jpayne@68: >>> s jpayne@68: Seq('ACGTAACCGGTT') jpayne@68: >>> t jpayne@68: Seq('XYZGTAXYZCGGTT') jpayne@68: jpayne@68: For mutable sequences, passing inplace=True will modify the sequence in place: jpayne@68: jpayne@68: >>> m = MutableSeq("ACGTAACCGGTT") jpayne@68: >>> t = m.replace("AC", "XYZ") jpayne@68: >>> m jpayne@68: MutableSeq('ACGTAACCGGTT') jpayne@68: >>> t jpayne@68: MutableSeq('XYZGTAXYZCGGTT') jpayne@68: jpayne@68: >>> m = MutableSeq("ACGTAACCGGTT") jpayne@68: >>> t = m.replace("AC", "XYZ", inplace=True) jpayne@68: >>> m jpayne@68: MutableSeq('XYZGTAXYZCGGTT') jpayne@68: >>> t jpayne@68: MutableSeq('XYZGTAXYZCGGTT') jpayne@68: jpayne@68: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@68: ``replace`` is called on a ``Seq`` object with ``inplace=True``. jpayne@68: """ jpayne@68: if isinstance(old, _SeqAbstractBaseClass): jpayne@68: old = bytes(old) jpayne@68: elif isinstance(old, str): jpayne@68: old = old.encode("ASCII") jpayne@68: if isinstance(new, _SeqAbstractBaseClass): jpayne@68: new = bytes(new) jpayne@68: elif isinstance(new, str): jpayne@68: new = new.encode("ASCII") jpayne@68: data = self._data.replace(old, new) jpayne@68: if inplace: jpayne@68: if not isinstance(self._data, bytearray): jpayne@68: raise TypeError("Sequence is immutable") jpayne@68: self._data[:] = data jpayne@68: return self jpayne@68: return self.__class__(data) jpayne@68: jpayne@68: @property jpayne@68: def defined(self): jpayne@68: """Return True if the sequence is defined, False if undefined or partially defined. jpayne@68: jpayne@68: Zero-length sequences are always considered to be defined. 
class Seq(_SeqAbstractBaseClass):
    """Read-only sequence object (essentially a string with biological methods).

    Like normal python strings, our basic sequence object is immutable.
    This prevents you from doing my_seq[5] = "A" for example, but does allow
    Seq objects to be used as dictionary keys.

    The Seq object provides a number of string like methods (such as count,
    find, split and strip).

    The Seq object also provides some biological methods, such as complement,
    reverse_complement, transcribe, back_transcribe and translate (which are
    not applicable to protein sequences).
    """

    # Either the letters themselves, or a provider object that stands in for
    # sequence contents that are undefined, partially defined or lazily loaded.
    _data: Union[bytes, SequenceDataAbstractBaseClass]

    def __init__(
        self,
        data: Union[
            str,
            bytes,
            bytearray,
            _SeqAbstractBaseClass,
            SequenceDataAbstractBaseClass,
            dict,
            None,
        ],
        length: Optional[int] = None,
    ):
        """Create a Seq object.

        Arguments:
         - data - Sequence, required (string)
         - length - Sequence length, used only if data is None or a dictionary (integer)

        You will typically use Bio.SeqIO to read in sequences from files as
        SeqRecord objects, whose sequence will be exposed as a Seq object via
        the seq property.

        However, you can also create a Seq object directly:

        >>> from Bio.Seq import Seq
        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
        >>> my_seq
        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
        >>> print(my_seq)
        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF

        To create a Seq object for a sequence of known length but
        unknown sequence contents, use None for the data argument and pass
        the sequence length for the length argument. Trying to access the
        sequence contents of a Seq object created in this way will raise
        an UndefinedSequenceError:

        >>> my_undefined_sequence = Seq(None, 20)
        >>> my_undefined_sequence
        Seq(None, length=20)
        >>> len(my_undefined_sequence)
        20
        >>> print(my_undefined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is undefined

        If the sequence contents is known for parts of the sequence only, use
        a dictionary for the data argument to pass the known sequence segments:

        >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
        >>> my_partially_defined_sequence
        Seq({3: 'ACGT'}, length=10)
        >>> len(my_partially_defined_sequence)
        10
        >>> print(my_partially_defined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
        >>> my_partially_defined_sequence[3:7]
        Seq('ACGT')
        >>> print(my_partially_defined_sequence[3:7])
        ACGT

        Raises a ``ValueError`` if the length is missing or negative when it
        is needed, if dictionary segments start at a negative position,
        overlap or extend beyond the length, or if a segment is neither a
        string nor convertible to bytes. Raises a ``TypeError`` if data is of
        an unsupported type.
        """
        if data is None:
            if length is None:
                raise ValueError("length must not be None if data is None")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                self._data = _UndefinedSequenceData(length)
        elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
            # Immutable bytes and provider objects are stored without copying.
            self._data = data
        elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
            # Take an immutable copy of mutable or foreign storage.
            self._data = bytes(data)
        elif isinstance(data, str):
            self._data = bytes(data, encoding="ASCII")
        elif isinstance(data, dict):
            if length is None:
                raise ValueError("length must not be None if data is a dictionary")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                current = 0  # not needed here, but it keeps mypy happy
                end = -1
                _data: Dict[int, bytes] = {}
                for start in sorted(data):
                    if start < 0:
                        # A start of -1 would match the initial ``end``
                        # sentinel and be merged into a segment that does
                        # not exist yet (KeyError); reject it explicitly.
                        raise ValueError(
                            "Sequence data start positions must not be negative."
                        )
                    seq = data[start]
                    if isinstance(seq, str):
                        seq = bytes(seq, encoding="ASCII")
                    else:
                        try:
                            seq = bytes(seq)
                        except Exception:
                            raise ValueError(
                                "Expected bytes-like objects or strings"
                            ) from None
                    if start < end:
                        raise ValueError("Sequence data are overlapping.")
                    elif start == end:
                        # Segment continues the previous one: merge them.
                        _data[current] += seq  # noqa: F821
                    else:
                        _data[start] = seq
                        current = start
                    end = start + len(seq)
                if end > length:
                    raise ValueError(
                        "Provided sequence data extend beyond sequence length."
                    )
                elif end == length and current == 0:
                    # sequence is fully defined
                    self._data = _data[current]
                elif not _data:
                    # No segments were given, so nothing is defined. A
                    # partially defined provider with no segments cannot be
                    # concatenated (it inspects its last segment).
                    self._data = _UndefinedSequenceData(length)
                else:
                    self._data = _PartiallyDefinedSequenceData(length, _data)
        else:
            raise TypeError(
                "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
            )

    def __hash__(self):
        """Hash of the sequence as a string for comparison.

        See Seq object comparison documentation (method ``__eq__`` in
        particular) as this has changed in Biopython 1.65. Older versions
        would hash on object identity.
        """
        return hash(self._data)
jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, data): jpayne@68: """Create a MutableSeq object.""" jpayne@68: if isinstance(data, bytearray): jpayne@68: self._data = data jpayne@68: elif isinstance(data, bytes): jpayne@68: self._data = bytearray(data) jpayne@68: elif isinstance(data, str): jpayne@68: self._data = bytearray(data, "ASCII") jpayne@68: elif isinstance(data, MutableSeq): jpayne@68: self._data = data._data[:] # Take a copy jpayne@68: elif isinstance(data, Seq): jpayne@68: # Make no assumptions about the Seq subclass internal storage jpayne@68: self._data = bytearray(bytes(data)) jpayne@68: else: jpayne@68: raise TypeError( jpayne@68: "data should be a string, bytearray object, Seq object, or a " jpayne@68: "MutableSeq object" jpayne@68: ) jpayne@68: jpayne@68: def __setitem__(self, index, value): jpayne@68: """Set a subsequence of single letter via value parameter. jpayne@68: jpayne@68: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@68: >>> my_seq[0] = 'T' jpayne@68: >>> my_seq jpayne@68: MutableSeq('TCTCGACGTCG') jpayne@68: """ jpayne@68: if isinstance(index, numbers.Integral): jpayne@68: # Replacing a single letter with a new string jpayne@68: self._data[index] = ord(value) jpayne@68: else: jpayne@68: # Replacing a sub-sequence jpayne@68: if isinstance(value, MutableSeq): jpayne@68: self._data[index] = value._data jpayne@68: elif isinstance(value, Seq): jpayne@68: self._data[index] = bytes(value) jpayne@68: elif isinstance(value, str): jpayne@68: self._data[index] = value.encode("ASCII") jpayne@68: else: jpayne@68: raise TypeError(f"received unexpected type '{type(value).__name__}'") jpayne@68: jpayne@68: def __delitem__(self, index): jpayne@68: """Delete a subsequence of single letter. 
jpayne@68: jpayne@68: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@68: >>> del my_seq[0] jpayne@68: >>> my_seq jpayne@68: MutableSeq('CTCGACGTCG') jpayne@68: """ jpayne@68: # Could be deleting a single letter, or a slice jpayne@68: del self._data[index] jpayne@68: jpayne@68: def append(self, c): jpayne@68: """Add a subsequence to the mutable sequence object. jpayne@68: jpayne@68: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@68: >>> my_seq.append('A') jpayne@68: >>> my_seq jpayne@68: MutableSeq('ACTCGACGTCGA') jpayne@68: jpayne@68: No return value. jpayne@68: """ jpayne@68: self._data.append(ord(c.encode("ASCII"))) jpayne@68: jpayne@68: def insert(self, i, c): jpayne@68: """Add a subsequence to the mutable sequence object at a given index. jpayne@68: jpayne@68: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@68: >>> my_seq.insert(0,'A') jpayne@68: >>> my_seq jpayne@68: MutableSeq('AACTCGACGTCG') jpayne@68: >>> my_seq.insert(8,'G') jpayne@68: >>> my_seq jpayne@68: MutableSeq('AACTCGACGGTCG') jpayne@68: jpayne@68: No return value. jpayne@68: """ jpayne@68: self._data.insert(i, ord(c.encode("ASCII"))) jpayne@68: jpayne@68: def pop(self, i=(-1)): jpayne@68: """Remove a subsequence of a single letter at given index. jpayne@68: jpayne@68: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@68: >>> my_seq.pop() jpayne@68: 'G' jpayne@68: >>> my_seq jpayne@68: MutableSeq('ACTCGACGTC') jpayne@68: >>> my_seq.pop() jpayne@68: 'C' jpayne@68: >>> my_seq jpayne@68: MutableSeq('ACTCGACGT') jpayne@68: jpayne@68: Returns the last character of the sequence. jpayne@68: """ jpayne@68: c = self._data[i] jpayne@68: del self._data[i] jpayne@68: return chr(c) jpayne@68: jpayne@68: def remove(self, item): jpayne@68: """Remove a subsequence of a single letter from mutable sequence. 
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Sequence data of known length whose letters are unknown (PRIVATE).

    A Seq object can be created from an instance of this class to represent
    a sequence for which only the length is available.
    ``__len__`` gives that length. ``__getitem__`` with a slice gives
    another undefined object sized to the slice, or an empty bytes object
    when the slice selects nothing; any other index raises an
    UndefinedSequenceError, as does conversion to bytes.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Store the sequence length.

        Callers are expected to pass a length greater than zero; this is
        not checked here.
        """
        self._length = length
        super().__init__()

    def __getitem__(self, key: slice) -> Union[bytes, "_UndefinedSequenceData"]:
        if not isinstance(key, slice):
            # A single letter was requested, and no letter is known.
            raise UndefinedSequenceError("Sequence content is undefined")
        size = len(range(*key.indices(self._length)))
        if size == 0:
            return b""
        return _UndefinedSequenceData(size)

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        total = len(self) + len(other)
        try:
            tail = bytes(other)
        except UndefinedSequenceError:
            if not isinstance(other, _UndefinedSequenceData):
                # Left for the reflected operation of the other operand,
                # i.e. _PartiallyDefinedSequenceData.__radd__.
                return NotImplemented
            return _UndefinedSequenceData(total)
        # The known letters start right after this undefined stretch.
        return _PartiallyDefinedSequenceData(total, {len(self): tail})

    def __radd__(self, other):
        head = bytes(other)
        total = len(other) + len(self)
        return _PartiallyDefinedSequenceData(total, {0: head})

    def upper(self):
        """Return an upper case copy of the sequence."""
        # Changing case cannot reveal any letters; the copy is an
        # undefined sequence of the same length.
        return _UndefinedSequenceData(self._length)

    def lower(self):
        """Return a lower case copy of the sequence."""
        # Changing case cannot reveal any letters; the copy is an
        # undefined sequence of the same length.
        return _UndefinedSequenceData(self._length)

    def isupper(self):
        """Raise UndefinedSequenceError, as the letter case is unknown."""
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Raise UndefinedSequenceError, as the letter case is unknown."""
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # The result is an undefined sequence of the same length only when
        # old and new have the same number of characters; otherwise the
        # resulting length cannot be known.
        if len(old) != len(new):
            raise UndefinedSequenceError("Sequence content is undefined")
        return _UndefinedSequenceData(self._length)

    @property
    def defined(self):
        """Return False, as the sequence is not defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        Nothing is defined in an _UndefinedSequenceData object, so the
        return value is always an empty tuple.
        """
        return ()
def __getitem__(
    self, key: Union[slice, int]
) -> Union[bytes, SequenceDataAbstractBaseClass]:
    """Return the sequence contents selected by a slice or an integer index.

    For a slice, returns an empty bytes object if nothing is selected, a
    bytes object if every selected position is defined, an
    _UndefinedSequenceData object if none is, and otherwise a new
    _PartiallyDefinedSequenceData object.

    For an integer index, returns the letter (as an integer byte value) at
    that position. Negative indices count from the end of the sequence.
    Raises an IndexError if the index is out of range, and an
    UndefinedSequenceError if the position lies outside the defined
    segments.
    """
    if isinstance(key, slice):
        start, end, step = key.indices(self._length)
        size = len(range(start, end, step))
        if size == 0:
            return b""
        data = {}
        for s, d in self._data.items():
            # Positions picked by the slice, expressed as offsets relative
            # to the start ``s`` of this segment.
            indices = range(-s, -s + self._length)[key]
            e: Optional[int] = indices.stop
            assert e is not None
            if step > 0:
                if e <= 0:
                    # The slice stops before this segment begins.
                    continue
                if indices.start < 0:
                    # First picked offset that falls at or after the
                    # segment start.
                    s = indices.start % step
                else:
                    s = indices.start
            else:  # step < 0
                if e < 0:
                    e = None
                end = len(d) - 1
                if indices.start > end:
                    # First picked offset that falls at or before the last
                    # letter of the segment.
                    s = end + (indices.start - end) % step
                else:
                    s = indices.start
                if s < 0:
                    continue
            # Index in the result at which the letters kept from this
            # segment begin.
            start = (s - indices.start) // step
            d = d[s:e:step]
            if d:
                data[start] = d
        if len(data) == 0:  # Fully undefined sequence
            return _UndefinedSequenceData(size)
        # merge adjacent sequence segments
        end = -1
        previous = 0  # not needed here, but it keeps flake happy
        items = data.items()
        data = {}
        for start, seq in items:
            if end == start:
                data[previous] += seq
            else:
                data[start] = seq
                previous = start
            end = start + len(seq)
        if len(data) == 1:
            seq = data.get(0)
            if seq is not None and len(seq) == size:
                return seq  # Fully defined sequence; return bytes
        if step < 0:
            # use this after we drop Python 3.7:
            # data = {start: data[start] for start in reversed(data)}
            # use this as long as we support Python 3.7:
            data = {start: data[start] for start in reversed(list(data.keys()))}
        return _PartiallyDefinedSequenceData(size, data)

    # Integer index. Negative values count from the end; without this
    # normalisation no segment start could ever match a negative key, and
    # a defined position would be reported as undefined.
    index = key
    if index < 0:
        index += self._length
    if index < 0 or self._length <= index:
        raise IndexError("sequence index out of range")
    for start, seq in self._data.items():
        if start <= index < start + len(seq):
            return seq[index - start]
    # Report the position exactly as the caller requested it.
    raise UndefinedSequenceError("Sequence at position %d is undefined" % key)
{len(other) + start: seq for start, seq in self._data.items()} jpayne@68: else: jpayne@68: data = {0: other} jpayne@68: items = list(self._data.items()) jpayne@68: start, seq = items.pop(0) jpayne@68: if start == 0: jpayne@68: data[0] += seq jpayne@68: else: jpayne@68: data[len(other) + start] = seq jpayne@68: for start, seq in items: jpayne@68: data[len(other) + start] = seq jpayne@68: return _PartiallyDefinedSequenceData(length, data) jpayne@68: jpayne@68: def __mul__(self, other): jpayne@68: length = self._length jpayne@68: items = self._data.items() jpayne@68: data = {} jpayne@68: end = -1 jpayne@68: previous = 0 # not needed here, but it keeps flake happy jpayne@68: for i in range(other): jpayne@68: for start, seq in items: jpayne@68: start += i * length jpayne@68: if end == start: jpayne@68: data[previous] += seq jpayne@68: else: jpayne@68: data[start] = seq jpayne@68: previous = start jpayne@68: end = start + len(seq) jpayne@68: return _PartiallyDefinedSequenceData(length * other, data) jpayne@68: jpayne@68: def upper(self): jpayne@68: """Return an upper case copy of the sequence.""" jpayne@68: data = {start: seq.upper() for start, seq in self._data.items()} jpayne@68: return _PartiallyDefinedSequenceData(self._length, data) jpayne@68: jpayne@68: def lower(self): jpayne@68: """Return a lower case copy of the sequence.""" jpayne@68: data = {start: seq.lower() for start, seq in self._data.items()} jpayne@68: return _PartiallyDefinedSequenceData(self._length, data) jpayne@68: jpayne@68: def isupper(self): jpayne@68: """Return True if all ASCII characters in data are uppercase. jpayne@68: jpayne@68: If there are no cased characters, the method returns False. jpayne@68: """ jpayne@68: # Character case is irrelevant for an undefined sequence jpayne@68: raise UndefinedSequenceError("Sequence content is only partially defined") jpayne@68: jpayne@68: def islower(self): jpayne@68: """Return True if all ASCII characters in data are lowercase. 
jpayne@68: jpayne@68: If there are no cased characters, the method returns False. jpayne@68: """ jpayne@68: # Character case is irrelevant for an undefined sequence jpayne@68: raise UndefinedSequenceError("Sequence content is only partially defined") jpayne@68: jpayne@68: def translate(self, table, delete=b""): jpayne@68: """Return a copy with each character mapped by the given translation table. jpayne@68: jpayne@68: table jpayne@68: Translation table, which must be a bytes object of length 256. jpayne@68: jpayne@68: All characters occurring in the optional argument delete are removed. jpayne@68: The remaining characters are mapped through the given translation table. jpayne@68: """ jpayne@68: items = self._data.items() jpayne@68: data = {start: seq.translate(table, delete) for start, seq in items} jpayne@68: return _PartiallyDefinedSequenceData(self._length, data) jpayne@68: jpayne@68: def replace(self, old, new): jpayne@68: """Return a copy with all occurrences of substring old replaced by new.""" jpayne@68: # Replacing substring old by new in the undefined sequence segments jpayne@68: # will result in an undefined sequence segment of the same length, if jpayne@68: # old and new have the number of characters. If not, an error is raised, jpayne@68: # as the correct start positions cannot be calculated reliably. 
jpayne@68: if len(old) != len(new): jpayne@68: raise UndefinedSequenceError( jpayne@68: "Sequence content is only partially defined; substring \n" jpayne@68: "replacement cannot be performed reliably" jpayne@68: ) jpayne@68: items = self._data.items() jpayne@68: data = {start: seq.replace(old, new) for start, seq in items} jpayne@68: return _PartiallyDefinedSequenceData(self._length, data) jpayne@68: jpayne@68: @property jpayne@68: def defined(self): jpayne@68: """Return False, as the sequence is not fully defined and has a non-zero length.""" jpayne@68: return False jpayne@68: jpayne@68: @property jpayne@68: def defined_ranges(self): jpayne@68: """Return a tuple of the ranges where the sequence contents is defined. jpayne@68: jpayne@68: The return value has the format ((start1, end1), (start2, end2), ...). jpayne@68: """ jpayne@68: return tuple((start, start + len(seq)) for start, seq in self._data.items()) jpayne@68: jpayne@68: jpayne@68: # The transcribe, backward_transcribe, and translate functions are jpayne@68: # user-friendly versions of the corresponding Seq/MutableSeq methods. jpayne@68: # The functions work both on Seq objects, and on strings. jpayne@68: jpayne@68: jpayne@68: def transcribe(dna): jpayne@68: """Transcribe a DNA sequence into RNA. jpayne@68: jpayne@68: Following the usual convention, the sequence is interpreted as the jpayne@68: coding strand of the DNA double helix, not the template strand. This jpayne@68: means we can get the RNA sequence just by switching T to U. jpayne@68: jpayne@68: If given a string, returns a new string object. jpayne@68: jpayne@68: Given a Seq or MutableSeq, returns a new Seq object. jpayne@68: jpayne@68: e.g. 
jpayne@68: jpayne@68: >>> transcribe("ACTGN") jpayne@68: 'ACUGN' jpayne@68: """ jpayne@68: if isinstance(dna, Seq): jpayne@68: return dna.transcribe() jpayne@68: elif isinstance(dna, MutableSeq): jpayne@68: return Seq(dna).transcribe() jpayne@68: else: jpayne@68: return dna.replace("T", "U").replace("t", "u") jpayne@68: jpayne@68: jpayne@68: def back_transcribe(rna): jpayne@68: """Return the RNA sequence back-transcribed into DNA. jpayne@68: jpayne@68: If given a string, returns a new string object. jpayne@68: jpayne@68: Given a Seq or MutableSeq, returns a new Seq object. jpayne@68: jpayne@68: e.g. jpayne@68: jpayne@68: >>> back_transcribe("ACUGN") jpayne@68: 'ACTGN' jpayne@68: """ jpayne@68: if isinstance(rna, Seq): jpayne@68: return rna.back_transcribe() jpayne@68: elif isinstance(rna, MutableSeq): jpayne@68: return Seq(rna).back_transcribe() jpayne@68: else: jpayne@68: return rna.replace("U", "T").replace("u", "t") jpayne@68: jpayne@68: jpayne@68: def _translate_str( jpayne@68: sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None jpayne@68: ): jpayne@68: """Translate nucleotide string into a protein string (PRIVATE). jpayne@68: jpayne@68: Arguments: jpayne@68: - sequence - a string jpayne@68: - table - Which codon table to use? This can be either a name (string), jpayne@68: an NCBI identifier (integer), or a CodonTable object (useful for jpayne@68: non-standard genetic codes). This defaults to the "Standard" table. jpayne@68: - stop_symbol - a single character string, what to use for terminators. jpayne@68: - to_stop - boolean, should translation terminate at the first jpayne@68: in frame stop codon? If there is no in-frame stop codon jpayne@68: then translation continues to the end. jpayne@68: - pos_stop - a single character string for a possible stop codon jpayne@68: (e.g. TAN or NNN) jpayne@68: - cds - Boolean, indicates this is a complete CDS. 
If True, this jpayne@68: checks the sequence starts with a valid alternative start jpayne@68: codon (which will be translated as methionine, M), that the jpayne@68: sequence length is a multiple of three, and that there is a jpayne@68: single in frame stop codon at the end (this will be excluded jpayne@68: from the protein sequence, regardless of the to_stop option). jpayne@68: If these tests fail, an exception is raised. jpayne@68: - gap - Single character string to denote symbol used for gaps. jpayne@68: Defaults to None. jpayne@68: jpayne@68: Returns a string. jpayne@68: jpayne@68: e.g. jpayne@68: jpayne@68: >>> from Bio.Data import CodonTable jpayne@68: >>> table = CodonTable.ambiguous_dna_by_id[1] jpayne@68: >>> _translate_str("AAA", table) jpayne@68: 'K' jpayne@68: >>> _translate_str("TAR", table) jpayne@68: '*' jpayne@68: >>> _translate_str("TAN", table) jpayne@68: 'X' jpayne@68: >>> _translate_str("TAN", table, pos_stop="@") jpayne@68: '@' jpayne@68: >>> _translate_str("TA?", table) jpayne@68: Traceback (most recent call last): jpayne@68: ... jpayne@68: Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid jpayne@68: jpayne@68: In a change to older versions of Biopython, partial codons are now jpayne@68: always regarded as an error (previously only checked if cds=True) jpayne@68: and will trigger a warning (likely to become an exception in a jpayne@68: future release). jpayne@68: jpayne@68: If **cds=True**, the start and stop codons are checked, and the start jpayne@68: codon will be translated at methionine. The sequence must be an jpayne@68: while number of codons. jpayne@68: jpayne@68: >>> _translate_str("ATGCCCTAG", table, cds=True) jpayne@68: 'MP' jpayne@68: >>> _translate_str("AAACCCTAG", table, cds=True) jpayne@68: Traceback (most recent call last): jpayne@68: ... 
jpayne@68: Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon jpayne@68: >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) jpayne@68: Traceback (most recent call last): jpayne@68: ... jpayne@68: Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found. jpayne@68: """ jpayne@68: try: jpayne@68: table_id = int(table) jpayne@68: except ValueError: jpayne@68: # Assume it's a table name jpayne@68: # The same table can be used for RNA or DNA jpayne@68: try: jpayne@68: codon_table = CodonTable.ambiguous_generic_by_name[table] jpayne@68: except KeyError: jpayne@68: if isinstance(table, str): jpayne@68: raise ValueError( jpayne@68: "The Bio.Seq translate methods and function DO NOT " jpayne@68: "take a character string mapping table like the python " jpayne@68: "string object's translate method. " jpayne@68: "Use str(my_seq).translate(...) instead." jpayne@68: ) from None jpayne@68: else: jpayne@68: raise TypeError("table argument must be integer or string") from None jpayne@68: except (AttributeError, TypeError): jpayne@68: # Assume it's a CodonTable object jpayne@68: if isinstance(table, CodonTable.CodonTable): jpayne@68: codon_table = table jpayne@68: else: jpayne@68: raise ValueError("Bad table argument") from None jpayne@68: else: jpayne@68: # Assume it's a table ID jpayne@68: # The same table can be used for RNA or DNA jpayne@68: codon_table = CodonTable.ambiguous_generic_by_id[table_id] jpayne@68: sequence = sequence.upper() jpayne@68: amino_acids = [] jpayne@68: forward_table = codon_table.forward_table jpayne@68: stop_codons = codon_table.stop_codons jpayne@68: if codon_table.nucleotide_alphabet is not None: jpayne@68: valid_letters = set(codon_table.nucleotide_alphabet.upper()) jpayne@68: else: jpayne@68: # Assume the worst case, ambiguous DNA or RNA: jpayne@68: valid_letters = set( jpayne@68: IUPACData.ambiguous_dna_letters.upper() jpayne@68: + IUPACData.ambiguous_rna_letters.upper() jpayne@68: ) jpayne@68: n = 
len(sequence) jpayne@68: jpayne@68: # Check for tables with 'ambiguous' (dual-coding) stop codons: jpayne@68: dual_coding = [c for c in stop_codons if c in forward_table] jpayne@68: if dual_coding: jpayne@68: c = dual_coding[0] jpayne@68: if to_stop: jpayne@68: raise ValueError( jpayne@68: "You cannot use 'to_stop=True' with this table as it contains" jpayne@68: f" {len(dual_coding)} codon(s) which can be both STOP and an" jpayne@68: f" amino acid (e.g. '{c}' -> '{forward_table[c]}' or STOP)." jpayne@68: ) jpayne@68: warnings.warn( jpayne@68: f"This table contains {len(dual_coding)} codon(s) which code(s) for" jpayne@68: f" both STOP and an amino acid (e.g. '{c}' -> '{forward_table[c]}'" jpayne@68: " or STOP). Such codons will be translated as amino acid.", jpayne@68: BiopythonWarning, jpayne@68: ) jpayne@68: jpayne@68: if cds: jpayne@68: if str(sequence[:3]).upper() not in codon_table.start_codons: jpayne@68: raise CodonTable.TranslationError( jpayne@68: f"First codon '{sequence[:3]}' is not a start codon" jpayne@68: ) jpayne@68: if n % 3 != 0: jpayne@68: raise CodonTable.TranslationError( jpayne@68: f"Sequence length {n} is not a multiple of three" jpayne@68: ) jpayne@68: if str(sequence[-3:]).upper() not in stop_codons: jpayne@68: raise CodonTable.TranslationError( jpayne@68: f"Final codon '{sequence[-3:]}' is not a stop codon" jpayne@68: ) jpayne@68: # Don't translate the stop symbol, and manually translate the M jpayne@68: sequence = sequence[3:-3] jpayne@68: n -= 6 jpayne@68: amino_acids = ["M"] jpayne@68: elif n % 3 != 0: jpayne@68: warnings.warn( jpayne@68: "Partial codon, len(sequence) not a multiple of three. " jpayne@68: "Explicitly trim the sequence or add trailing N before " jpayne@68: "translation. 
This may become an error in future.", jpayne@68: BiopythonWarning, jpayne@68: ) jpayne@68: if gap is not None: jpayne@68: if not isinstance(gap, str): jpayne@68: raise TypeError("Gap character should be a single character string.") jpayne@68: elif len(gap) > 1: jpayne@68: raise ValueError("Gap character should be a single character string.") jpayne@68: jpayne@68: for i in range(0, n - n % 3, 3): jpayne@68: codon = sequence[i : i + 3] jpayne@68: try: jpayne@68: amino_acids.append(forward_table[codon]) jpayne@68: except (KeyError, CodonTable.TranslationError): jpayne@68: if codon in codon_table.stop_codons: jpayne@68: if cds: jpayne@68: raise CodonTable.TranslationError( jpayne@68: f"Extra in frame stop codon '{codon}' found." jpayne@68: ) from None jpayne@68: if to_stop: jpayne@68: break jpayne@68: amino_acids.append(stop_symbol) jpayne@68: elif valid_letters.issuperset(set(codon)): jpayne@68: # Possible stop codon (e.g. NNN or TAN) jpayne@68: amino_acids.append(pos_stop) jpayne@68: elif gap is not None and codon == gap * 3: jpayne@68: # Gapped translation jpayne@68: amino_acids.append(gap) jpayne@68: else: jpayne@68: raise CodonTable.TranslationError( jpayne@68: f"Codon '{codon}' is invalid" jpayne@68: ) from None jpayne@68: return "".join(amino_acids) jpayne@68: jpayne@68: jpayne@68: def translate( jpayne@68: sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None jpayne@68: ): jpayne@68: """Translate a nucleotide sequence into amino acids. jpayne@68: jpayne@68: If given a string, returns a new string object. Given a Seq or jpayne@68: MutableSeq, returns a Seq object. jpayne@68: jpayne@68: Arguments: jpayne@68: - table - Which codon table to use? This can be either a name jpayne@68: (string), an NCBI identifier (integer), or a CodonTable object jpayne@68: (useful for non-standard genetic codes). Defaults to the "Standard" jpayne@68: table. 
jpayne@68: - stop_symbol - Single character string, what to use for any jpayne@68: terminators, defaults to the asterisk, "*". jpayne@68: - to_stop - Boolean, defaults to False meaning do a full jpayne@68: translation continuing on past any stop codons jpayne@68: (translated as the specified stop_symbol). If jpayne@68: True, translation is terminated at the first in jpayne@68: frame stop codon (and the stop_symbol is not jpayne@68: appended to the returned protein sequence). jpayne@68: - cds - Boolean, indicates this is a complete CDS. If True, this jpayne@68: checks the sequence starts with a valid alternative start jpayne@68: codon (which will be translated as methionine, M), that the jpayne@68: sequence length is a multiple of three, and that there is a jpayne@68: single in frame stop codon at the end (this will be excluded jpayne@68: from the protein sequence, regardless of the to_stop option). jpayne@68: If these tests fail, an exception is raised. jpayne@68: - gap - Single character string to denote symbol used for gaps. jpayne@68: Defaults to None. 
jpayne@68: jpayne@68: A simple string example using the default (standard) genetic code: jpayne@68: jpayne@68: >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" jpayne@68: >>> translate(coding_dna) jpayne@68: 'VAIVMGR*KGAR*' jpayne@68: >>> translate(coding_dna, stop_symbol="@") jpayne@68: 'VAIVMGR@KGAR@' jpayne@68: >>> translate(coding_dna, to_stop=True) jpayne@68: 'VAIVMGR' jpayne@68: jpayne@68: Now using NCBI table 2, where TGA is not a stop codon: jpayne@68: jpayne@68: >>> translate(coding_dna, table=2) jpayne@68: 'VAIVMGRWKGAR*' jpayne@68: >>> translate(coding_dna, table=2, to_stop=True) jpayne@68: 'VAIVMGRWKGAR' jpayne@68: jpayne@68: In fact this example uses an alternative start codon valid under NCBI jpayne@68: table 2, GTG, which means this example is a complete valid CDS which jpayne@68: when translated should really start with methionine (not valine): jpayne@68: jpayne@68: >>> translate(coding_dna, table=2, cds=True) jpayne@68: 'MAIVMGRWKGAR' jpayne@68: jpayne@68: Note that if the sequence has no in-frame stop codon, then the to_stop jpayne@68: argument has no effect: jpayne@68: jpayne@68: >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" jpayne@68: >>> translate(coding_dna2) jpayne@68: 'VAIVMGR' jpayne@68: >>> translate(coding_dna2, to_stop=True) jpayne@68: 'VAIVMGR' jpayne@68: jpayne@68: NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid jpayne@68: or a stop codon. These are translated as "X". Any invalid codon jpayne@68: (e.g. "TA?" or "T-A") will throw a TranslationError. jpayne@68: jpayne@68: It will however translate either DNA or RNA. jpayne@68: jpayne@68: NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous jpayne@68: stop codons'. These are stop codons with unambiguous sequence but which jpayne@68: have a context dependent coding as STOP or as amino acid. With these tables jpayne@68: 'to_stop' must be False (otherwise a ValueError is raised). 
The dual jpayne@68: coding codons will always be translated as amino acid, except for jpayne@68: 'cds=True', where the last codon will be translated as STOP. jpayne@68: jpayne@68: >>> coding_dna3 = "ATGGCACGGAAGTGA" jpayne@68: >>> translate(coding_dna3) jpayne@68: 'MARK*' jpayne@68: jpayne@68: >>> translate(coding_dna3, table=27) # Table 27: TGA -> STOP or W jpayne@68: 'MARKW' jpayne@68: jpayne@68: It will however raise a BiopythonWarning (not shown). jpayne@68: jpayne@68: >>> translate(coding_dna3, table=27, cds=True) jpayne@68: 'MARK' jpayne@68: jpayne@68: >>> translate(coding_dna3, table=27, to_stop=True) jpayne@68: Traceback (most recent call last): jpayne@68: ... jpayne@68: ValueError: You cannot use 'to_stop=True' with this table ... jpayne@68: """ jpayne@68: if isinstance(sequence, Seq): jpayne@68: return sequence.translate(table, stop_symbol, to_stop, cds) jpayne@68: elif isinstance(sequence, MutableSeq): jpayne@68: # Return a Seq object jpayne@68: return Seq(sequence).translate(table, stop_symbol, to_stop, cds) jpayne@68: else: jpayne@68: # Assume it's a string, return a string jpayne@68: return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap) jpayne@68: jpayne@68: jpayne@68: def reverse_complement(sequence, inplace=False): jpayne@68: """Return the reverse complement as a DNA sequence. jpayne@68: jpayne@68: If given a string, returns a new string object. jpayne@68: Given a Seq object, returns a new Seq object. jpayne@68: Given a MutableSeq, returns a new MutableSeq object. jpayne@68: Given a SeqRecord object, returns a new SeqRecord object. 
jpayne@68: jpayne@68: >>> my_seq = "CGA" jpayne@68: >>> reverse_complement(my_seq) jpayne@68: 'TCG' jpayne@68: >>> my_seq = Seq("CGA") jpayne@68: >>> reverse_complement(my_seq) jpayne@68: Seq('TCG') jpayne@68: >>> my_seq = MutableSeq("CGA") jpayne@68: >>> reverse_complement(my_seq) jpayne@68: MutableSeq('TCG') jpayne@68: >>> my_seq jpayne@68: MutableSeq('CGA') jpayne@68: jpayne@68: Any U in the sequence is treated as a T: jpayne@68: jpayne@68: >>> reverse_complement(Seq("CGAUT")) jpayne@68: Seq('AATCG') jpayne@68: jpayne@68: In contrast, ``reverse_complement_rna`` returns an RNA sequence: jpayne@68: jpayne@68: >>> reverse_complement_rna(Seq("CGAUT")) jpayne@68: Seq('AAUCG') jpayne@68: jpayne@68: Supports and lower- and upper-case characters, and unambiguous and jpayne@68: ambiguous nucleotides. All other characters are not converted: jpayne@68: jpayne@68: >>> reverse_complement("ACGTUacgtuXYZxyz") jpayne@68: 'zrxZRXaacgtAACGT' jpayne@68: jpayne@68: The sequence is modified in-place and returned if inplace is True: jpayne@68: jpayne@68: >>> my_seq = MutableSeq("CGA") jpayne@68: >>> reverse_complement(my_seq, inplace=True) jpayne@68: MutableSeq('TCG') jpayne@68: >>> my_seq jpayne@68: MutableSeq('TCG') jpayne@68: jpayne@68: As strings and ``Seq`` objects are immutable, a ``TypeError`` is jpayne@68: raised if ``reverse_complement`` is called on a ``Seq`` object with jpayne@68: ``inplace=True``. jpayne@68: """ jpayne@68: from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports jpayne@68: jpayne@68: if isinstance(sequence, (Seq, MutableSeq)): jpayne@68: return sequence.reverse_complement(inplace) jpayne@68: if isinstance(sequence, SeqRecord): jpayne@68: if inplace: jpayne@68: raise TypeError("SeqRecords are immutable") jpayne@68: return sequence.reverse_complement() jpayne@68: # Assume it's a string. 
jpayne@68: if inplace: jpayne@68: raise TypeError("strings are immutable") jpayne@68: sequence = sequence.encode("ASCII") jpayne@68: sequence = sequence.translate(_dna_complement_table) jpayne@68: sequence = sequence.decode("ASCII") jpayne@68: return sequence[::-1] jpayne@68: jpayne@68: jpayne@68: def reverse_complement_rna(sequence, inplace=False): jpayne@68: """Return the reverse complement as an RNA sequence. jpayne@68: jpayne@68: If given a string, returns a new string object. jpayne@68: Given a Seq object, returns a new Seq object. jpayne@68: Given a MutableSeq, returns a new MutableSeq object. jpayne@68: Given a SeqRecord object, returns a new SeqRecord object. jpayne@68: jpayne@68: >>> my_seq = "CGA" jpayne@68: >>> reverse_complement_rna(my_seq) jpayne@68: 'UCG' jpayne@68: >>> my_seq = Seq("CGA") jpayne@68: >>> reverse_complement_rna(my_seq) jpayne@68: Seq('UCG') jpayne@68: >>> my_seq = MutableSeq("CGA") jpayne@68: >>> reverse_complement_rna(my_seq) jpayne@68: MutableSeq('UCG') jpayne@68: >>> my_seq jpayne@68: MutableSeq('CGA') jpayne@68: jpayne@68: Any T in the sequence is treated as a U: jpayne@68: jpayne@68: >>> reverse_complement_rna(Seq("CGAUT")) jpayne@68: Seq('AAUCG') jpayne@68: jpayne@68: In contrast, ``reverse_complement`` returns a DNA sequence: jpayne@68: jpayne@68: >>> reverse_complement(Seq("CGAUT"), inplace=False) jpayne@68: Seq('AATCG') jpayne@68: jpayne@68: Supports and lower- and upper-case characters, and unambiguous and jpayne@68: ambiguous nucleotides. 
All other characters are not converted: jpayne@68: jpayne@68: >>> reverse_complement_rna("ACGTUacgtuXYZxyz") jpayne@68: 'zrxZRXaacguAACGU' jpayne@68: jpayne@68: The sequence is modified in-place and returned if inplace is True: jpayne@68: jpayne@68: >>> my_seq = MutableSeq("CGA") jpayne@68: >>> reverse_complement_rna(my_seq, inplace=True) jpayne@68: MutableSeq('UCG') jpayne@68: >>> my_seq jpayne@68: MutableSeq('UCG') jpayne@68: jpayne@68: As strings and ``Seq`` objects are immutable, a ``TypeError`` is jpayne@68: raised if ``reverse_complement`` is called on a ``Seq`` object with jpayne@68: ``inplace=True``. jpayne@68: """ jpayne@68: from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports jpayne@68: jpayne@68: if isinstance(sequence, (Seq, MutableSeq)): jpayne@68: return sequence.reverse_complement_rna(inplace) jpayne@68: if isinstance(sequence, SeqRecord): jpayne@68: if inplace: jpayne@68: raise TypeError("SeqRecords are immutable") jpayne@68: return sequence.reverse_complement_rna() jpayne@68: # Assume it's a string. jpayne@68: if inplace: jpayne@68: raise TypeError("strings are immutable") jpayne@68: sequence = sequence.encode("ASCII") jpayne@68: sequence = sequence.translate(_rna_complement_table) jpayne@68: sequence = sequence.decode("ASCII") jpayne@68: return sequence[::-1] jpayne@68: jpayne@68: jpayne@68: def complement(sequence, inplace=False): jpayne@68: """Return the complement as a DNA sequence. jpayne@68: jpayne@68: If given a string, returns a new string object. jpayne@68: Given a Seq object, returns a new Seq object. jpayne@68: Given a MutableSeq, returns a new MutableSeq object. jpayne@68: Given a SeqRecord object, returns a new SeqRecord object. 
jpayne@68: jpayne@68: >>> my_seq = "CGA" jpayne@68: >>> complement(my_seq) jpayne@68: 'GCT' jpayne@68: >>> my_seq = Seq("CGA") jpayne@68: >>> complement(my_seq) jpayne@68: Seq('GCT') jpayne@68: >>> my_seq = MutableSeq("CGA") jpayne@68: >>> complement(my_seq) jpayne@68: MutableSeq('GCT') jpayne@68: >>> my_seq jpayne@68: MutableSeq('CGA') jpayne@68: jpayne@68: Any U in the sequence is treated as a T: jpayne@68: jpayne@68: >>> complement(Seq("CGAUT")) jpayne@68: Seq('GCTAA') jpayne@68: jpayne@68: In contrast, ``complement_rna`` returns an RNA sequence: jpayne@68: jpayne@68: >>> complement_rna(Seq("CGAUT")) jpayne@68: Seq('GCUAA') jpayne@68: jpayne@68: Supports and lower- and upper-case characters, and unambiguous and jpayne@68: ambiguous nucleotides. All other characters are not converted: jpayne@68: jpayne@68: >>> complement("ACGTUacgtuXYZxyz") jpayne@68: 'TGCAAtgcaaXRZxrz' jpayne@68: jpayne@68: The sequence is modified in-place and returned if inplace is True: jpayne@68: jpayne@68: >>> my_seq = MutableSeq("CGA") jpayne@68: >>> complement(my_seq, inplace=True) jpayne@68: MutableSeq('GCT') jpayne@68: >>> my_seq jpayne@68: MutableSeq('GCT') jpayne@68: jpayne@68: As strings and ``Seq`` objects are immutable, a ``TypeError`` is jpayne@68: raised if ``reverse_complement`` is called on a ``Seq`` object with jpayne@68: ``inplace=True``. jpayne@68: """ jpayne@68: from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports jpayne@68: jpayne@68: if isinstance(sequence, (Seq, MutableSeq)): jpayne@68: return sequence.complement(inplace) jpayne@68: if isinstance(sequence, SeqRecord): jpayne@68: if inplace: jpayne@68: raise TypeError("SeqRecords are immutable") jpayne@68: return sequence.complement() jpayne@68: # Assume it's a string. 
jpayne@68: if inplace is True: jpayne@68: raise TypeError("strings are immutable") jpayne@68: sequence = sequence.encode("ASCII") jpayne@68: sequence = sequence.translate(_dna_complement_table) jpayne@68: return sequence.decode("ASCII") jpayne@68: jpayne@68: jpayne@68: def complement_rna(sequence, inplace=False): jpayne@68: """Return the complement as an RNA sequence. jpayne@68: jpayne@68: If given a string, returns a new string object. jpayne@68: Given a Seq object, returns a new Seq object. jpayne@68: Given a MutableSeq, returns a new MutableSeq object. jpayne@68: Given a SeqRecord object, returns a new SeqRecord object. jpayne@68: jpayne@68: >>> my_seq = "CGA" jpayne@68: >>> complement_rna(my_seq) jpayne@68: 'GCU' jpayne@68: >>> my_seq = Seq("CGA") jpayne@68: >>> complement_rna(my_seq) jpayne@68: Seq('GCU') jpayne@68: >>> my_seq = MutableSeq("CGA") jpayne@68: >>> complement_rna(my_seq) jpayne@68: MutableSeq('GCU') jpayne@68: >>> my_seq jpayne@68: MutableSeq('CGA') jpayne@68: jpayne@68: Any T in the sequence is treated as a U: jpayne@68: jpayne@68: >>> complement_rna(Seq("CGAUT")) jpayne@68: Seq('GCUAA') jpayne@68: jpayne@68: In contrast, ``complement`` returns a DNA sequence: jpayne@68: jpayne@68: >>> complement(Seq("CGAUT")) jpayne@68: Seq('GCTAA') jpayne@68: jpayne@68: Supports and lower- and upper-case characters, and unambiguous and jpayne@68: ambiguous nucleotides. All other characters are not converted: jpayne@68: jpayne@68: >>> complement_rna("ACGTUacgtuXYZxyz") jpayne@68: 'UGCAAugcaaXRZxrz' jpayne@68: jpayne@68: The sequence is modified in-place and returned if inplace is True: jpayne@68: jpayne@68: >>> my_seq = MutableSeq("CGA") jpayne@68: >>> complement(my_seq, inplace=True) jpayne@68: MutableSeq('GCT') jpayne@68: >>> my_seq jpayne@68: MutableSeq('GCT') jpayne@68: jpayne@68: As strings and ``Seq`` objects are immutable, a ``TypeError`` is jpayne@68: raised if ``reverse_complement`` is called on a ``Seq`` object with jpayne@68: ``inplace=True``. 
jpayne@68: """ jpayne@68: from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports jpayne@68: jpayne@68: if isinstance(sequence, (Seq, MutableSeq)): jpayne@68: return sequence.complement_rna(inplace) jpayne@68: if isinstance(sequence, SeqRecord): jpayne@68: if inplace: jpayne@68: raise TypeError("SeqRecords are immutable") jpayne@68: return sequence.complement_rna() jpayne@68: # Assume it's a string. jpayne@68: if inplace: jpayne@68: raise TypeError("strings are immutable") jpayne@68: sequence = sequence.encode("ASCII") jpayne@68: sequence = sequence.translate(_rna_complement_table) jpayne@68: return sequence.decode("ASCII") jpayne@68: jpayne@68: jpayne@68: def _test(): jpayne@68: """Run the Bio.Seq module's doctests (PRIVATE).""" jpayne@68: print("Running doctests...") jpayne@68: import doctest jpayne@68: jpayne@68: doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) jpayne@68: print("Done") jpayne@68: jpayne@68: jpayne@68: if __name__ == "__main__": jpayne@68: _test()