jpayne@69: # Copyright 2000 Andrew Dalke. jpayne@69: # Copyright 2000-2002 Brad Chapman. jpayne@69: # Copyright 2004-2005, 2010 by M de Hoon. jpayne@69: # Copyright 2007-2023 by Peter Cock. jpayne@69: # All rights reserved. jpayne@69: # jpayne@69: # This file is part of the Biopython distribution and governed by your jpayne@69: # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". jpayne@69: # Please see the LICENSE file that should have been included as part of this jpayne@69: # package. jpayne@69: """Provide objects to represent biological sequences. jpayne@69: jpayne@69: See also the Seq_ wiki and the chapter in our tutorial: jpayne@69: - `HTML Tutorial`_ jpayne@69: - `PDF Tutorial`_ jpayne@69: jpayne@69: .. _Seq: http://biopython.org/wiki/Seq jpayne@69: .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html jpayne@69: .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf jpayne@69: jpayne@69: """ jpayne@69: import array jpayne@69: import collections jpayne@69: import numbers jpayne@69: import warnings jpayne@69: jpayne@69: from abc import ABC jpayne@69: from abc import abstractmethod jpayne@69: from typing import overload, Optional, Union, Dict jpayne@69: jpayne@69: from Bio import BiopythonWarning jpayne@69: from Bio.Data import CodonTable jpayne@69: from Bio.Data import IUPACData jpayne@69: jpayne@69: jpayne@69: def _maketrans(complement_mapping): jpayne@69: """Make a python string translation table (PRIVATE). jpayne@69: jpayne@69: Arguments: jpayne@69: - complement_mapping - a dictionary such as ambiguous_dna_complement jpayne@69: and ambiguous_rna_complement from Data.IUPACData. jpayne@69: jpayne@69: Returns a translation table (a bytes object of length 256) for use with jpayne@69: the python string's translate method to use in a (reverse) complement. jpayne@69: jpayne@69: Compatible with lower case and upper case sequences. jpayne@69: jpayne@69: For internal use only. jpayne@69: """ jpayne@69: keys = "".join(complement_mapping.keys()).encode("ASCII") jpayne@69: values = "".join(complement_mapping.values()).encode("ASCII") jpayne@69: return bytes.maketrans(keys + keys.lower(), values + values.lower()) jpayne@69: jpayne@69: jpayne@69: ambiguous_dna_complement = dict(IUPACData.ambiguous_dna_complement) jpayne@69: ambiguous_dna_complement["U"] = ambiguous_dna_complement["T"] jpayne@69: _dna_complement_table = _maketrans(ambiguous_dna_complement) jpayne@69: del ambiguous_dna_complement jpayne@69: ambiguous_rna_complement = dict(IUPACData.ambiguous_rna_complement) jpayne@69: ambiguous_rna_complement["T"] = ambiguous_rna_complement["U"] jpayne@69: _rna_complement_table = _maketrans(ambiguous_rna_complement) jpayne@69: del ambiguous_rna_complement jpayne@69: jpayne@69: jpayne@69: class SequenceDataAbstractBaseClass(ABC): jpayne@69: """Abstract base class for sequence content providers. jpayne@69: jpayne@69: Most users will not need to use this class. It is used internally as a base jpayne@69: class for sequence content provider classes such as _UndefinedSequenceData jpayne@69: defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO. jpayne@69: Instances of these classes can be used instead of a ``bytes`` object as the jpayne@69: data argument when creating a Seq object, and provide the sequence content jpayne@69: only when requested via ``__getitem__``. This allows lazy parsers to load jpayne@69: and parse sequence data from a file only for the requested sequence regions, jpayne@69: and _UndefinedSequenceData instances to raise an exception when undefined jpayne@69: sequence data are requested. jpayne@69: jpayne@69: Future implementations of lazy parsers that similarly provide on-demand jpayne@69: parsing of sequence data should use a subclass of this abstract class and jpayne@69: implement the abstract methods ``__len__`` and ``__getitem__``: jpayne@69: jpayne@69: * ``__len__`` must return the sequence length; jpayne@69: * ``__getitem__`` must return jpayne@69: jpayne@69: * a ``bytes`` object for the requested region; or jpayne@69: * a new instance of the subclass for the requested region; or jpayne@69: * raise an ``UndefinedSequenceError``. jpayne@69: jpayne@69: Calling ``__getitem__`` for a sequence region of size zero should always jpayne@69: return an empty ``bytes`` object. jpayne@69: Calling ``__getitem__`` for the full sequence (as in data[:]) should jpayne@69: either return a ``bytes`` object with the full sequence, or raise an jpayne@69: ``UndefinedSequenceError``. jpayne@69: jpayne@69: Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()`` jpayne@69: as part of their ``__init__`` method. jpayne@69: """ jpayne@69: jpayne@69: __slots__ = () jpayne@69: jpayne@69: def __init__(self): jpayne@69: """Check if ``__getitem__`` returns a bytes-like object.""" jpayne@69: assert self[:0] == b"" jpayne@69: jpayne@69: @abstractmethod jpayne@69: def __len__(self): jpayne@69: pass jpayne@69: jpayne@69: @abstractmethod jpayne@69: def __getitem__(self, key): jpayne@69: pass jpayne@69: jpayne@69: def __bytes__(self): jpayne@69: return self[:] jpayne@69: jpayne@69: def __hash__(self): jpayne@69: return hash(bytes(self)) jpayne@69: jpayne@69: def __eq__(self, other): jpayne@69: return bytes(self) == other jpayne@69: jpayne@69: def __lt__(self, other): jpayne@69: return bytes(self) < other jpayne@69: jpayne@69: def __le__(self, other): jpayne@69: return bytes(self) <= other jpayne@69: jpayne@69: def __gt__(self, other): jpayne@69: return bytes(self) > other jpayne@69: jpayne@69: def __ge__(self, other): jpayne@69: return bytes(self) >= other jpayne@69: jpayne@69: def __add__(self, other): jpayne@69: try: jpayne@69: return bytes(self) + bytes(other) jpayne@69: except UndefinedSequenceError: jpayne@69: return NotImplemented jpayne@69: # will be handled by _UndefinedSequenceData.__radd__ or jpayne@69: # by _PartiallyDefinedSequenceData.__radd__ jpayne@69: jpayne@69: def __radd__(self, other): jpayne@69: return other + bytes(self) jpayne@69: jpayne@69: def __mul__(self, other): jpayne@69: return other * bytes(self) jpayne@69: jpayne@69: def __contains__(self, item): jpayne@69: return bytes(self).__contains__(item) jpayne@69: jpayne@69: def decode(self, encoding="utf-8"): jpayne@69: """Decode the data as bytes using the codec registered for encoding. jpayne@69: jpayne@69: encoding jpayne@69: The encoding with which to decode the bytes. jpayne@69: """ jpayne@69: return bytes(self).decode(encoding) jpayne@69: jpayne@69: def count(self, sub, start=None, end=None): jpayne@69: """Return the number of non-overlapping occurrences of sub in data[start:end]. jpayne@69: jpayne@69: Optional arguments start and end are interpreted as in slice notation. jpayne@69: This method behaves as the count method of Python strings. jpayne@69: """ jpayne@69: return bytes(self).count(sub, start, end) jpayne@69: jpayne@69: def find(self, sub, start=None, end=None): jpayne@69: """Return the lowest index in data where subsection sub is found. jpayne@69: jpayne@69: Return the lowest index in data where subsection sub is found, jpayne@69: such that sub is contained within data[start,end]. Optional jpayne@69: arguments start and end are interpreted as in slice notation. jpayne@69: jpayne@69: Return -1 on failure. jpayne@69: """ jpayne@69: return bytes(self).find(sub, start, end) jpayne@69: jpayne@69: def rfind(self, sub, start=None, end=None): jpayne@69: """Return the highest index in data where subsection sub is found. jpayne@69: jpayne@69: Return the highest index in data where subsection sub is found, jpayne@69: such that sub is contained within data[start,end]. Optional jpayne@69: arguments start and end are interpreted as in slice notation. jpayne@69: jpayne@69: Return -1 on failure. jpayne@69: """ jpayne@69: return bytes(self).rfind(sub, start, end) jpayne@69: jpayne@69: def index(self, sub, start=None, end=None): jpayne@69: """Return the lowest index in data where subsection sub is found. jpayne@69: jpayne@69: Return the lowest index in data where subsection sub is found, jpayne@69: such that sub is contained within data[start,end]. Optional jpayne@69: arguments start and end are interpreted as in slice notation. jpayne@69: jpayne@69: Raises ValueError when the subsection is not found. jpayne@69: """ jpayne@69: return bytes(self).index(sub, start, end) jpayne@69: jpayne@69: def rindex(self, sub, start=None, end=None): jpayne@69: """Return the highest index in data where subsection sub is found. jpayne@69: jpayne@69: Return the highest index in data where subsection sub is found, jpayne@69: such that sub is contained within data[start,end]. Optional jpayne@69: arguments start and end are interpreted as in slice notation. jpayne@69: jpayne@69: Raise ValueError when the subsection is not found. jpayne@69: """ jpayne@69: return bytes(self).rindex(sub, start, end) jpayne@69: jpayne@69: def startswith(self, prefix, start=None, end=None): jpayne@69: """Return True if data starts with the specified prefix, False otherwise. jpayne@69: jpayne@69: With optional start, test data beginning at that position. jpayne@69: With optional end, stop comparing data at that position. jpayne@69: prefix can also be a tuple of bytes to try. jpayne@69: """ jpayne@69: return bytes(self).startswith(prefix, start, end) jpayne@69: jpayne@69: def endswith(self, suffix, start=None, end=None): jpayne@69: """Return True if data ends with the specified suffix, False otherwise. jpayne@69: jpayne@69: With optional start, test data beginning at that position. jpayne@69: With optional end, stop comparing data at that position. jpayne@69: suffix can also be a tuple of bytes to try. jpayne@69: """ jpayne@69: return bytes(self).endswith(suffix, start, end) jpayne@69: jpayne@69: def split(self, sep=None, maxsplit=-1): jpayne@69: """Return a list of the sections in the data, using sep as the delimiter. jpayne@69: jpayne@69: sep jpayne@69: The delimiter according which to split the data. jpayne@69: None (the default value) means split on ASCII whitespace characters jpayne@69: (space, tab, return, newline, formfeed, vertical tab). jpayne@69: maxsplit jpayne@69: Maximum number of splits to do. jpayne@69: -1 (the default value) means no limit. jpayne@69: """ jpayne@69: return bytes(self).split(sep, maxsplit) jpayne@69: jpayne@69: def rsplit(self, sep=None, maxsplit=-1): jpayne@69: """Return a list of the sections in the data, using sep as the delimiter. jpayne@69: jpayne@69: sep jpayne@69: The delimiter according which to split the data. jpayne@69: None (the default value) means split on ASCII whitespace characters jpayne@69: (space, tab, return, newline, formfeed, vertical tab). jpayne@69: maxsplit jpayne@69: Maximum number of splits to do. jpayne@69: -1 (the default value) means no limit. jpayne@69: jpayne@69: Splitting is done starting at the end of the data and working to the front. jpayne@69: """ jpayne@69: return bytes(self).rsplit(sep, maxsplit) jpayne@69: jpayne@69: def strip(self, chars=None): jpayne@69: """Strip leading and trailing characters contained in the argument. jpayne@69: jpayne@69: If the argument is omitted or None, strip leading and trailing ASCII whitespace. jpayne@69: """ jpayne@69: return bytes(self).strip(chars) jpayne@69: jpayne@69: def lstrip(self, chars=None): jpayne@69: """Strip leading characters contained in the argument. jpayne@69: jpayne@69: If the argument is omitted or None, strip leading ASCII whitespace. jpayne@69: """ jpayne@69: return bytes(self).lstrip(chars) jpayne@69: jpayne@69: def rstrip(self, chars=None): jpayne@69: """Strip trailing characters contained in the argument. jpayne@69: jpayne@69: If the argument is omitted or None, strip trailing ASCII whitespace. jpayne@69: """ jpayne@69: return bytes(self).rstrip(chars) jpayne@69: jpayne@69: def removeprefix(self, prefix): jpayne@69: """Remove the prefix if present.""" jpayne@69: # Want to do just this, but need Python 3.9+ jpayne@69: # return bytes(self).removeprefix(prefix) jpayne@69: data = bytes(self) jpayne@69: try: jpayne@69: return data.removeprefix(prefix) jpayne@69: except AttributeError: jpayne@69: if data.startswith(prefix): jpayne@69: return data[len(prefix) :] jpayne@69: else: jpayne@69: return data jpayne@69: jpayne@69: def removesuffix(self, suffix): jpayne@69: """Remove the suffix if present.""" jpayne@69: # Want to do just this, but need Python 3.9+ jpayne@69: # return bytes(self).removesuffix(suffix) jpayne@69: data = bytes(self) jpayne@69: try: jpayne@69: return data.removesuffix(suffix) jpayne@69: except AttributeError: jpayne@69: if data.startswith(suffix): jpayne@69: return data[: -len(suffix)] jpayne@69: else: jpayne@69: return data jpayne@69: jpayne@69: def upper(self): jpayne@69: """Return a copy of data with all ASCII characters converted to uppercase.""" jpayne@69: return bytes(self).upper() jpayne@69: jpayne@69: def lower(self): jpayne@69: """Return a copy of data with all ASCII characters converted to lowercase.""" jpayne@69: return bytes(self).lower() jpayne@69: jpayne@69: def isupper(self): jpayne@69: """Return True if all ASCII characters in data are uppercase. jpayne@69: jpayne@69: If there are no cased characters, the method returns False. jpayne@69: """ jpayne@69: return bytes(self).isupper() jpayne@69: jpayne@69: def islower(self): jpayne@69: """Return True if all ASCII characters in data are lowercase. jpayne@69: jpayne@69: If there are no cased characters, the method returns False. jpayne@69: """ jpayne@69: return bytes(self).islower() jpayne@69: jpayne@69: def replace(self, old, new): jpayne@69: """Return a copy with all occurrences of substring old replaced by new.""" jpayne@69: return bytes(self).replace(old, new) jpayne@69: jpayne@69: def translate(self, table, delete=b""): jpayne@69: """Return a copy with each character mapped by the given translation table. jpayne@69: jpayne@69: table jpayne@69: Translation table, which must be a bytes object of length 256. jpayne@69: jpayne@69: All characters occurring in the optional argument delete are removed. jpayne@69: The remaining characters are mapped through the given translation table. jpayne@69: """ jpayne@69: return bytes(self).translate(table, delete) jpayne@69: jpayne@69: @property jpayne@69: def defined(self): jpayne@69: """Return True if the sequence is defined, False if undefined or partially defined. jpayne@69: jpayne@69: Zero-length sequences are always considered to be defined. jpayne@69: """ jpayne@69: return True jpayne@69: jpayne@69: @property jpayne@69: def defined_ranges(self): jpayne@69: """Return a tuple of the ranges where the sequence contents is defined. jpayne@69: jpayne@69: The return value has the format ((start1, end1), (start2, end2), ...). jpayne@69: """ jpayne@69: length = len(self) jpayne@69: if length > 0: jpayne@69: return ((0, length),) jpayne@69: else: jpayne@69: return () jpayne@69: jpayne@69: jpayne@69: class _SeqAbstractBaseClass(ABC): jpayne@69: """Abstract base class for the Seq and MutableSeq classes (PRIVATE). jpayne@69: jpayne@69: Most users will not need to use this class. It is used internally as an jpayne@69: abstract base class for Seq and MutableSeq, as most of their methods are jpayne@69: identical. jpayne@69: """ jpayne@69: jpayne@69: __slots__ = ("_data",) jpayne@69: __array_ufunc__ = None # turn off numpy Ufuncs jpayne@69: jpayne@69: @abstractmethod jpayne@69: def __init__(self): jpayne@69: pass jpayne@69: jpayne@69: def __bytes__(self): jpayne@69: return bytes(self._data) jpayne@69: jpayne@69: def __repr__(self): jpayne@69: """Return (truncated) representation of the sequence.""" jpayne@69: data = self._data jpayne@69: if isinstance(data, _UndefinedSequenceData): jpayne@69: return f"Seq(None, length={len(self)})" jpayne@69: if isinstance(data, _PartiallyDefinedSequenceData): jpayne@69: d = {} jpayne@69: for position, seq in data._data.items(): jpayne@69: if len(seq) > 60: jpayne@69: start = seq[:54].decode("ASCII") jpayne@69: end = seq[-3:].decode("ASCII") jpayne@69: seq = f"{start}...{end}" jpayne@69: else: jpayne@69: seq = seq.decode("ASCII") jpayne@69: d[position] = seq jpayne@69: return "Seq(%r, length=%d)" % (d, len(self)) jpayne@69: if len(data) > 60: jpayne@69: # Shows the last three letters as it is often useful to see if jpayne@69: # there is a stop codon at the end of a sequence. jpayne@69: # Note total length is 54+3+3=60 jpayne@69: start = data[:54].decode("ASCII") jpayne@69: end = data[-3:].decode("ASCII") jpayne@69: return f"{self.__class__.__name__}('{start}...{end}')" jpayne@69: else: jpayne@69: data = data.decode("ASCII") jpayne@69: return f"{self.__class__.__name__}('{data}')" jpayne@69: jpayne@69: def __str__(self): jpayne@69: """Return the full sequence as a python string.""" jpayne@69: return self._data.decode("ASCII") jpayne@69: jpayne@69: def __eq__(self, other): jpayne@69: """Compare the sequence to another sequence or a string. jpayne@69: jpayne@69: Sequences are equal to each other if their sequence contents is jpayne@69: identical: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq, MutableSeq jpayne@69: >>> seq1 = Seq("ACGT") jpayne@69: >>> seq2 = Seq("ACGT") jpayne@69: >>> mutable_seq = MutableSeq("ACGT") jpayne@69: >>> seq1 == seq2 jpayne@69: True jpayne@69: >>> seq1 == mutable_seq jpayne@69: True jpayne@69: >>> seq1 == "ACGT" jpayne@69: True jpayne@69: jpayne@69: Note that the sequence objects themselves are not identical to each jpayne@69: other: jpayne@69: jpayne@69: >>> id(seq1) == id(seq2) jpayne@69: False jpayne@69: >>> seq1 is seq2 jpayne@69: False jpayne@69: jpayne@69: Sequences can also be compared to strings, ``bytes``, and ``bytearray`` jpayne@69: objects: jpayne@69: jpayne@69: >>> seq1 == "ACGT" jpayne@69: True jpayne@69: >>> seq1 == b"ACGT" jpayne@69: True jpayne@69: >>> seq1 == bytearray(b"ACGT") jpayne@69: True jpayne@69: """ jpayne@69: if isinstance(other, _SeqAbstractBaseClass): jpayne@69: return self._data == other._data jpayne@69: elif isinstance(other, str): jpayne@69: return self._data == other.encode("ASCII") jpayne@69: else: jpayne@69: return self._data == other jpayne@69: jpayne@69: def __lt__(self, other): jpayne@69: """Implement the less-than operand.""" jpayne@69: if isinstance(other, _SeqAbstractBaseClass): jpayne@69: return self._data < other._data jpayne@69: elif isinstance(other, str): jpayne@69: return self._data < other.encode("ASCII") jpayne@69: else: jpayne@69: return self._data < other jpayne@69: jpayne@69: def __le__(self, other): jpayne@69: """Implement the less-than or equal operand.""" jpayne@69: if isinstance(other, _SeqAbstractBaseClass): jpayne@69: return self._data <= other._data jpayne@69: elif isinstance(other, str): jpayne@69: return self._data <= other.encode("ASCII") jpayne@69: else: jpayne@69: return self._data <= other jpayne@69: jpayne@69: def __gt__(self, other): jpayne@69: """Implement the greater-than operand.""" jpayne@69: if isinstance(other, _SeqAbstractBaseClass): jpayne@69: return self._data > other._data jpayne@69: elif isinstance(other, str): jpayne@69: return self._data > other.encode("ASCII") jpayne@69: else: jpayne@69: return self._data > other jpayne@69: jpayne@69: def __ge__(self, other): jpayne@69: """Implement the greater-than or equal operand.""" jpayne@69: if isinstance(other, _SeqAbstractBaseClass): jpayne@69: return self._data >= other._data jpayne@69: elif isinstance(other, str): jpayne@69: return self._data >= other.encode("ASCII") jpayne@69: else: jpayne@69: return self._data >= other jpayne@69: jpayne@69: def __len__(self): jpayne@69: """Return the length of the sequence.""" jpayne@69: return len(self._data) jpayne@69: jpayne@69: def __iter__(self): jpayne@69: """Return an iterable of the sequence.""" jpayne@69: return self._data.decode("ASCII").__iter__() jpayne@69: jpayne@69: @overload jpayne@69: def __getitem__(self, index: int) -> str: jpayne@69: ... jpayne@69: jpayne@69: @overload jpayne@69: def __getitem__(self, index: slice) -> "Seq": jpayne@69: ... jpayne@69: jpayne@69: def __getitem__(self, index): jpayne@69: """Return a subsequence as a single letter or as a sequence object. jpayne@69: jpayne@69: If the index is an integer, a single letter is returned as a Python jpayne@69: string: jpayne@69: jpayne@69: >>> seq = Seq('ACTCGACGTCG') jpayne@69: >>> seq[5] jpayne@69: 'A' jpayne@69: jpayne@69: Otherwise, a new sequence object of the same class is returned: jpayne@69: jpayne@69: >>> seq[5:8] jpayne@69: Seq('ACG') jpayne@69: >>> mutable_seq = MutableSeq('ACTCGACGTCG') jpayne@69: >>> mutable_seq[5:8] jpayne@69: MutableSeq('ACG') jpayne@69: """ jpayne@69: if isinstance(index, numbers.Integral): jpayne@69: # Return a single letter as a string jpayne@69: return chr(self._data[index]) jpayne@69: else: jpayne@69: # Return the (sub)sequence as another Seq/MutableSeq object jpayne@69: return self.__class__(self._data[index]) jpayne@69: jpayne@69: def __add__(self, other): jpayne@69: """Add a sequence or string to this sequence. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq, MutableSeq jpayne@69: >>> Seq("MELKI") + "LV" jpayne@69: Seq('MELKILV') jpayne@69: >>> MutableSeq("MELKI") + "LV" jpayne@69: MutableSeq('MELKILV') jpayne@69: """ jpayne@69: if isinstance(other, _SeqAbstractBaseClass): jpayne@69: return self.__class__(self._data + other._data) jpayne@69: elif isinstance(other, str): jpayne@69: return self.__class__(self._data + other.encode("ASCII")) jpayne@69: else: jpayne@69: # If other is a SeqRecord, then SeqRecord's __radd__ will handle jpayne@69: # this. If not, returning NotImplemented will trigger a TypeError. jpayne@69: return NotImplemented jpayne@69: jpayne@69: def __radd__(self, other): jpayne@69: """Add a sequence string on the left. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq, MutableSeq jpayne@69: >>> "LV" + Seq("MELKI") jpayne@69: Seq('LVMELKI') jpayne@69: >>> "LV" + MutableSeq("MELKI") jpayne@69: MutableSeq('LVMELKI') jpayne@69: jpayne@69: Adding two sequence objects is handled via the __add__ method. jpayne@69: """ jpayne@69: if isinstance(other, str): jpayne@69: return self.__class__(other.encode("ASCII") + self._data) jpayne@69: else: jpayne@69: return NotImplemented jpayne@69: jpayne@69: def __mul__(self, other): jpayne@69: """Multiply sequence by integer. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq, MutableSeq jpayne@69: >>> Seq('ATG') * 2 jpayne@69: Seq('ATGATG') jpayne@69: >>> MutableSeq('ATG') * 2 jpayne@69: MutableSeq('ATGATG') jpayne@69: """ jpayne@69: if not isinstance(other, numbers.Integral): jpayne@69: raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type") jpayne@69: # we would like to simply write jpayne@69: # data = self._data * other jpayne@69: # here, but currently that causes a bug on PyPy if self._data is a jpayne@69: # bytearray and other is a numpy integer. Using this workaround: jpayne@69: data = self._data.__mul__(other) jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def __rmul__(self, other): jpayne@69: """Multiply integer by sequence. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> 2 * Seq('ATG') jpayne@69: Seq('ATGATG') jpayne@69: """ jpayne@69: if not isinstance(other, numbers.Integral): jpayne@69: raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type") jpayne@69: # we would like to simply write jpayne@69: # data = self._data * other jpayne@69: # here, but currently that causes a bug on PyPy if self._data is a jpayne@69: # bytearray and other is a numpy integer. Using this workaround: jpayne@69: data = self._data.__mul__(other) jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def __imul__(self, other): jpayne@69: """Multiply the sequence object by other and assign. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> seq = Seq('ATG') jpayne@69: >>> seq *= 2 jpayne@69: >>> seq jpayne@69: Seq('ATGATG') jpayne@69: jpayne@69: Note that this is different from in-place multiplication. The ``seq`` jpayne@69: variable is reassigned to the multiplication result, but any variable jpayne@69: pointing to ``seq`` will remain unchanged: jpayne@69: jpayne@69: >>> seq = Seq('ATG') jpayne@69: >>> seq2 = seq jpayne@69: >>> id(seq) == id(seq2) jpayne@69: True jpayne@69: >>> seq *= 2 jpayne@69: >>> seq jpayne@69: Seq('ATGATG') jpayne@69: >>> seq2 jpayne@69: Seq('ATG') jpayne@69: >>> id(seq) == id(seq2) jpayne@69: False jpayne@69: """ jpayne@69: if not isinstance(other, numbers.Integral): jpayne@69: raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type") jpayne@69: # we would like to simply write jpayne@69: # data = self._data * other jpayne@69: # here, but currently that causes a bug on PyPy if self._data is a jpayne@69: # bytearray and other is a numpy integer. Using this workaround: jpayne@69: data = self._data.__mul__(other) jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def count(self, sub, start=None, end=None): jpayne@69: """Return a non-overlapping count, like that of a python string. jpayne@69: jpayne@69: The number of occurrences of substring argument sub in the jpayne@69: (sub)sequence given by [start:end] is returned as an integer. jpayne@69: Optional arguments start and end are interpreted as in slice jpayne@69: notation. jpayne@69: jpayne@69: Arguments: jpayne@69: - sub - a string or another Seq object to look for jpayne@69: - start - optional integer, slice start jpayne@69: - end - optional integer, slice end jpayne@69: jpayne@69: e.g. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_seq = Seq("AAAATGA") jpayne@69: >>> print(my_seq.count("A")) jpayne@69: 5 jpayne@69: >>> print(my_seq.count("ATG")) jpayne@69: 1 jpayne@69: >>> print(my_seq.count(Seq("AT"))) jpayne@69: 1 jpayne@69: >>> print(my_seq.count("AT", 2, -1)) jpayne@69: 1 jpayne@69: jpayne@69: HOWEVER, please note because the ``count`` method of Seq and MutableSeq jpayne@69: objects, like that of Python strings, do a non-overlapping search, this jpayne@69: may not give the answer you expect: jpayne@69: jpayne@69: >>> "AAAA".count("AA") jpayne@69: 2 jpayne@69: >>> print(Seq("AAAA").count("AA")) jpayne@69: 2 jpayne@69: jpayne@69: For an overlapping search, use the ``count_overlap`` method: jpayne@69: jpayne@69: >>> print(Seq("AAAA").count_overlap("AA")) jpayne@69: 3 jpayne@69: """ jpayne@69: if isinstance(sub, MutableSeq): jpayne@69: sub = sub._data jpayne@69: elif isinstance(sub, Seq): jpayne@69: sub = bytes(sub) jpayne@69: elif isinstance(sub, str): jpayne@69: sub = sub.encode("ASCII") jpayne@69: elif not isinstance(sub, (bytes, bytearray)): jpayne@69: raise TypeError( jpayne@69: "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'" jpayne@69: % type(sub) jpayne@69: ) jpayne@69: return self._data.count(sub, start, end) jpayne@69: jpayne@69: def count_overlap(self, sub, start=None, end=None): jpayne@69: """Return an overlapping count. jpayne@69: jpayne@69: Returns an integer, the number of occurrences of substring jpayne@69: argument sub in the (sub)sequence given by [start:end]. jpayne@69: Optional arguments start and end are interpreted as in slice jpayne@69: notation. jpayne@69: jpayne@69: Arguments: jpayne@69: - sub - a string or another Seq object to look for jpayne@69: - start - optional integer, slice start jpayne@69: - end - optional integer, slice end jpayne@69: jpayne@69: e.g. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> print(Seq("AAAA").count_overlap("AA")) jpayne@69: 3 jpayne@69: >>> print(Seq("ATATATATA").count_overlap("ATA")) jpayne@69: 4 jpayne@69: >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1)) jpayne@69: 1 jpayne@69: jpayne@69: For a non-overlapping search, use the ``count`` method: jpayne@69: jpayne@69: >>> print(Seq("AAAA").count("AA")) jpayne@69: 2 jpayne@69: jpayne@69: Where substrings do not overlap, ``count_overlap`` behaves the same as jpayne@69: the ``count`` method: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_seq = Seq("AAAATGA") jpayne@69: >>> print(my_seq.count_overlap("A")) jpayne@69: 5 jpayne@69: >>> my_seq.count_overlap("A") == my_seq.count("A") jpayne@69: True jpayne@69: >>> print(my_seq.count_overlap("ATG")) jpayne@69: 1 jpayne@69: >>> my_seq.count_overlap("ATG") == my_seq.count("ATG") jpayne@69: True jpayne@69: >>> print(my_seq.count_overlap(Seq("AT"))) jpayne@69: 1 jpayne@69: >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT")) jpayne@69: True jpayne@69: >>> print(my_seq.count_overlap("AT", 2, -1)) jpayne@69: 1 jpayne@69: >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1) jpayne@69: True jpayne@69: jpayne@69: HOWEVER, do not use this method for such cases because the jpayne@69: count() method is much for efficient. jpayne@69: """ jpayne@69: if isinstance(sub, MutableSeq): jpayne@69: sub = sub._data jpayne@69: elif isinstance(sub, Seq): jpayne@69: sub = bytes(sub) jpayne@69: elif isinstance(sub, str): jpayne@69: sub = sub.encode("ASCII") jpayne@69: elif not isinstance(sub, (bytes, bytearray)): jpayne@69: raise TypeError( jpayne@69: "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'" jpayne@69: % type(sub) jpayne@69: ) jpayne@69: data = self._data jpayne@69: overlap_count = 0 jpayne@69: while True: jpayne@69: start = data.find(sub, start, end) + 1 jpayne@69: if start != 0: jpayne@69: overlap_count += 1 jpayne@69: else: jpayne@69: return overlap_count jpayne@69: jpayne@69: def __contains__(self, item): jpayne@69: """Return True if item is a subsequence of the sequence, and False otherwise. jpayne@69: jpayne@69: e.g. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq, MutableSeq jpayne@69: >>> my_dna = Seq("ATATGAAATTTGAAAA") jpayne@69: >>> "AAA" in my_dna jpayne@69: True jpayne@69: >>> Seq("AAA") in my_dna jpayne@69: True jpayne@69: >>> MutableSeq("AAA") in my_dna jpayne@69: True jpayne@69: """ jpayne@69: if isinstance(item, _SeqAbstractBaseClass): jpayne@69: item = bytes(item) jpayne@69: elif isinstance(item, str): jpayne@69: item = item.encode("ASCII") jpayne@69: return item in self._data jpayne@69: jpayne@69: def find(self, sub, start=None, end=None): jpayne@69: """Return the lowest index in the sequence where subsequence sub is found. jpayne@69: jpayne@69: With optional arguments start and end, return the lowest index in the jpayne@69: sequence such that the subsequence sub is contained within the sequence jpayne@69: region [start:end]. jpayne@69: jpayne@69: Arguments: jpayne@69: - sub - a string or another Seq or MutableSeq object to search for jpayne@69: - start - optional integer, slice start jpayne@69: - end - optional integer, slice end jpayne@69: jpayne@69: Returns -1 if the subsequence is NOT found. jpayne@69: jpayne@69: e.g. Locating the first typical start codon, AUG, in an RNA sequence: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") jpayne@69: >>> my_rna.find("AUG") jpayne@69: 3 jpayne@69: jpayne@69: The next typical start codon can then be found by starting the search jpayne@69: at position 4: jpayne@69: jpayne@69: >>> my_rna.find("AUG", 4) jpayne@69: 15 jpayne@69: jpayne@69: See the ``search`` method to find the locations of multiple subsequences jpayne@69: at the same time. jpayne@69: """ jpayne@69: if isinstance(sub, _SeqAbstractBaseClass): jpayne@69: sub = bytes(sub) jpayne@69: elif isinstance(sub, str): jpayne@69: sub = sub.encode("ASCII") jpayne@69: elif not isinstance(sub, (bytes, bytearray)): jpayne@69: raise TypeError( jpayne@69: "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'" jpayne@69: % type(sub) jpayne@69: ) jpayne@69: return self._data.find(sub, start, end) jpayne@69: jpayne@69: def rfind(self, sub, start=None, end=None): jpayne@69: """Return the highest index in the sequence where subsequence sub is found. jpayne@69: jpayne@69: With optional arguments start and end, return the highest index in the jpayne@69: sequence such that the subsequence sub is contained within the sequence jpayne@69: region [start:end]. jpayne@69: jpayne@69: Arguments: jpayne@69: - sub - a string or another Seq or MutableSeq object to search for jpayne@69: - start - optional integer, slice start jpayne@69: - end - optional integer, slice end jpayne@69: jpayne@69: Returns -1 if the subsequence is NOT found. jpayne@69: jpayne@69: e.g. Locating the last typical start codon, AUG, in an RNA sequence: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") jpayne@69: >>> my_rna.rfind("AUG") jpayne@69: 15 jpayne@69: jpayne@69: The location of the typical start codon before that can be found by jpayne@69: ending the search at position 15: jpayne@69: jpayne@69: >>> my_rna.rfind("AUG", end=15) jpayne@69: 3 jpayne@69: jpayne@69: See the ``search`` method to find the locations of multiple subsequences jpayne@69: at the same time. jpayne@69: """ jpayne@69: if isinstance(sub, _SeqAbstractBaseClass): jpayne@69: sub = bytes(sub) jpayne@69: elif isinstance(sub, str): jpayne@69: sub = sub.encode("ASCII") jpayne@69: elif not isinstance(sub, (bytes, bytearray)): jpayne@69: raise TypeError( jpayne@69: "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'" jpayne@69: % type(sub) jpayne@69: ) jpayne@69: return self._data.rfind(sub, start, end) jpayne@69: jpayne@69: def index(self, sub, start=None, end=None): jpayne@69: """Return the lowest index in the sequence where subsequence sub is found. jpayne@69: jpayne@69: With optional arguments start and end, return the lowest index in the jpayne@69: sequence such that the subsequence sub is contained within the sequence jpayne@69: region [start:end]. jpayne@69: jpayne@69: Arguments: jpayne@69: - sub - a string or another Seq or MutableSeq object to search for jpayne@69: - start - optional integer, slice start jpayne@69: - end - optional integer, slice end jpayne@69: jpayne@69: Raises a ValueError if the subsequence is NOT found. jpayne@69: jpayne@69: e.g. Locating the first typical start codon, AUG, in an RNA sequence: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") jpayne@69: >>> my_rna.index("AUG") jpayne@69: 3 jpayne@69: jpayne@69: The next typical start codon can then be found by starting the search jpayne@69: at position 4: jpayne@69: jpayne@69: >>> my_rna.index("AUG", 4) jpayne@69: 15 jpayne@69: jpayne@69: This method performs the same search as the ``find`` method. However, jpayne@69: if the subsequence is not found, ``find`` returns -1 while ``index`` jpayne@69: raises a ValueError: jpayne@69: jpayne@69: >>> my_rna.index("T") jpayne@69: Traceback (most recent call last): jpayne@69: ... jpayne@69: ValueError: ... jpayne@69: >>> my_rna.find("T") jpayne@69: -1 jpayne@69: jpayne@69: See the ``search`` method to find the locations of multiple subsequences jpayne@69: at the same time. jpayne@69: """ jpayne@69: if isinstance(sub, MutableSeq): jpayne@69: sub = sub._data jpayne@69: elif isinstance(sub, Seq): jpayne@69: sub = bytes(sub) jpayne@69: elif isinstance(sub, str): jpayne@69: sub = sub.encode("ASCII") jpayne@69: elif not isinstance(sub, (bytes, bytearray)): jpayne@69: raise TypeError( jpayne@69: "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'" jpayne@69: % type(sub) jpayne@69: ) jpayne@69: return self._data.index(sub, start, end) jpayne@69: jpayne@69: def rindex(self, sub, start=None, end=None): jpayne@69: """Return the highest index in the sequence where subsequence sub is found. jpayne@69: jpayne@69: With optional arguments start and end, return the highest index in the jpayne@69: sequence such that the subsequence sub is contained within the sequence jpayne@69: region [start:end]. jpayne@69: jpayne@69: Arguments: jpayne@69: - sub - a string or another Seq or MutableSeq object to search for jpayne@69: - start - optional integer, slice start jpayne@69: - end - optional integer, slice end jpayne@69: jpayne@69: Returns -1 if the subsequence is NOT found. jpayne@69: jpayne@69: e.g. Locating the last typical start codon, AUG, in an RNA sequence: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") jpayne@69: >>> my_rna.rindex("AUG") jpayne@69: 15 jpayne@69: jpayne@69: The location of the typical start codon before that can be found by jpayne@69: ending the search at position 15: jpayne@69: jpayne@69: >>> my_rna.rindex("AUG", end=15) jpayne@69: 3 jpayne@69: jpayne@69: This method performs the same search as the ``rfind`` method. However, jpayne@69: if the subsequence is not found, ``rfind`` returns -1 which ``rindex`` jpayne@69: raises a ValueError: jpayne@69: jpayne@69: >>> my_rna.rindex("T") jpayne@69: Traceback (most recent call last): jpayne@69: ... jpayne@69: ValueError: ... jpayne@69: >>> my_rna.rfind("T") jpayne@69: -1 jpayne@69: jpayne@69: See the ``search`` method to find the locations of multiple subsequences jpayne@69: at the same time. jpayne@69: """ jpayne@69: if isinstance(sub, MutableSeq): jpayne@69: sub = sub._data jpayne@69: elif isinstance(sub, Seq): jpayne@69: sub = bytes(sub) jpayne@69: elif isinstance(sub, str): jpayne@69: sub = sub.encode("ASCII") jpayne@69: elif not isinstance(sub, (bytes, bytearray)): jpayne@69: raise TypeError( jpayne@69: "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'" jpayne@69: % type(sub) jpayne@69: ) jpayne@69: return self._data.rindex(sub, start, end) jpayne@69: jpayne@69: def search(self, subs): jpayne@69: """Search the substrings subs in self and yield the index and substring found. jpayne@69: jpayne@69: Arguments: jpayne@69: - subs - a list of strings, Seq, MutableSeq, bytes, or bytearray jpayne@69: objects containing the substrings to search for. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG") jpayne@69: >>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")]) jpayne@69: >>> for index, substring in matches: jpayne@69: ... print(index, substring) jpayne@69: ... jpayne@69: 7 CC jpayne@69: 9 ATTG jpayne@69: 20 CC jpayne@69: 34 CC jpayne@69: 34 CCC jpayne@69: 35 CC jpayne@69: """ jpayne@69: subdict = collections.defaultdict(set) jpayne@69: for index, sub in enumerate(subs): jpayne@69: if isinstance(sub, (_SeqAbstractBaseClass, bytearray)): jpayne@69: sub = bytes(sub) jpayne@69: elif isinstance(sub, str): jpayne@69: sub = sub.encode("ASCII") jpayne@69: elif not isinstance(sub, bytes): jpayne@69: raise TypeError( jpayne@69: "subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'" jpayne@69: % (index, type(sub)) jpayne@69: ) jpayne@69: length = len(sub) jpayne@69: subdict[length].add(sub) jpayne@69: for start in range(len(self) - 1): jpayne@69: for length, subs in subdict.items(): jpayne@69: stop = start + length jpayne@69: for sub in subs: jpayne@69: if self._data[start:stop] == sub: jpayne@69: yield (start, sub.decode()) jpayne@69: break jpayne@69: jpayne@69: def startswith(self, prefix, start=None, end=None): jpayne@69: """Return True if the sequence starts with the given prefix, False otherwise. jpayne@69: jpayne@69: Return True if the sequence starts with the specified prefix jpayne@69: (a string or another Seq object), False otherwise. jpayne@69: With optional start, test sequence beginning at that position. jpayne@69: With optional end, stop comparing sequence at that position. jpayne@69: prefix can also be a tuple of strings to try. e.g. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") jpayne@69: >>> my_rna.startswith("GUC") jpayne@69: True jpayne@69: >>> my_rna.startswith("AUG") jpayne@69: False jpayne@69: >>> my_rna.startswith("AUG", 3) jpayne@69: True jpayne@69: >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1) jpayne@69: True jpayne@69: """ jpayne@69: if isinstance(prefix, tuple): jpayne@69: prefix = tuple( jpayne@69: bytes(p) if isinstance(p, _SeqAbstractBaseClass) else p.encode("ASCII") jpayne@69: for p in prefix jpayne@69: ) jpayne@69: elif isinstance(prefix, _SeqAbstractBaseClass): jpayne@69: prefix = bytes(prefix) jpayne@69: elif isinstance(prefix, str): jpayne@69: prefix = prefix.encode("ASCII") jpayne@69: return self._data.startswith(prefix, start, end) jpayne@69: jpayne@69: def endswith(self, suffix, start=None, end=None): jpayne@69: """Return True if the sequence ends with the given suffix, False otherwise. jpayne@69: jpayne@69: Return True if the sequence ends with the specified suffix jpayne@69: (a string or another Seq object), False otherwise. jpayne@69: With optional start, test sequence beginning at that position. jpayne@69: With optional end, stop comparing sequence at that position. jpayne@69: suffix can also be a tuple of strings to try. e.g. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") jpayne@69: >>> my_rna.endswith("UUG") jpayne@69: True jpayne@69: >>> my_rna.endswith("AUG") jpayne@69: False jpayne@69: >>> my_rna.endswith("AUG", 0, 18) jpayne@69: True jpayne@69: >>> my_rna.endswith(("UCC", "UCA", "UUG")) jpayne@69: True jpayne@69: """ jpayne@69: if isinstance(suffix, tuple): jpayne@69: suffix = tuple( jpayne@69: bytes(p) if isinstance(p, _SeqAbstractBaseClass) else p.encode("ASCII") jpayne@69: for p in suffix jpayne@69: ) jpayne@69: elif isinstance(suffix, _SeqAbstractBaseClass): jpayne@69: suffix = bytes(suffix) jpayne@69: elif isinstance(suffix, str): jpayne@69: suffix = suffix.encode("ASCII") jpayne@69: return self._data.endswith(suffix, start, end) jpayne@69: jpayne@69: def split(self, sep=None, maxsplit=-1): jpayne@69: """Return a list of subsequences when splitting the sequence by separator sep. jpayne@69: jpayne@69: Return a list of the subsequences in the sequence (as Seq objects), jpayne@69: using sep as the delimiter string. If maxsplit is given, at jpayne@69: most maxsplit splits are done. If maxsplit is omitted, all jpayne@69: splits are made. jpayne@69: jpayne@69: For consistency with the ``split`` method of Python strings, any jpayne@69: whitespace (tabs, spaces, newlines) is a separator if sep is None, the jpayne@69: default value jpayne@69: jpayne@69: e.g. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") jpayne@69: >>> my_aa = my_rna.translate() jpayne@69: >>> my_aa jpayne@69: Seq('VMAIVMGR*KGAR*L') jpayne@69: >>> for pep in my_aa.split("*"): jpayne@69: ... pep jpayne@69: Seq('VMAIVMGR') jpayne@69: Seq('KGAR') jpayne@69: Seq('L') jpayne@69: >>> for pep in my_aa.split("*", 1): jpayne@69: ... pep jpayne@69: Seq('VMAIVMGR') jpayne@69: Seq('KGAR*L') jpayne@69: jpayne@69: See also the rsplit method, which splits the sequence starting from the jpayne@69: end: jpayne@69: jpayne@69: >>> for pep in my_aa.rsplit("*", 1): jpayne@69: ... pep jpayne@69: Seq('VMAIVMGR*KGAR') jpayne@69: Seq('L') jpayne@69: """ jpayne@69: if isinstance(sep, _SeqAbstractBaseClass): jpayne@69: sep = bytes(sep) jpayne@69: elif isinstance(sep, str): jpayne@69: sep = sep.encode("ASCII") jpayne@69: return [Seq(part) for part in self._data.split(sep, maxsplit)] jpayne@69: jpayne@69: def rsplit(self, sep=None, maxsplit=-1): jpayne@69: """Return a list of subsequences by splitting the sequence from the right. jpayne@69: jpayne@69: Return a list of the subsequences in the sequence (as Seq objects), jpayne@69: using sep as the delimiter string. If maxsplit is given, at jpayne@69: most maxsplit splits are done. If maxsplit is omitted, all jpayne@69: splits are made. jpayne@69: jpayne@69: For consistency with the ``rsplit`` method of Python strings, any jpayne@69: whitespace (tabs, spaces, newlines) is a separator if sep is None, the jpayne@69: default value jpayne@69: jpayne@69: e.g. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") jpayne@69: >>> my_aa = my_rna.translate() jpayne@69: >>> my_aa jpayne@69: Seq('VMAIVMGR*KGAR*L') jpayne@69: >>> for pep in my_aa.rsplit("*"): jpayne@69: ... pep jpayne@69: Seq('VMAIVMGR') jpayne@69: Seq('KGAR') jpayne@69: Seq('L') jpayne@69: >>> for pep in my_aa.rsplit("*", 1): jpayne@69: ... pep jpayne@69: Seq('VMAIVMGR*KGAR') jpayne@69: Seq('L') jpayne@69: jpayne@69: See also the split method, which splits the sequence starting from the jpayne@69: beginning: jpayne@69: jpayne@69: >>> for pep in my_aa.split("*", 1): jpayne@69: ... pep jpayne@69: Seq('VMAIVMGR') jpayne@69: Seq('KGAR*L') jpayne@69: """ jpayne@69: if isinstance(sep, _SeqAbstractBaseClass): jpayne@69: sep = bytes(sep) jpayne@69: elif isinstance(sep, str): jpayne@69: sep = sep.encode("ASCII") jpayne@69: return [Seq(part) for part in self._data.rsplit(sep, maxsplit)] jpayne@69: jpayne@69: def strip(self, chars=None, inplace=False): jpayne@69: """Return a sequence object with leading and trailing ends stripped. jpayne@69: jpayne@69: With default arguments, leading and trailing whitespace is removed: jpayne@69: jpayne@69: >>> seq = Seq(" ACGT ") jpayne@69: >>> seq.strip() jpayne@69: Seq('ACGT') jpayne@69: >>> seq jpayne@69: Seq(' ACGT ') jpayne@69: jpayne@69: If ``chars`` is given and not ``None``, remove characters in ``chars`` jpayne@69: instead. The order of the characters to be removed is not important: jpayne@69: jpayne@69: >>> Seq("ACGTACGT").strip("TGCA") jpayne@69: Seq('') jpayne@69: jpayne@69: A copy of the sequence is returned if ``inplace`` is ``False`` (the jpayne@69: default value). If ``inplace`` is ``True``, the sequence is stripped jpayne@69: in-place and returned. jpayne@69: jpayne@69: >>> seq = MutableSeq(" ACGT ") jpayne@69: >>> seq.strip() jpayne@69: MutableSeq('ACGT') jpayne@69: >>> seq jpayne@69: MutableSeq(' ACGT ') jpayne@69: >>> seq.strip(inplace=True) jpayne@69: MutableSeq('ACGT') jpayne@69: >>> seq jpayne@69: MutableSeq('ACGT') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip`` jpayne@69: is called on a ``Seq`` object with ``inplace=True``. jpayne@69: jpayne@69: See also the lstrip and rstrip methods. jpayne@69: """ jpayne@69: if isinstance(chars, _SeqAbstractBaseClass): jpayne@69: chars = bytes(chars) jpayne@69: elif isinstance(chars, str): jpayne@69: chars = chars.encode("ASCII") jpayne@69: try: jpayne@69: data = self._data.strip(chars) jpayne@69: except TypeError: jpayne@69: raise TypeError( jpayne@69: "argument must be None or a string, Seq, MutableSeq, or bytes-like object" jpayne@69: ) from None jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: else: jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def lstrip(self, chars=None, inplace=False): jpayne@69: """Return a sequence object with leading and trailing ends stripped. jpayne@69: jpayne@69: With default arguments, leading whitespace is removed: jpayne@69: jpayne@69: >>> seq = Seq(" ACGT ") jpayne@69: >>> seq.lstrip() jpayne@69: Seq('ACGT ') jpayne@69: >>> seq jpayne@69: Seq(' ACGT ') jpayne@69: jpayne@69: If ``chars`` is given and not ``None``, remove characters in ``chars`` jpayne@69: from the leading end instead. The order of the characters to be removed jpayne@69: is not important: jpayne@69: jpayne@69: >>> Seq("ACGACGTTACG").lstrip("GCA") jpayne@69: Seq('TTACG') jpayne@69: jpayne@69: A copy of the sequence is returned if ``inplace`` is ``False`` (the jpayne@69: default value). If ``inplace`` is ``True``, the sequence is stripped jpayne@69: in-place and returned. jpayne@69: jpayne@69: >>> seq = MutableSeq(" ACGT ") jpayne@69: >>> seq.lstrip() jpayne@69: MutableSeq('ACGT ') jpayne@69: >>> seq jpayne@69: MutableSeq(' ACGT ') jpayne@69: >>> seq.lstrip(inplace=True) jpayne@69: MutableSeq('ACGT ') jpayne@69: >>> seq jpayne@69: MutableSeq('ACGT ') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``lstrip`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: jpayne@69: See also the strip and rstrip methods. jpayne@69: """ jpayne@69: if isinstance(chars, _SeqAbstractBaseClass): jpayne@69: chars = bytes(chars) jpayne@69: elif isinstance(chars, str): jpayne@69: chars = chars.encode("ASCII") jpayne@69: try: jpayne@69: data = self._data.lstrip(chars) jpayne@69: except TypeError: jpayne@69: raise TypeError( jpayne@69: "argument must be None or a string, Seq, MutableSeq, or bytes-like object" jpayne@69: ) from None jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: else: jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def rstrip(self, chars=None, inplace=False): jpayne@69: """Return a sequence object with trailing ends stripped. jpayne@69: jpayne@69: With default arguments, trailing whitespace is removed: jpayne@69: jpayne@69: >>> seq = Seq(" ACGT ") jpayne@69: >>> seq.rstrip() jpayne@69: Seq(' ACGT') jpayne@69: >>> seq jpayne@69: Seq(' ACGT ') jpayne@69: jpayne@69: If ``chars`` is given and not ``None``, remove characters in ``chars`` jpayne@69: from the trailing end instead. The order of the characters to be jpayne@69: removed is not important: jpayne@69: jpayne@69: >>> Seq("ACGACGTTACG").rstrip("GCA") jpayne@69: Seq('ACGACGTT') jpayne@69: jpayne@69: A copy of the sequence is returned if ``inplace`` is ``False`` (the jpayne@69: default value). If ``inplace`` is ``True``, the sequence is stripped jpayne@69: in-place and returned. jpayne@69: jpayne@69: >>> seq = MutableSeq(" ACGT ") jpayne@69: >>> seq.rstrip() jpayne@69: MutableSeq(' ACGT') jpayne@69: >>> seq jpayne@69: MutableSeq(' ACGT ') jpayne@69: >>> seq.rstrip(inplace=True) jpayne@69: MutableSeq(' ACGT') jpayne@69: >>> seq jpayne@69: MutableSeq(' ACGT') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``rstrip`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: jpayne@69: See also the strip and lstrip methods. jpayne@69: """ jpayne@69: if isinstance(chars, _SeqAbstractBaseClass): jpayne@69: chars = bytes(chars) jpayne@69: elif isinstance(chars, str): jpayne@69: chars = chars.encode("ASCII") jpayne@69: try: jpayne@69: data = self._data.rstrip(chars) jpayne@69: except TypeError: jpayne@69: raise TypeError( jpayne@69: "argument must be None or a string, Seq, MutableSeq, or bytes-like object" jpayne@69: ) from None jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: else: jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def removeprefix(self, prefix, inplace=False): jpayne@69: """Return a new Seq object with prefix (left) removed. jpayne@69: jpayne@69: This behaves like the python string method of the same name. jpayne@69: jpayne@69: e.g. Removing a start Codon: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_seq = Seq("ATGGTGTGTGT") jpayne@69: >>> my_seq jpayne@69: Seq('ATGGTGTGTGT') jpayne@69: >>> my_seq.removeprefix('ATG') jpayne@69: Seq('GTGTGTGT') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: jpayne@69: See also the removesuffix method. jpayne@69: """ jpayne@69: if isinstance(prefix, _SeqAbstractBaseClass): jpayne@69: prefix = bytes(prefix) jpayne@69: elif isinstance(prefix, str): jpayne@69: prefix = prefix.encode("ASCII") jpayne@69: try: jpayne@69: data = self._data.removeprefix(prefix) jpayne@69: except TypeError: jpayne@69: raise TypeError( jpayne@69: "argument must be a string, Seq, MutableSeq, or bytes-like object" jpayne@69: ) from None jpayne@69: except AttributeError: jpayne@69: # Fall back for pre-Python 3.9 jpayne@69: data = self._data jpayne@69: if data.startswith(prefix): jpayne@69: data = data[len(prefix) :] jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: else: jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def removesuffix(self, suffix, inplace=False): jpayne@69: """Return a new Seq object with suffix (right) removed. jpayne@69: jpayne@69: This behaves like the python string method of the same name. jpayne@69: jpayne@69: e.g. Removing a stop codon: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_seq = Seq("GTGTGTGTTAG") jpayne@69: >>> my_seq jpayne@69: Seq('GTGTGTGTTAG') jpayne@69: >>> stop_codon = Seq("TAG") jpayne@69: >>> my_seq.removesuffix(stop_codon) jpayne@69: Seq('GTGTGTGT') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: jpayne@69: See also the removeprefix method. jpayne@69: """ jpayne@69: if isinstance(suffix, _SeqAbstractBaseClass): jpayne@69: suffix = bytes(suffix) jpayne@69: elif isinstance(suffix, str): jpayne@69: suffix = suffix.encode("ASCII") jpayne@69: try: jpayne@69: data = self._data.removesuffix(suffix) jpayne@69: except TypeError: jpayne@69: raise TypeError( jpayne@69: "argument must be a string, Seq, MutableSeq, or bytes-like object" jpayne@69: ) from None jpayne@69: except AttributeError: jpayne@69: # Fall back for pre-Python 3.9 jpayne@69: data = self._data jpayne@69: if data.endswith(suffix): jpayne@69: data = data[: -len(suffix)] jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: else: jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def upper(self, inplace=False): jpayne@69: """Return the sequence in upper case. jpayne@69: jpayne@69: An upper-case copy of the sequence is returned if inplace is False, jpayne@69: the default value: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq, MutableSeq jpayne@69: >>> my_seq = Seq("VHLTPeeK*") jpayne@69: >>> my_seq jpayne@69: Seq('VHLTPeeK*') jpayne@69: >>> my_seq.lower() jpayne@69: Seq('vhltpeek*') jpayne@69: >>> my_seq.upper() jpayne@69: Seq('VHLTPEEK*') jpayne@69: >>> my_seq jpayne@69: Seq('VHLTPeeK*') jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> my_seq = MutableSeq("VHLTPeeK*") jpayne@69: >>> my_seq jpayne@69: MutableSeq('VHLTPeeK*') jpayne@69: >>> my_seq.lower() jpayne@69: MutableSeq('vhltpeek*') jpayne@69: >>> my_seq.upper() jpayne@69: MutableSeq('VHLTPEEK*') jpayne@69: >>> my_seq jpayne@69: MutableSeq('VHLTPeeK*') jpayne@69: jpayne@69: >>> my_seq.lower(inplace=True) jpayne@69: MutableSeq('vhltpeek*') jpayne@69: >>> my_seq jpayne@69: MutableSeq('vhltpeek*') jpayne@69: >>> my_seq.upper(inplace=True) jpayne@69: MutableSeq('VHLTPEEK*') jpayne@69: >>> my_seq jpayne@69: MutableSeq('VHLTPEEK*') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``upper`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: jpayne@69: See also the ``lower`` method. jpayne@69: """ jpayne@69: data = self._data.upper() jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: else: jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def lower(self, inplace=False): jpayne@69: """Return the sequence in lower case. jpayne@69: jpayne@69: An lower-case copy of the sequence is returned if inplace is False, jpayne@69: the default value: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq, MutableSeq jpayne@69: >>> my_seq = Seq("VHLTPeeK*") jpayne@69: >>> my_seq jpayne@69: Seq('VHLTPeeK*') jpayne@69: >>> my_seq.lower() jpayne@69: Seq('vhltpeek*') jpayne@69: >>> my_seq.upper() jpayne@69: Seq('VHLTPEEK*') jpayne@69: >>> my_seq jpayne@69: Seq('VHLTPeeK*') jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> my_seq = MutableSeq("VHLTPeeK*") jpayne@69: >>> my_seq jpayne@69: MutableSeq('VHLTPeeK*') jpayne@69: >>> my_seq.lower() jpayne@69: MutableSeq('vhltpeek*') jpayne@69: >>> my_seq.upper() jpayne@69: MutableSeq('VHLTPEEK*') jpayne@69: >>> my_seq jpayne@69: MutableSeq('VHLTPeeK*') jpayne@69: jpayne@69: >>> my_seq.lower(inplace=True) jpayne@69: MutableSeq('vhltpeek*') jpayne@69: >>> my_seq jpayne@69: MutableSeq('vhltpeek*') jpayne@69: >>> my_seq.upper(inplace=True) jpayne@69: MutableSeq('VHLTPEEK*') jpayne@69: >>> my_seq jpayne@69: MutableSeq('VHLTPEEK*') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``lower`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: jpayne@69: See also the ``upper`` method. jpayne@69: """ jpayne@69: data = self._data.lower() jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: else: jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def isupper(self): jpayne@69: """Return True if all ASCII characters in data are uppercase. jpayne@69: jpayne@69: If there are no cased characters, the method returns False. jpayne@69: """ jpayne@69: return self._data.isupper() jpayne@69: jpayne@69: def islower(self): jpayne@69: """Return True if all ASCII characters in data are lowercase. jpayne@69: jpayne@69: If there are no cased characters, the method returns False. jpayne@69: """ jpayne@69: return self._data.islower() jpayne@69: jpayne@69: def translate( jpayne@69: self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-" jpayne@69: ): jpayne@69: """Turn a nucleotide sequence into a protein sequence by creating a new sequence object. jpayne@69: jpayne@69: This method will translate DNA or RNA sequences. It should not jpayne@69: be used on protein sequences as any result will be biologically jpayne@69: meaningless. jpayne@69: jpayne@69: Arguments: jpayne@69: - table - Which codon table to use? This can be either a name jpayne@69: (string), an NCBI identifier (integer), or a CodonTable jpayne@69: object (useful for non-standard genetic codes). This jpayne@69: defaults to the "Standard" table. jpayne@69: - stop_symbol - Single character string, what to use for jpayne@69: terminators. This defaults to the asterisk, "*". jpayne@69: - to_stop - Boolean, defaults to False meaning do a full jpayne@69: translation continuing on past any stop codons (translated as the jpayne@69: specified stop_symbol). If True, translation is terminated at jpayne@69: the first in frame stop codon (and the stop_symbol is not jpayne@69: appended to the returned protein sequence). jpayne@69: - cds - Boolean, indicates this is a complete CDS. If True, jpayne@69: this checks the sequence starts with a valid alternative start jpayne@69: codon (which will be translated as methionine, M), that the jpayne@69: sequence length is a multiple of three, and that there is a jpayne@69: single in frame stop codon at the end (this will be excluded jpayne@69: from the protein sequence, regardless of the to_stop option). jpayne@69: If these tests fail, an exception is raised. jpayne@69: - gap - Single character string to denote symbol used for gaps. jpayne@69: Defaults to the minus sign. jpayne@69: jpayne@69: A ``Seq`` object is returned if ``translate`` is called on a ``Seq`` jpayne@69: object; a ``MutableSeq`` object is returned if ``translate`` is called jpayne@69: pn a ``MutableSeq`` object. jpayne@69: jpayne@69: e.g. Using the standard table: jpayne@69: jpayne@69: >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") jpayne@69: >>> coding_dna.translate() jpayne@69: Seq('VAIVMGR*KGAR*') jpayne@69: >>> coding_dna.translate(stop_symbol="@") jpayne@69: Seq('VAIVMGR@KGAR@') jpayne@69: >>> coding_dna.translate(to_stop=True) jpayne@69: Seq('VAIVMGR') jpayne@69: jpayne@69: Now using NCBI table 2, where TGA is not a stop codon: jpayne@69: jpayne@69: >>> coding_dna.translate(table=2) jpayne@69: Seq('VAIVMGRWKGAR*') jpayne@69: >>> coding_dna.translate(table=2, to_stop=True) jpayne@69: Seq('VAIVMGRWKGAR') jpayne@69: jpayne@69: In fact, GTG is an alternative start codon under NCBI table 2, meaning jpayne@69: this sequence could be a complete CDS: jpayne@69: jpayne@69: >>> coding_dna.translate(table=2, cds=True) jpayne@69: Seq('MAIVMGRWKGAR') jpayne@69: jpayne@69: It isn't a valid CDS under NCBI table 1, due to both the start codon jpayne@69: and also the in frame stop codons: jpayne@69: jpayne@69: >>> coding_dna.translate(table=1, cds=True) jpayne@69: Traceback (most recent call last): jpayne@69: ... jpayne@69: Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon jpayne@69: jpayne@69: If the sequence has no in-frame stop codon, then the to_stop argument jpayne@69: has no effect: jpayne@69: jpayne@69: >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") jpayne@69: >>> coding_dna2.translate() jpayne@69: Seq('LAIVMGR') jpayne@69: >>> coding_dna2.translate(to_stop=True) jpayne@69: Seq('LAIVMGR') jpayne@69: jpayne@69: NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid jpayne@69: or a stop codon. These are translated as "X". Any invalid codon jpayne@69: (e.g. "TA?" or "T-A") will throw a TranslationError. jpayne@69: jpayne@69: NOTE - This does NOT behave like the python string's translate jpayne@69: method. For that use str(my_seq).translate(...) instead jpayne@69: """ jpayne@69: try: jpayne@69: data = str(self) jpayne@69: except UndefinedSequenceError: jpayne@69: # translating an undefined sequence yields an undefined jpayne@69: # sequence with the length divided by 3 jpayne@69: n = len(self) jpayne@69: if n % 3 != 0: jpayne@69: warnings.warn( jpayne@69: "Partial codon, len(sequence) not a multiple of three. " jpayne@69: "This may become an error in future.", jpayne@69: BiopythonWarning, jpayne@69: ) jpayne@69: return Seq(None, n // 3) jpayne@69: jpayne@69: return self.__class__( jpayne@69: _translate_str(str(self), table, stop_symbol, to_stop, cds, gap=gap) jpayne@69: ) jpayne@69: jpayne@69: def complement(self, inplace=False): jpayne@69: """Return the complement as a DNA sequence. jpayne@69: jpayne@69: >>> Seq("CGA").complement() jpayne@69: Seq('GCT') jpayne@69: jpayne@69: Any U in the sequence is treated as a T: jpayne@69: jpayne@69: >>> Seq("CGAUT").complement() jpayne@69: Seq('GCTAA') jpayne@69: jpayne@69: In contrast, ``complement_rna`` returns an RNA sequence: jpayne@69: jpayne@69: >>> Seq("CGAUT").complement_rna() jpayne@69: Seq('GCUAA') jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: >>> my_seq.complement() jpayne@69: MutableSeq('GCT') jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: jpayne@69: >>> my_seq.complement(inplace=True) jpayne@69: MutableSeq('GCT') jpayne@69: >>> my_seq jpayne@69: MutableSeq('GCT') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: """ jpayne@69: ttable = _dna_complement_table jpayne@69: try: jpayne@69: data = self._data.translate(ttable) jpayne@69: except UndefinedSequenceError: jpayne@69: # complement of an undefined sequence is an undefined sequence jpayne@69: # of the same length jpayne@69: return self jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def complement_rna(self, inplace=False): jpayne@69: """Return the complement as an RNA sequence. jpayne@69: jpayne@69: >>> Seq("CGA").complement_rna() jpayne@69: Seq('GCU') jpayne@69: jpayne@69: Any T in the sequence is treated as a U: jpayne@69: jpayne@69: >>> Seq("CGAUT").complement_rna() jpayne@69: Seq('GCUAA') jpayne@69: jpayne@69: In contrast, ``complement`` returns a DNA sequence by default: jpayne@69: jpayne@69: >>> Seq("CGA").complement() jpayne@69: Seq('GCT') jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: >>> my_seq.complement_rna() jpayne@69: MutableSeq('GCU') jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: jpayne@69: >>> my_seq.complement_rna(inplace=True) jpayne@69: MutableSeq('GCU') jpayne@69: >>> my_seq jpayne@69: MutableSeq('GCU') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: """ jpayne@69: try: jpayne@69: data = self._data.translate(_rna_complement_table) jpayne@69: except UndefinedSequenceError: jpayne@69: # complement of an undefined sequence is an undefined sequence jpayne@69: # of the same length jpayne@69: return self jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def reverse_complement(self, inplace=False): jpayne@69: """Return the reverse complement as a DNA sequence. jpayne@69: jpayne@69: >>> Seq("CGA").reverse_complement() jpayne@69: Seq('TCG') jpayne@69: jpayne@69: Any U in the sequence is treated as a T: jpayne@69: jpayne@69: >>> Seq("CGAUT").reverse_complement() jpayne@69: Seq('AATCG') jpayne@69: jpayne@69: In contrast, ``reverse_complement_rna`` returns an RNA sequence: jpayne@69: jpayne@69: >>> Seq("CGA").reverse_complement_rna() jpayne@69: Seq('UCG') jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: >>> my_seq.reverse_complement() jpayne@69: MutableSeq('TCG') jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: jpayne@69: >>> my_seq.reverse_complement(inplace=True) jpayne@69: MutableSeq('TCG') jpayne@69: >>> my_seq jpayne@69: MutableSeq('TCG') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``reverse_complement`` is called on a ``Seq`` object with jpayne@69: ``inplace=True``. jpayne@69: """ jpayne@69: try: jpayne@69: data = self._data.translate(_dna_complement_table) jpayne@69: except UndefinedSequenceError: jpayne@69: # reverse complement of an undefined sequence is an undefined sequence jpayne@69: # of the same length jpayne@69: return self jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[::-1] = data jpayne@69: return self jpayne@69: return self.__class__(data[::-1]) jpayne@69: jpayne@69: def reverse_complement_rna(self, inplace=False): jpayne@69: """Return the reverse complement as an RNA sequence. jpayne@69: jpayne@69: >>> Seq("CGA").reverse_complement_rna() jpayne@69: Seq('UCG') jpayne@69: jpayne@69: Any T in the sequence is treated as a U: jpayne@69: jpayne@69: >>> Seq("CGAUT").reverse_complement_rna() jpayne@69: Seq('AAUCG') jpayne@69: jpayne@69: In contrast, ``reverse_complement`` returns a DNA sequence: jpayne@69: jpayne@69: >>> Seq("CGA").reverse_complement() jpayne@69: Seq('TCG') jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: >>> my_seq.reverse_complement_rna() jpayne@69: MutableSeq('UCG') jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: jpayne@69: >>> my_seq.reverse_complement_rna(inplace=True) jpayne@69: MutableSeq('UCG') jpayne@69: >>> my_seq jpayne@69: MutableSeq('UCG') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``reverse_complement_rna`` is called on a ``Seq`` object with jpayne@69: ``inplace=True``. jpayne@69: """ jpayne@69: try: jpayne@69: data = self._data.translate(_rna_complement_table) jpayne@69: except UndefinedSequenceError: jpayne@69: # reverse complement of an undefined sequence is an undefined sequence jpayne@69: # of the same length jpayne@69: return self jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[::-1] = data jpayne@69: return self jpayne@69: return self.__class__(data[::-1]) jpayne@69: jpayne@69: def transcribe(self, inplace=False): jpayne@69: """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object. jpayne@69: jpayne@69: Following the usual convention, the sequence is interpreted as the jpayne@69: coding strand of the DNA double helix, not the template strand. This jpayne@69: means we can get the RNA sequence just by switching T to U. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") jpayne@69: >>> coding_dna jpayne@69: Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG') jpayne@69: >>> coding_dna.transcribe() jpayne@69: Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG') jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") jpayne@69: >>> sequence jpayne@69: MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG') jpayne@69: >>> sequence.transcribe() jpayne@69: MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG') jpayne@69: >>> sequence jpayne@69: MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG') jpayne@69: jpayne@69: >>> sequence.transcribe(inplace=True) jpayne@69: MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG') jpayne@69: >>> sequence jpayne@69: MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``transcribe`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: jpayne@69: Trying to transcribe an RNA sequence has no effect. jpayne@69: If you have a nucleotide sequence which might be DNA or RNA jpayne@69: (or even a mixture), calling the transcribe method will ensure jpayne@69: any T becomes U. jpayne@69: jpayne@69: Trying to transcribe a protein sequence will replace any jpayne@69: T for Threonine with U for Selenocysteine, which has no jpayne@69: biologically plausible rational. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_protein = Seq("MAIVMGRT") jpayne@69: >>> my_protein.transcribe() jpayne@69: Seq('MAIVMGRU') jpayne@69: """ jpayne@69: data = self._data.replace(b"T", b"U").replace(b"t", b"u") jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def back_transcribe(self, inplace=False): jpayne@69: """Return the DNA sequence from an RNA sequence by creating a new Seq object. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG") jpayne@69: >>> messenger_rna jpayne@69: Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG') jpayne@69: >>> messenger_rna.back_transcribe() jpayne@69: Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG') jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG") jpayne@69: >>> sequence jpayne@69: MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG') jpayne@69: >>> sequence.back_transcribe() jpayne@69: MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG') jpayne@69: >>> sequence jpayne@69: MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG') jpayne@69: jpayne@69: >>> sequence.back_transcribe(inplace=True) jpayne@69: MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG') jpayne@69: >>> sequence jpayne@69: MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``transcribe`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: jpayne@69: Trying to back-transcribe DNA has no effect, If you have a nucleotide jpayne@69: sequence which might be DNA or RNA (or even a mixture), calling the jpayne@69: back-transcribe method will ensure any U becomes T. jpayne@69: jpayne@69: Trying to back-transcribe a protein sequence will replace any U for jpayne@69: Selenocysteine with T for Threonine, which is biologically meaningless. jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_protein = Seq("MAIVMGRU") jpayne@69: >>> my_protein.back_transcribe() jpayne@69: Seq('MAIVMGRT') jpayne@69: """ jpayne@69: data = self._data.replace(b"U", b"T").replace(b"u", b"t") jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: def join(self, other): jpayne@69: """Return a merge of the sequences in other, spaced by the sequence from self. jpayne@69: jpayne@69: Accepts a Seq object, MutableSeq object, or string (and iterates over jpayne@69: the letters), or an iterable containing Seq, MutableSeq, or string jpayne@69: objects. These arguments will be concatenated with the calling sequence jpayne@69: as the spacer: jpayne@69: jpayne@69: >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")]) jpayne@69: >>> concatenated jpayne@69: Seq('AAANNNNNTTTNNNNNPPP') jpayne@69: jpayne@69: Joining the letters of a single sequence: jpayne@69: jpayne@69: >>> Seq('NNNNN').join(Seq("ACGT")) jpayne@69: Seq('ANNNNNCNNNNNGNNNNNT') jpayne@69: >>> Seq('NNNNN').join("ACGT") jpayne@69: Seq('ANNNNNCNNNNNGNNNNNT') jpayne@69: """ jpayne@69: if isinstance(other, _SeqAbstractBaseClass): jpayne@69: return self.__class__(str(self).join(str(other))) jpayne@69: elif isinstance(other, str): jpayne@69: return self.__class__(str(self).join(other)) jpayne@69: jpayne@69: from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports jpayne@69: jpayne@69: if isinstance(other, SeqRecord): jpayne@69: raise TypeError("Iterable cannot be a SeqRecord") jpayne@69: jpayne@69: for c in other: jpayne@69: if isinstance(c, SeqRecord): jpayne@69: raise TypeError("Iterable cannot contain SeqRecords") jpayne@69: elif not isinstance(c, (str, _SeqAbstractBaseClass)): jpayne@69: raise TypeError( jpayne@69: "Input must be an iterable of Seq objects, MutableSeq objects, or strings" jpayne@69: ) jpayne@69: return self.__class__(str(self).join([str(_) for _ in other])) jpayne@69: jpayne@69: def replace(self, old, new, inplace=False): jpayne@69: """Return a copy with all occurrences of subsequence old replaced by new. jpayne@69: jpayne@69: >>> s = Seq("ACGTAACCGGTT") jpayne@69: >>> t = s.replace("AC", "XYZ") jpayne@69: >>> s jpayne@69: Seq('ACGTAACCGGTT') jpayne@69: >>> t jpayne@69: Seq('XYZGTAXYZCGGTT') jpayne@69: jpayne@69: For mutable sequences, passing inplace=True will modify the sequence in place: jpayne@69: jpayne@69: >>> m = MutableSeq("ACGTAACCGGTT") jpayne@69: >>> t = m.replace("AC", "XYZ") jpayne@69: >>> m jpayne@69: MutableSeq('ACGTAACCGGTT') jpayne@69: >>> t jpayne@69: MutableSeq('XYZGTAXYZCGGTT') jpayne@69: jpayne@69: >>> m = MutableSeq("ACGTAACCGGTT") jpayne@69: >>> t = m.replace("AC", "XYZ", inplace=True) jpayne@69: >>> m jpayne@69: MutableSeq('XYZGTAXYZCGGTT') jpayne@69: >>> t jpayne@69: MutableSeq('XYZGTAXYZCGGTT') jpayne@69: jpayne@69: As ``Seq`` objects are immutable, a ``TypeError`` is raised if jpayne@69: ``replace`` is called on a ``Seq`` object with ``inplace=True``. jpayne@69: """ jpayne@69: if isinstance(old, _SeqAbstractBaseClass): jpayne@69: old = bytes(old) jpayne@69: elif isinstance(old, str): jpayne@69: old = old.encode("ASCII") jpayne@69: if isinstance(new, _SeqAbstractBaseClass): jpayne@69: new = bytes(new) jpayne@69: elif isinstance(new, str): jpayne@69: new = new.encode("ASCII") jpayne@69: data = self._data.replace(old, new) jpayne@69: if inplace: jpayne@69: if not isinstance(self._data, bytearray): jpayne@69: raise TypeError("Sequence is immutable") jpayne@69: self._data[:] = data jpayne@69: return self jpayne@69: return self.__class__(data) jpayne@69: jpayne@69: @property jpayne@69: def defined(self): jpayne@69: """Return True if the sequence is defined, False if undefined or partially defined. jpayne@69: jpayne@69: Zero-length sequences are always considered to be defined. jpayne@69: """ jpayne@69: if isinstance(self._data, (bytes, bytearray)): jpayne@69: return True jpayne@69: else: jpayne@69: return self._data.defined jpayne@69: jpayne@69: @property jpayne@69: def defined_ranges(self): jpayne@69: """Return a tuple of the ranges where the sequence contents is defined. jpayne@69: jpayne@69: The return value has the format ((start1, end1), (start2, end2), ...). jpayne@69: """ jpayne@69: if isinstance(self._data, (bytes, bytearray)): jpayne@69: length = len(self) jpayne@69: if length > 0: jpayne@69: return ((0, length),) jpayne@69: else: jpayne@69: return () jpayne@69: else: jpayne@69: return self._data.defined_ranges jpayne@69: jpayne@69: jpayne@69: class Seq(_SeqAbstractBaseClass): jpayne@69: """Read-only sequence object (essentially a string with biological methods). jpayne@69: jpayne@69: Like normal python strings, our basic sequence object is immutable. jpayne@69: This prevents you from doing my_seq[5] = "A" for example, but does allow jpayne@69: Seq objects to be used as dictionary keys. jpayne@69: jpayne@69: The Seq object provides a number of string like methods (such as count, jpayne@69: find, split and strip). jpayne@69: jpayne@69: The Seq object also provides some biological methods, such as complement, jpayne@69: reverse_complement, transcribe, back_transcribe and translate (which are jpayne@69: not applicable to protein sequences). jpayne@69: """ jpayne@69: jpayne@69: _data: Union[bytes, SequenceDataAbstractBaseClass] jpayne@69: jpayne@69: def __init__( jpayne@69: self, jpayne@69: data: Union[ jpayne@69: str, jpayne@69: bytes, jpayne@69: bytearray, jpayne@69: _SeqAbstractBaseClass, jpayne@69: SequenceDataAbstractBaseClass, jpayne@69: dict, jpayne@69: None, jpayne@69: ], jpayne@69: length: Optional[int] = None, jpayne@69: ): jpayne@69: """Create a Seq object. jpayne@69: jpayne@69: Arguments: jpayne@69: - data - Sequence, required (string) jpayne@69: - length - Sequence length, used only if data is None or a dictionary (integer) jpayne@69: jpayne@69: You will typically use Bio.SeqIO to read in sequences from files as jpayne@69: SeqRecord objects, whose sequence will be exposed as a Seq object via jpayne@69: the seq property. jpayne@69: jpayne@69: However, you can also create a Seq object directly: jpayne@69: jpayne@69: >>> from Bio.Seq import Seq jpayne@69: >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF") jpayne@69: >>> my_seq jpayne@69: Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF') jpayne@69: >>> print(my_seq) jpayne@69: MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF jpayne@69: jpayne@69: To create a Seq object with for a sequence of known length but jpayne@69: unknown sequence contents, use None for the data argument and pass jpayne@69: the sequence length for the length argument. Trying to access the jpayne@69: sequence contents of a Seq object created in this way will raise jpayne@69: an UndefinedSequenceError: jpayne@69: jpayne@69: >>> my_undefined_sequence = Seq(None, 20) jpayne@69: >>> my_undefined_sequence jpayne@69: Seq(None, length=20) jpayne@69: >>> len(my_undefined_sequence) jpayne@69: 20 jpayne@69: >>> print(my_undefined_sequence) jpayne@69: Traceback (most recent call last): jpayne@69: ... jpayne@69: Bio.Seq.UndefinedSequenceError: Sequence content is undefined jpayne@69: jpayne@69: If the sequence contents is known for parts of the sequence only, use jpayne@69: a dictionary for the data argument to pass the known sequence segments: jpayne@69: jpayne@69: >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10) jpayne@69: >>> my_partially_defined_sequence jpayne@69: Seq({3: 'ACGT'}, length=10) jpayne@69: >>> len(my_partially_defined_sequence) jpayne@69: 10 jpayne@69: >>> print(my_partially_defined_sequence) jpayne@69: Traceback (most recent call last): jpayne@69: ... jpayne@69: Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined jpayne@69: >>> my_partially_defined_sequence[3:7] jpayne@69: Seq('ACGT') jpayne@69: >>> print(my_partially_defined_sequence[3:7]) jpayne@69: ACGT jpayne@69: """ jpayne@69: if data is None: jpayne@69: if length is None: jpayne@69: raise ValueError("length must not be None if data is None") jpayne@69: elif length == 0: jpayne@69: self._data = b"" jpayne@69: elif length < 0: jpayne@69: raise ValueError("length must not be negative.") jpayne@69: else: jpayne@69: self._data = _UndefinedSequenceData(length) jpayne@69: elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)): jpayne@69: self._data = data jpayne@69: elif isinstance(data, (bytearray, _SeqAbstractBaseClass)): jpayne@69: self._data = bytes(data) jpayne@69: elif isinstance(data, str): jpayne@69: self._data = bytes(data, encoding="ASCII") jpayne@69: elif isinstance(data, dict): jpayne@69: if length is None: jpayne@69: raise ValueError("length must not be None if data is a dictionary") jpayne@69: elif length == 0: jpayne@69: self._data = b"" jpayne@69: elif length < 0: jpayne@69: raise ValueError("length must not be negative.") jpayne@69: else: jpayne@69: current = 0 # not needed here, but it keeps mypy happy jpayne@69: end = -1 jpayne@69: starts = sorted(data.keys()) jpayne@69: _data: Dict[int, bytes] = {} jpayne@69: for start in starts: jpayne@69: seq = data[start] jpayne@69: if isinstance(seq, str): jpayne@69: seq = bytes(seq, encoding="ASCII") jpayne@69: else: jpayne@69: try: jpayne@69: seq = bytes(seq) jpayne@69: except Exception: jpayne@69: raise ValueError("Expected bytes-like objects or strings") jpayne@69: if start < end: jpayne@69: raise ValueError("Sequence data are overlapping.") jpayne@69: elif start == end: jpayne@69: _data[current] += seq # noqa: F821 jpayne@69: else: jpayne@69: _data[start] = seq jpayne@69: current = start jpayne@69: end = start + len(seq) jpayne@69: if end > length: jpayne@69: raise ValueError( jpayne@69: "Provided sequence data extend beyond sequence length." jpayne@69: ) jpayne@69: elif end == length and current == 0: jpayne@69: # sequence is fully defined jpayne@69: self._data = _data[current] jpayne@69: else: jpayne@69: self._data = _PartiallyDefinedSequenceData(length, _data) jpayne@69: else: jpayne@69: raise TypeError( jpayne@69: "data should be a string, bytes, bytearray, Seq, or MutableSeq object" jpayne@69: ) jpayne@69: jpayne@69: def __hash__(self): jpayne@69: """Hash of the sequence as a string for comparison. jpayne@69: jpayne@69: See Seq object comparison documentation (method ``__eq__`` in jpayne@69: particular) as this has changed in Biopython 1.65. Older versions jpayne@69: would hash on object identity. jpayne@69: """ jpayne@69: return hash(self._data) jpayne@69: jpayne@69: jpayne@69: class MutableSeq(_SeqAbstractBaseClass): jpayne@69: """An editable sequence object. jpayne@69: jpayne@69: Unlike normal python strings and our basic sequence object (the Seq class) jpayne@69: which are immutable, the MutableSeq lets you edit the sequence in place. jpayne@69: However, this means you cannot use a MutableSeq object as a dictionary key. jpayne@69: jpayne@69: >>> from Bio.Seq import MutableSeq jpayne@69: >>> my_seq = MutableSeq("ACTCGTCGTCG") jpayne@69: >>> my_seq jpayne@69: MutableSeq('ACTCGTCGTCG') jpayne@69: >>> my_seq[5] jpayne@69: 'T' jpayne@69: >>> my_seq[5] = "A" jpayne@69: >>> my_seq jpayne@69: MutableSeq('ACTCGACGTCG') jpayne@69: >>> my_seq[5] jpayne@69: 'A' jpayne@69: >>> my_seq[5:8] = "NNN" jpayne@69: >>> my_seq jpayne@69: MutableSeq('ACTCGNNNTCG') jpayne@69: >>> len(my_seq) jpayne@69: 11 jpayne@69: jpayne@69: Note that the MutableSeq object does not support as many string-like jpayne@69: or biological methods as the Seq object. jpayne@69: """ jpayne@69: jpayne@69: def __init__(self, data): jpayne@69: """Create a MutableSeq object.""" jpayne@69: if isinstance(data, bytearray): jpayne@69: self._data = data jpayne@69: elif isinstance(data, bytes): jpayne@69: self._data = bytearray(data) jpayne@69: elif isinstance(data, str): jpayne@69: self._data = bytearray(data, "ASCII") jpayne@69: elif isinstance(data, MutableSeq): jpayne@69: self._data = data._data[:] # Take a copy jpayne@69: elif isinstance(data, Seq): jpayne@69: # Make no assumptions about the Seq subclass internal storage jpayne@69: self._data = bytearray(bytes(data)) jpayne@69: else: jpayne@69: raise TypeError( jpayne@69: "data should be a string, bytearray object, Seq object, or a " jpayne@69: "MutableSeq object" jpayne@69: ) jpayne@69: jpayne@69: def __setitem__(self, index, value): jpayne@69: """Set a subsequence of single letter via value parameter. jpayne@69: jpayne@69: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@69: >>> my_seq[0] = 'T' jpayne@69: >>> my_seq jpayne@69: MutableSeq('TCTCGACGTCG') jpayne@69: """ jpayne@69: if isinstance(index, numbers.Integral): jpayne@69: # Replacing a single letter with a new string jpayne@69: self._data[index] = ord(value) jpayne@69: else: jpayne@69: # Replacing a sub-sequence jpayne@69: if isinstance(value, MutableSeq): jpayne@69: self._data[index] = value._data jpayne@69: elif isinstance(value, Seq): jpayne@69: self._data[index] = bytes(value) jpayne@69: elif isinstance(value, str): jpayne@69: self._data[index] = value.encode("ASCII") jpayne@69: else: jpayne@69: raise TypeError(f"received unexpected type '{type(value).__name__}'") jpayne@69: jpayne@69: def __delitem__(self, index): jpayne@69: """Delete a subsequence of single letter. jpayne@69: jpayne@69: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@69: >>> del my_seq[0] jpayne@69: >>> my_seq jpayne@69: MutableSeq('CTCGACGTCG') jpayne@69: """ jpayne@69: # Could be deleting a single letter, or a slice jpayne@69: del self._data[index] jpayne@69: jpayne@69: def append(self, c): jpayne@69: """Add a subsequence to the mutable sequence object. jpayne@69: jpayne@69: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@69: >>> my_seq.append('A') jpayne@69: >>> my_seq jpayne@69: MutableSeq('ACTCGACGTCGA') jpayne@69: jpayne@69: No return value. jpayne@69: """ jpayne@69: self._data.append(ord(c.encode("ASCII"))) jpayne@69: jpayne@69: def insert(self, i, c): jpayne@69: """Add a subsequence to the mutable sequence object at a given index. jpayne@69: jpayne@69: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@69: >>> my_seq.insert(0,'A') jpayne@69: >>> my_seq jpayne@69: MutableSeq('AACTCGACGTCG') jpayne@69: >>> my_seq.insert(8,'G') jpayne@69: >>> my_seq jpayne@69: MutableSeq('AACTCGACGGTCG') jpayne@69: jpayne@69: No return value. jpayne@69: """ jpayne@69: self._data.insert(i, ord(c.encode("ASCII"))) jpayne@69: jpayne@69: def pop(self, i=(-1)): jpayne@69: """Remove a subsequence of a single letter at given index. jpayne@69: jpayne@69: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@69: >>> my_seq.pop() jpayne@69: 'G' jpayne@69: >>> my_seq jpayne@69: MutableSeq('ACTCGACGTC') jpayne@69: >>> my_seq.pop() jpayne@69: 'C' jpayne@69: >>> my_seq jpayne@69: MutableSeq('ACTCGACGT') jpayne@69: jpayne@69: Returns the last character of the sequence. jpayne@69: """ jpayne@69: c = self._data[i] jpayne@69: del self._data[i] jpayne@69: return chr(c) jpayne@69: jpayne@69: def remove(self, item): jpayne@69: """Remove a subsequence of a single letter from mutable sequence. jpayne@69: jpayne@69: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@69: >>> my_seq.remove('C') jpayne@69: >>> my_seq jpayne@69: MutableSeq('ATCGACGTCG') jpayne@69: >>> my_seq.remove('A') jpayne@69: >>> my_seq jpayne@69: MutableSeq('TCGACGTCG') jpayne@69: jpayne@69: No return value. jpayne@69: """ jpayne@69: codepoint = ord(item) jpayne@69: try: jpayne@69: self._data.remove(codepoint) jpayne@69: except ValueError: jpayne@69: raise ValueError("value not found in MutableSeq") from None jpayne@69: jpayne@69: def reverse(self): jpayne@69: """Modify the mutable sequence to reverse itself. jpayne@69: jpayne@69: No return value. jpayne@69: """ jpayne@69: self._data.reverse() jpayne@69: jpayne@69: def extend(self, other): jpayne@69: """Add a sequence to the original mutable sequence object. jpayne@69: jpayne@69: >>> my_seq = MutableSeq('ACTCGACGTCG') jpayne@69: >>> my_seq.extend('A') jpayne@69: >>> my_seq jpayne@69: MutableSeq('ACTCGACGTCGA') jpayne@69: >>> my_seq.extend('TTT') jpayne@69: >>> my_seq jpayne@69: MutableSeq('ACTCGACGTCGATTT') jpayne@69: jpayne@69: No return value. jpayne@69: """ jpayne@69: if isinstance(other, MutableSeq): jpayne@69: self._data.extend(other._data) jpayne@69: elif isinstance(other, Seq): jpayne@69: self._data.extend(bytes(other)) jpayne@69: elif isinstance(other, str): jpayne@69: self._data.extend(other.encode("ASCII")) jpayne@69: else: jpayne@69: raise TypeError("expected a string, Seq or MutableSeq") jpayne@69: jpayne@69: jpayne@69: class UndefinedSequenceError(ValueError): jpayne@69: """Sequence contents is undefined.""" jpayne@69: jpayne@69: jpayne@69: class _UndefinedSequenceData(SequenceDataAbstractBaseClass): jpayne@69: """Stores the length of a sequence with an undefined sequence contents (PRIVATE). jpayne@69: jpayne@69: Objects of this class can be used to create a Seq object to represent jpayne@69: sequences with a known length, but an unknown sequence contents. jpayne@69: Calling __len__ returns the sequence length, calling __getitem__ raises an jpayne@69: UndefinedSequenceError except for requests of zero size, for which it jpayne@69: returns an empty bytes object. jpayne@69: """ jpayne@69: jpayne@69: __slots__ = ("_length",) jpayne@69: jpayne@69: def __init__(self, length): jpayne@69: """Initialize the object with the sequence length. jpayne@69: jpayne@69: The calling function is responsible for ensuring that the length is jpayne@69: greater than zero. jpayne@69: """ jpayne@69: self._length = length jpayne@69: super().__init__() jpayne@69: jpayne@69: def __getitem__(self, key: slice) -> Union[bytes, "_UndefinedSequenceData"]: jpayne@69: if isinstance(key, slice): jpayne@69: start, end, step = key.indices(self._length) jpayne@69: size = len(range(start, end, step)) jpayne@69: if size == 0: jpayne@69: return b"" jpayne@69: return _UndefinedSequenceData(size) jpayne@69: else: jpayne@69: raise UndefinedSequenceError("Sequence content is undefined") jpayne@69: jpayne@69: def __len__(self): jpayne@69: return self._length jpayne@69: jpayne@69: def __bytes__(self): jpayne@69: raise UndefinedSequenceError("Sequence content is undefined") jpayne@69: jpayne@69: def __add__(self, other): jpayne@69: length = len(self) + len(other) jpayne@69: try: jpayne@69: other = bytes(other) jpayne@69: except UndefinedSequenceError: jpayne@69: if isinstance(other, _UndefinedSequenceData): jpayne@69: return _UndefinedSequenceData(length) jpayne@69: else: jpayne@69: return NotImplemented jpayne@69: # _PartiallyDefinedSequenceData.__radd__ will handle this jpayne@69: else: jpayne@69: data = {len(self): other} jpayne@69: return _PartiallyDefinedSequenceData(length, data) jpayne@69: jpayne@69: def __radd__(self, other): jpayne@69: data = {0: bytes(other)} jpayne@69: length = len(other) + len(self) jpayne@69: return _PartiallyDefinedSequenceData(length, data) jpayne@69: jpayne@69: def upper(self): jpayne@69: """Return an upper case copy of the sequence.""" jpayne@69: # An upper case copy of an undefined sequence is an undefined jpayne@69: # sequence of the same length jpayne@69: return _UndefinedSequenceData(self._length) jpayne@69: jpayne@69: def lower(self): jpayne@69: """Return a lower case copy of the sequence.""" jpayne@69: # A lower case copy of an undefined sequence is an undefined jpayne@69: # sequence of the same length jpayne@69: return _UndefinedSequenceData(self._length) jpayne@69: jpayne@69: def isupper(self): jpayne@69: """Return True if all ASCII characters in data are uppercase. jpayne@69: jpayne@69: If there are no cased characters, the method returns False. jpayne@69: """ jpayne@69: # Character case is irrelevant for an undefined sequence jpayne@69: raise UndefinedSequenceError("Sequence content is undefined") jpayne@69: jpayne@69: def islower(self): jpayne@69: """Return True if all ASCII characters in data are lowercase. jpayne@69: jpayne@69: If there are no cased characters, the method returns False. jpayne@69: """ jpayne@69: # Character case is irrelevant for an undefined sequence jpayne@69: raise UndefinedSequenceError("Sequence content is undefined") jpayne@69: jpayne@69: def replace(self, old, new): jpayne@69: """Return a copy with all occurrences of substring old replaced by new.""" jpayne@69: # Replacing substring old by new in an undefined sequence will result jpayne@69: # in an undefined sequence of the same length, if old and new have the jpayne@69: # number of characters. jpayne@69: if len(old) != len(new): jpayne@69: raise UndefinedSequenceError("Sequence content is undefined") jpayne@69: return _UndefinedSequenceData(self._length) jpayne@69: jpayne@69: @property jpayne@69: def defined(self): jpayne@69: """Return False, as the sequence is not defined and has a non-zero length.""" jpayne@69: return False jpayne@69: jpayne@69: @property jpayne@69: def defined_ranges(self): jpayne@69: """Return a tuple of the ranges where the sequence contents is defined. jpayne@69: jpayne@69: As the sequence contents of an _UndefinedSequenceData object is fully jpayne@69: undefined, the return value is always an empty tuple. jpayne@69: """ jpayne@69: return () jpayne@69: jpayne@69: jpayne@69: class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass): jpayne@69: """Stores the length of a sequence with an undefined sequence contents (PRIVATE). jpayne@69: jpayne@69: Objects of this class can be used to create a Seq object to represent jpayne@69: sequences with a known length, but with a sequence contents that is only jpayne@69: partially known. jpayne@69: Calling __len__ returns the sequence length, calling __getitem__ returns jpayne@69: the sequence contents if known, otherwise an UndefinedSequenceError is jpayne@69: raised. jpayne@69: """ jpayne@69: jpayne@69: __slots__ = ("_length", "_data") jpayne@69: jpayne@69: def __init__(self, length, data): jpayne@69: """Initialize with the sequence length and defined sequence segments. jpayne@69: jpayne@69: The calling function is responsible for ensuring that the length is jpayne@69: greater than zero. jpayne@69: """ jpayne@69: self._length = length jpayne@69: self._data = data jpayne@69: super().__init__() jpayne@69: jpayne@69: def __getitem__( jpayne@69: self, key: Union[slice, int] jpayne@69: ) -> Union[bytes, SequenceDataAbstractBaseClass]: jpayne@69: if isinstance(key, slice): jpayne@69: start, end, step = key.indices(self._length) jpayne@69: size = len(range(start, end, step)) jpayne@69: if size == 0: jpayne@69: return b"" jpayne@69: data = {} jpayne@69: for s, d in self._data.items(): jpayne@69: indices = range(-s, -s + self._length)[key] jpayne@69: e: Optional[int] = indices.stop jpayne@69: assert e is not None jpayne@69: if step > 0: jpayne@69: if e <= 0: jpayne@69: continue jpayne@69: if indices.start < 0: jpayne@69: s = indices.start % step jpayne@69: else: jpayne@69: s = indices.start jpayne@69: else: # step < 0 jpayne@69: if e < 0: jpayne@69: e = None jpayne@69: end = len(d) - 1 jpayne@69: if indices.start > end: jpayne@69: s = end + (indices.start - end) % step jpayne@69: else: jpayne@69: s = indices.start jpayne@69: if s < 0: jpayne@69: continue jpayne@69: start = (s - indices.start) // step jpayne@69: d = d[s:e:step] jpayne@69: if d: jpayne@69: data[start] = d jpayne@69: if len(data) == 0: # Fully undefined sequence jpayne@69: return _UndefinedSequenceData(size) jpayne@69: # merge adjacent sequence segments jpayne@69: end = -1 jpayne@69: previous = 0 # not needed here, but it keeps flake happy jpayne@69: items = data.items() jpayne@69: data = {} jpayne@69: for start, seq in items: jpayne@69: if end == start: jpayne@69: data[previous] += seq jpayne@69: else: jpayne@69: data[start] = seq jpayne@69: previous = start jpayne@69: end = start + len(seq) jpayne@69: if len(data) == 1: jpayne@69: seq = data.get(0) jpayne@69: if seq is not None and len(seq) == size: jpayne@69: return seq # Fully defined sequence; return bytes jpayne@69: if step < 0: jpayne@69: # use this after we drop Python 3.7: jpayne@69: # data = {start: data[start] for start in reversed(data)} jpayne@69: # use this as long as we support Python 3.7: jpayne@69: data = {start: data[start] for start in reversed(list(data.keys()))} jpayne@69: return _PartiallyDefinedSequenceData(size, data) jpayne@69: elif self._length <= key: jpayne@69: raise IndexError("sequence index out of range") jpayne@69: else: jpayne@69: for start, seq in self._data.items(): jpayne@69: if start <= key and key < start + len(seq): jpayne@69: return seq[key - start] jpayne@69: raise UndefinedSequenceError("Sequence at position %d is undefined" % key) jpayne@69: jpayne@69: def __len__(self): jpayne@69: return self._length jpayne@69: jpayne@69: def __bytes__(self): jpayne@69: raise UndefinedSequenceError("Sequence content is only partially defined") jpayne@69: jpayne@69: def __add__(self, other): jpayne@69: length = len(self) + len(other) jpayne@69: data = dict(self._data) jpayne@69: items = list(self._data.items()) jpayne@69: start, seq = items[-1] jpayne@69: end = start + len(seq) jpayne@69: try: jpayne@69: other = bytes(other) jpayne@69: except UndefinedSequenceError: jpayne@69: if isinstance(other, _UndefinedSequenceData): jpayne@69: pass jpayne@69: elif isinstance(other, _PartiallyDefinedSequenceData): jpayne@69: other_items = list(other._data.items()) jpayne@69: if end == len(self): jpayne@69: other_start, other_seq = other_items.pop(0) jpayne@69: if other_start == 0: jpayne@69: data[start] += other_seq jpayne@69: else: jpayne@69: data[len(self) + other_start] = other_seq jpayne@69: for other_start, other_seq in other_items: jpayne@69: data[len(self) + other_start] = other_seq jpayne@69: else: jpayne@69: if end == len(self): jpayne@69: data[start] += other jpayne@69: else: jpayne@69: data[len(self)] = other jpayne@69: return _PartiallyDefinedSequenceData(length, data) jpayne@69: jpayne@69: def __radd__(self, other): jpayne@69: length = len(other) + len(self) jpayne@69: try: jpayne@69: other = bytes(other) jpayne@69: except UndefinedSequenceError: jpayne@69: data = {len(other) + start: seq for start, seq in self._data.items()} jpayne@69: else: jpayne@69: data = {0: other} jpayne@69: items = list(self._data.items()) jpayne@69: start, seq = items.pop(0) jpayne@69: if start == 0: jpayne@69: data[0] += seq jpayne@69: else: jpayne@69: data[len(other) + start] = seq jpayne@69: for start, seq in items: jpayne@69: data[len(other) + start] = seq jpayne@69: return _PartiallyDefinedSequenceData(length, data) jpayne@69: jpayne@69: def __mul__(self, other): jpayne@69: length = self._length jpayne@69: items = self._data.items() jpayne@69: data = {} jpayne@69: end = -1 jpayne@69: previous = 0 # not needed here, but it keeps flake happy jpayne@69: for i in range(other): jpayne@69: for start, seq in items: jpayne@69: start += i * length jpayne@69: if end == start: jpayne@69: data[previous] += seq jpayne@69: else: jpayne@69: data[start] = seq jpayne@69: previous = start jpayne@69: end = start + len(seq) jpayne@69: return _PartiallyDefinedSequenceData(length * other, data) jpayne@69: jpayne@69: def upper(self): jpayne@69: """Return an upper case copy of the sequence.""" jpayne@69: data = {start: seq.upper() for start, seq in self._data.items()} jpayne@69: return _PartiallyDefinedSequenceData(self._length, data) jpayne@69: jpayne@69: def lower(self): jpayne@69: """Return a lower case copy of the sequence.""" jpayne@69: data = {start: seq.lower() for start, seq in self._data.items()} jpayne@69: return _PartiallyDefinedSequenceData(self._length, data) jpayne@69: jpayne@69: def isupper(self): jpayne@69: """Return True if all ASCII characters in data are uppercase. jpayne@69: jpayne@69: If there are no cased characters, the method returns False. jpayne@69: """ jpayne@69: # Character case is irrelevant for an undefined sequence jpayne@69: raise UndefinedSequenceError("Sequence content is only partially defined") jpayne@69: jpayne@69: def islower(self): jpayne@69: """Return True if all ASCII characters in data are lowercase. jpayne@69: jpayne@69: If there are no cased characters, the method returns False. jpayne@69: """ jpayne@69: # Character case is irrelevant for an undefined sequence jpayne@69: raise UndefinedSequenceError("Sequence content is only partially defined") jpayne@69: jpayne@69: def translate(self, table, delete=b""): jpayne@69: """Return a copy with each character mapped by the given translation table. jpayne@69: jpayne@69: table jpayne@69: Translation table, which must be a bytes object of length 256. jpayne@69: jpayne@69: All characters occurring in the optional argument delete are removed. jpayne@69: The remaining characters are mapped through the given translation table. jpayne@69: """ jpayne@69: items = self._data.items() jpayne@69: data = {start: seq.translate(table, delete) for start, seq in items} jpayne@69: return _PartiallyDefinedSequenceData(self._length, data) jpayne@69: jpayne@69: def replace(self, old, new): jpayne@69: """Return a copy with all occurrences of substring old replaced by new.""" jpayne@69: # Replacing substring old by new in the undefined sequence segments jpayne@69: # will result in an undefined sequence segment of the same length, if jpayne@69: # old and new have the number of characters. If not, an error is raised, jpayne@69: # as the correct start positions cannot be calculated reliably. jpayne@69: if len(old) != len(new): jpayne@69: raise UndefinedSequenceError( jpayne@69: "Sequence content is only partially defined; substring \n" jpayne@69: "replacement cannot be performed reliably" jpayne@69: ) jpayne@69: items = self._data.items() jpayne@69: data = {start: seq.replace(old, new) for start, seq in items} jpayne@69: return _PartiallyDefinedSequenceData(self._length, data) jpayne@69: jpayne@69: @property jpayne@69: def defined(self): jpayne@69: """Return False, as the sequence is not fully defined and has a non-zero length.""" jpayne@69: return False jpayne@69: jpayne@69: @property jpayne@69: def defined_ranges(self): jpayne@69: """Return a tuple of the ranges where the sequence contents is defined. jpayne@69: jpayne@69: The return value has the format ((start1, end1), (start2, end2), ...). jpayne@69: """ jpayne@69: return tuple((start, start + len(seq)) for start, seq in self._data.items()) jpayne@69: jpayne@69: jpayne@69: # The transcribe, backward_transcribe, and translate functions are jpayne@69: # user-friendly versions of the corresponding Seq/MutableSeq methods. jpayne@69: # The functions work both on Seq objects, and on strings. jpayne@69: jpayne@69: jpayne@69: def transcribe(dna): jpayne@69: """Transcribe a DNA sequence into RNA. jpayne@69: jpayne@69: Following the usual convention, the sequence is interpreted as the jpayne@69: coding strand of the DNA double helix, not the template strand. This jpayne@69: means we can get the RNA sequence just by switching T to U. jpayne@69: jpayne@69: If given a string, returns a new string object. jpayne@69: jpayne@69: Given a Seq or MutableSeq, returns a new Seq object. jpayne@69: jpayne@69: e.g. jpayne@69: jpayne@69: >>> transcribe("ACTGN") jpayne@69: 'ACUGN' jpayne@69: """ jpayne@69: if isinstance(dna, Seq): jpayne@69: return dna.transcribe() jpayne@69: elif isinstance(dna, MutableSeq): jpayne@69: return Seq(dna).transcribe() jpayne@69: else: jpayne@69: return dna.replace("T", "U").replace("t", "u") jpayne@69: jpayne@69: jpayne@69: def back_transcribe(rna): jpayne@69: """Return the RNA sequence back-transcribed into DNA. jpayne@69: jpayne@69: If given a string, returns a new string object. jpayne@69: jpayne@69: Given a Seq or MutableSeq, returns a new Seq object. jpayne@69: jpayne@69: e.g. jpayne@69: jpayne@69: >>> back_transcribe("ACUGN") jpayne@69: 'ACTGN' jpayne@69: """ jpayne@69: if isinstance(rna, Seq): jpayne@69: return rna.back_transcribe() jpayne@69: elif isinstance(rna, MutableSeq): jpayne@69: return Seq(rna).back_transcribe() jpayne@69: else: jpayne@69: return rna.replace("U", "T").replace("u", "t") jpayne@69: jpayne@69: jpayne@69: def _translate_str( jpayne@69: sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None jpayne@69: ): jpayne@69: """Translate nucleotide string into a protein string (PRIVATE). jpayne@69: jpayne@69: Arguments: jpayne@69: - sequence - a string jpayne@69: - table - Which codon table to use? This can be either a name (string), jpayne@69: an NCBI identifier (integer), or a CodonTable object (useful for jpayne@69: non-standard genetic codes). This defaults to the "Standard" table. jpayne@69: - stop_symbol - a single character string, what to use for terminators. jpayne@69: - to_stop - boolean, should translation terminate at the first jpayne@69: in frame stop codon? If there is no in-frame stop codon jpayne@69: then translation continues to the end. jpayne@69: - pos_stop - a single character string for a possible stop codon jpayne@69: (e.g. TAN or NNN) jpayne@69: - cds - Boolean, indicates this is a complete CDS. If True, this jpayne@69: checks the sequence starts with a valid alternative start jpayne@69: codon (which will be translated as methionine, M), that the jpayne@69: sequence length is a multiple of three, and that there is a jpayne@69: single in frame stop codon at the end (this will be excluded jpayne@69: from the protein sequence, regardless of the to_stop option). jpayne@69: If these tests fail, an exception is raised. jpayne@69: - gap - Single character string to denote symbol used for gaps. jpayne@69: Defaults to None. jpayne@69: jpayne@69: Returns a string. jpayne@69: jpayne@69: e.g. jpayne@69: jpayne@69: >>> from Bio.Data import CodonTable jpayne@69: >>> table = CodonTable.ambiguous_dna_by_id[1] jpayne@69: >>> _translate_str("AAA", table) jpayne@69: 'K' jpayne@69: >>> _translate_str("TAR", table) jpayne@69: '*' jpayne@69: >>> _translate_str("TAN", table) jpayne@69: 'X' jpayne@69: >>> _translate_str("TAN", table, pos_stop="@") jpayne@69: '@' jpayne@69: >>> _translate_str("TA?", table) jpayne@69: Traceback (most recent call last): jpayne@69: ... jpayne@69: Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid jpayne@69: jpayne@69: In a change to older versions of Biopython, partial codons are now jpayne@69: always regarded as an error (previously only checked if cds=True) jpayne@69: and will trigger a warning (likely to become an exception in a jpayne@69: future release). jpayne@69: jpayne@69: If **cds=True**, the start and stop codons are checked, and the start jpayne@69: codon will be translated at methionine. The sequence must be an jpayne@69: while number of codons. jpayne@69: jpayne@69: >>> _translate_str("ATGCCCTAG", table, cds=True) jpayne@69: 'MP' jpayne@69: >>> _translate_str("AAACCCTAG", table, cds=True) jpayne@69: Traceback (most recent call last): jpayne@69: ... jpayne@69: Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon jpayne@69: >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) jpayne@69: Traceback (most recent call last): jpayne@69: ... jpayne@69: Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found. jpayne@69: """ jpayne@69: try: jpayne@69: table_id = int(table) jpayne@69: except ValueError: jpayne@69: # Assume it's a table name jpayne@69: # The same table can be used for RNA or DNA jpayne@69: try: jpayne@69: codon_table = CodonTable.ambiguous_generic_by_name[table] jpayne@69: except KeyError: jpayne@69: if isinstance(table, str): jpayne@69: raise ValueError( jpayne@69: "The Bio.Seq translate methods and function DO NOT " jpayne@69: "take a character string mapping table like the python " jpayne@69: "string object's translate method. " jpayne@69: "Use str(my_seq).translate(...) instead." jpayne@69: ) from None jpayne@69: else: jpayne@69: raise TypeError("table argument must be integer or string") from None jpayne@69: except (AttributeError, TypeError): jpayne@69: # Assume it's a CodonTable object jpayne@69: if isinstance(table, CodonTable.CodonTable): jpayne@69: codon_table = table jpayne@69: else: jpayne@69: raise ValueError("Bad table argument") from None jpayne@69: else: jpayne@69: # Assume it's a table ID jpayne@69: # The same table can be used for RNA or DNA jpayne@69: codon_table = CodonTable.ambiguous_generic_by_id[table_id] jpayne@69: sequence = sequence.upper() jpayne@69: amino_acids = [] jpayne@69: forward_table = codon_table.forward_table jpayne@69: stop_codons = codon_table.stop_codons jpayne@69: if codon_table.nucleotide_alphabet is not None: jpayne@69: valid_letters = set(codon_table.nucleotide_alphabet.upper()) jpayne@69: else: jpayne@69: # Assume the worst case, ambiguous DNA or RNA: jpayne@69: valid_letters = set( jpayne@69: IUPACData.ambiguous_dna_letters.upper() jpayne@69: + IUPACData.ambiguous_rna_letters.upper() jpayne@69: ) jpayne@69: n = len(sequence) jpayne@69: jpayne@69: # Check for tables with 'ambiguous' (dual-coding) stop codons: jpayne@69: dual_coding = [c for c in stop_codons if c in forward_table] jpayne@69: if dual_coding: jpayne@69: c = dual_coding[0] jpayne@69: if to_stop: jpayne@69: raise ValueError( jpayne@69: "You cannot use 'to_stop=True' with this table as it contains" jpayne@69: f" {len(dual_coding)} codon(s) which can be both STOP and an" jpayne@69: f" amino acid (e.g. '{c}' -> '{forward_table[c]}' or STOP)." jpayne@69: ) jpayne@69: warnings.warn( jpayne@69: f"This table contains {len(dual_coding)} codon(s) which code(s) for" jpayne@69: f" both STOP and an amino acid (e.g. '{c}' -> '{forward_table[c]}'" jpayne@69: " or STOP). Such codons will be translated as amino acid.", jpayne@69: BiopythonWarning, jpayne@69: ) jpayne@69: jpayne@69: if cds: jpayne@69: if str(sequence[:3]).upper() not in codon_table.start_codons: jpayne@69: raise CodonTable.TranslationError( jpayne@69: f"First codon '{sequence[:3]}' is not a start codon" jpayne@69: ) jpayne@69: if n % 3 != 0: jpayne@69: raise CodonTable.TranslationError( jpayne@69: f"Sequence length {n} is not a multiple of three" jpayne@69: ) jpayne@69: if str(sequence[-3:]).upper() not in stop_codons: jpayne@69: raise CodonTable.TranslationError( jpayne@69: f"Final codon '{sequence[-3:]}' is not a stop codon" jpayne@69: ) jpayne@69: # Don't translate the stop symbol, and manually translate the M jpayne@69: sequence = sequence[3:-3] jpayne@69: n -= 6 jpayne@69: amino_acids = ["M"] jpayne@69: elif n % 3 != 0: jpayne@69: warnings.warn( jpayne@69: "Partial codon, len(sequence) not a multiple of three. " jpayne@69: "Explicitly trim the sequence or add trailing N before " jpayne@69: "translation. This may become an error in future.", jpayne@69: BiopythonWarning, jpayne@69: ) jpayne@69: if gap is not None: jpayne@69: if not isinstance(gap, str): jpayne@69: raise TypeError("Gap character should be a single character string.") jpayne@69: elif len(gap) > 1: jpayne@69: raise ValueError("Gap character should be a single character string.") jpayne@69: jpayne@69: for i in range(0, n - n % 3, 3): jpayne@69: codon = sequence[i : i + 3] jpayne@69: try: jpayne@69: amino_acids.append(forward_table[codon]) jpayne@69: except (KeyError, CodonTable.TranslationError): jpayne@69: if codon in codon_table.stop_codons: jpayne@69: if cds: jpayne@69: raise CodonTable.TranslationError( jpayne@69: f"Extra in frame stop codon '{codon}' found." jpayne@69: ) from None jpayne@69: if to_stop: jpayne@69: break jpayne@69: amino_acids.append(stop_symbol) jpayne@69: elif valid_letters.issuperset(set(codon)): jpayne@69: # Possible stop codon (e.g. NNN or TAN) jpayne@69: amino_acids.append(pos_stop) jpayne@69: elif gap is not None and codon == gap * 3: jpayne@69: # Gapped translation jpayne@69: amino_acids.append(gap) jpayne@69: else: jpayne@69: raise CodonTable.TranslationError( jpayne@69: f"Codon '{codon}' is invalid" jpayne@69: ) from None jpayne@69: return "".join(amino_acids) jpayne@69: jpayne@69: jpayne@69: def translate( jpayne@69: sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None jpayne@69: ): jpayne@69: """Translate a nucleotide sequence into amino acids. jpayne@69: jpayne@69: If given a string, returns a new string object. Given a Seq or jpayne@69: MutableSeq, returns a Seq object. jpayne@69: jpayne@69: Arguments: jpayne@69: - table - Which codon table to use? This can be either a name jpayne@69: (string), an NCBI identifier (integer), or a CodonTable object jpayne@69: (useful for non-standard genetic codes). Defaults to the "Standard" jpayne@69: table. jpayne@69: - stop_symbol - Single character string, what to use for any jpayne@69: terminators, defaults to the asterisk, "*". jpayne@69: - to_stop - Boolean, defaults to False meaning do a full jpayne@69: translation continuing on past any stop codons jpayne@69: (translated as the specified stop_symbol). If jpayne@69: True, translation is terminated at the first in jpayne@69: frame stop codon (and the stop_symbol is not jpayne@69: appended to the returned protein sequence). jpayne@69: - cds - Boolean, indicates this is a complete CDS. If True, this jpayne@69: checks the sequence starts with a valid alternative start jpayne@69: codon (which will be translated as methionine, M), that the jpayne@69: sequence length is a multiple of three, and that there is a jpayne@69: single in frame stop codon at the end (this will be excluded jpayne@69: from the protein sequence, regardless of the to_stop option). jpayne@69: If these tests fail, an exception is raised. jpayne@69: - gap - Single character string to denote symbol used for gaps. jpayne@69: Defaults to None. jpayne@69: jpayne@69: A simple string example using the default (standard) genetic code: jpayne@69: jpayne@69: >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" jpayne@69: >>> translate(coding_dna) jpayne@69: 'VAIVMGR*KGAR*' jpayne@69: >>> translate(coding_dna, stop_symbol="@") jpayne@69: 'VAIVMGR@KGAR@' jpayne@69: >>> translate(coding_dna, to_stop=True) jpayne@69: 'VAIVMGR' jpayne@69: jpayne@69: Now using NCBI table 2, where TGA is not a stop codon: jpayne@69: jpayne@69: >>> translate(coding_dna, table=2) jpayne@69: 'VAIVMGRWKGAR*' jpayne@69: >>> translate(coding_dna, table=2, to_stop=True) jpayne@69: 'VAIVMGRWKGAR' jpayne@69: jpayne@69: In fact this example uses an alternative start codon valid under NCBI jpayne@69: table 2, GTG, which means this example is a complete valid CDS which jpayne@69: when translated should really start with methionine (not valine): jpayne@69: jpayne@69: >>> translate(coding_dna, table=2, cds=True) jpayne@69: 'MAIVMGRWKGAR' jpayne@69: jpayne@69: Note that if the sequence has no in-frame stop codon, then the to_stop jpayne@69: argument has no effect: jpayne@69: jpayne@69: >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" jpayne@69: >>> translate(coding_dna2) jpayne@69: 'VAIVMGR' jpayne@69: >>> translate(coding_dna2, to_stop=True) jpayne@69: 'VAIVMGR' jpayne@69: jpayne@69: NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid jpayne@69: or a stop codon. These are translated as "X". Any invalid codon jpayne@69: (e.g. "TA?" or "T-A") will throw a TranslationError. jpayne@69: jpayne@69: It will however translate either DNA or RNA. jpayne@69: jpayne@69: NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous jpayne@69: stop codons'. These are stop codons with unambiguous sequence but which jpayne@69: have a context dependent coding as STOP or as amino acid. With these tables jpayne@69: 'to_stop' must be False (otherwise a ValueError is raised). The dual jpayne@69: coding codons will always be translated as amino acid, except for jpayne@69: 'cds=True', where the last codon will be translated as STOP. jpayne@69: jpayne@69: >>> coding_dna3 = "ATGGCACGGAAGTGA" jpayne@69: >>> translate(coding_dna3) jpayne@69: 'MARK*' jpayne@69: jpayne@69: >>> translate(coding_dna3, table=27) # Table 27: TGA -> STOP or W jpayne@69: 'MARKW' jpayne@69: jpayne@69: It will however raise a BiopythonWarning (not shown). jpayne@69: jpayne@69: >>> translate(coding_dna3, table=27, cds=True) jpayne@69: 'MARK' jpayne@69: jpayne@69: >>> translate(coding_dna3, table=27, to_stop=True) jpayne@69: Traceback (most recent call last): jpayne@69: ... jpayne@69: ValueError: You cannot use 'to_stop=True' with this table ... jpayne@69: """ jpayne@69: if isinstance(sequence, Seq): jpayne@69: return sequence.translate(table, stop_symbol, to_stop, cds) jpayne@69: elif isinstance(sequence, MutableSeq): jpayne@69: # Return a Seq object jpayne@69: return Seq(sequence).translate(table, stop_symbol, to_stop, cds) jpayne@69: else: jpayne@69: # Assume it's a string, return a string jpayne@69: return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap) jpayne@69: jpayne@69: jpayne@69: def reverse_complement(sequence, inplace=False): jpayne@69: """Return the reverse complement as a DNA sequence. jpayne@69: jpayne@69: If given a string, returns a new string object. jpayne@69: Given a Seq object, returns a new Seq object. jpayne@69: Given a MutableSeq, returns a new MutableSeq object. jpayne@69: Given a SeqRecord object, returns a new SeqRecord object. jpayne@69: jpayne@69: >>> my_seq = "CGA" jpayne@69: >>> reverse_complement(my_seq) jpayne@69: 'TCG' jpayne@69: >>> my_seq = Seq("CGA") jpayne@69: >>> reverse_complement(my_seq) jpayne@69: Seq('TCG') jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> reverse_complement(my_seq) jpayne@69: MutableSeq('TCG') jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: jpayne@69: Any U in the sequence is treated as a T: jpayne@69: jpayne@69: >>> reverse_complement(Seq("CGAUT")) jpayne@69: Seq('AATCG') jpayne@69: jpayne@69: In contrast, ``reverse_complement_rna`` returns an RNA sequence: jpayne@69: jpayne@69: >>> reverse_complement_rna(Seq("CGAUT")) jpayne@69: Seq('AAUCG') jpayne@69: jpayne@69: Supports and lower- and upper-case characters, and unambiguous and jpayne@69: ambiguous nucleotides. All other characters are not converted: jpayne@69: jpayne@69: >>> reverse_complement("ACGTUacgtuXYZxyz") jpayne@69: 'zrxZRXaacgtAACGT' jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> reverse_complement(my_seq, inplace=True) jpayne@69: MutableSeq('TCG') jpayne@69: >>> my_seq jpayne@69: MutableSeq('TCG') jpayne@69: jpayne@69: As strings and ``Seq`` objects are immutable, a ``TypeError`` is jpayne@69: raised if ``reverse_complement`` is called on a ``Seq`` object with jpayne@69: ``inplace=True``. jpayne@69: """ jpayne@69: from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports jpayne@69: jpayne@69: if isinstance(sequence, (Seq, MutableSeq)): jpayne@69: return sequence.reverse_complement(inplace) jpayne@69: if isinstance(sequence, SeqRecord): jpayne@69: if inplace: jpayne@69: raise TypeError("SeqRecords are immutable") jpayne@69: return sequence.reverse_complement() jpayne@69: # Assume it's a string. jpayne@69: if inplace: jpayne@69: raise TypeError("strings are immutable") jpayne@69: sequence = sequence.encode("ASCII") jpayne@69: sequence = sequence.translate(_dna_complement_table) jpayne@69: sequence = sequence.decode("ASCII") jpayne@69: return sequence[::-1] jpayne@69: jpayne@69: jpayne@69: def reverse_complement_rna(sequence, inplace=False): jpayne@69: """Return the reverse complement as an RNA sequence. jpayne@69: jpayne@69: If given a string, returns a new string object. jpayne@69: Given a Seq object, returns a new Seq object. jpayne@69: Given a MutableSeq, returns a new MutableSeq object. jpayne@69: Given a SeqRecord object, returns a new SeqRecord object. jpayne@69: jpayne@69: >>> my_seq = "CGA" jpayne@69: >>> reverse_complement_rna(my_seq) jpayne@69: 'UCG' jpayne@69: >>> my_seq = Seq("CGA") jpayne@69: >>> reverse_complement_rna(my_seq) jpayne@69: Seq('UCG') jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> reverse_complement_rna(my_seq) jpayne@69: MutableSeq('UCG') jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: jpayne@69: Any T in the sequence is treated as a U: jpayne@69: jpayne@69: >>> reverse_complement_rna(Seq("CGAUT")) jpayne@69: Seq('AAUCG') jpayne@69: jpayne@69: In contrast, ``reverse_complement`` returns a DNA sequence: jpayne@69: jpayne@69: >>> reverse_complement(Seq("CGAUT"), inplace=False) jpayne@69: Seq('AATCG') jpayne@69: jpayne@69: Supports and lower- and upper-case characters, and unambiguous and jpayne@69: ambiguous nucleotides. All other characters are not converted: jpayne@69: jpayne@69: >>> reverse_complement_rna("ACGTUacgtuXYZxyz") jpayne@69: 'zrxZRXaacguAACGU' jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> reverse_complement_rna(my_seq, inplace=True) jpayne@69: MutableSeq('UCG') jpayne@69: >>> my_seq jpayne@69: MutableSeq('UCG') jpayne@69: jpayne@69: As strings and ``Seq`` objects are immutable, a ``TypeError`` is jpayne@69: raised if ``reverse_complement`` is called on a ``Seq`` object with jpayne@69: ``inplace=True``. jpayne@69: """ jpayne@69: from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports jpayne@69: jpayne@69: if isinstance(sequence, (Seq, MutableSeq)): jpayne@69: return sequence.reverse_complement_rna(inplace) jpayne@69: if isinstance(sequence, SeqRecord): jpayne@69: if inplace: jpayne@69: raise TypeError("SeqRecords are immutable") jpayne@69: return sequence.reverse_complement_rna() jpayne@69: # Assume it's a string. jpayne@69: if inplace: jpayne@69: raise TypeError("strings are immutable") jpayne@69: sequence = sequence.encode("ASCII") jpayne@69: sequence = sequence.translate(_rna_complement_table) jpayne@69: sequence = sequence.decode("ASCII") jpayne@69: return sequence[::-1] jpayne@69: jpayne@69: jpayne@69: def complement(sequence, inplace=False): jpayne@69: """Return the complement as a DNA sequence. jpayne@69: jpayne@69: If given a string, returns a new string object. jpayne@69: Given a Seq object, returns a new Seq object. jpayne@69: Given a MutableSeq, returns a new MutableSeq object. jpayne@69: Given a SeqRecord object, returns a new SeqRecord object. jpayne@69: jpayne@69: >>> my_seq = "CGA" jpayne@69: >>> complement(my_seq) jpayne@69: 'GCT' jpayne@69: >>> my_seq = Seq("CGA") jpayne@69: >>> complement(my_seq) jpayne@69: Seq('GCT') jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> complement(my_seq) jpayne@69: MutableSeq('GCT') jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: jpayne@69: Any U in the sequence is treated as a T: jpayne@69: jpayne@69: >>> complement(Seq("CGAUT")) jpayne@69: Seq('GCTAA') jpayne@69: jpayne@69: In contrast, ``complement_rna`` returns an RNA sequence: jpayne@69: jpayne@69: >>> complement_rna(Seq("CGAUT")) jpayne@69: Seq('GCUAA') jpayne@69: jpayne@69: Supports and lower- and upper-case characters, and unambiguous and jpayne@69: ambiguous nucleotides. All other characters are not converted: jpayne@69: jpayne@69: >>> complement("ACGTUacgtuXYZxyz") jpayne@69: 'TGCAAtgcaaXRZxrz' jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> complement(my_seq, inplace=True) jpayne@69: MutableSeq('GCT') jpayne@69: >>> my_seq jpayne@69: MutableSeq('GCT') jpayne@69: jpayne@69: As strings and ``Seq`` objects are immutable, a ``TypeError`` is jpayne@69: raised if ``reverse_complement`` is called on a ``Seq`` object with jpayne@69: ``inplace=True``. jpayne@69: """ jpayne@69: from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports jpayne@69: jpayne@69: if isinstance(sequence, (Seq, MutableSeq)): jpayne@69: return sequence.complement(inplace) jpayne@69: if isinstance(sequence, SeqRecord): jpayne@69: if inplace: jpayne@69: raise TypeError("SeqRecords are immutable") jpayne@69: return sequence.complement() jpayne@69: # Assume it's a string. jpayne@69: if inplace is True: jpayne@69: raise TypeError("strings are immutable") jpayne@69: sequence = sequence.encode("ASCII") jpayne@69: sequence = sequence.translate(_dna_complement_table) jpayne@69: return sequence.decode("ASCII") jpayne@69: jpayne@69: jpayne@69: def complement_rna(sequence, inplace=False): jpayne@69: """Return the complement as an RNA sequence. jpayne@69: jpayne@69: If given a string, returns a new string object. jpayne@69: Given a Seq object, returns a new Seq object. jpayne@69: Given a MutableSeq, returns a new MutableSeq object. jpayne@69: Given a SeqRecord object, returns a new SeqRecord object. jpayne@69: jpayne@69: >>> my_seq = "CGA" jpayne@69: >>> complement_rna(my_seq) jpayne@69: 'GCU' jpayne@69: >>> my_seq = Seq("CGA") jpayne@69: >>> complement_rna(my_seq) jpayne@69: Seq('GCU') jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> complement_rna(my_seq) jpayne@69: MutableSeq('GCU') jpayne@69: >>> my_seq jpayne@69: MutableSeq('CGA') jpayne@69: jpayne@69: Any T in the sequence is treated as a U: jpayne@69: jpayne@69: >>> complement_rna(Seq("CGAUT")) jpayne@69: Seq('GCUAA') jpayne@69: jpayne@69: In contrast, ``complement`` returns a DNA sequence: jpayne@69: jpayne@69: >>> complement(Seq("CGAUT")) jpayne@69: Seq('GCTAA') jpayne@69: jpayne@69: Supports and lower- and upper-case characters, and unambiguous and jpayne@69: ambiguous nucleotides. All other characters are not converted: jpayne@69: jpayne@69: >>> complement_rna("ACGTUacgtuXYZxyz") jpayne@69: 'UGCAAugcaaXRZxrz' jpayne@69: jpayne@69: The sequence is modified in-place and returned if inplace is True: jpayne@69: jpayne@69: >>> my_seq = MutableSeq("CGA") jpayne@69: >>> complement(my_seq, inplace=True) jpayne@69: MutableSeq('GCT') jpayne@69: >>> my_seq jpayne@69: MutableSeq('GCT') jpayne@69: jpayne@69: As strings and ``Seq`` objects are immutable, a ``TypeError`` is jpayne@69: raised if ``reverse_complement`` is called on a ``Seq`` object with jpayne@69: ``inplace=True``. jpayne@69: """ jpayne@69: from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports jpayne@69: jpayne@69: if isinstance(sequence, (Seq, MutableSeq)): jpayne@69: return sequence.complement_rna(inplace) jpayne@69: if isinstance(sequence, SeqRecord): jpayne@69: if inplace: jpayne@69: raise TypeError("SeqRecords are immutable") jpayne@69: return sequence.complement_rna() jpayne@69: # Assume it's a string. jpayne@69: if inplace: jpayne@69: raise TypeError("strings are immutable") jpayne@69: sequence = sequence.encode("ASCII") jpayne@69: sequence = sequence.translate(_rna_complement_table) jpayne@69: return sequence.decode("ASCII") jpayne@69: jpayne@69: jpayne@69: def _test(): jpayne@69: """Run the Bio.Seq module's doctests (PRIVATE).""" jpayne@69: print("Running doctests...") jpayne@69: import doctest jpayne@69: jpayne@69: doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) jpayne@69: print("Done") jpayne@69: jpayne@69: jpayne@69: if __name__ == "__main__": jpayne@69: _test()