csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py comparison

comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

comparison

equal deleted inserted replaced

-:0e9998148a16
+:33d812a61356
+# Copyright 2000 Andrew Dalke.
+# Copyright 2000-2002 Brad Chapman.
+# Copyright 2004-2005, 2010 by M de Hoon.
+# Copyright 2007-2023 by Peter Cock.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Provide objects to represent biological sequences.
+See also the Seq_ wiki and the chapter in our tutorial:
+- `HTML Tutorial`_
+- `PDF Tutorial`_
+.. _Seq: http://biopython.org/wiki/Seq
+.. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
+.. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
+"""
+import array
+import collections
+import numbers
+import warnings
+from abc import ABC
+from abc import abstractmethod
+from typing import overload, Optional, Union, Dict
+from Bio import BiopythonWarning
+from Bio.Data import CodonTable
+from Bio.Data import IUPACData
+def _maketrans(complement_mapping):
+"""Make a python string translation table (PRIVATE).
+Arguments:
+- complement_mapping - a dictionary such as ambiguous_dna_complement
+and ambiguous_rna_complement from Data.IUPACData.
+Returns a translation table (a bytes object of length 256) for use with
+the python string's translate method to use in a (reverse) complement.
+Compatible with lower case and upper case sequences.
+For internal use only.
+"""
+keys = "".join(complement_mapping.keys()).encode("ASCII")
+values = "".join(complement_mapping.values()).encode("ASCII")
+return bytes.maketrans(keys + keys.lower(), values + values.lower())
# Module-level translation tables for complementing sequences.
# Each IUPAC mapping is copied first so the shared IUPACData dictionaries are
# not modified, then extended with the other alphabet's letter (U for DNA,
# T for RNA) as an alias. The temporary dictionaries are deleted afterwards
# so that only the finished tables remain in the module namespace.
ambiguous_dna_complement = {**IUPACData.ambiguous_dna_complement}
ambiguous_dna_complement.update(U=ambiguous_dna_complement["T"])
_dna_complement_table = _maketrans(ambiguous_dna_complement)
del ambiguous_dna_complement

ambiguous_rna_complement = {**IUPACData.ambiguous_rna_complement}
ambiguous_rna_complement.update(T=ambiguous_rna_complement["U"])
_rna_complement_table = _maketrans(ambiguous_rna_complement)
del ambiguous_rna_complement
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

    * ``__len__`` must return the sequence length;
    * ``__getitem__`` must return

      * a ``bytes`` object for the requested region; or
      * a new instance of the subclass for the requested region; or
      * raise an ``UndefinedSequenceError``.

      Calling ``__getitem__`` for a sequence region of size zero should always
      return an empty ``bytes`` object.
      Calling ``__getitem__`` for the full sequence (as in data[:]) should
      either return a ``bytes`` object with the full sequence, or raise an
      ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    Unless noted otherwise, the remaining methods materialise the full
    sequence with ``bytes(self)`` and delegate to the ``bytes`` method of the
    same name.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        """Return the sequence length (to be implemented by subclasses)."""

    @abstractmethod
    def __getitem__(self, key):
        """Return the data for the requested region (to be implemented by subclasses)."""

    def __bytes__(self):
        """Return the full sequence, obtained by slicing ``self[:]``."""
        return self[:]

    def __hash__(self):
        """Hash the data like the equivalent ``bytes`` object."""
        return hash(bytes(self))

    def __eq__(self, other):
        """Compare the full sequence contents with ``other``."""
        return bytes(self) == other

    def __lt__(self, other):
        """Implement the less-than operand on the full sequence contents."""
        return bytes(self) < other

    def __le__(self, other):
        """Implement the less-than or equal operand on the full sequence contents."""
        return bytes(self) <= other

    def __gt__(self, other):
        """Implement the greater-than operand on the full sequence contents."""
        return bytes(self) > other

    def __ge__(self, other):
        """Implement the greater-than or equal operand on the full sequence contents."""
        return bytes(self) >= other

    def __add__(self, other):
        """Concatenate the data with ``other``, returning ``bytes``."""
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            return NotImplemented
            # will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__

    def __radd__(self, other):
        """Concatenate ``other`` on the left of the data."""
        return other + bytes(self)

    def __mul__(self, other):
        """Repeat the data ``other`` times, returning ``bytes``."""
        return other * bytes(self)

    def __contains__(self, item):
        """Return True if ``item`` occurs in the full sequence contents."""
        return bytes(self).__contains__(item)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raise ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present.

        Returns a ``bytes`` object; the data are returned unchanged when they
        do not start with ``prefix``.
        """
        data = bytes(self)
        try:
            return data.removeprefix(prefix)
        except AttributeError:
            # bytes.removeprefix needs Python 3.9+; emulate it on older versions.
            if data.startswith(prefix):
                return data[len(prefix) :]
            return data

    def removesuffix(self, suffix):
        """Remove the suffix if present.

        Returns a ``bytes`` object; the data are returned unchanged when they
        do not end with ``suffix``.
        """
        data = bytes(self)
        try:
            return data.removesuffix(suffix)
        except AttributeError:
            # bytes.removesuffix needs Python 3.9+; emulate it on older versions.
            # The suffix must be tested at the END of the data, and an empty
            # suffix must be skipped because data[:-0] would be empty.
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

        table
          Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        return ()
+class _SeqAbstractBaseClass(ABC):
+"""Abstract base class for the Seq and MutableSeq classes (PRIVATE).
+Most users will not need to use this class. It is used internally as an
+abstract base class for Seq and MutableSeq, as most of their methods are
+identical.
+"""
+__slots__ = ("_data",)
+__array_ufunc__ = None  # turn off numpy Ufuncs
+@abstractmethod
+def __init__(self):
+pass
+def __bytes__(self):
+return bytes(self._data)
def __repr__(self):
    """Return (truncated) representation of the sequence.

    Stretches longer than 60 letters are abbreviated to their first 54
    letters, an ellipsis, and their last three letters. Undefined data are
    shown as ``Seq(None, length=...)``; partially defined data are shown as
    a dictionary mapping each start position to its (abbreviated) letters.
    """

    def abbreviate(chunk):
        # The last three letters are kept because it is often useful to see
        # whether a sequence ends in a stop codon. Total length: 54+3+3=60.
        if len(chunk) > 60:
            head = chunk[:54].decode("ASCII")
            tail = chunk[-3:].decode("ASCII")
            return f"{head}...{tail}"
        return chunk.decode("ASCII")

    data = self._data
    if isinstance(data, _UndefinedSequenceData):
        return f"Seq(None, length={len(self)})"
    if isinstance(data, _PartiallyDefinedSequenceData):
        known = {}
        for position, letters in data._data.items():
            known[position] = abbreviate(letters)
        return "Seq(%r, length=%d)" % (known, len(self))
    shown = abbreviate(data)
    return f"{self.__class__.__name__}('{shown}')"
+def __str__(self):
+"""Return the full sequence as a python string."""
+return self._data.decode("ASCII")
+def __eq__(self, other):
+"""Compare the sequence to another sequence or a string.
+Sequences are equal to each other if their sequence contents is
+identical:
+>>> from Bio.Seq import Seq, MutableSeq
+>>> seq1 = Seq("ACGT")
+>>> seq2 = Seq("ACGT")
+>>> mutable_seq = MutableSeq("ACGT")
+>>> seq1 == seq2
+True
+>>> seq1 == mutable_seq
+True
+>>> seq1 == "ACGT"
+True
+Note that the sequence objects themselves are not identical to each
+other:
+>>> id(seq1) == id(seq2)
+False
+>>> seq1 is seq2
+False
+Sequences can also be compared to strings, ``bytes``, and ``bytearray``
+objects:
+>>> seq1 == "ACGT"
+True
+>>> seq1 == b"ACGT"
+True
+>>> seq1 == bytearray(b"ACGT")
+True
+"""
+if isinstance(other, _SeqAbstractBaseClass):
+return self._data == other._data
+elif isinstance(other, str):
+return self._data == other.encode("ASCII")
+else:
+return self._data == other
+def __lt__(self, other):
+"""Implement the less-than operand."""
+if isinstance(other, _SeqAbstractBaseClass):
+return self._data < other._data
+elif isinstance(other, str):
+return self._data < other.encode("ASCII")
+else:
+return self._data < other
+def __le__(self, other):
+"""Implement the less-than or equal operand."""
+if isinstance(other, _SeqAbstractBaseClass):
+return self._data <= other._data
+elif isinstance(other, str):
+return self._data <= other.encode("ASCII")
+else:
+return self._data <= other
+def __gt__(self, other):
+"""Implement the greater-than operand."""
+if isinstance(other, _SeqAbstractBaseClass):
+return self._data > other._data
+elif isinstance(other, str):
+return self._data > other.encode("ASCII")
+else:
+return self._data > other
+def __ge__(self, other):
+"""Implement the greater-than or equal operand."""
+if isinstance(other, _SeqAbstractBaseClass):
+return self._data >= other._data
+elif isinstance(other, str):
+return self._data >= other.encode("ASCII")
+else:
+return self._data >= other
+def __len__(self):
+"""Return the length of the sequence."""
+return len(self._data)
+def __iter__(self):
+"""Return an iterable of the sequence."""
+return self._data.decode("ASCII").__iter__()
+@overload
+def __getitem__(self, index: int) -> str:
+...
+@overload
+def __getitem__(self, index: slice) -> "Seq":
+...
+def __getitem__(self, index):
+"""Return a subsequence as a single letter or as a sequence object.
+If the index is an integer, a single letter is returned as a Python
+string:
+>>> seq = Seq('ACTCGACGTCG')
+>>> seq[5]
+'A'
+Otherwise, a new sequence object of the same class is returned:
+>>> seq[5:8]
+Seq('ACG')
+>>> mutable_seq = MutableSeq('ACTCGACGTCG')
+>>> mutable_seq[5:8]
+MutableSeq('ACG')
+"""
+if isinstance(index, numbers.Integral):
+# Return a single letter as a string
+return chr(self._data[index])
+else:
+# Return the (sub)sequence as another Seq/MutableSeq object
+return self.__class__(self._data[index])
+def __add__(self, other):
+"""Add a sequence or string to this sequence.
+>>> from Bio.Seq import Seq, MutableSeq
+>>> Seq("MELKI") + "LV"
+Seq('MELKILV')
+>>> MutableSeq("MELKI") + "LV"
+MutableSeq('MELKILV')
+"""
+if isinstance(other, _SeqAbstractBaseClass):
+return self.__class__(self._data + other._data)
+elif isinstance(other, str):
+return self.__class__(self._data + other.encode("ASCII"))
+else:
+# If other is a SeqRecord, then SeqRecord's __radd__ will handle
+# this. If not, returning NotImplemented will trigger a TypeError.
+return NotImplemented
+def __radd__(self, other):
+"""Add a sequence string on the left.
+>>> from Bio.Seq import Seq, MutableSeq
+>>> "LV" + Seq("MELKI")
+Seq('LVMELKI')
+>>> "LV" + MutableSeq("MELKI")
+MutableSeq('LVMELKI')
+Adding two sequence objects is handled via the __add__ method.
+"""
+if isinstance(other, str):
+return self.__class__(other.encode("ASCII") + self._data)
+else:
+return NotImplemented
+def __mul__(self, other):
+"""Multiply sequence by integer.
+>>> from Bio.Seq import Seq, MutableSeq
+>>> Seq('ATG') * 2
+Seq('ATGATG')
+>>> MutableSeq('ATG') * 2
+MutableSeq('ATGATG')
+"""
+if not isinstance(other, numbers.Integral):
+raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
+# we would like to simply write
+# data = self._data * other
+# here, but currently that causes a bug on PyPy if self._data is a
+# bytearray and other is a numpy integer. Using this workaround:
+data = self._data.__mul__(other)
+return self.__class__(data)
+def __rmul__(self, other):
+"""Multiply integer by sequence.
+>>> from Bio.Seq import Seq
+>>> 2 * Seq('ATG')
+Seq('ATGATG')
+"""
+if not isinstance(other, numbers.Integral):
+raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
+# we would like to simply write
+# data = self._data * other
+# here, but currently that causes a bug on PyPy if self._data is a
+# bytearray and other is a numpy integer. Using this workaround:
+data = self._data.__mul__(other)
+return self.__class__(data)
+def __imul__(self, other):
+"""Multiply the sequence object by other and assign.
+>>> from Bio.Seq import Seq
+>>> seq = Seq('ATG')
+>>> seq *= 2
+>>> seq
+Seq('ATGATG')
+Note that this is different from in-place multiplication. The ``seq``
+variable is reassigned to the multiplication result, but any variable
+pointing to ``seq`` will remain unchanged:
+>>> seq = Seq('ATG')
+>>> seq2 = seq
+>>> id(seq) == id(seq2)
+True
+>>> seq *= 2
+>>> seq
+Seq('ATGATG')
+>>> seq2
+Seq('ATG')
+>>> id(seq) == id(seq2)
+False
+"""
+if not isinstance(other, numbers.Integral):
+raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
+# we would like to simply write
+# data = self._data * other
+# here, but currently that causes a bug on PyPy if self._data is a
+# bytearray and other is a numpy integer. Using this workaround:
+data = self._data.__mul__(other)
+return self.__class__(data)
def count(self, sub, start=None, end=None):
    """Return a non-overlapping count, like that of a python string.

    Returns an integer, the number of non-overlapping occurrences of ``sub``
    in the (sub)sequence given by [start:end]. Optional arguments start and
    end are interpreted as in slice notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count("A"))
    5
    >>> print(my_seq.count("ATG"))
    1
    >>> print(my_seq.count(Seq("AT")))
    1
    >>> print(my_seq.count("AT", 2, -1))
    1

    HOWEVER, because the search is non-overlapping (as for Python strings),
    the answer may not be what you expect:

    >>> "AAAA".count("AA")
    2
    >>> print(Seq("AAAA").count("AA"))
    2

    For an overlapping search, use the ``count_overlap`` method:

    >>> print(Seq("AAAA").count_overlap("AA"))
    3

    Raises a TypeError if ``sub`` is not a Seq, MutableSeq, str, bytes, or
    bytearray object.
    """
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.count(needle, start, end)
def count_overlap(self, sub, start=None, end=None):
    """Return an overlapping count.

    Returns an integer, the number of (possibly overlapping) occurrences of
    ``sub`` in the (sub)sequence given by [start:end]. Optional arguments
    start and end are interpreted as in slice notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    e.g.

    >>> from Bio.Seq import Seq
    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    >>> print(Seq("ATATATATA").count_overlap("ATA"))
    4
    >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
    1

    For a non-overlapping search, use the ``count`` method:

    >>> print(Seq("AAAA").count("AA"))
    2

    Where substrings do not overlap, ``count_overlap`` behaves the same as
    the ``count`` method:

    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count_overlap("A"))
    5
    >>> my_seq.count_overlap("A") == my_seq.count("A")
    True
    >>> print(my_seq.count_overlap("ATG"))
    1
    >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
    True
    >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
    True

    HOWEVER, do not use this method for such cases because the
    count() method is much more efficient.

    Raises a TypeError if ``sub`` is not a Seq, MutableSeq, str, bytes, or
    bytearray object.
    """
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    haystack = self._data
    hits = 0
    # Restart each search one letter after the previous match so that
    # overlapping occurrences are counted too.
    position = haystack.find(needle, start, end)
    while position != -1:
        hits += 1
        position = haystack.find(needle, position + 1, end)
    return hits
+def __contains__(self, item):
+"""Return True if item is a subsequence of the sequence, and False otherwise.
+e.g.
+>>> from Bio.Seq import Seq, MutableSeq
+>>> my_dna = Seq("ATATGAAATTTGAAAA")
+>>> "AAA" in my_dna
+True
+>>> Seq("AAA") in my_dna
+True
+>>> MutableSeq("AAA") in my_dna
+True
+"""
+if isinstance(item, _SeqAbstractBaseClass):
+item = bytes(item)
+elif isinstance(item, str):
+item = item.encode("ASCII")
+return item in self._data
+def find(self, sub, start=None, end=None):
+"""Return the lowest index in the sequence where subsequence sub is found.
+With optional arguments start and end, return the lowest index in the
+sequence such that the subsequence sub is contained within the sequence
+region [start:end].
+Arguments:
+- sub - a string or another Seq or MutableSeq object to search for
+- start - optional integer, slice start
+- end - optional integer, slice end
+Returns -1 if the subsequence is NOT found.
+e.g. Locating the first typical start codon, AUG, in an RNA sequence:
+>>> from Bio.Seq import Seq
+>>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+>>> my_rna.find("AUG")
+3
+The next typical start codon can then be found by starting the search
+at position 4:
+>>> my_rna.find("AUG", 4)
+15
+See the ``search`` method to find the locations of multiple subsequences
+at the same time.
+"""
+if isinstance(sub, _SeqAbstractBaseClass):
+sub = bytes(sub)
+elif isinstance(sub, str):
+sub = sub.encode("ASCII")
+elif not isinstance(sub, (bytes, bytearray)):
+raise TypeError(
+"a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
+% type(sub)
+)
+return self._data.find(sub, start, end)
+def rfind(self, sub, start=None, end=None):
+"""Return the highest index in the sequence where subsequence sub is found.
+With optional arguments start and end, return the highest index in the
+sequence such that the subsequence sub is contained within the sequence
+region [start:end].
+Arguments:
+- sub - a string or another Seq or MutableSeq object to search for
+- start - optional integer, slice start
+- end - optional integer, slice end
+Returns -1 if the subsequence is NOT found.
+e.g. Locating the last typical start codon, AUG, in an RNA sequence:
+>>> from Bio.Seq import Seq
+>>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+>>> my_rna.rfind("AUG")
+15
+The location of the typical start codon before that can be found by
+ending the search at position 15:
+>>> my_rna.rfind("AUG", end=15)
+3
+See the ``search`` method to find the locations of multiple subsequences
+at the same time.
+"""
+if isinstance(sub, _SeqAbstractBaseClass):
+sub = bytes(sub)
+elif isinstance(sub, str):
+sub = sub.encode("ASCII")
+elif not isinstance(sub, (bytes, bytearray)):
+raise TypeError(
+"a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
+% type(sub)
+)
+return self._data.rfind(sub, start, end)
def index(self, sub, start=None, end=None):
    """Return the lowest index in the sequence where subsequence sub is found.

    With optional arguments start and end, return the lowest index in the
    sequence such that the subsequence sub is contained within the sequence
    region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.index("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.index("AUG", 4)
    15

    This method performs the same search as the ``find`` method.  However,
    if the subsequence is not found, ``find`` returns -1 while ``index``
    raises a ValueError:

    >>> my_rna.index("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.find("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.

    Raises a TypeError if ``sub`` is not a Seq, MutableSeq, str, bytes, or
    bytearray object.
    """
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.index(needle, start, end)
def rindex(self, sub, start=None, end=None):
    """Return the highest index in the sequence where subsequence sub is found.

    With optional arguments start and end, return the highest index in the
    sequence such that the subsequence sub is contained within the sequence
    region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rindex("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rindex("AUG", end=15)
    3

    This method performs the same search as the ``rfind`` method.  However,
    if the subsequence is not found, ``rfind`` returns -1 while ``rindex``
    raises a ValueError:

    >>> my_rna.rindex("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.rfind("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.

    Raises a TypeError if ``sub`` is not a Seq, MutableSeq, str, bytes, or
    bytearray object.
    """
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rindex(needle, start, end)
+def search(self, subs):
+"""Search the substrings subs in self and yield the index and substring found.
+Arguments:
+- subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
+objects containing the substrings to search for.
+>>> from Bio.Seq import Seq
+>>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
+>>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
+>>> for index, substring in matches:
+...     print(index, substring)
+...
+7 CC
+9 ATTG
+20 CC
+34 CC
+34 CCC
+35 CC
+"""
+subdict = collections.defaultdict(set)
+for index, sub in enumerate(subs):
+if isinstance(sub, (_SeqAbstractBaseClass, bytearray)):
+sub = bytes(sub)
+elif isinstance(sub, str):
+sub = sub.encode("ASCII")
+elif not isinstance(sub, bytes):
+raise TypeError(
+"subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
+% (index, type(sub))
+)
+length = len(sub)
+subdict[length].add(sub)
+for start in range(len(self) - 1):
+for length, subs in subdict.items():
+stop = start + length
+for sub in subs:
+if self._data[start:stop] == sub:
+yield (start, sub.decode())
+break
+def startswith(self, prefix, start=None, end=None):
+"""Return True if the sequence starts with the given prefix, False otherwise.
+Return True if the sequence starts with the specified prefix
+(a string or another Seq object), False otherwise.
+With optional start, test sequence beginning at that position.
+With optional end, stop comparing sequence at that position.
+prefix can also be a tuple of strings to try.  e.g.
+>>> from Bio.Seq import Seq
+>>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+>>> my_rna.startswith("GUC")
+True
+>>> my_rna.startswith("AUG")
+False
+>>> my_rna.startswith("AUG", 3)
+True
+>>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
+True
+"""
+if isinstance(prefix, tuple):
+prefix = tuple(
+bytes(p) if isinstance(p, _SeqAbstractBaseClass) else p.encode("ASCII")
+for p in prefix
+)
+elif isinstance(prefix, _SeqAbstractBaseClass):
+prefix = bytes(prefix)
+elif isinstance(prefix, str):
+prefix = prefix.encode("ASCII")
+return self._data.startswith(prefix, start, end)
+def endswith(self, suffix, start=None, end=None):
+"""Return True if the sequence ends with the given suffix, False otherwise.
+Return True if the sequence ends with the specified suffix
+(a string or another Seq object), False otherwise.
+With optional start, test sequence beginning at that position.
+With optional end, stop comparing sequence at that position.
+suffix can also be a tuple of strings to try.  e.g.
+>>> from Bio.Seq import Seq
+>>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+>>> my_rna.endswith("UUG")
+True
+>>> my_rna.endswith("AUG")
+False
+>>> my_rna.endswith("AUG", 0, 18)
+True
+>>> my_rna.endswith(("UCC", "UCA", "UUG"))
+True
+"""
+if isinstance(suffix, tuple):
+suffix = tuple(
+bytes(p) if isinstance(p, _SeqAbstractBaseClass) else p.encode("ASCII")
+for p in suffix
+)
+elif isinstance(suffix, _SeqAbstractBaseClass):
+suffix = bytes(suffix)
+elif isinstance(suffix, str):
+suffix = suffix.encode("ASCII")
+return self._data.endswith(suffix, start, end)
def split(self, sep=None, maxsplit=-1):
    """Return a list of subsequences when splitting the sequence by separator sep.

    The subsequences are returned as Seq objects. If maxsplit is given, at
    most maxsplit splits are done; if it is omitted, all splits are made.

    For consistency with the ``split`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_aa = Seq("VMAIVMGR*KGAR*L")
    >>> for pep in my_aa.split("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')

    See also the rsplit method, which splits the sequence starting from the
    end:

    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')
    """
    if isinstance(sep, _SeqAbstractBaseClass):
        sep = bytes(sep)
    elif isinstance(sep, str):
        sep = sep.encode("ASCII")
    pieces = self._data.split(sep, maxsplit)
    return list(map(Seq, pieces))
def rsplit(self, sep=None, maxsplit=-1):
    """Return a list of subsequences by splitting the sequence from the right.

    The pieces are returned as Seq objects, using sep (a string, a
    sequence object, or a bytes-like object) as the delimiter.  At most
    maxsplit splits are performed, counted from the right end; by default
    there is no limit.

    As with the ``rsplit`` method of Python strings, runs of whitespace
    (tabs, spaces, newlines) act as the separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.rsplit("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')

    See also the split method, which splits the sequence starting from the
    beginning:

    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')
    """
    if isinstance(sep, str):
        sep = sep.encode("ASCII")
    elif isinstance(sep, _SeqAbstractBaseClass):
        sep = bytes(sep)
    pieces = self._data.rsplit(sep, maxsplit)
    # Each piece is wrapped in an immutable Seq, whatever type self is.
    return list(map(Seq, pieces))
def strip(self, chars=None, inplace=False):
    """Return a sequence object with leading and trailing ends stripped.

    With default arguments, leading and trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.strip()
    Seq('ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters it contains are
    removed instead.  ``chars`` is treated as a set of letters, so their
    order does not matter:

    >>> Seq("ACGTACGT").strip("TGCA")
    Seq('')

    With ``inplace=False`` (the default) a stripped copy is returned and
    the original is left untouched.  With ``inplace=True`` the sequence
    itself is stripped and returned:

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.strip()
    MutableSeq('ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.strip(inplace=True)
    MutableSeq('ACGT')
    >>> seq
    MutableSeq('ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
    is called on a ``Seq`` object with ``inplace=True``.

    See also the lstrip and rstrip methods.
    """
    if isinstance(chars, str):
        chars = chars.encode("ASCII")
    elif isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    try:
        stripped = self._data.strip(chars)
    except TypeError:
        # Replace the low-level message with one naming the accepted types.
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
def lstrip(self, chars=None, inplace=False):
    """Return a sequence object with the leading end stripped.

    With default arguments, leading whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.lstrip()
    Seq('ACGT ')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters it contains are
    removed from the leading end instead.  ``chars`` is treated as a set of
    letters, so their order does not matter:

    >>> Seq("ACGACGTTACG").lstrip("GCA")
    Seq('TTACG')

    With ``inplace=False`` (the default) a stripped copy is returned and
    the original is left untouched.  With ``inplace=True`` the sequence
    itself is stripped and returned:

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.lstrip()
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.lstrip(inplace=True)
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq('ACGT ')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and rstrip methods.
    """
    if isinstance(chars, str):
        chars = chars.encode("ASCII")
    elif isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    try:
        stripped = self._data.lstrip(chars)
    except TypeError:
        # Replace the low-level message with one naming the accepted types.
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
def rstrip(self, chars=None, inplace=False):
    """Return a sequence object with the trailing end stripped.

    With default arguments, trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.rstrip()
    Seq(' ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters it contains are
    removed from the trailing end instead.  ``chars`` is treated as a set
    of letters, so their order does not matter:

    >>> Seq("ACGACGTTACG").rstrip("GCA")
    Seq('ACGACGTT')

    With ``inplace=False`` (the default) a stripped copy is returned and
    the original is left untouched.  With ``inplace=True`` the sequence
    itself is stripped and returned:

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.rstrip()
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.rstrip(inplace=True)
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``rstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and lstrip methods.
    """
    if isinstance(chars, str):
        chars = chars.encode("ASCII")
    elif isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    try:
        stripped = self._data.rstrip(chars)
    except TypeError:
        # Replace the low-level message with one naming the accepted types.
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
def removeprefix(self, prefix, inplace=False):
    """Return a new Seq object with prefix (left) removed.

    This behaves like the python string method of the same name: the
    prefix (a string, a sequence object, or a bytes-like object) is removed
    once if the sequence starts with it, otherwise the sequence is
    returned unchanged.

    e.g. Removing a start Codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("ATGGTGTGTGT")
    >>> my_seq
    Seq('ATGGTGTGTGT')
    >>> my_seq.removeprefix('ATG')
    Seq('GTGTGTGT')

    If ``inplace`` is ``True`` the sequence itself is modified and
    returned.  As ``Seq`` objects are immutable, a ``TypeError`` is raised
    if ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``.

    See also the removesuffix method.
    """
    if isinstance(prefix, str):
        prefix = prefix.encode("ASCII")
    elif isinstance(prefix, _SeqAbstractBaseClass):
        prefix = bytes(prefix)
    try:
        trimmed = self._data.removeprefix(prefix)
    except TypeError:
        raise TypeError(
            "argument must be a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    except AttributeError:
        # Fall back for pre-Python 3.9, where the underlying data has no
        # removeprefix method.
        trimmed = self._data
        if trimmed.startswith(prefix):
            trimmed = trimmed[len(prefix) :]
    if not inplace:
        return self.__class__(trimmed)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = trimmed
    return self
def removesuffix(self, suffix, inplace=False):
    """Return a new Seq object with suffix (right) removed.

    This behaves like the python string method of the same name: the
    suffix (a string, a sequence object, or a bytes-like object) is removed
    once if the sequence ends with it, otherwise the sequence is returned
    unchanged.  An empty suffix never removes anything.

    e.g. Removing a stop codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("GTGTGTGTTAG")
    >>> my_seq
    Seq('GTGTGTGTTAG')
    >>> stop_codon = Seq("TAG")
    >>> my_seq.removesuffix(stop_codon)
    Seq('GTGTGTGT')
    >>> my_seq.removesuffix("")
    Seq('GTGTGTGTTAG')

    If ``inplace`` is ``True`` the sequence itself is modified and
    returned.  As ``Seq`` objects are immutable, a ``TypeError`` is raised
    if ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``.

    See also the removeprefix method.
    """
    if isinstance(suffix, str):
        suffix = suffix.encode("ASCII")
    elif isinstance(suffix, _SeqAbstractBaseClass):
        suffix = bytes(suffix)
    try:
        data = self._data.removesuffix(suffix)
    except TypeError:
        raise TypeError(
            "argument must be a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    except AttributeError:
        # Fall back for pre-Python 3.9, where the underlying data has no
        # removesuffix method.
        data = self._data
        if data.endswith(suffix):
            # Slice with an explicit non-negative stop: ``data[:-len(suffix)]``
            # would be ``data[:0]`` (i.e. empty) for an empty suffix.
            data = data[: len(data) - len(suffix)]
    if inplace:
        # Only bytearray-backed (mutable) sequences can be modified in place.
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    return self.__class__(data)
def upper(self, inplace=False):
    """Return the sequence in upper case.

    With ``inplace=False`` (the default) an upper-case copy is returned
    and the original sequence is left as it was:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With ``inplace=True`` the sequence itself is converted and returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``upper`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``lower`` method.
    """
    converted = self._data.upper()
    if not inplace:
        return self.__class__(converted)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = converted
    return self
def lower(self, inplace=False):
    """Return the sequence in lower case.

    With ``inplace=False`` (the default) a lower-case copy is returned
    and the original sequence is left as it was:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With ``inplace=True`` the sequence itself is converted and returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lower`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``upper`` method.
    """
    converted = self._data.lower()
    if not inplace:
        return self.__class__(converted)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = converted
    return self
def isupper(self):
    """Return True if all ASCII characters in data are uppercase.

    The answer is delegated to the ``isupper`` method of the underlying
    sequence data.  If there are no cased characters, the method returns
    False.
    """
    contents = self._data
    return contents.isupper()
def islower(self):
    """Return True if all ASCII characters in data are lowercase.

    The answer is delegated to the ``islower`` method of the underlying
    sequence data.  If there are no cased characters, the method returns
    False.
    """
    contents = self._data
    return contents.islower()
def translate(
    self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
):
    """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.

    This method will translate DNA or RNA sequences. It should not
    be used on protein sequences as any result will be biologically
    meaningless.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable
       object (useful for non-standard genetic codes).  This
       defaults to the "Standard" table.
     - stop_symbol - Single character string, what to use for
       terminators.  This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons (translated as the
       specified stop_symbol).  If True, translation is terminated at
       the first in frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
       this checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to the minus sign.

    A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
    object; a ``MutableSeq`` object is returned if ``translate`` is called
    on a ``MutableSeq`` object.  If the sequence contents are undefined,
    an undefined ``Seq`` of one third the length is returned, and a
    ``BiopythonWarning`` is issued when the length is not a multiple of
    three.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*')
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@')
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR')

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*')
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR')

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR')

    It isn't a valid CDS under NCBI table 1, due to both the start codon
    and also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR')
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR')

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead
    """
    try:
        data = str(self)
    except UndefinedSequenceError:
        # translating an undefined sequence yields an undefined
        # sequence with the length divided by 3
        n = len(self)
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "This may become an error in future.",
                BiopythonWarning,
            )
        return Seq(None, n // 3)

    # Reuse the string obtained above instead of converting a second time.
    return self.__class__(
        _translate_str(data, table, stop_symbol, to_stop, cds, gap=gap)
    )
def complement(self, inplace=False):
    """Return the complement as a DNA sequence.

    >>> Seq("CGA").complement()
    Seq('GCT')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").complement()
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement()
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement(inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
def complement_rna(self, inplace=False):
    """Return the complement as an RNA sequence.

    >>> Seq("CGA").complement_rna()
    Seq('GCU')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence by default:

    >>> Seq("CGA").complement()
    Seq('GCT')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement_rna()
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement_rna(inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
def reverse_complement(self, inplace=False):
    """Return the reverse complement as a DNA sequence.

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").reverse_complement()
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement()
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement(inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Assigning through a reversed slice stores the complemented letters
    # back to front, which performs the reversal.
    self._data[::-1] = complemented
    return self
def reverse_complement_rna(self, inplace=False):
    """Return the reverse complement as an RNA sequence.

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").reverse_complement_rna()
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement_rna()
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement_rna(inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Assigning through a reversed slice stores the complemented letters
    # back to front, which performs the reversal.
    self._data[::-1] = complemented
    return self
def transcribe(self, inplace=False):
    """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. This
    means we can get the RNA sequence just by switching T to U (and t to
    u).

    >>> from Bio.Seq import Seq
    >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> coding_dna.transcribe()
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence.transcribe()
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    >>> sequence.transcribe(inplace=True)
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to transcribe an RNA sequence has no effect.
    If you have a nucleotide sequence which might be DNA or RNA
    (or even a mixture), calling the transcribe method will ensure
    any T becomes U.

    Trying to transcribe a protein sequence will replace any
    T for Threonine with U for Selenocysteine, which has no
    biologically plausible rationale.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRT")
    >>> my_protein.transcribe()
    Seq('MAIVMGRU')
    """
    # Upper and lower case letters are handled by separate passes.
    rna = self._data.replace(b"T", b"U")
    rna = rna.replace(b"t", b"u")
    if not inplace:
        return self.__class__(rna)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = rna
    return self
def back_transcribe(self, inplace=False):
    """Return the DNA sequence from an RNA sequence by creating a new Seq object.

    Every U is replaced by T (and every u by t).

    >>> from Bio.Seq import Seq
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence.back_transcribe()
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    >>> sequence.back_transcribe(inplace=True)
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``back_transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to back-transcribe DNA has no effect.  If you have a nucleotide
    sequence which might be DNA or RNA (or even a mixture), calling the
    back-transcribe method will ensure any U becomes T.

    Trying to back-transcribe a protein sequence will replace any U for
    Selenocysteine with T for Threonine, which is biologically meaningless.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRU")
    >>> my_protein.back_transcribe()
    Seq('MAIVMGRT')
    """
    # Upper and lower case letters are handled by separate passes.
    dna = self._data.replace(b"U", b"T")
    dna = dna.replace(b"u", b"t")
    if not inplace:
        return self.__class__(dna)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = dna
    return self
def join(self, other):
    """Return a merge of the sequences in other, spaced by the sequence from self.

    Accepts a Seq object, MutableSeq object, or string (and iterates over
    the letters), or an iterable containing Seq, MutableSeq, or string
    objects. These arguments will be concatenated with the calling sequence
    as the spacer:

    >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
    >>> concatenated
    Seq('AAANNNNNTTTNNNNNPPP')

    Joining the letters of a single sequence:

    >>> Seq('NNNNN').join(Seq("ACGT"))
    Seq('ANNNNNCNNNNNGNNNNNT')
    >>> Seq('NNNNN').join("ACGT")
    Seq('ANNNNNCNNNNNGNNNNNT')

    One-shot iterables such as generators are supported too:

    >>> Seq('-').join(Seq(s) for s in ("AA", "CC"))
    Seq('AA-CC')

    A ``TypeError`` is raised if ``other`` is a SeqRecord, or if it
    contains anything other than Seq objects, MutableSeq objects, or
    strings.
    """
    if isinstance(other, str):
        return self.__class__(str(self).join(other))
    if isinstance(other, _SeqAbstractBaseClass):
        return self.__class__(str(self).join(str(other)))

    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(other, SeqRecord):
        raise TypeError("Iterable cannot be a SeqRecord")

    # Validate and convert in a single pass.  Iterating ``other`` twice
    # (once to check, once to join) would silently produce an empty
    # result for iterators and generators, which are exhausted by the
    # first pass.
    parts = []
    for item in other:
        if isinstance(item, SeqRecord):
            raise TypeError("Iterable cannot contain SeqRecords")
        elif not isinstance(item, (str, _SeqAbstractBaseClass)):
            raise TypeError(
                "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
            )
        parts.append(str(item))
    return self.__class__(str(self).join(parts))
def replace(self, old, new, inplace=False):
    """Return a copy with all occurrences of subsequence old replaced by new.

    Both ``old`` and ``new`` may be strings, sequence objects, or
    bytes-like objects.

    >>> s = Seq("ACGTAACCGGTT")
    >>> t = s.replace("AC", "XYZ")
    >>> s
    Seq('ACGTAACCGGTT')
    >>> t
    Seq('XYZGTAXYZCGGTT')

    For mutable sequences, passing inplace=True will modify the sequence in place:

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ")
    >>> m
    MutableSeq('ACGTAACCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ", inplace=True)
    >>> m
    MutableSeq('XYZGTAXYZCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``replace`` is called on a ``Seq`` object with ``inplace=True``.
    """
    if isinstance(old, str):
        old = old.encode("ASCII")
    elif isinstance(old, _SeqAbstractBaseClass):
        old = bytes(old)

    if isinstance(new, str):
        new = new.encode("ASCII")
    elif isinstance(new, _SeqAbstractBaseClass):
        new = bytes(new)

    substituted = self._data.replace(old, new)
    if not inplace:
        return self.__class__(substituted)
    # Only bytearray-backed (mutable) sequences can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = substituted
    return self
@property
def defined(self):
    """Return True if the sequence is defined, False if undefined or partially defined.

    Sequences stored as bytes or bytearray are always defined; for any
    other kind of sequence data the answer is taken from the ``defined``
    attribute of that data object.

    Zero-length sequences are always considered to be defined.
    """
    contents = self._data
    return isinstance(contents, (bytes, bytearray)) or contents.defined
@property
def defined_ranges(self):
    """Return a tuple of the ranges where the sequence contents is defined.

    The return value has the format ((start1, end1), (start2, end2), ...).

    A sequence stored as bytes or bytearray is defined over its whole
    length, giving a single range, or an empty tuple if the sequence is
    empty.  For any other kind of sequence data the ranges are taken from
    the ``defined_ranges`` attribute of that data object.
    """
    contents = self._data
    if not isinstance(contents, (bytes, bytearray)):
        return contents.defined_ranges
    size = len(self)
    return ((0, size),) if size > 0 else ()
class Seq(_SeqAbstractBaseClass):
    """Read-only sequence object (essentially a string with biological methods).

    Like normal python strings, our basic sequence object is immutable.
    This prevents you from doing my_seq[5] = "A" for example, but does allow
    Seq objects to be used as dictionary keys.

    The Seq object provides a number of string like methods (such as count,
    find, split and strip).

    The Seq object also provides some biological methods, such as complement,
    reverse_complement, transcribe, back_transcribe and translate (which are
    not applicable to protein sequences).
    """

    _data: Union[bytes, SequenceDataAbstractBaseClass]

    def __init__(
        self,
        data: Union[
            str,
            bytes,
            bytearray,
            _SeqAbstractBaseClass,
            SequenceDataAbstractBaseClass,
            dict,
            None,
        ],
        length: Optional[int] = None,
    ):
        """Create a Seq object.

        Arguments:
         - data - Sequence, required (string)
         - length - Sequence length, used only if data is None or a dictionary (integer)

        You will typically use Bio.SeqIO to read in sequences from files as
        SeqRecord objects, whose sequence will be exposed as a Seq object via
        the seq property.

        However, you can also create a Seq object directly:

        >>> from Bio.Seq import Seq
        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
        >>> my_seq
        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
        >>> print(my_seq)
        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF

        To create a Seq object for a sequence of known length but
        unknown sequence contents, use None for the data argument and pass
        the sequence length for the length argument. Trying to access the
        sequence contents of a Seq object created in this way will raise
        an UndefinedSequenceError:

        >>> my_undefined_sequence = Seq(None, 20)
        >>> my_undefined_sequence
        Seq(None, length=20)
        >>> len(my_undefined_sequence)
        20
        >>> print(my_undefined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is undefined

        If the sequence contents is known for parts of the sequence only, use
        a dictionary for the data argument to pass the known sequence segments:

        >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
        >>> my_partially_defined_sequence
        Seq({3: 'ACGT'}, length=10)
        >>> len(my_partially_defined_sequence)
        10
        >>> print(my_partially_defined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
        >>> my_partially_defined_sequence[3:7]
        Seq('ACGT')
        >>> print(my_partially_defined_sequence[3:7])
        ACGT
        """
        # Undefined sequence: only the length is known.
        if data is None:
            if length is None:
                raise ValueError("length must not be None if data is None")
            if length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                self._data = _UndefinedSequenceData(length)
            return

        # Immutable storage that can be adopted as-is.
        if isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
            self._data = data
            return

        # Mutable buffers and other sequence objects are copied into bytes.
        if isinstance(data, (bytearray, _SeqAbstractBaseClass)):
            self._data = bytes(data)
            return

        if isinstance(data, str):
            self._data = bytes(data, encoding="ASCII")
            return

        if not isinstance(data, dict):
            raise TypeError(
                "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
            )

        # Partially defined sequence: a mapping of start position to segment.
        if length is None:
            raise ValueError("length must not be None if data is a dictionary")
        if length == 0:
            self._data = b""
            return
        if length < 0:
            raise ValueError("length must not be negative.")

        anchor = 0  # not needed here, but it keeps mypy happy
        end = -1
        segments: Dict[int, bytes] = {}
        for start in sorted(data.keys()):
            chunk = data[start]
            if isinstance(chunk, str):
                chunk = bytes(chunk, encoding="ASCII")
            else:
                try:
                    chunk = bytes(chunk)
                except Exception:
                    raise ValueError("Expected bytes-like objects or strings")
            if start < end:
                raise ValueError("Sequence data are overlapping.")
            elif start == end:
                # Directly adjacent to the previous segment: merge the two.
                segments[anchor] += chunk  # noqa: F821
            else:
                segments[start] = chunk
                anchor = start
            end = start + len(chunk)

        if end > length:
            raise ValueError("Provided sequence data extend beyond sequence length.")
        if end == length and anchor == 0:
            # sequence is fully defined
            self._data = segments[anchor]
        else:
            self._data = _PartiallyDefinedSequenceData(length, segments)

    def __hash__(self):
        """Hash of the sequence as a string for comparison.

        See Seq object comparison documentation (method ``__eq__`` in
        particular) as this has changed in Biopython 1.65. Older versions
        would hash on object identity.
        """
        contents = self._data
        return hash(contents)
+class MutableSeq(_SeqAbstractBaseClass):
+"""An editable sequence object.
+Unlike normal python strings and our basic sequence object (the Seq class)
+which are immutable, the MutableSeq lets you edit the sequence in place.
+However, this means you cannot use a MutableSeq object as a dictionary key.
+>>> from Bio.Seq import MutableSeq
+>>> my_seq = MutableSeq("ACTCGTCGTCG")
+>>> my_seq
+MutableSeq('ACTCGTCGTCG')
+>>> my_seq[5]
+'T'
+>>> my_seq[5] = "A"
+>>> my_seq
+MutableSeq('ACTCGACGTCG')
+>>> my_seq[5]
+'A'
+>>> my_seq[5:8] = "NNN"
+>>> my_seq
+MutableSeq('ACTCGNNNTCG')
+>>> len(my_seq)
+11
+Note that the MutableSeq object does not support as many string-like
+or biological methods as the Seq object.
+"""
def __init__(self, data):
    """Create a MutableSeq object.

    ``data`` may be a string, a bytes object, a bytearray, a Seq object, or
    another MutableSeq object.  A bytearray is used directly as the
    internal storage (it is not copied); every other input is copied into
    a new bytearray.  Any other type raises a ``TypeError``.
    """
    if isinstance(data, bytearray):
        # Adopt the caller's buffer without copying.
        self._data = data
        return
    if isinstance(data, bytes):
        self._data = bytearray(data)
        return
    if isinstance(data, str):
        self._data = bytearray(data, "ASCII")
        return
    if isinstance(data, MutableSeq):
        self._data = data._data[:]  # Take a copy
        return
    if isinstance(data, Seq):
        # Make no assumptions about the Seq subclass internal storage
        self._data = bytearray(bytes(data))
        return
    raise TypeError(
        "data should be a string, bytearray object, Seq object, or a "
        "MutableSeq object"
    )
def __setitem__(self, index, value):
    """Set a subsequence of single letter via value parameter.

    With an integer index a single letter is replaced; ``value`` must then
    be a single character.  With any other index (e.g. a slice) the
    selected region is replaced by ``value``, which must be a string, a
    Seq object, or a MutableSeq object, otherwise a ``TypeError`` is
    raised.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq[0] = 'T'
    >>> my_seq
    MutableSeq('TCTCGACGTCG')
    """
    if isinstance(index, numbers.Integral):
        # Replacing a single letter with a new string
        self._data[index] = ord(value)
        return

    # Replacing a sub-sequence
    if isinstance(value, MutableSeq):
        replacement = value._data
    elif isinstance(value, Seq):
        replacement = bytes(value)
    elif isinstance(value, str):
        replacement = value.encode("ASCII")
    else:
        raise TypeError(f"received unexpected type '{type(value).__name__}'")
    self._data[index] = replacement
+def __delitem__(self, index):
+"""Delete a subsequence of single letter.
+>>> my_seq = MutableSeq('ACTCGACGTCG')
+>>> del my_seq[0]
+>>> my_seq
+MutableSeq('CTCGACGTCG')
+"""
+# Could be deleting a single letter, or a slice
+del self._data[index]
+def append(self, c):
+"""Add a subsequence to the mutable sequence object.
+>>> my_seq = MutableSeq('ACTCGACGTCG')
+>>> my_seq.append('A')
+>>> my_seq
+MutableSeq('ACTCGACGTCGA')
+No return value.
+"""
+self._data.append(ord(c.encode("ASCII")))
+def insert(self, i, c):
+"""Add a subsequence to the mutable sequence object at a given index.
+>>> my_seq = MutableSeq('ACTCGACGTCG')
+>>> my_seq.insert(0,'A')
+>>> my_seq
+MutableSeq('AACTCGACGTCG')
+>>> my_seq.insert(8,'G')
+>>> my_seq
+MutableSeq('AACTCGACGGTCG')
+No return value.
+"""
+self._data.insert(i, ord(c.encode("ASCII")))
+def pop(self, i=(-1)):
+"""Remove a subsequence of a single letter at given index.
+>>> my_seq = MutableSeq('ACTCGACGTCG')
+>>> my_seq.pop()
+'G'
+>>> my_seq
+MutableSeq('ACTCGACGTC')
+>>> my_seq.pop()
+'C'
+>>> my_seq
+MutableSeq('ACTCGACGT')
+Returns the last character of the sequence.
+"""
+c = self._data[i]
+del self._data[i]
+return chr(c)
+def remove(self, item):
+"""Remove a subsequence of a single letter from mutable sequence.
+>>> my_seq = MutableSeq('ACTCGACGTCG')
+>>> my_seq.remove('C')
+>>> my_seq
+MutableSeq('ATCGACGTCG')
+>>> my_seq.remove('A')
+>>> my_seq
+MutableSeq('TCGACGTCG')
+No return value.
+"""
+codepoint = ord(item)
+try:
+self._data.remove(codepoint)
+except ValueError:
+raise ValueError("value not found in MutableSeq") from None
+def reverse(self):
+"""Modify the mutable sequence to reverse itself.
+No return value.
+"""
+self._data.reverse()
+def extend(self, other):
+"""Add a sequence to the original mutable sequence object.
+>>> my_seq = MutableSeq('ACTCGACGTCG')
+>>> my_seq.extend('A')
+>>> my_seq
+MutableSeq('ACTCGACGTCGA')
+>>> my_seq.extend('TTT')
+>>> my_seq
+MutableSeq('ACTCGACGTCGATTT')
+No return value.
+"""
+if isinstance(other, MutableSeq):
+self._data.extend(other._data)
+elif isinstance(other, Seq):
+self._data.extend(bytes(other))
+elif isinstance(other, str):
+self._data.extend(other.encode("ASCII"))
+else:
+raise TypeError("expected a string, Seq or MutableSeq")
class UndefinedSequenceError(ValueError):
    """Sequence contents is undefined.

    Raised in this module when the letters of a sequence (or of part of a
    sequence) are requested but only the sequence length is known. It is a
    subclass of ValueError, so code catching ValueError also catches it.
    """
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Sequence data of known length but entirely unknown contents (PRIVATE).

    Only the length is stored. Objects of this class can be used to create
    a Seq object representing a sequence whose length is known while its
    letters are not.

    ``len()`` returns the stored length. Slicing returns another object of
    this class (or an empty bytes object for a zero-size slice). Any other
    access to the contents raises an UndefinedSequenceError.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Initialize the object with the sequence length.

        The length is stored as given; the calling function is responsible
        for ensuring that it is greater than zero.
        """
        self._length = length
        super().__init__()

    def __getitem__(self, key: slice) -> Union[bytes, "_UndefinedSequenceData"]:
        # Only slices can be answered without knowing the letters.
        if not isinstance(key, slice):
            raise UndefinedSequenceError("Sequence content is undefined")
        first, last, stride = key.indices(self._length)
        size = len(range(first, last, stride))
        if size == 0:
            return b""
        return _UndefinedSequenceData(size)

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        total = len(self) + len(other)
        try:
            tail = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                return _UndefinedSequenceData(total)
            # Leave it to the other operand, e.g.
            # _PartiallyDefinedSequenceData.__radd__ will handle this
            return NotImplemented
        # The right-hand side is fully defined: the result is undefined up
        # to len(self) and defined from there on.
        return _PartiallyDefinedSequenceData(total, {len(self): tail})

    def __radd__(self, other):
        # The left-hand side must be convertible to bytes; it becomes the
        # defined segment at the start of the result.
        segments = {0: bytes(other)}
        total = len(other) + len(self)
        return _PartiallyDefinedSequenceData(total, segments)

    def upper(self):
        """Return an upper case copy of the sequence."""
        # Changing case cannot define anything: the result is an undefined
        # sequence of the same length.
        return _UndefinedSequenceData(self._length)

    def lower(self):
        """Return a lower case copy of the sequence."""
        # Changing case cannot define anything: the result is an undefined
        # sequence of the same length.
        return _UndefinedSequenceData(self._length)

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        For an undefined sequence this always raises UndefinedSequenceError.
        """
        # Character case cannot be determined for an undefined sequence
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        For an undefined sequence this always raises UndefinedSequenceError.
        """
        # Character case cannot be determined for an undefined sequence
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # If old and new have the same number of characters, the result is
        # an undefined sequence of unchanged length. Otherwise the length
        # of the result cannot be known.
        if len(old) == len(new):
            return _UndefinedSequenceData(self._length)
        raise UndefinedSequenceError("Sequence content is undefined")

    @property
    def defined(self):
        """Return False, as the sequence is not defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        Nothing is defined in an _UndefinedSequenceData object, so the
        return value is always an empty tuple.
        """
        return ()
class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass):
    """Sequence data of known length with only some regions defined (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but with a sequence contents that is only
    partially known.

    The defined regions are kept in ``_data``, a dictionary mapping the start
    position of each defined segment to the segment contents. The methods
    below iterate over that dictionary in insertion order and merge
    neighbouring segments on that basis, so the segments are presumably
    expected to be stored in order of increasing start position.

    Calling __len__ returns the sequence length, calling __getitem__ returns
    the sequence contents if known, otherwise an UndefinedSequenceError is
    raised.
    """

    __slots__ = ("_length", "_data")

    def __init__(self, length, data):
        """Initialize with the sequence length and defined sequence segments.

        Arguments:
         - length - total length of the sequence.
         - data - dictionary mapping segment start position to contents;
           stored as given (not copied).

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        self._data = data
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[bytes, SequenceDataAbstractBaseClass]:
        """Return the letter at an index, or the data selected by a slice.

        For a slice, returns ``b""`` if nothing is selected, a bytes object
        if everything selected is defined, an _UndefinedSequenceData object
        if nothing selected is defined, and a _PartiallyDefinedSequenceData
        object otherwise.

        For an integer index (negative values count from the end), returns
        the stored value at that position. Raises IndexError if the index is
        out of range and UndefinedSequenceError if the position lies outside
        all defined segments.
        """
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            data = {}
            for s, d in self._data.items():
                # Positions selected by the slice, expressed as offsets
                # relative to the start of this segment.
                indices = range(-s, -s + self._length)[key]
                e: Optional[int] = indices.stop
                assert e is not None
                if step > 0:
                    if e <= 0:
                        # The slice stops before this segment begins.
                        continue
                    if indices.start < 0:
                        # First selected offset that falls at or after the
                        # beginning of the segment.
                        s = indices.start % step
                    else:
                        s = indices.start
                else:  # step < 0
                    if e < 0:
                        # Run down to (and including) offset 0.
                        e = None
                    end = len(d) - 1
                    if indices.start > end:
                        # First selected offset that falls at or before the
                        # last letter of the segment.
                        s = end + (indices.start - end) % step
                    else:
                        s = indices.start
                    if s < 0:
                        # The slice starts before this segment begins.
                        continue
                # Position in the result of this segment's first letter.
                start = (s - indices.start) // step
                d = d[s:e:step]
                if d:
                    data[start] = d
            if len(data) == 0:  # Fully undefined sequence
                return _UndefinedSequenceData(size)
            # merge adjacent sequence segments
            end = -1
            previous = 0  # not needed here, but it keeps flake happy
            items = data.items()
            data = {}
            for start, seq in items:
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
            if len(data) == 1:
                seq = data.get(0)
                if seq is not None and len(seq) == size:
                    return seq  # Fully defined sequence; return bytes
            if step < 0:
                # use this after we drop Python 3.7:
                # data = {start: data[start] for start in reversed(data)}
                # use this as long as we support Python 3.7:
                data = {start: data[start] for start in reversed(list(data.keys()))}
            return _PartiallyDefinedSequenceData(size, data)
        # Integer index. Negative indices count from the end, as for any
        # Python sequence; without this normalisation a negative index could
        # never match a segment and was always reported as undefined.
        index = key
        if index < 0:
            index += self._length
        if index < 0 or self._length <= index:
            raise IndexError("sequence index out of range")
        for start, seq in self._data.items():
            if start <= index and index < start + len(seq):
                return seq[index - start]
        raise UndefinedSequenceError("Sequence at position %d is undefined" % key)

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def __add__(self, other):
        """Return the data of self followed by other."""
        length = len(self) + len(other)
        data = dict(self._data)
        items = list(self._data.items())
        # Last stored segment of self, and the position just after it.
        start, seq = items[-1]
        end = start + len(seq)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                # Nothing defined is added; only the length grows.
                pass
            elif isinstance(other, _PartiallyDefinedSequenceData):
                other_items = list(other._data.items())
                if end == len(self):
                    # self ends in a defined segment: if other begins with
                    # one too, join the two into a single segment.
                    other_start, other_seq = other_items.pop(0)
                    if other_start == 0:
                        data[start] += other_seq
                    else:
                        data[len(self) + other_start] = other_seq
                for other_start, other_seq in other_items:
                    data[len(self) + other_start] = other_seq
        else:
            # other is fully defined.
            if end == len(self):
                data[start] += other
            else:
                data[len(self)] = other
        return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        """Return the data of other followed by self."""
        length = len(other) + len(self)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            # other contributes no defined letters here; shift the segments
            # of self by its length.
            data = {len(other) + start: seq for start, seq in self._data.items()}
        else:
            data = {0: other}
            items = list(self._data.items())
            start, seq = items.pop(0)
            if start == 0:
                # self begins with a defined segment: join it to other.
                data[0] += seq
            else:
                data[len(other) + start] = seq
            for start, seq in items:
                data[len(other) + start] = seq
        return _PartiallyDefinedSequenceData(length, data)

    def __mul__(self, other):
        """Return the data repeated other times."""
        length = self._length
        items = self._data.items()
        data = {}
        end = -1
        previous = 0  # not needed here, but it keeps flake happy
        for i in range(other):
            for start, seq in items:
                start += i * length
                if end == start:
                    # This segment begins where the previous one ended
                    # (across a repeat boundary): merge the two.
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
        return _PartiallyDefinedSequenceData(length * other, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        data = {start: seq.upper() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def lower(self):
        """Return a lower case copy of the sequence."""
        data = {start: seq.lower() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        For a partially defined sequence this always raises
        UndefinedSequenceError.
        """
        # Character case cannot be determined while part of the sequence is
        # undefined
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        For a partially defined sequence this always raises
        UndefinedSequenceError.
        """
        # Character case cannot be determined while part of the sequence is
        # undefined
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.

        The table is applied to each defined segment separately; the start
        positions and the total length are kept unchanged.
        """
        items = self._data.items()
        data = {start: seq.translate(table, delete) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new.

        The replacement is applied to each defined segment separately.
        Raises UndefinedSequenceError if old and new differ in length.
        """
        # Replacing substring old by new in the undefined sequence segments
        # will result in an undefined sequence segment of the same length, if
        # old and new have the same number of characters. If not, an error is
        # raised, as the correct start positions cannot be calculated
        # reliably.
        if len(old) != len(new):
            raise UndefinedSequenceError(
                "Sequence content is only partially defined; substring \n"
                "replacement cannot be performed reliably"
            )
        items = self._data.items()
        data = {start: seq.replace(old, new) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    @property
    def defined(self):
        """Return False, as the sequence is not fully defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        return tuple((start, start + len(seq)) for start, seq in self._data.items())
+# The transcribe, back_transcribe, and translate functions are
+# user-friendly versions of the corresponding Seq/MutableSeq methods.
+# The functions work both on Seq objects, and on strings.
def transcribe(dna):
    """Transcribe a DNA sequence into RNA.

    The input is taken to be the coding strand of the DNA double helix
    (not the template strand), as is the usual convention. The RNA
    sequence is therefore obtained simply by replacing T with U.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> transcribe("ACTGN")
    'ACUGN'
    """
    if isinstance(dna, Seq):
        return dna.transcribe()
    if isinstance(dna, MutableSeq):
        # Convert first so that the result is a Seq object.
        as_seq = Seq(dna)
        return as_seq.transcribe()
    # Anything else is handled as a string; both cases are converted.
    upper_done = dna.replace("T", "U")
    return upper_done.replace("t", "u")
def back_transcribe(rna):
    """Return the RNA sequence back-transcribed into DNA.

    This is the reverse of transcription: every U is replaced with T.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> back_transcribe("ACUGN")
    'ACTGN'
    """
    if isinstance(rna, Seq):
        return rna.back_transcribe()
    if isinstance(rna, MutableSeq):
        # Convert first so that the result is a Seq object.
        as_seq = Seq(rna)
        return as_seq.back_transcribe()
    # Anything else is handled as a string; both cases are converted.
    upper_done = rna.replace("U", "T")
    return upper_done.replace("u", "t")
def _translate_str(
    sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
):
    """Translate nucleotide string into a protein string (PRIVATE).

    Arguments:
     - sequence - a string
     - table - Which codon table to use?  This can be either a name (string),
       an NCBI identifier (integer), or a CodonTable object (useful for
       non-standard genetic codes).
     - stop_symbol - a single character string, what to use for terminators.
     - to_stop - boolean, should translation terminate at the first
       in frame stop codon?  If there is no in-frame stop codon
       then translation continues to the end.
     - pos_stop - a single character string for a possible stop codon
       (e.g. TAN or NNN)
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.

    Returns a string.

    e.g.

    >>> from Bio.Data import CodonTable
    >>> table = CodonTable.ambiguous_dna_by_id[1]
    >>> _translate_str("AAA", table)
    'K'
    >>> _translate_str("TAR", table)
    '*'
    >>> _translate_str("TAN", table)
    'X'
    >>> _translate_str("TAN", table, pos_stop="@")
    '@'
    >>> _translate_str("TA?", table)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid

    A trailing partial codon (sequence length not a multiple of three) is
    ignored but triggers a BiopythonWarning, which is likely to become an
    exception in a future release. With cds=True it is an error.

    If **cds=True**, the start and stop codons are checked, and the start
    codon will be translated as methionine. The sequence must be a
    whole number of codons.

    >>> _translate_str("ATGCCCTAG", table, cds=True)
    'MP'
    >>> _translate_str("AAACCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
    >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found.
    """
    # --- Resolve the table argument into a codon table object ---
    try:
        table_id = int(table)
    except ValueError:
        # Not a number, so treat it as a table name.
        # The same table can be used for RNA or DNA
        try:
            codon_table = CodonTable.ambiguous_generic_by_name[table]
        except KeyError:
            if not isinstance(table, str):
                raise TypeError("table argument must be integer or string") from None
            raise ValueError(
                "The Bio.Seq translate methods and function DO NOT "
                "take a character string mapping table like the python "
                "string object's translate method. "
                "Use str(my_seq).translate(...) instead."
            ) from None
    except (AttributeError, TypeError):
        # int() cannot handle it at all; accept it only if it is already a
        # CodonTable object.
        if not isinstance(table, CodonTable.CodonTable):
            raise ValueError("Bad table argument") from None
        codon_table = table
    else:
        # A numerical table identifier.
        # The same table can be used for RNA or DNA
        codon_table = CodonTable.ambiguous_generic_by_id[table_id]

    sequence = sequence.upper()
    residues = []
    forward_table = codon_table.forward_table
    stop_codons = codon_table.stop_codons

    # --- Letters that may legitimately occur in a codon ---
    alphabet = codon_table.nucleotide_alphabet
    if alphabet is None:
        # Assume the worst case, ambiguous DNA or RNA:
        alphabet = (
            IUPACData.ambiguous_dna_letters.upper()
            + IUPACData.ambiguous_rna_letters.upper()
        )
    else:
        alphabet = alphabet.upper()
    valid_letters = set(alphabet)

    n = len(sequence)

    # --- Tables with 'ambiguous' (dual-coding) stop codons ---
    dual_coding = [c for c in stop_codons if c in forward_table]
    if dual_coding:
        c = dual_coding[0]  # used as the example in the messages below
        if to_stop:
            raise ValueError(
                "You cannot use 'to_stop=True' with this table as it contains"
                f" {len(dual_coding)} codon(s) which can be both STOP and an"
                f" amino acid (e.g. '{c}' -> '{forward_table[c]}' or STOP)."
            )
        warnings.warn(
            f"This table contains {len(dual_coding)} codon(s) which code(s) for"
            f" both STOP and an amino acid (e.g. '{c}' -> '{forward_table[c]}'"
            " or STOP). Such codons will be translated as amino acid.",
            BiopythonWarning,
        )

    # --- Complete CDS checks / partial codon warning ---
    if cds:
        if str(sequence[:3]).upper() not in codon_table.start_codons:
            raise CodonTable.TranslationError(
                f"First codon '{sequence[:3]}' is not a start codon"
            )
        if n % 3 != 0:
            raise CodonTable.TranslationError(
                f"Sequence length {n} is not a multiple of three"
            )
        if str(sequence[-3:]).upper() not in stop_codons:
            raise CodonTable.TranslationError(
                f"Final codon '{sequence[-3:]}' is not a stop codon"
            )
        # The start codon is translated by hand as M and the final stop
        # codon is dropped, so only the codons in between are looped over.
        sequence = sequence[3:-3]
        n -= 6
        residues = ["M"]
    elif n % 3 != 0:
        warnings.warn(
            "Partial codon, len(sequence) not a multiple of three. "
            "Explicitly trim the sequence or add trailing N before "
            "translation. This may become an error in future.",
            BiopythonWarning,
        )

    # --- Validate the gap character ---
    if gap is not None:
        if not isinstance(gap, str):
            raise TypeError("Gap character should be a single character string.")
        if len(gap) > 1:
            raise ValueError("Gap character should be a single character string.")

    # --- Translate codon by codon, ignoring any trailing partial codon ---
    for offset in range(0, n - n % 3, 3):
        codon = sequence[offset : offset + 3]
        try:
            residue = forward_table[codon]
        except (KeyError, CodonTable.TranslationError):
            # Not a plain amino acid codon; work out what else it could be.
            if codon in codon_table.stop_codons:
                if cds:
                    raise CodonTable.TranslationError(
                        f"Extra in frame stop codon '{codon}' found."
                    ) from None
                if to_stop:
                    break
                residue = stop_symbol
            elif valid_letters.issuperset(set(codon)):
                # Possible stop codon (e.g. NNN or TAN)
                residue = pos_stop
            elif gap is not None and codon == gap * 3:
                # Gapped translation
                residue = gap
            else:
                raise CodonTable.TranslationError(
                    f"Codon '{codon}' is invalid"
                ) from None
        residues.append(residue)
    return "".join(residues)
def translate(
    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object. Given a Seq or
    MutableSeq, returns a Seq object.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable object
       (useful for non-standard genetic codes).  Defaults to the "Standard"
       table.
     - stop_symbol - Single character string, what to use for any
       terminators, defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons
       (translated as the specified stop_symbol).  If
       True, translation is terminated at the first in
       frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None. For a string, None means no gap character is
       recognised. For a Seq or MutableSeq, None means the gap argument is
       not passed on, so the ``translate`` method of the Seq object uses
       its own default; any other value is passed on to that method.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    It will however translate either DNA or RNA.

    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
    stop codons'. These are stop codons with unambiguous sequence but which
    have a context dependent coding as STOP or as amino acid. With these tables
    'to_stop' must be False (otherwise a ValueError is raised). The dual
    coding codons will always be translated as amino acid, except for
    'cds=True', where the last codon will be translated as STOP.

    >>> coding_dna3 = "ATGGCACGGAAGTGA"
    >>> translate(coding_dna3)
    'MARK*'

    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
    'MARKW'

    It will however raise a BiopythonWarning (not shown).

    >>> translate(coding_dna3, table=27, cds=True)
    'MARK'

    >>> translate(coding_dna3, table=27, to_stop=True)
    Traceback (most recent call last):
       ...
    ValueError: You cannot use 'to_stop=True' with this table ...
    """
    if isinstance(sequence, Seq):
        seq_obj = sequence
    elif isinstance(sequence, MutableSeq):
        # Convert so that a Seq object is returned
        seq_obj = Seq(sequence)
    else:
        # Assume it's a string, return a string
        return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)
    if gap is None:
        # No gap requested: leave the method's own default untouched, which
        # is what this function did before gap was forwarded.
        return seq_obj.translate(table, stop_symbol, to_stop, cds)
    # An explicit gap character used to be dropped silently for Seq and
    # MutableSeq input; forward it so it is honoured as for strings.
    # NOTE(review): relies on the Seq ``translate`` method accepting a
    # ``gap`` keyword - confirm against the class definition.
    return seq_obj.translate(table, stop_symbol, to_stop, cds, gap=gap)
def reverse_complement(sequence, inplace=False):
    """Return the reverse complement as a DNA sequence.

    The type of the result follows the type of the input:

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> reverse_complement(my_seq)
    'TCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement(my_seq)
    Seq('TCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> reverse_complement(Seq("CGAUT"))
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement("ACGTUacgtuXYZxyz")
    'zrxZRXaacgtAACGT'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq, inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    # Sequence objects provide their own method, which also deals with the
    # inplace request.
    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.reverse_complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.reverse_complement()
    # Anything else is assumed to be a string.
    if inplace:
        raise TypeError("strings are immutable")
    complemented = sequence.encode("ASCII").translate(_dna_complement_table)
    text = complemented.decode("ASCII")
    return text[::-1]
def reverse_complement_rna(sequence, inplace=False):
    """Return the reverse complement as an RNA sequence.

    The type of the result follows the type of the input:

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> reverse_complement_rna(my_seq)
    'UCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement_rna(my_seq)
    Seq('UCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> reverse_complement(Seq("CGAUT"), inplace=False)
    Seq('AATCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement_rna("ACGTUacgtuXYZxyz")
    'zrxZRXaacguAACGU'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq, inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    # Sequence objects provide their own method, which also deals with the
    # inplace request.
    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.reverse_complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.reverse_complement_rna()
    # Anything else is assumed to be a string.
    if inplace:
        raise TypeError("strings are immutable")
    complemented = sequence.encode("ASCII").translate(_rna_complement_table)
    text = complemented.decode("ASCII")
    return text[::-1]
def complement(sequence, inplace=False):
    """Return the complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement(my_seq)
    'GCT'
    >>> my_seq = Seq("CGA")
    >>> complement(my_seq)
    Seq('GCT')
    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement("ACGTUacgtuXYZxyz")
    'TGCAAtgcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq, inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement`` is called on a ``Seq`` object with
    ``inplace=True``. For strings and SeqRecord objects, any true value of
    ``inplace`` raises a ``TypeError``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement()
    # Assume it's a string.
    # Test truthiness (not identity with True) so that a true value such as
    # 1 is rejected here exactly as in the SeqRecord branch above and in the
    # sibling complement/reverse complement functions.
    if inplace:
        raise TypeError("strings are immutable")
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_dna_complement_table)
    return sequence.decode("ASCII")
def complement_rna(sequence, inplace=False):
    """Return the complement as an RNA sequence.

    The type of the result follows the type of the input:

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement_rna(my_seq)
    'GCU'
    >>> my_seq = Seq("CGA")
    >>> complement_rna(my_seq)
    Seq('GCU')
    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement_rna("ACGTUacgtuXYZxyz")
    'UGCAAugcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq, inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    # Sequence objects provide their own method, which also deals with the
    # inplace request.
    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement_rna()
    # Anything else is assumed to be a string.
    if inplace:
        raise TypeError("strings are immutable")
    complemented = sequence.encode("ASCII").translate(_rna_complement_table)
    return complemented.decode("ASCII")
def _test():
    """Run the Bio.Seq module's doctests (PRIVATE).

    Prints a short message before and after running the doctests.
    """
    import doctest

    print("Running doctests...")
    # Only the exception type is compared in doctest tracebacks, not the
    # detail (message and module path).
    flags = doctest.IGNORE_EXCEPTION_DETAIL
    doctest.testmod(optionflags=flags)
    print("Done")
# Run the module's doctests when this file is executed as a script rather
# than imported.
if __name__ == "__main__":
    _test()

Mercurial > repos > rliterman > csp2

comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py @ 69:33d812a61356