csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children

rev	line source
jpayne@68	1 # Copyright 2000 Andrew Dalke.
jpayne@68	2 # Copyright 2000-2002 Brad Chapman.
jpayne@68	3 # Copyright 2004-2005, 2010 by M de Hoon.
jpayne@68	4 # Copyright 2007-2023 by Peter Cock.
jpayne@68	5 # All rights reserved.
jpayne@68	6 #
jpayne@68	7 # This file is part of the Biopython distribution and governed by your
jpayne@68	8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@68	9 # Please see the LICENSE file that should have been included as part of this
jpayne@68	10 # package.
jpayne@68	11 """Provide objects to represent biological sequences.
jpayne@68	12
jpayne@68	13 See also the Seq_ wiki and the chapter in our tutorial:
jpayne@68	14 - `HTML Tutorial`_
jpayne@68	15 - `PDF Tutorial`_
jpayne@68	16
jpayne@68	17 .. _Seq: http://biopython.org/wiki/Seq
jpayne@68	18 .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
jpayne@68	19 .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
jpayne@68	20
jpayne@68	21 """
jpayne@68	22 import array
jpayne@68	23 import collections
jpayne@68	24 import numbers
jpayne@68	25 import warnings
jpayne@68	26
jpayne@68	27 from abc import ABC
jpayne@68	28 from abc import abstractmethod
jpayne@68	29 from typing import overload, Optional, Union, Dict
jpayne@68	30
jpayne@68	31 from Bio import BiopythonWarning
jpayne@68	32 from Bio.Data import CodonTable
jpayne@68	33 from Bio.Data import IUPACData
jpayne@68	34
jpayne@68	35
jpayne@68	36 def _maketrans(complement_mapping):
jpayne@68	37 """Make a python string translation table (PRIVATE).
jpayne@68	38
jpayne@68	39 Arguments:
jpayne@68	40 - complement_mapping - a dictionary such as ambiguous_dna_complement
jpayne@68	41 and ambiguous_rna_complement from Data.IUPACData.
jpayne@68	42
jpayne@68	43 Returns a translation table (a bytes object of length 256) for use with
jpayne@68	44 the python string's translate method to use in a (reverse) complement.
jpayne@68	45
jpayne@68	46 Compatible with lower case and upper case sequences.
jpayne@68	47
jpayne@68	48 For internal use only.
jpayne@68	49 """
jpayne@68	50 keys = "".join(complement_mapping.keys()).encode("ASCII")
jpayne@68	51 values = "".join(complement_mapping.values()).encode("ASCII")
jpayne@68	52 return bytes.maketrans(keys + keys.lower(), values + values.lower())
jpayne@68	53
jpayne@68	54
# Module-level translation tables used for (reverse) complementing.
# The DNA table additionally maps U like T, and the RNA table additionally
# maps T like U, so each table accepts both letters. The extended mappings
# are built inline so that no temporary name is left in the module namespace.
_dna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_dna_complement,
        "U": IUPACData.ambiguous_dna_complement["T"],
    }
)
_rna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_rna_complement,
        "T": IUPACData.ambiguous_rna_complement["U"],
    }
)
jpayne@68	63
jpayne@68	64
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

    * ``__len__`` must return the sequence length;
    * ``__getitem__`` must return

      * a ``bytes`` object for the requested region; or
      * a new instance of the subclass for the requested region; or
      * raise an ``UndefinedSequenceError``.

      Calling ``__getitem__`` for a sequence region of size zero should always
      return an empty ``bytes`` object.
      Calling ``__getitem__`` for the full sequence (as in data[:]) should
      either return a ``bytes`` object with the full sequence, or raise an
      ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    Every other method defined here is implemented by materialising the full
    sequence with ``bytes(self)`` and delegating to the ``bytes`` method of the
    same name.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        # Sanity check of the subclass implementation: an empty slice must
        # compare equal to an empty bytes object.
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def __getitem__(self, key):
        pass

    def __bytes__(self):
        return self[:]

    def __hash__(self):
        return hash(bytes(self))

    def __eq__(self, other):
        return bytes(self) == other

    def __lt__(self, other):
        return bytes(self) < other

    def __le__(self, other):
        return bytes(self) <= other

    def __gt__(self, other):
        return bytes(self) > other

    def __ge__(self, other):
        return bytes(self) >= other

    def __add__(self, other):
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            return NotImplemented
            # will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__

    def __radd__(self, other):
        return other + bytes(self)

    def __mul__(self, other):
        return other * bytes(self)

    def __contains__(self, item):
        return item in bytes(self)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end]. Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end]. Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end]. Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end]. Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present.

        Returns a ``bytes`` object; the data are returned unchanged if they
        do not start with prefix.
        """
        # Want to do just this, but need Python 3.9+
        # return bytes(self).removeprefix(prefix)
        data = bytes(self)
        try:
            return data.removeprefix(prefix)
        except AttributeError:
            # Python < 3.9: bytes has no removeprefix method.
            if data.startswith(prefix):
                return data[len(prefix) :]
            return data

    def removesuffix(self, suffix):
        """Remove the suffix if present.

        Returns a ``bytes`` object; the data are returned unchanged if they
        do not end with suffix.
        """
        # Want to do just this, but need Python 3.9+
        # return bytes(self).removesuffix(suffix)
        data = bytes(self)
        try:
            return data.removesuffix(suffix)
        except AttributeError:
            # Python < 3.9: bytes has no removesuffix method.
            # The suffix must be matched at the END of the data. An empty
            # suffix is skipped explicitly, as data[:-0] would be empty
            # instead of the unchanged data.
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

        table
          Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        return ()
jpayne@68	360
jpayne@68	361
jpayne@68	362 class _SeqAbstractBaseClass(ABC):
jpayne@68	363 """Abstract base class for the Seq and MutableSeq classes (PRIVATE).
jpayne@68	364
jpayne@68	365 Most users will not need to use this class. It is used internally as an
jpayne@68	366 abstract base class for Seq and MutableSeq, as most of their methods are
jpayne@68	367 identical.
jpayne@68	368 """
jpayne@68	369
jpayne@68	370 __slots__ = ("_data",)
jpayne@68	371 __array_ufunc__ = None # turn off numpy Ufuncs
jpayne@68	372
jpayne@68	373 @abstractmethod
jpayne@68	374 def __init__(self):
jpayne@68	375 pass
jpayne@68	376
jpayne@68	377 def __bytes__(self):
jpayne@68	378 return bytes(self._data)
jpayne@68	379
def __repr__(self):
    """Return a (truncated) representation of the sequence.

    Sequence contents longer than 60 letters are shown as the first 54
    letters, an ellipsis, and the last three letters.
    """

    def _abbreviate(raw):
        # Shows the last three letters as it is often useful to see if
        # there is a stop codon at the end of a sequence.
        # Note total length is 54+3+3=60
        if len(raw) > 60:
            head = raw[:54].decode("ASCII")
            tail = raw[-3:].decode("ASCII")
            return f"{head}...{tail}"
        return raw.decode("ASCII")

    data = self._data
    if isinstance(data, _UndefinedSequenceData):
        return f"Seq(None, length={len(self)})"
    if isinstance(data, _PartiallyDefinedSequenceData):
        # Show each defined fragment, keyed by its start position.
        fragments = {
            position: _abbreviate(seq) for position, seq in data._data.items()
        }
        return "Seq(%r, length=%d)" % (fragments, len(self))
    return f"{self.__class__.__name__}('{_abbreviate(data)}')"
jpayne@68	406
jpayne@68	407 def __str__(self):
jpayne@68	408 """Return the full sequence as a python string."""
jpayne@68	409 return self._data.decode("ASCII")
jpayne@68	410
def __eq__(self, other):
    """Test the sequence for equality with another sequence or a string.

    Two sequences are equal if their sequence contents is identical:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> seq1 = Seq("ACGT")
    >>> seq2 = Seq("ACGT")
    >>> mutable_seq = MutableSeq("ACGT")
    >>> seq1 == seq2
    True
    >>> seq1 == mutable_seq
    True
    >>> seq1 == "ACGT"
    True

    Equal sequences are nevertheless distinct objects:

    >>> id(seq1) == id(seq2)
    False
    >>> seq1 is seq2
    False

    Strings, ``bytes``, and ``bytearray`` objects can be compared to a
    sequence as well:

    >>> seq1 == "ACGT"
    True
    >>> seq1 == b"ACGT"
    True
    >>> seq1 == bytearray(b"ACGT")
    True
    """
    if isinstance(other, _SeqAbstractBaseClass):
        rhs = other._data
    elif isinstance(other, str):
        rhs = other.encode("ASCII")
    else:
        rhs = other
    return self._data == rhs
jpayne@68	452
def __lt__(self, other):
    """Implement the less-than operator."""
    if isinstance(other, _SeqAbstractBaseClass):
        rhs = other._data
    elif isinstance(other, str):
        rhs = other.encode("ASCII")
    else:
        rhs = other
    return self._data < rhs
jpayne@68	461
def __le__(self, other):
    """Implement the less-than-or-equal operator."""
    if isinstance(other, _SeqAbstractBaseClass):
        rhs = other._data
    elif isinstance(other, str):
        rhs = other.encode("ASCII")
    else:
        rhs = other
    return self._data <= rhs
jpayne@68	470
def __gt__(self, other):
    """Implement the greater-than operator."""
    if isinstance(other, _SeqAbstractBaseClass):
        rhs = other._data
    elif isinstance(other, str):
        rhs = other.encode("ASCII")
    else:
        rhs = other
    return self._data > rhs
jpayne@68	479
def __ge__(self, other):
    """Implement the greater-than-or-equal operator."""
    if isinstance(other, _SeqAbstractBaseClass):
        rhs = other._data
    elif isinstance(other, str):
        rhs = other.encode("ASCII")
    else:
        rhs = other
    return self._data >= rhs
jpayne@68	488
jpayne@68	489 def __len__(self):
jpayne@68	490 """Return the length of the sequence."""
jpayne@68	491 return len(self._data)
jpayne@68	492
jpayne@68	493 def __iter__(self):
jpayne@68	494 """Return an iterable of the sequence."""
jpayne@68	495 return self._data.decode("ASCII").__iter__()
jpayne@68	496
jpayne@68	497 @overload
jpayne@68	498 def __getitem__(self, index: int) -> str:
jpayne@68	499 ...
jpayne@68	500
jpayne@68	501 @overload
jpayne@68	502 def __getitem__(self, index: slice) -> "Seq":
jpayne@68	503 ...
jpayne@68	504
jpayne@68	505 def __getitem__(self, index):
jpayne@68	506 """Return a subsequence as a single letter or as a sequence object.
jpayne@68	507
jpayne@68	508 If the index is an integer, a single letter is returned as a Python
jpayne@68	509 string:
jpayne@68	510
jpayne@68	511 >>> seq = Seq('ACTCGACGTCG')
jpayne@68	512 >>> seq[5]
jpayne@68	513 'A'
jpayne@68	514
jpayne@68	515 Otherwise, a new sequence object of the same class is returned:
jpayne@68	516
jpayne@68	517 >>> seq[5:8]
jpayne@68	518 Seq('ACG')
jpayne@68	519 >>> mutable_seq = MutableSeq('ACTCGACGTCG')
jpayne@68	520 >>> mutable_seq[5:8]
jpayne@68	521 MutableSeq('ACG')
jpayne@68	522 """
jpayne@68	523 if isinstance(index, numbers.Integral):
jpayne@68	524 # Return a single letter as a string
jpayne@68	525 return chr(self._data[index])
jpayne@68	526 else:
jpayne@68	527 # Return the (sub)sequence as another Seq/MutableSeq object
jpayne@68	528 return self.__class__(self._data[index])
jpayne@68	529
def __add__(self, other):
    """Concatenate a sequence or string to the end of this sequence.

    >>> from Bio.Seq import Seq, MutableSeq
    >>> Seq("MELKI") + "LV"
    Seq('MELKILV')
    >>> MutableSeq("MELKI") + "LV"
    MutableSeq('MELKILV')
    """
    if isinstance(other, _SeqAbstractBaseClass):
        tail = other._data
    elif isinstance(other, str):
        tail = other.encode("ASCII")
    else:
        # If other is a SeqRecord, then SeqRecord's __radd__ will handle
        # this. If not, returning NotImplemented will trigger a TypeError.
        return NotImplemented
    return self.__class__(self._data + tail)
jpayne@68	547
jpayne@68	548 def __radd__(self, other):
jpayne@68	549 """Add a sequence string on the left.
jpayne@68	550
jpayne@68	551 >>> from Bio.Seq import Seq, MutableSeq
jpayne@68	552 >>> "LV" + Seq("MELKI")
jpayne@68	553 Seq('LVMELKI')
jpayne@68	554 >>> "LV" + MutableSeq("MELKI")
jpayne@68	555 MutableSeq('LVMELKI')
jpayne@68	556
jpayne@68	557 Adding two sequence objects is handled via the __add__ method.
jpayne@68	558 """
jpayne@68	559 if isinstance(other, str):
jpayne@68	560 return self.__class__(other.encode("ASCII") + self._data)
jpayne@68	561 else:
jpayne@68	562 return NotImplemented
jpayne@68	563
jpayne@68	564 def __mul__(self, other):
jpayne@68	565 """Multiply sequence by integer.
jpayne@68	566
jpayne@68	567 >>> from Bio.Seq import Seq, MutableSeq
jpayne@68	568 >>> Seq('ATG') * 2
jpayne@68	569 Seq('ATGATG')
jpayne@68	570 >>> MutableSeq('ATG') * 2
jpayne@68	571 MutableSeq('ATGATG')
jpayne@68	572 """
jpayne@68	573 if not isinstance(other, numbers.Integral):
jpayne@68	574 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@68	575 # we would like to simply write
jpayne@68	576 # data = self._data * other
jpayne@68	577 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@68	578 # bytearray and other is a numpy integer. Using this workaround:
jpayne@68	579 data = self._data.__mul__(other)
jpayne@68	580 return self.__class__(data)
jpayne@68	581
jpayne@68	582 def __rmul__(self, other):
jpayne@68	583 """Multiply integer by sequence.
jpayne@68	584
jpayne@68	585 >>> from Bio.Seq import Seq
jpayne@68	586 >>> 2 * Seq('ATG')
jpayne@68	587 Seq('ATGATG')
jpayne@68	588 """
jpayne@68	589 if not isinstance(other, numbers.Integral):
jpayne@68	590 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@68	591 # we would like to simply write
jpayne@68	592 # data = self._data * other
jpayne@68	593 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@68	594 # bytearray and other is a numpy integer. Using this workaround:
jpayne@68	595 data = self._data.__mul__(other)
jpayne@68	596 return self.__class__(data)
jpayne@68	597
jpayne@68	598 def __imul__(self, other):
jpayne@68	599 """Multiply the sequence object by other and assign.
jpayne@68	600
jpayne@68	601 >>> from Bio.Seq import Seq
jpayne@68	602 >>> seq = Seq('ATG')
jpayne@68	603 >>> seq *= 2
jpayne@68	604 >>> seq
jpayne@68	605 Seq('ATGATG')
jpayne@68	606
jpayne@68	607 Note that this is different from in-place multiplication. The ``seq``
jpayne@68	608 variable is reassigned to the multiplication result, but any variable
jpayne@68	609 pointing to ``seq`` will remain unchanged:
jpayne@68	610
jpayne@68	611 >>> seq = Seq('ATG')
jpayne@68	612 >>> seq2 = seq
jpayne@68	613 >>> id(seq) == id(seq2)
jpayne@68	614 True
jpayne@68	615 >>> seq *= 2
jpayne@68	616 >>> seq
jpayne@68	617 Seq('ATGATG')
jpayne@68	618 >>> seq2
jpayne@68	619 Seq('ATG')
jpayne@68	620 >>> id(seq) == id(seq2)
jpayne@68	621 False
jpayne@68	622 """
jpayne@68	623 if not isinstance(other, numbers.Integral):
jpayne@68	624 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@68	625 # we would like to simply write
jpayne@68	626 # data = self._data * other
jpayne@68	627 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@68	628 # bytearray and other is a numpy integer. Using this workaround:
jpayne@68	629 data = self._data.__mul__(other)
jpayne@68	630 return self.__class__(data)
jpayne@68	631
def count(self, sub, start=None, end=None):
    """Return a non-overlapping count, like that of a python string.

    Returns an integer, the number of occurrences of substring argument
    sub in the (sub)sequence given by [start:end]. Optional arguments
    start and end are interpreted as in slice notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count("A"))
    5
    >>> print(my_seq.count("ATG"))
    1
    >>> print(my_seq.count(Seq("AT")))
    1
    >>> print(my_seq.count("AT", 2, -1))
    1

    HOWEVER, please note that because the ``count`` method of Seq and
    MutableSeq objects, like that of Python strings, does a non-overlapping
    search, this may not give the answer you expect:

    >>> "AAAA".count("AA")
    2
    >>> print(Seq("AAAA").count("AA"))
    2

    For an overlapping search, use the ``count_overlap`` method:

    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    """
    # Reduce sub to a bytes-like pattern before searching.
    if isinstance(sub, MutableSeq):
        pattern = sub._data
    elif isinstance(sub, Seq):
        pattern = bytes(sub)
    elif isinstance(sub, str):
        pattern = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pattern = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.count(pattern, start, end)
jpayne@68	684
def count_overlap(self, sub, start=None, end=None):
    """Return an overlapping count.

    Returns an integer, the number of occurrences of substring
    argument sub in the (sub)sequence given by [start:end].
    Optional arguments start and end are interpreted as in slice
    notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    e.g.

    >>> from Bio.Seq import Seq
    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    >>> print(Seq("ATATATATA").count_overlap("ATA"))
    4
    >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
    1

    For a non-overlapping search, use the ``count`` method:

    >>> print(Seq("AAAA").count("AA"))
    2

    Where substrings do not overlap, ``count_overlap`` behaves the same as
    the ``count`` method:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count_overlap("A"))
    5
    >>> my_seq.count_overlap("A") == my_seq.count("A")
    True
    >>> print(my_seq.count_overlap("ATG"))
    1
    >>> my_seq.count_overlap("ATG") == my_seq.count("ATG")
    True
    >>> print(my_seq.count_overlap(Seq("AT")))
    1
    >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
    True
    >>> print(my_seq.count_overlap("AT", 2, -1))
    1
    >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
    True

    HOWEVER, do not use this method for such cases because the
    count() method is much more efficient.
    """
    # Reduce sub to a bytes-like pattern before searching.
    if isinstance(sub, MutableSeq):
        pattern = sub._data
    elif isinstance(sub, Seq):
        pattern = bytes(sub)
    elif isinstance(sub, str):
        pattern = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pattern = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    haystack = self._data
    total = 0
    # Restart each search one position past the previous hit, so that
    # overlapping occurrences are found as well.
    position = haystack.find(pattern, start, end)
    while position != -1:
        total += 1
        position = haystack.find(pattern, position + 1, end)
    return total
jpayne@68	757
def __contains__(self, item):
    """Return True if item is a subsequence of the sequence, and False otherwise.

    e.g.

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_dna = Seq("ATATGAAATTTGAAAA")
    >>> "AAA" in my_dna
    True
    >>> Seq("AAA") in my_dna
    True
    >>> MutableSeq("AAA") in my_dna
    True
    """
    if isinstance(item, _SeqAbstractBaseClass):
        needle = bytes(item)
    elif isinstance(item, str):
        needle = item.encode("ASCII")
    else:
        # Anything else is handed to the underlying data unchanged
        needle = item
    return needle in self._data
jpayne@68	777
def find(self, sub, start=None, end=None):
    """Return the lowest index at which subsequence sub occurs.

    The optional arguments start and end restrict the search to the
    sequence region [start:end], interpreted as in slice notation.

    Arguments:
     - sub - the subsequence to search for: a string, bytes, bytearray,
       Seq or MutableSeq object
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found. A TypeError is raised if
    sub is of any other type.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.find("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.find("AUG", 4)
    15

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, _SeqAbstractBaseClass):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.find(needle, start, end)
jpayne@68	818
def rfind(self, sub, start=None, end=None):
    """Return the highest index at which subsequence sub occurs.

    The optional arguments start and end restrict the search to the
    sequence region [start:end], interpreted as in slice notation.

    Arguments:
     - sub - the subsequence to search for: a string, bytes, bytearray,
       Seq or MutableSeq object
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found. A TypeError is raised if
    sub is of any other type.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rfind("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rfind("AUG", end=15)
    3

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, _SeqAbstractBaseClass):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rfind(needle, start, end)
jpayne@68	859
def index(self, sub, start=None, end=None):
    """Return the lowest index at which subsequence sub occurs.

    The optional arguments start and end restrict the search to the
    sequence region [start:end], interpreted as in slice notation.

    Arguments:
     - sub - the subsequence to search for: a string, bytes, bytearray,
       Seq or MutableSeq object
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError
    if sub is of any other type.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.index("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.index("AUG", 4)
    15

    This method performs the same search as the ``find`` method. However,
    if the subsequence is not found, ``find`` returns -1 while ``index``
    raises a ValueError:

    >>> my_rna.index("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.find("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, MutableSeq):
        # A MutableSeq's underlying data is used directly.
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.index(needle, start, end)
jpayne@68	913
def rindex(self, sub, start=None, end=None):
    """Return the highest index at which subsequence sub occurs.

    The optional arguments start and end restrict the search to the
    sequence region [start:end], interpreted as in slice notation.

    Arguments:
     - sub - the subsequence to search for: a string, bytes, bytearray,
       Seq or MutableSeq object
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError
    if sub is of any other type.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rindex("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rindex("AUG", end=15)
    3

    This method performs the same search as the ``rfind`` method. However,
    if the subsequence is not found, ``rfind`` returns -1 while ``rindex``
    raises a ValueError:

    >>> my_rna.rindex("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.rfind("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, MutableSeq):
        # A MutableSeq's underlying data is used directly.
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rindex(needle, start, end)
jpayne@68	967
jpayne@68	968 def search(self, subs):
jpayne@68	969 """Search the substrings subs in self and yield the index and substring found.
jpayne@68	970
jpayne@68	971 Arguments:
jpayne@68	972 - subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
jpayne@68	973 objects containing the substrings to search for.
jpayne@68	974
jpayne@68	975 >>> from Bio.Seq import Seq
jpayne@68	976 >>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
jpayne@68	977 >>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
jpayne@68	978 >>> for index, substring in matches:
jpayne@68	979 ... print(index, substring)
jpayne@68	980 ...
jpayne@68	981 7 CC
jpayne@68	982 9 ATTG
jpayne@68	983 20 CC
jpayne@68	984 34 CC
jpayne@68	985 34 CCC
jpayne@68	986 35 CC
jpayne@68	987 """
jpayne@68	988 subdict = collections.defaultdict(set)
jpayne@68	989 for index, sub in enumerate(subs):
jpayne@68	990 if isinstance(sub, (_SeqAbstractBaseClass, bytearray)):
jpayne@68	991 sub = bytes(sub)
jpayne@68	992 elif isinstance(sub, str):
jpayne@68	993 sub = sub.encode("ASCII")
jpayne@68	994 elif not isinstance(sub, bytes):
jpayne@68	995 raise TypeError(
jpayne@68	996 "subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
jpayne@68	997 % (index, type(sub))
jpayne@68	998 )
jpayne@68	999 length = len(sub)
jpayne@68	1000 subdict[length].add(sub)
jpayne@68	1001 for start in range(len(self) - 1):
jpayne@68	1002 for length, subs in subdict.items():
jpayne@68	1003 stop = start + length
jpayne@68	1004 for sub in subs:
jpayne@68	1005 if self._data[start:stop] == sub:
jpayne@68	1006 yield (start, sub.decode())
jpayne@68	1007 break
jpayne@68	1008
jpayne@68	1009 def startswith(self, prefix, start=None, end=None):
jpayne@68	1010 """Return True if the sequence starts with the given prefix, False otherwise.
jpayne@68	1011
jpayne@68	1012 Return True if the sequence starts with the specified prefix
jpayne@68	1013 (a string or another Seq object), False otherwise.
jpayne@68	1014 With optional start, test sequence beginning at that position.
jpayne@68	1015 With optional end, stop comparing sequence at that position.
jpayne@68	1016 prefix can also be a tuple of strings to try. e.g.
jpayne@68	1017
jpayne@68	1018 >>> from Bio.Seq import Seq
jpayne@68	1019 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
jpayne@68	1020 >>> my_rna.startswith("GUC")
jpayne@68	1021 True
jpayne@68	1022 >>> my_rna.startswith("AUG")
jpayne@68	1023 False
jpayne@68	1024 >>> my_rna.startswith("AUG", 3)
jpayne@68	1025 True
jpayne@68	1026 >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
jpayne@68	1027 True
jpayne@68	1028 """
jpayne@68	1029 if isinstance(prefix, tuple):
jpayne@68	1030 prefix = tuple(
jpayne@68	1031 bytes(p) if isinstance(p, _SeqAbstractBaseClass) else p.encode("ASCII")
jpayne@68	1032 for p in prefix
jpayne@68	1033 )
jpayne@68	1034 elif isinstance(prefix, _SeqAbstractBaseClass):
jpayne@68	1035 prefix = bytes(prefix)
jpayne@68	1036 elif isinstance(prefix, str):
jpayne@68	1037 prefix = prefix.encode("ASCII")
jpayne@68	1038 return self._data.startswith(prefix, start, end)
jpayne@68	1039
jpayne@68	1040 def endswith(self, suffix, start=None, end=None):
jpayne@68	1041 """Return True if the sequence ends with the given suffix, False otherwise.
jpayne@68	1042
jpayne@68	1043 Return True if the sequence ends with the specified suffix
jpayne@68	1044 (a string or another Seq object), False otherwise.
jpayne@68	1045 With optional start, test sequence beginning at that position.
jpayne@68	1046 With optional end, stop comparing sequence at that position.
jpayne@68	1047 suffix can also be a tuple of strings to try. e.g.
jpayne@68	1048
jpayne@68	1049 >>> from Bio.Seq import Seq
jpayne@68	1050 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
jpayne@68	1051 >>> my_rna.endswith("UUG")
jpayne@68	1052 True
jpayne@68	1053 >>> my_rna.endswith("AUG")
jpayne@68	1054 False
jpayne@68	1055 >>> my_rna.endswith("AUG", 0, 18)
jpayne@68	1056 True
jpayne@68	1057 >>> my_rna.endswith(("UCC", "UCA", "UUG"))
jpayne@68	1058 True
jpayne@68	1059 """
jpayne@68	1060 if isinstance(suffix, tuple):
jpayne@68	1061 suffix = tuple(
jpayne@68	1062 bytes(p) if isinstance(p, _SeqAbstractBaseClass) else p.encode("ASCII")
jpayne@68	1063 for p in suffix
jpayne@68	1064 )
jpayne@68	1065 elif isinstance(suffix, _SeqAbstractBaseClass):
jpayne@68	1066 suffix = bytes(suffix)
jpayne@68	1067 elif isinstance(suffix, str):
jpayne@68	1068 suffix = suffix.encode("ASCII")
jpayne@68	1069 return self._data.endswith(suffix, start, end)
jpayne@68	1070
def split(self, sep=None, maxsplit=-1):
    """Return a list of subsequences when splitting the sequence by separator sep.

    The sequence is cut at every occurrence of the delimiter sep and the
    pieces are returned as a list of Seq objects. If maxsplit is given,
    at most maxsplit splits are done; if it is omitted, all splits are
    made.

    For consistency with the ``split`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.split("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')

    See also the rsplit method, which splits the sequence starting from the
    end:

    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')
    """
    if isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    elif isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    else:
        # None (split on whitespace) and bytes-like separators pass through.
        delimiter = sep
    pieces = self._data.split(delimiter, maxsplit)
    return list(map(Seq, pieces))
jpayne@68	1113
def rsplit(self, sep=None, maxsplit=-1):
    """Return a list of subsequences by splitting the sequence from the right.

    The sequence is cut at occurrences of the delimiter sep, working from
    the end, and the pieces are returned as a list of Seq objects. If
    maxsplit is given, at most maxsplit splits are done; if it is omitted,
    all splits are made.

    For consistency with the ``rsplit`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.rsplit("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')

    See also the split method, which splits the sequence starting from the
    beginning:

    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')
    """
    if isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    elif isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    else:
        # None (split on whitespace) and bytes-like separators pass through.
        delimiter = sep
    pieces = self._data.rsplit(delimiter, maxsplit)
    return list(map(Seq, pieces))
jpayne@68	1156
def strip(self, chars=None, inplace=False):
    """Return a sequence object with leading and trailing ends stripped.

    With default arguments, leading and trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.strip()
    Seq('ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars``
    are removed instead. Their order is not important:

    >>> Seq("ACGTACGT").strip("TGCA")
    Seq('')

    With ``inplace`` set to ``False`` (the default value) a stripped copy
    of the sequence is returned. With ``inplace`` set to ``True`` the
    sequence itself is stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.strip()
    MutableSeq('ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.strip(inplace=True)
    MutableSeq('ACGT')
    >>> seq
    MutableSeq('ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
    is called on a ``Seq`` object with ``inplace=True``.

    See also the lstrip and rstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        strip_set = bytes(chars)
    elif isinstance(chars, str):
        strip_set = chars.encode("ASCII")
    else:
        strip_set = chars
    try:
        stripped = self._data.strip(strip_set)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) data can be overwritten in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@68	1210
def lstrip(self, chars=None, inplace=False):
    """Return a sequence object with the leading end stripped.

    With default arguments, leading whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.lstrip()
    Seq('ACGT ')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars``
    are removed from the leading end instead. Their order is not
    important:

    >>> Seq("ACGACGTTACG").lstrip("GCA")
    Seq('TTACG')

    With ``inplace`` set to ``False`` (the default value) a stripped copy
    of the sequence is returned. With ``inplace`` set to ``True`` the
    sequence itself is stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.lstrip()
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.lstrip(inplace=True)
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq('ACGT ')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and rstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        strip_set = bytes(chars)
    elif isinstance(chars, str):
        strip_set = chars.encode("ASCII")
    else:
        strip_set = chars
    try:
        stripped = self._data.lstrip(strip_set)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) data can be overwritten in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@68	1265
def rstrip(self, chars=None, inplace=False):
    """Return a sequence object with the trailing end stripped.

    With default arguments, trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.rstrip()
    Seq(' ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars``
    are removed from the trailing end instead. Their order is not
    important:

    >>> Seq("ACGACGTTACG").rstrip("GCA")
    Seq('ACGACGTT')

    With ``inplace`` set to ``False`` (the default value) a stripped copy
    of the sequence is returned. With ``inplace`` set to ``True`` the
    sequence itself is stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.rstrip()
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.rstrip(inplace=True)
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``rstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and lstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        strip_set = bytes(chars)
    elif isinstance(chars, str):
        strip_set = chars.encode("ASCII")
    else:
        strip_set = chars
    try:
        stripped = self._data.rstrip(strip_set)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) data can be overwritten in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@68	1320
def removeprefix(self, prefix, inplace=False):
    """Return a sequence object with prefix (left) removed.

    This behaves like the python string method of the same name: if the
    sequence starts with ``prefix`` that prefix is dropped, otherwise the
    sequence is left as it is.

    Arguments:
     - prefix - a string, Seq, MutableSeq, or bytes-like object. Unlike
       ``startswith``, a tuple of alternatives is not accepted.
     - inplace - if ``False`` (the default value) a new sequence object is
       returned; if ``True`` the sequence is modified in-place and
       returned.

    A ``TypeError`` is raised if ``prefix`` is of an unsupported type.

    e.g. Removing a start Codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("ATGGTGTGTGT")
    >>> my_seq
    Seq('ATGGTGTGTGT')
    >>> my_seq.removeprefix('ATG')
    Seq('GTGTGTGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``.

    See also the removesuffix method.
    """
    message = "argument must be a string, Seq, MutableSeq, or bytes-like object"
    if isinstance(prefix, tuple):
        # Rejected up front so that every Python version behaves alike; the
        # pre-3.9 fallback below would otherwise accept a tuple through
        # startswith and then cut off len(tuple) letters.
        raise TypeError(message)
    if isinstance(prefix, str):
        prefix = prefix.encode("ASCII")
    elif isinstance(prefix, (bytes, bytearray)):
        pass
    elif isinstance(prefix, _SeqAbstractBaseClass):
        prefix = bytes(prefix)
    try:
        data = self._data.removeprefix(prefix)
    except TypeError:
        raise TypeError(message) from None
    except AttributeError:
        # Fall back for pre-Python 3.9
        data = self._data
        try:
            matched = data.startswith(prefix)
        except TypeError:
            # Same message as on the modern code path.
            raise TypeError(message) from None
        if matched:
            data = data[len(prefix) :]
    if inplace:
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    else:
        return self.__class__(data)
jpayne@68	1362
def removesuffix(self, suffix, inplace=False):
    """Return a sequence object with suffix (right) removed.

    This behaves like the python string method of the same name: if the
    sequence ends with ``suffix`` that suffix is dropped, otherwise the
    sequence is left as it is. An empty suffix removes nothing.

    Arguments:
     - suffix - a string, Seq, MutableSeq, or bytes-like object. Unlike
       ``endswith``, a tuple of alternatives is not accepted.
     - inplace - if ``False`` (the default value) a new sequence object is
       returned; if ``True`` the sequence is modified in-place and
       returned.

    A ``TypeError`` is raised if ``suffix`` is of an unsupported type.

    e.g. Removing a stop codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("GTGTGTGTTAG")
    >>> my_seq
    Seq('GTGTGTGTTAG')
    >>> stop_codon = Seq("TAG")
    >>> my_seq.removesuffix(stop_codon)
    Seq('GTGTGTGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``.

    See also the removeprefix method.
    """
    message = "argument must be a string, Seq, MutableSeq, or bytes-like object"
    if isinstance(suffix, tuple):
        # Rejected up front so that every Python version behaves alike; the
        # pre-3.9 fallback below would otherwise accept a tuple through
        # endswith and then cut off len(tuple) letters.
        raise TypeError(message)
    if isinstance(suffix, str):
        suffix = suffix.encode("ASCII")
    elif isinstance(suffix, (bytes, bytearray)):
        pass
    elif isinstance(suffix, _SeqAbstractBaseClass):
        suffix = bytes(suffix)
    try:
        data = self._data.removesuffix(suffix)
    except TypeError:
        raise TypeError(message) from None
    except AttributeError:
        # Fall back for pre-Python 3.9
        data = self._data
        try:
            matched = data.endswith(suffix)
        except TypeError:
            # Same message as on the modern code path.
            raise TypeError(message) from None
        # An empty suffix always "matches", but data[:-0] is data[:0] and
        # would wipe the whole sequence, so only slice for a real suffix.
        if matched and len(suffix) > 0:
            data = data[: -len(suffix)]
    if inplace:
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    else:
        return self.__class__(data)
jpayne@68	1405
def upper(self, inplace=False):
    """Return the sequence in upper case.

    When inplace is False (the default value), the sequence itself is left
    untouched and an upper-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    When inplace is True, the sequence is modified in-place and returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``upper`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``lower`` method.
    """
    uppercased = self._data.upper()
    if not inplace:
        return self.__class__(uppercased)
    # Only bytearray-backed (mutable) data can be overwritten in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = uppercased
    return self
jpayne@68	1457
def lower(self, inplace=False):
    """Return the sequence in lower case.

    When inplace is False (the default value), the sequence itself is left
    untouched and a lower-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    When inplace is True, the sequence is modified in-place and returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lower`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``upper`` method.
    """
    lowercased = self._data.lower()
    if not inplace:
        return self.__class__(lowercased)
    # Only bytearray-backed (mutable) data can be overwritten in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = lowercased
    return self
jpayne@68	1509
def isupper(self):
    """Return True if all ASCII characters in data are uppercase.

    The test is delegated to the underlying sequence data. If there are
    no cased characters, the method returns False.
    """
    sequence_data = self._data
    return sequence_data.isupper()
jpayne@68	1516
def islower(self):
    """Return True if all ASCII characters in data are lowercase.

    The test is delegated to the underlying sequence data. If there are
    no cased characters, the method returns False.
    """
    sequence_data = self._data
    return sequence_data.islower()
jpayne@68	1523
def translate(
    self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
):
    """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.

    This method will translate DNA or RNA sequences. It should not
    be used on protein sequences as any result will be biologically
    meaningless.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable
       object (useful for non-standard genetic codes).  This
       defaults to the "Standard" table.
     - stop_symbol - Single character string, what to use for
       terminators.  This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons (translated as the
       specified stop_symbol).  If True, translation is terminated at
       the first in frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
       this checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to the minus sign.

    A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
    object; a ``MutableSeq`` object is returned if ``translate`` is called
    on a ``MutableSeq`` object.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*')
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@')
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR')

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*')
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR')

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR')

    It isn't a valid CDS under NCBI table 1, due to both the start codon
    and also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR')
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR')

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead
    """
    try:
        data = str(self)
    except UndefinedSequenceError:
        # translating an undefined sequence yields an undefined
        # sequence with the length divided by 3
        n = len(self)
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "This may become an error in future.",
                BiopythonWarning,
            )
        return Seq(None, n // 3)

    # Reuse the string obtained above rather than converting the
    # sequence to a string a second time.
    return self.__class__(
        _translate_str(data, table, stop_symbol, to_stop, cds, gap=gap)
    )
jpayne@68	1623
def complement(self, inplace=False):
    """Return the complement as a DNA sequence.

    >>> Seq("CGA").complement()
    Seq('GCT')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").complement()
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement()
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement(inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # An undefined sequence complements to an undefined sequence of
        # the same length, so the object itself is handed back.
        return self
    if not inplace:
        return self.__class__(complemented)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
jpayne@68	1671
def complement_rna(self, inplace=False):
    """Return the complement as an RNA sequence.

    >>> Seq("CGA").complement_rna()
    Seq('GCU')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence by default:

    >>> Seq("CGA").complement()
    Seq('GCT')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement_rna()
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement_rna(inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # An undefined sequence complements to an undefined sequence of
        # the same length, so the object itself is handed back.
        return self
    if not inplace:
        return self.__class__(complemented)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
jpayne@68	1718
def reverse_complement(self, inplace=False):
    """Return the reverse complement as a DNA sequence.

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").reverse_complement()
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement()
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement(inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # The reverse complement of an undefined sequence is an undefined
        # sequence of the same length, so the object itself is handed back.
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Writing through a reversed slice stores the complement back to front.
    self._data[::-1] = complemented
    return self
jpayne@68	1766
def reverse_complement_rna(self, inplace=False):
    """Return the reverse complement as an RNA sequence.

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").reverse_complement_rna()
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement_rna()
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement_rna(inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # The reverse complement of an undefined sequence is an undefined
        # sequence of the same length, so the object itself is handed back.
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Writing through a reversed slice stores the complement back to front.
    self._data[::-1] = complemented
    return self
jpayne@68	1814
def transcribe(self, inplace=False):
    """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. This
    means we can get the RNA sequence just by switching T to U.

    >>> from Bio.Seq import Seq
    >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> coding_dna.transcribe()
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence.transcribe()
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    >>> sequence.transcribe(inplace=True)
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to transcribe an RNA sequence has no effect.
    If you have a nucleotide sequence which might be DNA or RNA
    (or even a mixture), calling the transcribe method will ensure
    any T becomes U.

    Trying to transcribe a protein sequence will replace any
    T for Threonine with U for Selenocysteine, which has no
    biologically plausible rationale.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRT")
    >>> my_protein.transcribe()
    Seq('MAIVMGRU')
    """
    # Upper-case letters are substituted first, then lower-case ones.
    upper_done = self._data.replace(b"T", b"U")
    rna = upper_done.replace(b"t", b"u")
    if not inplace:
        return self.__class__(rna)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = rna
    return self
jpayne@68	1868
def back_transcribe(self, inplace=False):
    """Return the DNA sequence from an RNA sequence by creating a new Seq object.

    >>> from Bio.Seq import Seq
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence.back_transcribe()
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    >>> sequence.back_transcribe(inplace=True)
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``back_transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to back-transcribe DNA has no effect. If you have a nucleotide
    sequence which might be DNA or RNA (or even a mixture), calling the
    back-transcribe method will ensure any U becomes T.

    Trying to back-transcribe a protein sequence will replace any U for
    Selenocysteine with T for Threonine, which is biologically meaningless.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRU")
    >>> my_protein.back_transcribe()
    Seq('MAIVMGRT')
    """
    # Upper-case letters are substituted first, then lower-case ones.
    upper_done = self._data.replace(b"U", b"T")
    dna = upper_done.replace(b"u", b"t")
    if not inplace:
        return self.__class__(dna)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = dna
    return self
jpayne@68	1916
def join(self, other):
    """Return a merge of the sequences in other, spaced by the sequence from self.

    Accepts a Seq object, MutableSeq object, or string (and iterates over
    the letters), or an iterable containing Seq, MutableSeq, or string
    objects. These arguments will be concatenated with the calling sequence
    as the spacer:

    >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
    >>> concatenated
    Seq('AAANNNNNTTTNNNNNPPP')

    Joining the letters of a single sequence:

    >>> Seq('NNNNN').join(Seq("ACGT"))
    Seq('ANNNNNCNNNNNGNNNNNT')
    >>> Seq('NNNNN').join("ACGT")
    Seq('ANNNNNCNNNNNGNNNNNT')

    One-shot iterables such as generators are supported as well; the
    items are collected while they are validated, so they are consumed
    only once.

    Raises a ``TypeError`` if ``other`` is a SeqRecord, or is an iterable
    containing a SeqRecord or any item that is not a Seq object,
    MutableSeq object, or string.
    """
    if isinstance(other, _SeqAbstractBaseClass):
        return self.__class__(str(self).join(str(other)))
    elif isinstance(other, str):
        return self.__class__(str(self).join(other))

    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(other, SeqRecord):
        raise TypeError("Iterable cannot be a SeqRecord")

    # Collect the items while validating them. Iterating ``other`` a
    # second time for the join would yield nothing if it is a generator
    # or other single-pass iterator, silently producing an empty result.
    items = []
    for c in other:
        if isinstance(c, SeqRecord):
            raise TypeError("Iterable cannot contain SeqRecords")
        elif not isinstance(c, (str, _SeqAbstractBaseClass)):
            raise TypeError(
                "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
            )
        items.append(c)
    return self.__class__(str(self).join([str(item) for item in items]))
jpayne@68	1954
def replace(self, old, new, inplace=False):
    """Return a copy with all occurrences of subsequence old replaced by new.

    >>> s = Seq("ACGTAACCGGTT")
    >>> t = s.replace("AC", "XYZ")
    >>> s
    Seq('ACGTAACCGGTT')
    >>> t
    Seq('XYZGTAXYZCGGTT')

    For mutable sequences, passing inplace=True will modify the sequence in place:

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ")
    >>> m
    MutableSeq('ACGTAACCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ", inplace=True)
    >>> m
    MutableSeq('XYZGTAXYZCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``replace`` is called on a ``Seq`` object with ``inplace=True``.
    """
    # Normalise both operands: sequence objects are converted with
    # bytes(), strings are ASCII-encoded, anything else passes through.
    operands = []
    for operand in (old, new):
        if isinstance(operand, _SeqAbstractBaseClass):
            operand = bytes(operand)
        elif isinstance(operand, str):
            operand = operand.encode("ASCII")
        operands.append(operand)
    pattern, substitute = operands
    replaced = self._data.replace(pattern, substitute)
    if not inplace:
        return self.__class__(replaced)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = replaced
    return self
jpayne@68	1999
@property
def defined(self):
    """Return True if the sequence is defined, False if undefined or partially defined.

    Sequences stored as plain bytes or bytearray data are always defined
    (this includes zero-length sequences); for any other kind of data
    the answer is delegated to the data object's own ``defined``
    attribute.
    """
    contents = self._data
    if not isinstance(contents, (bytes, bytearray)):
        return contents.defined
    return True
jpayne@68	2010
@property
def defined_ranges(self):
    """Return a tuple of the ranges where the sequence contents is defined.

    The return value has the format ((start1, end1), (start2, end2), ...).

    For plain bytes or bytearray data this is a single range spanning
    the whole sequence, or an empty tuple for a zero-length sequence;
    for any other kind of data the answer is delegated to the data
    object's own ``defined_ranges`` attribute.
    """
    contents = self._data
    if not isinstance(contents, (bytes, bytearray)):
        return contents.defined_ranges
    size = len(self)
    return ((0, size),) if size > 0 else ()
jpayne@68	2025
jpayne@68	2026
class Seq(_SeqAbstractBaseClass):
    """Read-only sequence object (essentially a string with biological methods).

    Like normal python strings, our basic sequence object is immutable.
    This prevents you from doing my_seq[5] = "A" for example, but does allow
    Seq objects to be used as dictionary keys.

    The Seq object provides a number of string like methods (such as count,
    find, split and strip).

    The Seq object also provides some biological methods, such as complement,
    reverse_complement, transcribe, back_transcribe and translate (which are
    not applicable to protein sequences).
    """

    _data: Union[bytes, SequenceDataAbstractBaseClass]

    def __init__(
        self,
        data: Union[
            str,
            bytes,
            bytearray,
            _SeqAbstractBaseClass,
            SequenceDataAbstractBaseClass,
            dict,
            None,
        ],
        length: Optional[int] = None,
    ):
        """Create a Seq object.

        Arguments:
         - data - Sequence, required (string)
         - length - Sequence length, used only if data is None or a dictionary (integer)

        You will typically use Bio.SeqIO to read in sequences from files as
        SeqRecord objects, whose sequence will be exposed as a Seq object via
        the seq property.

        However, you can also create a Seq object directly:

        >>> from Bio.Seq import Seq
        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
        >>> my_seq
        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
        >>> print(my_seq)
        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF

        To create a Seq object for a sequence of known length but
        unknown sequence contents, use None for the data argument and pass
        the sequence length for the length argument. Trying to access the
        sequence contents of a Seq object created in this way will raise
        an UndefinedSequenceError:

        >>> my_undefined_sequence = Seq(None, 20)
        >>> my_undefined_sequence
        Seq(None, length=20)
        >>> len(my_undefined_sequence)
        20
        >>> print(my_undefined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is undefined

        If the sequence contents is known for parts of the sequence only, use
        a dictionary for the data argument to pass the known sequence segments:

        >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
        >>> my_partially_defined_sequence
        Seq({3: 'ACGT'}, length=10)
        >>> len(my_partially_defined_sequence)
        10
        >>> print(my_partially_defined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
        >>> my_partially_defined_sequence[3:7]
        Seq('ACGT')
        >>> print(my_partially_defined_sequence[3:7])
        ACGT

        A ``ValueError`` is raised if the length is missing or negative
        when it is required, or if the segments in a dictionary start at
        a negative position, overlap, extend beyond the sequence length,
        or cannot be converted to bytes. A ``TypeError`` is raised if
        data is of an unsupported type.
        """
        if data is None:
            if length is None:
                raise ValueError("length must not be None if data is None")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                self._data = _UndefinedSequenceData(length)
        elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
            self._data = data
        elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
            self._data = bytes(data)
        elif isinstance(data, str):
            self._data = bytes(data, encoding="ASCII")
        elif isinstance(data, dict):
            if length is None:
                raise ValueError("length must not be None if data is a dictionary")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                current = 0  # not needed here, but it keeps mypy happy
                end = -1  # sentinel: no segment has been stored yet
                starts = sorted(data.keys())
                _data: Dict[int, bytes] = {}
                for start in starts:
                    seq = data[start]
                    if isinstance(seq, str):
                        seq = bytes(seq, encoding="ASCII")
                    else:
                        try:
                            seq = bytes(seq)
                        except Exception as exc:
                            raise ValueError(
                                "Expected bytes-like objects or strings"
                            ) from exc
                    if start < 0:
                        # Reject negative positions explicitly. A start of
                        # -1 would otherwise equal the initial ``end``
                        # sentinel and be treated as adjoining a segment
                        # that does not exist (raising KeyError).
                        raise ValueError(
                            "Sequence start positions must not be negative."
                        )
                    if start < end:
                        raise ValueError("Sequence data are overlapping.")
                    elif start == end:
                        # Segment directly follows the previous one: merge.
                        _data[current] += seq  # noqa: F821
                    else:
                        _data[start] = seq
                        current = start
                    end = start + len(seq)
                if end > length:
                    raise ValueError(
                        "Provided sequence data extend beyond sequence length."
                    )
                elif end == length and current == 0:
                    # sequence is fully defined
                    self._data = _data[current]
                else:
                    self._data = _PartiallyDefinedSequenceData(length, _data)
        else:
            raise TypeError(
                "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
            )

    def __hash__(self):
        """Hash of the sequence as a string for comparison.

        See Seq object comparison documentation (method ``__eq__`` in
        particular) as this has changed in Biopython 1.65. Older versions
        would hash on object identity.
        """
        return hash(self._data)
jpayne@68	2175
jpayne@68	2176
jpayne@68	2177 class MutableSeq(_SeqAbstractBaseClass):
jpayne@68	2178 """An editable sequence object.
jpayne@68	2179
jpayne@68	2180 Unlike normal python strings and our basic sequence object (the Seq class)
jpayne@68	2181 which are immutable, the MutableSeq lets you edit the sequence in place.
jpayne@68	2182 However, this means you cannot use a MutableSeq object as a dictionary key.
jpayne@68	2183
jpayne@68	2184 >>> from Bio.Seq import MutableSeq
jpayne@68	2185 >>> my_seq = MutableSeq("ACTCGTCGTCG")
jpayne@68	2186 >>> my_seq
jpayne@68	2187 MutableSeq('ACTCGTCGTCG')
jpayne@68	2188 >>> my_seq[5]
jpayne@68	2189 'T'
jpayne@68	2190 >>> my_seq[5] = "A"
jpayne@68	2191 >>> my_seq
jpayne@68	2192 MutableSeq('ACTCGACGTCG')
jpayne@68	2193 >>> my_seq[5]
jpayne@68	2194 'A'
jpayne@68	2195 >>> my_seq[5:8] = "NNN"
jpayne@68	2196 >>> my_seq
jpayne@68	2197 MutableSeq('ACTCGNNNTCG')
jpayne@68	2198 >>> len(my_seq)
jpayne@68	2199 11
jpayne@68	2200
jpayne@68	2201 Note that the MutableSeq object does not support as many string-like
jpayne@68	2202 or biological methods as the Seq object.
jpayne@68	2203 """
jpayne@68	2204
def __init__(self, data):
    """Create a MutableSeq object.

    A bytearray is stored as given (not copied); bytes, strings, Seq
    objects and other MutableSeq objects are copied into a new
    bytearray. Any other type raises a ``TypeError``.
    """
    if isinstance(data, bytearray):
        contents = data
    elif isinstance(data, bytes):
        contents = bytearray(data)
    elif isinstance(data, str):
        contents = bytearray(data, "ASCII")
    elif isinstance(data, MutableSeq):
        # Slice to take a copy, so the two objects do not share a buffer.
        contents = data._data[:]
    elif isinstance(data, Seq):
        # Make no assumptions about the Seq subclass internal storage
        contents = bytearray(bytes(data))
    else:
        raise TypeError(
            "data should be a string, bytearray object, Seq object, or a "
            "MutableSeq object"
        )
    self._data = contents
jpayne@68	2223
def __setitem__(self, index, value):
    """Set a subsequence of single letter via value parameter.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq[0] = 'T'
    >>> my_seq
    MutableSeq('TCTCGACGTCG')

    With an integer index a single letter is replaced; with any other
    index (such as a slice) the value must be a MutableSeq, a Seq or a
    string, otherwise a ``TypeError`` is raised.
    """
    if isinstance(index, numbers.Integral):
        # Replacing a single letter with a new string
        self._data[index] = ord(value)
        return
    # Replacing a sub-sequence
    if isinstance(value, MutableSeq):
        replacement = value._data
    elif isinstance(value, Seq):
        replacement = bytes(value)
    elif isinstance(value, str):
        replacement = value.encode("ASCII")
    else:
        raise TypeError(f"received unexpected type '{type(value).__name__}'")
    self._data[index] = replacement
jpayne@68	2245
jpayne@68	2246 def __delitem__(self, index):
jpayne@68	2247 """Delete a subsequence of single letter.
jpayne@68	2248
jpayne@68	2249 >>> my_seq = MutableSeq('ACTCGACGTCG')
jpayne@68	2250 >>> del my_seq[0]
jpayne@68	2251 >>> my_seq
jpayne@68	2252 MutableSeq('CTCGACGTCG')
jpayne@68	2253 """
jpayne@68	2254 # Could be deleting a single letter, or a slice
jpayne@68	2255 del self._data[index]
jpayne@68	2256
def append(self, c):
    """Add a single letter to the end of the mutable sequence object.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.append('A')
    >>> my_seq
    MutableSeq('ACTCGACGTCGA')

    No return value.
    """
    encoded = c.encode("ASCII")
    self._data.append(ord(encoded))
jpayne@68	2268
def insert(self, i, c):
    """Add a single letter to the mutable sequence object at a given index.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.insert(0,'A')
    >>> my_seq
    MutableSeq('AACTCGACGTCG')
    >>> my_seq.insert(8,'G')
    >>> my_seq
    MutableSeq('AACTCGACGGTCG')

    No return value.
    """
    encoded = c.encode("ASCII")
    self._data.insert(i, ord(encoded))
jpayne@68	2283
def pop(self, i=-1):
    """Remove the single letter at the given index and return it.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.pop()
    'G'
    >>> my_seq
    MutableSeq('ACTCGACGTC')
    >>> my_seq.pop()
    'C'
    >>> my_seq
    MutableSeq('ACTCGACGT')

    By default the last letter of the sequence is removed. The removed
    letter is returned as a single character string.
    """
    code = self._data[i]
    del self._data[i]
    letter = chr(code)
    return letter
jpayne@68	2302
def remove(self, item):
    """Remove the first occurrence of the single letter ``item``.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.remove('C')
    >>> my_seq
    MutableSeq('ATCGACGTCG')
    >>> my_seq.remove('A')
    >>> my_seq
    MutableSeq('TCGACGTCG')

    No return value. Raises ValueError if the letter is not present.
    """
    target = ord(item)
    try:
        self._data.remove(target)
    except ValueError:
        # Replace the buffer's own message with one phrased in terms of
        # the MutableSeq, and hide the internal exception from the traceback.
        raise ValueError("value not found in MutableSeq") from None
jpayne@68	2321
def reverse(self):
    """Reverse the order of the letters of the mutable sequence in place.

    No return value.
    """
    # Delegates to the in-place reverse of the underlying data buffer.
    self._data.reverse()
jpayne@68	2328
def extend(self, other):
    """Append all letters of ``other`` to the end of the mutable sequence.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.extend('A')
    >>> my_seq
    MutableSeq('ACTCGACGTCGA')
    >>> my_seq.extend('TTT')
    >>> my_seq
    MutableSeq('ACTCGACGTCGATTT')

    ``other`` may be a string, a Seq or a MutableSeq; any other type raises
    a TypeError.

    No return value.
    """
    # Work out the raw letters to add first, then extend the buffer once.
    if isinstance(other, MutableSeq):
        addition = other._data
    elif isinstance(other, Seq):
        addition = bytes(other)
    elif isinstance(other, str):
        addition = other.encode("ASCII")
    else:
        raise TypeError("expected a string, Seq or MutableSeq")
    self._data.extend(addition)
jpayne@68	2350
jpayne@68	2351
class UndefinedSequenceError(ValueError):
    """Raised when sequence contents are requested but are not defined."""
jpayne@68	2354
jpayne@68	2355
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Sequence data of known length but entirely unknown contents (PRIVATE).

    Objects of this class can be used to create a Seq object representing a
    sequence whose length is known while its letters are not.
    ``__len__`` reports the stored length. ``__getitem__`` returns an empty
    bytes object for a zero-size slice, a shorter undefined object for any
    other slice, and raises UndefinedSequenceError for everything else.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Store the sequence length.

        The caller is responsible for ensuring that the length is greater
        than zero; it is not checked here.
        """
        self._length = length
        super().__init__()

    def __getitem__(self, key: slice) -> Union[bytes, "_UndefinedSequenceData"]:
        # Only slices can be answered without knowing the letters.
        if not isinstance(key, slice):
            raise UndefinedSequenceError("Sequence content is undefined")
        size = len(range(*key.indices(self._length)))
        if size == 0:
            return b""
        return _UndefinedSequenceData(size)

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        total = len(self) + len(other)
        try:
            suffix = bytes(other)
        except UndefinedSequenceError:
            if not isinstance(other, _UndefinedSequenceData):
                # Let the right-hand operand's __radd__ take over.
                return NotImplemented
            # undefined + undefined is simply a longer undefined sequence
            return _UndefinedSequenceData(total)
        # The defined letters start right after our undefined stretch.
        return _PartiallyDefinedSequenceData(total, {len(self): suffix})

    def __radd__(self, other):
        # The defined letters of ``other`` come first, at position zero.
        prefix = {0: bytes(other)}
        total = len(other) + len(self)
        return _PartiallyDefinedSequenceData(total, prefix)

    def upper(self):
        """Return an upper case copy of the sequence."""
        # Changing case cannot change the length, and nothing else is known.
        return _UndefinedSequenceData(self._length)

    def lower(self):
        """Return a lower case copy of the sequence."""
        # Changing case cannot change the length, and nothing else is known.
        return _UndefinedSequenceData(self._length)

    def isupper(self):
        """Always raise UndefinedSequenceError.

        Whether the letters are upper case cannot be decided for a sequence
        whose contents are undefined.
        """
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Always raise UndefinedSequenceError.

        Whether the letters are lower case cannot be decided for a sequence
        whose contents are undefined.
        """
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # When old and new have the same number of characters the result is
        # an undefined sequence of unchanged length. Otherwise the resulting
        # length would depend on the unknown contents, so an error is raised.
        if len(old) == len(new):
            return _UndefinedSequenceData(self._length)
        raise UndefinedSequenceError("Sequence content is undefined")

    @property
    def defined(self):
        """Return False, as the sequence is not defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        No part of an _UndefinedSequenceData object is defined, so the
        result is always the empty tuple.
        """
        return ()
jpayne@68	2462
jpayne@68	2463
class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores a sequence of known length with partially known contents (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but with a sequence contents that is only
    partially known.
    Calling __len__ returns the sequence length, calling __getitem__ returns
    the sequence contents if known, otherwise an UndefinedSequenceError is
    raised.

    The known segments are held in ``_data``, a dictionary mapping the start
    position of each segment to its letters (a bytes-like object).
    NOTE(review): ``__add__`` treats the last dictionary entry as the
    right-most segment, so entries are presumably expected in ascending
    start order - confirm against the code creating these objects.
    """

    __slots__ = ("_length", "_data")

    def __init__(self, length, data):
        """Initialize with the sequence length and defined sequence segments.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        self._data = data
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[bytes, SequenceDataAbstractBaseClass]:
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            data = {}
            for s, d in self._data.items():
                # Express the requested slice in coordinates relative to the
                # start of this segment.
                indices = range(-s, -s + self._length)[key]
                e: Optional[int] = indices.stop
                assert e is not None
                if step > 0:
                    if e <= 0:
                        # the slice ends before this segment begins
                        continue
                    if indices.start < 0:
                        # first slice position falling inside the segment
                        s = indices.start % step
                    else:
                        s = indices.start
                else:  # step < 0
                    if e < 0:
                        e = None
                    end = len(d) - 1
                    if indices.start > end:
                        s = end + (indices.start - end) % step
                    else:
                        s = indices.start
                    if s < 0:
                        continue
                # position of the extracted letters within the result
                start = (s - indices.start) // step
                d = d[s:e:step]
                if d:
                    data[start] = d
            if len(data) == 0:  # Fully undefined sequence
                return _UndefinedSequenceData(size)
            # merge adjacent sequence segments
            end = -1
            previous = 0  # not needed here, but it keeps flake happy
            items = data.items()
            data = {}
            for start, seq in items:
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
            if len(data) == 1:
                seq = data.get(0)
                if seq is not None and len(seq) == size:
                    return seq  # Fully defined sequence; return bytes
            if step < 0:
                # use this after we drop Python 3.7:
                # data = {start: data[start] for start in reversed(data)}
                # use this as long as we support Python 3.7:
                data = {start: data[start] for start in reversed(list(data.keys()))}
            return _PartiallyDefinedSequenceData(size, data)
        # Integer index. Follow the usual sequence convention: negative
        # indices count from the end, anything outside the sequence is an
        # IndexError.
        if key < 0:
            key += self._length
        if key < 0 or self._length <= key:
            raise IndexError("sequence index out of range")
        for start, seq in self._data.items():
            if start <= key < start + len(seq):
                return seq[key - start]
        raise UndefinedSequenceError("Sequence at position %d is undefined" % key)

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def __add__(self, other):
        length = len(self) + len(other)
        data = dict(self._data)
        items = list(self._data.items())
        # The last segment; if it reaches our end it can be merged with
        # defined letters at the very start of ``other``.
        start, seq = items[-1]
        end = start + len(seq)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                # nothing defined to add; only the length grows
                pass
            elif isinstance(other, _PartiallyDefinedSequenceData):
                other_items = list(other._data.items())
                if end == len(self):
                    other_start, other_seq = other_items.pop(0)
                    if other_start == 0:
                        data[start] += other_seq
                    else:
                        data[len(self) + other_start] = other_seq
                for other_start, other_seq in other_items:
                    data[len(self) + other_start] = other_seq
        else:
            if end == len(self):
                data[start] += other
            else:
                data[len(self)] = other
        return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        length = len(other) + len(self)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            # ``other`` contributes no letters; just shift our segments.
            data = {len(other) + start: seq for start, seq in self._data.items()}
        else:
            data = {0: other}
            items = list(self._data.items())
            start, seq = items.pop(0)
            if start == 0:
                # our first segment directly follows ``other``: merge them
                data[0] += seq
            else:
                data[len(other) + start] = seq
            for start, seq in items:
                data[len(other) + start] = seq
        return _PartiallyDefinedSequenceData(length, data)

    def __mul__(self, other):
        # As for bytes, repeating zero or fewer times gives an empty
        # sequence. Returning bytes keeps the "length greater than zero"
        # contract of this class intact.
        if other <= 0:
            return b""
        length = self._length
        items = self._data.items()
        data = {}
        end = -1
        previous = 0  # not needed here, but it keeps flake happy
        for i in range(other):
            for start, seq in items:
                start += i * length
                if end == start:
                    # segment continues the previous one (possibly across
                    # the boundary between two repeats): merge them
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
        return _PartiallyDefinedSequenceData(length * other, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        data = {start: seq.upper() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def lower(self):
        """Return a lower case copy of the sequence."""
        data = {start: seq.lower() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def isupper(self):
        """Always raise UndefinedSequenceError.

        Whether all letters are upper case cannot be decided while part of
        the sequence contents is undefined.
        """
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def islower(self):
        """Always raise UndefinedSequenceError.

        Whether all letters are lower case cannot be decided while part of
        the sequence contents is undefined.
        """
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        items = self._data.items()
        data = {start: seq.translate(table, delete) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # Replacing substring old by new in the undefined sequence segments
        # will result in an undefined sequence segment of the same length, if
        # old and new have the same number of characters. If not, an error is
        # raised, as the correct start positions cannot be calculated reliably.
        if len(old) != len(new):
            raise UndefinedSequenceError(
                "Sequence content is only partially defined; substring \n"
                "replacement cannot be performed reliably"
            )
        items = self._data.items()
        data = {start: seq.replace(old, new) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    @property
    def defined(self):
        """Return False, as the sequence is not fully defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        return tuple((start, start + len(seq)) for start, seq in self._data.items())
jpayne@68	2688
jpayne@68	2689
jpayne@68	2690 # The transcribe, back_transcribe, and translate functions are
jpayne@68	2691 # user-friendly versions of the corresponding Seq/MutableSeq methods.
jpayne@68	2692 # The functions work both on Seq objects, and on strings.
jpayne@68	2693
jpayne@68	2694
def transcribe(dna):
    """Transcribe a DNA sequence into RNA.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. This
    means we can get the RNA sequence just by switching T to U.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> transcribe("ACTGN")
    'ACUGN'
    """
    if isinstance(dna, Seq):
        return dna.transcribe()
    if isinstance(dna, MutableSeq):
        # Go through an immutable copy so that a Seq is returned.
        return Seq(dna).transcribe()
    # Anything else is treated as a plain string; handle both letter cases.
    upper_done = dna.replace("T", "U")
    return upper_done.replace("t", "u")
jpayne@68	2717
jpayne@68	2718
def back_transcribe(rna):
    """Return the RNA sequence back-transcribed into DNA.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> back_transcribe("ACUGN")
    'ACTGN'
    """
    if isinstance(rna, Seq):
        return rna.back_transcribe()
    if isinstance(rna, MutableSeq):
        # Go through an immutable copy so that a Seq is returned.
        return Seq(rna).back_transcribe()
    # Anything else is treated as a plain string; handle both letter cases.
    upper_done = rna.replace("U", "T")
    return upper_done.replace("u", "t")
jpayne@68	2737
jpayne@68	2738
def _translate_str(
    sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
):
    """Translate nucleotide string into a protein string (PRIVATE).

    Arguments:
     - sequence - a string
     - table - Which codon table to use?  This can be either a name (string),
       an NCBI identifier (integer), or a CodonTable object (useful for
       non-standard genetic codes).  This defaults to the "Standard" table.
     - stop_symbol - a single character string, what to use for terminators.
     - to_stop - boolean, should translation terminate at the first
       in frame stop codon?  If there is no in-frame stop codon
       then translation continues to the end.
     - pos_stop - a single character string for a possible stop codon
       (e.g. TAN or NNN)
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.

    Returns a string.

    e.g.

    >>> from Bio.Data import CodonTable
    >>> table = CodonTable.ambiguous_dna_by_id[1]
    >>> _translate_str("AAA", table)
    'K'
    >>> _translate_str("TAR", table)
    '*'
    >>> _translate_str("TAN", table)
    'X'
    >>> _translate_str("TAN", table, pos_stop="@")
    '@'
    >>> _translate_str("TA?", table)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid

    In a change to older versions of Biopython, partial codons are now
    always regarded as an error (previously only checked if cds=True)
    and will trigger a warning (likely to become an exception in a
    future release).

    If cds=True, the start and stop codons are checked, and the start
    codon will be translated as methionine. The sequence must be a
    whole number of codons.

    >>> _translate_str("ATGCCCTAG", table, cds=True)
    'MP'
    >>> _translate_str("AAACCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
    >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found.
    """
    # --- Resolve the ``table`` argument into a codon table object ---
    try:
        table_id = int(table)
    except ValueError:
        # Not a number, so assume it is a table name.
        # The same table can be used for RNA or DNA
        try:
            codon_table = CodonTable.ambiguous_generic_by_name[table]
        except KeyError:
            if isinstance(table, str):
                raise ValueError(
                    "The Bio.Seq translate methods and function DO NOT "
                    "take a character string mapping table like the python "
                    "string object's translate method. "
                    "Use str(my_seq).translate(...) instead."
                ) from None
            raise TypeError("table argument must be integer or string") from None
    except (AttributeError, TypeError):
        # Cannot be converted to an integer at all; the only remaining
        # acceptable value is a CodonTable object.
        if not isinstance(table, CodonTable.CodonTable):
            raise ValueError("Bad table argument") from None
        codon_table = table
    else:
        # A table ID. The same table can be used for RNA or DNA
        codon_table = CodonTable.ambiguous_generic_by_id[table_id]

    sequence = sequence.upper()
    amino_acids = []
    forward_table = codon_table.forward_table
    stop_codons = codon_table.stop_codons
    alphabet = codon_table.nucleotide_alphabet
    if alphabet is None:
        # Assume the worst case, ambiguous DNA or RNA:
        alphabet = (
            IUPACData.ambiguous_dna_letters.upper()
            + IUPACData.ambiguous_rna_letters.upper()
        )
    else:
        alphabet = alphabet.upper()
    valid_letters = set(alphabet)
    n = len(sequence)

    # Check for tables with 'ambiguous' (dual-coding) stop codons:
    dual_coding = [c for c in stop_codons if c in forward_table]
    if dual_coding:
        example = dual_coding[0]
        example_aa = forward_table[example]
        if to_stop:
            raise ValueError(
                "You cannot use 'to_stop=True' with this table as it contains"
                f" {len(dual_coding)} codon(s) which can be both STOP and an"
                f" amino acid (e.g. '{example}' -> '{example_aa}' or STOP)."
            )
        warnings.warn(
            f"This table contains {len(dual_coding)} codon(s) which code(s) for"
            f" both STOP and an amino acid (e.g. '{example}' -> '{example_aa}'"
            " or STOP). Such codons will be translated as amino acid.",
            BiopythonWarning,
        )

    if cds:
        first_codon = sequence[:3]
        if str(first_codon).upper() not in codon_table.start_codons:
            raise CodonTable.TranslationError(
                f"First codon '{first_codon}' is not a start codon"
            )
        if n % 3 != 0:
            raise CodonTable.TranslationError(
                f"Sequence length {n} is not a multiple of three"
            )
        final_codon = sequence[-3:]
        if str(final_codon).upper() not in stop_codons:
            raise CodonTable.TranslationError(
                f"Final codon '{final_codon}' is not a stop codon"
            )
        # Don't translate the stop symbol, and manually translate the M
        sequence = sequence[3:-3]
        n -= 6
        amino_acids = ["M"]
    elif n % 3 != 0:
        warnings.warn(
            "Partial codon, len(sequence) not a multiple of three. "
            "Explicitly trim the sequence or add trailing N before "
            "translation. This may become an error in future.",
            BiopythonWarning,
        )

    if gap is not None:
        if not isinstance(gap, str):
            raise TypeError("Gap character should be a single character string.")
        if len(gap) > 1:
            raise ValueError("Gap character should be a single character string.")

    # --- Translate codon by codon, ignoring any trailing partial codon ---
    for offset in range(0, n - n % 3, 3):
        codon = sequence[offset : offset + 3]
        try:
            residue = forward_table[codon]
        except (KeyError, CodonTable.TranslationError):
            if codon in stop_codons:
                if cds:
                    raise CodonTable.TranslationError(
                        f"Extra in frame stop codon '{codon}' found."
                    ) from None
                if to_stop:
                    break
                residue = stop_symbol
            elif valid_letters.issuperset(set(codon)):
                # Possible stop codon (e.g. NNN or TAN)
                residue = pos_stop
            elif gap is not None and codon == gap * 3:
                # Gapped translation
                residue = gap
            else:
                raise CodonTable.TranslationError(
                    f"Codon '{codon}' is invalid"
                ) from None
        amino_acids.append(residue)
    return "".join(amino_acids)
jpayne@68	2916
jpayne@68	2917
def translate(
    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object. Given a Seq or
    MutableSeq, returns a Seq object.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable object
       (useful for non-standard genetic codes).  Defaults to the "Standard"
       table.
     - stop_symbol - Single character string, what to use for any
       terminators, defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons
       (translated as the specified stop_symbol).  If
       True, translation is terminated at the first in
       frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None. For a Seq or MutableSeq the value is passed on
       to the translate method only when it is not None; otherwise that
       method's own default gap handling applies.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    It will however translate either DNA or RNA.

    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
    stop codons'. These are stop codons with unambiguous sequence but which
    have a context dependent coding as STOP or as amino acid. With these tables
    'to_stop' must be False (otherwise a ValueError is raised). The dual
    coding codons will always be translated as amino acid, except for
    'cds=True', where the last codon will be translated as STOP.

    >>> coding_dna3 = "ATGGCACGGAAGTGA"
    >>> translate(coding_dna3)
    'MARK*'

    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
    'MARKW'

    It will however raise a BiopythonWarning (not shown).

    >>> translate(coding_dna3, table=27, cds=True)
    'MARK'

    >>> translate(coding_dna3, table=27, to_stop=True)
    Traceback (most recent call last):
       ...
    ValueError: You cannot use 'to_stop=True' with this table ...
    """
    if isinstance(sequence, Seq):
        target = sequence
    elif isinstance(sequence, MutableSeq):
        # Translate an immutable copy so that a Seq object is returned
        target = Seq(sequence)
    else:
        # Assume it's a string, return a string
        return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)
    if gap is None:
        # No explicit gap requested: keep the method's own default, exactly
        # as before.
        return target.translate(table, stop_symbol, to_stop, cds)
    # Previously an explicit gap was silently ignored for Seq/MutableSeq.
    return target.translate(table, stop_symbol, to_stop, cds, gap=gap)
jpayne@68	3020
jpayne@68	3021
def reverse_complement(sequence, inplace=False):
    """Return the reverse complement of a sequence, expressed as DNA.

    The type of the result follows the type of the input: a string gives
    a new string, a Seq object gives a new Seq object, a MutableSeq gives
    a new MutableSeq object, and a SeqRecord object gives a new SeqRecord
    object.

    Arguments:
     - sequence - a string, Seq, MutableSeq or SeqRecord object.
     - inplace - if True, modify the sequence in place and return it.
       Only meaningful for mutable sequences (default False).

    >>> my_seq = "CGA"
    >>> reverse_complement(my_seq)
    'TCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement(my_seq)
    Seq('TCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> reverse_complement(Seq("CGAUT"))
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    Both lower- and upper-case characters are supported, as are
    unambiguous and ambiguous nucleotides. All other characters are not
    converted:

    >>> reverse_complement("ACGTUacgtuXYZxyz")
    'zrxZRXaacgtAACGT'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq, inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    Strings and SeqRecord objects are rejected with a ``TypeError`` when
    ``inplace=True``. As ``Seq`` objects are immutable too, a
    ``TypeError`` is likewise raised if ``reverse_complement`` is called
    on a ``Seq`` object with ``inplace=True``.
    """
    # Imported here rather than at module level to avoid a circular import.
    from Bio.SeqRecord import SeqRecord

    if isinstance(sequence, (Seq, MutableSeq)):
        # The sequence classes handle the inplace flag themselves.
        result = sequence.reverse_complement(inplace)
    elif isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        result = sequence.reverse_complement()
    elif inplace:
        # Anything else is assumed to be a plain string.
        raise TypeError("strings are immutable")
    else:
        # Complement at the bytes level via the translation table, then
        # reverse the decoded text.
        complemented = sequence.encode("ASCII").translate(_dna_complement_table)
        result = complemented.decode("ASCII")[::-1]
    return result
jpayne@68	3085
jpayne@68	3086
def reverse_complement_rna(sequence, inplace=False):
    """Return the reverse complement of a sequence, expressed as RNA.

    The type of the result follows the type of the input: a string gives
    a new string, a Seq object gives a new Seq object, a MutableSeq gives
    a new MutableSeq object, and a SeqRecord object gives a new SeqRecord
    object.

    Arguments:
     - sequence - a string, Seq, MutableSeq or SeqRecord object.
     - inplace - if True, modify the sequence in place and return it.
       Only meaningful for mutable sequences (default False).

    >>> my_seq = "CGA"
    >>> reverse_complement_rna(my_seq)
    'UCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement_rna(my_seq)
    Seq('UCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> reverse_complement(Seq("CGAUT"), inplace=False)
    Seq('AATCG')

    Both lower- and upper-case characters are supported, as are
    unambiguous and ambiguous nucleotides. All other characters are not
    converted:

    >>> reverse_complement_rna("ACGTUacgtuXYZxyz")
    'zrxZRXaacguAACGU'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq, inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    Strings and SeqRecord objects are rejected with a ``TypeError`` when
    ``inplace=True``. As ``Seq`` objects are immutable too, a
    ``TypeError`` is likewise raised if ``reverse_complement_rna`` is
    called on a ``Seq`` object with ``inplace=True``.
    """
    # Imported here rather than at module level to avoid a circular import.
    from Bio.SeqRecord import SeqRecord

    if isinstance(sequence, (Seq, MutableSeq)):
        # The sequence classes handle the inplace flag themselves.
        result = sequence.reverse_complement_rna(inplace)
    elif isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        # NOTE(review): relies on SeqRecord providing a
        # reverse_complement_rna() method - confirm against Bio.SeqRecord.
        result = sequence.reverse_complement_rna()
    elif inplace:
        # Anything else is assumed to be a plain string.
        raise TypeError("strings are immutable")
    else:
        # Complement at the bytes level via the translation table, then
        # reverse the decoded text.
        complemented = sequence.encode("ASCII").translate(_rna_complement_table)
        result = complemented.decode("ASCII")[::-1]
    return result
jpayne@68	3150
jpayne@68	3151
def complement(sequence, inplace=False):
    """Return the complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    Arguments:
     - sequence - a string, Seq, MutableSeq or SeqRecord object.
     - inplace - if True, modify the sequence in place and return it.
       Only meaningful for mutable sequences (default False).

    >>> my_seq = "CGA"
    >>> complement(my_seq)
    'GCT'
    >>> my_seq = Seq("CGA")
    >>> complement(my_seq)
    Seq('GCT')
    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement("ACGTUacgtuXYZxyz")
    'TGCAAtgcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq, inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement`` is called on a string or a ``Seq`` object
    with ``inplace=True``. The same applies to SeqRecord objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # The sequence classes handle the inplace flag themselves.
        return sequence.complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        # NOTE(review): relies on SeqRecord providing a complement()
        # method - confirm against Bio.SeqRecord.
        return sequence.complement()
    # Assume it's a string.
    # Truthiness test (not ``is True``) so that any truthy inplace value is
    # rejected, matching the SeqRecord branch above and the sibling
    # reverse_complement / complement_rna functions.
    if inplace:
        raise TypeError("strings are immutable")
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_dna_complement_table)
    return sequence.decode("ASCII")
jpayne@68	3214
jpayne@68	3215
def complement_rna(sequence, inplace=False):
    """Return the complement as an RNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    Arguments:
     - sequence - a string, Seq, MutableSeq or SeqRecord object.
     - inplace - if True, modify the sequence in place and return it.
       Only meaningful for mutable sequences (default False).

    >>> my_seq = "CGA"
    >>> complement_rna(my_seq)
    'GCU'
    >>> my_seq = Seq("CGA")
    >>> complement_rna(my_seq)
    Seq('GCU')
    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement_rna("ACGTUacgtuXYZxyz")
    'UGCAAugcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq, inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement_rna`` is called on a string or a ``Seq``
    object with ``inplace=True``. The same applies to SeqRecord objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # The sequence classes handle the inplace flag themselves.
        return sequence.complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        # NOTE(review): relies on SeqRecord providing a complement_rna()
        # method - confirm against Bio.SeqRecord.
        return sequence.complement_rna()
    # Assume it's a string.
    if inplace:
        raise TypeError("strings are immutable")
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_rna_complement_table)
    return sequence.decode("ASCII")
jpayne@68	3278
jpayne@68	3279
def _test():
    """Run the doctests of the module executed as a script (PRIVATE).

    Prints a short banner before and after the run. Differences in
    exception detail are ignored when comparing expected tracebacks.
    """
    print("Running doctests...")
    from doctest import IGNORE_EXCEPTION_DETAIL, testmod

    # With no module argument, testmod() examines the __main__ module.
    testmod(optionflags=IGNORE_EXCEPTION_DETAIL)
    print("Done")
jpayne@68	3287
jpayne@68	3288
# Run the doctests only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    _test()

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py @ 68:5028fdace37b