csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

rev	line source
jpayne@69	1 # Copyright 2000 Andrew Dalke.
jpayne@69	2 # Copyright 2000-2002 Brad Chapman.
jpayne@69	3 # Copyright 2004-2005, 2010 by M de Hoon.
jpayne@69	4 # Copyright 2007-2023 by Peter Cock.
jpayne@69	5 # All rights reserved.
jpayne@69	6 #
jpayne@69	7 # This file is part of the Biopython distribution and governed by your
jpayne@69	8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@69	9 # Please see the LICENSE file that should have been included as part of this
jpayne@69	10 # package.
jpayne@69	11 """Provide objects to represent biological sequences.
jpayne@69	12
jpayne@69	13 See also the Seq_ wiki and the chapter in our tutorial:
jpayne@69	14 - `HTML Tutorial`_
jpayne@69	15 - `PDF Tutorial`_
jpayne@69	16
jpayne@69	17 .. _Seq: http://biopython.org/wiki/Seq
jpayne@69	18 .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
jpayne@69	19 .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
jpayne@69	20
jpayne@69	21 """
jpayne@69	22 import array
jpayne@69	23 import collections
jpayne@69	24 import numbers
jpayne@69	25 import warnings
jpayne@69	26
jpayne@69	27 from abc import ABC
jpayne@69	28 from abc import abstractmethod
jpayne@69	29 from typing import overload, Optional, Union, Dict
jpayne@69	30
jpayne@69	31 from Bio import BiopythonWarning
jpayne@69	32 from Bio.Data import CodonTable
jpayne@69	33 from Bio.Data import IUPACData
jpayne@69	34
jpayne@69	35
jpayne@69	36 def _maketrans(complement_mapping):
jpayne@69	37 """Make a python string translation table (PRIVATE).
jpayne@69	38
jpayne@69	39 Arguments:
jpayne@69	40 - complement_mapping - a dictionary such as ambiguous_dna_complement
jpayne@69	41 and ambiguous_rna_complement from Data.IUPACData.
jpayne@69	42
jpayne@69	43 Returns a translation table (a bytes object of length 256) for use with
jpayne@69	44 the python string's translate method to use in a (reverse) complement.
jpayne@69	45
jpayne@69	46 Compatible with lower case and upper case sequences.
jpayne@69	47
jpayne@69	48 For internal use only.
jpayne@69	49 """
jpayne@69	50 keys = "".join(complement_mapping.keys()).encode("ASCII")
jpayne@69	51 values = "".join(complement_mapping.values()).encode("ASCII")
jpayne@69	52 return bytes.maketrans(keys + keys.lower(), values + values.lower())
jpayne@69	53
jpayne@69	54
# Module-level complement translation tables. Each IUPAC mapping is copied and
# extended so that both "T" and "U" are present as keys before the table is
# built; the temporary dictionaries are removed from the namespace afterwards.
ambiguous_dna_complement = {
    **IUPACData.ambiguous_dna_complement,
    "U": IUPACData.ambiguous_dna_complement["T"],
}
_dna_complement_table = _maketrans(ambiguous_dna_complement)
del ambiguous_dna_complement
ambiguous_rna_complement = {
    **IUPACData.ambiguous_rna_complement,
    "T": IUPACData.ambiguous_rna_complement["U"],
}
_rna_complement_table = _maketrans(ambiguous_rna_complement)
del ambiguous_rna_complement
jpayne@69	63
jpayne@69	64
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

    * ``__len__`` must return the sequence length;
    * ``__getitem__`` must return

      * a ``bytes`` object for the requested region; or
      * a new instance of the subclass for the requested region; or
      * raise an ``UndefinedSequenceError``.

      Calling ``__getitem__`` for a sequence region of size zero should always
      return an empty ``bytes`` object.
      Calling ``__getitem__`` for the full sequence (as in data[:]) should
      either return a ``bytes`` object with the full sequence, or raise an
      ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    All other methods defined here convert the data to a ``bytes`` object
    (via ``__getitem__`` with a full slice) and delegate to the corresponding
    ``bytes`` method.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def __getitem__(self, key):
        pass

    def __bytes__(self):
        return self[:]

    def __hash__(self):
        return hash(bytes(self))

    def __eq__(self, other):
        return bytes(self) == other

    def __lt__(self, other):
        return bytes(self) < other

    def __le__(self, other):
        return bytes(self) <= other

    def __gt__(self, other):
        return bytes(self) > other

    def __ge__(self, other):
        return bytes(self) >= other

    def __add__(self, other):
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            # will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__
            return NotImplemented

    def __radd__(self, other):
        return other + bytes(self)

    def __mul__(self, other):
        return other * bytes(self)

    def __contains__(self, item):
        return bytes(self).__contains__(item)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present.

        Returns a ``bytes`` object; the data are returned unchanged if they
        do not start with ``prefix``.
        """
        data = bytes(self)
        try:
            return data.removeprefix(prefix)
        except AttributeError:
            # bytes.removeprefix needs Python 3.9+; emulate it on older versions.
            if data.startswith(prefix):
                return data[len(prefix) :]
            return data

    def removesuffix(self, suffix):
        """Remove the suffix if present.

        Returns a ``bytes`` object; the data are returned unchanged if they
        do not end with ``suffix``, or if ``suffix`` is empty.
        """
        data = bytes(self)
        try:
            return data.removesuffix(suffix)
        except AttributeError:
            # bytes.removesuffix needs Python 3.9+; emulate it on older versions.
            # The suffix must be matched at the END of the data, and an empty
            # suffix must be excluded explicitly because data[:-0] is empty.
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

        table
          Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        else:
            return ()
jpayne@69	360
jpayne@69	361
jpayne@69	362 class _SeqAbstractBaseClass(ABC):
jpayne@69	363 """Abstract base class for the Seq and MutableSeq classes (PRIVATE).
jpayne@69	364
jpayne@69	365 Most users will not need to use this class. It is used internally as an
jpayne@69	366 abstract base class for Seq and MutableSeq, as most of their methods are
jpayne@69	367 identical.
jpayne@69	368 """
jpayne@69	369
jpayne@69	370 __slots__ = ("_data",)
jpayne@69	371 __array_ufunc__ = None # turn off numpy Ufuncs
jpayne@69	372
jpayne@69	373 @abstractmethod
jpayne@69	374 def __init__(self):
jpayne@69	375 pass
jpayne@69	376
jpayne@69	377 def __bytes__(self):
jpayne@69	378 return bytes(self._data)
jpayne@69	379
def __repr__(self):
    """Return a (possibly truncated) representation of the sequence.

    Undefined sequence data are shown as ``Seq(None, length=...)``, and
    partially defined data as a dictionary of their defined fragments.
    Anything longer than 60 letters is abbreviated to its first 54 and
    last 3 letters.
    """
    data = self._data
    if isinstance(data, _UndefinedSequenceData):
        return f"Seq(None, length={len(self)})"
    if isinstance(data, _PartiallyDefinedSequenceData):
        fragments = {}
        for position, chunk in data._data.items():
            if len(chunk) > 60:
                head = chunk[:54].decode("ASCII")
                tail = chunk[-3:].decode("ASCII")
                fragments[position] = f"{head}...{tail}"
            else:
                fragments[position] = chunk.decode("ASCII")
        return "Seq(%r, length=%d)" % (fragments, len(self))
    if len(data) <= 60:
        text = data.decode("ASCII")
        return f"{self.__class__.__name__}('{text}')"
    # Shows the last three letters as it is often useful to see if
    # there is a stop codon at the end of a sequence.
    # Note total length is 54+3+3=60
    head = data[:54].decode("ASCII")
    tail = data[-3:].decode("ASCII")
    return f"{self.__class__.__name__}('{head}...{tail}')"
jpayne@69	406
jpayne@69	407 def __str__(self):
jpayne@69	408 """Return the full sequence as a python string."""
jpayne@69	409 return self._data.decode("ASCII")
jpayne@69	410
jpayne@69	411 def __eq__(self, other):
jpayne@69	412 """Compare the sequence to another sequence or a string.
jpayne@69	413
jpayne@69	414 Sequences are equal to each other if their sequence contents is
jpayne@69	415 identical:
jpayne@69	416
jpayne@69	417 >>> from Bio.Seq import Seq, MutableSeq
jpayne@69	418 >>> seq1 = Seq("ACGT")
jpayne@69	419 >>> seq2 = Seq("ACGT")
jpayne@69	420 >>> mutable_seq = MutableSeq("ACGT")
jpayne@69	421 >>> seq1 == seq2
jpayne@69	422 True
jpayne@69	423 >>> seq1 == mutable_seq
jpayne@69	424 True
jpayne@69	425 >>> seq1 == "ACGT"
jpayne@69	426 True
jpayne@69	427
jpayne@69	428 Note that the sequence objects themselves are not identical to each
jpayne@69	429 other:
jpayne@69	430
jpayne@69	431 >>> id(seq1) == id(seq2)
jpayne@69	432 False
jpayne@69	433 >>> seq1 is seq2
jpayne@69	434 False
jpayne@69	435
jpayne@69	436 Sequences can also be compared to strings, ``bytes``, and ``bytearray``
jpayne@69	437 objects:
jpayne@69	438
jpayne@69	439 >>> seq1 == "ACGT"
jpayne@69	440 True
jpayne@69	441 >>> seq1 == b"ACGT"
jpayne@69	442 True
jpayne@69	443 >>> seq1 == bytearray(b"ACGT")
jpayne@69	444 True
jpayne@69	445 """
jpayne@69	446 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69	447 return self._data == other._data
jpayne@69	448 elif isinstance(other, str):
jpayne@69	449 return self._data == other.encode("ASCII")
jpayne@69	450 else:
jpayne@69	451 return self._data == other
jpayne@69	452
jpayne@69	453 def __lt__(self, other):
jpayne@69	454 """Implement the less-than operand."""
jpayne@69	455 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69	456 return self._data < other._data
jpayne@69	457 elif isinstance(other, str):
jpayne@69	458 return self._data < other.encode("ASCII")
jpayne@69	459 else:
jpayne@69	460 return self._data < other
jpayne@69	461
jpayne@69	462 def __le__(self, other):
jpayne@69	463 """Implement the less-than or equal operand."""
jpayne@69	464 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69	465 return self._data <= other._data
jpayne@69	466 elif isinstance(other, str):
jpayne@69	467 return self._data <= other.encode("ASCII")
jpayne@69	468 else:
jpayne@69	469 return self._data <= other
jpayne@69	470
jpayne@69	471 def __gt__(self, other):
jpayne@69	472 """Implement the greater-than operand."""
jpayne@69	473 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69	474 return self._data > other._data
jpayne@69	475 elif isinstance(other, str):
jpayne@69	476 return self._data > other.encode("ASCII")
jpayne@69	477 else:
jpayne@69	478 return self._data > other
jpayne@69	479
jpayne@69	480 def __ge__(self, other):
jpayne@69	481 """Implement the greater-than or equal operand."""
jpayne@69	482 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69	483 return self._data >= other._data
jpayne@69	484 elif isinstance(other, str):
jpayne@69	485 return self._data >= other.encode("ASCII")
jpayne@69	486 else:
jpayne@69	487 return self._data >= other
jpayne@69	488
jpayne@69	489 def __len__(self):
jpayne@69	490 """Return the length of the sequence."""
jpayne@69	491 return len(self._data)
jpayne@69	492
jpayne@69	493 def __iter__(self):
jpayne@69	494 """Return an iterable of the sequence."""
jpayne@69	495 return self._data.decode("ASCII").__iter__()
jpayne@69	496
jpayne@69	497 @overload
jpayne@69	498 def __getitem__(self, index: int) -> str:
jpayne@69	499 ...
jpayne@69	500
jpayne@69	501 @overload
jpayne@69	502 def __getitem__(self, index: slice) -> "Seq":
jpayne@69	503 ...
jpayne@69	504
jpayne@69	505 def __getitem__(self, index):
jpayne@69	506 """Return a subsequence as a single letter or as a sequence object.
jpayne@69	507
jpayne@69	508 If the index is an integer, a single letter is returned as a Python
jpayne@69	509 string:
jpayne@69	510
jpayne@69	511 >>> seq = Seq('ACTCGACGTCG')
jpayne@69	512 >>> seq[5]
jpayne@69	513 'A'
jpayne@69	514
jpayne@69	515 Otherwise, a new sequence object of the same class is returned:
jpayne@69	516
jpayne@69	517 >>> seq[5:8]
jpayne@69	518 Seq('ACG')
jpayne@69	519 >>> mutable_seq = MutableSeq('ACTCGACGTCG')
jpayne@69	520 >>> mutable_seq[5:8]
jpayne@69	521 MutableSeq('ACG')
jpayne@69	522 """
jpayne@69	523 if isinstance(index, numbers.Integral):
jpayne@69	524 # Return a single letter as a string
jpayne@69	525 return chr(self._data[index])
jpayne@69	526 else:
jpayne@69	527 # Return the (sub)sequence as another Seq/MutableSeq object
jpayne@69	528 return self.__class__(self._data[index])
jpayne@69	529
jpayne@69	530 def __add__(self, other):
jpayne@69	531 """Add a sequence or string to this sequence.
jpayne@69	532
jpayne@69	533 >>> from Bio.Seq import Seq, MutableSeq
jpayne@69	534 >>> Seq("MELKI") + "LV"
jpayne@69	535 Seq('MELKILV')
jpayne@69	536 >>> MutableSeq("MELKI") + "LV"
jpayne@69	537 MutableSeq('MELKILV')
jpayne@69	538 """
jpayne@69	539 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69	540 return self.__class__(self._data + other._data)
jpayne@69	541 elif isinstance(other, str):
jpayne@69	542 return self.__class__(self._data + other.encode("ASCII"))
jpayne@69	543 else:
jpayne@69	544 # If other is a SeqRecord, then SeqRecord's __radd__ will handle
jpayne@69	545 # this. If not, returning NotImplemented will trigger a TypeError.
jpayne@69	546 return NotImplemented
jpayne@69	547
jpayne@69	548 def __radd__(self, other):
jpayne@69	549 """Add a sequence string on the left.
jpayne@69	550
jpayne@69	551 >>> from Bio.Seq import Seq, MutableSeq
jpayne@69	552 >>> "LV" + Seq("MELKI")
jpayne@69	553 Seq('LVMELKI')
jpayne@69	554 >>> "LV" + MutableSeq("MELKI")
jpayne@69	555 MutableSeq('LVMELKI')
jpayne@69	556
jpayne@69	557 Adding two sequence objects is handled via the __add__ method.
jpayne@69	558 """
jpayne@69	559 if isinstance(other, str):
jpayne@69	560 return self.__class__(other.encode("ASCII") + self._data)
jpayne@69	561 else:
jpayne@69	562 return NotImplemented
jpayne@69	563
jpayne@69	564 def __mul__(self, other):
jpayne@69	565 """Multiply sequence by integer.
jpayne@69	566
jpayne@69	567 >>> from Bio.Seq import Seq, MutableSeq
jpayne@69	568 >>> Seq('ATG') * 2
jpayne@69	569 Seq('ATGATG')
jpayne@69	570 >>> MutableSeq('ATG') * 2
jpayne@69	571 MutableSeq('ATGATG')
jpayne@69	572 """
jpayne@69	573 if not isinstance(other, numbers.Integral):
jpayne@69	574 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@69	575 # we would like to simply write
jpayne@69	576 # data = self._data * other
jpayne@69	577 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@69	578 # bytearray and other is a numpy integer. Using this workaround:
jpayne@69	579 data = self._data.__mul__(other)
jpayne@69	580 return self.__class__(data)
jpayne@69	581
jpayne@69	582 def __rmul__(self, other):
jpayne@69	583 """Multiply integer by sequence.
jpayne@69	584
jpayne@69	585 >>> from Bio.Seq import Seq
jpayne@69	586 >>> 2 * Seq('ATG')
jpayne@69	587 Seq('ATGATG')
jpayne@69	588 """
jpayne@69	589 if not isinstance(other, numbers.Integral):
jpayne@69	590 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@69	591 # we would like to simply write
jpayne@69	592 # data = self._data * other
jpayne@69	593 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@69	594 # bytearray and other is a numpy integer. Using this workaround:
jpayne@69	595 data = self._data.__mul__(other)
jpayne@69	596 return self.__class__(data)
jpayne@69	597
jpayne@69	598 def __imul__(self, other):
jpayne@69	599 """Multiply the sequence object by other and assign.
jpayne@69	600
jpayne@69	601 >>> from Bio.Seq import Seq
jpayne@69	602 >>> seq = Seq('ATG')
jpayne@69	603 >>> seq *= 2
jpayne@69	604 >>> seq
jpayne@69	605 Seq('ATGATG')
jpayne@69	606
jpayne@69	607 Note that this is different from in-place multiplication. The ``seq``
jpayne@69	608 variable is reassigned to the multiplication result, but any variable
jpayne@69	609 pointing to ``seq`` will remain unchanged:
jpayne@69	610
jpayne@69	611 >>> seq = Seq('ATG')
jpayne@69	612 >>> seq2 = seq
jpayne@69	613 >>> id(seq) == id(seq2)
jpayne@69	614 True
jpayne@69	615 >>> seq *= 2
jpayne@69	616 >>> seq
jpayne@69	617 Seq('ATGATG')
jpayne@69	618 >>> seq2
jpayne@69	619 Seq('ATG')
jpayne@69	620 >>> id(seq) == id(seq2)
jpayne@69	621 False
jpayne@69	622 """
jpayne@69	623 if not isinstance(other, numbers.Integral):
jpayne@69	624 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@69	625 # we would like to simply write
jpayne@69	626 # data = self._data * other
jpayne@69	627 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@69	628 # bytearray and other is a numpy integer. Using this workaround:
jpayne@69	629 data = self._data.__mul__(other)
jpayne@69	630 return self.__class__(data)
jpayne@69	631
def count(self, sub, start=None, end=None):
    """Return a non-overlapping count, like that of a python string.

    The number of occurrences of substring argument sub in the
    (sub)sequence given by [start:end] is returned as an integer.
    Optional arguments start and end are interpreted as in slice
    notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises TypeError if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count("A"))
    5
    >>> print(my_seq.count("ATG"))
    1
    >>> print(my_seq.count(Seq("AT")))
    1
    >>> print(my_seq.count("AT", 2, -1))
    1

    HOWEVER, please note because the ``count`` method of Seq and MutableSeq
    objects, like that of Python strings, do a non-overlapping search, this
    may not give the answer you expect:

    >>> "AAAA".count("AA")
    2
    >>> print(Seq("AAAA").count("AA"))
    2

    For an overlapping search, use the ``count_overlap`` method:

    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    """
    # Normalize the query to a bytes-like object before searching.
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.count(needle, start, end)
jpayne@69	684
def count_overlap(self, sub, start=None, end=None):
    """Return an overlapping count.

    Returns an integer, the number of occurrences of substring
    argument sub in the (sub)sequence given by [start:end].
    Optional arguments start and end are interpreted as in slice
    notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises TypeError if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g.

    >>> from Bio.Seq import Seq
    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    >>> print(Seq("ATATATATA").count_overlap("ATA"))
    4
    >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
    1

    For a non-overlapping search, use the ``count`` method:

    >>> print(Seq("AAAA").count("AA"))
    2

    Where substrings do not overlap, ``count_overlap`` behaves the same as
    the ``count`` method:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count_overlap("A"))
    5
    >>> my_seq.count_overlap("A") == my_seq.count("A")
    True
    >>> print(my_seq.count_overlap("ATG"))
    1
    >>> my_seq.count_overlap("ATG") == my_seq.count("ATG")
    True
    >>> print(my_seq.count_overlap(Seq("AT")))
    1
    >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
    True
    >>> print(my_seq.count_overlap("AT", 2, -1))
    1
    >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
    True

    HOWEVER, do not use this method for such cases because the
    count() method is much more efficient.
    """
    # Normalize the query to a bytes-like object before searching.
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    data = self._data
    overlap_count = 0
    position = data.find(needle, start, end)
    while position != -1:
        overlap_count += 1
        # Resume one position past the last hit so that overlapping
        # occurrences are found as well.
        position = data.find(needle, position + 1, end)
    return overlap_count
jpayne@69	757
def __contains__(self, item):
    """Report whether item occurs as a subsequence of this sequence.

    This implements the ``in`` operator. Seq and MutableSeq objects are
    converted to bytes and plain strings are ASCII-encoded before the
    underlying sequence data is searched.

    e.g.

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_dna = Seq("ATATGAAATTTGAAAA")
    >>> "AAA" in my_dna
    True
    >>> Seq("AAA") in my_dna
    True
    >>> MutableSeq("AAA") in my_dna
    True
    """
    if isinstance(item, _SeqAbstractBaseClass):
        needle = bytes(item)
    elif isinstance(item, str):
        needle = item.encode("ASCII")
    else:
        # Anything else is handed to the underlying data unchanged.
        needle = item
    return needle in self._data
jpayne@69	777
def find(self, sub, start=None, end=None):
    """Return the lowest index at which subsequence sub occurs, or -1.

    The search can be restricted to the region [start:end] of the sequence
    by giving the optional start and end arguments.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    -1 is returned when the subsequence is NOT found. A TypeError is raised
    if sub is not a Seq, MutableSeq, str, bytes, or bytearray object.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.find("AUG")
    3

    Starting the search at position 4 locates the next typical start
    codon:

    >>> my_rna.find("AUG", 4)
    15

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    haystack = self._data
    if isinstance(sub, _SeqAbstractBaseClass):
        return haystack.find(bytes(sub), start, end)
    if isinstance(sub, str):
        return haystack.find(sub.encode("ASCII"), start, end)
    if isinstance(sub, (bytes, bytearray)):
        return haystack.find(sub, start, end)
    raise TypeError(
        "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
        % type(sub)
    )
jpayne@69	818
def rfind(self, sub, start=None, end=None):
    """Return the highest index at which subsequence sub occurs, or -1.

    The search can be restricted to the region [start:end] of the sequence
    by giving the optional start and end arguments.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    -1 is returned when the subsequence is NOT found. A TypeError is raised
    if sub is not a Seq, MutableSeq, str, bytes, or bytearray object.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rfind("AUG")
    15

    Ending the search at position 15 locates the typical start codon
    before that one:

    >>> my_rna.rfind("AUG", end=15)
    3

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, _SeqAbstractBaseClass):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rfind(needle, start, end)
jpayne@69	859
def index(self, sub, start=None, end=None):
    """Return the lowest index at which subsequence sub occurs.

    The search can be restricted to the region [start:end] of the sequence
    by giving the optional start and end arguments.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError if
    sub is not a Seq, MutableSeq, str, bytes, or bytearray object.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.index("AUG")
    3

    Starting the search at position 4 locates the next typical start
    codon:

    >>> my_rna.index("AUG", 4)
    15

    The search itself is the same as that of the ``find`` method; the two
    differ only when nothing is found, where ``find`` returns -1 and
    ``index`` raises a ValueError:

    >>> my_rna.index("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.find("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, MutableSeq):
        # use the mutable sequence's underlying data directly
        target = sub._data
    elif isinstance(sub, Seq):
        target = bytes(sub)
    elif isinstance(sub, str):
        target = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        target = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.index(target, start, end)
jpayne@69	913
def rindex(self, sub, start=None, end=None):
    """Return the highest index at which subsequence sub occurs.

    The search can be restricted to the region [start:end] of the sequence
    by giving the optional start and end arguments.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError if
    sub is not a Seq, MutableSeq, str, bytes, or bytearray object.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rindex("AUG")
    15

    Ending the search at position 15 locates the typical start codon
    before that one:

    >>> my_rna.rindex("AUG", end=15)
    3

    The search itself is the same as that of the ``rfind`` method; the two
    differ only when nothing is found, where ``rfind`` returns -1 while
    ``rindex`` raises a ValueError:

    >>> my_rna.rindex("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.rfind("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, MutableSeq):
        # use the mutable sequence's underlying data directly
        target = sub._data
    elif isinstance(sub, Seq):
        target = bytes(sub)
    elif isinstance(sub, str):
        target = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        target = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rindex(target, start, end)
jpayne@69	967
jpayne@69	968 def search(self, subs):
jpayne@69	969 """Search the substrings subs in self and yield the index and substring found.
jpayne@69	970
jpayne@69	971 Arguments:
jpayne@69	972 - subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
jpayne@69	973 objects containing the substrings to search for.
jpayne@69	974
jpayne@69	975 >>> from Bio.Seq import Seq
jpayne@69	976 >>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
jpayne@69	977 >>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
jpayne@69	978 >>> for index, substring in matches:
jpayne@69	979 ... print(index, substring)
jpayne@69	980 ...
jpayne@69	981 7 CC
jpayne@69	982 9 ATTG
jpayne@69	983 20 CC
jpayne@69	984 34 CC
jpayne@69	985 34 CCC
jpayne@69	986 35 CC
jpayne@69	987 """
jpayne@69	988 subdict = collections.defaultdict(set)
jpayne@69	989 for index, sub in enumerate(subs):
jpayne@69	990 if isinstance(sub, (_SeqAbstractBaseClass, bytearray)):
jpayne@69	991 sub = bytes(sub)
jpayne@69	992 elif isinstance(sub, str):
jpayne@69	993 sub = sub.encode("ASCII")
jpayne@69	994 elif not isinstance(sub, bytes):
jpayne@69	995 raise TypeError(
jpayne@69	996 "subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
jpayne@69	997 % (index, type(sub))
jpayne@69	998 )
jpayne@69	999 length = len(sub)
jpayne@69	1000 subdict[length].add(sub)
jpayne@69	1001 for start in range(len(self) - 1):
jpayne@69	1002 for length, subs in subdict.items():
jpayne@69	1003 stop = start + length
jpayne@69	1004 for sub in subs:
jpayne@69	1005 if self._data[start:stop] == sub:
jpayne@69	1006 yield (start, sub.decode())
jpayne@69	1007 break
jpayne@69	1008
def startswith(self, prefix, start=None, end=None):
    """Return True if the sequence starts with the given prefix, False otherwise.

    Return True if the sequence starts with the specified prefix
    (a string, a bytes-like object, or another Seq or MutableSeq object),
    False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    prefix can also be a tuple of such objects to try; the result is True
    if any one of them matches. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.startswith("GUC")
    True
    >>> my_rna.startswith("AUG")
    False
    >>> my_rna.startswith("AUG", 3)
    True
    >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
    True
    """

    def _as_bytes(candidate):
        # Encode text and convert sequence objects; bytes-like objects (and
        # anything unrecognised) are passed through so that the underlying
        # data's startswith decides whether they are acceptable.
        if isinstance(candidate, str):
            return candidate.encode("ASCII")
        if isinstance(candidate, (bytes, bytearray)):
            return candidate
        if isinstance(candidate, _SeqAbstractBaseClass):
            return bytes(candidate)
        return candidate

    if isinstance(prefix, tuple):
        prefix = tuple(_as_bytes(p) for p in prefix)
    else:
        prefix = _as_bytes(prefix)
    return self._data.startswith(prefix, start, end)
jpayne@69	1039
def endswith(self, suffix, start=None, end=None):
    """Return True if the sequence ends with the given suffix, False otherwise.

    Return True if the sequence ends with the specified suffix
    (a string, a bytes-like object, or another Seq or MutableSeq object),
    False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    suffix can also be a tuple of such objects to try; the result is True
    if any one of them matches. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.endswith("UUG")
    True
    >>> my_rna.endswith("AUG")
    False
    >>> my_rna.endswith("AUG", 0, 18)
    True
    >>> my_rna.endswith(("UCC", "UCA", "UUG"))
    True
    """

    def _as_bytes(candidate):
        # Encode text and convert sequence objects; bytes-like objects (and
        # anything unrecognised) are passed through so that the underlying
        # data's endswith decides whether they are acceptable.
        if isinstance(candidate, str):
            return candidate.encode("ASCII")
        if isinstance(candidate, (bytes, bytearray)):
            return candidate
        if isinstance(candidate, _SeqAbstractBaseClass):
            return bytes(candidate)
        return candidate

    if isinstance(suffix, tuple):
        suffix = tuple(_as_bytes(s) for s in suffix)
    else:
        suffix = _as_bytes(suffix)
    return self._data.endswith(suffix, start, end)
jpayne@69	1070
def split(self, sep=None, maxsplit=-1):
    """Return a list of subsequences when splitting the sequence by separator sep.

    The sequence is cut at every occurrence of the delimiter sep and the
    pieces are returned as a list of Seq objects. At most maxsplit splits
    are done if maxsplit is given; if it is omitted, all splits are made.

    For consistency with the ``split`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.split("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')

    See also the rsplit method, which splits the sequence starting from the
    end:

    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')
    """
    if isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    elif isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    else:
        # None (split on whitespace) or a bytes-like separator
        delimiter = sep
    pieces = self._data.split(delimiter, maxsplit)
    return list(map(Seq, pieces))
jpayne@69	1113
def rsplit(self, sep=None, maxsplit=-1):
    """Return a list of subsequences by splitting the sequence from the right.

    The sequence is cut at occurrences of the delimiter sep, working from
    the end of the sequence, and the pieces are returned as a list of Seq
    objects. At most maxsplit splits are done if maxsplit is given; if it
    is omitted, all splits are made.

    For consistency with the ``rsplit`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.rsplit("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')

    See also the split method, which splits the sequence starting from the
    beginning:

    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')
    """
    if isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    elif isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    else:
        # None (split on whitespace) or a bytes-like separator
        delimiter = sep
    pieces = self._data.rsplit(delimiter, maxsplit)
    return list(map(Seq, pieces))
jpayne@69	1156
def strip(self, chars=None, inplace=False):
    """Return a sequence object with leading and trailing ends stripped.

    By default, whitespace is removed from both ends:

    >>> seq = Seq(" ACGT ")
    >>> seq.strip()
    Seq('ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars`` are
    removed instead. ``chars`` is treated as a set of characters, so their
    order is not important:

    >>> Seq("ACGTACGT").strip("TGCA")
    Seq('')

    With ``inplace`` set to ``False`` (the default value) a stripped copy is
    returned and the sequence itself is left untouched. With ``inplace``
    set to ``True``, the sequence is stripped in-place and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.strip()
    MutableSeq('ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.strip(inplace=True)
    MutableSeq('ACGT')
    >>> seq
    MutableSeq('ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
    is called on a ``Seq`` object with ``inplace=True``.

    See also the lstrip and rstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    elif isinstance(chars, str):
        chars = chars.encode("ASCII")
    try:
        stripped = self._data.strip(chars)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # only bytearray-backed (mutable) sequences can be changed in place
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@69	1210
def lstrip(self, chars=None, inplace=False):
    """Return a sequence object with leading end stripped.

    By default, whitespace is removed from the leading end:

    >>> seq = Seq(" ACGT ")
    >>> seq.lstrip()
    Seq('ACGT ')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars`` are
    removed from the leading end instead. ``chars`` is treated as a set of
    characters, so their order is not important:

    >>> Seq("ACGACGTTACG").lstrip("GCA")
    Seq('TTACG')

    With ``inplace`` set to ``False`` (the default value) a stripped copy is
    returned and the sequence itself is left untouched. With ``inplace``
    set to ``True``, the sequence is stripped in-place and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.lstrip()
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.lstrip(inplace=True)
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq('ACGT ')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and rstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    elif isinstance(chars, str):
        chars = chars.encode("ASCII")
    try:
        stripped = self._data.lstrip(chars)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # only bytearray-backed (mutable) sequences can be changed in place
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@69	1265
def rstrip(self, chars=None, inplace=False):
    """Return a sequence object with trailing ends stripped.

    By default, whitespace is removed from the trailing end:

    >>> seq = Seq(" ACGT ")
    >>> seq.rstrip()
    Seq(' ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars`` are
    removed from the trailing end instead. ``chars`` is treated as a set of
    characters, so their order is not important:

    >>> Seq("ACGACGTTACG").rstrip("GCA")
    Seq('ACGACGTT')

    With ``inplace`` set to ``False`` (the default value) a stripped copy is
    returned and the sequence itself is left untouched. With ``inplace``
    set to ``True``, the sequence is stripped in-place and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.rstrip()
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.rstrip(inplace=True)
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``rstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and lstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    elif isinstance(chars, str):
        chars = chars.encode("ASCII")
    try:
        stripped = self._data.rstrip(chars)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # only bytearray-backed (mutable) sequences can be changed in place
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@69	1320
def removeprefix(self, prefix, inplace=False):
    """Return a new Seq object with prefix (left) removed.

    The behaviour mirrors that of the python string method of the same
    name: the prefix is dropped if the sequence starts with it, and the
    sequence is otherwise left unchanged.

    e.g. Removing a start Codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("ATGGTGTGTGT")
    >>> my_seq
    Seq('ATGGTGTGTGT')
    >>> my_seq.removeprefix('ATG')
    Seq('GTGTGTGT')

    With ``inplace=True`` the sequence is modified in-place and returned.
    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``.

    See also the removesuffix method.
    """
    if isinstance(prefix, _SeqAbstractBaseClass):
        prefix = bytes(prefix)
    elif isinstance(prefix, str):
        prefix = prefix.encode("ASCII")
    try:
        trimmed = self._data.removeprefix(prefix)
    except TypeError:
        raise TypeError(
            "argument must be a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    except AttributeError:
        # Fall back for pre-Python 3.9
        trimmed = self._data
        if trimmed.startswith(prefix):
            trimmed = trimmed[len(prefix) :]
    if not inplace:
        return self.__class__(trimmed)
    # only bytearray-backed (mutable) sequences can be changed in place
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = trimmed
    return self
jpayne@69	1362
jpayne@69	1363 def removesuffix(self, suffix, inplace=False):
jpayne@69	1364 """Return a new Seq object with suffix (right) removed.
jpayne@69	1365
jpayne@69	1366 This behaves like the python string method of the same name.
jpayne@69	1367
jpayne@69	1368 e.g. Removing a stop codon:
jpayne@69	1369
jpayne@69	1370 >>> from Bio.Seq import Seq
jpayne@69	1371 >>> my_seq = Seq("GTGTGTGTTAG")
jpayne@69	1372 >>> my_seq
jpayne@69	1373 Seq('GTGTGTGTTAG')
jpayne@69	1374 >>> stop_codon = Seq("TAG")
jpayne@69	1375 >>> my_seq.removesuffix(stop_codon)
jpayne@69	1376 Seq('GTGTGTGT')
jpayne@69	1377
jpayne@69	1378 As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@69	1379 ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@69	1380
jpayne@69	1381 See also the removeprefix method.
jpayne@69	1382 """
jpayne@69	1383 if isinstance(suffix, _SeqAbstractBaseClass):
jpayne@69	1384 suffix = bytes(suffix)
jpayne@69	1385 elif isinstance(suffix, str):
jpayne@69	1386 suffix = suffix.encode("ASCII")
jpayne@69	1387 try:
jpayne@69	1388 data = self._data.removesuffix(suffix)
jpayne@69	1389 except TypeError:
jpayne@69	1390 raise TypeError(
jpayne@69	1391 "argument must be a string, Seq, MutableSeq, or bytes-like object"
jpayne@69	1392 ) from None
jpayne@69	1393 except AttributeError:
jpayne@69	1394 # Fall back for pre-Python 3.9
jpayne@69	1395 data = self._data
jpayne@69	1396 if data.endswith(suffix):
jpayne@69	1397 data = data[: -len(suffix)]
jpayne@69	1398 if inplace:
jpayne@69	1399 if not isinstance(self._data, bytearray):
jpayne@69	1400 raise TypeError("Sequence is immutable")
jpayne@69	1401 self._data[:] = data
jpayne@69	1402 return self
jpayne@69	1403 else:
jpayne@69	1404 return self.__class__(data)
jpayne@69	1405
def upper(self, inplace=False):
    """Return the sequence in upper case.

    With inplace set to False, the default value, the sequence itself is
    left untouched and an upper-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With inplace set to True, the sequence is modified in-place and
    returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``upper`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``lower`` method.
    """
    converted = self._data.upper()
    if not inplace:
        return self.__class__(converted)
    # only bytearray-backed (mutable) sequences can be changed in place
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = converted
    return self
jpayne@69	1457
def lower(self, inplace=False):
    """Return the sequence in lower case.

    With inplace set to False, the default value, the sequence itself is
    left untouched and a lower-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With inplace set to True, the sequence is modified in-place and
    returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lower`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``upper`` method.
    """
    converted = self._data.lower()
    if not inplace:
        return self.__class__(converted)
    # only bytearray-backed (mutable) sequences can be changed in place
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = converted
    return self
jpayne@69	1509
def isupper(self):
    """Return True if all ASCII characters in data are uppercase.

    The answer is False when the data holds no cased characters at all.
    """
    data = self._data
    return data.isupper()
jpayne@69	1516
def islower(self):
    """Return True if all ASCII characters in data are lowercase.

    The answer is False when the data holds no cased characters at all.
    """
    data = self._data
    return data.islower()
jpayne@69	1523
def translate(
    self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
):
    """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.

    This method will translate DNA or RNA sequences. It should not
    be used on protein sequences as any result will be biologically
    meaningless.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable
       object (useful for non-standard genetic codes).  This
       defaults to the "Standard" table.
     - stop_symbol - Single character string, what to use for
       terminators.  This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons (translated as the
       specified stop_symbol).  If True, translation is terminated at
       the first in frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
       this checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to the minus sign.

    A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
    object; a ``MutableSeq`` object is returned if ``translate`` is called
    on a ``MutableSeq`` object.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*')
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@')
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR')

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*')
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR')

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR')

    It isn't a valid CDS under NCBI table 1, due to both the start codon
    and also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR')
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR')

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead
    """
    try:
        data = str(self)
    except UndefinedSequenceError:
        # translating an undefined sequence yields an undefined
        # sequence with the length divided by 3
        n = len(self)
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "This may become an error in future.",
                BiopythonWarning,
            )
        return Seq(None, n // 3)

    # Reuse the string obtained above rather than converting the
    # sequence to a string a second time.
    return self.__class__(
        _translate_str(data, table, stop_symbol, to_stop, cds, gap=gap)
    )
jpayne@69	1623
def complement(self, inplace=False):
    """Return the complement as a DNA sequence.

    >>> Seq("CGA").complement()
    Seq('GCT')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").complement()
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement()
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement(inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        data = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if inplace:
        # Only bytearray-backed (mutable) sequences can be overwritten.
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    return self.__class__(data)
jpayne@69	1671
def complement_rna(self, inplace=False):
    """Return the complement as an RNA sequence.

    >>> Seq("CGA").complement_rna()
    Seq('GCU')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence by default:

    >>> Seq("CGA").complement()
    Seq('GCT')

    With ``inplace=True`` the sequence itself is overwritten and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement_rna()
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement_rna(inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # An undefined sequence is its own complement: same length,
        # still undefined.
        return self
    if not inplace:
        return self.__class__(complemented)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
jpayne@69	1718
def reverse_complement(self, inplace=False):
    """Return the reverse complement as a DNA sequence.

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").reverse_complement()
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    With ``inplace=True`` the sequence itself is overwritten and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement()
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement(inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # An undefined sequence stays undefined (and keeps its length)
        # under reverse complementation.
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Writing through the reversed slice stores the complement back to front.
    self._data[::-1] = complemented
    return self
jpayne@69	1766
def reverse_complement_rna(self, inplace=False):
    """Return the reverse complement as an RNA sequence.

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").reverse_complement_rna()
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    With ``inplace=True`` the sequence itself is overwritten and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement_rna()
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement_rna(inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # An undefined sequence stays undefined (and keeps its length)
        # under reverse complementation.
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Writing through the reversed slice stores the complement back to front.
    self._data[::-1] = complemented
    return self
jpayne@69	1814
def transcribe(self, inplace=False):
    """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. This
    means we can get the RNA sequence just by switching T to U.

    >>> from Bio.Seq import Seq
    >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> coding_dna.transcribe()
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    With ``inplace=True`` the sequence itself is overwritten and returned:

    >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence.transcribe()
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    >>> sequence.transcribe(inplace=True)
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to transcribe an RNA sequence has no effect.
    If you have a nucleotide sequence which might be DNA or RNA
    (or even a mixture), calling the transcribe method will ensure
    any T becomes U.

    Trying to transcribe a protein sequence will replace any
    T for Threonine with U for Selenocysteine, which has no
    biologically plausible rationale.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRT")
    >>> my_protein.transcribe()
    Seq('MAIVMGRU')
    """
    # Upper-case letters first, then lower-case, so case is preserved.
    upper_done = self._data.replace(b"T", b"U")
    rna = upper_done.replace(b"t", b"u")
    if not inplace:
        return self.__class__(rna)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = rna
    return self
jpayne@69	1868
def back_transcribe(self, inplace=False):
    """Return the DNA sequence from an RNA sequence by creating a new Seq object.

    >>> from Bio.Seq import Seq
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence.back_transcribe()
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    >>> sequence.back_transcribe(inplace=True)
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``back_transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to back-transcribe DNA has no effect. If you have a nucleotide
    sequence which might be DNA or RNA (or even a mixture), calling the
    back-transcribe method will ensure any U becomes T.

    Trying to back-transcribe a protein sequence will replace any U for
    Selenocysteine with T for Threonine, which is biologically meaningless.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRU")
    >>> my_protein.back_transcribe()
    Seq('MAIVMGRT')
    """
    # Replace upper- and lower-case U separately so case is preserved.
    data = self._data.replace(b"U", b"T").replace(b"u", b"t")
    if inplace:
        # Only bytearray-backed (mutable) sequences can be overwritten.
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    return self.__class__(data)
jpayne@69	1916
def join(self, other):
    """Return a merge of the sequences in other, spaced by the sequence from self.

    Accepts a Seq object, MutableSeq object, or string (and iterates over
    the letters), or an iterable containing Seq, MutableSeq, or string
    objects. These arguments will be concatenated with the calling sequence
    as the spacer:

    >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
    >>> concatenated
    Seq('AAANNNNNTTTNNNNNPPP')

    Joining the letters of a single sequence:

    >>> Seq('NNNNN').join(Seq("ACGT"))
    Seq('ANNNNNCNNNNNGNNNNNT')
    >>> Seq('NNNNN').join("ACGT")
    Seq('ANNNNNCNNNNNGNNNNNT')

    The iterable is consumed exactly once, so one-shot iterators and
    generators are supported as well:

    >>> Seq('NNNNN').join(iter([Seq("AAA"), "TTT"]))
    Seq('AAANNNNNTTT')

    A ``TypeError`` is raised if ``other`` is a SeqRecord, or if any element
    of the iterable is not a Seq object, MutableSeq object, or string.
    """
    if isinstance(other, _SeqAbstractBaseClass):
        return self.__class__(str(self).join(str(other)))
    elif isinstance(other, str):
        return self.__class__(str(self).join(other))

    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(other, SeqRecord):
        raise TypeError("Iterable cannot be a SeqRecord")

    # Validate and collect in a single pass: iterating ``other`` twice
    # would leave nothing to join when it is a generator or iterator.
    pieces = []
    for c in other:
        if isinstance(c, SeqRecord):
            raise TypeError("Iterable cannot contain SeqRecords")
        elif not isinstance(c, (str, _SeqAbstractBaseClass)):
            raise TypeError(
                "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
            )
        pieces.append(str(c))
    return self.__class__(str(self).join(pieces))
jpayne@69	1954
def replace(self, old, new, inplace=False):
    """Return a copy with all occurrences of subsequence old replaced by new.

    >>> s = Seq("ACGTAACCGGTT")
    >>> t = s.replace("AC", "XYZ")
    >>> s
    Seq('ACGTAACCGGTT')
    >>> t
    Seq('XYZGTAXYZCGGTT')

    On a mutable sequence, ``inplace=True`` rewrites the sequence itself:

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ")
    >>> m
    MutableSeq('ACGTAACCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ", inplace=True)
    >>> m
    MutableSeq('XYZGTAXYZCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``replace`` is called on a ``Seq`` object with ``inplace=True``.
    """

    def _as_bytes(piece):
        # Sequence objects and strings are converted to bytes; anything
        # else is handed to the underlying data unchanged.
        if isinstance(piece, _SeqAbstractBaseClass):
            return bytes(piece)
        if isinstance(piece, str):
            return piece.encode("ASCII")
        return piece

    old_bytes = _as_bytes(old)
    new_bytes = _as_bytes(new)
    replaced = self._data.replace(old_bytes, new_bytes)
    if not inplace:
        return self.__class__(replaced)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = replaced
    return self
jpayne@69	1999
@property
def defined(self):
    """Return True if the sequence is defined, False if undefined or partially defined.

    Sequences stored as ``bytes`` or ``bytearray`` are always defined; for
    any other storage the answer is read from the data object's own
    ``defined`` attribute.

    Zero-length sequences are always considered to be defined.
    """
    raw = self._data
    if not isinstance(raw, (bytes, bytearray)):
        return raw.defined
    return True
jpayne@69	2010
@property
def defined_ranges(self):
    """Return a tuple of the ranges where the sequence contents is defined.

    The return value has the format ((start1, end1), (start2, end2), ...).

    For ``bytes``/``bytearray`` storage this is the single range covering
    the whole sequence, or an empty tuple for a zero-length sequence; for
    any other storage the data object's ``defined_ranges`` is returned.
    """
    raw = self._data
    if not isinstance(raw, (bytes, bytearray)):
        return raw.defined_ranges
    size = len(self)
    return ((0, size),) if size > 0 else ()
jpayne@69	2025
jpayne@69	2026
class Seq(_SeqAbstractBaseClass):
    """Read-only sequence object (essentially a string with biological methods).

    Like normal python strings, our basic sequence object is immutable.
    This prevents you from doing my_seq[5] = "A" for example, but does allow
    Seq objects to be used as dictionary keys.

    The Seq object provides a number of string like methods (such as count,
    find, split and strip).

    The Seq object also provides some biological methods, such as complement,
    reverse_complement, transcribe, back_transcribe and translate (which are
    not applicable to protein sequences).
    """

    _data: Union[bytes, SequenceDataAbstractBaseClass]

    def __init__(
        self,
        data: Union[
            str,
            bytes,
            bytearray,
            _SeqAbstractBaseClass,
            SequenceDataAbstractBaseClass,
            dict,
            None,
        ],
        length: Optional[int] = None,
    ):
        """Create a Seq object.

        Arguments:
         - data - Sequence, required (string)
         - length - Sequence length, used only if data is None or a dictionary (integer)

        You will typically use Bio.SeqIO to read in sequences from files as
        SeqRecord objects, whose sequence will be exposed as a Seq object via
        the seq property.

        However, you can also create a Seq object directly:

        >>> from Bio.Seq import Seq
        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
        >>> my_seq
        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
        >>> print(my_seq)
        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF

        To create a Seq object for a sequence of known length but
        unknown sequence contents, use None for the data argument and pass
        the sequence length for the length argument. Trying to access the
        sequence contents of a Seq object created in this way will raise
        an UndefinedSequenceError:

        >>> my_undefined_sequence = Seq(None, 20)
        >>> my_undefined_sequence
        Seq(None, length=20)
        >>> len(my_undefined_sequence)
        20
        >>> print(my_undefined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is undefined

        If the sequence contents is known for parts of the sequence only, use
        a dictionary for the data argument to pass the known sequence segments:

        >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
        >>> my_partially_defined_sequence
        Seq({3: 'ACGT'}, length=10)
        >>> len(my_partially_defined_sequence)
        10
        >>> print(my_partially_defined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
        >>> my_partially_defined_sequence[3:7]
        Seq('ACGT')
        >>> print(my_partially_defined_sequence[3:7])
        ACGT

        A ``ValueError`` is raised if ``length`` is missing or negative when
        it is required, or if the dictionary segments start at a negative
        position, overlap, extend beyond ``length``, or hold values that are
        neither strings nor bytes-like objects. A ``TypeError`` is raised for
        any other type of ``data``.
        """
        if data is None:
            if length is None:
                raise ValueError("length must not be None if data is None")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                self._data = _UndefinedSequenceData(length)
        elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
            self._data = data
        elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
            self._data = bytes(data)
        elif isinstance(data, str):
            self._data = bytes(data, encoding="ASCII")
        elif isinstance(data, dict):
            if length is None:
                raise ValueError("length must not be None if data is a dictionary")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                current = 0  # not needed here, but it keeps mypy happy
                end = -1  # sentinel: no segment has been seen yet
                starts = sorted(data.keys())
                segments: Dict[int, bytes] = {}
                for start in starts:
                    seq = data[start]
                    if isinstance(seq, str):
                        seq = bytes(seq, encoding="ASCII")
                    else:
                        try:
                            seq = bytes(seq)
                        except Exception:
                            raise ValueError(
                                "Expected bytes-like objects or strings"
                            ) from None
                    if start < 0:
                        # Reject negative positions explicitly; otherwise -1
                        # would collide with the initial ``end`` sentinel.
                        raise ValueError(
                            "Sequence data start positions must not be negative."
                        )
                    if start < end:
                        raise ValueError("Sequence data are overlapping.")
                    elif start == end:
                        # Segment is adjacent to the previous one: merge them.
                        segments[current] += seq
                    else:
                        segments[start] = seq
                        current = start
                    end = start + len(seq)
                if end > length:
                    raise ValueError(
                        "Provided sequence data extend beyond sequence length."
                    )
                elif end == length and current == 0:
                    # sequence is fully defined
                    self._data = segments[current]
                else:
                    self._data = _PartiallyDefinedSequenceData(length, segments)
        else:
            raise TypeError(
                "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
            )

    def __hash__(self):
        """Hash of the sequence as a string for comparison.

        See Seq object comparison documentation (method ``__eq__`` in
        particular) as this has changed in Biopython 1.65. Older versions
        would hash on object identity.
        """
        return hash(self._data)
jpayne@69	2175
jpayne@69	2176
jpayne@69	2177 class MutableSeq(_SeqAbstractBaseClass):
jpayne@69	2178 """An editable sequence object.
jpayne@69	2179
jpayne@69	2180 Unlike normal python strings and our basic sequence object (the Seq class)
jpayne@69	2181 which are immutable, the MutableSeq lets you edit the sequence in place.
jpayne@69	2182 However, this means you cannot use a MutableSeq object as a dictionary key.
jpayne@69	2183
jpayne@69	2184 >>> from Bio.Seq import MutableSeq
jpayne@69	2185 >>> my_seq = MutableSeq("ACTCGTCGTCG")
jpayne@69	2186 >>> my_seq
jpayne@69	2187 MutableSeq('ACTCGTCGTCG')
jpayne@69	2188 >>> my_seq[5]
jpayne@69	2189 'T'
jpayne@69	2190 >>> my_seq[5] = "A"
jpayne@69	2191 >>> my_seq
jpayne@69	2192 MutableSeq('ACTCGACGTCG')
jpayne@69	2193 >>> my_seq[5]
jpayne@69	2194 'A'
jpayne@69	2195 >>> my_seq[5:8] = "NNN"
jpayne@69	2196 >>> my_seq
jpayne@69	2197 MutableSeq('ACTCGNNNTCG')
jpayne@69	2198 >>> len(my_seq)
jpayne@69	2199 11
jpayne@69	2200
jpayne@69	2201 Note that the MutableSeq object does not support as many string-like
jpayne@69	2202 or biological methods as the Seq object.
jpayne@69	2203 """
jpayne@69	2204
def __init__(self, data):
    """Create a MutableSeq object.

    A ``bytearray`` is stored as given (not copied); ``bytes``, strings
    (encoded as ASCII), ``Seq`` objects and other ``MutableSeq`` objects
    are copied into a new ``bytearray``. Any other type raises a
    ``TypeError``.
    """
    if isinstance(data, bytearray):
        buffer = data
    elif isinstance(data, bytes):
        buffer = bytearray(data)
    elif isinstance(data, str):
        buffer = bytearray(data, "ASCII")
    elif isinstance(data, MutableSeq):
        # Slice to take a copy, so the two objects do not share storage.
        buffer = data._data[:]
    elif isinstance(data, Seq):
        # Go through bytes() so nothing is assumed about how a Seq
        # subclass stores its contents.
        buffer = bytearray(bytes(data))
    else:
        raise TypeError(
            "data should be a string, bytearray object, Seq object, or a "
            "MutableSeq object"
        )
    self._data = buffer
jpayne@69	2223
def __setitem__(self, index, value):
    """Set a subsequence or a single letter via the value parameter.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq[0] = 'T'
    >>> my_seq
    MutableSeq('TCTCGACGTCG')

    With an integer index, ``value`` must be a single character. With any
    other index (such as a slice), ``value`` must be a MutableSeq, Seq or
    string; otherwise a ``TypeError`` is raised.
    """
    if isinstance(index, numbers.Integral):
        # Single position: store the code point of the new letter.
        self._data[index] = ord(value)
        return
    # Otherwise a sub-sequence is being replaced.
    if isinstance(value, MutableSeq):
        replacement = value._data
    elif isinstance(value, Seq):
        replacement = bytes(value)
    elif isinstance(value, str):
        replacement = value.encode("ASCII")
    else:
        raise TypeError(f"received unexpected type '{type(value).__name__}'")
    self._data[index] = replacement
jpayne@69	2245
def __delitem__(self, index):
    """Delete a subsequence or a single letter.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> del my_seq[0]
    >>> my_seq
    MutableSeq('CTCGACGTCG')
    """
    # ``index`` is passed straight to the underlying data, so it may be
    # an integer position or a slice.
    del self._data[index]
jpayne@69	2256
def append(self, c):
    """Add a single letter to the end of the mutable sequence object.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.append('A')
    >>> my_seq
    MutableSeq('ACTCGACGTCGA')

    No return value.
    """
    encoded = c.encode("ASCII")
    code = ord(encoded)
    self._data.append(code)
jpayne@69	2268
def insert(self, i, c):
    """Add a single letter to the mutable sequence object at a given index.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.insert(0,'A')
    >>> my_seq
    MutableSeq('AACTCGACGTCG')
    >>> my_seq.insert(8,'G')
    >>> my_seq
    MutableSeq('AACTCGACGGTCG')

    No return value.
    """
    encoded = c.encode("ASCII")
    code = ord(encoded)
    self._data.insert(i, code)
jpayne@69	2283
def pop(self, i=-1):
    """Remove the letter at the given index and return it.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.pop()
    'G'
    >>> my_seq
    MutableSeq('ACTCGACGTC')
    >>> my_seq.pop()
    'C'
    >>> my_seq
    MutableSeq('ACTCGACGT')

    Arguments:
     - i - Index of the letter to remove; defaults to -1, the last letter.

    Returns the removed letter as a single-character string. An
    ``IndexError`` is raised if the index is out of range.
    """
    # bytearray.pop removes the byte and hands it back in one step.
    return chr(self._data.pop(i))
jpayne@69	2302
def remove(self, item):
    """Remove the first occurrence of a single letter from the sequence.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.remove('C')
    >>> my_seq
    MutableSeq('ATCGACGTCG')
    >>> my_seq.remove('A')
    >>> my_seq
    MutableSeq('TCGACGTCG')

    Raises a ValueError if the letter does not occur in the sequence.

    No return value.
    """
    target = ord(item)
    try:
        self._data.remove(target)
    except ValueError:
        # Replace the buffer's own message with one that names MutableSeq.
        raise ValueError("value not found in MutableSeq") from None
jpayne@69	2321
def reverse(self):
    """Reverse the order of the letters of the mutable sequence, in place.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.reverse()
    >>> my_seq
    MutableSeq('GCTGCAGCTCA')

    No return value.
    """
    buffer = self._data
    buffer.reverse()
jpayne@69	2328
def extend(self, other):
    """Append the letters of another sequence to this mutable sequence.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.extend('A')
    >>> my_seq
    MutableSeq('ACTCGACGTCGA')
    >>> my_seq.extend('TTT')
    >>> my_seq
    MutableSeq('ACTCGACGTCGATTT')

    The argument may be a string, a Seq or a MutableSeq; any other type
    raises a TypeError.

    No return value.
    """
    # Work out the bytes to add first, then extend the buffer once.
    if isinstance(other, MutableSeq):
        payload = other._data
    elif isinstance(other, Seq):
        payload = bytes(other)
    elif isinstance(other, str):
        payload = other.encode("ASCII")
    else:
        raise TypeError("expected a string, Seq or MutableSeq")
    self._data.extend(payload)
jpayne@69	2350
jpayne@69	2351
class UndefinedSequenceError(ValueError):
    """Raised when sequence contents are requested but are not defined."""
jpayne@69	2354
jpayne@69	2355
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores the length of a sequence with an undefined sequence contents (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but an unknown sequence contents.
    Calling __len__ returns the sequence length, calling __getitem__ raises an
    UndefinedSequenceError except for requests of zero size, for which it
    returns an empty bytes object.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Initialize the object with the sequence length.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        super().__init__()

    def __getitem__(self, key: slice) -> Union[bytes, "_UndefinedSequenceData"]:
        # Only slices can be served; any other key asks for actual
        # sequence contents, which are not available.
        if not isinstance(key, slice):
            raise UndefinedSequenceError("Sequence content is undefined")
        first, last, stride = key.indices(self._length)
        count = len(range(first, last, stride))
        # A zero-size request yields empty bytes; anything longer is
        # again an undefined sequence of the sliced length.
        return _UndefinedSequenceData(count) if count else b""

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        total = len(self) + len(other)
        try:
            suffix = bytes(other)
        except UndefinedSequenceError:
            if not isinstance(other, _UndefinedSequenceData):
                # _PartiallyDefinedSequenceData.__radd__ will handle this
                return NotImplemented
            return _UndefinedSequenceData(total)
        # Defined contents appended after an undefined stretch: the
        # defined segment starts where this sequence ends.
        return _PartiallyDefinedSequenceData(total, {len(self): suffix})

    def __radd__(self, other):
        # Defined contents followed by this undefined stretch.
        prefix = bytes(other)
        total = len(other) + len(self)
        return _PartiallyDefinedSequenceData(total, {0: prefix})

    def upper(self):
        """Return an upper case copy of the sequence."""
        # Changing case cannot define anything: the copy is an undefined
        # sequence of the same length.
        size = self._length
        return _UndefinedSequenceData(size)

    def lower(self):
        """Return a lower case copy of the sequence."""
        # Changing case cannot define anything: the copy is an undefined
        # sequence of the same length.
        size = self._length
        return _UndefinedSequenceData(size)

    def isupper(self):
        """Raise UndefinedSequenceError, as the letter case is unknown."""
        # Character case is irrelevant for an undefined sequence
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Raise UndefinedSequenceError, as the letter case is unknown."""
        # Character case is irrelevant for an undefined sequence
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # If old and new have the same number of characters, the result
        # is an undefined sequence of unchanged length. Otherwise the
        # resulting length cannot be known, and an error is raised.
        if len(old) == len(new):
            return _UndefinedSequenceData(self._length)
        raise UndefinedSequenceError("Sequence content is undefined")

    @property
    def defined(self):
        """Return False, as the sequence is not defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        As the sequence contents of an _UndefinedSequenceData object is fully
        undefined, the return value is always an empty tuple.
        """
        return ()
jpayne@69	2462
jpayne@69	2463
class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores a sequence of known length with partially defined contents (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but with a sequence contents that is only
    partially known.
    Calling __len__ returns the sequence length, calling __getitem__ returns
    the sequence contents if known, otherwise an UndefinedSequenceError is
    raised.

    The defined segments are kept in ``_data``, a dictionary that maps the
    start position of each defined segment to the contents of that segment.
    Several methods treat the last inserted item as the right-most segment,
    so the dictionary is expected to be ordered by start position.
    """

    __slots__ = ("_length", "_data")

    def __init__(self, length, data):
        """Initialize with the sequence length and defined sequence segments.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        self._data = data
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[bytes, SequenceDataAbstractBaseClass]:
        """Return the contents selected by an integer index or a slice.

        For a slice, the return value is an empty bytes object if the slice
        is empty, a bytes object if the slice is fully defined, an
        _UndefinedSequenceData object if nothing in the slice is defined,
        and a _PartiallyDefinedSequenceData object otherwise.

        For an integer index, negative values count from the end of the
        sequence. An IndexError is raised if the index is out of range, and
        an UndefinedSequenceError if the position is not defined.
        """
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            data = {}
            for s, d in self._data.items():
                # Express the requested positions relative to the start
                # of this segment.
                indices = range(-s, -s + self._length)[key]
                e: Optional[int] = indices.stop
                assert e is not None
                if step > 0:
                    if e <= 0:
                        continue
                    if indices.start < 0:
                        s = indices.start % step
                    else:
                        s = indices.start
                else:  # step < 0
                    if e < 0:
                        e = None
                    end = len(d) - 1
                    if indices.start > end:
                        s = end + (indices.start - end) % step
                    else:
                        s = indices.start
                    if s < 0:
                        continue
                start = (s - indices.start) // step
                d = d[s:e:step]
                if d:
                    data[start] = d
            if len(data) == 0:  # Fully undefined sequence
                return _UndefinedSequenceData(size)
            # merge adjacent sequence segments
            end = -1
            previous = 0  # not needed here, but it keeps flake happy
            items = data.items()
            data = {}
            for start, seq in items:
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
            if len(data) == 1:
                seq = data.get(0)
                if seq is not None and len(seq) == size:
                    return seq  # Fully defined sequence; return bytes
            if step < 0:
                # use this after we drop Python 3.7:
                # data = {start: data[start] for start in reversed(data)}
                # use this as long as we support Python 3.7:
                data = {start: data[start] for start in reversed(list(data.keys()))}
            return _PartiallyDefinedSequenceData(size, data)
        # Integer index. Negative values count from the end, as for other
        # Python sequences; without this step, a defined letter could not
        # be retrieved using a negative index.
        position = key
        if position < 0:
            position += self._length
        if position < 0 or self._length <= position:
            raise IndexError("sequence index out of range")
        for start, seq in self._data.items():
            if start <= position < start + len(seq):
                return seq[position - start]
        raise UndefinedSequenceError(
            "Sequence at position %d is undefined" % position
        )

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def __add__(self, other):
        """Return the concatenation of this sequence followed by other."""
        length = len(self) + len(other)
        data = dict(self._data)
        items = list(self._data.items())
        # The last item is taken to be the right-most defined segment.
        start, seq = items[-1]
        end = start + len(seq)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                # Nothing defined is added; only the length grows.
                pass
            elif isinstance(other, _PartiallyDefinedSequenceData):
                other_items = list(other._data.items())
                if end == len(self):
                    # This sequence ends with a defined segment; join it
                    # to the first segment of other if that starts at 0.
                    other_start, other_seq = other_items.pop(0)
                    if other_start == 0:
                        data[start] += other_seq
                    else:
                        data[len(self) + other_start] = other_seq
                for other_start, other_seq in other_items:
                    data[len(self) + other_start] = other_seq
        else:
            # other is fully defined
            if end == len(self):
                data[start] += other
            else:
                data[len(self)] = other
        return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        """Return the concatenation of other followed by this sequence."""
        length = len(other) + len(self)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            # Nothing defined is prepended; shift the segments only.
            data = {len(other) + start: seq for start, seq in self._data.items()}
        else:
            data = {0: other}
            items = list(self._data.items())
            start, seq = items.pop(0)
            if start == 0:
                # The first segment directly follows other; join them.
                data[0] += seq
            else:
                data[len(other) + start] = seq
            for start, seq in items:
                data[len(other) + start] = seq
        return _PartiallyDefinedSequenceData(length, data)

    def __mul__(self, other):
        """Return this sequence repeated other times."""
        length = self._length
        items = self._data.items()
        data = {}
        end = -1
        previous = 0  # not needed here, but it keeps flake happy
        for i in range(other):
            for start, seq in items:
                start += i * length
                # Join a segment to the previous one if they are adjacent,
                # which can happen where two copies meet.
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
        return _PartiallyDefinedSequenceData(length * other, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        data = {start: seq.upper() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def lower(self):
        """Return a lower case copy of the sequence."""
        data = {start: seq.lower() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def isupper(self):
        """Raise UndefinedSequenceError, as not all letters are known."""
        # The letter case cannot be determined for the undefined parts
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def islower(self):
        """Raise UndefinedSequenceError, as not all letters are known."""
        # The letter case cannot be determined for the undefined parts
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

        table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        items = self._data.items()
        data = {start: seq.translate(table, delete) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # Replacing substring old by new in the undefined sequence segments
        # will result in an undefined sequence segment of the same length, if
        # old and new have the same number of characters. If not, an error is
        # raised, as the correct start positions cannot be calculated reliably.
        if len(old) != len(new):
            raise UndefinedSequenceError(
                "Sequence content is only partially defined; substring \n"
                "replacement cannot be performed reliably"
            )
        items = self._data.items()
        data = {start: seq.replace(old, new) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    @property
    def defined(self):
        """Return False, as the sequence is not fully defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        return tuple((start, start + len(seq)) for start, seq in self._data.items())
jpayne@69	2688
jpayne@69	2689
jpayne@69	2690 # The transcribe, back_transcribe, and translate functions are
jpayne@69	2691 # user-friendly versions of the corresponding Seq/MutableSeq methods.
jpayne@69	2692 # The functions work both on Seq objects, and on strings.
jpayne@69	2693
jpayne@69	2694
def transcribe(dna):
    """Transcribe a DNA sequence into RNA.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. This
    means we can get the RNA sequence just by switching T to U.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> transcribe("ACTGN")
    'ACUGN'
    """
    if isinstance(dna, Seq):
        return dna.transcribe()
    if isinstance(dna, MutableSeq):
        # Wrap in a Seq first, so that a Seq is returned as documented.
        return Seq(dna).transcribe()
    # Anything else is treated as a string; handle both letter cases.
    upper_replaced = dna.replace("T", "U")
    return upper_replaced.replace("t", "u")
jpayne@69	2717
jpayne@69	2718
def back_transcribe(rna):
    """Return the RNA sequence back-transcribed into DNA.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> back_transcribe("ACUGN")
    'ACTGN'
    """
    if isinstance(rna, Seq):
        return rna.back_transcribe()
    if isinstance(rna, MutableSeq):
        # Wrap in a Seq first, so that a Seq is returned as documented.
        return Seq(rna).back_transcribe()
    # Anything else is treated as a string; handle both letter cases.
    upper_replaced = rna.replace("U", "T")
    return upper_replaced.replace("u", "t")
jpayne@69	2737
jpayne@69	2738
def _translate_str(
    sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
):
    """Translate nucleotide string into a protein string (PRIVATE).

    Arguments:
     - sequence - a string
     - table - Which codon table to use?  This can be either a name (string),
       an NCBI identifier (integer), or a CodonTable object (useful for
       non-standard genetic codes).
     - stop_symbol - a single character string, what to use for terminators.
     - to_stop - boolean, should translation terminate at the first
       in frame stop codon?  If there is no in-frame stop codon
       then translation continues to the end.
     - pos_stop - a single character string for a possible stop codon
       (e.g. TAN or NNN)
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.

    Returns a string.

    e.g.

    >>> from Bio.Data import CodonTable
    >>> table = CodonTable.ambiguous_dna_by_id[1]
    >>> _translate_str("AAA", table)
    'K'
    >>> _translate_str("TAR", table)
    '*'
    >>> _translate_str("TAN", table)
    'X'
    >>> _translate_str("TAN", table, pos_stop="@")
    '@'
    >>> _translate_str("TA?", table)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid

    In a change to older versions of Biopython, partial codons are now
    always regarded as an error (previously only checked if cds=True)
    and will trigger a warning (likely to become an exception in a
    future release).

    If cds=True, the start and stop codons are checked, and the start
    codon will be translated as methionine. The sequence must be a
    whole number of codons.

    >>> _translate_str("ATGCCCTAG", table, cds=True)
    'MP'
    >>> _translate_str("AAACCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
    >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found.
    """
    # Step 1: turn the table argument into a codon table object.
    try:
        table_id = int(table)
    except ValueError:
        # Not a number, so assume it's a table name.
        # The same table can be used for RNA or DNA.
        try:
            codon_table = CodonTable.ambiguous_generic_by_name[table]
        except KeyError:
            if not isinstance(table, str):
                raise TypeError("table argument must be integer or string") from None
            raise ValueError(
                "The Bio.Seq translate methods and function DO NOT "
                "take a character string mapping table like the python "
                "string object's translate method. "
                "Use str(my_seq).translate(...) instead."
            ) from None
    except (AttributeError, TypeError):
        # int() could not handle it at all; only a CodonTable object is
        # acceptable here.
        if not isinstance(table, CodonTable.CodonTable):
            raise ValueError("Bad table argument") from None
        codon_table = table
    else:
        # A number, so assume it's a table ID.
        # The same table can be used for RNA or DNA.
        codon_table = CodonTable.ambiguous_generic_by_id[table_id]

    # Step 2: gather what is needed for the codon lookups.
    sequence = sequence.upper()
    protein = []
    forward_table = codon_table.forward_table
    stop_codons = codon_table.stop_codons
    alphabet = codon_table.nucleotide_alphabet
    if alphabet is None:
        # Assume the worst case, ambiguous DNA or RNA:
        valid_letters = set(
            IUPACData.ambiguous_dna_letters.upper()
            + IUPACData.ambiguous_rna_letters.upper()
        )
    else:
        valid_letters = set(alphabet.upper())
    length = len(sequence)

    # Step 3: check for tables with 'ambiguous' (dual-coding) stop codons.
    dual_codons = [stop for stop in stop_codons if stop in forward_table]
    if dual_codons:
        count = len(dual_codons)
        example = dual_codons[0]
        residue = forward_table[example]
        if to_stop:
            raise ValueError(
                "You cannot use 'to_stop=True' with this table as it contains"
                f" {count} codon(s) which can be both STOP and an"
                f" amino acid (e.g. '{example}' -> '{residue}' or STOP)."
            )
        warnings.warn(
            f"This table contains {count} codon(s) which code(s) for"
            f" both STOP and an amino acid (e.g. '{example}' -> '{residue}'"
            " or STOP). Such codons will be translated as amino acid.",
            BiopythonWarning,
        )

    # Step 4: validate a complete CDS, or warn about a partial codon.
    if cds:
        first_codon = sequence[:3]
        last_codon = sequence[-3:]
        if str(first_codon).upper() not in codon_table.start_codons:
            raise CodonTable.TranslationError(
                f"First codon '{first_codon}' is not a start codon"
            )
        if length % 3 != 0:
            raise CodonTable.TranslationError(
                f"Sequence length {length} is not a multiple of three"
            )
        if str(last_codon).upper() not in stop_codons:
            raise CodonTable.TranslationError(
                f"Final codon '{last_codon}' is not a stop codon"
            )
        # Don't translate the stop symbol, and manually translate the M
        sequence = sequence[3:-3]
        length -= 6
        protein.append("M")
    elif length % 3 != 0:
        warnings.warn(
            "Partial codon, len(sequence) not a multiple of three. "
            "Explicitly trim the sequence or add trailing N before "
            "translation. This may become an error in future.",
            BiopythonWarning,
        )

    # Step 5: validate the gap character.
    if gap is not None:
        if not isinstance(gap, str):
            raise TypeError("Gap character should be a single character string.")
        if len(gap) > 1:
            raise ValueError("Gap character should be a single character string.")

    # Step 6: translate codon by codon, ignoring any trailing partial codon.
    for offset in range(0, length - length % 3, 3):
        codon = sequence[offset : offset + 3]
        try:
            protein.append(forward_table[codon])
        except (KeyError, CodonTable.TranslationError):
            # Not a codon for an amino acid; find out what it is instead.
            if codon in stop_codons:
                if cds:
                    raise CodonTable.TranslationError(
                        f"Extra in frame stop codon '{codon}' found."
                    ) from None
                if to_stop:
                    break
                protein.append(stop_symbol)
            elif valid_letters.issuperset(set(codon)):
                # Possible stop codon (e.g. NNN or TAN)
                protein.append(pos_stop)
            elif gap is not None and codon == gap * 3:
                # Gapped translation
                protein.append(gap)
            else:
                raise CodonTable.TranslationError(
                    f"Codon '{codon}' is invalid"
                ) from None
    return "".join(protein)
jpayne@69	2916
jpayne@69	2917
def translate(
    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object. Given a Seq or
    MutableSeq, returns a Seq object.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable object
       (useful for non-standard genetic codes).  Defaults to the "Standard"
       table.
     - stop_symbol - Single character string, what to use for any
       terminators, defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons
       (translated as the specified stop_symbol).  If
       True, translation is terminated at the first in
       frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None. For a Seq or MutableSeq, a value of None means
       the gap argument is not passed on, so that the translate method
       of the Seq object uses its own default.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    It will however translate either DNA or RNA.

    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
    stop codons'. These are stop codons with unambiguous sequence but which
    have a context dependent coding as STOP or as amino acid. With these tables
    'to_stop' must be False (otherwise a ValueError is raised). The dual
    coding codons will always be translated as amino acid, except for
    'cds=True', where the last codon will be translated as STOP.

    >>> coding_dna3 = "ATGGCACGGAAGTGA"
    >>> translate(coding_dna3)
    'MARK*'

    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
    'MARKW'

    It will however raise a BiopythonWarning (not shown).

    >>> translate(coding_dna3, table=27, cds=True)
    'MARK'

    >>> translate(coding_dna3, table=27, to_stop=True)
    Traceback (most recent call last):
       ...
    ValueError: You cannot use 'to_stop=True' with this table ...
    """
    if isinstance(sequence, Seq):
        seq_object = sequence
    elif isinstance(sequence, MutableSeq):
        # Convert to a Seq, so that a Seq object is returned
        seq_object = Seq(sequence)
    else:
        # Assume it's a string, return a string
        return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)
    if gap is None:
        # No gap character requested: leave the method's own default alone.
        return seq_object.translate(table, stop_symbol, to_stop, cds)
    # Pass on an explicitly requested gap character instead of dropping it.
    # NOTE(review): assumes the translate method of Seq accepts a ``gap``
    # keyword argument - confirm against its definition earlier in this file.
    return seq_object.translate(table, stop_symbol, to_stop, cds, gap=gap)
jpayne@69	3020
jpayne@69	3021
def reverse_complement(sequence, inplace=False):
    """Return the reverse complement as a DNA sequence.

    The type of the result mirrors the type of the input: a string gives
    a new string, a Seq gives a new Seq, a MutableSeq gives a new
    MutableSeq, and a SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement(my_seq)
    'TCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement(my_seq)
    Seq('TCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> reverse_complement(Seq("CGAUT"))
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    Both lower- and upper-case characters are supported, as are
    unambiguous and ambiguous nucleotides. All other characters are not
    converted:

    >>> reverse_complement("ACGTUacgtuXYZxyz")
    'zrxZRXaacgtAACGT'

    With ``inplace=True`` a MutableSeq is modified in-place and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq, inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement`` is called on a string or a ``Seq``
    object with ``inplace=True``. The same applies to SeqRecord objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Delegate to the object's own method, passing inplace through.
        return sequence.reverse_complement(inplace)
    if not isinstance(sequence, SeqRecord):
        # Anything that is not a sequence object or record is assumed to
        # be a plain string.
        if inplace:
            raise TypeError("strings are immutable")
        # The translation table works on bytes, hence the ASCII round trip.
        complemented = sequence.encode("ASCII").translate(_dna_complement_table)
        return complemented.decode("ASCII")[::-1]
    if inplace:
        raise TypeError("SeqRecords are immutable")
    return sequence.reverse_complement()
jpayne@69	3085
jpayne@69	3086
def reverse_complement_rna(sequence, inplace=False):
    """Return the reverse complement as an RNA sequence.

    The type of the result mirrors the type of the input: a string gives
    a new string, a Seq gives a new Seq, a MutableSeq gives a new
    MutableSeq, and a SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement_rna(my_seq)
    'UCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement_rna(my_seq)
    Seq('UCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> reverse_complement(Seq("CGAUT"), inplace=False)
    Seq('AATCG')

    Both lower- and upper-case characters are supported, as are
    unambiguous and ambiguous nucleotides. All other characters are not
    converted:

    >>> reverse_complement_rna("ACGTUacgtuXYZxyz")
    'zrxZRXaacguAACGU'

    With ``inplace=True`` a MutableSeq is modified in-place and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq, inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement_rna`` is called on a string or a
    ``Seq`` object with ``inplace=True``. The same applies to SeqRecord
    objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Delegate to the object's own method, passing inplace through.
        return sequence.reverse_complement_rna(inplace)
    if not isinstance(sequence, SeqRecord):
        # Anything that is not a sequence object or record is assumed to
        # be a plain string.
        if inplace:
            raise TypeError("strings are immutable")
        # The translation table works on bytes, hence the ASCII round trip.
        complemented = sequence.encode("ASCII").translate(_rna_complement_table)
        return complemented.decode("ASCII")[::-1]
    if inplace:
        raise TypeError("SeqRecords are immutable")
    return sequence.reverse_complement_rna()
jpayne@69	3150
jpayne@69	3151
def complement(sequence, inplace=False):
    """Return the complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement(my_seq)
    'GCT'
    >>> my_seq = Seq("CGA")
    >>> complement(my_seq)
    Seq('GCT')
    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement("ACGTUacgtuXYZxyz")
    'TGCAAtgcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq, inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement`` is called on a string or a ``Seq`` object
    with ``inplace=True``. The same applies to SeqRecord objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Delegate to the object's own method, passing inplace through.
        return sequence.complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement()
    # Assume it's a string.
    # Use plain truthiness (not ``is True``) so that any truthy ``inplace``
    # value is rejected, matching the SeqRecord branch above and the other
    # complement helpers in this module.
    if inplace:
        raise TypeError("strings are immutable")
    # The translation table works on bytes, hence the ASCII round trip.
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_dna_complement_table)
    return sequence.decode("ASCII")
jpayne@69	3214
jpayne@69	3215
def complement_rna(sequence, inplace=False):
    """Return the complement as an RNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement_rna(my_seq)
    'GCU'
    >>> my_seq = Seq("CGA")
    >>> complement_rna(my_seq)
    Seq('GCU')
    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement_rna("ACGTUacgtuXYZxyz")
    'UGCAAugcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq, inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement_rna`` is called on a string or a ``Seq``
    object with ``inplace=True``. The same applies to SeqRecord objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Delegate to the object's own method, passing inplace through.
        return sequence.complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement_rna()
    # Assume it's a string.
    if inplace:
        raise TypeError("strings are immutable")
    # The translation table works on bytes, hence the ASCII round trip.
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_rna_complement_table)
    return sequence.decode("ASCII")
jpayne@69	3278
jpayne@69	3279
def _test():
    """Run the Bio.Seq module's doctests (PRIVATE).

    Prints a short banner before and after the run.
    """
    import doctest

    print("Running doctests...")
    # Compare only the exception type in expected tracebacks, not the
    # full message text.
    flags = doctest.IGNORE_EXCEPTION_DETAIL
    doctest.testmod(optionflags=flags)
    print("Done")
jpayne@69	3287
jpayne@69	3288
if __name__ == "__main__":
    # Executed as a script rather than imported: run the module doctests.
    _test()

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py @ 69:33d812a61356