annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 # Copyright 2000 Andrew Dalke.
jpayne@69 2 # Copyright 2000-2002 Brad Chapman.
jpayne@69 3 # Copyright 2004-2005, 2010 by M de Hoon.
jpayne@69 4 # Copyright 2007-2023 by Peter Cock.
jpayne@69 5 # All rights reserved.
jpayne@69 6 #
jpayne@69 7 # This file is part of the Biopython distribution and governed by your
jpayne@69 8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@69 9 # Please see the LICENSE file that should have been included as part of this
jpayne@69 10 # package.
jpayne@69 11 """Provide objects to represent biological sequences.
jpayne@69 12
jpayne@69 13 See also the Seq_ wiki and the chapter in our tutorial:
jpayne@69 14 - `HTML Tutorial`_
jpayne@69 15 - `PDF Tutorial`_
jpayne@69 16
jpayne@69 17 .. _Seq: http://biopython.org/wiki/Seq
jpayne@69 18 .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
jpayne@69 19 .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
jpayne@69 20
jpayne@69 21 """
jpayne@69 22 import array
jpayne@69 23 import collections
jpayne@69 24 import numbers
jpayne@69 25 import warnings
jpayne@69 26
jpayne@69 27 from abc import ABC
jpayne@69 28 from abc import abstractmethod
jpayne@69 29 from typing import overload, Optional, Union, Dict
jpayne@69 30
jpayne@69 31 from Bio import BiopythonWarning
jpayne@69 32 from Bio.Data import CodonTable
jpayne@69 33 from Bio.Data import IUPACData
jpayne@69 34
jpayne@69 35
jpayne@69 36 def _maketrans(complement_mapping):
jpayne@69 37 """Make a python string translation table (PRIVATE).
jpayne@69 38
jpayne@69 39 Arguments:
jpayne@69 40 - complement_mapping - a dictionary such as ambiguous_dna_complement
jpayne@69 41 and ambiguous_rna_complement from Data.IUPACData.
jpayne@69 42
jpayne@69 43 Returns a translation table (a bytes object of length 256) for use with
jpayne@69 44 the python string's translate method to use in a (reverse) complement.
jpayne@69 45
jpayne@69 46 Compatible with lower case and upper case sequences.
jpayne@69 47
jpayne@69 48 For internal use only.
jpayne@69 49 """
jpayne@69 50 keys = "".join(complement_mapping.keys()).encode("ASCII")
jpayne@69 51 values = "".join(complement_mapping.values()).encode("ASCII")
jpayne@69 52 return bytes.maketrans(keys + keys.lower(), values + values.lower())
jpayne@69 53
jpayne@69 54
# Translation tables used for (reverse) complementing (PRIVATE).
# Each table starts from the IUPAC ambiguous complement mapping and gains one
# extra key, so that the DNA table also accepts "U" (treated like "T") and
# the RNA table also accepts "T" (treated like "U"). The temporary mapping is
# deleted afterwards to keep it out of the module namespace.
_complement_mapping = dict(
    IUPACData.ambiguous_dna_complement,
    U=IUPACData.ambiguous_dna_complement["T"],
)
_dna_complement_table = _maketrans(_complement_mapping)
_complement_mapping = dict(
    IUPACData.ambiguous_rna_complement,
    T=IUPACData.ambiguous_rna_complement["U"],
)
_rna_complement_table = _maketrans(_complement_mapping)
del _complement_mapping
jpayne@69 63
jpayne@69 64
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

    * ``__len__`` must return the sequence length;
    * ``__getitem__`` must return

      * a ``bytes`` object for the requested region; or
      * a new instance of the subclass for the requested region; or
      * raise an ``UndefinedSequenceError``.

      Calling ``__getitem__`` for a sequence region of size zero should always
      return an empty ``bytes`` object.
      Calling ``__getitem__`` for the full sequence (as in data[:]) should
      either return a ``bytes`` object with the full sequence, or raise an
      ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    All other methods defined here convert the full sequence to ``bytes``
    (via ``__getitem__``) and delegate to the corresponding ``bytes`` method.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        # Sanity check of the subclass implementation: a zero-length slice
        # must compare equal to an empty bytes object.
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        """Return the sequence length (to be implemented by subclasses)."""

    @abstractmethod
    def __getitem__(self, key):
        """Return the sequence data for key (to be implemented by subclasses)."""

    def __bytes__(self):
        """Return the full sequence by slicing the provider with ``[:]``."""
        return self[:]

    def __hash__(self):
        """Return the hash of the full sequence as a ``bytes`` object."""
        return hash(bytes(self))

    def __eq__(self, other):
        """Compare the full sequence, as ``bytes``, to other for equality."""
        return bytes(self) == other

    def __lt__(self, other):
        """Return True if the full sequence, as ``bytes``, is less than other."""
        return bytes(self) < other

    def __le__(self, other):
        """Return True if the full sequence, as ``bytes``, is at most other."""
        return bytes(self) <= other

    def __gt__(self, other):
        """Return True if the full sequence, as ``bytes``, is greater than other."""
        return bytes(self) > other

    def __ge__(self, other):
        """Return True if the full sequence, as ``bytes``, is at least other."""
        return bytes(self) >= other

    def __add__(self, other):
        """Return the concatenation of this data and other as ``bytes``.

        Returns NotImplemented if converting either operand to ``bytes``
        raises an UndefinedSequenceError.
        """
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            return NotImplemented
            # will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__

    def __radd__(self, other):
        """Return other concatenated with the full sequence as ``bytes``."""
        return other + bytes(self)

    def __mul__(self, other):
        """Return the full sequence, as ``bytes``, repeated other times."""
        return other * bytes(self)

    def __contains__(self, item):
        """Return True if item is contained in the full sequence."""
        return bytes(self).__contains__(item)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raise ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present.

        Returns the data as ``bytes``, without the leading prefix if the data
        starts with it, and unchanged otherwise.
        """
        # Want to do just this, but need Python 3.9+
        # return bytes(self).removeprefix(prefix)
        data = bytes(self)
        try:
            return data.removeprefix(prefix)
        except AttributeError:
            # bytes.removeprefix is not available before Python 3.9
            if data.startswith(prefix):
                return data[len(prefix) :]
            else:
                return data

    def removesuffix(self, suffix):
        """Remove the suffix if present.

        Returns the data as ``bytes``, without the trailing suffix if the data
        ends with it, and unchanged otherwise (including for an empty suffix).
        """
        # Want to do just this, but need Python 3.9+
        # return bytes(self).removesuffix(suffix)
        data = bytes(self)
        try:
            return data.removesuffix(suffix)
        except AttributeError:
            # bytes.removesuffix is not available before Python 3.9.
            # The suffix must be tested at the END of the data, and an empty
            # suffix must be skipped because data[:-0] is an empty slice.
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            else:
                return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

        table
          Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        else:
            return ()
jpayne@69 360
jpayne@69 361
jpayne@69 362 class _SeqAbstractBaseClass(ABC):
jpayne@69 363 """Abstract base class for the Seq and MutableSeq classes (PRIVATE).
jpayne@69 364
jpayne@69 365 Most users will not need to use this class. It is used internally as an
jpayne@69 366 abstract base class for Seq and MutableSeq, as most of their methods are
jpayne@69 367 identical.
jpayne@69 368 """
jpayne@69 369
jpayne@69 370 __slots__ = ("_data",)
jpayne@69 371 __array_ufunc__ = None # turn off numpy Ufuncs
jpayne@69 372
jpayne@69 373 @abstractmethod
jpayne@69 374 def __init__(self):
jpayne@69 375 pass
jpayne@69 376
jpayne@69 377 def __bytes__(self):
jpayne@69 378 return bytes(self._data)
jpayne@69 379
def __repr__(self):
    """Return (truncated) representation of the sequence.

    Undefined sequence data are shown as ``Seq(None, length=N)``, and
    partially defined data as a dictionary of the defined regions followed
    by the total length. Any stretch of more than 60 letters is shortened
    to its first 54 letters, three dots, and its last 3 letters.
    """

    def _abbreviate(chunk):
        # Shows the last three letters as it is often useful to see if
        # there is a stop codon at the end of a sequence.
        # Note total length is 54+3+3=60
        if len(chunk) > 60:
            start = chunk[:54].decode("ASCII")
            end = chunk[-3:].decode("ASCII")
            return f"{start}...{end}"
        return chunk.decode("ASCII")

    content = self._data
    if isinstance(content, _UndefinedSequenceData):
        return f"Seq(None, length={len(self)})"
    if isinstance(content, _PartiallyDefinedSequenceData):
        # Map each start position to the (abbreviated) text defined there.
        regions = {
            position: _abbreviate(seq) for position, seq in content._data.items()
        }
        return "Seq(%r, length=%d)" % (regions, len(self))
    text = _abbreviate(content)
    return f"{self.__class__.__name__}('{text}')"
jpayne@69 406
jpayne@69 407 def __str__(self):
jpayne@69 408 """Return the full sequence as a python string."""
jpayne@69 409 return self._data.decode("ASCII")
jpayne@69 410
jpayne@69 411 def __eq__(self, other):
jpayne@69 412 """Compare the sequence to another sequence or a string.
jpayne@69 413
jpayne@69 414 Sequences are equal to each other if their sequence contents is
jpayne@69 415 identical:
jpayne@69 416
jpayne@69 417 >>> from Bio.Seq import Seq, MutableSeq
jpayne@69 418 >>> seq1 = Seq("ACGT")
jpayne@69 419 >>> seq2 = Seq("ACGT")
jpayne@69 420 >>> mutable_seq = MutableSeq("ACGT")
jpayne@69 421 >>> seq1 == seq2
jpayne@69 422 True
jpayne@69 423 >>> seq1 == mutable_seq
jpayne@69 424 True
jpayne@69 425 >>> seq1 == "ACGT"
jpayne@69 426 True
jpayne@69 427
jpayne@69 428 Note that the sequence objects themselves are not identical to each
jpayne@69 429 other:
jpayne@69 430
jpayne@69 431 >>> id(seq1) == id(seq2)
jpayne@69 432 False
jpayne@69 433 >>> seq1 is seq2
jpayne@69 434 False
jpayne@69 435
jpayne@69 436 Sequences can also be compared to strings, ``bytes``, and ``bytearray``
jpayne@69 437 objects:
jpayne@69 438
jpayne@69 439 >>> seq1 == "ACGT"
jpayne@69 440 True
jpayne@69 441 >>> seq1 == b"ACGT"
jpayne@69 442 True
jpayne@69 443 >>> seq1 == bytearray(b"ACGT")
jpayne@69 444 True
jpayne@69 445 """
jpayne@69 446 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69 447 return self._data == other._data
jpayne@69 448 elif isinstance(other, str):
jpayne@69 449 return self._data == other.encode("ASCII")
jpayne@69 450 else:
jpayne@69 451 return self._data == other
jpayne@69 452
jpayne@69 453 def __lt__(self, other):
jpayne@69 454 """Implement the less-than operand."""
jpayne@69 455 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69 456 return self._data < other._data
jpayne@69 457 elif isinstance(other, str):
jpayne@69 458 return self._data < other.encode("ASCII")
jpayne@69 459 else:
jpayne@69 460 return self._data < other
jpayne@69 461
jpayne@69 462 def __le__(self, other):
jpayne@69 463 """Implement the less-than or equal operand."""
jpayne@69 464 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69 465 return self._data <= other._data
jpayne@69 466 elif isinstance(other, str):
jpayne@69 467 return self._data <= other.encode("ASCII")
jpayne@69 468 else:
jpayne@69 469 return self._data <= other
jpayne@69 470
jpayne@69 471 def __gt__(self, other):
jpayne@69 472 """Implement the greater-than operand."""
jpayne@69 473 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69 474 return self._data > other._data
jpayne@69 475 elif isinstance(other, str):
jpayne@69 476 return self._data > other.encode("ASCII")
jpayne@69 477 else:
jpayne@69 478 return self._data > other
jpayne@69 479
jpayne@69 480 def __ge__(self, other):
jpayne@69 481 """Implement the greater-than or equal operand."""
jpayne@69 482 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69 483 return self._data >= other._data
jpayne@69 484 elif isinstance(other, str):
jpayne@69 485 return self._data >= other.encode("ASCII")
jpayne@69 486 else:
jpayne@69 487 return self._data >= other
jpayne@69 488
jpayne@69 489 def __len__(self):
jpayne@69 490 """Return the length of the sequence."""
jpayne@69 491 return len(self._data)
jpayne@69 492
jpayne@69 493 def __iter__(self):
jpayne@69 494 """Return an iterable of the sequence."""
jpayne@69 495 return self._data.decode("ASCII").__iter__()
jpayne@69 496
jpayne@69 497 @overload
jpayne@69 498 def __getitem__(self, index: int) -> str:
jpayne@69 499 ...
jpayne@69 500
jpayne@69 501 @overload
jpayne@69 502 def __getitem__(self, index: slice) -> "Seq":
jpayne@69 503 ...
jpayne@69 504
jpayne@69 505 def __getitem__(self, index):
jpayne@69 506 """Return a subsequence as a single letter or as a sequence object.
jpayne@69 507
jpayne@69 508 If the index is an integer, a single letter is returned as a Python
jpayne@69 509 string:
jpayne@69 510
jpayne@69 511 >>> seq = Seq('ACTCGACGTCG')
jpayne@69 512 >>> seq[5]
jpayne@69 513 'A'
jpayne@69 514
jpayne@69 515 Otherwise, a new sequence object of the same class is returned:
jpayne@69 516
jpayne@69 517 >>> seq[5:8]
jpayne@69 518 Seq('ACG')
jpayne@69 519 >>> mutable_seq = MutableSeq('ACTCGACGTCG')
jpayne@69 520 >>> mutable_seq[5:8]
jpayne@69 521 MutableSeq('ACG')
jpayne@69 522 """
jpayne@69 523 if isinstance(index, numbers.Integral):
jpayne@69 524 # Return a single letter as a string
jpayne@69 525 return chr(self._data[index])
jpayne@69 526 else:
jpayne@69 527 # Return the (sub)sequence as another Seq/MutableSeq object
jpayne@69 528 return self.__class__(self._data[index])
jpayne@69 529
jpayne@69 530 def __add__(self, other):
jpayne@69 531 """Add a sequence or string to this sequence.
jpayne@69 532
jpayne@69 533 >>> from Bio.Seq import Seq, MutableSeq
jpayne@69 534 >>> Seq("MELKI") + "LV"
jpayne@69 535 Seq('MELKILV')
jpayne@69 536 >>> MutableSeq("MELKI") + "LV"
jpayne@69 537 MutableSeq('MELKILV')
jpayne@69 538 """
jpayne@69 539 if isinstance(other, _SeqAbstractBaseClass):
jpayne@69 540 return self.__class__(self._data + other._data)
jpayne@69 541 elif isinstance(other, str):
jpayne@69 542 return self.__class__(self._data + other.encode("ASCII"))
jpayne@69 543 else:
jpayne@69 544 # If other is a SeqRecord, then SeqRecord's __radd__ will handle
jpayne@69 545 # this. If not, returning NotImplemented will trigger a TypeError.
jpayne@69 546 return NotImplemented
jpayne@69 547
jpayne@69 548 def __radd__(self, other):
jpayne@69 549 """Add a sequence string on the left.
jpayne@69 550
jpayne@69 551 >>> from Bio.Seq import Seq, MutableSeq
jpayne@69 552 >>> "LV" + Seq("MELKI")
jpayne@69 553 Seq('LVMELKI')
jpayne@69 554 >>> "LV" + MutableSeq("MELKI")
jpayne@69 555 MutableSeq('LVMELKI')
jpayne@69 556
jpayne@69 557 Adding two sequence objects is handled via the __add__ method.
jpayne@69 558 """
jpayne@69 559 if isinstance(other, str):
jpayne@69 560 return self.__class__(other.encode("ASCII") + self._data)
jpayne@69 561 else:
jpayne@69 562 return NotImplemented
jpayne@69 563
jpayne@69 564 def __mul__(self, other):
jpayne@69 565 """Multiply sequence by integer.
jpayne@69 566
jpayne@69 567 >>> from Bio.Seq import Seq, MutableSeq
jpayne@69 568 >>> Seq('ATG') * 2
jpayne@69 569 Seq('ATGATG')
jpayne@69 570 >>> MutableSeq('ATG') * 2
jpayne@69 571 MutableSeq('ATGATG')
jpayne@69 572 """
jpayne@69 573 if not isinstance(other, numbers.Integral):
jpayne@69 574 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@69 575 # we would like to simply write
jpayne@69 576 # data = self._data * other
jpayne@69 577 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@69 578 # bytearray and other is a numpy integer. Using this workaround:
jpayne@69 579 data = self._data.__mul__(other)
jpayne@69 580 return self.__class__(data)
jpayne@69 581
jpayne@69 582 def __rmul__(self, other):
jpayne@69 583 """Multiply integer by sequence.
jpayne@69 584
jpayne@69 585 >>> from Bio.Seq import Seq
jpayne@69 586 >>> 2 * Seq('ATG')
jpayne@69 587 Seq('ATGATG')
jpayne@69 588 """
jpayne@69 589 if not isinstance(other, numbers.Integral):
jpayne@69 590 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@69 591 # we would like to simply write
jpayne@69 592 # data = self._data * other
jpayne@69 593 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@69 594 # bytearray and other is a numpy integer. Using this workaround:
jpayne@69 595 data = self._data.__mul__(other)
jpayne@69 596 return self.__class__(data)
jpayne@69 597
jpayne@69 598 def __imul__(self, other):
jpayne@69 599 """Multiply the sequence object by other and assign.
jpayne@69 600
jpayne@69 601 >>> from Bio.Seq import Seq
jpayne@69 602 >>> seq = Seq('ATG')
jpayne@69 603 >>> seq *= 2
jpayne@69 604 >>> seq
jpayne@69 605 Seq('ATGATG')
jpayne@69 606
jpayne@69 607 Note that this is different from in-place multiplication. The ``seq``
jpayne@69 608 variable is reassigned to the multiplication result, but any variable
jpayne@69 609 pointing to ``seq`` will remain unchanged:
jpayne@69 610
jpayne@69 611 >>> seq = Seq('ATG')
jpayne@69 612 >>> seq2 = seq
jpayne@69 613 >>> id(seq) == id(seq2)
jpayne@69 614 True
jpayne@69 615 >>> seq *= 2
jpayne@69 616 >>> seq
jpayne@69 617 Seq('ATGATG')
jpayne@69 618 >>> seq2
jpayne@69 619 Seq('ATG')
jpayne@69 620 >>> id(seq) == id(seq2)
jpayne@69 621 False
jpayne@69 622 """
jpayne@69 623 if not isinstance(other, numbers.Integral):
jpayne@69 624 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@69 625 # we would like to simply write
jpayne@69 626 # data = self._data * other
jpayne@69 627 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@69 628 # bytearray and other is a numpy integer. Using this workaround:
jpayne@69 629 data = self._data.__mul__(other)
jpayne@69 630 return self.__class__(data)
jpayne@69 631
def count(self, sub, start=None, end=None):
    """Return a non-overlapping count, like that of a python string.

    Returns an integer, the number of non-overlapping occurrences of sub in
    the (sub)sequence given by [start:end]. Optional arguments start and
    end are interpreted as in slice notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a TypeError if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count("A"))
    5
    >>> print(my_seq.count("ATG"))
    1
    >>> print(my_seq.count(Seq("AT")))
    1
    >>> print(my_seq.count("AT", 2, -1))
    1

    HOWEVER, please note that because the search is non-overlapping, as for
    Python strings, the result may not be what you expect:

    >>> "AAAA".count("AA")
    2
    >>> print(Seq("AAAA").count("AA"))
    2

    For an overlapping search, use the ``count_overlap`` method:

    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    """
    # Normalize sub to something the underlying data can search for.
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.count(needle, start, end)
jpayne@69 684
def count_overlap(self, sub, start=None, end=None):
    """Return an overlapping count.

    Returns an integer, the number of (possibly overlapping) occurrences of
    sub in the (sub)sequence given by [start:end]. Optional arguments start
    and end are interpreted as in slice notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a TypeError if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g.

    >>> from Bio.Seq import Seq
    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    >>> print(Seq("ATATATATA").count_overlap("ATA"))
    4
    >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
    1

    For a non-overlapping search, use the ``count`` method:

    >>> print(Seq("AAAA").count("AA"))
    2

    Where substrings do not overlap, ``count_overlap`` behaves the same as
    the ``count`` method:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count_overlap("A"))
    5
    >>> my_seq.count_overlap("A") == my_seq.count("A")
    True
    >>> print(my_seq.count_overlap("ATG"))
    1
    >>> my_seq.count_overlap("ATG") == my_seq.count("ATG")
    True
    >>> print(my_seq.count_overlap(Seq("AT")))
    1
    >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
    True
    >>> print(my_seq.count_overlap("AT", 2, -1))
    1
    >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
    True

    HOWEVER, the ``count`` method is the better choice in such cases.
    """
    # Normalize sub to something the underlying data can search for.
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    haystack = self._data
    hits = 0
    position = haystack.find(needle, start, end)
    while position != -1:
        hits += 1
        # Resume one letter after the previous hit, so that occurrences
        # overlapping with it are found as well.
        position = haystack.find(needle, position + 1, end)
    return hits
jpayne@69 757
def __contains__(self, item):
    """Return True if item is a subsequence of the sequence, and False otherwise.

    Strings are encoded as ASCII and Seq or MutableSeq objects are converted
    to bytes before the underlying sequence data is tested; any other object
    is handed to the underlying data unchanged.

    e.g.

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_dna = Seq("ATATGAAATTTGAAAA")
    >>> "AAA" in my_dna
    True
    >>> Seq("AAA") in my_dna
    True
    >>> MutableSeq("AAA") in my_dna
    True
    """
    if isinstance(item, str):
        needle = item.encode("ASCII")
    elif isinstance(item, _SeqAbstractBaseClass):
        needle = bytes(item)
    else:
        needle = item
    return needle in self._data
jpayne@69 777
def find(self, sub, start=None, end=None):
    """Return the lowest index in the sequence where subsequence sub is found.

    When start and/or end are given, the search is restricted to the
    sequence region [start:end], and the lowest index at which sub lies
    entirely within that region is returned.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.find("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.find("AUG", 4)
    15

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, str):
        target = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        # Already bytes-like; searched as given.
        target = sub
    elif isinstance(sub, _SeqAbstractBaseClass):
        target = bytes(sub)
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.find(target, start, end)
jpayne@69 818
def rfind(self, sub, start=None, end=None):
    """Return the highest index in the sequence where subsequence sub is found.

    When start and/or end are given, the search is restricted to the
    sequence region [start:end], and the highest index at which sub lies
    entirely within that region is returned.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rfind("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rfind("AUG", end=15)
    3

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, str):
        target = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        # Already bytes-like; searched as given.
        target = sub
    elif isinstance(sub, _SeqAbstractBaseClass):
        target = bytes(sub)
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rfind(target, start, end)
jpayne@69 859
def index(self, sub, start=None, end=None):
    """Return the lowest index in the sequence where subsequence sub is found.

    With optional arguments start and end, return the lowest index in the
    sequence such that the subsequence sub is contained within the sequence
    region [start:end].

    Arguments:
     - sub - a string, bytes, bytearray, or another Seq or MutableSeq
       object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError if
    sub is not one of the supported types.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.index("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.index("AUG", 4)
    15

    This method performs the same search as the ``find`` method. However,
    if the subsequence is not found, ``find`` returns -1 while ``index``
    raises a ValueError:

    >>> my_rna.index("T")
    Traceback (most recent call last):
       ...
    ValueError: ...
    >>> my_rna.find("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, str):
        sub = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pass  # already bytes-like
    elif isinstance(sub, _SeqAbstractBaseClass):
        # Accept every sequence class, exactly as ``find`` does, rather than
        # only Seq/MutableSeq, and without touching the private ``_data``.
        sub = bytes(sub)
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.index(sub, start, end)
jpayne@69 913
def rindex(self, sub, start=None, end=None):
    """Return the highest index in the sequence where subsequence sub is found.

    With optional arguments start and end, return the highest index in the
    sequence such that the subsequence sub is contained within the sequence
    region [start:end].

    Arguments:
     - sub - a string, bytes, bytearray, or another Seq or MutableSeq
       object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError if
    sub is not one of the supported types.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rindex("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rindex("AUG", end=15)
    3

    This method performs the same search as the ``rfind`` method. However,
    if the subsequence is not found, ``rfind`` returns -1 while ``rindex``
    raises a ValueError:

    >>> my_rna.rindex("T")
    Traceback (most recent call last):
       ...
    ValueError: ...
    >>> my_rna.rfind("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, str):
        sub = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pass  # already bytes-like
    elif isinstance(sub, _SeqAbstractBaseClass):
        # Accept every sequence class, exactly as ``rfind`` does, rather than
        # only Seq/MutableSeq, and without touching the private ``_data``.
        sub = bytes(sub)
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rindex(sub, start, end)
jpayne@69 967
def search(self, subs):
    """Search the substrings subs in self and yield the index and substring found.

    Arguments:
     - subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
       objects containing the substrings to search for.

    Matches are yielded as ``(index, substring)`` tuples, with the substring
    returned as a string, in order of increasing index. Duplicate entries in
    subs are reported only once per position. As this is a generator, a
    TypeError for an unsupported entry in subs is raised when iteration
    starts, not when ``search`` is called.

    >>> from Bio.Seq import Seq
    >>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
    >>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
    >>> for index, substring in matches:
    ...     print(index, substring)
    ...
    7 CC
    9 ATTG
    20 CC
    34 CC
    34 CCC
    35 CC

    Every position is examined, including the last one:

    >>> list(Seq("ACGT").search(["T"]))
    [(3, 'T')]
    """
    # Group the distinct substrings by length so that each position needs
    # only one slice of the sequence per substring length.
    by_length = collections.defaultdict(set)
    for index, sub in enumerate(subs):
        if isinstance(sub, str):
            sub = sub.encode("ASCII")
        elif isinstance(sub, bytes):
            pass
        elif isinstance(sub, (bytearray, _SeqAbstractBaseClass)):
            sub = bytes(sub)
        else:
            raise TypeError(
                "subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
                % (index, type(sub))
            )
        by_length[len(sub)].add(sub)
    data = self._data
    # range(len(self)), not len(self) - 1: the final position must be
    # checked too, otherwise a one-letter match at the very end is missed.
    for start in range(len(self)):
        for length, candidates in by_length.items():
            window = data[start : start + length]
            for candidate in candidates:
                if window == candidate:
                    yield (start, candidate.decode())
                    # Candidates of one length are distinct, so at most
                    # one of them can match this window.
                    break
jpayne@69 1008
def startswith(self, prefix, start=None, end=None):
    """Return True if the sequence starts with the given prefix, False otherwise.

    Return True if the sequence starts with the specified prefix
    (a string, a bytes-like object, or another Seq or MutableSeq object),
    False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    prefix can also be a tuple of such objects to try. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.startswith("GUC")
    True
    >>> my_rna.startswith("AUG")
    False
    >>> my_rna.startswith("AUG", 3)
    True
    >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
    True
    >>> my_rna.startswith((b"AAA", Seq("GUC")))
    True
    """

    def as_bytes(item):
        # Normalise one candidate. Anything that is not a str or sequence
        # object is passed through, so the underlying data accepts
        # bytes-like objects and rejects the rest with a TypeError.
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        return item

    if isinstance(prefix, tuple):
        # Tuple members get the same treatment as a single prefix; calling
        # .encode() on every non-sequence member broke on bytes members.
        prefix = tuple(as_bytes(candidate) for candidate in prefix)
    else:
        prefix = as_bytes(prefix)
    return self._data.startswith(prefix, start, end)
jpayne@69 1039
def endswith(self, suffix, start=None, end=None):
    """Return True if the sequence ends with the given suffix, False otherwise.

    Return True if the sequence ends with the specified suffix
    (a string, a bytes-like object, or another Seq or MutableSeq object),
    False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    suffix can also be a tuple of such objects to try. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.endswith("UUG")
    True
    >>> my_rna.endswith("AUG")
    False
    >>> my_rna.endswith("AUG", 0, 18)
    True
    >>> my_rna.endswith(("UCC", "UCA", "UUG"))
    True
    >>> my_rna.endswith((b"AAA", Seq("UUG")))
    True
    """

    def as_bytes(item):
        # Normalise one candidate. Anything that is not a str or sequence
        # object is passed through, so the underlying data accepts
        # bytes-like objects and rejects the rest with a TypeError.
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        return item

    if isinstance(suffix, tuple):
        # Tuple members get the same treatment as a single suffix; calling
        # .encode() on every non-sequence member broke on bytes members.
        suffix = tuple(as_bytes(candidate) for candidate in suffix)
    else:
        suffix = as_bytes(suffix)
    return self._data.endswith(suffix, start, end)
jpayne@69 1070
def split(self, sep=None, maxsplit=-1):
    """Return a list of subsequences when splitting the sequence by separator sep.

    The sequence is cut at each occurrence of the delimiter sep, working
    from the left, and the pieces are returned as a list of Seq objects.
    At most maxsplit splits are made when maxsplit is given; when it is
    omitted, every possible split is made.

    For consistency with the ``split`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.split("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')

    See also the rsplit method, which splits the sequence starting from the
    end:

    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')
    """
    if isinstance(sep, str):
        sep = sep.encode("ASCII")
    elif isinstance(sep, _SeqAbstractBaseClass):
        sep = bytes(sep)
    pieces = self._data.split(sep, maxsplit)
    return list(map(Seq, pieces))
jpayne@69 1113
def rsplit(self, sep=None, maxsplit=-1):
    """Return a list of subsequences by splitting the sequence from the right.

    The sequence is cut at each occurrence of the delimiter sep, working
    from the right, and the pieces are returned as a list of Seq objects.
    At most maxsplit splits are made when maxsplit is given; when it is
    omitted, every possible split is made.

    For consistency with the ``rsplit`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.rsplit("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')

    See also the split method, which splits the sequence starting from the
    beginning:

    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')
    """
    if isinstance(sep, str):
        sep = sep.encode("ASCII")
    elif isinstance(sep, _SeqAbstractBaseClass):
        sep = bytes(sep)
    pieces = self._data.rsplit(sep, maxsplit)
    return list(map(Seq, pieces))
jpayne@69 1156
def strip(self, chars=None, inplace=False):
    """Return a sequence object with leading and trailing ends stripped.

    By default, whitespace is removed from both ends:

    >>> seq = Seq(" ACGT ")
    >>> seq.strip()
    Seq('ACGT')
    >>> seq
    Seq(' ACGT ')

    When ``chars`` is given and is not ``None``, the characters it contains
    are removed instead; their order does not matter:

    >>> Seq("ACGTACGT").strip("TGCA")
    Seq('')

    With ``inplace`` set to ``False`` (the default) a stripped copy is
    returned and the sequence itself is left untouched. With ``inplace``
    set to ``True`` the sequence itself is stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.strip()
    MutableSeq('ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.strip(inplace=True)
    MutableSeq('ACGT')
    >>> seq
    MutableSeq('ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
    is called on a ``Seq`` object with ``inplace=True``.

    See also the lstrip and rstrip methods.
    """
    if chars is not None:
        if isinstance(chars, str):
            chars = chars.encode("ASCII")
        elif isinstance(chars, _SeqAbstractBaseClass):
            chars = bytes(chars)
    try:
        stripped = self._data.strip(chars)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@69 1210
def lstrip(self, chars=None, inplace=False):
    """Return a sequence object with the leading end stripped.

    By default, leading whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.lstrip()
    Seq('ACGT ')
    >>> seq
    Seq(' ACGT ')

    When ``chars`` is given and is not ``None``, the characters it contains
    are removed from the leading end instead; their order does not matter:

    >>> Seq("ACGACGTTACG").lstrip("GCA")
    Seq('TTACG')

    With ``inplace`` set to ``False`` (the default) a stripped copy is
    returned and the sequence itself is left untouched. With ``inplace``
    set to ``True`` the sequence itself is stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.lstrip()
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.lstrip(inplace=True)
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq('ACGT ')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and rstrip methods.
    """
    if chars is not None:
        if isinstance(chars, str):
            chars = chars.encode("ASCII")
        elif isinstance(chars, _SeqAbstractBaseClass):
            chars = bytes(chars)
    try:
        stripped = self._data.lstrip(chars)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@69 1265
def rstrip(self, chars=None, inplace=False):
    """Return a sequence object with trailing ends stripped.

    By default, trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.rstrip()
    Seq(' ACGT')
    >>> seq
    Seq(' ACGT ')

    When ``chars`` is given and is not ``None``, the characters it contains
    are removed from the trailing end instead; their order does not matter:

    >>> Seq("ACGACGTTACG").rstrip("GCA")
    Seq('ACGACGTT')

    With ``inplace`` set to ``False`` (the default) a stripped copy is
    returned and the sequence itself is left untouched. With ``inplace``
    set to ``True`` the sequence itself is stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.rstrip()
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.rstrip(inplace=True)
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``rstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and lstrip methods.
    """
    if chars is not None:
        if isinstance(chars, str):
            chars = chars.encode("ASCII")
        elif isinstance(chars, _SeqAbstractBaseClass):
            chars = bytes(chars)
    try:
        stripped = self._data.rstrip(chars)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@69 1320
jpayne@69 1321 def removeprefix(self, prefix, inplace=False):
jpayne@69 1322 """Return a new Seq object with prefix (left) removed.
jpayne@69 1323
jpayne@69 1324 This behaves like the python string method of the same name.
jpayne@69 1325
jpayne@69 1326 e.g. Removing a start Codon:
jpayne@69 1327
jpayne@69 1328 >>> from Bio.Seq import Seq
jpayne@69 1329 >>> my_seq = Seq("ATGGTGTGTGT")
jpayne@69 1330 >>> my_seq
jpayne@69 1331 Seq('ATGGTGTGTGT')
jpayne@69 1332 >>> my_seq.removeprefix('ATG')
jpayne@69 1333 Seq('GTGTGTGT')
jpayne@69 1334
jpayne@69 1335 As ``Seq`` objects are immutable, a ``TypeError`` is raised if
jpayne@69 1336 ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``.
jpayne@69 1337
jpayne@69 1338 See also the removesuffix method.
jpayne@69 1339 """
jpayne@69 1340 if isinstance(prefix, _SeqAbstractBaseClass):
jpayne@69 1341 prefix = bytes(prefix)
jpayne@69 1342 elif isinstance(prefix, str):
jpayne@69 1343 prefix = prefix.encode("ASCII")
jpayne@69 1344 try:
jpayne@69 1345 data = self._data.removeprefix(prefix)
jpayne@69 1346 except TypeError:
jpayne@69 1347 raise TypeError(
jpayne@69 1348 "argument must be a string, Seq, MutableSeq, or bytes-like object"
jpayne@69 1349 ) from None
jpayne@69 1350 except AttributeError:
jpayne@69 1351 # Fall back for pre-Python 3.9
jpayne@69 1352 data = self._data
jpayne@69 1353 if data.startswith(prefix):
jpayne@69 1354 data = data[len(prefix) :]
jpayne@69 1355 if inplace:
jpayne@69 1356 if not isinstance(self._data, bytearray):
jpayne@69 1357 raise TypeError("Sequence is immutable")
jpayne@69 1358 self._data[:] = data
jpayne@69 1359 return self
jpayne@69 1360 else:
jpayne@69 1361 return self.__class__(data)
jpayne@69 1362
def removesuffix(self, suffix, inplace=False):
    """Return a new Seq object with suffix (right) removed.

    This behaves like the python string method of the same name: if the
    sequence ends with suffix, the suffix is dropped; otherwise (including
    when suffix is empty) the sequence is returned unchanged.

    Arguments:
     - suffix - a string, bytes-like object, or another Seq or MutableSeq
       object holding the suffix to remove
     - inplace - if False (the default), return a new sequence object;
       if True, modify the sequence itself and return it

    e.g. Removing a stop codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("GTGTGTGTTAG")
    >>> my_seq
    Seq('GTGTGTGTTAG')
    >>> stop_codon = Seq("TAG")
    >>> my_seq.removesuffix(stop_codon)
    Seq('GTGTGTGT')
    >>> my_seq.removesuffix("")
    Seq('GTGTGTGTTAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``.
    A ``TypeError`` is also raised if suffix is not of a supported type.

    See also the removeprefix method.
    """
    if isinstance(suffix, str):
        suffix = suffix.encode("ASCII")
    elif isinstance(suffix, _SeqAbstractBaseClass):
        suffix = bytes(suffix)
    try:
        try:
            data = self._data.removesuffix(suffix)
        except AttributeError:
            # Fall back for pre-Python 3.9
            if isinstance(suffix, tuple):
                # endswith() accepts a tuple, removesuffix() must not.
                raise TypeError("tuple suffix") from None
            data = self._data
            # An empty suffix must leave the data alone: data[:-0] is
            # data[:0], which would wrongly discard the whole sequence.
            if suffix and data.endswith(suffix):
                data = data[: -len(suffix)]
            else:
                # Slice so that mutable data is copied, as the built-in
                # removesuffix does; otherwise the returned object could
                # share its buffer with this sequence.
                data = data[:]
    except TypeError:
        raise TypeError(
            "argument must be a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if inplace:
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    else:
        return self.__class__(data)
jpayne@69 1405
def upper(self, inplace=False):
    """Return the sequence in upper case.

    With inplace set to False (the default), the sequence is left as it
    is and an upper-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With inplace set to True, the sequence itself is converted and
    returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``upper`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``lower`` method.
    """
    converted = self._data.upper()
    if not inplace:
        return self.__class__(converted)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = converted
    return self
jpayne@69 1457
def lower(self, inplace=False):
    """Return the sequence in lower case.

    With inplace set to False (the default), the sequence is left as it
    is and a lower-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With inplace set to True, the sequence itself is converted and
    returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lower`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``upper`` method.
    """
    converted = self._data.lower()
    if not inplace:
        return self.__class__(converted)
    # Only bytearray-backed (mutable) sequences can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = converted
    return self
jpayne@69 1509
def isupper(self):
    """Return True if all ASCII characters in data are uppercase.

    The answer comes from the ``isupper`` method of the underlying
    sequence data. If there are no cased characters, the method returns
    False.
    """
    data = self._data
    return data.isupper()
jpayne@69 1516
def islower(self):
    """Return True if all ASCII characters in data are lowercase.

    The answer comes from the ``islower`` method of the underlying
    sequence data. If there are no cased characters, the method returns
    False.
    """
    data = self._data
    return data.islower()
jpayne@69 1523
def translate(
    self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
):
    """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.

    This method will translate DNA or RNA sequences. It should not
    be used on protein sequences as any result will be biologically
    meaningless.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable
       object (useful for non-standard genetic codes).  This
       defaults to the "Standard" table.
     - stop_symbol - Single character string, what to use for
       terminators.  This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons (translated as the
       specified stop_symbol).  If True, translation is terminated at
       the first in frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
       this checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to the minus sign.

    A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
    object; a ``MutableSeq`` object is returned if ``translate`` is called
    on a ``MutableSeq`` object.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*')
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@')
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR')

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*')
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR')

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR')

    It isn't a valid CDS under NCBI table 1, due to both the start codon
    and also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR')
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR')

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead
    """
    try:
        data = str(self)
    except UndefinedSequenceError:
        # Translating an undefined sequence yields an undefined sequence
        # with the length divided by 3 (rounded down).
        n = len(self)
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "This may become an error in future.",
                BiopythonWarning,
            )
        return Seq(None, n // 3)

    # Reuse the string built above instead of converting the sequence a
    # second time.
    return self.__class__(
        _translate_str(data, table, stop_symbol, to_stop, cds, gap=gap)
    )
jpayne@69 1623
def complement(self, inplace=False):
    """Return the complement as a DNA sequence.

    >>> Seq("CGA").complement()
    Seq('GCT')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").complement()
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement()
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement(inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement`` is called on a ``Seq`` object with ``inplace=True``.

    If the sequence contents are undefined, the object itself is returned
    (the complement of an undefined sequence is an undefined sequence of
    the same length).
    """
    # Reject in-place modification of immutable data up front, so that the
    # documented TypeError is raised even when the contents are undefined.
    if inplace and not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    try:
        data = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if inplace:
        self._data[:] = data
        return self
    return self.__class__(data)
jpayne@69 1671
def complement_rna(self, inplace=False):
    """Return the complement as an RNA sequence.

    >>> Seq("CGA").complement_rna()
    Seq('GCU')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence by default:

    >>> Seq("CGA").complement()
    Seq('GCT')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement_rna()
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement_rna(inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.

    If the sequence contents are undefined, the object itself is returned
    (the complement of an undefined sequence is an undefined sequence of
    the same length).
    """
    # Reject in-place modification of immutable data up front, so that the
    # documented TypeError is raised even when the contents are undefined.
    if inplace and not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    try:
        data = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if inplace:
        self._data[:] = data
        return self
    return self.__class__(data)
jpayne@69 1718
def reverse_complement(self, inplace=False):
    """Return the reverse complement as a DNA sequence.

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").reverse_complement()
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement()
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement(inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.

    If the sequence contents are undefined, the object itself is returned
    (the reverse complement of an undefined sequence is an undefined
    sequence of the same length).
    """
    # Reject in-place modification of immutable data up front, so that the
    # documented TypeError is raised even when the contents are undefined.
    if inplace and not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    try:
        data = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if inplace:
        # Assigning through a reversed slice stores the complemented
        # letters back to front without building another copy.
        self._data[::-1] = data
        return self
    return self.__class__(data[::-1])
jpayne@69 1766
def reverse_complement_rna(self, inplace=False):
    """Return the reverse complement as an RNA sequence.

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").reverse_complement_rna()
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement_rna()
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement_rna(inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.

    If the sequence contents are undefined, the object itself is returned
    (the reverse complement of an undefined sequence is an undefined
    sequence of the same length).
    """
    # Reject in-place modification of immutable data up front, so that the
    # documented TypeError is raised even when the contents are undefined.
    if inplace and not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    try:
        data = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if inplace:
        # Assigning through a reversed slice stores the complemented
        # letters back to front without building another copy.
        self._data[::-1] = data
        return self
    return self.__class__(data[::-1])
jpayne@69 1814
def transcribe(self, inplace=False):
    """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. This
    means we can get the RNA sequence just by switching T to U.

    >>> from Bio.Seq import Seq
    >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> coding_dna.transcribe()
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence.transcribe()
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    >>> sequence.transcribe(inplace=True)
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to transcribe an RNA sequence has no effect.
    If you have a nucleotide sequence which might be DNA or RNA
    (or even a mixture), calling the transcribe method will ensure
    any T becomes U.

    Trying to transcribe a protein sequence will replace any
    T for Threonine with U for Selenocysteine, which has no
    biologically plausible rationale.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRT")
    >>> my_protein.transcribe()
    Seq('MAIVMGRU')
    """
    # Fail fast: do not build a converted copy that is only going to be
    # thrown away when the target turns out to be immutable.
    if inplace and not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Both cases are handled so that the case of each letter is preserved.
    data = self._data.replace(b"T", b"U").replace(b"t", b"u")
    if inplace:
        self._data[:] = data
        return self
    return self.__class__(data)
jpayne@69 1868
def back_transcribe(self, inplace=False):
    """Return the DNA sequence from an RNA sequence by creating a new Seq object.

    >>> from Bio.Seq import Seq
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence.back_transcribe()
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    >>> sequence.back_transcribe(inplace=True)
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``back_transcribe`` is called on a ``Seq`` object with
    ``inplace=True``.

    Trying to back-transcribe DNA has no effect. If you have a nucleotide
    sequence which might be DNA or RNA (or even a mixture), calling the
    back-transcribe method will ensure any U becomes T.

    Trying to back-transcribe a protein sequence will replace any U for
    Selenocysteine with T for Threonine, which is biologically meaningless.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRU")
    >>> my_protein.back_transcribe()
    Seq('MAIVMGRT')
    """
    # Fail fast: do not build a converted copy that is only going to be
    # thrown away when the target turns out to be immutable.
    if inplace and not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Both cases are handled so that the case of each letter is preserved.
    data = self._data.replace(b"U", b"T").replace(b"u", b"t")
    if inplace:
        self._data[:] = data
        return self
    return self.__class__(data)
jpayne@69 1916
def join(self, other):
    """Return a merge of the sequences in other, spaced by the sequence from self.

    Accepts a Seq object, MutableSeq object, or string (and iterates over
    the letters), or an iterable containing Seq, MutableSeq, or string
    objects. These arguments will be concatenated with the calling sequence
    as the spacer:

    >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
    >>> concatenated
    Seq('AAANNNNNTTTNNNNNPPP')

    Joining the letters of a single sequence:

    >>> Seq('NNNNN').join(Seq("ACGT"))
    Seq('ANNNNNCNNNNNGNNNNNT')
    >>> Seq('NNNNN').join("ACGT")
    Seq('ANNNNNCNNNNNGNNNNNT')

    One-shot iterables such as generators are accepted too:

    >>> Seq('-').join(letter for letter in "ACGT")
    Seq('A-C-G-T')

    A ``TypeError`` is raised if ``other`` is a SeqRecord, or if any of
    its items is a SeqRecord or is not a string, Seq or MutableSeq object.
    """
    if isinstance(other, _SeqAbstractBaseClass):
        return self.__class__(str(self).join(str(other)))
    elif isinstance(other, str):
        return self.__class__(str(self).join(other))

    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(other, SeqRecord):
        raise TypeError("Iterable cannot be a SeqRecord")

    # Validate and collect in a single pass: walking ``other`` twice would
    # leave nothing to join when it is an iterator or a generator.
    parts = []
    for c in other:
        if isinstance(c, SeqRecord):
            raise TypeError("Iterable cannot contain SeqRecords")
        elif not isinstance(c, (str, _SeqAbstractBaseClass)):
            raise TypeError(
                "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
            )
        parts.append(str(c))
    return self.__class__(str(self).join(parts))
jpayne@69 1954
def replace(self, old, new, inplace=False):
    """Return a copy with all occurrences of subsequence old replaced by new.

    >>> s = Seq("ACGTAACCGGTT")
    >>> t = s.replace("AC", "XYZ")
    >>> s
    Seq('ACGTAACCGGTT')
    >>> t
    Seq('XYZGTAXYZCGGTT')

    For mutable sequences, passing inplace=True will modify the sequence in place:

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ")
    >>> m
    MutableSeq('ACGTAACCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ", inplace=True)
    >>> m
    MutableSeq('XYZGTAXYZCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``replace`` is called on a ``Seq`` object with ``inplace=True``.
    """

    def as_bytes(fragment):
        # Sequence objects and strings are converted to bytes; anything
        # else is handed to the underlying data unchanged.
        if isinstance(fragment, _SeqAbstractBaseClass):
            return bytes(fragment)
        if isinstance(fragment, str):
            return fragment.encode("ASCII")
        return fragment

    pattern = as_bytes(old)
    substitute = as_bytes(new)
    replaced = self._data.replace(pattern, substitute)
    if not inplace:
        return self.__class__(replaced)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = replaced
    return self
jpayne@69 1999
@property
def defined(self):
    """Return True if the sequence is defined, False if undefined or partially defined.

    Data held as ``bytes`` or ``bytearray`` is always defined; for any
    other kind of data the answer is taken from the ``defined`` attribute
    of the data object.

    Zero-length sequences are always considered to be defined.
    """
    contents = self._data
    if not isinstance(contents, (bytes, bytearray)):
        return contents.defined
    return True
jpayne@69 2010
@property
def defined_ranges(self):
    """Return a tuple of the ranges where the sequence contents is defined.

    The return value has the format ((start1, end1), (start2, end2), ...).

    Data held as ``bytes`` or ``bytearray`` gives a single range covering
    the whole sequence, or an empty tuple if the sequence has length zero.
    For any other kind of data the value is taken from the
    ``defined_ranges`` attribute of the data object.
    """
    contents = self._data
    if not isinstance(contents, (bytes, bytearray)):
        return contents.defined_ranges
    size = len(self)
    return ((0, size),) if size > 0 else ()
jpayne@69 2025
jpayne@69 2026
class Seq(_SeqAbstractBaseClass):
    """Read-only sequence object (essentially a string with biological methods).

    Like normal python strings, our basic sequence object is immutable.
    This prevents you from doing my_seq[5] = "A" for example, but does allow
    Seq objects to be used as dictionary keys.

    The Seq object provides a number of string like methods (such as count,
    find, split and strip).

    The Seq object also provides some biological methods, such as complement,
    reverse_complement, transcribe, back_transcribe and translate (which are
    not applicable to protein sequences).
    """

    _data: Union[bytes, SequenceDataAbstractBaseClass]

    def __init__(
        self,
        data: Union[
            str,
            bytes,
            bytearray,
            _SeqAbstractBaseClass,
            SequenceDataAbstractBaseClass,
            dict,
            None,
        ],
        length: Optional[int] = None,
    ):
        """Create a Seq object.

        Arguments:
         - data - Sequence, required. This can be a string, a bytes or
           bytearray object, a Seq or MutableSeq object, a sequence data
           object, a dictionary mapping start positions to the known
           sequence segments, or None for a sequence with unknown contents.
         - length - Sequence length, used only if data is None or a
           dictionary (integer)

        A ``ValueError`` is raised if data is None or a dictionary and
        length is missing or negative, or if the segments in a dictionary
        overlap, extend beyond length, or are not strings or bytes-like
        objects. A ``TypeError`` is raised if data is of any other type.

        You will typically use Bio.SeqIO to read in sequences from files as
        SeqRecord objects, whose sequence will be exposed as a Seq object via
        the seq property.

        However, you can also create a Seq object directly:

        >>> from Bio.Seq import Seq
        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
        >>> my_seq
        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
        >>> print(my_seq)
        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF

        To create a Seq object for a sequence of known length but
        unknown sequence contents, use None for the data argument and pass
        the sequence length for the length argument. Trying to access the
        sequence contents of a Seq object created in this way will raise
        an UndefinedSequenceError:

        >>> my_undefined_sequence = Seq(None, 20)
        >>> my_undefined_sequence
        Seq(None, length=20)
        >>> len(my_undefined_sequence)
        20
        >>> print(my_undefined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is undefined

        If the sequence contents is known for parts of the sequence only, use
        a dictionary for the data argument to pass the known sequence segments:

        >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
        >>> my_partially_defined_sequence
        Seq({3: 'ACGT'}, length=10)
        >>> len(my_partially_defined_sequence)
        10
        >>> print(my_partially_defined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
        >>> my_partially_defined_sequence[3:7]
        Seq('ACGT')
        >>> print(my_partially_defined_sequence[3:7])
        ACGT
        """
        if data is None:
            if length is None:
                raise ValueError("length must not be None if data is None")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                self._data = _UndefinedSequenceData(length)
        elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
            # Already in the internal representation; stored as is.
            self._data = data
        elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
            self._data = bytes(data)
        elif isinstance(data, str):
            self._data = bytes(data, encoding="ASCII")
        elif isinstance(data, dict):
            if length is None:
                raise ValueError("length must not be None if data is a dictionary")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                current = 0  # not needed here, but it keeps mypy happy
                end = -1
                starts = sorted(data.keys())
                _data: Dict[int, bytes] = {}
                for start in starts:
                    seq = data[start]
                    if isinstance(seq, str):
                        seq = bytes(seq, encoding="ASCII")
                    elif isinstance(seq, numbers.Integral):
                        # bytes(n) would silently create n NUL bytes, which
                        # is not sequence data.
                        raise ValueError("Expected bytes-like objects or strings")
                    else:
                        try:
                            seq = bytes(seq)
                        except Exception as err:
                            raise ValueError(
                                "Expected bytes-like objects or strings"
                            ) from err
                    if start < end:
                        raise ValueError("Sequence data are overlapping.")
                    elif start == end:
                        # This segment starts exactly where the previous one
                        # ended, so the two are merged into a single entry.
                        _data[current] += seq  # noqa: F821
                    else:
                        _data[start] = seq
                        current = start
                    end = start + len(seq)
                if end > length:
                    raise ValueError(
                        "Provided sequence data extend beyond sequence length."
                    )
                elif end == length and current == 0:
                    # sequence is fully defined
                    self._data = _data[current]
                else:
                    self._data = _PartiallyDefinedSequenceData(length, _data)
        else:
            raise TypeError(
                "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
            )

    def __hash__(self):
        """Hash of the sequence as a string for comparison.

        See Seq object comparison documentation (method ``__eq__`` in
        particular) as this has changed in Biopython 1.65. Older versions
        would hash on object identity.
        """
        return hash(self._data)
jpayne@69 2175
jpayne@69 2176
jpayne@69 2177 class MutableSeq(_SeqAbstractBaseClass):
jpayne@69 2178 """An editable sequence object.
jpayne@69 2179
jpayne@69 2180 Unlike normal python strings and our basic sequence object (the Seq class)
jpayne@69 2181 which are immutable, the MutableSeq lets you edit the sequence in place.
jpayne@69 2182 However, this means you cannot use a MutableSeq object as a dictionary key.
jpayne@69 2183
jpayne@69 2184 >>> from Bio.Seq import MutableSeq
jpayne@69 2185 >>> my_seq = MutableSeq("ACTCGTCGTCG")
jpayne@69 2186 >>> my_seq
jpayne@69 2187 MutableSeq('ACTCGTCGTCG')
jpayne@69 2188 >>> my_seq[5]
jpayne@69 2189 'T'
jpayne@69 2190 >>> my_seq[5] = "A"
jpayne@69 2191 >>> my_seq
jpayne@69 2192 MutableSeq('ACTCGACGTCG')
jpayne@69 2193 >>> my_seq[5]
jpayne@69 2194 'A'
jpayne@69 2195 >>> my_seq[5:8] = "NNN"
jpayne@69 2196 >>> my_seq
jpayne@69 2197 MutableSeq('ACTCGNNNTCG')
jpayne@69 2198 >>> len(my_seq)
jpayne@69 2199 11
jpayne@69 2200
jpayne@69 2201 Note that the MutableSeq object does not support as many string-like
jpayne@69 2202 or biological methods as the Seq object.
jpayne@69 2203 """
jpayne@69 2204
def __init__(self, data):
    """Create a MutableSeq object.

    ``data`` may be a string, a bytes or bytearray object, a Seq object
    or another MutableSeq object; anything else raises a ``TypeError``.
    A bytearray is stored as is (not copied), whereas the contents of all
    other inputs are copied into a new bytearray.
    """
    if isinstance(data, bytearray):
        self._data = data
        return
    if isinstance(data, bytes):
        self._data = bytearray(data)
        return
    if isinstance(data, str):
        self._data = bytearray(data, "ASCII")
        return
    if isinstance(data, MutableSeq):
        self._data = data._data[:]  # Take a copy
        return
    if isinstance(data, Seq):
        # Make no assumptions about the Seq subclass internal storage
        self._data = bytearray(bytes(data))
        return
    raise TypeError(
        "data should be a string, bytearray object, Seq object, or a "
        "MutableSeq object"
    )
jpayne@69 2223
def __setitem__(self, index, value):
    """Set a single letter or a subsequence via the value parameter.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq[0] = 'T'
    >>> my_seq
    MutableSeq('TCTCGACGTCG')
    >>> my_seq[5:8] = 'NNN'
    >>> my_seq
    MutableSeq('TCTCGNNNTCG')

    For an integer index, ``value`` is a single letter. For any other
    index (such as a slice), ``value`` must be a MutableSeq object, a Seq
    object or a string; otherwise a ``TypeError`` is raised.
    """
    if isinstance(index, numbers.Integral):
        # Replacing a single letter with a new letter
        self._data[index] = ord(value)
        return
    # Replacing a sub-sequence
    if isinstance(value, MutableSeq):
        replacement = value._data
    elif isinstance(value, Seq):
        replacement = bytes(value)
    elif isinstance(value, str):
        replacement = value.encode("ASCII")
    else:
        raise TypeError(f"received unexpected type '{type(value).__name__}'")
    self._data[index] = replacement
jpayne@69 2245
def __delitem__(self, index):
    """Delete a single letter or a subsequence.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> del my_seq[0]
    >>> my_seq
    MutableSeq('CTCGACGTCG')
    """
    # ``index`` may be an integer (one letter) or a slice; either way the
    # deletion is delegated to the underlying data.
    del self._data[index]
jpayne@69 2256
def append(self, c):
    """Add a single letter to the end of the mutable sequence object.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.append('A')
    >>> my_seq
    MutableSeq('ACTCGACGTCGA')

    The letter ``c`` must be a string holding one ASCII character.

    No return value.
    """
    encoded = c.encode("ASCII")
    self._data.append(ord(encoded))
jpayne@69 2268
def insert(self, i, c):
    """Add a single letter to the mutable sequence object at a given index.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.insert(0,'A')
    >>> my_seq
    MutableSeq('AACTCGACGTCG')
    >>> my_seq.insert(8,'G')
    >>> my_seq
    MutableSeq('AACTCGACGGTCG')

    The letter ``c`` must be a string holding one ASCII character.

    No return value.
    """
    encoded = c.encode("ASCII")
    self._data.insert(i, ord(encoded))
jpayne@69 2283
def pop(self, i=-1):
    """Remove the single letter at the given index and return it.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.pop()
    'G'
    >>> my_seq
    MutableSeq('ACTCGACGTC')
    >>> my_seq.pop()
    'C'
    >>> my_seq
    MutableSeq('ACTCGACGT')
    >>> my_seq.pop(0)
    'A'
    >>> my_seq
    MutableSeq('CTCGACGT')

    Returns the removed letter as a string of length one. The index ``i``
    defaults to -1, i.e. the last letter of the sequence. An ``IndexError``
    is raised if the index is out of range.
    """
    # bytearray.pop removes the byte and hands back its integer value.
    return chr(self._data.pop(i))
jpayne@69 2302
def remove(self, item):
    """Remove the first occurrence of a single letter from the sequence.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.remove('C')
    >>> my_seq
    MutableSeq('ATCGACGTCG')
    >>> my_seq.remove('A')
    >>> my_seq
    MutableSeq('TCGACGTCG')

    No return value. Raises a ValueError if the letter is not found.
    """
    try:
        self._data.remove(ord(item))
    except ValueError:
        # Replace the container's own message with one that refers to the
        # sequence object, and hide the internal exception from tracebacks.
        raise ValueError("value not found in MutableSeq") from None
jpayne@69 2321
def reverse(self):
    """Reverse the order of the letters of the mutable sequence in place.

    No return value.
    """
    data = self._data
    data.reverse()
jpayne@69 2328
def extend(self, other):
    """Append the letters of another sequence to this mutable sequence.

    The argument may be a string, a Seq or a MutableSeq; any other type
    raises a TypeError.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.extend('A')
    >>> my_seq
    MutableSeq('ACTCGACGTCGA')
    >>> my_seq.extend('TTT')
    >>> my_seq
    MutableSeq('ACTCGACGTCGATTT')

    No return value.
    """
    # Work out the raw letters to add first, then extend exactly once.
    if isinstance(other, MutableSeq):
        addition = other._data
    elif isinstance(other, Seq):
        addition = bytes(other)
    elif isinstance(other, str):
        addition = other.encode("ASCII")
    else:
        raise TypeError("expected a string, Seq or MutableSeq")
    self._data.extend(addition)
jpayne@69 2350
jpayne@69 2351
class UndefinedSequenceError(ValueError):
    """Error raised when the contents of a sequence are undefined."""
jpayne@69 2354
jpayne@69 2355
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Sequence data of known length but undefined contents (PRIVATE).

    Objects of this class can be used to create a Seq object representing a
    sequence whose length is known while its letters are not.
    __len__ returns the sequence length. __getitem__ accepts slices only:
    a slice of zero size gives an empty bytes object, any other slice gives
    a new _UndefinedSequenceData object of the sliced length. Every other
    key raises an UndefinedSequenceError.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Store the sequence length.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        # Set the length before calling the base class initializer, as in
        # the original ordering of these two statements.
        self._length = length
        super().__init__()

    def __getitem__(self, key: slice) -> Union[bytes, "_UndefinedSequenceData"]:
        if not isinstance(key, slice):
            # A single letter cannot be returned, as no letter is known.
            raise UndefinedSequenceError("Sequence content is undefined")
        size = len(range(*key.indices(self._length)))
        if size == 0:
            return b""
        return _UndefinedSequenceData(size)

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        total = len(self) + len(other)
        try:
            defined = bytes(other)
        except UndefinedSequenceError:
            if not isinstance(other, _UndefinedSequenceData):
                # _PartiallyDefinedSequenceData.__radd__ will handle this
                return NotImplemented
            return _UndefinedSequenceData(total)
        # The defined letters start right after this undefined stretch.
        return _PartiallyDefinedSequenceData(total, {len(self): defined})

    def __radd__(self, other):
        # The defined letters of other come first, followed by this
        # undefined stretch.
        segments = {0: bytes(other)}
        total = len(other) + len(self)
        return _PartiallyDefinedSequenceData(total, segments)

    def upper(self):
        """Return an upper case copy of the sequence."""
        # Changing the case of an undefined sequence gives an undefined
        # sequence of the same length.
        return _UndefinedSequenceData(self._length)

    def lower(self):
        """Return a lower case copy of the sequence."""
        # Changing the case of an undefined sequence gives an undefined
        # sequence of the same length.
        return _UndefinedSequenceData(self._length)

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        # The case cannot be determined for an undefined sequence.
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        # The case cannot be determined for an undefined sequence.
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # If old and new have the same number of characters, the result is
        # an undefined sequence of unchanged length. Otherwise the length of
        # the result cannot be known.
        if len(old) == len(new):
            return _UndefinedSequenceData(self._length)
        raise UndefinedSequenceError("Sequence content is undefined")

    @property
    def defined(self):
        """Return False, as the sequence is not defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        No part of an _UndefinedSequenceData object is defined, so the
        return value is always an empty tuple.
        """
        return ()
jpayne@69 2462
jpayne@69 2463
class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores a sequence of known length with partially defined contents (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but with a sequence contents that is only
    partially known.
    Calling __len__ returns the sequence length, calling __getitem__ returns
    the sequence contents if known, otherwise an UndefinedSequenceError is
    raised.

    The defined segments are kept in the dictionary ``_data``, which maps the
    start position of each segment to its letters. The methods below rely on
    the dictionary being ordered by start position (for example, __add__
    treats the last item as the final segment).
    """

    __slots__ = ("_length", "_data")

    def __init__(self, length, data):
        """Initialize with the sequence length and defined sequence segments.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        # Both attributes are set before calling the base class initializer,
        # as in the original ordering of these statements.
        self._length = length
        self._data = data
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[bytes, SequenceDataAbstractBaseClass]:
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            data = {}
            for s, d in self._data.items():
                # Express the requested positions relative to the start of
                # this segment.
                indices = range(-s, -s + self._length)[key]
                e: Optional[int] = indices.stop
                assert e is not None
                if step > 0:
                    if e <= 0:
                        # The slice ends before this segment starts.
                        continue
                    if indices.start < 0:
                        # First requested position inside the segment.
                        s = indices.start % step
                    else:
                        s = indices.start
                else:  # step < 0
                    if e < 0:
                        e = None
                    end = len(d) - 1
                    if indices.start > end:
                        # First requested position inside the segment when
                        # walking backwards.
                        s = end + (indices.start - end) % step
                    else:
                        s = indices.start
                    if s < 0:
                        continue
                # Position of the sliced segment in the resulting sequence.
                start = (s - indices.start) // step
                d = d[s:e:step]
                if d:
                    data[start] = d
            if len(data) == 0:  # Fully undefined sequence
                return _UndefinedSequenceData(size)
            # merge adjacent sequence segments
            end = -1
            previous = 0  # not needed here, but it keeps flake happy
            items = data.items()
            data = {}
            for start, seq in items:
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
            if len(data) == 1:
                seq = data.get(0)
                if seq is not None and len(seq) == size:
                    return seq  # Fully defined sequence; return bytes
            if step < 0:
                # use this after we drop Python 3.7:
                # data = {start: data[start] for start in reversed(data)}
                # use this as long as we support Python 3.7:
                data = {start: data[start] for start in reversed(list(data.keys()))}
            return _PartiallyDefinedSequenceData(size, data)
        # Integer index. Negative indices count from the end of the sequence,
        # as for any other Python sequence; without this normalization they
        # would never match a segment and always be reported as undefined.
        if key < 0:
            key += self._length
            if key < 0:
                raise IndexError("sequence index out of range")
        elif self._length <= key:
            raise IndexError("sequence index out of range")
        for start, seq in self._data.items():
            if start <= key and key < start + len(seq):
                return seq[key - start]
        raise UndefinedSequenceError("Sequence at position %d is undefined" % key)

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def __add__(self, other):
        length = len(self) + len(other)
        data = dict(self._data)
        items = list(self._data.items())
        # The last segment may have to be joined with the start of other.
        start, seq = items[-1]
        end = start + len(seq)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                # Nothing defined to add; only the length grows.
                pass
            elif isinstance(other, _PartiallyDefinedSequenceData):
                other_items = list(other._data.items())
                if end == len(self):
                    # This sequence ends with defined letters: join them with
                    # the first segment of other if that starts at 0.
                    other_start, other_seq = other_items.pop(0)
                    if other_start == 0:
                        data[start] += other_seq
                    else:
                        data[len(self) + other_start] = other_seq
                for other_start, other_seq in other_items:
                    data[len(self) + other_start] = other_seq
        else:
            if end == len(self):
                data[start] += other
            else:
                data[len(self)] = other
        return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        length = len(other) + len(self)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            # other has no defined letters; just shift the segments.
            data = {len(other) + start: seq for start, seq in self._data.items()}
        else:
            data = {0: other}
            items = list(self._data.items())
            start, seq = items.pop(0)
            if start == 0:
                # This sequence starts with defined letters: join them with
                # the letters of other.
                data[0] += seq
            else:
                data[len(other) + start] = seq
            for start, seq in items:
                data[len(other) + start] = seq
        return _PartiallyDefinedSequenceData(length, data)

    def __mul__(self, other):
        if other <= 0:
            # As for bytes objects, repeating zero or fewer times gives an
            # empty sequence. Without this check an object with a zero or
            # negative length would be created.
            return b""
        length = self._length
        items = self._data.items()
        data = {}
        end = -1
        previous = 0  # not needed here, but it keeps flake happy
        for i in range(other):
            for start, seq in items:
                start += i * length
                if end == start:
                    # Join a segment ending one copy with a segment starting
                    # the next copy.
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
        return _PartiallyDefinedSequenceData(length * other, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        data = {start: seq.upper() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def lower(self):
        """Return a lower case copy of the sequence."""
        data = {start: seq.lower() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        # The case cannot be determined while part of the sequence is
        # undefined.
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        # The case cannot be determined while part of the sequence is
        # undefined.
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        items = self._data.items()
        data = {start: seq.translate(table, delete) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # Replacing substring old by new in the undefined sequence segments
        # will result in an undefined sequence segment of the same length, if
        # old and new have the same number of characters. If not, an error is
        # raised, as the correct start positions cannot be calculated reliably.
        if len(old) != len(new):
            raise UndefinedSequenceError(
                "Sequence content is only partially defined; substring \n"
                "replacement cannot be performed reliably"
            )
        items = self._data.items()
        data = {start: seq.replace(old, new) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    @property
    def defined(self):
        """Return False, as the sequence is not fully defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        return tuple((start, start + len(seq)) for start, seq in self._data.items())
jpayne@69 2688
jpayne@69 2689
# The transcribe, back_transcribe, and translate functions are
# user-friendly versions of the corresponding Seq/MutableSeq methods.
# The functions work both on Seq objects, and on strings.
jpayne@69 2693
jpayne@69 2694
def transcribe(dna):
    """Transcribe a DNA sequence into RNA.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. This
    means we can get the RNA sequence just by switching T to U.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> transcribe("ACTGN")
    'ACUGN'
    """
    if isinstance(dna, Seq):
        return dna.transcribe()
    if isinstance(dna, MutableSeq):
        # Convert to a Seq first, so that a Seq object is returned.
        as_seq = Seq(dna)
        return as_seq.transcribe()
    # Anything else is assumed to be a string; handle both letter cases.
    upper_done = dna.replace("T", "U")
    return upper_done.replace("t", "u")
jpayne@69 2717
jpayne@69 2718
def back_transcribe(rna):
    """Return the RNA sequence back-transcribed into DNA.

    This is done by switching U to T.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> back_transcribe("ACUGN")
    'ACTGN'
    """
    if isinstance(rna, Seq):
        return rna.back_transcribe()
    if isinstance(rna, MutableSeq):
        # Convert to a Seq first, so that a Seq object is returned.
        as_seq = Seq(rna)
        return as_seq.back_transcribe()
    # Anything else is assumed to be a string; handle both letter cases.
    upper_done = rna.replace("U", "T")
    return upper_done.replace("u", "t")
jpayne@69 2737
jpayne@69 2738
def _translate_str(
    sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
):
    """Translate nucleotide string into a protein string (PRIVATE).

    Arguments:
     - sequence - a string
     - table - Which codon table to use?  This can be either a name (string),
       an NCBI identifier (integer), or a CodonTable object (useful for
       non-standard genetic codes).
     - stop_symbol - a single character string, what to use for terminators.
     - to_stop - boolean, should translation terminate at the first
       in frame stop codon?  If there is no in-frame stop codon
       then translation continues to the end.
     - pos_stop - a single character string for a possible stop codon
       (e.g. TAN or NNN)
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.

    Returns a string.

    e.g.

    >>> from Bio.Data import CodonTable
    >>> table = CodonTable.ambiguous_dna_by_id[1]
    >>> _translate_str("AAA", table)
    'K'
    >>> _translate_str("TAR", table)
    '*'
    >>> _translate_str("TAN", table)
    'X'
    >>> _translate_str("TAN", table, pos_stop="@")
    '@'
    >>> _translate_str("TA?", table)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid

    In a change to older versions of Biopython, partial codons are now
    always regarded as an error (previously only checked if cds=True)
    and will trigger a warning (likely to become an exception in a
    future release).

    If **cds=True**, the start and stop codons are checked, and the start
    codon will be translated as methionine. The sequence must be a
    whole number of codons.

    >>> _translate_str("ATGCCCTAG", table, cds=True)
    'MP'
    >>> _translate_str("AAACCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
    >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found.
    """
    # Step 1: work out which codon table to use.
    try:
        table_id = int(table)
    except ValueError:
        # Not a number, so assume it is a table name.
        # The same table can be used for RNA or DNA
        try:
            genetic_code = CodonTable.ambiguous_generic_by_name[table]
        except KeyError:
            if isinstance(table, str):
                raise ValueError(
                    "The Bio.Seq translate methods and function DO NOT "
                    "take a character string mapping table like the python "
                    "string object's translate method. "
                    "Use str(my_seq).translate(...) instead."
                ) from None
            else:
                raise TypeError("table argument must be integer or string") from None
    except (AttributeError, TypeError):
        # Cannot be converted to a number at all, so it should be a
        # CodonTable object.
        if not isinstance(table, CodonTable.CodonTable):
            raise ValueError("Bad table argument") from None
        genetic_code = table
    else:
        # A number, so assume it is a table ID.
        # The same table can be used for RNA or DNA
        genetic_code = CodonTable.ambiguous_generic_by_id[table_id]

    # Step 2: collect what is needed from the table and the sequence.
    sequence = sequence.upper()
    protein = []
    forward_table = genetic_code.forward_table
    stop_codons = genetic_code.stop_codons
    alphabet = genetic_code.nucleotide_alphabet
    if alphabet is None:
        # Assume the worst case, ambiguous DNA or RNA:
        alphabet = (
            IUPACData.ambiguous_dna_letters.upper()
            + IUPACData.ambiguous_rna_letters.upper()
        )
    else:
        alphabet = alphabet.upper()
    valid_letters = set(alphabet)
    n = len(sequence)

    # Step 3: check for tables with 'ambiguous' (dual-coding) stop codons,
    # i.e. stop codons that also have an entry in the forward table.
    dual_coding = [stop for stop in stop_codons if stop in forward_table]
    if dual_coding:
        c = dual_coding[0]
        if to_stop:
            raise ValueError(
                "You cannot use 'to_stop=True' with this table as it contains"
                f" {len(dual_coding)} codon(s) which can be both STOP and an"
                f" amino acid (e.g. '{c}' -> '{forward_table[c]}' or STOP)."
            )
        warnings.warn(
            f"This table contains {len(dual_coding)} codon(s) which code(s) for"
            f" both STOP and an amino acid (e.g. '{c}' -> '{forward_table[c]}'"
            " or STOP). Such codons will be translated as amino acid.",
            BiopythonWarning,
        )

    # Step 4: for a complete CDS, check the start codon, the length and the
    # stop codon; otherwise only warn about a partial trailing codon.
    if cds:
        if str(sequence[:3]).upper() not in genetic_code.start_codons:
            raise CodonTable.TranslationError(
                f"First codon '{sequence[:3]}' is not a start codon"
            )
        if n % 3 != 0:
            raise CodonTable.TranslationError(
                f"Sequence length {n} is not a multiple of three"
            )
        if str(sequence[-3:]).upper() not in stop_codons:
            raise CodonTable.TranslationError(
                f"Final codon '{sequence[-3:]}' is not a stop codon"
            )
        # Don't translate the stop symbol, and manually translate the M
        sequence = sequence[3:-3]
        n -= 6
        protein.append("M")
    elif n % 3 != 0:
        warnings.warn(
            "Partial codon, len(sequence) not a multiple of three. "
            "Explicitly trim the sequence or add trailing N before "
            "translation. This may become an error in future.",
            BiopythonWarning,
        )

    # Step 5: check the gap character.
    if gap is not None:
        if not isinstance(gap, str):
            raise TypeError("Gap character should be a single character string.")
        elif len(gap) > 1:
            raise ValueError("Gap character should be a single character string.")

    # Step 6: translate codon by codon, ignoring any partial trailing codon.
    for offset in range(0, n - n % 3, 3):
        codon = sequence[offset : offset + 3]
        try:
            protein.append(forward_table[codon])
        except (KeyError, CodonTable.TranslationError):
            # The codon has no unambiguous translation; decide which symbol
            # to use for it instead.
            if codon in genetic_code.stop_codons:
                if cds:
                    raise CodonTable.TranslationError(
                        f"Extra in frame stop codon '{codon}' found."
                    ) from None
                if to_stop:
                    break
                symbol = stop_symbol
            elif set(codon) <= valid_letters:
                # Possible stop codon (e.g. NNN or TAN)
                symbol = pos_stop
            elif gap is not None and codon == gap * 3:
                # Gapped translation
                symbol = gap
            else:
                raise CodonTable.TranslationError(
                    f"Codon '{codon}' is invalid"
                ) from None
            protein.append(symbol)
    return "".join(protein)
jpayne@69 2916
jpayne@69 2917
def translate(
    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object. Given a Seq or
    MutableSeq, returns a Seq object.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable object
       (useful for non-standard genetic codes).  Defaults to the "Standard"
       table.
     - stop_symbol - Single character string, what to use for any
       terminators, defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons
       (translated as the specified stop_symbol).  If
       True, translation is terminated at the first in
       frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None. For a Seq or MutableSeq the gap argument is
       passed on to the translate method only if it is not None;
       otherwise the method is called without it, as before.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    It will however translate either DNA or RNA.

    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
    stop codons'. These are stop codons with unambiguous sequence but which
    have a context dependent coding as STOP or as amino acid. With these tables
    'to_stop' must be False (otherwise a ValueError is raised). The dual
    coding codons will always be translated as amino acid, except for
    'cds=True', where the last codon will be translated as STOP.

    >>> coding_dna3 = "ATGGCACGGAAGTGA"
    >>> translate(coding_dna3)
    'MARK*'

    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
    'MARKW'

    It will however raise a BiopythonWarning (not shown).

    >>> translate(coding_dna3, table=27, cds=True)
    'MARK'

    >>> translate(coding_dna3, table=27, to_stop=True)
    Traceback (most recent call last):
       ...
    ValueError: You cannot use 'to_stop=True' with this table ...
    """
    if not isinstance(sequence, (Seq, MutableSeq)):
        # Assume it's a string, return a string
        return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)
    if not isinstance(sequence, Seq):
        # A MutableSeq: convert it, so that a Seq object is returned
        sequence = Seq(sequence)
    if gap is None:
        # No gap character requested: call the method exactly as before, so
        # that it applies its own default.
        return sequence.translate(table, stop_symbol, to_stop, cds)
    # Previously a gap character given by the caller was silently ignored
    # for Seq and MutableSeq objects; pass it on.
    # NOTE(review): relies on the translate method accepting a "gap" keyword
    # argument - confirm against its definition earlier in this file.
    return sequence.translate(table, stop_symbol, to_stop, cds, gap=gap)
jpayne@69 3020
jpayne@69 3021
def reverse_complement(sequence, inplace=False):
    """Return the reverse complement as a DNA sequence.

    The type of the result mirrors the type of the argument: a string
    gives a new string, a Seq a new Seq, a MutableSeq a new MutableSeq,
    and a SeqRecord a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement(my_seq)
    'TCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement(my_seq)
    Seq('TCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> reverse_complement(Seq("CGAUT"))
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement("ACGTUacgtuXYZxyz")
    'zrxZRXaacgtAACGT'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq, inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    Strings, ``Seq`` objects and ``SeqRecord`` objects are immutable, so
    a ``TypeError`` is raised if ``reverse_complement`` is called on one
    of them with ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Seq-like objects handle the inplace flag themselves.
        return sequence.reverse_complement(inplace)

    is_record = isinstance(sequence, SeqRecord)
    if inplace:
        # Neither SeqRecords nor plain strings can be changed in-place.
        if is_record:
            raise TypeError("SeqRecords are immutable")
        raise TypeError("strings are immutable")
    if is_record:
        return sequence.reverse_complement()

    # Assume it's a string: complement at the bytes level, then reverse.
    complemented = sequence.encode("ASCII").translate(_dna_complement_table)
    return complemented.decode("ASCII")[::-1]
jpayne@69 3085
jpayne@69 3086
def reverse_complement_rna(sequence, inplace=False):
    """Return the reverse complement as an RNA sequence.

    The type of the result mirrors the type of the argument: a string
    gives a new string, a Seq a new Seq, a MutableSeq a new MutableSeq,
    and a SeqRecord a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement_rna(my_seq)
    'UCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement_rna(my_seq)
    Seq('UCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> reverse_complement(Seq("CGAUT"), inplace=False)
    Seq('AATCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement_rna("ACGTUacgtuXYZxyz")
    'zrxZRXaacguAACGU'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq, inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    Strings, ``Seq`` objects and ``SeqRecord`` objects are immutable, so
    a ``TypeError`` is raised if ``reverse_complement_rna`` is called on
    one of them with ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Seq-like objects handle the inplace flag themselves.
        return sequence.reverse_complement_rna(inplace)

    is_record = isinstance(sequence, SeqRecord)
    if inplace:
        # Neither SeqRecords nor plain strings can be changed in-place.
        if is_record:
            raise TypeError("SeqRecords are immutable")
        raise TypeError("strings are immutable")
    if is_record:
        return sequence.reverse_complement_rna()

    # Assume it's a string: complement at the bytes level, then reverse.
    complemented = sequence.encode("ASCII").translate(_rna_complement_table)
    return complemented.decode("ASCII")[::-1]
jpayne@69 3150
jpayne@69 3151
def complement(sequence, inplace=False):
    """Return the complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement(my_seq)
    'GCT'
    >>> my_seq = Seq("CGA")
    >>> complement(my_seq)
    Seq('GCT')
    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement("ACGTUacgtuXYZxyz")
    'TGCAAtgcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq, inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As strings, ``Seq`` objects and ``SeqRecord`` objects are immutable,
    a ``TypeError`` is raised if ``complement`` is called on one of them
    with ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Seq-like objects handle the inplace flag themselves.
        return sequence.complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement()
    # Assume it's a string.
    # Test truthiness (not identity with True) so that any truthy inplace
    # value is rejected, exactly as in the SeqRecord branch above and in
    # the sibling complement/reverse-complement functions.
    if inplace:
        raise TypeError("strings are immutable")
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_dna_complement_table)
    return sequence.decode("ASCII")
jpayne@69 3214
jpayne@69 3215
def complement_rna(sequence, inplace=False):
    """Return the complement as an RNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement_rna(my_seq)
    'GCU'
    >>> my_seq = Seq("CGA")
    >>> complement_rna(my_seq)
    Seq('GCU')
    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement_rna("ACGTUacgtuXYZxyz")
    'UGCAAugcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq, inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As strings, ``Seq`` objects and ``SeqRecord`` objects are immutable,
    a ``TypeError`` is raised if ``complement_rna`` is called on one of
    them with ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # Seq-like objects handle the inplace flag themselves.
        return sequence.complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement_rna()
    # Assume it's a string.
    if inplace:
        raise TypeError("strings are immutable")
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_rna_complement_table)
    return sequence.decode("ASCII")
jpayne@69 3278
jpayne@69 3279
def _test():
    """Run the doctests embedded in the Bio.Seq module (PRIVATE).

    A short progress message is printed before and after the run.
    """
    import doctest

    print("Running doctests...")
    # Only the exception type has to match in expected tracebacks.
    flags = doctest.IGNORE_EXCEPTION_DETAIL
    doctest.testmod(optionflags=flags)
    print("Done")
jpayne@69 3287
jpayne@69 3288
if __name__ == "__main__":
    # Executed as a script rather than imported: run the doctests.
    _test()