annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # Copyright 2000 Andrew Dalke.
jpayne@68 2 # Copyright 2000-2002 Brad Chapman.
jpayne@68 3 # Copyright 2004-2005, 2010 by M de Hoon.
jpayne@68 4 # Copyright 2007-2023 by Peter Cock.
jpayne@68 5 # All rights reserved.
jpayne@68 6 #
jpayne@68 7 # This file is part of the Biopython distribution and governed by your
jpayne@68 8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@68 9 # Please see the LICENSE file that should have been included as part of this
jpayne@68 10 # package.
jpayne@68 11 """Provide objects to represent biological sequences.
jpayne@68 12
jpayne@68 13 See also the Seq_ wiki and the chapter in our tutorial:
jpayne@68 14 - `HTML Tutorial`_
jpayne@68 15 - `PDF Tutorial`_
jpayne@68 16
jpayne@68 17 .. _Seq: http://biopython.org/wiki/Seq
jpayne@68 18 .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
jpayne@68 19 .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
jpayne@68 20
jpayne@68 21 """
jpayne@68 22 import array
jpayne@68 23 import collections
jpayne@68 24 import numbers
jpayne@68 25 import warnings
jpayne@68 26
jpayne@68 27 from abc import ABC
jpayne@68 28 from abc import abstractmethod
jpayne@68 29 from typing import overload, Optional, Union, Dict
jpayne@68 30
jpayne@68 31 from Bio import BiopythonWarning
jpayne@68 32 from Bio.Data import CodonTable
jpayne@68 33 from Bio.Data import IUPACData
jpayne@68 34
jpayne@68 35
jpayne@68 36 def _maketrans(complement_mapping):
jpayne@68 37 """Make a python string translation table (PRIVATE).
jpayne@68 38
jpayne@68 39 Arguments:
jpayne@68 40 - complement_mapping - a dictionary such as ambiguous_dna_complement
jpayne@68 41 and ambiguous_rna_complement from Data.IUPACData.
jpayne@68 42
jpayne@68 43 Returns a translation table (a bytes object of length 256) for use with
jpayne@68 44 the python string's translate method to use in a (reverse) complement.
jpayne@68 45
jpayne@68 46 Compatible with lower case and upper case sequences.
jpayne@68 47
jpayne@68 48 For internal use only.
jpayne@68 49 """
jpayne@68 50 keys = "".join(complement_mapping.keys()).encode("ASCII")
jpayne@68 51 values = "".join(complement_mapping.values()).encode("ASCII")
jpayne@68 52 return bytes.maketrans(keys + keys.lower(), values + values.lower())
jpayne@68 53
jpayne@68 54
# Translation table for complementing DNA. "U" is given the same complement
# as "T" so that a stray RNA letter is complemented too.
_dna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_dna_complement,
        "U": IUPACData.ambiguous_dna_complement["T"],
    }
)
# Translation table for complementing RNA. "T" is given the same complement
# as "U" so that a stray DNA letter is complemented too.
_rna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_rna_complement,
        "T": IUPACData.ambiguous_rna_complement["U"],
    }
)
jpayne@68 63
jpayne@68 64
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

    * ``__len__`` must return the sequence length;
    * ``__getitem__`` must return

      * a ``bytes`` object for the requested region; or
      * a new instance of the subclass for the requested region; or
      * raise an ``UndefinedSequenceError``.

      Calling ``__getitem__`` for a sequence region of size zero should always
      return an empty ``bytes`` object.
      Calling ``__getitem__`` for the full sequence (as in data[:]) should
      either return a ``bytes`` object with the full sequence, or raise an
      ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    All other methods defined here obtain the full contents via
    ``bytes(self)`` (that is, ``self[:]``) and delegate to the corresponding
    ``bytes`` method.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        """Return the sequence length (must be implemented by subclasses)."""

    @abstractmethod
    def __getitem__(self, key):
        """Return the data for the requested region (must be implemented by subclasses)."""

    def __bytes__(self):
        """Return the full sequence contents by slicing the whole range."""
        return self[:]

    def __hash__(self):
        """Return the hash of the full sequence contents as ``bytes``."""
        return hash(bytes(self))

    def __eq__(self, other):
        """Compare the full sequence contents to other for equality."""
        return bytes(self) == other

    def __lt__(self, other):
        """Implement the less-than operator on the full sequence contents."""
        return bytes(self) < other

    def __le__(self, other):
        """Implement the less-than or equal operator on the full sequence contents."""
        return bytes(self) <= other

    def __gt__(self, other):
        """Implement the greater-than operator on the full sequence contents."""
        return bytes(self) > other

    def __ge__(self, other):
        """Implement the greater-than or equal operator on the full sequence contents."""
        return bytes(self) >= other

    def __add__(self, other):
        """Return the concatenation of the contents of self and other as bytes.

        Returns NotImplemented if converting either operand to ``bytes``
        raises an ``UndefinedSequenceError``.
        """
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            return NotImplemented
            # will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__

    def __radd__(self, other):
        """Return other concatenated with the contents of self."""
        return other + bytes(self)

    def __mul__(self, other):
        """Return the contents of self repeated other times."""
        return other * bytes(self)

    def __contains__(self, item):
        """Return True if item is contained in the full sequence contents."""
        return bytes(self).__contains__(item)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present.

        Returns the data as ``bytes`` without the leading prefix, or the
        unchanged data if it does not start with prefix.
        """
        # Want to do just this, but need Python 3.9+
        # return bytes(self).removeprefix(prefix)
        data = bytes(self)
        try:
            return data.removeprefix(prefix)
        except AttributeError:
            # bytes.removeprefix is not available before Python 3.9
            if data.startswith(prefix):
                return data[len(prefix) :]
            else:
                return data

    def removesuffix(self, suffix):
        """Remove the suffix if present.

        Returns the data as ``bytes`` without the trailing suffix, or the
        unchanged data if it does not end with suffix (or if suffix is empty).
        """
        # Want to do just this, but need Python 3.9+
        # return bytes(self).removesuffix(suffix)
        data = bytes(self)
        try:
            return data.removesuffix(suffix)
        except AttributeError:
            # bytes.removesuffix is not available before Python 3.9.
            # The suffix must be non-empty: data[:-0] would be empty
            # instead of the unchanged data.
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            else:
                return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

        table
          Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        else:
            return ()
jpayne@68 360
jpayne@68 361
jpayne@68 362 class _SeqAbstractBaseClass(ABC):
jpayne@68 363 """Abstract base class for the Seq and MutableSeq classes (PRIVATE).
jpayne@68 364
jpayne@68 365 Most users will not need to use this class. It is used internally as an
jpayne@68 366 abstract base class for Seq and MutableSeq, as most of their methods are
jpayne@68 367 identical.
jpayne@68 368 """
jpayne@68 369
jpayne@68 370 __slots__ = ("_data",)
jpayne@68 371 __array_ufunc__ = None # turn off numpy Ufuncs
jpayne@68 372
jpayne@68 373 @abstractmethod
jpayne@68 374 def __init__(self):
jpayne@68 375 pass
jpayne@68 376
jpayne@68 377 def __bytes__(self):
jpayne@68 378 return bytes(self._data)
jpayne@68 379
def __repr__(self):
    """Return (truncated) representation of the sequence.

    Data longer than 60 letters are shown as the first 54 letters, three
    dots, and the last 3 letters. Undefined and partially defined sequence
    data are shown together with the sequence length.
    """

    def shorten(raw):
        # Shows the last three letters as it is often useful to see if
        # there is a stop codon at the end of a sequence.
        # Note total length is 54+3+3=60
        if len(raw) > 60:
            head = raw[:54].decode("ASCII")
            tail = raw[-3:].decode("ASCII")
            return f"{head}...{tail}"
        return raw.decode("ASCII")

    data = self._data
    if isinstance(data, _UndefinedSequenceData):
        return f"Seq(None, length={len(self)})"
    if isinstance(data, _PartiallyDefinedSequenceData):
        # Show each defined fragment, keyed by the key it is stored under
        fragments = {position: shorten(seq) for position, seq in data._data.items()}
        return "Seq(%r, length=%d)" % (fragments, len(self))
    return f"{self.__class__.__name__}('{shorten(data)}')"
jpayne@68 406
jpayne@68 407 def __str__(self):
jpayne@68 408 """Return the full sequence as a python string."""
jpayne@68 409 return self._data.decode("ASCII")
jpayne@68 410
jpayne@68 411 def __eq__(self, other):
jpayne@68 412 """Compare the sequence to another sequence or a string.
jpayne@68 413
jpayne@68 414 Sequences are equal to each other if their sequence contents is
jpayne@68 415 identical:
jpayne@68 416
jpayne@68 417 >>> from Bio.Seq import Seq, MutableSeq
jpayne@68 418 >>> seq1 = Seq("ACGT")
jpayne@68 419 >>> seq2 = Seq("ACGT")
jpayne@68 420 >>> mutable_seq = MutableSeq("ACGT")
jpayne@68 421 >>> seq1 == seq2
jpayne@68 422 True
jpayne@68 423 >>> seq1 == mutable_seq
jpayne@68 424 True
jpayne@68 425 >>> seq1 == "ACGT"
jpayne@68 426 True
jpayne@68 427
jpayne@68 428 Note that the sequence objects themselves are not identical to each
jpayne@68 429 other:
jpayne@68 430
jpayne@68 431 >>> id(seq1) == id(seq2)
jpayne@68 432 False
jpayne@68 433 >>> seq1 is seq2
jpayne@68 434 False
jpayne@68 435
jpayne@68 436 Sequences can also be compared to strings, ``bytes``, and ``bytearray``
jpayne@68 437 objects:
jpayne@68 438
jpayne@68 439 >>> seq1 == "ACGT"
jpayne@68 440 True
jpayne@68 441 >>> seq1 == b"ACGT"
jpayne@68 442 True
jpayne@68 443 >>> seq1 == bytearray(b"ACGT")
jpayne@68 444 True
jpayne@68 445 """
jpayne@68 446 if isinstance(other, _SeqAbstractBaseClass):
jpayne@68 447 return self._data == other._data
jpayne@68 448 elif isinstance(other, str):
jpayne@68 449 return self._data == other.encode("ASCII")
jpayne@68 450 else:
jpayne@68 451 return self._data == other
jpayne@68 452
jpayne@68 453 def __lt__(self, other):
jpayne@68 454 """Implement the less-than operand."""
jpayne@68 455 if isinstance(other, _SeqAbstractBaseClass):
jpayne@68 456 return self._data < other._data
jpayne@68 457 elif isinstance(other, str):
jpayne@68 458 return self._data < other.encode("ASCII")
jpayne@68 459 else:
jpayne@68 460 return self._data < other
jpayne@68 461
jpayne@68 462 def __le__(self, other):
jpayne@68 463 """Implement the less-than or equal operand."""
jpayne@68 464 if isinstance(other, _SeqAbstractBaseClass):
jpayne@68 465 return self._data <= other._data
jpayne@68 466 elif isinstance(other, str):
jpayne@68 467 return self._data <= other.encode("ASCII")
jpayne@68 468 else:
jpayne@68 469 return self._data <= other
jpayne@68 470
jpayne@68 471 def __gt__(self, other):
jpayne@68 472 """Implement the greater-than operand."""
jpayne@68 473 if isinstance(other, _SeqAbstractBaseClass):
jpayne@68 474 return self._data > other._data
jpayne@68 475 elif isinstance(other, str):
jpayne@68 476 return self._data > other.encode("ASCII")
jpayne@68 477 else:
jpayne@68 478 return self._data > other
jpayne@68 479
jpayne@68 480 def __ge__(self, other):
jpayne@68 481 """Implement the greater-than or equal operand."""
jpayne@68 482 if isinstance(other, _SeqAbstractBaseClass):
jpayne@68 483 return self._data >= other._data
jpayne@68 484 elif isinstance(other, str):
jpayne@68 485 return self._data >= other.encode("ASCII")
jpayne@68 486 else:
jpayne@68 487 return self._data >= other
jpayne@68 488
jpayne@68 489 def __len__(self):
jpayne@68 490 """Return the length of the sequence."""
jpayne@68 491 return len(self._data)
jpayne@68 492
jpayne@68 493 def __iter__(self):
jpayne@68 494 """Return an iterable of the sequence."""
jpayne@68 495 return self._data.decode("ASCII").__iter__()
jpayne@68 496
jpayne@68 497 @overload
jpayne@68 498 def __getitem__(self, index: int) -> str:
jpayne@68 499 ...
jpayne@68 500
jpayne@68 501 @overload
jpayne@68 502 def __getitem__(self, index: slice) -> "Seq":
jpayne@68 503 ...
jpayne@68 504
jpayne@68 505 def __getitem__(self, index):
jpayne@68 506 """Return a subsequence as a single letter or as a sequence object.
jpayne@68 507
jpayne@68 508 If the index is an integer, a single letter is returned as a Python
jpayne@68 509 string:
jpayne@68 510
jpayne@68 511 >>> seq = Seq('ACTCGACGTCG')
jpayne@68 512 >>> seq[5]
jpayne@68 513 'A'
jpayne@68 514
jpayne@68 515 Otherwise, a new sequence object of the same class is returned:
jpayne@68 516
jpayne@68 517 >>> seq[5:8]
jpayne@68 518 Seq('ACG')
jpayne@68 519 >>> mutable_seq = MutableSeq('ACTCGACGTCG')
jpayne@68 520 >>> mutable_seq[5:8]
jpayne@68 521 MutableSeq('ACG')
jpayne@68 522 """
jpayne@68 523 if isinstance(index, numbers.Integral):
jpayne@68 524 # Return a single letter as a string
jpayne@68 525 return chr(self._data[index])
jpayne@68 526 else:
jpayne@68 527 # Return the (sub)sequence as another Seq/MutableSeq object
jpayne@68 528 return self.__class__(self._data[index])
jpayne@68 529
jpayne@68 530 def __add__(self, other):
jpayne@68 531 """Add a sequence or string to this sequence.
jpayne@68 532
jpayne@68 533 >>> from Bio.Seq import Seq, MutableSeq
jpayne@68 534 >>> Seq("MELKI") + "LV"
jpayne@68 535 Seq('MELKILV')
jpayne@68 536 >>> MutableSeq("MELKI") + "LV"
jpayne@68 537 MutableSeq('MELKILV')
jpayne@68 538 """
jpayne@68 539 if isinstance(other, _SeqAbstractBaseClass):
jpayne@68 540 return self.__class__(self._data + other._data)
jpayne@68 541 elif isinstance(other, str):
jpayne@68 542 return self.__class__(self._data + other.encode("ASCII"))
jpayne@68 543 else:
jpayne@68 544 # If other is a SeqRecord, then SeqRecord's __radd__ will handle
jpayne@68 545 # this. If not, returning NotImplemented will trigger a TypeError.
jpayne@68 546 return NotImplemented
jpayne@68 547
jpayne@68 548 def __radd__(self, other):
jpayne@68 549 """Add a sequence string on the left.
jpayne@68 550
jpayne@68 551 >>> from Bio.Seq import Seq, MutableSeq
jpayne@68 552 >>> "LV" + Seq("MELKI")
jpayne@68 553 Seq('LVMELKI')
jpayne@68 554 >>> "LV" + MutableSeq("MELKI")
jpayne@68 555 MutableSeq('LVMELKI')
jpayne@68 556
jpayne@68 557 Adding two sequence objects is handled via the __add__ method.
jpayne@68 558 """
jpayne@68 559 if isinstance(other, str):
jpayne@68 560 return self.__class__(other.encode("ASCII") + self._data)
jpayne@68 561 else:
jpayne@68 562 return NotImplemented
jpayne@68 563
jpayne@68 564 def __mul__(self, other):
jpayne@68 565 """Multiply sequence by integer.
jpayne@68 566
jpayne@68 567 >>> from Bio.Seq import Seq, MutableSeq
jpayne@68 568 >>> Seq('ATG') * 2
jpayne@68 569 Seq('ATGATG')
jpayne@68 570 >>> MutableSeq('ATG') * 2
jpayne@68 571 MutableSeq('ATGATG')
jpayne@68 572 """
jpayne@68 573 if not isinstance(other, numbers.Integral):
jpayne@68 574 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@68 575 # we would like to simply write
jpayne@68 576 # data = self._data * other
jpayne@68 577 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@68 578 # bytearray and other is a numpy integer. Using this workaround:
jpayne@68 579 data = self._data.__mul__(other)
jpayne@68 580 return self.__class__(data)
jpayne@68 581
jpayne@68 582 def __rmul__(self, other):
jpayne@68 583 """Multiply integer by sequence.
jpayne@68 584
jpayne@68 585 >>> from Bio.Seq import Seq
jpayne@68 586 >>> 2 * Seq('ATG')
jpayne@68 587 Seq('ATGATG')
jpayne@68 588 """
jpayne@68 589 if not isinstance(other, numbers.Integral):
jpayne@68 590 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@68 591 # we would like to simply write
jpayne@68 592 # data = self._data * other
jpayne@68 593 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@68 594 # bytearray and other is a numpy integer. Using this workaround:
jpayne@68 595 data = self._data.__mul__(other)
jpayne@68 596 return self.__class__(data)
jpayne@68 597
jpayne@68 598 def __imul__(self, other):
jpayne@68 599 """Multiply the sequence object by other and assign.
jpayne@68 600
jpayne@68 601 >>> from Bio.Seq import Seq
jpayne@68 602 >>> seq = Seq('ATG')
jpayne@68 603 >>> seq *= 2
jpayne@68 604 >>> seq
jpayne@68 605 Seq('ATGATG')
jpayne@68 606
jpayne@68 607 Note that this is different from in-place multiplication. The ``seq``
jpayne@68 608 variable is reassigned to the multiplication result, but any variable
jpayne@68 609 pointing to ``seq`` will remain unchanged:
jpayne@68 610
jpayne@68 611 >>> seq = Seq('ATG')
jpayne@68 612 >>> seq2 = seq
jpayne@68 613 >>> id(seq) == id(seq2)
jpayne@68 614 True
jpayne@68 615 >>> seq *= 2
jpayne@68 616 >>> seq
jpayne@68 617 Seq('ATGATG')
jpayne@68 618 >>> seq2
jpayne@68 619 Seq('ATG')
jpayne@68 620 >>> id(seq) == id(seq2)
jpayne@68 621 False
jpayne@68 622 """
jpayne@68 623 if not isinstance(other, numbers.Integral):
jpayne@68 624 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
jpayne@68 625 # we would like to simply write
jpayne@68 626 # data = self._data * other
jpayne@68 627 # here, but currently that causes a bug on PyPy if self._data is a
jpayne@68 628 # bytearray and other is a numpy integer. Using this workaround:
jpayne@68 629 data = self._data.__mul__(other)
jpayne@68 630 return self.__class__(data)
jpayne@68 631
def count(self, sub, start=None, end=None):
    """Return a non-overlapping count, like that of a python string.

    The number of occurrences of substring argument sub in the
    (sub)sequence given by [start:end] is returned as an integer.
    Optional arguments start and end are interpreted as in slice
    notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    A TypeError is raised if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count("A"))
    5
    >>> print(my_seq.count("ATG"))
    1
    >>> print(my_seq.count(Seq("AT")))
    1
    >>> print(my_seq.count("AT", 2, -1))
    1

    HOWEVER, please note because the ``count`` method of Seq and MutableSeq
    objects, like that of Python strings, do a non-overlapping search, this
    may not give the answer you expect:

    >>> "AAAA".count("AA")
    2
    >>> print(Seq("AAAA").count("AA"))
    2

    For an overlapping search, use the ``count_overlap`` method:

    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    """
    # Reduce the query to a bytes-like object the data can search for
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.count(needle, start, end)
jpayne@68 684
def count_overlap(self, sub, start=None, end=None):
    """Return an overlapping count.

    Returns an integer, the number of occurrences of substring
    argument sub in the (sub)sequence given by [start:end].
    Optional arguments start and end are interpreted as in slice
    notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    A TypeError is raised if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g.

    >>> from Bio.Seq import Seq
    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    >>> print(Seq("ATATATATA").count_overlap("ATA"))
    4
    >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
    1

    For a non-overlapping search, use the ``count`` method:

    >>> print(Seq("AAAA").count("AA"))
    2

    Where substrings do not overlap, ``count_overlap`` behaves the same as
    the ``count`` method:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count_overlap("A"))
    5
    >>> my_seq.count_overlap("A") == my_seq.count("A")
    True
    >>> print(my_seq.count_overlap("ATG"))
    1
    >>> my_seq.count_overlap("ATG") == my_seq.count("ATG")
    True
    >>> print(my_seq.count_overlap(Seq("AT")))
    1
    >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
    True
    >>> print(my_seq.count_overlap("AT", 2, -1))
    1
    >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
    True

    HOWEVER, do not use this method for such cases because the
    count() method is much more efficient.
    """
    # Reduce the query to a bytes-like object the data can search for
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    data = self._data
    hits = 0
    position = data.find(needle, start, end)
    while position != -1:
        hits += 1
        # Resume one position after the last hit, so that occurrences
        # overlapping the previous one are counted as well.
        position = data.find(needle, position + 1, end)
    return hits
jpayne@68 757
def __contains__(self, item):
    """Tell whether item occurs as a contiguous subsequence of the sequence.

    A string is encoded as ASCII and a Seq or MutableSeq object is converted
    to bytes before the test; any other object is handed unchanged to the
    ``in`` test of the underlying sequence data.

    e.g.

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_dna = Seq("ATATGAAATTTGAAAA")
    >>> "AAA" in my_dna
    True
    >>> Seq("AAA") in my_dna
    True
    >>> MutableSeq("AAA") in my_dna
    True
    """
    if isinstance(item, str):
        needle = item.encode("ASCII")
    elif isinstance(item, _SeqAbstractBaseClass):
        needle = bytes(item)
    else:
        needle = item
    return needle in self._data
jpayne@68 777
def find(self, sub, start=None, end=None):
    """Return the position of the first occurrence of sub, or -1.

    When start and/or end are given, the search is restricted to the
    region [start:end] of the sequence, and sub must lie entirely within
    that region.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
       (bytes and bytearray objects are accepted as well)
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found. A TypeError is raised if
    sub is of an unsupported type.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.find("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.find("AUG", 4)
    15

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    elif isinstance(sub, _SeqAbstractBaseClass):
        needle = bytes(sub)
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.find(needle, start, end)
jpayne@68 818
def rfind(self, sub, start=None, end=None):
    """Return the position of the last occurrence of sub, or -1.

    When start and/or end are given, the search is restricted to the
    region [start:end] of the sequence, and sub must lie entirely within
    that region.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
       (bytes and bytearray objects are accepted as well)
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found. A TypeError is raised if
    sub is of an unsupported type.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rfind("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rfind("AUG", end=15)
    3

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    elif isinstance(sub, _SeqAbstractBaseClass):
        needle = bytes(sub)
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rfind(needle, start, end)
jpayne@68 859
def index(self, sub, start=None, end=None):
    """Return the position of the first occurrence of sub, or raise ValueError.

    When start and/or end are given, the search is restricted to the
    region [start:end] of the sequence, and sub must lie entirely within
    that region.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
       (bytes and bytearray objects are accepted as well)
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError
    if sub is of an unsupported type.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.index("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.index("AUG", 4)
    15

    This method performs the same search as the ``find`` method. However,
    if the subsequence is not found, ``find`` returns -1 while ``index``
    raises a ValueError:

    >>> my_rna.index("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.find("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    elif isinstance(sub, MutableSeq):
        # Use the underlying bytearray directly; no copy is needed to search.
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.index(needle, start, end)
jpayne@68 913
def rindex(self, sub, start=None, end=None):
    """Return the position of the last occurrence of sub, or raise ValueError.

    When start and/or end are given, the search is restricted to the
    region [start:end] of the sequence, and sub must lie entirely within
    that region.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
       (bytes and bytearray objects are accepted as well)
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError
    if sub is of an unsupported type.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rindex("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rindex("AUG", end=15)
    3

    This method performs the same search as the ``rfind`` method. However,
    if the subsequence is not found, ``rfind`` returns -1 while ``rindex``
    raises a ValueError:

    >>> my_rna.rindex("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.rfind("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    elif isinstance(sub, MutableSeq):
        # Use the underlying bytearray directly; no copy is needed to search.
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rindex(needle, start, end)
jpayne@68 967
def search(self, subs):
    """Search the substrings subs in self and yield the index and substring found.

    Arguments:
     - subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
       objects containing the substrings to search for.

    Yields ``(index, substring)`` tuples, with the substring given as a
    string, ordered by position in the sequence. Every position is
    examined, including the last one, so overlapping matches and matches
    at the very end of the sequence are reported. Duplicate entries in
    subs are reported only once per position.

    As this is a generator, subs is only validated once iteration starts;
    a TypeError is raised then if an element has an unsupported type.

    >>> from Bio.Seq import Seq
    >>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
    >>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
    >>> for index, substring in matches:
    ...    print(index, substring)
    ...
    7 CC
    9 ATTG
    20 CC
    34 CC
    34 CCC
    35 CC

    A match on the final letter of the sequence is found too:

    >>> list(Seq("ACGT").search(["T"]))
    [(3, 'T')]
    """
    # Group the (deduplicated) substrings by length so that each position
    # needs only one slice of the sequence per distinct length.
    by_length = collections.defaultdict(set)
    for index, sub in enumerate(subs):
        if isinstance(sub, str):
            sub = sub.encode("ASCII")
        elif isinstance(sub, bytes):
            pass
        elif isinstance(sub, (_SeqAbstractBaseClass, bytearray)):
            # Sets need hashable members, so bytearray becomes bytes.
            sub = bytes(sub)
        else:
            raise TypeError(
                "subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
                % (index, type(sub))
            )
        by_length[len(sub)].add(sub)
    data = self._data
    # Iterate over every start position; stopping at len(self) - 1 would
    # miss one-letter substrings located at the end of the sequence.
    for start in range(len(self)):
        for length, candidates in by_length.items():
            # A slice running past the end is shorter than length and
            # therefore cannot compare equal to any candidate.
            window = data[start : start + length]
            for candidate in candidates:
                if window == candidate:
                    yield (start, candidate.decode())
                    break
jpayne@68 1008
def startswith(self, prefix, start=None, end=None):
    """Return True if the sequence starts with the given prefix, False otherwise.

    Return True if the sequence starts with the specified prefix
    (a string, a bytes-like object, or another Seq or MutableSeq object),
    False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    prefix can also be a tuple of prefixes to try; the elements of the
    tuple may be of any of the types accepted for a single prefix. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.startswith("GUC")
    True
    >>> my_rna.startswith("AUG")
    False
    >>> my_rna.startswith("AUG", 3)
    True
    >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
    True
    >>> my_rna.startswith((b"GUC", Seq("AAA")))
    True
    """

    def as_bytes(item):
        # Convert one candidate prefix to something bytes.startswith accepts.
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        # Unknown type: pass it through so that the underlying startswith
        # decides (and raises TypeError when it cannot handle it).
        return item

    if isinstance(prefix, tuple):
        prefix = tuple(as_bytes(candidate) for candidate in prefix)
    else:
        prefix = as_bytes(prefix)
    return self._data.startswith(prefix, start, end)
jpayne@68 1039
def endswith(self, suffix, start=None, end=None):
    """Return True if the sequence ends with the given suffix, False otherwise.

    Return True if the sequence ends with the specified suffix
    (a string, a bytes-like object, or another Seq or MutableSeq object),
    False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    suffix can also be a tuple of suffixes to try; the elements of the
    tuple may be of any of the types accepted for a single suffix. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.endswith("UUG")
    True
    >>> my_rna.endswith("AUG")
    False
    >>> my_rna.endswith("AUG", 0, 18)
    True
    >>> my_rna.endswith(("UCC", "UCA", "UUG"))
    True
    >>> my_rna.endswith((b"UUG", Seq("AAA")))
    True
    """

    def as_bytes(item):
        # Convert one candidate suffix to something bytes.endswith accepts.
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        # Unknown type: pass it through so that the underlying endswith
        # decides (and raises TypeError when it cannot handle it).
        return item

    if isinstance(suffix, tuple):
        suffix = tuple(as_bytes(candidate) for candidate in suffix)
    else:
        suffix = as_bytes(suffix)
    return self._data.endswith(suffix, start, end)
jpayne@68 1070
def split(self, sep=None, maxsplit=-1):
    """Split the sequence at each separator sep, working from the left.

    The pieces of the sequence between occurrences of sep are returned
    as a list of Seq objects. If maxsplit is given, at most maxsplit
    splits are done (counted from the start of the sequence). If maxsplit
    is omitted, all splits are made.

    For consistency with the ``split`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.split("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')

    See also the rsplit method, which splits the sequence starting from the
    end:

    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')
    """
    if isinstance(sep, str):
        sep = sep.encode("ASCII")
    elif isinstance(sep, _SeqAbstractBaseClass):
        sep = bytes(sep)
    pieces = self._data.split(sep, maxsplit)
    return [Seq(piece) for piece in pieces]
jpayne@68 1113
def rsplit(self, sep=None, maxsplit=-1):
    """Split the sequence at each separator sep, working from the right.

    The pieces of the sequence between occurrences of sep are returned
    as a list of Seq objects. If maxsplit is given, at most maxsplit
    splits are done (counted from the end of the sequence). If maxsplit
    is omitted, all splits are made.

    For consistency with the ``rsplit`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.rsplit("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')

    See also the split method, which splits the sequence starting from the
    beginning:

    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')
    """
    if isinstance(sep, str):
        sep = sep.encode("ASCII")
    elif isinstance(sep, _SeqAbstractBaseClass):
        sep = bytes(sep)
    pieces = self._data.rsplit(sep, maxsplit)
    return [Seq(piece) for piece in pieces]
jpayne@68 1156
def strip(self, chars=None, inplace=False):
    """Return a sequence object with both the leading and trailing ends stripped.

    With default arguments, leading and trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.strip()
    Seq('ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars`` are
    removed instead. ``chars`` is treated as a set of characters, so their
    order is not important:

    >>> Seq("ACGTACGT").strip("TGCA")
    Seq('')

    With ``inplace=False`` (the default) the sequence itself is left alone
    and a stripped copy is returned. With ``inplace=True`` the sequence
    is stripped in-place and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.strip()
    MutableSeq('ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.strip(inplace=True)
    MutableSeq('ACGT')
    >>> seq
    MutableSeq('ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
    is called on a ``Seq`` object with ``inplace=True``. A ``TypeError`` is
    also raised if ``chars`` is of an unsupported type.

    See also the lstrip and rstrip methods.
    """
    if isinstance(chars, str):
        chars = chars.encode("ASCII")
    elif isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    try:
        stripped = self._data.strip(chars)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only sequences backed by a bytearray can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@68 1210
def lstrip(self, chars=None, inplace=False):
    """Return a sequence object with the leading end stripped.

    With default arguments, leading whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.lstrip()
    Seq('ACGT ')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars`` are
    removed from the leading end instead. ``chars`` is treated as a set of
    characters, so their order is not important:

    >>> Seq("ACGACGTTACG").lstrip("GCA")
    Seq('TTACG')

    With ``inplace=False`` (the default) the sequence itself is left alone
    and a stripped copy is returned. With ``inplace=True`` the sequence
    is stripped in-place and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.lstrip()
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.lstrip(inplace=True)
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq('ACGT ')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lstrip`` is called on a ``Seq`` object with ``inplace=True``. A
    ``TypeError`` is also raised if ``chars`` is of an unsupported type.

    See also the strip and rstrip methods.
    """
    if isinstance(chars, str):
        chars = chars.encode("ASCII")
    elif isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    try:
        stripped = self._data.lstrip(chars)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only sequences backed by a bytearray can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@68 1265
def rstrip(self, chars=None, inplace=False):
    """Return a sequence object with the trailing end stripped.

    With default arguments, trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.rstrip()
    Seq(' ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars`` are
    removed from the trailing end instead. ``chars`` is treated as a set of
    characters, so their order is not important:

    >>> Seq("ACGACGTTACG").rstrip("GCA")
    Seq('ACGACGTT')

    With ``inplace=False`` (the default) the sequence itself is left alone
    and a stripped copy is returned. With ``inplace=True`` the sequence
    is stripped in-place and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.rstrip()
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.rstrip(inplace=True)
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``rstrip`` is called on a ``Seq`` object with ``inplace=True``. A
    ``TypeError`` is also raised if ``chars`` is of an unsupported type.

    See also the strip and lstrip methods.
    """
    if isinstance(chars, str):
        chars = chars.encode("ASCII")
    elif isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    try:
        stripped = self._data.rstrip(chars)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only sequences backed by a bytearray can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
jpayne@68 1320
def removeprefix(self, prefix, inplace=False):
    """Return a new Seq object with prefix (left) removed.

    This behaves like the python string method of the same name: if the
    sequence starts with prefix, the sequence without that prefix is
    returned; otherwise the sequence is returned unchanged.

    Arguments:
     - prefix - a string, a bytes-like object, or another Seq or
       MutableSeq object. Unlike ``startswith``, a tuple of alternatives
       is not accepted.
     - inplace - if False (the default), return a copy; if True, modify
       the sequence in-place and return it.

    e.g. Removing a start Codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("ATGGTGTGTGT")
    >>> my_seq
    Seq('ATGGTGTGTGT')
    >>> my_seq.removeprefix('ATG')
    Seq('GTGTGTGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``.
    A ``TypeError`` is also raised if prefix is of an unsupported type.

    See also the removesuffix method.
    """
    message = "argument must be a string, Seq, MutableSeq, or bytes-like object"
    if isinstance(prefix, str):
        prefix = prefix.encode("ASCII")
    elif isinstance(prefix, tuple):
        # bytes.removeprefix rejects tuples, but the startswith-based
        # fall back below would accept one and then cut len(tuple)
        # letters; reject it here so all Python versions agree.
        raise TypeError(message)
    elif isinstance(prefix, _SeqAbstractBaseClass):
        prefix = bytes(prefix)
    data = self._data
    remover = getattr(data, "removeprefix", None)
    try:
        if remover is not None:
            data = remover(prefix)
        elif data.startswith(prefix):
            # Fall back for pre-Python 3.9
            data = data[len(prefix) :]
    except TypeError:
        raise TypeError(message) from None
    if inplace:
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    return self.__class__(data)
jpayne@68 1362
def removesuffix(self, suffix, inplace=False):
    """Return a new Seq object with suffix (right) removed.

    This behaves like the python string method of the same name: if the
    sequence ends with suffix, the sequence without that suffix is
    returned; otherwise (or if suffix is empty) the sequence is returned
    unchanged.

    Arguments:
     - suffix - a string, a bytes-like object, or another Seq or
       MutableSeq object. Unlike ``endswith``, a tuple of alternatives
       is not accepted.
     - inplace - if False (the default), return a copy; if True, modify
       the sequence in-place and return it.

    e.g. Removing a stop codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("GTGTGTGTTAG")
    >>> my_seq
    Seq('GTGTGTGTTAG')
    >>> stop_codon = Seq("TAG")
    >>> my_seq.removesuffix(stop_codon)
    Seq('GTGTGTGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``.
    A ``TypeError`` is also raised if suffix is of an unsupported type.

    See also the removeprefix method.
    """
    message = "argument must be a string, Seq, MutableSeq, or bytes-like object"
    if isinstance(suffix, str):
        suffix = suffix.encode("ASCII")
    elif isinstance(suffix, tuple):
        # bytes.removesuffix rejects tuples, but the endswith-based
        # fall back below would accept one and then cut len(tuple)
        # letters; reject it here so all Python versions agree.
        raise TypeError(message)
    elif isinstance(suffix, _SeqAbstractBaseClass):
        suffix = bytes(suffix)
    data = self._data
    remover = getattr(data, "removesuffix", None)
    try:
        if remover is not None:
            data = remover(suffix)
        elif data.endswith(suffix):
            # Fall back for pre-Python 3.9. The stop index is computed
            # explicitly because data[:-len(suffix)] is data[:0] (i.e.
            # empty) when the suffix is empty.
            data = data[: len(data) - len(suffix)]
    except TypeError:
        raise TypeError(message) from None
    if inplace:
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    return self.__class__(data)
jpayne@68 1405
def upper(self, inplace=False):
    """Return the sequence in upper case.

    With ``inplace=False``, the default value, the sequence itself is not
    changed and an upper-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With ``inplace=True``, the sequence is converted in-place and returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``upper`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``lower`` method.
    """
    converted = self._data.upper()
    if not inplace:
        return self.__class__(converted)
    # Only sequences backed by a bytearray can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = converted
    return self
jpayne@68 1457
def lower(self, inplace=False):
    """Return the sequence in lower case.

    With ``inplace=False``, the default value, the sequence itself is not
    changed and a lower-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With ``inplace=True``, the sequence is converted in-place and returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lower`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``upper`` method.
    """
    converted = self._data.lower()
    if not inplace:
        return self.__class__(converted)
    # Only sequences backed by a bytearray can be modified in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = converted
    return self
jpayne@68 1509
def isupper(self):
    """Report whether every cased ASCII character in the sequence is uppercase.

    False is returned if the sequence contains no cased characters at all.
    """
    data = self._data
    return data.isupper()
jpayne@68 1516
def islower(self):
    """Report whether every cased ASCII character in the sequence is lowercase.

    False is returned if the sequence contains no cased characters at all.
    """
    data = self._data
    return data.islower()
jpayne@68 1523
def translate(
    self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
):
    """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.

    This method will translate DNA or RNA sequences. It should not
    be used on protein sequences as any result will be biologically
    meaningless.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable
       object (useful for non-standard genetic codes).  This
       defaults to the "Standard" table.
     - stop_symbol - Single character string, what to use for
       terminators.  This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons (translated as the
       specified stop_symbol).  If True, translation is terminated at
       the first in frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
       this checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to the minus sign.

    A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
    object; a ``MutableSeq`` object is returned if ``translate`` is called
    on a ``MutableSeq`` object.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*')
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@')
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR')

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*')
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR')

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR')

    It isn't a valid CDS under NCBI table 1, due to both the start codon
    and also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR')
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR')

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead
    """
    try:
        data = str(self)
    except UndefinedSequenceError:
        # translating an undefined sequence yields an undefined
        # sequence with the length divided by 3
        n = len(self)
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "This may become an error in future.",
                BiopythonWarning,
            )
        return Seq(None, n // 3)

    # Reuse the string obtained above instead of converting the sequence
    # to a string a second time.
    return self.__class__(
        _translate_str(data, table, stop_symbol, to_stop, cds, gap=gap)
    )
jpayne@68 1623
def complement(self, inplace=False):
    """Return the complement as a DNA sequence.

    >>> Seq("CGA").complement()
    Seq('GCT')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").complement()
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement()
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement(inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # An undefined sequence is its own complement: same length,
        # contents still unknown.
        return self
    if not inplace:
        return self.__class__(complemented)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
jpayne@68 1671
def complement_rna(self, inplace=False):
    """Return the complement as an RNA sequence.

    >>> Seq("CGA").complement_rna()
    Seq('GCU')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence by default:

    >>> Seq("CGA").complement()
    Seq('GCT')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement_rna()
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement_rna(inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # An undefined sequence is its own complement: same length,
        # contents still unknown.
        return self
    if not inplace:
        return self.__class__(complemented)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
jpayne@68 1718
def reverse_complement(self, inplace=False):
    """Return the reverse complement as a DNA sequence.

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").reverse_complement()
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement()
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement(inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # Reversing and complementing unknown contents leaves an undefined
        # sequence of the same length.
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Assigning through a reversed slice stores the complement back to front.
    self._data[::-1] = complemented
    return self
jpayne@68 1766
def reverse_complement_rna(self, inplace=False):
    """Return the reverse complement as an RNA sequence.

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").reverse_complement_rna()
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement_rna()
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement_rna(inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # Reversing and complementing unknown contents leaves an undefined
        # sequence of the same length.
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Assigning through a reversed slice stores the complement back to front.
    self._data[::-1] = complemented
    return self
jpayne@68 1814
def transcribe(self, inplace=False):
    """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. This
    means we can get the RNA sequence just by switching T to U.

    >>> from Bio.Seq import Seq
    >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> coding_dna.transcribe()
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence.transcribe()
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    >>> sequence.transcribe(inplace=True)
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to transcribe an RNA sequence has no effect.
    If you have a nucleotide sequence which might be DNA or RNA
    (or even a mixture), calling the transcribe method will ensure
    any T becomes U.

    Trying to transcribe a protein sequence will replace any
    T for Threonine with U for Selenocysteine, which has no
    biologically plausible rationale.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRT")
    >>> my_protein.transcribe()
    Seq('MAIVMGRU')
    """
    # Swap thymine for uracil, uppercase first and then lowercase.
    upper_swapped = self._data.replace(b"T", b"U")
    rna = upper_swapped.replace(b"t", b"u")
    if not inplace:
        return self.__class__(rna)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = rna
    return self
jpayne@68 1868
def back_transcribe(self, inplace=False):
    """Return the DNA sequence from an RNA sequence by creating a new Seq object.

    >>> from Bio.Seq import Seq
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence.back_transcribe()
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    >>> sequence.back_transcribe(inplace=True)
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``back_transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to back-transcribe DNA has no effect. If you have a nucleotide
    sequence which might be DNA or RNA (or even a mixture), calling the
    back-transcribe method will ensure any U becomes T.

    Trying to back-transcribe a protein sequence will replace any U for
    Selenocysteine with T for Threonine, which is biologically meaningless.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRU")
    >>> my_protein.back_transcribe()
    Seq('MAIVMGRT')
    """
    # Swap uracil for thymine, uppercase first and then lowercase.
    upper_swapped = self._data.replace(b"U", b"T")
    dna = upper_swapped.replace(b"u", b"t")
    if not inplace:
        return self.__class__(dna)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = dna
    return self
jpayne@68 1916
def join(self, other):
    """Return a merge of the sequences in other, spaced by the sequence from self.

    Accepts a Seq object, MutableSeq object, or string (and iterates over
    the letters), or an iterable containing Seq, MutableSeq, or string
    objects. These arguments will be concatenated with the calling sequence
    as the spacer:

    >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
    >>> concatenated
    Seq('AAANNNNNTTTNNNNNPPP')

    Joining the letters of a single sequence:

    >>> Seq('NNNNN').join(Seq("ACGT"))
    Seq('ANNNNNCNNNNNGNNNNNT')
    >>> Seq('NNNNN').join("ACGT")
    Seq('ANNNNNCNNNNNGNNNNNT')

    One-shot iterables such as generators are supported as well; the
    iterable is consumed exactly once.

    A ``TypeError`` is raised if ``other`` is a SeqRecord, contains a
    SeqRecord, or contains anything that is not a string, Seq or
    MutableSeq object.
    """
    if isinstance(other, _SeqAbstractBaseClass):
        return self.__class__(str(self).join(str(other)))
    elif isinstance(other, str):
        return self.__class__(str(self).join(other))

    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(other, SeqRecord):
        raise TypeError("Iterable cannot be a SeqRecord")

    # Materialise the iterable once. Validating and then joining directly
    # from ``other`` would iterate it twice, which silently produces an
    # empty result when ``other`` is a generator or other one-shot iterator.
    items = list(other)
    for c in items:
        if isinstance(c, SeqRecord):
            raise TypeError("Iterable cannot contain SeqRecords")
        elif not isinstance(c, (str, _SeqAbstractBaseClass)):
            raise TypeError(
                "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
            )
    return self.__class__(str(self).join([str(c) for c in items]))
jpayne@68 1954
def replace(self, old, new, inplace=False):
    """Return a copy with all occurrences of subsequence old replaced by new.

    >>> s = Seq("ACGTAACCGGTT")
    >>> t = s.replace("AC", "XYZ")
    >>> s
    Seq('ACGTAACCGGTT')
    >>> t
    Seq('XYZGTAXYZCGGTT')

    For mutable sequences, passing inplace=True will modify the sequence in place:

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ")
    >>> m
    MutableSeq('ACGTAACCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ", inplace=True)
    >>> m
    MutableSeq('XYZGTAXYZCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``replace`` is called on a ``Seq`` object with ``inplace=True``.
    """

    def _as_bytes(value):
        # Sequence objects and strings become bytes; anything else is
        # handed to the underlying replace call untouched.
        if isinstance(value, _SeqAbstractBaseClass):
            return bytes(value)
        if isinstance(value, str):
            return value.encode("ASCII")
        return value

    pattern = _as_bytes(old)
    substitute = _as_bytes(new)
    replaced = self._data.replace(pattern, substitute)
    if not inplace:
        return self.__class__(replaced)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = replaced
    return self
jpayne@68 1999
@property
def defined(self):
    """Return True if the sequence is defined, False if undefined or partially defined.

    Data held as ``bytes`` or ``bytearray`` always counts as defined, which
    covers zero-length sequences. For any other data object the answer is
    taken from that object's own ``defined`` attribute.
    """
    payload = self._data
    return True if isinstance(payload, (bytes, bytearray)) else payload.defined
jpayne@68 2010
@property
def defined_ranges(self):
    """Return a tuple of the ranges where the sequence contents is defined.

    The return value has the format ((start1, end1), (start2, end2), ...).
    Data held as ``bytes`` or ``bytearray`` yields a single range covering
    the whole sequence, or an empty tuple for a zero-length sequence. For
    any other data object the value of its ``defined_ranges`` attribute is
    returned.
    """
    payload = self._data
    if not isinstance(payload, (bytes, bytearray)):
        return payload.defined_ranges
    size = len(self)
    return ((0, size),) if size > 0 else ()
jpayne@68 2025
jpayne@68 2026
class Seq(_SeqAbstractBaseClass):
    """Read-only sequence object (essentially a string with biological methods).

    Like normal python strings, our basic sequence object is immutable.
    This prevents you from doing my_seq[5] = "A" for example, but does allow
    Seq objects to be used as dictionary keys.

    The Seq object provides a number of string like methods (such as count,
    find, split and strip).

    The Seq object also provides some biological methods, such as complement,
    reverse_complement, transcribe, back_transcribe and translate (which are
    not applicable to protein sequences).
    """

    _data: Union[bytes, SequenceDataAbstractBaseClass]

    def __init__(
        self,
        data: Union[
            str,
            bytes,
            bytearray,
            _SeqAbstractBaseClass,
            SequenceDataAbstractBaseClass,
            dict,
            None,
        ],
        length: Optional[int] = None,
    ):
        """Create a Seq object.

        Arguments:
         - data - Sequence, required. Usually a string; bytes, bytearray,
           Seq and MutableSeq objects are accepted too. Use None for a
           sequence of unknown contents, or a dictionary mapping start
           positions to sequence segments for partially known contents.
         - length - Sequence length, used only if data is None or a
           dictionary (integer)

        Raises a ``ValueError`` if ``length`` is missing or negative when it
        is required, or if the segments of a dictionary start at a negative
        position, overlap, or extend beyond ``length``. Raises a
        ``TypeError`` if ``data`` is of an unsupported type.

        You will typically use Bio.SeqIO to read in sequences from files as
        SeqRecord objects, whose sequence will be exposed as a Seq object via
        the seq property.

        However, you can also create a Seq object directly:

        >>> from Bio.Seq import Seq
        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
        >>> my_seq
        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
        >>> print(my_seq)
        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF

        To create a Seq object for a sequence of known length but
        unknown sequence contents, use None for the data argument and pass
        the sequence length for the length argument. Trying to access the
        sequence contents of a Seq object created in this way will raise
        an UndefinedSequenceError:

        >>> my_undefined_sequence = Seq(None, 20)
        >>> my_undefined_sequence
        Seq(None, length=20)
        >>> len(my_undefined_sequence)
        20
        >>> print(my_undefined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is undefined

        If the sequence contents is known for parts of the sequence only, use
        a dictionary for the data argument to pass the known sequence segments:

        >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
        >>> my_partially_defined_sequence
        Seq({3: 'ACGT'}, length=10)
        >>> len(my_partially_defined_sequence)
        10
        >>> print(my_partially_defined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
        >>> my_partially_defined_sequence[3:7]
        Seq('ACGT')
        >>> print(my_partially_defined_sequence[3:7])
        ACGT
        """
        if data is None:
            if length is None:
                raise ValueError("length must not be None if data is None")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                self._data = _UndefinedSequenceData(length)
        elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
            self._data = data
        elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
            self._data = bytes(data)
        elif isinstance(data, str):
            self._data = bytes(data, encoding="ASCII")
        elif isinstance(data, dict):
            if length is None:
                raise ValueError("length must not be None if data is a dictionary")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                current = 0  # not needed here, but it keeps mypy happy
                end = -1
                starts = sorted(data.keys())
                _data: Dict[int, bytes] = {}
                for start in starts:
                    seq = data[start]
                    if isinstance(seq, str):
                        seq = bytes(seq, encoding="ASCII")
                    else:
                        try:
                            seq = bytes(seq)
                        except Exception:
                            raise ValueError("Expected bytes-like objects or strings")
                    if start < 0:
                        # Without this check a start of -1 matches the
                        # initial ``end`` sentinel and fails with a
                        # KeyError, while smaller values are misreported
                        # as overlapping segments.
                        raise ValueError(
                            "Sequence data cannot start at a negative position."
                        )
                    if start < end:
                        raise ValueError("Sequence data are overlapping.")
                    elif start == end:
                        # Segment directly follows the previous one: merge them.
                        _data[current] += seq  # noqa: F821
                    else:
                        _data[start] = seq
                        current = start
                    end = start + len(seq)
                if end > length:
                    raise ValueError(
                        "Provided sequence data extend beyond sequence length."
                    )
                elif end == length and current == 0:
                    # sequence is fully defined
                    self._data = _data[current]
                else:
                    self._data = _PartiallyDefinedSequenceData(length, _data)
        else:
            raise TypeError(
                "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
            )

    def __hash__(self):
        """Hash of the sequence as a string for comparison.

        See Seq object comparison documentation (method ``__eq__`` in
        particular) as this has changed in Biopython 1.65. Older versions
        would hash on object identity.
        """
        return hash(self._data)
jpayne@68 2175
jpayne@68 2176
jpayne@68 2177 class MutableSeq(_SeqAbstractBaseClass):
jpayne@68 2178 """An editable sequence object.
jpayne@68 2179
jpayne@68 2180 Unlike normal python strings and our basic sequence object (the Seq class)
jpayne@68 2181 which are immutable, the MutableSeq lets you edit the sequence in place.
jpayne@68 2182 However, this means you cannot use a MutableSeq object as a dictionary key.
jpayne@68 2183
jpayne@68 2184 >>> from Bio.Seq import MutableSeq
jpayne@68 2185 >>> my_seq = MutableSeq("ACTCGTCGTCG")
jpayne@68 2186 >>> my_seq
jpayne@68 2187 MutableSeq('ACTCGTCGTCG')
jpayne@68 2188 >>> my_seq[5]
jpayne@68 2189 'T'
jpayne@68 2190 >>> my_seq[5] = "A"
jpayne@68 2191 >>> my_seq
jpayne@68 2192 MutableSeq('ACTCGACGTCG')
jpayne@68 2193 >>> my_seq[5]
jpayne@68 2194 'A'
jpayne@68 2195 >>> my_seq[5:8] = "NNN"
jpayne@68 2196 >>> my_seq
jpayne@68 2197 MutableSeq('ACTCGNNNTCG')
jpayne@68 2198 >>> len(my_seq)
jpayne@68 2199 11
jpayne@68 2200
jpayne@68 2201 Note that the MutableSeq object does not support as many string-like
jpayne@68 2202 or biological methods as the Seq object.
jpayne@68 2203 """
jpayne@68 2204
def __init__(self, data):
    """Create a MutableSeq object.

    ``data`` may be a bytearray (stored as given, without copying), a bytes
    object, an ASCII string, a MutableSeq (its data is copied) or a Seq
    object. Any other type raises a ``TypeError``.
    """
    if isinstance(data, bytearray):
        buffer = data
    elif isinstance(data, bytes):
        buffer = bytearray(data)
    elif isinstance(data, str):
        buffer = bytearray(data, "ASCII")
    elif isinstance(data, MutableSeq):
        buffer = data._data[:]  # Take a copy
    elif isinstance(data, Seq):
        # Make no assumptions about the Seq subclass internal storage
        buffer = bytearray(bytes(data))
    else:
        raise TypeError(
            "data should be a string, bytearray object, Seq object, or a "
            "MutableSeq object"
        )
    self._data = buffer
jpayne@68 2223
def __setitem__(self, index, value):
    """Set a subsequence or a single letter via the value parameter.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq[0] = 'T'
    >>> my_seq
    MutableSeq('TCTCGACGTCG')

    With an integer index, ``value`` is a single character. Otherwise
    ``value`` must be a MutableSeq, Seq or string; any other type raises
    a ``TypeError``.
    """
    if isinstance(index, numbers.Integral):
        # Replacing a single letter with a new string
        self._data[index] = ord(value)
        return

    # Replacing a sub-sequence
    if isinstance(value, MutableSeq):
        replacement = value._data
    elif isinstance(value, Seq):
        replacement = bytes(value)
    elif isinstance(value, str):
        replacement = value.encode("ASCII")
    else:
        raise TypeError(f"received unexpected type '{type(value).__name__}'")
    self._data[index] = replacement
jpayne@68 2245
def __delitem__(self, index):
    """Delete a subsequence or a single letter.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> del my_seq[0]
    >>> my_seq
    MutableSeq('CTCGACGTCG')
    """
    # ``index`` may be an integer position or a slice; the underlying
    # buffer handles both.
    buffer = self._data
    del buffer[index]
jpayne@68 2256
def append(self, c):
    """Add a single letter to the end of the mutable sequence object.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.append('A')
    >>> my_seq
    MutableSeq('ACTCGACGTCGA')

    No return value.
    """
    encoded = c.encode("ASCII")
    self._data.append(ord(encoded))
jpayne@68 2268
def insert(self, i, c):
    """Insert a single letter into the mutable sequence object at a given index.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.insert(0,'A')
    >>> my_seq
    MutableSeq('AACTCGACGTCG')
    >>> my_seq.insert(8,'G')
    >>> my_seq
    MutableSeq('AACTCGACGGTCG')

    No return value.
    """
    encoded = c.encode("ASCII")
    self._data.insert(i, ord(encoded))
jpayne@68 2283
def pop(self, i=(-1)):
    """Remove the single letter at the given index and return it.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.pop()
    'G'
    >>> my_seq
    MutableSeq('ACTCGACGTC')
    >>> my_seq.pop()
    'C'
    >>> my_seq
    MutableSeq('ACTCGACGT')

    Returns the removed letter as a string; by default this is the last
    letter of the sequence.
    """
    buffer = self._data
    code = buffer[i]
    del buffer[i]
    return chr(code)
jpayne@68 2302
def remove(self, item):
    """Remove one occurrence of a single letter from the mutable sequence.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.remove('C')
    >>> my_seq
    MutableSeq('ATCGACGTCG')
    >>> my_seq.remove('A')
    >>> my_seq
    MutableSeq('TCGACGTCG')

    Raises a ValueError if the letter is not present.

    No return value.
    """
    byte_value = ord(item)
    buffer = self._data
    try:
        buffer.remove(byte_value)
    except ValueError:
        # Replace the buffer's own message with one that names MutableSeq,
        # and hide the internal exception from the traceback.
        raise ValueError("value not found in MutableSeq") from None
jpayne@68 2321
def reverse(self):
    """Reverse the order of the letters of the mutable sequence in place.

    No return value.
    """
    buffer = self._data
    buffer.reverse()
jpayne@68 2328
def extend(self, other):
    """Append all letters of another sequence to this mutable sequence.

    ``other`` may be a string, a Seq or a MutableSeq; any other type
    raises a TypeError.

    >>> my_seq = MutableSeq('ACTCGACGTCG')
    >>> my_seq.extend('A')
    >>> my_seq
    MutableSeq('ACTCGACGTCGA')
    >>> my_seq.extend('TTT')
    >>> my_seq
    MutableSeq('ACTCGACGTCGATTT')

    No return value.
    """
    # First work out the bytes to add, then extend the buffer once.
    if isinstance(other, MutableSeq):
        payload = other._data
    elif isinstance(other, Seq):
        payload = bytes(other)
    elif isinstance(other, str):
        payload = other.encode("ASCII")
    else:
        raise TypeError("expected a string, Seq or MutableSeq")
    self._data.extend(payload)
jpayne@68 2350
jpayne@68 2351
class UndefinedSequenceError(ValueError):
    """Raised when sequence contents are requested but are not defined."""
jpayne@68 2354
jpayne@68 2355
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Sequence data of known length but completely unknown contents (PRIVATE).

    An instance can be handed to Seq to represent a sequence for which only
    the length is known. ``len()`` gives that length. Slicing returns another
    _UndefinedSequenceData of the sliced length, or an empty bytes object for
    a zero-size slice. Every request for actual sequence contents raises an
    UndefinedSequenceError.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Store the sequence length.

        The caller is responsible for ensuring that the length is greater
        than zero.
        """
        self._length = length
        super().__init__()

    def __getitem__(self, key: slice) -> Union[bytes, "_UndefinedSequenceData"]:
        # Only slices can be answered; a single position has no contents.
        if not isinstance(key, slice):
            raise UndefinedSequenceError("Sequence content is undefined")
        first, last, stride = key.indices(self._length)
        count = len(range(first, last, stride))
        if count:
            return _UndefinedSequenceData(count)
        return b""

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        total = len(self) + len(other)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if not isinstance(other, _UndefinedSequenceData):
                # _PartiallyDefinedSequenceData.__radd__ will handle this
                return NotImplemented
            return _UndefinedSequenceData(total)
        # Defined data appended to undefined data: the defined part starts
        # right after this object's (undefined) letters.
        return _PartiallyDefinedSequenceData(total, {len(self): other})

    def __radd__(self, other):
        # Defined data followed by undefined data: the defined part is at 0.
        prefix = bytes(other)
        total = len(other) + len(self)
        return _PartiallyDefinedSequenceData(total, {0: prefix})

    def upper(self):
        """Return an upper case copy of the sequence."""
        # Changing case cannot define anything: the result is an undefined
        # sequence of the same length.
        return _UndefinedSequenceData(self._length)

    def lower(self):
        """Return a lower case copy of the sequence."""
        # Changing case cannot define anything: the result is an undefined
        # sequence of the same length.
        return _UndefinedSequenceData(self._length)

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        # The case of an undefined sequence cannot be determined.
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        # The case of an undefined sequence cannot be determined.
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # If old and new have the same number of characters, the result is
        # an undefined sequence of unchanged length. Otherwise the resulting
        # length cannot be known.
        if len(old) == len(new):
            return _UndefinedSequenceData(self._length)
        raise UndefinedSequenceError("Sequence content is undefined")

    @property
    def defined(self):
        """Return False, as the sequence is not defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The contents of an _UndefinedSequenceData object is fully undefined,
        so the return value is always an empty tuple.
        """
        return ()
jpayne@68 2462
jpayne@68 2463
class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores a sequence of known length with partially defined contents (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but with a sequence contents that is only
    partially known.
    Calling __len__ returns the sequence length, calling __getitem__ returns
    the sequence contents if known, otherwise an UndefinedSequenceError is
    raised.

    The defined segments are kept in ``_data``, a dictionary mapping the
    start position of each segment to its contents.
    """

    __slots__ = ("_length", "_data")

    def __init__(self, length, data):
        """Initialize with the sequence length and defined sequence segments.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        self._data = data
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[bytes, SequenceDataAbstractBaseClass]:
        """Return the letter at an index, or the data selected by a slice.

        For a slice, the result is an empty bytes object (zero-size slice),
        a bytes object (the slice is fully defined), an
        _UndefinedSequenceData object (nothing in the slice is defined), or a
        new _PartiallyDefinedSequenceData object.

        For an integer index, negative values count from the end of the
        sequence. An IndexError is raised if the index is out of range, and
        an UndefinedSequenceError if the letter at that position is not
        defined.
        """
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            data = {}
            for s, d in self._data.items():
                # Positions selected by the slice, expressed relative to the
                # start of this defined segment.
                indices = range(-s, -s + self._length)[key]
                e: Optional[int] = indices.stop
                assert e is not None
                if step > 0:
                    if e <= 0:
                        # the slice ends before this segment begins
                        continue
                    if indices.start < 0:
                        # the slice starts before this segment; find the
                        # first selected position inside the segment
                        s = indices.start % step
                    else:
                        s = indices.start
                else:  # step < 0
                    if e < 0:
                        # the slice runs past the start of this segment
                        e = None
                    end = len(d) - 1
                    if indices.start > end:
                        # the slice starts after this segment; find the
                        # first selected position inside the segment
                        s = end + (indices.start - end) % step
                    else:
                        s = indices.start
                    if s < 0:
                        continue
                # position of the first selected letter in the result
                start = (s - indices.start) // step
                d = d[s:e:step]
                if d:
                    data[start] = d
            if len(data) == 0:  # Fully undefined sequence
                return _UndefinedSequenceData(size)
            # merge adjacent sequence segments
            end = -1
            previous = 0  # not needed here, but it keeps flake happy
            items = data.items()
            data = {}
            for start, seq in items:
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
            if len(data) == 1:
                seq = data.get(0)
                if seq is not None and len(seq) == size:
                    return seq  # Fully defined sequence; return bytes
            if step < 0:
                # Reverse the insertion order of the segments.
                # use this after we drop Python 3.7:
                # data = {start: data[start] for start in reversed(data)}
                # use this as long as we support Python 3.7:
                data = {start: data[start] for start in reversed(list(data.keys()))}
            return _PartiallyDefinedSequenceData(size, data)
        # Integer index: normalise negative values the way built-in
        # sequences do, so that e.g. data[-1] refers to the last letter.
        position = key
        if position < 0:
            position += self._length
        if position < 0 or self._length <= position:
            raise IndexError("sequence index out of range")
        for start, seq in self._data.items():
            if start <= position < start + len(seq):
                return seq[position - start]
        raise UndefinedSequenceError("Sequence at position %d is undefined" % key)

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def __add__(self, other):
        length = len(self) + len(other)
        data = dict(self._data)
        # Last defined segment of this object; if it reaches the very end,
        # defined data at the start of other is merged into it.
        items = list(self._data.items())
        start, seq = items[-1]
        end = start + len(seq)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                # nothing defined to add; only the length grows
                pass
            elif isinstance(other, _PartiallyDefinedSequenceData):
                other_items = list(other._data.items())
                if end == len(self):
                    other_start, other_seq = other_items.pop(0)
                    if other_start == 0:
                        data[start] += other_seq
                    else:
                        data[len(self) + other_start] = other_seq
                for other_start, other_seq in other_items:
                    data[len(self) + other_start] = other_seq
        else:
            # other is fully defined
            if end == len(self):
                data[start] += other
            else:
                data[len(self)] = other
        return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        length = len(other) + len(self)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            # other has no defined contents here; shift our own segments
            data = {len(other) + start: seq for start, seq in self._data.items()}
        else:
            data = {0: other}
            items = list(self._data.items())
            start, seq = items.pop(0)
            if start == 0:
                # our first segment directly follows other; merge them
                data[0] += seq
            else:
                data[len(other) + start] = seq
            for start, seq in items:
                data[len(other) + start] = seq
        return _PartiallyDefinedSequenceData(length, data)

    def __mul__(self, other):
        length = self._length
        items = self._data.items()
        data = {}
        end = -1
        previous = 0  # not needed here, but it keeps flake happy
        for i in range(other):
            for start, seq in items:
                start += i * length
                if end == start:
                    # this segment directly follows the previous one
                    # (across a repeat boundary); merge them
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
        return _PartiallyDefinedSequenceData(length * other, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        data = {start: seq.upper() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def lower(self):
        """Return a lower case copy of the sequence."""
        data = {start: seq.lower() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        # The case of the undefined parts of the sequence cannot be determined
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        # The case of the undefined parts of the sequence cannot be determined
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        items = self._data.items()
        data = {start: seq.translate(table, delete) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # Replacing substring old by new in the undefined sequence segments
        # will result in an undefined sequence segment of the same length, if
        # old and new have the same number of characters. If not, an error is
        # raised, as the correct start positions cannot be calculated reliably.
        if len(old) != len(new):
            raise UndefinedSequenceError(
                "Sequence content is only partially defined; substring \n"
                "replacement cannot be performed reliably"
            )
        items = self._data.items()
        data = {start: seq.replace(old, new) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    @property
    def defined(self):
        """Return False, as the sequence is not fully defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        return tuple((start, start + len(seq)) for start, seq in self._data.items())
jpayne@68 2688
jpayne@68 2689
# The transcribe, back_transcribe, and translate functions are
# user-friendly versions of the corresponding Seq/MutableSeq methods.
# The functions work both on Seq objects, and on strings.
jpayne@68 2693
jpayne@68 2694
def transcribe(dna):
    """Transcribe a DNA sequence into RNA.

    By the usual convention the input is taken to be the coding strand of
    the DNA double helix (not the template strand), so the RNA sequence is
    obtained simply by replacing T with U.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> transcribe("ACTGN")
    'ACUGN'
    """
    if isinstance(dna, Seq):
        return dna.transcribe()
    if isinstance(dna, MutableSeq):
        # Convert to a Seq first so that a Seq is returned
        return Seq(dna).transcribe()
    # Anything else is treated as a string; handle both cases of the letter
    upper_done = dna.replace("T", "U")
    return upper_done.replace("t", "u")
jpayne@68 2717
jpayne@68 2718
def back_transcribe(rna):
    """Return the RNA sequence back-transcribed into DNA.

    This is done by replacing U with T.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> back_transcribe("ACUGN")
    'ACTGN'
    """
    if isinstance(rna, Seq):
        return rna.back_transcribe()
    if isinstance(rna, MutableSeq):
        # Convert to a Seq first so that a Seq is returned
        return Seq(rna).back_transcribe()
    # Anything else is treated as a string; handle both cases of the letter
    upper_done = rna.replace("U", "T")
    return upper_done.replace("u", "t")
jpayne@68 2737
jpayne@68 2738
def _translate_str(
    sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
):
    """Translate nucleotide string into a protein string (PRIVATE).

    Arguments:
     - sequence - a string
     - table - Which codon table to use?  This can be either a name (string),
       an NCBI identifier (integer), or a CodonTable object (useful for
       non-standard genetic codes).
     - stop_symbol - a single character string, what to use for terminators.
     - to_stop - boolean, should translation terminate at the first
       in frame stop codon?  If there is no in-frame stop codon
       then translation continues to the end.
     - pos_stop - a single character string for a possible stop codon
       (e.g. TAN or NNN)
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.

    Returns a string.

    e.g.

    >>> from Bio.Data import CodonTable
    >>> table = CodonTable.ambiguous_dna_by_id[1]
    >>> _translate_str("AAA", table)
    'K'
    >>> _translate_str("TAR", table)
    '*'
    >>> _translate_str("TAN", table)
    'X'
    >>> _translate_str("TAN", table, pos_stop="@")
    '@'
    >>> _translate_str("TA?", table)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid

    In a change to older versions of Biopython, partial codons are now
    always regarded as an error (previously only checked if cds=True)
    and will trigger a warning (likely to become an exception in a
    future release).

    If **cds=True**, the start and stop codons are checked, and the start
    codon will be translated as methionine. The sequence must be a
    whole number of codons.

    >>> _translate_str("ATGCCCTAG", table, cds=True)
    'MP'
    >>> _translate_str("AAACCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
    >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found.
    """
    # --- Work out which codon table to use -------------------------------
    try:
        table_id = int(table)
    except ValueError:
        # Not a number, so assume it's a table name.
        # The same table can be used for RNA or DNA
        try:
            codon_table = CodonTable.ambiguous_generic_by_name[table]
        except KeyError:
            if not isinstance(table, str):
                raise TypeError("table argument must be integer or string") from None
            raise ValueError(
                "The Bio.Seq translate methods and function DO NOT "
                "take a character string mapping table like the python "
                "string object's translate method. "
                "Use str(my_seq).translate(...) instead."
            ) from None
    except (AttributeError, TypeError):
        # Cannot be converted to an integer; assume it's a CodonTable object
        if not isinstance(table, CodonTable.CodonTable):
            raise ValueError("Bad table argument") from None
        codon_table = table
    else:
        # Assume it's a table ID
        # The same table can be used for RNA or DNA
        codon_table = CodonTable.ambiguous_generic_by_id[table_id]

    # --- Gather what the main loop needs ---------------------------------
    sequence = sequence.upper()
    protein = []
    forward_table = codon_table.forward_table
    stop_codons = codon_table.stop_codons
    if codon_table.nucleotide_alphabet is None:
        # Assume the worst case, ambiguous DNA or RNA:
        allowed_letters = set(
            IUPACData.ambiguous_dna_letters.upper()
            + IUPACData.ambiguous_rna_letters.upper()
        )
    else:
        allowed_letters = set(codon_table.nucleotide_alphabet.upper())
    n = len(sequence)

    # Check for tables with 'ambiguous' (dual-coding) stop codons:
    dual_coding = [stop for stop in stop_codons if stop in forward_table]
    if dual_coding:
        c = dual_coding[0]
        if to_stop:
            raise ValueError(
                "You cannot use 'to_stop=True' with this table as it contains"
                f" {len(dual_coding)} codon(s) which can be both STOP and an"
                f" amino acid (e.g. '{c}' -> '{forward_table[c]}' or STOP)."
            )
        warnings.warn(
            f"This table contains {len(dual_coding)} codon(s) which code(s) for"
            f" both STOP and an amino acid (e.g. '{c}' -> '{forward_table[c]}'"
            " or STOP). Such codons will be translated as amino acid.",
            BiopythonWarning,
        )

    # --- Validate the input ----------------------------------------------
    if cds:
        if str(sequence[:3]).upper() not in codon_table.start_codons:
            raise CodonTable.TranslationError(
                f"First codon '{sequence[:3]}' is not a start codon"
            )
        if n % 3 != 0:
            raise CodonTable.TranslationError(
                f"Sequence length {n} is not a multiple of three"
            )
        if str(sequence[-3:]).upper() not in stop_codons:
            raise CodonTable.TranslationError(
                f"Final codon '{sequence[-3:]}' is not a stop codon"
            )
        # Don't translate the stop symbol, and manually translate the M
        sequence = sequence[3:-3]
        n -= 6
        protein = ["M"]
    elif n % 3 != 0:
        warnings.warn(
            "Partial codon, len(sequence) not a multiple of three. "
            "Explicitly trim the sequence or add trailing N before "
            "translation. This may become an error in future.",
            BiopythonWarning,
        )
    if gap is not None:
        if not isinstance(gap, str):
            raise TypeError("Gap character should be a single character string.")
        if len(gap) > 1:
            raise ValueError("Gap character should be a single character string.")

    # --- Translate codon by codon (any trailing partial codon is ignored) -
    for offset in range(0, n - n % 3, 3):
        codon = sequence[offset : offset + 3]
        try:
            protein.append(forward_table[codon])
        except (KeyError, CodonTable.TranslationError):
            if codon in stop_codons:
                if cds:
                    raise CodonTable.TranslationError(
                        f"Extra in frame stop codon '{codon}' found."
                    ) from None
                if to_stop:
                    break
                protein.append(stop_symbol)
            elif set(codon) <= allowed_letters:
                # Possible stop codon (e.g. NNN or TAN)
                protein.append(pos_stop)
            elif gap is not None and codon == gap * 3:
                # Gapped translation
                protein.append(gap)
            else:
                raise CodonTable.TranslationError(
                    f"Codon '{codon}' is invalid"
                ) from None
    return "".join(protein)
jpayne@68 2916
jpayne@68 2917
def translate(
    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object. Given a Seq or
    MutableSeq, returns a Seq object.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable object
       (useful for non-standard genetic codes).  Defaults to the "Standard"
       table.
     - stop_symbol - Single character string, what to use for any
       terminators, defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons
       (translated as the specified stop_symbol).  If
       True, translation is terminated at the first in
       frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None. For a Seq or MutableSeq, an explicitly given
       gap is passed on to the object's translate method; if left as
       None, that method's own default for gap applies.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    It will however translate either DNA or RNA.

    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
    stop codons'. These are stop codons with unambiguous sequence but which
    have a context dependent coding as STOP or as amino acid. With these tables
    'to_stop' must be False (otherwise a ValueError is raised). The dual
    coding codons will always be translated as amino acid, except for
    'cds=True', where the last codon will be translated as STOP.

    >>> coding_dna3 = "ATGGCACGGAAGTGA"
    >>> translate(coding_dna3)
    'MARK*'

    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
    'MARKW'

    It will however raise a BiopythonWarning (not shown).

    >>> translate(coding_dna3, table=27, cds=True)
    'MARK'

    >>> translate(coding_dna3, table=27, to_stop=True)
    Traceback (most recent call last):
       ...
    ValueError: You cannot use 'to_stop=True' with this table ...
    """
    if isinstance(sequence, Seq):
        seq_obj = sequence
    elif isinstance(sequence, MutableSeq):
        # Convert so that a Seq object is returned
        seq_obj = Seq(sequence)
    else:
        # Assume it's a string, return a string
        return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)
    if gap is None:
        # No gap requested: keep the historical call, so the translate
        # method's own default for gap applies.
        return seq_obj.translate(table, stop_symbol, to_stop, cds)
    # An explicitly requested gap character used to be silently dropped
    # for Seq/MutableSeq input; pass it on.
    return seq_obj.translate(table, stop_symbol, to_stop, cds, gap=gap)
jpayne@68 3020
jpayne@68 3021
def reverse_complement(sequence, inplace=False):
    """Return the reverse complement as a DNA sequence.

    The type of the result follows the type of the input:

     - a string gives a new string object;
     - a Seq object gives a new Seq object;
     - a MutableSeq gives a new MutableSeq object;
     - a SeqRecord object gives a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> reverse_complement(my_seq)
    'TCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement(my_seq)
    Seq('TCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> reverse_complement(Seq("CGAUT"))
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement("ACGTUacgtuXYZxyz")
    'zrxZRXaacgtAACGT'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq, inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``. The same holds for strings and SeqRecord objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    # Sequence objects know how to reverse-complement themselves.
    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.reverse_complement(inplace)

    if not isinstance(sequence, SeqRecord):
        # Anything else is assumed to be a plain string.
        if inplace:
            raise TypeError("strings are immutable")
        # The complement table works on bytes, so round-trip through ASCII
        # and reverse the decoded text at the very end.
        complemented = sequence.encode("ASCII").translate(_dna_complement_table)
        return complemented.decode("ASCII")[::-1]

    if inplace:
        raise TypeError("SeqRecords are immutable")
    return sequence.reverse_complement()
jpayne@68 3085
jpayne@68 3086
def reverse_complement_rna(sequence, inplace=False):
    """Return the reverse complement as an RNA sequence.

    The type of the result follows the type of the input:

     - a string gives a new string object;
     - a Seq object gives a new Seq object;
     - a MutableSeq gives a new MutableSeq object;
     - a SeqRecord object gives a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> reverse_complement_rna(my_seq)
    'UCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement_rna(my_seq)
    Seq('UCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> reverse_complement(Seq("CGAUT"), inplace=False)
    Seq('AATCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement_rna("ACGTUacgtuXYZxyz")
    'zrxZRXaacguAACGU'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq, inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement_rna`` is called on a ``Seq`` object
    with ``inplace=True``. The same holds for strings and SeqRecord
    objects.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    # Sequence objects know how to reverse-complement themselves.
    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.reverse_complement_rna(inplace)

    if not isinstance(sequence, SeqRecord):
        # Anything else is assumed to be a plain string.
        if inplace:
            raise TypeError("strings are immutable")
        # The complement table works on bytes, so round-trip through ASCII
        # and reverse the decoded text at the very end.
        complemented = sequence.encode("ASCII").translate(_rna_complement_table)
        return complemented.decode("ASCII")[::-1]

    if inplace:
        raise TypeError("SeqRecords are immutable")
    return sequence.reverse_complement_rna()
jpayne@68 3150
jpayne@68 3151
def complement(sequence, inplace=False):
    """Return the complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    Arguments:
     - sequence - a string, Seq, MutableSeq or SeqRecord object.
     - inplace - if true, request that the sequence itself is modified
       and returned; this is rejected with a ``TypeError`` for strings
       and SeqRecord objects.

    >>> my_seq = "CGA"
    >>> complement(my_seq)
    'GCT'
    >>> my_seq = Seq("CGA")
    >>> complement(my_seq)
    Seq('GCT')
    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement("ACGTUacgtuXYZxyz")
    'TGCAAtgcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq, inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement`` is called on a ``Seq`` object with
    ``inplace=True``. The same holds for strings and SeqRecord objects:

    >>> complement("CGA", inplace=True)
    Traceback (most recent call last):
       ...
    TypeError: strings are immutable
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement()
    # Assume it's a string.
    if inplace:
        # Truthiness test rather than ``is True``: any truthy value (e.g. 1)
        # must be rejected, as in the SeqRecord branch above and in the
        # other module-level complement functions.
        raise TypeError("strings are immutable")
    # The complement table works on bytes, so round-trip through ASCII.
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_dna_complement_table)
    return sequence.decode("ASCII")
jpayne@68 3214
jpayne@68 3215
def complement_rna(sequence, inplace=False):
    """Return the complement as an RNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    Arguments:
     - sequence - a string, Seq, MutableSeq or SeqRecord object.
     - inplace - if true, request that the sequence itself is modified
       and returned; this is rejected with a ``TypeError`` for strings
       and SeqRecord objects.

    >>> my_seq = "CGA"
    >>> complement_rna(my_seq)
    'GCU'
    >>> my_seq = Seq("CGA")
    >>> complement_rna(my_seq)
    Seq('GCU')
    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement_rna("ACGTUacgtuXYZxyz")
    'UGCAAugcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq, inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``. The same holds for strings and SeqRecord objects:

    >>> complement_rna("CGA", inplace=True)
    Traceback (most recent call last):
       ...
    TypeError: strings are immutable
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement_rna()
    # Assume it's a string.
    if inplace:
        raise TypeError("strings are immutable")
    # The complement table works on bytes, so round-trip through ASCII.
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_rna_complement_table)
    return sequence.decode("ASCII")
jpayne@68 3278
jpayne@68 3279
def _test():
    """Run the Bio.Seq module's doctests (PRIVATE).

    A short progress message is printed before and after the run.
    Exception details in expected tracebacks are not compared
    (``IGNORE_EXCEPTION_DETAIL``).
    """
    print("Running doctests...")
    from doctest import IGNORE_EXCEPTION_DETAIL, testmod

    testmod(optionflags=IGNORE_EXCEPTION_DETAIL)
    print("Done")
jpayne@68 3287
jpayne@68 3288
# Executing this module as a script runs its doctests.
if __name__ == "__main__":
    _test()