comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 # Copyright 2000 Andrew Dalke.
2 # Copyright 2000-2002 Brad Chapman.
3 # Copyright 2004-2005, 2010 by M de Hoon.
4 # Copyright 2007-2023 by Peter Cock.
5 # All rights reserved.
6 #
7 # This file is part of the Biopython distribution and governed by your
8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
9 # Please see the LICENSE file that should have been included as part of this
10 # package.
11 """Provide objects to represent biological sequences.
12
13 See also the Seq_ wiki and the chapter in our tutorial:
14 - `HTML Tutorial`_
15 - `PDF Tutorial`_
16
17 .. _Seq: http://biopython.org/wiki/Seq
18 .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
19 .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
20
21 """
22 import array
23 import collections
24 import numbers
25 import warnings
26
27 from abc import ABC
28 from abc import abstractmethod
29 from typing import overload, Optional, Union, Dict
30
31 from Bio import BiopythonWarning
32 from Bio.Data import CodonTable
33 from Bio.Data import IUPACData
34
35
36 def _maketrans(complement_mapping):
37 """Make a python string translation table (PRIVATE).
38
39 Arguments:
40 - complement_mapping - a dictionary such as ambiguous_dna_complement
41 and ambiguous_rna_complement from Data.IUPACData.
42
43 Returns a translation table (a bytes object of length 256) for use with
44 the python string's translate method to use in a (reverse) complement.
45
46 Compatible with lower case and upper case sequences.
47
48 For internal use only.
49 """
50 keys = "".join(complement_mapping.keys()).encode("ASCII")
51 values = "".join(complement_mapping.values()).encode("ASCII")
52 return bytes.maketrans(keys + keys.lower(), values + values.lower())
53
54
# Module-level complement tables. Each table is built from a private copy of
# the IUPAC mapping (so the data in IUPACData is not modified), extended so
# that both T and U are accepted in either table. The temporary dictionaries
# are deleted afterwards so they do not remain in the module namespace.
ambiguous_dna_complement = {**IUPACData.ambiguous_dna_complement}
ambiguous_dna_complement.update(U=ambiguous_dna_complement["T"])
_dna_complement_table = _maketrans(ambiguous_dna_complement)
del ambiguous_dna_complement

ambiguous_rna_complement = {**IUPACData.ambiguous_rna_complement}
ambiguous_rna_complement.update(T=ambiguous_rna_complement["U"])
_rna_complement_table = _maketrans(ambiguous_rna_complement)
del ambiguous_rna_complement
63
64
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

     * ``__len__`` must return the sequence length;
     * ``__getitem__`` must return

       * a ``bytes`` object for the requested region; or
       * a new instance of the subclass for the requested region; or
       * raise an ``UndefinedSequenceError``.

       Calling ``__getitem__`` for a sequence region of size zero should always
       return an empty ``bytes`` object.
       Calling ``__getitem__`` for the full sequence (as in data[:]) should
       either return a ``bytes`` object with the full sequence, or raise an
       ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    Apart from the two abstract methods, every method defined here first
    materialises the full sequence with ``bytes(self)`` (that is, ``self[:]``)
    and then delegates to the ``bytes`` method of the same name.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def __getitem__(self, key):
        pass

    def __bytes__(self):
        # The full-sequence slice is the canonical way to obtain the data.
        return self[:]

    def __hash__(self):
        return hash(bytes(self))

    def __eq__(self, other):
        return bytes(self) == other

    def __lt__(self, other):
        return bytes(self) < other

    def __le__(self, other):
        return bytes(self) <= other

    def __gt__(self, other):
        return bytes(self) > other

    def __ge__(self, other):
        return bytes(self) >= other

    def __add__(self, other):
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            return NotImplemented
            # will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__

    def __radd__(self, other):
        return other + bytes(self)

    def __mul__(self, other):
        return other * bytes(self)

    def __contains__(self, item):
        return bytes(self).__contains__(item)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start,end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start,end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start,end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start,end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raise ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present.

        Returns the data as ``bytes`` with the leading ``prefix`` removed, or
        the unchanged data if it does not start with ``prefix``.
        """
        # Want to do just this, but need Python 3.9+
        # return bytes(self).removeprefix(prefix)
        data = bytes(self)
        try:
            return data.removeprefix(prefix)
        except AttributeError:
            # bytes.removeprefix is unavailable before Python 3.9
            if data.startswith(prefix):
                return data[len(prefix) :]
            return data

    def removesuffix(self, suffix):
        """Remove the suffix if present.

        Returns the data as ``bytes`` with the trailing ``suffix`` removed, or
        the unchanged data if it does not end with ``suffix`` (or if ``suffix``
        is empty).
        """
        # Want to do just this, but need Python 3.9+
        # return bytes(self).removesuffix(suffix)
        data = bytes(self)
        try:
            return data.removesuffix(suffix)
        except AttributeError:
            # bytes.removesuffix is unavailable before Python 3.9.
            # The suffix must be tested with endswith (not startswith), and an
            # empty suffix must be skipped because data[:-0] is data[:0],
            # which would discard the whole sequence.
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

        table
          Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        return ()
360
361
362 class _SeqAbstractBaseClass(ABC):
363 """Abstract base class for the Seq and MutableSeq classes (PRIVATE).
364
365 Most users will not need to use this class. It is used internally as an
366 abstract base class for Seq and MutableSeq, as most of their methods are
367 identical.
368 """
369
370 __slots__ = ("_data",)
371 __array_ufunc__ = None # turn off numpy Ufuncs
372
373 @abstractmethod
374 def __init__(self):
375 pass
376
377 def __bytes__(self):
378 return bytes(self._data)
379
def __repr__(self):
    """Return a (possibly abbreviated) representation of the sequence.

    Sequences, and the defined fragments of a partially defined sequence,
    that are longer than 60 letters are shown as their first 54 letters,
    three dots, and their last three letters.
    """
    data = self._data
    if isinstance(data, _UndefinedSequenceData):
        return f"Seq(None, length={len(self)})"
    if isinstance(data, _PartiallyDefinedSequenceData):
        fragments = {}
        for position, fragment in data._data.items():
            if len(fragment) <= 60:
                text = fragment.decode("ASCII")
            else:
                head = fragment[:54].decode("ASCII")
                tail = fragment[-3:].decode("ASCII")
                text = f"{head}...{tail}"
            fragments[position] = text
        return "Seq(%r, length=%d)" % (fragments, len(self))
    if len(data) <= 60:
        data = data.decode("ASCII")
        return f"{self.__class__.__name__}('{data}')"
    # Keep the last three letters visible as it is often useful to see if
    # there is a stop codon at the end of a sequence.
    # The abbreviated text is 54+3+3=60 characters long.
    start = data[:54].decode("ASCII")
    end = data[-3:].decode("ASCII")
    return f"{self.__class__.__name__}('{start}...{end}')"
406
407 def __str__(self):
408 """Return the full sequence as a python string."""
409 return self._data.decode("ASCII")
410
def __eq__(self, other):
    """Compare the sequence to another sequence or a string.

    Two sequence objects are equal if their sequence contents is identical:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> seq1 = Seq("ACGT")
    >>> seq2 = Seq("ACGT")
    >>> mutable_seq = MutableSeq("ACGT")
    >>> seq1 == seq2
    True
    >>> seq1 == mutable_seq
    True
    >>> seq1 == "ACGT"
    True

    Equality compares contents only; the objects themselves remain distinct:

    >>> id(seq1) == id(seq2)
    False
    >>> seq1 is seq2
    False

    A sequence can also be compared to a string, a ``bytes`` object, or a
    ``bytearray`` object:

    >>> seq1 == "ACGT"
    True
    >>> seq1 == b"ACGT"
    True
    >>> seq1 == bytearray(b"ACGT")
    True
    """
    # Reduce the right-hand side to its underlying data, then compare.
    if isinstance(other, _SeqAbstractBaseClass):
        other = other._data
    elif isinstance(other, str):
        other = other.encode("ASCII")
    return self._data == other
452
def __lt__(self, other):
    """Implement the less-than operand."""
    # Reduce the right-hand side to its underlying data, then compare.
    if isinstance(other, _SeqAbstractBaseClass):
        other = other._data
    elif isinstance(other, str):
        other = other.encode("ASCII")
    return self._data < other
461
def __le__(self, other):
    """Implement the less-than or equal operand."""
    # Reduce the right-hand side to its underlying data, then compare.
    if isinstance(other, _SeqAbstractBaseClass):
        other = other._data
    elif isinstance(other, str):
        other = other.encode("ASCII")
    return self._data <= other
470
def __gt__(self, other):
    """Implement the greater-than operand."""
    # Reduce the right-hand side to its underlying data, then compare.
    if isinstance(other, _SeqAbstractBaseClass):
        other = other._data
    elif isinstance(other, str):
        other = other.encode("ASCII")
    return self._data > other
479
def __ge__(self, other):
    """Implement the greater-than or equal operand."""
    # Reduce the right-hand side to its underlying data, then compare.
    if isinstance(other, _SeqAbstractBaseClass):
        other = other._data
    elif isinstance(other, str):
        other = other.encode("ASCII")
    return self._data >= other
488
489 def __len__(self):
490 """Return the length of the sequence."""
491 return len(self._data)
492
493 def __iter__(self):
494 """Return an iterable of the sequence."""
495 return self._data.decode("ASCII").__iter__()
496
497 @overload
498 def __getitem__(self, index: int) -> str:
499 ...
500
501 @overload
502 def __getitem__(self, index: slice) -> "Seq":
503 ...
504
505 def __getitem__(self, index):
506 """Return a subsequence as a single letter or as a sequence object.
507
508 If the index is an integer, a single letter is returned as a Python
509 string:
510
511 >>> seq = Seq('ACTCGACGTCG')
512 >>> seq[5]
513 'A'
514
515 Otherwise, a new sequence object of the same class is returned:
516
517 >>> seq[5:8]
518 Seq('ACG')
519 >>> mutable_seq = MutableSeq('ACTCGACGTCG')
520 >>> mutable_seq[5:8]
521 MutableSeq('ACG')
522 """
523 if isinstance(index, numbers.Integral):
524 # Return a single letter as a string
525 return chr(self._data[index])
526 else:
527 # Return the (sub)sequence as another Seq/MutableSeq object
528 return self.__class__(self._data[index])
529
def __add__(self, other):
    """Return this sequence with a sequence or string appended.

    >>> from Bio.Seq import Seq, MutableSeq
    >>> Seq("MELKI") + "LV"
    Seq('MELKILV')
    >>> MutableSeq("MELKI") + "LV"
    MutableSeq('MELKILV')
    """
    if isinstance(other, _SeqAbstractBaseClass):
        appended = other._data
    elif isinstance(other, str):
        appended = other.encode("ASCII")
    else:
        # If other is a SeqRecord, then SeqRecord's __radd__ will handle
        # this. If not, returning NotImplemented will trigger a TypeError.
        return NotImplemented
    return self.__class__(self._data + appended)
547
548 def __radd__(self, other):
549 """Add a sequence string on the left.
550
551 >>> from Bio.Seq import Seq, MutableSeq
552 >>> "LV" + Seq("MELKI")
553 Seq('LVMELKI')
554 >>> "LV" + MutableSeq("MELKI")
555 MutableSeq('LVMELKI')
556
557 Adding two sequence objects is handled via the __add__ method.
558 """
559 if isinstance(other, str):
560 return self.__class__(other.encode("ASCII") + self._data)
561 else:
562 return NotImplemented
563
564 def __mul__(self, other):
565 """Multiply sequence by integer.
566
567 >>> from Bio.Seq import Seq, MutableSeq
568 >>> Seq('ATG') * 2
569 Seq('ATGATG')
570 >>> MutableSeq('ATG') * 2
571 MutableSeq('ATGATG')
572 """
573 if not isinstance(other, numbers.Integral):
574 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
575 # we would like to simply write
576 # data = self._data * other
577 # here, but currently that causes a bug on PyPy if self._data is a
578 # bytearray and other is a numpy integer. Using this workaround:
579 data = self._data.__mul__(other)
580 return self.__class__(data)
581
582 def __rmul__(self, other):
583 """Multiply integer by sequence.
584
585 >>> from Bio.Seq import Seq
586 >>> 2 * Seq('ATG')
587 Seq('ATGATG')
588 """
589 if not isinstance(other, numbers.Integral):
590 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
591 # we would like to simply write
592 # data = self._data * other
593 # here, but currently that causes a bug on PyPy if self._data is a
594 # bytearray and other is a numpy integer. Using this workaround:
595 data = self._data.__mul__(other)
596 return self.__class__(data)
597
598 def __imul__(self, other):
599 """Multiply the sequence object by other and assign.
600
601 >>> from Bio.Seq import Seq
602 >>> seq = Seq('ATG')
603 >>> seq *= 2
604 >>> seq
605 Seq('ATGATG')
606
607 Note that this is different from in-place multiplication. The ``seq``
608 variable is reassigned to the multiplication result, but any variable
609 pointing to ``seq`` will remain unchanged:
610
611 >>> seq = Seq('ATG')
612 >>> seq2 = seq
613 >>> id(seq) == id(seq2)
614 True
615 >>> seq *= 2
616 >>> seq
617 Seq('ATGATG')
618 >>> seq2
619 Seq('ATG')
620 >>> id(seq) == id(seq2)
621 False
622 """
623 if not isinstance(other, numbers.Integral):
624 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
625 # we would like to simply write
626 # data = self._data * other
627 # here, but currently that causes a bug on PyPy if self._data is a
628 # bytearray and other is a numpy integer. Using this workaround:
629 data = self._data.__mul__(other)
630 return self.__class__(data)
631
def count(self, sub, start=None, end=None):
    """Return a non-overlapping count, like that of a python string.

    Counts how often the substring argument sub occurs in the (sub)sequence
    given by [start:end], and returns the result as an integer. Optional
    arguments start and end are interpreted as in slice notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count("A"))
    5
    >>> print(my_seq.count("ATG"))
    1
    >>> print(my_seq.count(Seq("AT")))
    1
    >>> print(my_seq.count("AT", 2, -1))
    1

    HOWEVER, please note because the ``count`` method of Seq and MutableSeq
    objects, like that of Python strings, do a non-overlapping search, this
    may not give the answer you expect:

    >>> "AAAA".count("AA")
    2
    >>> print(Seq("AAAA").count("AA"))
    2

    For an overlapping search, use the ``count_overlap`` method:

    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    """
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.count(needle, start, end)
684
def count_overlap(self, sub, start=None, end=None):
    """Return an overlapping count.

    Returns an integer, the number of (possibly overlapping) occurrences
    of substring argument sub in the (sub)sequence given by [start:end].
    Optional arguments start and end are interpreted as in slice
    notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    e.g.

    >>> from Bio.Seq import Seq
    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    >>> print(Seq("ATATATATA").count_overlap("ATA"))
    4
    >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
    1

    For a non-overlapping search, use the ``count`` method:

    >>> print(Seq("AAAA").count("AA"))
    2

    Where substrings do not overlap, ``count_overlap`` behaves the same as
    the ``count`` method:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count_overlap("A"))
    5
    >>> my_seq.count_overlap("A") == my_seq.count("A")
    True
    >>> print(my_seq.count_overlap("ATG"))
    1
    >>> my_seq.count_overlap("ATG") == my_seq.count("ATG")
    True
    >>> print(my_seq.count_overlap(Seq("AT")))
    1
    >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
    True
    >>> print(my_seq.count_overlap("AT", 2, -1))
    1
    >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
    True

    HOWEVER, do not use this method for such cases because the
    count() method is much more efficient.
    """
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    haystack = self._data
    hits = 0
    # Restart each search one position after the previous hit, so that
    # occurrences overlapping the previous one are also found.
    position = haystack.find(needle, start, end)
    while position != -1:
        hits += 1
        position = haystack.find(needle, position + 1, end)
    return hits
757
def __contains__(self, item):
    """Return True if item is a subsequence of the sequence, and False otherwise.

    The item may be given as a string or as a sequence object, e.g.

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_dna = Seq("ATATGAAATTTGAAAA")
    >>> "AAA" in my_dna
    True
    >>> Seq("AAA") in my_dna
    True
    >>> MutableSeq("AAA") in my_dna
    True
    """
    needle = item
    if isinstance(needle, _SeqAbstractBaseClass):
        needle = bytes(needle)
    elif isinstance(needle, str):
        needle = needle.encode("ASCII")
    return needle in self._data
777
def find(self, sub, start=None, end=None):
    """Return the lowest index in the sequence where subsequence sub is found.

    If the optional arguments start and end are given, the lowest index is
    returned for which the subsequence sub is contained within the sequence
    region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.find("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.find("AUG", 4)
    15

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, _SeqAbstractBaseClass):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.find(needle, start, end)
818
def rfind(self, sub, start=None, end=None):
    """Return the highest index in the sequence where subsequence sub is found.

    If the optional arguments start and end are given, the highest index is
    returned for which the subsequence sub is contained within the sequence
    region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rfind("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rfind("AUG", end=15)
    3

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, _SeqAbstractBaseClass):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rfind(needle, start, end)
859
def index(self, sub, start=None, end=None):
    """Return the lowest index in the sequence where subsequence sub is found.

    If the optional arguments start and end are given, the lowest index is
    returned for which the subsequence sub is contained within the sequence
    region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.index("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.index("AUG", 4)
    15

    This method performs the same search as the ``find`` method.  However,
    if the subsequence is not found, ``find`` returns -1 while ``index``
    raises a ValueError:

    >>> my_rna.index("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.find("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.index(needle, start, end)
913
def rindex(self, sub, start=None, end=None):
    """Return the highest index in the sequence where subsequence sub is found.

    If the optional arguments start and end are given, the highest index is
    returned for which the subsequence sub is contained within the sequence
    region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rindex("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rindex("AUG", end=15)
    3

    This method performs the same search as the ``rfind`` method.  However,
    if the subsequence is not found, ``rfind`` returns -1 while ``rindex``
    raises a ValueError:

    >>> my_rna.rindex("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.rfind("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rindex(needle, start, end)
967
def search(self, subs):
    """Search the substrings subs in self and yield the index and substring found.

    Arguments:
     - subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
       objects containing the substrings to search for.

    Matches are yielded as ``(index, substring)`` tuples in order of
    increasing index, with the substring given as a string. Duplicate
    entries in ``subs`` are reported only once per position.

    >>> from Bio.Seq import Seq
    >>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
    >>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
    >>> for index, substring in matches:
    ...     print(index, substring)
    ...
    7 CC
    9 ATTG
    20 CC
    34 CC
    34 CCC
    35 CC

    Every position is examined, including the last one, so a single-letter
    substring at the very end of the sequence is found as well:

    >>> list(Seq("ACGT").search(["T"]))
    [(3, 'T')]

    As this is a generator, a TypeError for an unsupported entry in ``subs``
    is raised when iteration starts, not when ``search`` is called.
    """
    # Group the patterns by length so that each window of the sequence is
    # sliced once per length rather than once per pattern.
    patterns_by_length = collections.defaultdict(set)
    for index, sub in enumerate(subs):
        if isinstance(sub, str):
            sub = sub.encode("ASCII")
        elif isinstance(sub, (bytes, bytearray)):
            # bytearray is unhashable; store an immutable copy in the set
            sub = bytes(sub)
        elif isinstance(sub, _SeqAbstractBaseClass):
            sub = bytes(sub)
        else:
            raise TypeError(
                "subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
                % (index, type(sub))
            )
        patterns_by_length[len(sub)].add(sub)
    data = self._data
    # range(len(self)) rather than range(len(self) - 1): the final position
    # can still hold a match of length one.
    for start in range(len(self)):
        for length, patterns in patterns_by_length.items():
            window = data[start : start + length]
            for pattern in patterns:
                if window == pattern:
                    yield (start, pattern.decode())
                    # patterns of equal length are distinct, so at most
                    # one of them can match this window
                    break
1008
def startswith(self, prefix, start=None, end=None):
    """Return True if the sequence starts with the given prefix, False otherwise.

    Return True if the sequence starts with the specified prefix
    (a string, a bytes-like object, or another Seq or MutableSeq object),
    False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    prefix can also be a tuple of such objects to try. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.startswith("GUC")
    True
    >>> my_rna.startswith("AUG")
    False
    >>> my_rna.startswith("AUG", 3)
    True
    >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
    True
    >>> my_rna.startswith((b"AUG", Seq("GUC")))
    True
    """

    def as_bytes(item):
        """Convert one tuple entry to something bytes.startswith accepts."""
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            # Already bytes-like; previously this raised AttributeError
            # although a single bytes prefix was accepted.
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        # Any other object is treated as before: it must offer ``encode``.
        return item.encode("ASCII")

    if isinstance(prefix, tuple):
        prefix = tuple(as_bytes(item) for item in prefix)
    elif isinstance(prefix, str):
        prefix = prefix.encode("ASCII")
    elif isinstance(prefix, (bytes, bytearray)):
        pass  # passed through unchanged
    elif isinstance(prefix, _SeqAbstractBaseClass):
        prefix = bytes(prefix)
    return self._data.startswith(prefix, start, end)
1039
def endswith(self, suffix, start=None, end=None):
    """Return True if the sequence ends with the given suffix, False otherwise.

    Return True if the sequence ends with the specified suffix
    (a string, a bytes-like object, or another Seq or MutableSeq object),
    False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    suffix can also be a tuple of such objects to try. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.endswith("UUG")
    True
    >>> my_rna.endswith("AUG")
    False
    >>> my_rna.endswith("AUG", 0, 18)
    True
    >>> my_rna.endswith(("UCC", "UCA", "UUG"))
    True
    >>> my_rna.endswith((b"UCC", Seq("UUG")))
    True
    """

    def as_bytes(item):
        """Convert one tuple entry to something bytes.endswith accepts."""
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            # Already bytes-like; previously this raised AttributeError
            # although a single bytes suffix was accepted.
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        # Any other object is treated as before: it must offer ``encode``.
        return item.encode("ASCII")

    if isinstance(suffix, tuple):
        suffix = tuple(as_bytes(item) for item in suffix)
    elif isinstance(suffix, str):
        suffix = suffix.encode("ASCII")
    elif isinstance(suffix, (bytes, bytearray)):
        pass  # passed through unchanged
    elif isinstance(suffix, _SeqAbstractBaseClass):
        suffix = bytes(suffix)
    return self._data.endswith(suffix, start, end)
1070
def split(self, sep=None, maxsplit=-1):
    """Split the sequence at the separator sep and return the pieces as a list.

    The pieces are returned as Seq objects. ``sep`` may be a string or a
    Seq or MutableSeq object; any other value is handed unchanged to the
    ``split`` method of the underlying data. If maxsplit is given, at most
    maxsplit splits are done; if it is omitted, all splits are made.

    For consistency with the ``split`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.split("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')

    See also the rsplit method, which splits the sequence starting from the
    end:

    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')
    """
    delimiter = sep
    if isinstance(delimiter, _SeqAbstractBaseClass):
        delimiter = bytes(delimiter)
    elif isinstance(delimiter, str):
        delimiter = delimiter.encode("ASCII")
    pieces = self._data.split(delimiter, maxsplit)
    return list(map(Seq, pieces))
1113
def rsplit(self, sep=None, maxsplit=-1):
    """Split the sequence at sep, working from the right, and return the pieces.

    The pieces are returned as a list of Seq objects. ``sep`` may be a
    string or a Seq or MutableSeq object; any other value is handed
    unchanged to the ``rsplit`` method of the underlying data. If maxsplit
    is given, at most maxsplit splits are done; if it is omitted, all
    splits are made.

    For consistency with the ``rsplit`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.rsplit("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')

    See also the split method, which splits the sequence starting from the
    beginning:

    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')
    """
    delimiter = sep
    if isinstance(delimiter, _SeqAbstractBaseClass):
        delimiter = bytes(delimiter)
    elif isinstance(delimiter, str):
        delimiter = delimiter.encode("ASCII")
    pieces = self._data.rsplit(delimiter, maxsplit)
    return list(map(Seq, pieces))
1156
def strip(self, chars=None, inplace=False):
    """Return a sequence object with leading and trailing ends stripped.

    When called without arguments, whitespace is removed from both ends:

    >>> seq = Seq(" ACGT ")
    >>> seq.strip()
    Seq('ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters it contains are
    removed instead. Their order does not matter:

    >>> Seq("ACGTACGT").strip("TGCA")
    Seq('')

    With ``inplace`` set to ``False`` (the default) a stripped copy is
    returned. With ``inplace`` set to ``True`` the sequence itself is
    stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.strip()
    MutableSeq('ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.strip(inplace=True)
    MutableSeq('ACGT')
    >>> seq
    MutableSeq('ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
    is called on a ``Seq`` object with ``inplace=True``. A ``TypeError`` is
    also raised if the underlying data rejects the type of ``chars``.

    See also the lstrip and rstrip methods.
    """
    unwanted = chars
    if isinstance(unwanted, _SeqAbstractBaseClass):
        unwanted = bytes(unwanted)
    elif isinstance(unwanted, str):
        unwanted = unwanted.encode("ASCII")
    try:
        trimmed = self._data.strip(unwanted)
    except TypeError:
        # Replace the low-level message with one naming the accepted types.
        message = (
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        )
        raise TypeError(message) from None
    if not inplace:
        return self.__class__(trimmed)
    if isinstance(self._data, bytearray):
        self._data[:] = trimmed
        return self
    raise TypeError("Sequence is immutable")
1210
def lstrip(self, chars=None, inplace=False):
    """Return a sequence object with the leading end stripped.

    When called without arguments, leading whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.lstrip()
    Seq('ACGT ')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters it contains are
    removed from the leading end instead. Their order does not matter:

    >>> Seq("ACGACGTTACG").lstrip("GCA")
    Seq('TTACG')

    With ``inplace`` set to ``False`` (the default) a stripped copy is
    returned. With ``inplace`` set to ``True`` the sequence itself is
    stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.lstrip()
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.lstrip(inplace=True)
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq('ACGT ')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lstrip`` is called on a ``Seq`` object with ``inplace=True``. A
    ``TypeError`` is also raised if the underlying data rejects the type
    of ``chars``.

    See also the strip and rstrip methods.
    """
    unwanted = chars
    if isinstance(unwanted, _SeqAbstractBaseClass):
        unwanted = bytes(unwanted)
    elif isinstance(unwanted, str):
        unwanted = unwanted.encode("ASCII")
    try:
        trimmed = self._data.lstrip(unwanted)
    except TypeError:
        # Replace the low-level message with one naming the accepted types.
        message = (
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        )
        raise TypeError(message) from None
    if not inplace:
        return self.__class__(trimmed)
    if isinstance(self._data, bytearray):
        self._data[:] = trimmed
        return self
    raise TypeError("Sequence is immutable")
1265
def rstrip(self, chars=None, inplace=False):
    """Return a sequence object with the trailing end stripped.

    When called without arguments, trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.rstrip()
    Seq(' ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters it contains are
    removed from the trailing end instead. Their order does not matter:

    >>> Seq("ACGACGTTACG").rstrip("GCA")
    Seq('ACGACGTT')

    With ``inplace`` set to ``False`` (the default) a stripped copy is
    returned. With ``inplace`` set to ``True`` the sequence itself is
    stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.rstrip()
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.rstrip(inplace=True)
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``rstrip`` is called on a ``Seq`` object with ``inplace=True``. A
    ``TypeError`` is also raised if the underlying data rejects the type
    of ``chars``.

    See also the strip and lstrip methods.
    """
    unwanted = chars
    if isinstance(unwanted, _SeqAbstractBaseClass):
        unwanted = bytes(unwanted)
    elif isinstance(unwanted, str):
        unwanted = unwanted.encode("ASCII")
    try:
        trimmed = self._data.rstrip(unwanted)
    except TypeError:
        # Replace the low-level message with one naming the accepted types.
        message = (
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        )
        raise TypeError(message) from None
    if not inplace:
        return self.__class__(trimmed)
    if isinstance(self._data, bytearray):
        self._data[:] = trimmed
        return self
    raise TypeError("Sequence is immutable")
1320
def removeprefix(self, prefix, inplace=False):
    """Return a new Seq object with prefix (left) removed.

    This behaves like the python string method of the same name: if the
    sequence starts with ``prefix`` it is removed once, otherwise the
    sequence is returned unchanged.

    e.g. Removing a start Codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("ATGGTGTGTGT")
    >>> my_seq
    Seq('ATGGTGTGTGT')
    >>> my_seq.removeprefix('ATG')
    Seq('GTGTGTGT')

    A copy is returned if ``inplace`` is ``False`` (the default value). If
    ``inplace`` is ``True``, the sequence is modified in-place and returned.

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``.
    A ``TypeError`` is also raised if ``prefix`` is not a string, Seq,
    MutableSeq, or bytes-like object (a tuple is not accepted).

    See also the removesuffix method.
    """
    message = "argument must be a string, Seq, MutableSeq, or bytes-like object"
    if isinstance(prefix, str):
        prefix = prefix.encode("ASCII")
    elif isinstance(prefix, (bytes, bytearray)):
        pass  # already bytes-like
    elif isinstance(prefix, _SeqAbstractBaseClass):
        prefix = bytes(prefix)
    try:
        data = self._data.removeprefix(prefix)
    except TypeError:
        raise TypeError(message) from None
    except AttributeError:
        # Fall back for pre-Python 3.9
        data = self._data
        if isinstance(prefix, tuple):
            # startswith() would accept a tuple, but the native
            # removeprefix() never does; keep both paths consistent.
            raise TypeError(message) from None
        try:
            matched = data.startswith(prefix)
        except TypeError:
            raise TypeError(message) from None
        # Always slice: for a bytearray this yields a fresh object even
        # when nothing is removed, as the native method does, so the
        # result never shares its buffer with self.
        skip = len(prefix) if matched else 0
        data = data[skip:]
    if inplace:
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    else:
        return self.__class__(data)
1362
def removesuffix(self, suffix, inplace=False):
    """Return a new Seq object with suffix (right) removed.

    This behaves like the python string method of the same name: if the
    sequence ends with ``suffix`` it is removed once, otherwise the
    sequence is returned unchanged. An empty suffix removes nothing.

    e.g. Removing a stop codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("GTGTGTGTTAG")
    >>> my_seq
    Seq('GTGTGTGTTAG')
    >>> stop_codon = Seq("TAG")
    >>> my_seq.removesuffix(stop_codon)
    Seq('GTGTGTGT')
    >>> my_seq.removesuffix("")
    Seq('GTGTGTGTTAG')

    A copy is returned if ``inplace`` is ``False`` (the default value). If
    ``inplace`` is ``True``, the sequence is modified in-place and returned.

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``.
    A ``TypeError`` is also raised if ``suffix`` is not a string, Seq,
    MutableSeq, or bytes-like object (a tuple is not accepted).

    See also the removeprefix method.
    """
    message = "argument must be a string, Seq, MutableSeq, or bytes-like object"
    if isinstance(suffix, str):
        suffix = suffix.encode("ASCII")
    elif isinstance(suffix, (bytes, bytearray)):
        pass  # already bytes-like
    elif isinstance(suffix, _SeqAbstractBaseClass):
        suffix = bytes(suffix)
    try:
        data = self._data.removesuffix(suffix)
    except TypeError:
        raise TypeError(message) from None
    except AttributeError:
        # Fall back for pre-Python 3.9
        data = self._data
        if isinstance(suffix, tuple):
            # endswith() would accept a tuple, but the native
            # removesuffix() never does; keep both paths consistent.
            raise TypeError(message) from None
        try:
            matched = data.endswith(suffix)
        except TypeError:
            raise TypeError(message) from None
        # Compute the end index explicitly: data[:-len(suffix)] would be
        # data[:0] for an empty suffix and wipe the whole sequence.
        # Slicing in both cases also gives a fresh object for a bytearray.
        keep = len(data) - len(suffix) if matched else len(data)
        data = data[:keep]
    if inplace:
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    else:
        return self.__class__(data)
1405
def upper(self, inplace=False):
    """Return the sequence in upper case.

    With inplace set to False, the default value, an upper-case copy of
    the sequence is returned and the original is left untouched:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With inplace set to True, the sequence itself is converted and
    returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``upper`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``lower`` method.
    """
    converted = self._data.upper()
    if not inplace:
        return self.__class__(converted)
    if isinstance(self._data, bytearray):
        self._data[:] = converted
        return self
    raise TypeError("Sequence is immutable")
1457
def lower(self, inplace=False):
    """Return the sequence in lower case.

    With inplace set to False, the default value, a lower-case copy of
    the sequence is returned and the original is left untouched:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With inplace set to True, the sequence itself is converted and
    returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lower`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``upper`` method.
    """
    converted = self._data.lower()
    if not inplace:
        return self.__class__(converted)
    if isinstance(self._data, bytearray):
        self._data[:] = converted
        return self
    raise TypeError("Sequence is immutable")
1509
def isupper(self):
    """Return True if all ASCII characters in data are uppercase.

    The answer comes from the ``isupper`` method of the underlying data.
    False is returned when the data contains no cased characters at all.
    """
    data = self._data
    return data.isupper()
1516
def islower(self):
    """Return True if all ASCII characters in data are lowercase.

    The answer comes from the ``islower`` method of the underlying data.
    False is returned when the data contains no cased characters at all.
    """
    data = self._data
    return data.islower()
1523
def translate(
    self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
):
    """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.

    This method will translate DNA or RNA sequences. It should not
    be used on protein sequences as any result will be biologically
    meaningless.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable
       object (useful for non-standard genetic codes).  This
       defaults to the "Standard" table.
     - stop_symbol - Single character string, what to use for
       terminators.  This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons (translated as the
       specified stop_symbol).  If True, translation is terminated at
       the first in frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
       this checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to the minus sign.

    A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
    object; a ``MutableSeq`` object is returned if ``translate`` is called
    on a ``MutableSeq`` object.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*')
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@')
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR')

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*')
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR')

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR')

    It isn't a valid CDS under NCBI table 1, due to both the start codon
    and also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR')
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR')

    If the sequence contents are undefined, an undefined ``Seq`` one third
    as long (rounded down) is returned, with a ``BiopythonWarning`` if the
    length is not a multiple of three.

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead
    """
    try:
        data = str(self)
    except UndefinedSequenceError:
        # translating an undefined sequence yields an undefined
        # sequence with the length divided by 3
        n = len(self)
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "This may become an error in future.",
                BiopythonWarning,
            )
        return Seq(None, n // 3)

    # Reuse the string decoded above rather than converting a second time.
    return self.__class__(
        _translate_str(data, table, stop_symbol, to_stop, cds, gap=gap)
    )
1623
def complement(self, inplace=False):
    """Return the complement as a DNA sequence.

    >>> Seq("CGA").complement()
    Seq('GCT')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").complement()
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    A new sequence is returned by default; with inplace set to True the
    sequence itself is modified and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement()
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement(inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented)
    if isinstance(self._data, bytearray):
        self._data[:] = complemented
        return self
    raise TypeError("Sequence is immutable")
1671
def complement_rna(self, inplace=False):
    """Return the complement as an RNA sequence.

    >>> Seq("CGA").complement_rna()
    Seq('GCU')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence by default:

    >>> Seq("CGA").complement()
    Seq('GCT')

    A new sequence is returned by default; with inplace set to True the
    sequence itself is modified and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement_rna()
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement_rna(inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented)
    if isinstance(self._data, bytearray):
        self._data[:] = complemented
        return self
    raise TypeError("Sequence is immutable")
1718
def reverse_complement(self, inplace=False):
    """Return the reverse complement as a DNA sequence.

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").reverse_complement()
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    A new sequence is returned by default; with inplace set to True the
    sequence itself is modified and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement()
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement(inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if isinstance(self._data, bytearray):
        # Assigning through the reversed slice stores the complemented
        # bytes back to front.
        self._data[::-1] = complemented
        return self
    raise TypeError("Sequence is immutable")
1766
def reverse_complement_rna(self, inplace=False):
    """Return the reverse complement as an RNA sequence.

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").reverse_complement_rna()
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    A new sequence is returned by default; with inplace set to True the
    sequence itself is modified and returned:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement_rna()
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement_rna(inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if isinstance(self._data, bytearray):
        # Assigning through the reversed slice stores the complemented
        # bytes back to front.
        self._data[::-1] = complemented
        return self
    raise TypeError("Sequence is immutable")
1814
def transcribe(self, inplace=False):
    """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object.

    By the usual convention the sequence is taken to be the coding strand
    of the DNA double helix, not the template strand, so the RNA sequence
    is obtained simply by switching T to U (and t to u).

    >>> from Bio.Seq import Seq
    >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> coding_dna.transcribe()
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    A new sequence is returned by default; with inplace set to True the
    sequence itself is modified and returned:

    >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence.transcribe()
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    >>> sequence.transcribe(inplace=True)
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to transcribe an RNA sequence has no effect.
    If you have a nucleotide sequence which might be DNA or RNA
    (or even a mixture), calling the transcribe method will ensure
    any T becomes U.

    Trying to transcribe a protein sequence will replace any
    T for Threonine with U for Selenocysteine, which has no
    biologically plausible rationale.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRT")
    >>> my_protein.transcribe()
    Seq('MAIVMGRU')
    """
    upper_done = self._data.replace(b"T", b"U")
    rna = upper_done.replace(b"t", b"u")
    if not inplace:
        return self.__class__(rna)
    if isinstance(self._data, bytearray):
        self._data[:] = rna
        return self
    raise TypeError("Sequence is immutable")
1868
def back_transcribe(self, inplace=False):
    """Return the DNA sequence from an RNA sequence by creating a new Seq object.

    >>> from Bio.Seq import Seq
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    A new sequence is returned by default; with inplace set to True the
    sequence itself is modified and returned:

    >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence.back_transcribe()
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    >>> sequence.back_transcribe(inplace=True)
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``back_transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to back-transcribe DNA has no effect. If you have a nucleotide
    sequence which might be DNA or RNA (or even a mixture), calling the
    back-transcribe method will ensure any U becomes T.

    Trying to back-transcribe a protein sequence will replace any U for
    Selenocysteine with T for Threonine, which is biologically meaningless.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRU")
    >>> my_protein.back_transcribe()
    Seq('MAIVMGRT')
    """
    upper_done = self._data.replace(b"U", b"T")
    dna = upper_done.replace(b"u", b"t")
    if not inplace:
        return self.__class__(dna)
    if isinstance(self._data, bytearray):
        self._data[:] = dna
        return self
    raise TypeError("Sequence is immutable")
1916
def join(self, other):
    """Return a merge of the sequences in other, spaced by the sequence from self.

    Accepts a Seq object, MutableSeq object, or string (and iterates over
    the letters), or an iterable containing Seq, MutableSeq, or string
    objects. These arguments will be concatenated with the calling sequence
    as the spacer:

    >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
    >>> concatenated
    Seq('AAANNNNNTTTNNNNNPPP')

    Joining the letters of a single sequence:

    >>> Seq('NNNNN').join(Seq("ACGT"))
    Seq('ANNNNNCNNNNNGNNNNNT')
    >>> Seq('NNNNN').join("ACGT")
    Seq('ANNNNNCNNNNNGNNNNNT')

    The iterable is traversed only once, so generators and other one-shot
    iterators are supported as well:

    >>> Seq('NN').join(Seq(letters) for letters in ("AAA", "TTT"))
    Seq('AAANNTTT')

    A TypeError is raised if ``other`` is a SeqRecord, or an iterable
    containing a SeqRecord or anything else that is not a Seq object,
    MutableSeq object, or string.
    """
    if isinstance(other, _SeqAbstractBaseClass):
        return self.__class__(str(self).join(str(other)))
    elif isinstance(other, str):
        return self.__class__(str(self).join(other))

    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(other, SeqRecord):
        raise TypeError("Iterable cannot be a SeqRecord")

    # Validate and collect in a single pass: iterating ``other`` a second
    # time would find a generator already exhausted and silently produce
    # an empty result.
    items = []
    for c in other:
        if isinstance(c, SeqRecord):
            raise TypeError("Iterable cannot contain SeqRecords")
        elif not isinstance(c, (str, _SeqAbstractBaseClass)):
            raise TypeError(
                "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
            )
        items.append(c)
    # Convert to strings only after every element has been validated.
    return self.__class__(str(self).join([str(item) for item in items]))
1954
def replace(self, old, new, inplace=False):
    """Return a copy with all occurrences of subsequence old replaced by new.

    ``old`` and ``new`` may each be a string or a Seq or MutableSeq object;
    any other value is handed unchanged to the ``replace`` method of the
    underlying data.

    >>> s = Seq("ACGTAACCGGTT")
    >>> t = s.replace("AC", "XYZ")
    >>> s
    Seq('ACGTAACCGGTT')
    >>> t
    Seq('XYZGTAXYZCGGTT')

    For mutable sequences, passing inplace=True will modify the sequence in place:

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ")
    >>> m
    MutableSeq('ACGTAACCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ", inplace=True)
    >>> m
    MutableSeq('XYZGTAXYZCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``replace`` is called on a ``Seq`` object with ``inplace=True``.
    """
    # Normalise both arguments the same way, ``old`` first and then ``new``.
    converted = []
    for value in (old, new):
        if isinstance(value, _SeqAbstractBaseClass):
            value = bytes(value)
        elif isinstance(value, str):
            value = value.encode("ASCII")
        converted.append(value)
    replaced = self._data.replace(*converted)
    if not inplace:
        return self.__class__(replaced)
    if isinstance(self._data, bytearray):
        self._data[:] = replaced
        return self
    raise TypeError("Sequence is immutable")
1999
@property
def defined(self):
    """Tell whether the sequence contents are fully known.

    Sequences stored as ``bytes`` or ``bytearray`` (including zero-length
    sequences) are always defined. For any other data object, the value of
    its own ``defined`` attribute is returned.
    """
    data = self._data
    return True if isinstance(data, (bytes, bytearray)) else data.defined
2010
@property
def defined_ranges(self):
    """Return the ranges in which the sequence contents are known.

    The result is a tuple of ``(start, end)`` pairs, i.e.
    ``((start1, end1), (start2, end2), ...)``. For ``bytes``/``bytearray``
    data this is the single range covering the whole sequence, or an empty
    tuple for a zero-length sequence. For any other data object, its own
    ``defined_ranges`` attribute is returned.
    """
    data = self._data
    if not isinstance(data, (bytes, bytearray)):
        return data.defined_ranges
    size = len(self)
    return ((0, size),) if size > 0 else ()
2025
2026
class Seq(_SeqAbstractBaseClass):
    """Read-only sequence object (essentially a string with biological methods).

    Like normal python strings, our basic sequence object is immutable.
    This prevents you from doing my_seq[5] = "A" for example, but does allow
    Seq objects to be used as dictionary keys.

    The Seq object provides a number of string like methods (such as count,
    find, split and strip).

    The Seq object also provides some biological methods, such as complement,
    reverse_complement, transcribe, back_transcribe and translate (which are
    not applicable to protein sequences).
    """

    _data: Union[bytes, SequenceDataAbstractBaseClass]

    def __init__(
        self,
        data: Union[
            str,
            bytes,
            bytearray,
            _SeqAbstractBaseClass,
            SequenceDataAbstractBaseClass,
            dict,
            None,
        ],
        length: Optional[int] = None,
    ):
        """Create a Seq object.

        Arguments:
         - data - Sequence, required (string, bytes, bytearray, Seq,
           MutableSeq, a sequence data object, a dictionary mapping start
           positions to known sequence segments, or None)
         - length - Sequence length, used only if data is None or a dictionary (integer)

        Raises:
         - ValueError if data is None or a dictionary and length is None or
           negative; if dictionary segments overlap, start at a negative
           position, extend beyond length, or cannot be converted to bytes.
         - TypeError if data is of an unsupported type.

        You will typically use Bio.SeqIO to read in sequences from files as
        SeqRecord objects, whose sequence will be exposed as a Seq object via
        the seq property.

        However, you can also create a Seq object directly:

        >>> from Bio.Seq import Seq
        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
        >>> my_seq
        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
        >>> print(my_seq)
        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF

        To create a Seq object for a sequence of known length but
        unknown sequence contents, use None for the data argument and pass
        the sequence length for the length argument. Trying to access the
        sequence contents of a Seq object created in this way will raise
        an UndefinedSequenceError:

        >>> my_undefined_sequence = Seq(None, 20)
        >>> my_undefined_sequence
        Seq(None, length=20)
        >>> len(my_undefined_sequence)
        20
        >>> print(my_undefined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is undefined

        If the sequence contents is known for parts of the sequence only, use
        a dictionary for the data argument to pass the known sequence segments:

        >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
        >>> my_partially_defined_sequence
        Seq({3: 'ACGT'}, length=10)
        >>> len(my_partially_defined_sequence)
        10
        >>> print(my_partially_defined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
        >>> my_partially_defined_sequence[3:7]
        Seq('ACGT')
        >>> print(my_partially_defined_sequence[3:7])
        ACGT

        A dictionary without any segments is treated like ``None``, i.e. the
        sequence is stored as fully undefined.
        """
        if data is None:
            if length is None:
                raise ValueError("length must not be None if data is None")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                self._data = _UndefinedSequenceData(length)
        elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
            # Immutable storage can be used as-is.
            self._data = data
        elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
            # Take an immutable snapshot of mutable or Seq-like input.
            self._data = bytes(data)
        elif isinstance(data, str):
            self._data = bytes(data, encoding="ASCII")
        elif isinstance(data, dict):
            if length is None:
                raise ValueError("length must not be None if data is a dictionary")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            elif not data:
                # No known segment at all: a segment-less partially defined
                # object would break concatenation, so store the sequence as
                # fully undefined instead.
                self._data = _UndefinedSequenceData(length)
            else:
                current = 0  # not needed here, but it keeps mypy happy
                end = -1
                _data: Dict[int, bytes] = {}
                for start in sorted(data):
                    if start < 0:
                        # Without this check a start of -1 would match the
                        # initial value of ``end`` and fail with a KeyError.
                        raise ValueError(
                            "Sequence data start positions must not be negative."
                        )
                    seq = data[start]
                    if isinstance(seq, str):
                        seq = bytes(seq, encoding="ASCII")
                    else:
                        try:
                            seq = bytes(seq)
                        except Exception:
                            raise ValueError(
                                "Expected bytes-like objects or strings"
                            ) from None
                    if start < end:
                        raise ValueError("Sequence data are overlapping.")
                    elif start == end:
                        # Segment directly follows the previous one: merge.
                        _data[current] += seq  # noqa: F821
                    else:
                        _data[start] = seq
                        current = start
                    end = start + len(seq)
                if end > length:
                    raise ValueError(
                        "Provided sequence data extend beyond sequence length."
                    )
                elif end == length and current == 0:
                    # sequence is fully defined
                    self._data = _data[current]
                else:
                    self._data = _PartiallyDefinedSequenceData(length, _data)
        else:
            raise TypeError(
                "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
            )

    def __hash__(self):
        """Hash of the sequence as a string for comparison.

        See Seq object comparison documentation (method ``__eq__`` in
        particular) as this has changed in Biopython 1.65. Older versions
        would hash on object identity.
        """
        return hash(self._data)
2175
2176
class MutableSeq(_SeqAbstractBaseClass):
    """Sequence object whose contents can be edited in place.

    Python strings and the Seq class are immutable; a MutableSeq is not.
    The price is that a MutableSeq cannot be used as a dictionary key.

    >>> from Bio.Seq import MutableSeq
    >>> my_seq = MutableSeq("ACTCGTCGTCG")
    >>> my_seq
    MutableSeq('ACTCGTCGTCG')
    >>> my_seq[5]
    'T'
    >>> my_seq[5] = "A"
    >>> my_seq
    MutableSeq('ACTCGACGTCG')
    >>> my_seq[5]
    'A'
    >>> my_seq[5:8] = "NNN"
    >>> my_seq
    MutableSeq('ACTCGNNNTCG')
    >>> len(my_seq)
    11

    Note that the MutableSeq object does not support as many string-like
    or biological methods as the Seq object.
    """

    def __init__(self, data):
        """Create a MutableSeq object.

        A bytearray is stored as given (not copied); bytes, strings, Seq and
        MutableSeq objects are copied into a new bytearray. Any other type
        raises a TypeError.
        """
        if isinstance(data, bytearray):
            buffer = data
        elif isinstance(data, bytes):
            buffer = bytearray(data)
        elif isinstance(data, str):
            buffer = bytearray(data, "ASCII")
        elif isinstance(data, MutableSeq):
            buffer = data._data[:]  # Take a copy
        elif isinstance(data, Seq):
            # Make no assumptions about the Seq subclass internal storage
            buffer = bytearray(bytes(data))
        else:
            raise TypeError(
                "data should be a string, bytearray object, Seq object, or a "
                "MutableSeq object"
            )
        self._data = buffer

    def __setitem__(self, index, value):
        """Assign a single letter (integer index) or a subsequence (slice).

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq[0] = 'T'
        >>> my_seq
        MutableSeq('TCTCGACGTCG')
        """
        if isinstance(index, numbers.Integral):
            # Single position: store the code point of the given letter.
            self._data[index] = ord(value)
            return
        # Otherwise a sub-sequence is replaced; convert the value first.
        if isinstance(value, MutableSeq):
            replacement = value._data
        elif isinstance(value, Seq):
            replacement = bytes(value)
        elif isinstance(value, str):
            replacement = value.encode("ASCII")
        else:
            raise TypeError(f"received unexpected type '{type(value).__name__}'")
        self._data[index] = replacement

    def __delitem__(self, index):
        """Delete a single letter (integer index) or a subsequence (slice).

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> del my_seq[0]
        >>> my_seq
        MutableSeq('CTCGACGTCG')
        """
        # The underlying bytearray handles both integers and slices.
        del self._data[index]

    def append(self, c):
        """Append a single letter to the end of the sequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.append('A')
        >>> my_seq
        MutableSeq('ACTCGACGTCGA')

        No return value.
        """
        letter = c.encode("ASCII")
        self._data.append(ord(letter))

    def insert(self, i, c):
        """Insert a single letter before the given index.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.insert(0,'A')
        >>> my_seq
        MutableSeq('AACTCGACGTCG')
        >>> my_seq.insert(8,'G')
        >>> my_seq
        MutableSeq('AACTCGACGGTCG')

        No return value.
        """
        letter = c.encode("ASCII")
        self._data.insert(i, ord(letter))

    def pop(self, i=(-1)):
        """Remove the letter at the given index (default: last) and return it.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.pop()
        'G'
        >>> my_seq
        MutableSeq('ACTCGACGTC')
        >>> my_seq.pop()
        'C'
        >>> my_seq
        MutableSeq('ACTCGACGT')

        Returns the removed letter as a single character string.
        """
        code = self._data[i]
        del self._data[i]
        return chr(code)

    def remove(self, item):
        """Remove the first occurrence of a single letter from the sequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.remove('C')
        >>> my_seq
        MutableSeq('ATCGACGTCG')
        >>> my_seq.remove('A')
        >>> my_seq
        MutableSeq('TCGACGTCG')

        No return value. Raises a ValueError if the letter is not found.
        """
        code = ord(item)
        try:
            self._data.remove(code)
        except ValueError:
            # Re-raise with a message that refers to this class.
            raise ValueError("value not found in MutableSeq") from None

    def reverse(self):
        """Reverse the order of the letters in place.

        No return value.
        """
        self._data.reverse()

    def extend(self, other):
        """Append the letters of another sequence to this one, in place.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.extend('A')
        >>> my_seq
        MutableSeq('ACTCGACGTCGA')
        >>> my_seq.extend('TTT')
        >>> my_seq
        MutableSeq('ACTCGACGTCGATTT')

        Accepts a string, Seq or MutableSeq; anything else raises a
        TypeError. No return value.
        """
        if isinstance(other, MutableSeq):
            addition = other._data
        elif isinstance(other, Seq):
            addition = bytes(other)
        elif isinstance(other, str):
            addition = other.encode("ASCII")
        else:
            raise TypeError("expected a string, Seq or MutableSeq")
        self._data.extend(addition)
2350
2351
class UndefinedSequenceError(ValueError):
    """Raised when sequence contents that are not (fully) defined are accessed."""
2354
2355
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores the length of a sequence with an undefined sequence contents (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but an unknown sequence contents.
    Calling __len__ returns the sequence length, calling __getitem__ raises an
    UndefinedSequenceError except for requests of zero size, for which it
    returns an empty bytes object.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Initialize the object with the sequence length.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[bytes, "_UndefinedSequenceData"]:
        """Return the requested slice; single positions cannot be retrieved.

        A slice yields an empty bytes object if it selects nothing, otherwise
        an undefined sequence of the selected size. Any non-slice key raises
        an UndefinedSequenceError.
        """
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            return _UndefinedSequenceData(size)
        else:
            raise UndefinedSequenceError("Sequence content is undefined")

    def __len__(self):
        """Return the sequence length."""
        return self._length

    def __bytes__(self):
        """Always raise, as there are no sequence contents to return."""
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        """Return the concatenation self + other.

        The result is fully undefined if other is undefined or empty, and
        partially defined if other has known contents. NotImplemented is
        returned for other objects whose contents cannot be converted to
        bytes, so that their __radd__ gets a chance to handle the operation.
        """
        length = len(self) + len(other)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                return _UndefinedSequenceData(length)
            else:
                return NotImplemented
                # _PartiallyDefinedSequenceData.__radd__ will handle this
        else:
            if not other:
                # Nothing defined was added; avoid creating a partially
                # defined sequence with a zero-length segment.
                return _UndefinedSequenceData(length)
            data = {len(self): other}
            return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        """Return the concatenation other + self for defined contents other."""
        prefix = bytes(other)
        if not prefix:
            # Nothing defined was prepended; the result stays fully undefined.
            return _UndefinedSequenceData(len(self))
        data = {0: prefix}
        length = len(other) + len(self)
        return _PartiallyDefinedSequenceData(length, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        # An upper case copy of an undefined sequence is an undefined
        # sequence of the same length
        return _UndefinedSequenceData(self._length)

    def lower(self):
        """Return a lower case copy of the sequence."""
        # A lower case copy of an undefined sequence is an undefined
        # sequence of the same length
        return _UndefinedSequenceData(self._length)

    def isupper(self):
        """Always raise, as the case of undefined contents cannot be determined."""
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Always raise, as the case of undefined contents cannot be determined."""
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # Replacing substring old by new in an undefined sequence will result
        # in an undefined sequence of the same length, if old and new have the
        # same number of characters. Otherwise the resulting length is
        # unknown, and an error is raised.
        if len(old) != len(new):
            raise UndefinedSequenceError("Sequence content is undefined")
        return _UndefinedSequenceData(self._length)

    @property
    def defined(self):
        """Return False, as the sequence is not defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        As the sequence contents of an _UndefinedSequenceData object is fully
        undefined, the return value is always an empty tuple.
        """
        return ()
2462
2463
class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores a sequence of known length with partially defined contents (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but with a sequence contents that is only
    partially known.
    Calling __len__ returns the sequence length, calling __getitem__ returns
    the sequence contents if known, otherwise an UndefinedSequenceError is
    raised.

    The known segments are kept in a dictionary mapping each segment's start
    position to its contents.
    """

    __slots__ = ("_length", "_data")

    def __init__(self, length, data):
        """Initialize with the sequence length and defined sequence segments.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        self._data = data
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[bytes, SequenceDataAbstractBaseClass]:
        """Return the contents at a position, or the data for a slice.

        For a slice, the result is an empty bytes object if nothing is
        selected, a bytes object if the selection is fully defined, an
        undefined sequence if no selected position is defined, and a partially
        defined sequence otherwise.

        For an integer (negative values count from the end), the value stored
        at that position is returned. IndexError is raised if the position is
        outside the sequence, UndefinedSequenceError if it is not defined.
        """
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            data = {}
            for s, d in self._data.items():
                # Express the slice in coordinates relative to this segment.
                indices = range(-s, -s + self._length)[key]
                e: Optional[int] = indices.stop
                assert e is not None
                if step > 0:
                    if e <= 0:
                        continue
                    if indices.start < 0:
                        s = indices.start % step
                    else:
                        s = indices.start
                else:  # step < 0
                    if e < 0:
                        e = None
                    end = len(d) - 1
                    if indices.start > end:
                        s = end + (indices.start - end) % step
                    else:
                        s = indices.start
                    if s < 0:
                        continue
                start = (s - indices.start) // step
                d = d[s:e:step]
                if d:
                    data[start] = d
            if len(data) == 0:  # Fully undefined sequence
                return _UndefinedSequenceData(size)
            # merge adjacent sequence segments
            end = -1
            previous = 0  # not needed here, but it keeps flake happy
            items = data.items()
            data = {}
            for start, seq in items:
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
            if len(data) == 1:
                seq = data.get(0)
                if seq is not None and len(seq) == size:
                    return seq  # Fully defined sequence; return bytes
            if step < 0:
                # use this after we drop Python 3.7:
                # data = {start: data[start] for start in reversed(data)}
                # use this as long as we support Python 3.7:
                data = {start: data[start] for start in reversed(list(data.keys()))}
            return _PartiallyDefinedSequenceData(size, data)
        else:
            # Follow the usual sequence convention: negative positions count
            # from the end, and anything outside the sequence is an IndexError.
            index = key
            if index < 0:
                index += self._length
            if index < 0 or index >= self._length:
                raise IndexError("sequence index out of range")
            for start, seq in self._data.items():
                if start <= index < start + len(seq):
                    return seq[index - start]
            raise UndefinedSequenceError("Sequence at position %d is undefined" % key)

    def __len__(self):
        """Return the sequence length."""
        return self._length

    def __bytes__(self):
        """Always raise, as the contents are not completely known."""
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def __add__(self, other):
        """Return the concatenation self + other as partially defined data."""
        length = len(self) + len(other)
        data = dict(self._data)
        items = list(self._data.items())
        # Last stored segment; if it ends at the end of this sequence,
        # contents starting at position 0 of other are merged into it.
        start, seq = items[-1]
        end = start + len(seq)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                pass
            elif isinstance(other, _PartiallyDefinedSequenceData):
                other_items = list(other._data.items())
                if end == len(self):
                    other_start, other_seq = other_items.pop(0)
                    if other_start == 0:
                        data[start] += other_seq
                    else:
                        data[len(self) + other_start] = other_seq
                for other_start, other_seq in other_items:
                    data[len(self) + other_start] = other_seq
        else:
            if end == len(self):
                data[start] += other
            else:
                data[len(self)] = other
        return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        """Return the concatenation other + self as partially defined data."""
        length = len(other) + len(self)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            # other has no retrievable contents: only shift our segments.
            data = {len(other) + start: seq for start, seq in self._data.items()}
        else:
            data = {0: other}
            items = list(self._data.items())
            start, seq = items.pop(0)
            if start == 0:
                # Our first segment directly follows other: merge them.
                data[0] += seq
            else:
                data[len(other) + start] = seq
            for start, seq in items:
                data[len(other) + start] = seq
        return _PartiallyDefinedSequenceData(length, data)

    def __mul__(self, other):
        """Return the sequence data repeated other times.

        As for built-in sequences, a repeat count of zero or less gives an
        empty result (an empty bytes object).
        """
        if other <= 0:
            # Otherwise an object with a zero or negative length would be
            # created.
            return b""
        length = self._length
        items = self._data.items()
        data = {}
        end = -1
        previous = 0  # not needed here, but it keeps flake happy
        for i in range(other):
            for start, seq in items:
                start += i * length
                if end == start:
                    # Segment continues where the previous one ended: merge.
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
        return _PartiallyDefinedSequenceData(length * other, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        data = {start: seq.upper() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def lower(self):
        """Return a lower case copy of the sequence."""
        data = {start: seq.lower() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def isupper(self):
        """Always raise, as the case of the undefined parts cannot be determined."""
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def islower(self):
        """Always raise, as the case of the undefined parts cannot be determined."""
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        items = self._data.items()
        data = {start: seq.translate(table, delete) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # Replacing substring old by new in the undefined sequence segments
        # will result in an undefined sequence segment of the same length, if
        # old and new have the same number of characters. If not, an error is
        # raised, as the correct start positions cannot be calculated reliably.
        if len(old) != len(new):
            raise UndefinedSequenceError(
                "Sequence content is only partially defined; substring \n"
                "replacement cannot be performed reliably"
            )
        items = self._data.items()
        data = {start: seq.replace(old, new) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    @property
    def defined(self):
        """Return False, as the sequence is not fully defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        return tuple((start, start + len(seq)) for start, seq in self._data.items())
2688
2689
# The transcribe, back_transcribe, and translate functions are
# user-friendly versions of the corresponding Seq/MutableSeq methods.
# The functions work both on Seq objects, and on strings.
2693
2694
def transcribe(dna):
    """Transcribe a DNA sequence into RNA.

    The input is taken to be the coding strand of the DNA double helix
    (the usual convention), not the template strand, so the RNA sequence
    is obtained simply by replacing T with U.

    A string argument gives a new string; a Seq or MutableSeq argument
    gives a new Seq object.

    e.g.

    >>> transcribe("ACTGN")
    'ACUGN'
    """
    if isinstance(dna, Seq):
        return dna.transcribe()
    if isinstance(dna, MutableSeq):
        return Seq(dna).transcribe()
    # Anything else is handled like a string; upper and lower case are
    # replaced separately so that the case is kept.
    upper_done = dna.replace("T", "U")
    return upper_done.replace("t", "u")
2717
2718
def back_transcribe(rna):
    """Back-transcribe an RNA sequence into DNA.

    A string argument gives a new string; a Seq or MutableSeq argument
    gives a new Seq object.

    e.g.

    >>> back_transcribe("ACUGN")
    'ACTGN'
    """
    if isinstance(rna, Seq):
        return rna.back_transcribe()
    if isinstance(rna, MutableSeq):
        return Seq(rna).back_transcribe()
    # Anything else is handled like a string; upper and lower case are
    # replaced separately so that the case is kept.
    upper_done = rna.replace("U", "T")
    return upper_done.replace("u", "t")
2737
2738
def _translate_str(
    sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
):
    """Translate nucleotide string into a protein string (PRIVATE).

    Arguments:
     - sequence - a string
     - table - Which codon table to use? This can be either a name (string),
       an NCBI identifier (integer), or a CodonTable object (useful for
       non-standard genetic codes).
     - stop_symbol - a single character string, what to use for terminators.
     - to_stop - boolean, should translation terminate at the first
       in frame stop codon?  If there is no in-frame stop codon
       then translation continues to the end.
     - pos_stop - a single character string for a possible stop codon
       (e.g. TAN or NNN)
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.

    Returns a string.

    e.g.

    >>> from Bio.Data import CodonTable
    >>> table = CodonTable.ambiguous_dna_by_id[1]
    >>> _translate_str("AAA", table)
    'K'
    >>> _translate_str("TAR", table)
    '*'
    >>> _translate_str("TAN", table)
    'X'
    >>> _translate_str("TAN", table, pos_stop="@")
    '@'
    >>> _translate_str("TA?", table)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid

    A sequence whose length is not a multiple of three (a partial codon)
    triggers a warning also when cds=False; the trailing partial codon is
    not translated.

    If **cds=True**, the start and stop codons are checked, and the start
    codon will be translated as methionine. The sequence must be a
    whole number of codons.

    >>> _translate_str("ATGCCCTAG", table, cds=True)
    'MP'
    >>> _translate_str("AAACCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
    >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found.
    """
    # Work out which codon table is meant: an integer-like value is an NCBI
    # identifier, a string is a table name, anything else must be a
    # CodonTable object. The same table can be used for RNA or DNA.
    try:
        table_id = int(table)
    except ValueError:
        try:
            codon_table = CodonTable.ambiguous_generic_by_name[table]
        except KeyError:
            if isinstance(table, str):
                raise ValueError(
                    "The Bio.Seq translate methods and function DO NOT "
                    "take a character string mapping table like the python "
                    "string object's translate method. "
                    "Use str(my_seq).translate(...) instead."
                ) from None
            raise TypeError("table argument must be integer or string") from None
    except (AttributeError, TypeError):
        if not isinstance(table, CodonTable.CodonTable):
            raise ValueError("Bad table argument") from None
        codon_table = table
    else:
        codon_table = CodonTable.ambiguous_generic_by_id[table_id]

    sequence = sequence.upper()
    protein = []
    forward_table = codon_table.forward_table
    stop_codons = codon_table.stop_codons
    if codon_table.nucleotide_alphabet is None:
        # Assume the worst case, ambiguous DNA or RNA:
        letters = (
            IUPACData.ambiguous_dna_letters.upper()
            + IUPACData.ambiguous_rna_letters.upper()
        )
    else:
        letters = codon_table.nucleotide_alphabet.upper()
    valid_letters = set(letters)
    n = len(sequence)

    # Tables may have 'ambiguous' (dual-coding) stop codons, i.e. codons that
    # are listed as stop codons but also code for an amino acid.
    dual = [codon for codon in stop_codons if codon in forward_table]
    if dual:
        example = dual[0]
        if to_stop:
            raise ValueError(
                "You cannot use 'to_stop=True' with this table as it contains"
                f" {len(dual)} codon(s) which can be both STOP and an"
                f" amino acid (e.g. '{example}' -> '{forward_table[example]}' or STOP)."
            )
        warnings.warn(
            f"This table contains {len(dual)} codon(s) which code(s) for"
            f" both STOP and an amino acid (e.g. '{example}' -> '{forward_table[example]}'"
            " or STOP). Such codons will be translated as amino acid.",
            BiopythonWarning,
        )

    if cds:
        first_codon = sequence[:3]
        if str(first_codon).upper() not in codon_table.start_codons:
            raise CodonTable.TranslationError(
                f"First codon '{first_codon}' is not a start codon"
            )
        if n % 3 != 0:
            raise CodonTable.TranslationError(
                f"Sequence length {n} is not a multiple of three"
            )
        last_codon = sequence[-3:]
        if str(last_codon).upper() not in stop_codons:
            raise CodonTable.TranslationError(
                f"Final codon '{last_codon}' is not a stop codon"
            )
        # The start codon is translated by hand as M, and the final stop
        # codon is left out; only the codons in between go through the loop.
        sequence = sequence[3:-3]
        n -= 6
        protein = ["M"]
    elif n % 3 != 0:
        warnings.warn(
            "Partial codon, len(sequence) not a multiple of three. "
            "Explicitly trim the sequence or add trailing N before "
            "translation. This may become an error in future.",
            BiopythonWarning,
        )

    if gap is not None:
        if not isinstance(gap, str):
            raise TypeError("Gap character should be a single character string.")
        if len(gap) > 1:
            raise ValueError("Gap character should be a single character string.")

    for offset in range(0, n - n % 3, 3):
        codon = sequence[offset : offset + 3]
        try:
            protein.append(forward_table[codon])
        except (KeyError, CodonTable.TranslationError):
            # Not a plain coding codon; decide which symbol to use instead.
            if codon in stop_codons:
                if cds:
                    raise CodonTable.TranslationError(
                        f"Extra in frame stop codon '{codon}' found."
                    ) from None
                if to_stop:
                    break
                symbol = stop_symbol
            elif valid_letters.issuperset(codon):
                # Possible stop codon (e.g. NNN or TAN)
                symbol = pos_stop
            elif gap is not None and codon == gap * 3:
                # Gapped translation
                symbol = gap
            else:
                raise CodonTable.TranslationError(
                    f"Codon '{codon}' is invalid"
                ) from None
            protein.append(symbol)
    return "".join(protein)
2916
2917
def translate(
    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object. Given a Seq or
    MutableSeq, returns a Seq object.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable object
       (useful for non-standard genetic codes).  Defaults to the "Standard"
       table.
     - stop_symbol - Single character string, what to use for any
       terminators, defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons
       (translated as the specified stop_symbol).  If
       True, translation is terminated at the first in
       frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.  For Seq and MutableSeq input, leaving this as
       None keeps the default gap handling of the object's own
       ``translate`` method; any other value is passed on to it.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    It will however translate either DNA or RNA.

    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
    stop codons'. These are stop codons with unambiguous sequence but which
    have a context dependent coding as STOP or as amino acid. With these tables
    'to_stop' must be False (otherwise a ValueError is raised). The dual
    coding codons will always be translated as amino acid, except for
    'cds=True', where the last codon will be translated as STOP.

    >>> coding_dna3 = "ATGGCACGGAAGTGA"
    >>> translate(coding_dna3)
    'MARK*'

    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
    'MARKW'

    It will however raise a BiopythonWarning (not shown).

    >>> translate(coding_dna3, table=27, cds=True)
    'MARK'

    >>> translate(coding_dna3, table=27, to_stop=True)
    Traceback (most recent call last):
       ...
    ValueError: You cannot use 'to_stop=True' with this table ...
    """
    if isinstance(sequence, Seq):
        target = sequence
    elif isinstance(sequence, MutableSeq):
        # Translation of a MutableSeq is returned as an immutable Seq object.
        target = Seq(sequence)
    else:
        # Assume it's a string, return a string
        return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)

    if gap is None:
        # No gap character requested: leave the method's own default in
        # force so existing callers see exactly the same behaviour.
        return target.translate(table, stop_symbol, to_stop, cds)
    # Previously an explicit gap was silently discarded for Seq/MutableSeq
    # input; forward it so it is honoured just as it is for strings.
    return target.translate(table, stop_symbol, to_stop, cds, gap=gap)
3020
3021
def reverse_complement(sequence, inplace=False):
    """Return the reverse complement as a DNA sequence.

    The return type follows the input type: a string gives a new string,
    a Seq gives a new Seq, a MutableSeq gives a new MutableSeq and a
    SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement(my_seq)
    'TCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement(my_seq)
    Seq('TCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> reverse_complement(Seq("CGAUT"))
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement("ACGTUacgtuXYZxyz")
    'zrxZRXaacgtAACGT'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq, inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    # Sequence objects implement the operation (and the inplace check)
    # themselves, so simply delegate.
    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.reverse_complement(inplace)

    is_record = isinstance(sequence, SeqRecord)
    if inplace:
        # Neither SeqRecord objects nor plain strings can be changed in place.
        raise TypeError(
            "SeqRecords are immutable" if is_record else "strings are immutable"
        )
    if is_record:
        return sequence.reverse_complement()

    # Anything else is assumed to be a plain string: complement it through
    # the byte translation table, then reverse the decoded text.
    complemented = sequence.encode("ASCII").translate(_dna_complement_table)
    return complemented.decode("ASCII")[::-1]
3085
3086
def reverse_complement_rna(sequence, inplace=False):
    """Return the reverse complement as an RNA sequence.

    The return type follows the input type: a string gives a new string,
    a Seq gives a new Seq, a MutableSeq gives a new MutableSeq and a
    SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement_rna(my_seq)
    'UCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement_rna(my_seq)
    Seq('UCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> reverse_complement(Seq("CGAUT"), inplace=False)
    Seq('AATCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement_rna("ACGTUacgtuXYZxyz")
    'zrxZRXaacguAACGU'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq, inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement_rna`` is called on a ``Seq`` object
    with ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # The sequence classes handle the inplace flag themselves.
        result = sequence.reverse_complement_rna(inplace)
    elif isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        result = sequence.reverse_complement_rna()
    else:
        # Fall back to treating the argument as a plain string.
        if inplace:
            raise TypeError("strings are immutable")
        raw = sequence.encode("ASCII")
        swapped = raw.translate(_rna_complement_table)
        text = swapped.decode("ASCII")
        result = text[::-1]
    return result
3150
3151
def complement(sequence, inplace=False):
    """Return the complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement(my_seq)
    'GCT'
    >>> my_seq = Seq("CGA")
    >>> complement(my_seq)
    Seq('GCT')
    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement("ACGTUacgtuXYZxyz")
    'TGCAAtgcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq, inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement`` is called on a ``Seq`` object with
    ``inplace=True``.  The same applies to strings and SeqRecord
    objects passed with a true ``inplace`` value.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement()
    # Assume it's a string.
    # Use plain truthiness, matching complement_rna and the reverse
    # complement functions; an identity test against True would let a
    # truthy non-bool value (e.g. 1) slip through without the TypeError.
    if inplace:
        raise TypeError("strings are immutable")
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_dna_complement_table)
    return sequence.decode("ASCII")
3214
3215
def complement_rna(sequence, inplace=False):
    """Return the complement as an RNA sequence.

    The return type follows the input type: a string gives a new string,
    a Seq gives a new Seq, a MutableSeq gives a new MutableSeq and a
    SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> complement_rna(my_seq)
    'GCU'
    >>> my_seq = Seq("CGA")
    >>> complement_rna(my_seq)
    Seq('GCU')
    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement_rna("ACGTUacgtuXYZxyz")
    'UGCAAugcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq, inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    # Seq and MutableSeq know how to complement themselves, including
    # how to react to the inplace flag.
    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement_rna(inplace)

    if isinstance(sequence, SeqRecord):
        if not inplace:
            return sequence.complement_rna()
        raise TypeError("SeqRecords are immutable")

    # Whatever is left is treated as a plain string.
    if inplace:
        raise TypeError("strings are immutable")
    encoded = sequence.encode("ASCII")
    return encoded.translate(_rna_complement_table).decode("ASCII")
3278
3279
def _test():
    """Run the Bio.Seq module's doctests (PRIVATE)."""
    import doctest

    # Exception messages in the docstring examples are abbreviated, so only
    # the exception type is compared.
    flags = doctest.IGNORE_EXCEPTION_DETAIL

    print("Running doctests...")
    doctest.testmod(optionflags=flags)
    print("Done")
3287
3288
# Run the doctests only when this file is executed directly as a script,
# never as a side effect of importing the module.
if __name__ == "__main__":
    _test()