Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/Seq.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 # Copyright 2000 Andrew Dalke. | |
2 # Copyright 2000-2002 Brad Chapman. | |
3 # Copyright 2004-2005, 2010 by M de Hoon. | |
4 # Copyright 2007-2023 by Peter Cock. | |
5 # All rights reserved. | |
6 # | |
7 # This file is part of the Biopython distribution and governed by your | |
8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
9 # Please see the LICENSE file that should have been included as part of this | |
10 # package. | |
11 """Provide objects to represent biological sequences. | |
12 | |
13 See also the Seq_ wiki and the chapter in our tutorial: | |
14 - `HTML Tutorial`_ | |
15 - `PDF Tutorial`_ | |
16 | |
17 .. _Seq: http://biopython.org/wiki/Seq | |
18 .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html | |
19 .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf | |
20 | |
21 """ | |
22 import array | |
23 import collections | |
24 import numbers | |
25 import warnings | |
26 | |
27 from abc import ABC | |
28 from abc import abstractmethod | |
29 from typing import overload, Optional, Union, Dict | |
30 | |
31 from Bio import BiopythonWarning | |
32 from Bio.Data import CodonTable | |
33 from Bio.Data import IUPACData | |
34 | |
35 | |
def _maketrans(complement_mapping):
    """Build a byte translation table from a complement mapping (PRIVATE).

    Arguments:
     - complement_mapping - a dictionary of single-letter complements, such
       as ambiguous_dna_complement and ambiguous_rna_complement from
       Data.IUPACData.

    The letters of the mapping are ASCII-encoded, and a lower case copy of
    each pair is added, so the table works for lower case and upper case
    sequences alike.

    Returns a translation table (a bytes object of length 256) suitable for
    the ``translate`` method of ``bytes`` objects, as needed to calculate a
    (reverse) complement.

    For internal use only.
    """
    upper_from = "".join(complement_mapping).encode("ASCII")
    upper_to = "".join(complement_mapping.values()).encode("ASCII")
    source = upper_from + upper_from.lower()
    target = upper_to + upper_to.lower()
    return bytes.maketrans(source, target)
53 | |
54 | |
# Translation tables for complementing sequences stored as bytes.
# The DNA table additionally maps U (using the complement of T), and the RNA
# table additionally maps T (using the complement of U).
_dna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_dna_complement,
        "U": IUPACData.ambiguous_dna_complement["T"],
    }
)
_rna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_rna_complement,
        "T": IUPACData.ambiguous_rna_complement["U"],
    }
)
63 | |
64 | |
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

    * ``__len__`` must return the sequence length;
    * ``__getitem__`` must return

      * a ``bytes`` object for the requested region; or
      * a new instance of the subclass for the requested region; or
      * raise an ``UndefinedSequenceError``.

      Calling ``__getitem__`` for a sequence region of size zero should always
      return an empty ``bytes`` object.
      Calling ``__getitem__`` for the full sequence (as in data[:]) should
      either return a ``bytes`` object with the full sequence, or raise an
      ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    All other methods defined here convert the data to a ``bytes`` object
    (via ``__bytes__``, i.e. ``self[:]``) and delegate to the corresponding
    ``bytes`` method.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def __getitem__(self, key):
        pass

    def __bytes__(self):
        # The full-sequence slice is the canonical way to materialize the data.
        return self[:]

    def __hash__(self):
        return hash(bytes(self))

    def __eq__(self, other):
        return bytes(self) == other

    def __lt__(self, other):
        return bytes(self) < other

    def __le__(self, other):
        return bytes(self) <= other

    def __gt__(self, other):
        return bytes(self) > other

    def __ge__(self, other):
        return bytes(self) >= other

    def __add__(self, other):
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            return NotImplemented
            # will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__

    def __radd__(self, other):
        return other + bytes(self)

    def __mul__(self, other):
        return other * bytes(self)

    def __contains__(self, item):
        return bytes(self).__contains__(item)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raise ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present.

        Returns the data as a ``bytes`` object without the leading prefix,
        or the unchanged data if it does not start with prefix.
        """
        # Want to do just this, but need Python 3.9+
        # return bytes(self).removeprefix(prefix)
        data = bytes(self)
        try:
            return data.removeprefix(prefix)
        except AttributeError:
            # bytes.removeprefix is not available before Python 3.9.
            if data.startswith(prefix):
                return data[len(prefix) :]
            else:
                return data

    def removesuffix(self, suffix):
        """Remove the suffix if present.

        Returns the data as a ``bytes`` object without the trailing suffix,
        or the unchanged data if it does not end with suffix.
        """
        # Want to do just this, but need Python 3.9+
        # return bytes(self).removesuffix(suffix)
        data = bytes(self)
        try:
            return data.removesuffix(suffix)
        except AttributeError:
            # bytes.removesuffix is not available before Python 3.9.
            # The suffix must be non-empty, as data[:-0] would be empty
            # instead of the complete data.
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            else:
                return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        else:
            return ()
360 | |
361 | |
362 class _SeqAbstractBaseClass(ABC): | |
363 """Abstract base class for the Seq and MutableSeq classes (PRIVATE). | |
364 | |
365 Most users will not need to use this class. It is used internally as an | |
366 abstract base class for Seq and MutableSeq, as most of their methods are | |
367 identical. | |
368 """ | |
369 | |
370 __slots__ = ("_data",) | |
371 __array_ufunc__ = None # turn off numpy Ufuncs | |
372 | |
@abstractmethod
def __init__(self):
    # Declared abstract so that this base class cannot be instantiated
    # directly; concrete subclasses must provide their own ``__init__``.
    pass
376 | |
def __bytes__(self):
    """Return the sequence contents as a ``bytes`` object."""
    contents = self._data
    return bytes(contents)
379 | |
def __repr__(self):
    """Return (truncated) representation of the sequence.

    Sequence contents longer than 60 letters are abbreviated to the first
    54 letters, three dots, and the last three letters.
    """
    data = self._data
    if isinstance(data, _UndefinedSequenceData):
        return f"Seq(None, length={len(self)})"
    if isinstance(data, _PartiallyDefinedSequenceData):
        shown = {}
        for position, fragment in data._data.items():
            if len(fragment) <= 60:
                shown[position] = fragment.decode("ASCII")
                continue
            start = fragment[:54].decode("ASCII")
            end = fragment[-3:].decode("ASCII")
            shown[position] = f"{start}...{end}"
        return "Seq(%r, length=%d)" % (shown, len(self))
    if len(data) <= 60:
        data = data.decode("ASCII")
        return f"{self.__class__.__name__}('{data}')"
    # Shows the last three letters as it is often useful to see if
    # there is a stop codon at the end of a sequence.
    # Note total length is 54+3+3=60
    start = data[:54].decode("ASCII")
    end = data[-3:].decode("ASCII")
    return f"{self.__class__.__name__}('{start}...{end}')"
406 | |
def __str__(self):
    """Return the complete sequence, decoded from ASCII, as a python string."""
    raw = self._data
    return raw.decode("ASCII")
410 | |
def __eq__(self, other):
    """Compare the sequence to another sequence or a string.

    Two sequences are equal if their sequence contents is identical:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> seq1 = Seq("ACGT")
    >>> seq2 = Seq("ACGT")
    >>> mutable_seq = MutableSeq("ACGT")
    >>> seq1 == seq2
    True
    >>> seq1 == mutable_seq
    True
    >>> seq1 == "ACGT"
    True

    Equal sequences are nevertheless distinct objects:

    >>> id(seq1) == id(seq2)
    False
    >>> seq1 is seq2
    False

    A sequence can also be compared to a string, a ``bytes`` object, or a
    ``bytearray`` object:

    >>> seq1 == "ACGT"
    True
    >>> seq1 == b"ACGT"
    True
    >>> seq1 == bytearray(b"ACGT")
    True
    """
    # Reduce the right-hand side to something comparable to self._data.
    if isinstance(other, _SeqAbstractBaseClass):
        other = other._data
    elif isinstance(other, str):
        other = other.encode("ASCII")
    return self._data == other
452 | |
def __lt__(self, other):
    """Implement the less-than operand."""
    # Reduce the right-hand side to something comparable to self._data.
    if isinstance(other, _SeqAbstractBaseClass):
        other = other._data
    elif isinstance(other, str):
        other = other.encode("ASCII")
    return self._data < other
461 | |
def __le__(self, other):
    """Implement the less-than or equal operand."""
    # Reduce the right-hand side to something comparable to self._data.
    if isinstance(other, _SeqAbstractBaseClass):
        other = other._data
    elif isinstance(other, str):
        other = other.encode("ASCII")
    return self._data <= other
470 | |
def __gt__(self, other):
    """Implement the greater-than operand."""
    # Reduce the right-hand side to something comparable to self._data.
    if isinstance(other, _SeqAbstractBaseClass):
        other = other._data
    elif isinstance(other, str):
        other = other.encode("ASCII")
    return self._data > other
479 | |
def __ge__(self, other):
    """Implement the greater-than or equal operand."""
    # Reduce the right-hand side to something comparable to self._data.
    if isinstance(other, _SeqAbstractBaseClass):
        other = other._data
    elif isinstance(other, str):
        other = other.encode("ASCII")
    return self._data >= other
488 | |
def __len__(self):
    """Return the number of letters in the sequence."""
    contents = self._data
    return len(contents)
492 | |
def __iter__(self):
    """Return an iterator over the letters of the sequence."""
    letters = self._data.decode("ASCII")
    return iter(letters)
496 | |
@overload
def __getitem__(self, index: int) -> str:
    ...

@overload
def __getitem__(self, index: slice) -> "Seq":
    ...

def __getitem__(self, index):
    """Return a subsequence as a single letter or as a sequence object.

    An integer index gives a single letter, returned as a Python string:

    >>> seq = Seq('ACTCGACGTCG')
    >>> seq[5]
    'A'

    Any other index gives a new sequence object of the same class:

    >>> seq[5:8]
    Seq('ACG')
    >>> mutable_seq = MutableSeq('ACTCGACGTCG')
    >>> mutable_seq[5:8]
    MutableSeq('ACG')
    """
    selected = self._data[index]
    if isinstance(index, numbers.Integral):
        # Indexing bytes with an integer gives the byte value of the letter
        return chr(selected)
    # Wrap the (sub)sequence in another Seq/MutableSeq object
    return self.__class__(selected)
529 | |
def __add__(self, other):
    """Add a sequence or string to this sequence.

    The result is a new object of the same class as this sequence:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> Seq("MELKI") + "LV"
    Seq('MELKILV')
    >>> MutableSeq("MELKI") + "LV"
    MutableSeq('MELKILV')
    """
    if isinstance(other, _SeqAbstractBaseClass):
        tail = other._data
    elif isinstance(other, str):
        tail = other.encode("ASCII")
    else:
        # If other is a SeqRecord, then SeqRecord's __radd__ will handle
        # this. If not, returning NotImplemented will trigger a TypeError.
        return NotImplemented
    return self.__class__(self._data + tail)
547 | |
def __radd__(self, other):
    """Add a sequence string on the left.

    The result is a new object of the same class as this sequence:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> "LV" + Seq("MELKI")
    Seq('LVMELKI')
    >>> "LV" + MutableSeq("MELKI")
    MutableSeq('LVMELKI')

    Adding two sequence objects is handled via the __add__ method.
    """
    if not isinstance(other, str):
        return NotImplemented
    head = other.encode("ASCII")
    return self.__class__(head + self._data)
563 | |
def __mul__(self, other):
    """Multiply sequence by integer.

    >>> from Bio.Seq import Seq, MutableSeq
    >>> Seq('ATG') * 2
    Seq('ATGATG')
    >>> MutableSeq('ATG') * 2
    MutableSeq('ATGATG')
    """
    if isinstance(other, numbers.Integral):
        # we would like to simply write
        # data = self._data * other
        # here, but currently that causes a bug on PyPy if self._data is a
        # bytearray and other is a numpy integer. Using this workaround:
        repeated = self._data.__mul__(other)
        return self.__class__(repeated)
    raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
581 | |
def __rmul__(self, other):
    """Multiply integer by sequence.

    >>> from Bio.Seq import Seq
    >>> 2 * Seq('ATG')
    Seq('ATGATG')
    """
    if isinstance(other, numbers.Integral):
        # we would like to simply write
        # data = self._data * other
        # here, but currently that causes a bug on PyPy if self._data is a
        # bytearray and other is a numpy integer. Using this workaround:
        repeated = self._data.__mul__(other)
        return self.__class__(repeated)
    raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
597 | |
def __imul__(self, other):
    """Multiply the sequence object by other and assign.

    >>> from Bio.Seq import Seq
    >>> seq = Seq('ATG')
    >>> seq *= 2
    >>> seq
    Seq('ATGATG')

    Note that this is different from in-place multiplication. A new object
    holding the multiplication result is assigned to the ``seq`` variable,
    while any other variable pointing to the original object is unchanged:

    >>> seq = Seq('ATG')
    >>> seq2 = seq
    >>> id(seq) == id(seq2)
    True
    >>> seq *= 2
    >>> seq
    Seq('ATGATG')
    >>> seq2
    Seq('ATG')
    >>> id(seq) == id(seq2)
    False
    """
    if isinstance(other, numbers.Integral):
        # we would like to simply write
        # data = self._data * other
        # here, but currently that causes a bug on PyPy if self._data is a
        # bytearray and other is a numpy integer. Using this workaround:
        repeated = self._data.__mul__(other)
        return self.__class__(repeated)
    raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
631 | |
def count(self, sub, start=None, end=None):
    """Return a non-overlapping count, like that of a python string.

    Returns an integer, the number of occurrences of substring argument
    sub in the (sub)sequence given by [start:end]. Optional arguments
    start and end are interpreted as in slice notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a TypeError if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count("A"))
    5
    >>> print(my_seq.count("ATG"))
    1
    >>> print(my_seq.count(Seq("AT")))
    1
    >>> print(my_seq.count("AT", 2, -1))
    1

    HOWEVER, please note because the ``count`` method of Seq and MutableSeq
    objects, like that of Python strings, do a non-overlapping search, this
    may not give the answer you expect:

    >>> "AAAA".count("AA")
    2
    >>> print(Seq("AAAA").count("AA"))
    2

    For an overlapping search, use the ``count_overlap`` method:

    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    """
    if isinstance(sub, MutableSeq):
        pattern = sub._data
    elif isinstance(sub, Seq):
        pattern = bytes(sub)
    elif isinstance(sub, str):
        pattern = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pattern = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.count(pattern, start, end)
684 | |
def count_overlap(self, sub, start=None, end=None):
    """Return an overlapping count.

    Returns an integer, the number of (possibly overlapping) occurrences
    of substring argument sub in the (sub)sequence given by [start:end].
    Optional arguments start and end are interpreted as in slice
    notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a TypeError if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g.

    >>> from Bio.Seq import Seq
    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    >>> print(Seq("ATATATATA").count_overlap("ATA"))
    4
    >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
    1

    For a non-overlapping search, use the ``count`` method:

    >>> print(Seq("AAAA").count("AA"))
    2

    Where substrings do not overlap, ``count_overlap`` behaves the same as
    the ``count`` method:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count_overlap("A"))
    5
    >>> my_seq.count_overlap("A") == my_seq.count("A")
    True
    >>> print(my_seq.count_overlap("ATG"))
    1
    >>> my_seq.count_overlap("ATG") == my_seq.count("ATG")
    True
    >>> print(my_seq.count_overlap(Seq("AT")))
    1
    >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
    True
    >>> print(my_seq.count_overlap("AT", 2, -1))
    1
    >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
    True

    HOWEVER, do not use this method for such cases because the
    count() method is much more efficient.
    """
    if isinstance(sub, MutableSeq):
        pattern = sub._data
    elif isinstance(sub, Seq):
        pattern = bytes(sub)
    elif isinstance(sub, str):
        pattern = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pattern = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    data = self._data
    total = 0
    position = data.find(pattern, start, end)
    while position != -1:
        total += 1
        # Resume one letter after the previous hit, so that overlapping
        # occurrences are found too.
        position = data.find(pattern, position + 1, end)
    return total
757 | |
def __contains__(self, item):
    """Return True if item is a subsequence of the sequence, and False otherwise.

    The item can be given as a string or as a Seq or MutableSeq object, e.g.

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_dna = Seq("ATATGAAATTTGAAAA")
    >>> "AAA" in my_dna
    True
    >>> Seq("AAA") in my_dna
    True
    >>> MutableSeq("AAA") in my_dna
    True
    """
    if isinstance(item, _SeqAbstractBaseClass):
        needle = bytes(item)
    elif isinstance(item, str):
        needle = item.encode("ASCII")
    else:
        # Anything else is passed unchanged to the membership test of the data
        needle = item
    return needle in self._data
777 | |
def find(self, sub, start=None, end=None):
    """Return the lowest index in the sequence where subsequence sub is found.

    If the optional arguments start and end are given, return the lowest
    index in the sequence such that the subsequence sub is contained within
    the sequence region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found.

    Raises a TypeError if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.find("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.find("AUG", 4)
    15

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, _SeqAbstractBaseClass):
        pattern = bytes(sub)
    elif isinstance(sub, str):
        pattern = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pattern = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.find(pattern, start, end)
818 | |
def rfind(self, sub, start=None, end=None):
    """Return the highest index in the sequence where subsequence sub is found.

    If the optional arguments start and end are given, return the highest
    index in the sequence such that the subsequence sub is contained within
    the sequence region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found.

    Raises a TypeError if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rfind("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rfind("AUG", end=15)
    3

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, _SeqAbstractBaseClass):
        pattern = bytes(sub)
    elif isinstance(sub, str):
        pattern = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pattern = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rfind(pattern, start, end)
859 | |
def index(self, sub, start=None, end=None):
    """Return the lowest index in the sequence where subsequence sub is found.

    If the optional arguments start and end are given, return the lowest
    index in the sequence such that the subsequence sub is contained within
    the sequence region [start:end].

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found.

    Raises a TypeError if sub is not a Seq, MutableSeq, str, bytes, or
    bytearray object.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.index("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.index("AUG", 4)
    15

    This method performs the same search as the ``find`` method. However,
    if the subsequence is not found, ``find`` returns -1 while ``index``
    raises a ValueError:

    >>> my_rna.index("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.find("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, MutableSeq):
        pattern = sub._data
    elif isinstance(sub, Seq):
        pattern = bytes(sub)
    elif isinstance(sub, str):
        pattern = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pattern = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.index(pattern, start, end)
913 | |
def rindex(self, sub, start=None, end=None):
    """Return the highest index in the sequence where subsequence sub is found.

    When start and/or end are given, only the region [start:end] of the
    sequence is searched, and the highest index at which sub lies entirely
    inside that region is returned.

    Arguments:
     - sub - a string or another Seq or MutableSeq object to search for
       (bytes and bytearray objects are accepted too)
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError if
    sub is not one of the accepted types.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rindex("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rindex("AUG", end=15)
    3

    The ``rfind`` method runs the same search, but signals a missing
    subsequence by returning -1, whereas ``rindex`` raises a ValueError:

    >>> my_rna.rindex("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.rfind("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, MutableSeq):
        # Hand over the MutableSeq's own data object rather than copying it.
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rindex(needle, start, end)
967 | |
def search(self, subs):
    """Search the substrings subs in self and yield the index and substring found.

    Arguments:
     - subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
       objects containing the substrings to search for.

    This is a generator: each hit is yielded as an ``(index, substring)``
    tuple, with the substring given as a plain string. Hits are produced in
    order of increasing start position, and a substring listed more than
    once in ``subs`` is reported only once per position. A TypeError is
    raised (when iteration starts) if an element of ``subs`` has an
    unsupported type.

    >>> from Bio.Seq import Seq
    >>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
    >>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
    >>> for index, substring in matches:
    ...    print(index, substring)
    ...
    7 CC
    9 ATTG
    20 CC
    34 CC
    34 CCC
    35 CC

    Every position is examined, including the final letter:

    >>> list(Seq("ACGT").search(["T"]))
    [(3, 'T')]
    """
    # Group the (deduplicated) patterns by length so that each position
    # needs only one slice of the sequence per distinct pattern length.
    patterns_by_length = collections.defaultdict(set)
    for index, sub in enumerate(subs):
        if isinstance(sub, str):
            sub = sub.encode("ASCII")
        elif not isinstance(sub, bytes):
            if not isinstance(sub, (bytearray, _SeqAbstractBaseClass)):
                raise TypeError(
                    "subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
                    % (index, type(sub))
                )
            # bytes are hashable, so they can be stored in the set.
            sub = bytes(sub)
        patterns_by_length[len(sub)].add(sub)
    # Visit every start position; stopping at len(self) - 1 would skip the
    # last letter and miss one-letter patterns located there.
    for start in range(len(self)):
        for length, candidates in patterns_by_length.items():
            fragment = self._data[start : start + length]
            for candidate in candidates:
                if fragment == candidate:
                    yield (start, candidate.decode())
                    # Patterns of equal length are distinct, so at most
                    # one of them can match this fragment.
                    break
1008 | |
def startswith(self, prefix, start=None, end=None):
    """Return True if the sequence starts with the given prefix, False otherwise.

    Return True if the sequence starts with the specified prefix
    (a string, a bytes-like object, or another Seq or MutableSeq object),
    False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    prefix can also be a tuple of such objects to try. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.startswith("GUC")
    True
    >>> my_rna.startswith("AUG")
    False
    >>> my_rna.startswith("AUG", 3)
    True
    >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
    True
    >>> my_rna.startswith((b"AAA", Seq("GUC")))
    True
    """

    def _as_bytes(item):
        # Normalise one candidate prefix to something bytes.startswith
        # accepts; unknown types pass through so that the underlying
        # method reports the TypeError.
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        return item

    if isinstance(prefix, tuple):
        # Tuple members get the same treatment as a lone prefix, so bytes
        # and bytearray members are accepted rather than failing on .encode.
        prefix = tuple(_as_bytes(item) for item in prefix)
    else:
        prefix = _as_bytes(prefix)
    return self._data.startswith(prefix, start, end)
1039 | |
def endswith(self, suffix, start=None, end=None):
    """Return True if the sequence ends with the given suffix, False otherwise.

    Return True if the sequence ends with the specified suffix
    (a string, a bytes-like object, or another Seq or MutableSeq object),
    False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    suffix can also be a tuple of such objects to try. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.endswith("UUG")
    True
    >>> my_rna.endswith("AUG")
    False
    >>> my_rna.endswith("AUG", 0, 18)
    True
    >>> my_rna.endswith(("UCC", "UCA", "UUG"))
    True
    >>> my_rna.endswith((b"AAA", Seq("UUG")))
    True
    """

    def _as_bytes(item):
        # Normalise one candidate suffix to something bytes.endswith
        # accepts; unknown types pass through so that the underlying
        # method reports the TypeError.
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        return item

    if isinstance(suffix, tuple):
        # Tuple members get the same treatment as a lone suffix, so bytes
        # and bytearray members are accepted rather than failing on .encode.
        suffix = tuple(_as_bytes(item) for item in suffix)
    else:
        suffix = _as_bytes(suffix)
    return self._data.endswith(suffix, start, end)
1070 | |
def split(self, sep=None, maxsplit=-1):
    """Return a list of subsequences when splitting the sequence by separator sep.

    The sequence is cut at every occurrence of the delimiter sep and the
    pieces are returned as a list of Seq objects. At most maxsplit cuts
    are made when maxsplit is given; by default every occurrence is used.

    For consistency with the ``split`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.split("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')

    See also the rsplit method, which splits the sequence starting from the
    end:

    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')
    """
    if isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    elif isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    else:
        # None (split on whitespace) or an object passed through unchanged.
        delimiter = sep
    pieces = self._data.split(delimiter, maxsplit)
    return [Seq(piece) for piece in pieces]
1113 | |
def rsplit(self, sep=None, maxsplit=-1):
    """Return a list of subsequences by splitting the sequence from the right.

    The sequence is cut at occurrences of the delimiter sep, working from
    the right-hand end, and the pieces are returned as a list of Seq
    objects. At most maxsplit cuts are made when maxsplit is given; by
    default every occurrence is used.

    For consistency with the ``rsplit`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.rsplit("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')

    See also the split method, which splits the sequence starting from the
    beginning:

    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')
    """
    if isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    elif isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    else:
        # None (split on whitespace) or an object passed through unchanged.
        delimiter = sep
    pieces = self._data.rsplit(delimiter, maxsplit)
    return [Seq(piece) for piece in pieces]
1156 | |
def strip(self, chars=None, inplace=False):
    """Return a sequence object with leading and trailing ends stripped.

    With default arguments, leading and trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.strip()
    Seq('ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters it contains are
    removed instead. Their order within ``chars`` does not matter:

    >>> Seq("ACGTACGT").strip("TGCA")
    Seq('')

    With ``inplace=False`` (the default) a stripped copy is returned and
    the sequence itself is left alone. With ``inplace=True`` the sequence
    itself is stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.strip()
    MutableSeq('ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.strip(inplace=True)
    MutableSeq('ACGT')
    >>> seq
    MutableSeq('ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
    is called on a ``Seq`` object with ``inplace=True``.

    See also the lstrip and rstrip methods.
    """
    if isinstance(chars, str):
        chars = chars.encode("ASCII")
    elif isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    try:
        stripped = self._data.strip(chars)
    except TypeError:
        # Replace the low-level message with one naming the accepted types.
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
1210 | |
def lstrip(self, chars=None, inplace=False):
    """Return a sequence object with the leading end stripped.

    With default arguments, leading whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.lstrip()
    Seq('ACGT ')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters it contains are
    removed from the leading end instead. Their order within ``chars``
    does not matter:

    >>> Seq("ACGACGTTACG").lstrip("GCA")
    Seq('TTACG')

    With ``inplace=False`` (the default) a stripped copy is returned and
    the sequence itself is left alone. With ``inplace=True`` the sequence
    itself is stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.lstrip()
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.lstrip(inplace=True)
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq('ACGT ')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and rstrip methods.
    """
    if isinstance(chars, str):
        chars = chars.encode("ASCII")
    elif isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    try:
        stripped = self._data.lstrip(chars)
    except TypeError:
        # Replace the low-level message with one naming the accepted types.
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
1265 | |
def rstrip(self, chars=None, inplace=False):
    """Return a sequence object with the trailing end stripped.

    With default arguments, trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.rstrip()
    Seq(' ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters it contains are
    removed from the trailing end instead. Their order within ``chars``
    does not matter:

    >>> Seq("ACGACGTTACG").rstrip("GCA")
    Seq('ACGACGTT')

    With ``inplace=False`` (the default) a stripped copy is returned and
    the sequence itself is left alone. With ``inplace=True`` the sequence
    itself is stripped and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.rstrip()
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.rstrip(inplace=True)
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``rstrip`` is called on a ``Seq`` object with ``inplace=True``.

    See also the strip and lstrip methods.
    """
    if isinstance(chars, str):
        chars = chars.encode("ASCII")
    elif isinstance(chars, _SeqAbstractBaseClass):
        chars = bytes(chars)
    try:
        stripped = self._data.rstrip(chars)
    except TypeError:
        # Replace the low-level message with one naming the accepted types.
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
1320 | |
def removeprefix(self, prefix, inplace=False):
    """Return a new Seq object with prefix (left) removed.

    This behaves like the python string method of the same name: if the
    sequence starts with prefix, that leading part is dropped, otherwise
    the sequence is returned unchanged.

    e.g. Removing a start Codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("ATGGTGTGTGT")
    >>> my_seq
    Seq('ATGGTGTGTGT')
    >>> my_seq.removeprefix('ATG')
    Seq('GTGTGTGT')

    With ``inplace=True`` the sequence itself is modified and returned
    instead of a copy. As ``Seq`` objects are immutable, a ``TypeError``
    is raised if ``removeprefix`` is called on a ``Seq`` object with
    ``inplace=True``.

    See also the removesuffix method.
    """
    if isinstance(prefix, str):
        prefix = prefix.encode("ASCII")
    elif isinstance(prefix, _SeqAbstractBaseClass):
        prefix = bytes(prefix)
    try:
        trimmed = self._data.removeprefix(prefix)
    except TypeError:
        raise TypeError(
            "argument must be a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    except AttributeError:
        # Fall back for pre-Python 3.9, where the data object has no
        # removeprefix method.
        trimmed = self._data
        if trimmed.startswith(prefix):
            trimmed = trimmed[len(prefix) :]
    if not inplace:
        return self.__class__(trimmed)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = trimmed
    return self
1362 | |
1363 def removesuffix(self, suffix, inplace=False): | |
1364 """Return a new Seq object with suffix (right) removed. | |
1365 | |
1366 This behaves like the python string method of the same name. | |
1367 | |
1368 e.g. Removing a stop codon: | |
1369 | |
1370 >>> from Bio.Seq import Seq | |
1371 >>> my_seq = Seq("GTGTGTGTTAG") | |
1372 >>> my_seq | |
1373 Seq('GTGTGTGTTAG') | |
1374 >>> stop_codon = Seq("TAG") | |
1375 >>> my_seq.removesuffix(stop_codon) | |
1376 Seq('GTGTGTGT') | |
1377 | |
1378 As ``Seq`` objects are immutable, a ``TypeError`` is raised if | |
1379 ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``. | |
1380 | |
1381 See also the removeprefix method. | |
1382 """ | |
1383 if isinstance(suffix, _SeqAbstractBaseClass): | |
1384 suffix = bytes(suffix) | |
1385 elif isinstance(suffix, str): | |
1386 suffix = suffix.encode("ASCII") | |
1387 try: | |
1388 data = self._data.removesuffix(suffix) | |
1389 except TypeError: | |
1390 raise TypeError( | |
1391 "argument must be a string, Seq, MutableSeq, or bytes-like object" | |
1392 ) from None | |
1393 except AttributeError: | |
1394 # Fall back for pre-Python 3.9 | |
1395 data = self._data | |
1396 if data.endswith(suffix): | |
1397 data = data[: -len(suffix)] | |
1398 if inplace: | |
1399 if not isinstance(self._data, bytearray): | |
1400 raise TypeError("Sequence is immutable") | |
1401 self._data[:] = data | |
1402 return self | |
1403 else: | |
1404 return self.__class__(data) | |
1405 | |
def upper(self, inplace=False):
    """Return the sequence in upper case.

    With inplace set to False (the default), the sequence is left
    untouched and an upper-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With inplace set to True, the sequence itself is converted and
    returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``upper`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``lower`` method.
    """
    converted = self._data.upper()
    if not inplace:
        return self.__class__(converted)
    # Only sequences backed by a bytearray can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = converted
    return self
1457 | |
def lower(self, inplace=False):
    """Return the sequence in lower case.

    With inplace set to False (the default), the sequence is left
    untouched and a lower-case copy is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With inplace set to True, the sequence itself is converted and
    returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lower`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``upper`` method.
    """
    converted = self._data.lower()
    if not inplace:
        return self.__class__(converted)
    # Only sequences backed by a bytearray can be changed in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = converted
    return self
1509 | |
def isupper(self):
    """Return True if all ASCII characters in data are uppercase.

    The test is delegated to the ``isupper`` method of the underlying
    sequence data. If there are no cased characters, the method returns
    False.
    """
    data = self._data
    return data.isupper()
1516 | |
def islower(self):
    """Return True if all ASCII characters in data are lowercase.

    The test is delegated to the ``islower`` method of the underlying
    sequence data. If there are no cased characters, the method returns
    False.
    """
    data = self._data
    return data.islower()
1523 | |
def translate(
    self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
):
    """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.

    This method will translate DNA or RNA sequences. It should not
    be used on protein sequences as any result will be biologically
    meaningless.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable
       object (useful for non-standard genetic codes).  This
       defaults to the "Standard" table.
     - stop_symbol - Single character string, what to use for
       terminators.  This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons (translated as the
       specified stop_symbol).  If True, translation is terminated at
       the first in frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
       this checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to the minus sign.

    A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
    object; a ``MutableSeq`` object is returned if ``translate`` is called
    on a ``MutableSeq`` object.

    If the sequence contents are undefined, an undefined ``Seq`` of one
    third the length (rounded down) is returned, and a BiopythonWarning
    is issued when the length is not a multiple of three.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*')
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@')
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR')

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*')
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR')

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR')

    It isn't a valid CDS under NCBI table 1, due to both the start codon
    and also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR')
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR')

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead
    """
    try:
        data = str(self)
    except UndefinedSequenceError:
        # translating an undefined sequence yields an undefined
        # sequence with the length divided by 3
        n = len(self)
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "This may become an error in future.",
                BiopythonWarning,
            )
        return Seq(None, n // 3)

    # Reuse the string obtained above instead of converting the whole
    # sequence to a string a second time.
    return self.__class__(
        _translate_str(data, table, stop_symbol, to_stop, cds, gap=gap)
    )
1623 | |
def complement(self, inplace=False):
    """Return the complement as a DNA sequence.

    >>> Seq("CGA").complement()
    Seq('GCT')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").complement()
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    A complemented copy is returned by default; the sequence is modified
    in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement()
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement(inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length, so the sequence itself is handed back
        return self
    if not inplace:
        return self.__class__(complemented)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
1671 | |
def complement_rna(self, inplace=False):
    """Return the complement as an RNA sequence.

    >>> Seq("CGA").complement_rna()
    Seq('GCU')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence by default:

    >>> Seq("CGA").complement()
    Seq('GCT')

    A complemented copy is returned by default; the sequence is modified
    in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement_rna()
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement_rna(inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length, so the sequence itself is handed back
        return self
    if not inplace:
        return self.__class__(complemented)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
1718 | |
def reverse_complement(self, inplace=False):
    """Return the reverse complement as a DNA sequence.

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").reverse_complement()
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    A reverse-complemented copy is returned by default; the sequence is
    modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement()
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement(inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined
        # sequence of the same length, so the sequence itself is handed back
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Writing through a reversed slice stores the complement back to front.
    self._data[::-1] = complemented
    return self
1766 | |
def reverse_complement_rna(self, inplace=False):
    """Return the reverse complement as an RNA sequence.

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").reverse_complement_rna()
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    A reverse-complemented copy is returned by default; the sequence is
    modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement_rna()
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement_rna(inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined
        # sequence of the same length, so the sequence itself is handed back
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Writing through a reversed slice stores the complement back to front.
    self._data[::-1] = complemented
    return self
1814 | |
def transcribe(self, inplace=False):
    """Transcribe a DNA sequence into RNA.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. The
    RNA sequence is therefore obtained just by switching T to U (and t
    to u); by default it is returned as a new object.

    >>> from Bio.Seq import Seq
    >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> coding_dna.transcribe()
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence.transcribe()
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    >>> sequence.transcribe(inplace=True)
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to transcribe an RNA sequence has no effect.
    If you have a nucleotide sequence which might be DNA or RNA
    (or even a mixture), calling the transcribe method will ensure
    any T becomes U.

    Trying to transcribe a protein sequence will replace any
    T for Threonine with U for Selenocysteine, which has no
    biologically plausible rationale.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRT")
    >>> my_protein.transcribe()
    Seq('MAIVMGRU')
    """
    upper_done = self._data.replace(b"T", b"U")
    rna = upper_done.replace(b"t", b"u")
    if not inplace:
        return self.__class__(rna)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Overwrite the existing buffer so other references see the change.
    self._data[:] = rna
    return self
1868 | |
def back_transcribe(self, inplace=False):
    """Return the DNA sequence corresponding to an RNA sequence.

    Every U is switched to T (and u to t); by default the result is
    returned as a new object.

    >>> from Bio.Seq import Seq
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence.back_transcribe()
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    >>> sequence.back_transcribe(inplace=True)
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``back_transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to back-transcribe DNA has no effect. If you have a nucleotide
    sequence which might be DNA or RNA (or even a mixture), calling the
    back-transcribe method will ensure any U becomes T.

    Trying to back-transcribe a protein sequence will replace any U for
    Selenocysteine with T for Threonine, which is biologically meaningless.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRU")
    >>> my_protein.back_transcribe()
    Seq('MAIVMGRT')
    """
    upper_done = self._data.replace(b"U", b"T")
    dna = upper_done.replace(b"u", b"t")
    if not inplace:
        return self.__class__(dna)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Overwrite the existing buffer so other references see the change.
    self._data[:] = dna
    return self
1916 | |
def join(self, other):
    """Return a merge of the sequences in other, spaced by the sequence from self.

    Accepts a Seq object, MutableSeq object, or string (and iterates over
    the letters), or an iterable containing Seq, MutableSeq, or string
    objects. These arguments will be concatenated with the calling sequence
    as the spacer:

    >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
    >>> concatenated
    Seq('AAANNNNNTTTNNNNNPPP')

    Joining the letters of a single sequence:

    >>> Seq('NNNNN').join(Seq("ACGT"))
    Seq('ANNNNNCNNNNNGNNNNNT')
    >>> Seq('NNNNN').join("ACGT")
    Seq('ANNNNNCNNNNNGNNNNNT')

    One-shot iterables such as generators are supported as well:

    >>> Seq('NNNNN').join(Seq(s) for s in ["AAA", "TTT"])
    Seq('AAANNNNNTTT')

    Raises a ``TypeError`` if other is a SeqRecord, or if it is an iterable
    containing anything other than Seq objects, MutableSeq objects, or
    strings.
    """
    if isinstance(other, _SeqAbstractBaseClass):
        return self.__class__(str(self).join(str(other)))
    elif isinstance(other, str):
        return self.__class__(str(self).join(other))

    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(other, SeqRecord):
        raise TypeError("Iterable cannot be a SeqRecord")

    # Materialise the iterable exactly once. Validating and then joining
    # straight from ``other`` would consume a generator during validation
    # and silently join nothing.
    items = list(other)
    for c in items:
        if isinstance(c, SeqRecord):
            raise TypeError("Iterable cannot contain SeqRecords")
        elif not isinstance(c, (str, _SeqAbstractBaseClass)):
            raise TypeError(
                "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
            )
    return self.__class__(str(self).join([str(item) for item in items]))
1954 | |
def replace(self, old, new, inplace=False):
    """Return a copy with all occurrences of subsequence old replaced by new.

    Both ``old`` and ``new`` may be given as a string, or as a Seq or
    MutableSeq object.

    >>> s = Seq("ACGTAACCGGTT")
    >>> t = s.replace("AC", "XYZ")
    >>> s
    Seq('ACGTAACCGGTT')
    >>> t
    Seq('XYZGTAXYZCGGTT')

    For mutable sequences, passing inplace=True will modify the sequence in place:

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ")
    >>> m
    MutableSeq('ACGTAACCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ", inplace=True)
    >>> m
    MutableSeq('XYZGTAXYZCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``replace`` is called on a ``Seq`` object with ``inplace=True``.
    """

    def as_bytes(value):
        # Sequence objects and strings are converted to bytes; anything
        # else is handed to the underlying data unchanged.
        if isinstance(value, _SeqAbstractBaseClass):
            return bytes(value)
        if isinstance(value, str):
            return value.encode("ASCII")
        return value

    old = as_bytes(old)
    new = as_bytes(new)
    replaced = self._data.replace(old, new)
    if not inplace:
        return self.__class__(replaced)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Slice assignment also handles a replacement of a different length.
    self._data[:] = replaced
    return self
1999 | |
@property
def defined(self):
    """Return True if the sequence is defined, False if undefined or partially defined.

    Zero-length sequences are always considered to be defined.
    """
    data = self._data
    if not isinstance(data, (bytes, bytearray)):
        # Any other sequence data object reports its own status.
        return data.defined
    return True
2010 | |
@property
def defined_ranges(self):
    """Return a tuple of the ranges where the sequence contents is defined.

    The return value has the format ((start1, end1), (start2, end2), ...).
    A zero-length sequence has no defined ranges, so an empty tuple is
    returned for it.
    """
    data = self._data
    if not isinstance(data, (bytes, bytearray)):
        # Any other sequence data object reports its own ranges.
        return data.defined_ranges
    size = len(self)
    return ((0, size),) if size > 0 else ()
2025 | |
2026 | |
class Seq(_SeqAbstractBaseClass):
    """Read-only sequence object (essentially a string with biological methods).

    Like normal python strings, our basic sequence object is immutable.
    This prevents you from doing my_seq[5] = "A" for example, but does allow
    Seq objects to be used as dictionary keys.

    The Seq object provides a number of string like methods (such as count,
    find, split and strip).

    The Seq object also provides some biological methods, such as complement,
    reverse_complement, transcribe, back_transcribe and translate (which are
    not applicable to protein sequences).
    """

    _data: Union[bytes, SequenceDataAbstractBaseClass]

    def __init__(
        self,
        data: Union[
            str,
            bytes,
            bytearray,
            _SeqAbstractBaseClass,
            SequenceDataAbstractBaseClass,
            dict,
            None,
        ],
        length: Optional[int] = None,
    ):
        """Create a Seq object.

        Arguments:
         - data - Sequence, required (string)
         - length - Sequence length, used only if data is None or a dictionary (integer)

        You will typically use Bio.SeqIO to read in sequences from files as
        SeqRecord objects, whose sequence will be exposed as a Seq object via
        the seq property.

        However, you can also create a Seq object directly:

        >>> from Bio.Seq import Seq
        >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
        >>> my_seq
        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
        >>> print(my_seq)
        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF

        To create a Seq object for a sequence of known length but
        unknown sequence contents, use None for the data argument and pass
        the sequence length for the length argument. Trying to access the
        sequence contents of a Seq object created in this way will raise
        an UndefinedSequenceError:

        >>> my_undefined_sequence = Seq(None, 20)
        >>> my_undefined_sequence
        Seq(None, length=20)
        >>> len(my_undefined_sequence)
        20
        >>> print(my_undefined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is undefined

        If the sequence contents is known for parts of the sequence only, use
        a dictionary for the data argument to pass the known sequence segments:

        >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
        >>> my_partially_defined_sequence
        Seq({3: 'ACGT'}, length=10)
        >>> len(my_partially_defined_sequence)
        10
        >>> print(my_partially_defined_sequence)
        Traceback (most recent call last):
        ...
        Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
        >>> my_partially_defined_sequence[3:7]
        Seq('ACGT')
        >>> print(my_partially_defined_sequence[3:7])
        ACGT

        The dictionary keys are the start positions of the segments. They
        must not be negative, and the segments must neither overlap nor
        extend beyond the sequence length; otherwise a ValueError is raised.
        Directly adjacent segments are merged. A dictionary without any
        segments gives a fully undefined sequence of the given length.

        A ValueError is also raised if length is required but missing or
        negative, and a TypeError if data is of an unsupported type.
        """
        if data is None:
            if length is None:
                raise ValueError("length must not be None if data is None")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                self._data = _UndefinedSequenceData(length)
        elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
            self._data = data
        elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
            # Take an immutable copy of mutable or wrapped data
            self._data = bytes(data)
        elif isinstance(data, str):
            self._data = bytes(data, encoding="ASCII")
        elif isinstance(data, dict):
            if length is None:
                raise ValueError("length must not be None if data is a dictionary")
            elif length == 0:
                self._data = b""
            elif length < 0:
                raise ValueError("length must not be negative.")
            else:
                current = 0  # not needed here, but it keeps mypy happy
                end = -1  # sentinel: no segment has been stored yet
                starts = sorted(data.keys())
                if starts and starts[0] < 0:
                    # A start of -1 would otherwise match the sentinel above
                    # and fail with a KeyError on the first segment.
                    raise ValueError(
                        "Sequence data start positions must not be negative."
                    )
                _data: Dict[int, bytes] = {}
                for start in starts:
                    seq = data[start]
                    if isinstance(seq, str):
                        seq = bytes(seq, encoding="ASCII")
                    else:
                        try:
                            seq = bytes(seq)
                        except Exception:
                            raise ValueError(
                                "Expected bytes-like objects or strings"
                            ) from None
                    if start < end:
                        raise ValueError("Sequence data are overlapping.")
                    elif start == end:
                        # Directly follows the previous segment: merge them
                        _data[current] += seq  # noqa: F821
                    else:
                        _data[start] = seq
                        current = start
                    end = start + len(seq)
                if not _data:
                    # No segments at all: the sequence is fully undefined.
                    # Concatenation of partially defined data relies on at
                    # least one segment being present.
                    self._data = _UndefinedSequenceData(length)
                elif end > length:
                    raise ValueError(
                        "Provided sequence data extend beyond sequence length."
                    )
                elif end == length and current == 0:
                    # sequence is fully defined
                    self._data = _data[current]
                else:
                    self._data = _PartiallyDefinedSequenceData(length, _data)
        else:
            raise TypeError(
                "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
            )

    def __hash__(self):
        """Hash of the sequence as a string for comparison.

        See Seq object comparison documentation (method ``__eq__`` in
        particular) as this has changed in Biopython 1.65. Older versions
        would hash on object identity.
        """
        return hash(self._data)
2175 | |
2176 | |
class MutableSeq(_SeqAbstractBaseClass):
    """An editable sequence object.

    Unlike normal python strings and our basic sequence object (the Seq class)
    which are immutable, the MutableSeq lets you edit the sequence in place.
    However, this means you cannot use a MutableSeq object as a dictionary key.

    >>> from Bio.Seq import MutableSeq
    >>> my_seq = MutableSeq("ACTCGTCGTCG")
    >>> my_seq
    MutableSeq('ACTCGTCGTCG')
    >>> my_seq[5]
    'T'
    >>> my_seq[5] = "A"
    >>> my_seq
    MutableSeq('ACTCGACGTCG')
    >>> my_seq[5]
    'A'
    >>> my_seq[5:8] = "NNN"
    >>> my_seq
    MutableSeq('ACTCGNNNTCG')
    >>> len(my_seq)
    11

    Note that the MutableSeq object does not support as many string-like
    or biological methods as the Seq object.
    """

    def __init__(self, data):
        """Create a MutableSeq object.

        The data may be a string, a bytes or bytearray object, a Seq object
        or another MutableSeq object. A bytearray is stored as given (not
        copied); every other input is converted to a new bytearray.
        """
        if isinstance(data, bytearray):
            payload = data
        elif isinstance(data, bytes):
            payload = bytearray(data)
        elif isinstance(data, str):
            payload = bytearray(data, "ASCII")
        elif isinstance(data, MutableSeq):
            payload = data._data[:]  # Take a copy
        elif isinstance(data, Seq):
            # Make no assumptions about the Seq subclass internal storage
            payload = bytearray(bytes(data))
        else:
            raise TypeError(
                "data should be a string, bytearray object, Seq object, or a "
                "MutableSeq object"
            )
        self._data = payload

    def __setitem__(self, index, value):
        """Set a single letter or a subsequence to the given value.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq[0] = 'T'
        >>> my_seq
        MutableSeq('TCTCGACGTCG')
        """
        if isinstance(index, numbers.Integral):
            # A single position takes a single letter
            self._data[index] = ord(value)
            return
        # Otherwise a sub-sequence is being replaced
        if isinstance(value, MutableSeq):
            replacement = value._data
        elif isinstance(value, Seq):
            replacement = bytes(value)
        elif isinstance(value, str):
            replacement = value.encode("ASCII")
        else:
            raise TypeError(f"received unexpected type '{type(value).__name__}'")
        self._data[index] = replacement

    def __delitem__(self, index):
        """Delete a single letter or a subsequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> del my_seq[0]
        >>> my_seq
        MutableSeq('CTCGACGTCG')
        """
        # The index may be an integer (one letter) or a slice
        del self._data[index]

    def append(self, c):
        """Add a single letter to the end of the mutable sequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.append('A')
        >>> my_seq
        MutableSeq('ACTCGACGTCGA')

        No return value.
        """
        encoded = c.encode("ASCII")
        self._data.append(ord(encoded))

    def insert(self, i, c):
        """Insert a single letter into the mutable sequence at a given index.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.insert(0,'A')
        >>> my_seq
        MutableSeq('AACTCGACGTCG')
        >>> my_seq.insert(8,'G')
        >>> my_seq
        MutableSeq('AACTCGACGGTCG')

        No return value.
        """
        encoded = c.encode("ASCII")
        self._data.insert(i, ord(encoded))

    def pop(self, i=-1):
        """Remove the letter at the given index (by default the last) and return it.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.pop()
        'G'
        >>> my_seq
        MutableSeq('ACTCGACGTC')
        >>> my_seq.pop()
        'C'
        >>> my_seq
        MutableSeq('ACTCGACGT')

        Returns the removed letter as a string.
        """
        codepoint = self._data[i]
        del self._data[i]
        return chr(codepoint)

    def remove(self, item):
        """Remove the first occurrence of a single letter from the mutable sequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.remove('C')
        >>> my_seq
        MutableSeq('ATCGACGTCG')
        >>> my_seq.remove('A')
        >>> my_seq
        MutableSeq('TCGACGTCG')

        No return value. A ValueError is raised if the letter is not found.
        """
        wanted = ord(item)
        try:
            self._data.remove(wanted)
        except ValueError:
            # Hide the bytearray-specific message from the caller
            raise ValueError("value not found in MutableSeq") from None

    def reverse(self):
        """Reverse the mutable sequence in place.

        No return value.
        """
        self._data.reverse()

    def extend(self, other):
        """Add a sequence to the end of the mutable sequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.extend('A')
        >>> my_seq
        MutableSeq('ACTCGACGTCGA')
        >>> my_seq.extend('TTT')
        >>> my_seq
        MutableSeq('ACTCGACGTCGATTT')

        No return value. A TypeError is raised unless other is a string,
        a Seq object or a MutableSeq object.
        """
        if isinstance(other, MutableSeq):
            tail = other._data
        elif isinstance(other, Seq):
            tail = bytes(other)
        elif isinstance(other, str):
            tail = other.encode("ASCII")
        else:
            raise TypeError("expected a string, Seq or MutableSeq")
        self._data.extend(tail)
2350 | |
2351 | |
class UndefinedSequenceError(ValueError):
    """Raised when undefined sequence contents are requested.

    This is a ``ValueError`` subclass, so code catching ``ValueError``
    also catches it.
    """
2354 | |
2355 | |
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores the length of a sequence with an undefined sequence contents (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but an unknown sequence contents.
    Calling __len__ returns the sequence length, calling __getitem__ raises an
    UndefinedSequenceError except for requests of zero size, for which it
    returns an empty bytes object.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Initialize the object with the sequence length.

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        super().__init__()

    def __getitem__(self, key: slice) -> Union[bytes, "_UndefinedSequenceData"]:
        """Return the undefined data for a slice; any other key raises.

        A slice selecting no positions gives an empty bytes object, and any
        other slice gives an undefined sequence of the selected size. Keys
        that are not slices raise an UndefinedSequenceError.
        """
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            return _UndefinedSequenceData(size)
        else:
            raise UndefinedSequenceError("Sequence content is undefined")

    def __len__(self):
        """Return the sequence length."""
        return self._length

    def __bytes__(self):
        """Raise an UndefinedSequenceError, as there are no contents to return."""
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        """Return the concatenation of this undefined sequence and other."""
        length = len(self) + len(other)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                return _UndefinedSequenceData(length)
            else:
                return NotImplemented
                # _PartiallyDefinedSequenceData.__radd__ will handle this
        else:
            if not other:
                # Appending nothing must not create a zero-length defined
                # segment; the result is still fully undefined.
                return _UndefinedSequenceData(length)
            data = {len(self): other}
            return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        """Return the concatenation of other and this undefined sequence."""
        prefix = bytes(other)
        length = len(other) + len(self)
        if not prefix:
            # Prepending nothing must not create a zero-length defined
            # segment; the result is still fully undefined.
            return _UndefinedSequenceData(length)
        data = {0: prefix}
        return _PartiallyDefinedSequenceData(length, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        # An upper case copy of an undefined sequence is an undefined
        # sequence of the same length
        return _UndefinedSequenceData(self._length)

    def lower(self):
        """Return a lower case copy of the sequence."""
        # A lower case copy of an undefined sequence is an undefined
        # sequence of the same length
        return _UndefinedSequenceData(self._length)

    def isupper(self):
        """Raise an UndefinedSequenceError.

        Whether the characters are uppercase cannot be determined for an
        undefined sequence.
        """
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Raise an UndefinedSequenceError.

        Whether the characters are lowercase cannot be determined for an
        undefined sequence.
        """
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # Replacing substring old by new in an undefined sequence will result
        # in an undefined sequence of the same length, if old and new have the
        # same number of characters. Otherwise the length of the result
        # cannot be known, and an error is raised.
        if len(old) != len(new):
            raise UndefinedSequenceError("Sequence content is undefined")
        return _UndefinedSequenceData(self._length)

    @property
    def defined(self):
        """Return False, as the sequence is not defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        As the sequence contents of an _UndefinedSequenceData object is fully
        undefined, the return value is always an empty tuple.
        """
        return ()
2462 | |
2463 | |
2464 class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass): | |
2465 """Stores the length of a sequence with an undefined sequence contents (PRIVATE). | |
2466 | |
2467 Objects of this class can be used to create a Seq object to represent | |
2468 sequences with a known length, but with a sequence contents that is only | |
2469 partially known. | |
2470 Calling __len__ returns the sequence length, calling __getitem__ returns | |
2471 the sequence contents if known, otherwise an UndefinedSequenceError is | |
2472 raised. | |
2473 """ | |
2474 | |
2475 __slots__ = ("_length", "_data") | |
2476 | |
2477 def __init__(self, length, data): | |
2478 """Initialize with the sequence length and defined sequence segments. | |
2479 | |
2480 The calling function is responsible for ensuring that the length is | |
2481 greater than zero. | |
2482 """ | |
2483 self._length = length | |
2484 self._data = data | |
2485 super().__init__() | |
2486 | |
2487 def __getitem__( | |
2488 self, key: Union[slice, int] | |
2489 ) -> Union[bytes, SequenceDataAbstractBaseClass]: | |
2490 if isinstance(key, slice): | |
2491 start, end, step = key.indices(self._length) | |
2492 size = len(range(start, end, step)) | |
2493 if size == 0: | |
2494 return b"" | |
2495 data = {} | |
2496 for s, d in self._data.items(): | |
2497 indices = range(-s, -s + self._length)[key] | |
2498 e: Optional[int] = indices.stop | |
2499 assert e is not None | |
2500 if step > 0: | |
2501 if e <= 0: | |
2502 continue | |
2503 if indices.start < 0: | |
2504 s = indices.start % step | |
2505 else: | |
2506 s = indices.start | |
2507 else: # step < 0 | |
2508 if e < 0: | |
2509 e = None | |
2510 end = len(d) - 1 | |
2511 if indices.start > end: | |
2512 s = end + (indices.start - end) % step | |
2513 else: | |
2514 s = indices.start | |
2515 if s < 0: | |
2516 continue | |
2517 start = (s - indices.start) // step | |
2518 d = d[s:e:step] | |
2519 if d: | |
2520 data[start] = d | |
2521 if len(data) == 0: # Fully undefined sequence | |
2522 return _UndefinedSequenceData(size) | |
2523 # merge adjacent sequence segments | |
2524 end = -1 | |
2525 previous = 0 # not needed here, but it keeps flake happy | |
2526 items = data.items() | |
2527 data = {} | |
2528 for start, seq in items: | |
2529 if end == start: | |
2530 data[previous] += seq | |
2531 else: | |
2532 data[start] = seq | |
2533 previous = start | |
2534 end = start + len(seq) | |
2535 if len(data) == 1: | |
2536 seq = data.get(0) | |
2537 if seq is not None and len(seq) == size: | |
2538 return seq # Fully defined sequence; return bytes | |
2539 if step < 0: | |
2540 # use this after we drop Python 3.7: | |
2541 # data = {start: data[start] for start in reversed(data)} | |
2542 # use this as long as we support Python 3.7: | |
2543 data = {start: data[start] for start in reversed(list(data.keys()))} | |
2544 return _PartiallyDefinedSequenceData(size, data) | |
2545 elif self._length <= key: | |
2546 raise IndexError("sequence index out of range") | |
2547 else: | |
2548 for start, seq in self._data.items(): | |
2549 if start <= key and key < start + len(seq): | |
2550 return seq[key - start] | |
2551 raise UndefinedSequenceError("Sequence at position %d is undefined" % key) | |
2552 | |
2553 def __len__(self): | |
2554 return self._length | |
2555 | |
2556 def __bytes__(self): | |
2557 raise UndefinedSequenceError("Sequence content is only partially defined") | |
2558 | |
2559 def __add__(self, other): | |
2560 length = len(self) + len(other) | |
2561 data = dict(self._data) | |
2562 items = list(self._data.items()) | |
2563 start, seq = items[-1] | |
2564 end = start + len(seq) | |
2565 try: | |
2566 other = bytes(other) | |
2567 except UndefinedSequenceError: | |
2568 if isinstance(other, _UndefinedSequenceData): | |
2569 pass | |
2570 elif isinstance(other, _PartiallyDefinedSequenceData): | |
2571 other_items = list(other._data.items()) | |
2572 if end == len(self): | |
2573 other_start, other_seq = other_items.pop(0) | |
2574 if other_start == 0: | |
2575 data[start] += other_seq | |
2576 else: | |
2577 data[len(self) + other_start] = other_seq | |
2578 for other_start, other_seq in other_items: | |
2579 data[len(self) + other_start] = other_seq | |
2580 else: | |
2581 if end == len(self): | |
2582 data[start] += other | |
2583 else: | |
2584 data[len(self)] = other | |
2585 return _PartiallyDefinedSequenceData(length, data) | |
2586 | |
2587 def __radd__(self, other): | |
2588 length = len(other) + len(self) | |
2589 try: | |
2590 other = bytes(other) | |
2591 except UndefinedSequenceError: | |
2592 data = {len(other) + start: seq for start, seq in self._data.items()} | |
2593 else: | |
2594 data = {0: other} | |
2595 items = list(self._data.items()) | |
2596 start, seq = items.pop(0) | |
2597 if start == 0: | |
2598 data[0] += seq | |
2599 else: | |
2600 data[len(other) + start] = seq | |
2601 for start, seq in items: | |
2602 data[len(other) + start] = seq | |
2603 return _PartiallyDefinedSequenceData(length, data) | |
2604 | |
2605 def __mul__(self, other): | |
2606 length = self._length | |
2607 items = self._data.items() | |
2608 data = {} | |
2609 end = -1 | |
2610 previous = 0 # not needed here, but it keeps flake happy | |
2611 for i in range(other): | |
2612 for start, seq in items: | |
2613 start += i * length | |
2614 if end == start: | |
2615 data[previous] += seq | |
2616 else: | |
2617 data[start] = seq | |
2618 previous = start | |
2619 end = start + len(seq) | |
2620 return _PartiallyDefinedSequenceData(length * other, data) | |
2621 | |
2622 def upper(self): | |
2623 """Return an upper case copy of the sequence.""" | |
2624 data = {start: seq.upper() for start, seq in self._data.items()} | |
2625 return _PartiallyDefinedSequenceData(self._length, data) | |
2626 | |
2627 def lower(self): | |
2628 """Return a lower case copy of the sequence.""" | |
2629 data = {start: seq.lower() for start, seq in self._data.items()} | |
2630 return _PartiallyDefinedSequenceData(self._length, data) | |
2631 | |
2632 def isupper(self): | |
2633 """Return True if all ASCII characters in data are uppercase. | |
2634 | |
2635 If there are no cased characters, the method returns False. | |
2636 """ | |
2637 # Character case is irrelevant for an undefined sequence | |
2638 raise UndefinedSequenceError("Sequence content is only partially defined") | |
2639 | |
2640 def islower(self): | |
2641 """Return True if all ASCII characters in data are lowercase. | |
2642 | |
2643 If there are no cased characters, the method returns False. | |
2644 """ | |
2645 # Character case is irrelevant for an undefined sequence | |
2646 raise UndefinedSequenceError("Sequence content is only partially defined") | |
2647 | |
def translate(self, table, delete=b""):
    """Return a copy with each character mapped by the given translation table.

    table
        Translation table, which must be a bytes object of length 256.

    All characters occurring in the optional argument delete are removed.
    The remaining characters are mapped through the given translation table.
    """
    # NOTE(review): characters listed in ``delete`` shorten a segment, yet the
    # segment start positions and the total length are passed on unchanged.
    translated = {}
    for offset, fragment in self._data.items():
        translated[offset] = fragment.translate(table, delete)
    return _PartiallyDefinedSequenceData(self._length, translated)
2660 | |
def replace(self, old, new):
    """Return a copy with all occurrences of substring old replaced by new.

    Raises UndefinedSequenceError if old and new differ in length.
    """
    # An equal-length substitution leaves every segment the same size, so the
    # start positions stay valid. With unequal lengths the positions of the
    # segments could no longer be calculated reliably, hence the error below.
    if len(old) == len(new):
        replaced = {}
        for offset, fragment in self._data.items():
            replaced[offset] = fragment.replace(old, new)
        return _PartiallyDefinedSequenceData(self._length, replaced)
    raise UndefinedSequenceError(
        "Sequence content is only partially defined; substring \n"
        "replacement cannot be performed reliably"
    )
2675 | |
@property
def defined(self):
    """Return False, as the sequence is not fully defined and has a non-zero length."""
    is_fully_defined = False
    return is_fully_defined
2680 | |
@property
def defined_ranges(self):
    """Return a tuple of the ranges where the sequence contents is defined.

    The return value has the format ((start1, end1), (start2, end2), ...),
    one (start, end) pair per stored segment, in the iteration order of the
    underlying mapping.
    """
    ranges = []
    for begin, fragment in self._data.items():
        ranges.append((begin, begin + len(fragment)))
    return tuple(ranges)
2688 | |
2689 | |
2690 # The transcribe, back_transcribe, and translate functions are | |
2691 # user-friendly versions of the corresponding Seq/MutableSeq methods. | |
2692 # The functions work both on Seq objects, and on strings. | |
2693 | |
2694 | |
def transcribe(dna):
    """Transcribe a DNA sequence into RNA.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. This
    means we can get the RNA sequence just by switching T to U.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> transcribe("ACTGN")
    'ACUGN'
    """
    if isinstance(dna, Seq):
        return dna.transcribe()
    if isinstance(dna, MutableSeq):
        # Convert to an immutable Seq first, so that a Seq is returned.
        return Seq(dna).transcribe()
    # Anything else is treated as a plain string: swap T for U in both cases.
    upper_swapped = dna.replace("T", "U")
    return upper_swapped.replace("t", "u")
2717 | |
2718 | |
def back_transcribe(rna):
    """Return the RNA sequence back-transcribed into DNA.

    If given a string, returns a new string object.

    Given a Seq or MutableSeq, returns a new Seq object.

    e.g.

    >>> back_transcribe("ACUGN")
    'ACTGN'
    """
    if isinstance(rna, Seq):
        return rna.back_transcribe()
    if isinstance(rna, MutableSeq):
        # Convert to an immutable Seq first, so that a Seq is returned.
        return Seq(rna).back_transcribe()
    # Anything else is treated as a plain string: swap U for T in both cases.
    upper_swapped = rna.replace("U", "T")
    return upper_swapped.replace("u", "t")
2737 | |
2738 | |
def _translate_str(
    sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
):
    """Translate nucleotide string into a protein string (PRIVATE).

    Arguments:
     - sequence - a string
     - table - Which codon table to use? This can be either a name (string),
       an NCBI identifier (integer), or a CodonTable object (useful for
       non-standard genetic codes).
     - stop_symbol - a single character string, what to use for terminators.
     - to_stop - boolean, should translation terminate at the first
       in frame stop codon?  If there is no in-frame stop codon
       then translation continues to the end.
     - pos_stop - a single character string for a possible stop codon
       (e.g. TAN or NNN)
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.

    Returns a string.

    e.g.

    >>> from Bio.Data import CodonTable
    >>> table = CodonTable.ambiguous_dna_by_id[1]
    >>> _translate_str("AAA", table)
    'K'
    >>> _translate_str("TAR", table)
    '*'
    >>> _translate_str("TAN", table)
    'X'
    >>> _translate_str("TAN", table, pos_stop="@")
    '@'
    >>> _translate_str("TA?", table)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid

    In a change to older versions of Biopython, partial codons are now
    always regarded as an error (previously only checked if cds=True)
    and will trigger a warning (likely to become an exception in a
    future release).

    If **cds=True**, the start and stop codons are checked, and the start
    codon will be translated as methionine. The sequence must be a
    whole number of codons.

    >>> _translate_str("ATGCCCTAG", table, cds=True)
    'MP'
    >>> _translate_str("AAACCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
    >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found.
    """
    # Work out which codon table is meant: an integer-like value is an NCBI
    # table ID, a string is a table name, anything else must be a CodonTable.
    try:
        table_id = int(table)
    except ValueError:
        # Assume it's a table name
        # The same table can be used for RNA or DNA
        try:
            codon_table = CodonTable.ambiguous_generic_by_name[table]
        except KeyError:
            if not isinstance(table, str):
                raise TypeError("table argument must be integer or string") from None
            raise ValueError(
                "The Bio.Seq translate methods and function DO NOT "
                "take a character string mapping table like the python "
                "string object's translate method. "
                "Use str(my_seq).translate(...) instead."
            ) from None
    except (AttributeError, TypeError):
        # Assume it's a CodonTable object
        if not isinstance(table, CodonTable.CodonTable):
            raise ValueError("Bad table argument") from None
        codon_table = table
    else:
        # Assume it's a table ID
        # The same table can be used for RNA or DNA
        codon_table = CodonTable.ambiguous_generic_by_id[table_id]

    sequence = sequence.upper()
    amino_acids = []
    forward_table = codon_table.forward_table
    stop_codons = codon_table.stop_codons
    alphabet = codon_table.nucleotide_alphabet
    if alphabet is None:
        # Assume the worst case, ambiguous DNA or RNA:
        valid_letters = set(
            IUPACData.ambiguous_dna_letters.upper()
            + IUPACData.ambiguous_rna_letters.upper()
        )
    else:
        valid_letters = set(alphabet.upper())
    n = len(sequence)

    # Check for tables with 'ambiguous' (dual-coding) stop codons:
    dual_coding = [stop for stop in stop_codons if stop in forward_table]
    if dual_coding:
        example = dual_coding[0]
        if to_stop:
            raise ValueError(
                "You cannot use 'to_stop=True' with this table as it contains"
                f" {len(dual_coding)} codon(s) which can be both STOP and an"
                f" amino acid (e.g. '{example}' -> '{forward_table[example]}' or STOP)."
            )
        warnings.warn(
            f"This table contains {len(dual_coding)} codon(s) which code(s) for"
            f" both STOP and an amino acid (e.g. '{example}' -> '{forward_table[example]}'"
            " or STOP). Such codons will be translated as amino acid.",
            BiopythonWarning,
        )

    if cds:
        first_codon = sequence[:3]
        if str(first_codon).upper() not in codon_table.start_codons:
            raise CodonTable.TranslationError(
                f"First codon '{first_codon}' is not a start codon"
            )
        if n % 3 != 0:
            raise CodonTable.TranslationError(
                f"Sequence length {n} is not a multiple of three"
            )
        last_codon = sequence[-3:]
        if str(last_codon).upper() not in stop_codons:
            raise CodonTable.TranslationError(
                f"Final codon '{last_codon}' is not a stop codon"
            )
        # Don't translate the stop symbol, and manually translate the M
        sequence = sequence[3:-3]
        n -= 6
        amino_acids = ["M"]
    elif n % 3 != 0:
        warnings.warn(
            "Partial codon, len(sequence) not a multiple of three. "
            "Explicitly trim the sequence or add trailing N before "
            "translation. This may become an error in future.",
            BiopythonWarning,
        )

    if gap is not None:
        if not isinstance(gap, str):
            raise TypeError("Gap character should be a single character string.")
        if len(gap) > 1:
            raise ValueError("Gap character should be a single character string.")

    # Walk over the complete codons only; a trailing partial codon is ignored.
    for offset in range(0, n - n % 3, 3):
        codon = sequence[offset : offset + 3]
        try:
            residue = forward_table[codon]
        except (KeyError, CodonTable.TranslationError):
            # No amino acid for this codon; decide what it stands for instead.
            if codon in stop_codons:
                if cds:
                    raise CodonTable.TranslationError(
                        f"Extra in frame stop codon '{codon}' found."
                    ) from None
                if to_stop:
                    break
                residue = stop_symbol
            elif set(codon) <= valid_letters:
                # Possible stop codon (e.g. NNN or TAN)
                residue = pos_stop
            elif gap is not None and codon == gap * 3:
                # Gapped translation
                residue = gap
            else:
                raise CodonTable.TranslationError(
                    f"Codon '{codon}' is invalid"
                ) from None
        amino_acids.append(residue)
    return "".join(amino_acids)
2916 | |
2917 | |
def translate(
    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object. Given a Seq or
    MutableSeq, returns a Seq object.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable object
       (useful for non-standard genetic codes).  Defaults to the "Standard"
       table.
     - stop_symbol - Single character string, what to use for any
       terminators, defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons
       (translated as the specified stop_symbol).  If
       True, translation is terminated at the first in
       frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None. For a Seq or MutableSeq, a gap given here is
       forwarded to the object's translate method; when left as None the
       method's own default gap handling applies.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    It will however translate either DNA or RNA.

    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
    stop codons'. These are stop codons with unambiguous sequence but which
    have a context dependent coding as STOP or as amino acid. With these tables
    'to_stop' must be False (otherwise a ValueError is raised). The dual
    coding codons will always be translated as amino acid, except for
    'cds=True', where the last codon will be translated as STOP.

    >>> coding_dna3 = "ATGGCACGGAAGTGA"
    >>> translate(coding_dna3)
    'MARK*'

    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
    'MARKW'

    It will however raise a BiopythonWarning (not shown).

    >>> translate(coding_dna3, table=27, cds=True)
    'MARK'

    >>> translate(coding_dna3, table=27, to_stop=True)
    Traceback (most recent call last):
       ...
    ValueError: You cannot use 'to_stop=True' with this table ...
    """
    # Previously an explicit gap was silently dropped for Seq/MutableSeq input.
    # Forward it only when the caller set it, so that calls which leave gap as
    # None keep using the translate method's own default exactly as before.
    gap_option = {} if gap is None else {"gap": gap}
    if isinstance(sequence, Seq):
        return sequence.translate(table, stop_symbol, to_stop, cds, **gap_option)
    if isinstance(sequence, MutableSeq):
        # Return a Seq object
        return Seq(sequence).translate(table, stop_symbol, to_stop, cds, **gap_option)
    # Assume it's a string, return a string
    return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)
3020 | |
3021 | |
def reverse_complement(sequence, inplace=False):
    """Return the reverse complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> reverse_complement(my_seq)
    'TCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement(my_seq)
    Seq('TCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> reverse_complement(Seq("CGAUT"))
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement("ACGTUacgtuXYZxyz")
    'zrxZRXaacgtAACGT'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq, inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # The sequence classes implement the operation (and inplace) themselves.
        return sequence.reverse_complement(inplace)
    if isinstance(sequence, SeqRecord):
        if not inplace:
            return sequence.reverse_complement()
        raise TypeError("SeqRecords are immutable")
    # Assume it's a string.
    if inplace:
        raise TypeError("strings are immutable")
    complemented = sequence.encode("ASCII").translate(_dna_complement_table)
    return complemented.decode("ASCII")[::-1]
3085 | |
3086 | |
def reverse_complement_rna(sequence, inplace=False):
    """Return the reverse complement as an RNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> reverse_complement_rna(my_seq)
    'UCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement_rna(my_seq)
    Seq('UCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> reverse_complement(Seq("CGAUT"), inplace=False)
    Seq('AATCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement_rna("ACGTUacgtuXYZxyz")
    'zrxZRXaacguAACGU'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq, inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # The sequence classes implement the operation (and inplace) themselves.
        return sequence.reverse_complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if not inplace:
            return sequence.reverse_complement_rna()
        raise TypeError("SeqRecords are immutable")
    # Assume it's a string.
    if inplace:
        raise TypeError("strings are immutable")
    complemented = sequence.encode("ASCII").translate(_rna_complement_table)
    return complemented.decode("ASCII")[::-1]
3150 | |
3151 | |
def complement(sequence, inplace=False):
    """Return the complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement(my_seq)
    'GCT'
    >>> my_seq = Seq("CGA")
    >>> complement(my_seq)
    Seq('GCT')
    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement("ACGTUacgtuXYZxyz")
    'TGCAAtgcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq, inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement`` is called on a ``Seq`` object with
    ``inplace=True``. A ``TypeError`` is likewise raised for a string or
    a ``SeqRecord`` when ``inplace`` is true.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # The sequence classes implement the operation (and inplace) themselves.
        return sequence.complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement()
    # Assume it's a string.
    # Test truthiness, as the sibling functions do: the previous identity
    # check (``inplace is True``) let truthy values such as 1 slip through
    # and silently return a new string.
    if inplace:
        raise TypeError("strings are immutable")
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_dna_complement_table)
    return sequence.decode("ASCII")
3214 | |
3215 | |
def complement_rna(sequence, inplace=False):
    """Return the complement as an RNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement_rna(my_seq)
    'GCU'
    >>> my_seq = Seq("CGA")
    >>> complement_rna(my_seq)
    Seq('GCU')
    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement_rna("ACGTUacgtuXYZxyz")
    'UGCAAugcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq, inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As strings and ``Seq`` objects are immutable, a ``TypeError`` is
    raised if ``complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``. A ``TypeError`` is likewise raised for a string or
    a ``SeqRecord`` when ``inplace`` is true.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        # The sequence classes implement the operation (and inplace) themselves.
        return sequence.complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement_rna()
    # Assume it's a string.
    if inplace:
        raise TypeError("strings are immutable")
    # Work on ASCII bytes so the 256-entry complement table can be applied.
    complemented = sequence.encode("ASCII").translate(_rna_complement_table)
    return complemented.decode("ASCII")
3278 | |
3279 | |
def _test():
    """Run the Bio.Seq module's doctests (PRIVATE)."""
    print("Running doctests...")
    from doctest import IGNORE_EXCEPTION_DETAIL, testmod

    # Only the exception type is compared in doctests that expect a traceback.
    testmod(optionflags=IGNORE_EXCEPTION_DETAIL)
    print("Done")


# Run the doctests when this file is executed as a script.
if __name__ == "__main__":
    _test()