jpayne@68
|
1 # Copyright 2000 Andrew Dalke.
|
jpayne@68
|
2 # Copyright 2000-2002 Brad Chapman.
|
jpayne@68
|
3 # Copyright 2004-2005, 2010 by M de Hoon.
|
jpayne@68
|
4 # Copyright 2007-2023 by Peter Cock.
|
jpayne@68
|
5 # All rights reserved.
|
jpayne@68
|
6 #
|
jpayne@68
|
7 # This file is part of the Biopython distribution and governed by your
|
jpayne@68
|
8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
jpayne@68
|
9 # Please see the LICENSE file that should have been included as part of this
|
jpayne@68
|
10 # package.
|
jpayne@68
|
11 """Provide objects to represent biological sequences.
|
jpayne@68
|
12
|
jpayne@68
|
13 See also the Seq_ wiki and the chapter in our tutorial:
|
jpayne@68
|
14 - `HTML Tutorial`_
|
jpayne@68
|
15 - `PDF Tutorial`_
|
jpayne@68
|
16
|
jpayne@68
|
17 .. _Seq: http://biopython.org/wiki/Seq
|
jpayne@68
|
18 .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
|
jpayne@68
|
19 .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
|
jpayne@68
|
20
|
jpayne@68
|
21 """
|
jpayne@68
|
22 import array
|
jpayne@68
|
23 import collections
|
jpayne@68
|
24 import numbers
|
jpayne@68
|
25 import warnings
|
jpayne@68
|
26
|
jpayne@68
|
27 from abc import ABC
|
jpayne@68
|
28 from abc import abstractmethod
|
jpayne@68
|
29 from typing import overload, Optional, Union, Dict
|
jpayne@68
|
30
|
jpayne@68
|
31 from Bio import BiopythonWarning
|
jpayne@68
|
32 from Bio.Data import CodonTable
|
jpayne@68
|
33 from Bio.Data import IUPACData
|
jpayne@68
|
34
|
jpayne@68
|
35
|
jpayne@68
|
36 def _maketrans(complement_mapping):
|
jpayne@68
|
37 """Make a python string translation table (PRIVATE).
|
jpayne@68
|
38
|
jpayne@68
|
39 Arguments:
|
jpayne@68
|
40 - complement_mapping - a dictionary such as ambiguous_dna_complement
|
jpayne@68
|
41 and ambiguous_rna_complement from Data.IUPACData.
|
jpayne@68
|
42
|
jpayne@68
|
43 Returns a translation table (a bytes object of length 256) for use with
|
jpayne@68
|
44 the python string's translate method to use in a (reverse) complement.
|
jpayne@68
|
45
|
jpayne@68
|
46 Compatible with lower case and upper case sequences.
|
jpayne@68
|
47
|
jpayne@68
|
48 For internal use only.
|
jpayne@68
|
49 """
|
jpayne@68
|
50 keys = "".join(complement_mapping.keys()).encode("ASCII")
|
jpayne@68
|
51 values = "".join(complement_mapping.values()).encode("ASCII")
|
jpayne@68
|
52 return bytes.maketrans(keys + keys.lower(), values + values.lower())
|
jpayne@68
|
53
|
jpayne@68
|
54
|
jpayne@68
|
# DNA complement table: the IUPAC ambiguous DNA complements, with "U" given
# the same complement as "T". The mapping is built inline so that no
# temporary name is left in the module namespace.
_dna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_dna_complement,
        "U": IUPACData.ambiguous_dna_complement["T"],
    }
)
# RNA complement table: the IUPAC ambiguous RNA complements, with "T" given
# the same complement as "U".
_rna_complement_table = _maketrans(
    {
        **IUPACData.ambiguous_rna_complement,
        "T": IUPACData.ambiguous_rna_complement["U"],
    }
)
|
jpayne@68
|
63
|
jpayne@68
|
64
|
jpayne@68
|
class SequenceDataAbstractBaseClass(ABC):
    """Abstract base class for sequence content providers.

    Most users will not need to use this class. It is used internally as a base
    class for sequence content provider classes such as _UndefinedSequenceData
    defined in this module, and _TwoBitSequenceData in Bio.SeqIO.TwoBitIO.
    Instances of these classes can be used instead of a ``bytes`` object as the
    data argument when creating a Seq object, and provide the sequence content
    only when requested via ``__getitem__``. This allows lazy parsers to load
    and parse sequence data from a file only for the requested sequence regions,
    and _UndefinedSequenceData instances to raise an exception when undefined
    sequence data are requested.

    Future implementations of lazy parsers that similarly provide on-demand
    parsing of sequence data should use a subclass of this abstract class and
    implement the abstract methods ``__len__`` and ``__getitem__``:

    * ``__len__`` must return the sequence length;
    * ``__getitem__`` must return

      * a ``bytes`` object for the requested region; or
      * a new instance of the subclass for the requested region; or
      * raise an ``UndefinedSequenceError``.

      Calling ``__getitem__`` for a sequence region of size zero should always
      return an empty ``bytes`` object.
      Calling ``__getitem__`` for the full sequence (as in data[:]) should
      either return a ``bytes`` object with the full sequence, or raise an
      ``UndefinedSequenceError``.

    Subclasses of SequenceDataAbstractBaseClass must call ``super().__init__()``
    as part of their ``__init__`` method.

    Apart from ``__len__``, ``__getitem__`` and the ``defined`` properties,
    the methods below materialize the full sequence with ``bytes(self)`` and
    delegate to the corresponding ``bytes`` method.
    """

    __slots__ = ()

    def __init__(self):
        """Check if ``__getitem__`` returns a bytes-like object."""
        assert self[:0] == b""

    @abstractmethod
    def __len__(self):
        """Return the sequence length (to be implemented by subclasses)."""

    @abstractmethod
    def __getitem__(self, key):
        """Return the data for the requested region (to be implemented by subclasses)."""

    def __bytes__(self):
        """Return the full sequence, obtained as ``self[:]``."""
        return self[:]

    def __hash__(self):
        """Return the hash of the full sequence as a ``bytes`` object."""
        return hash(bytes(self))

    def __eq__(self, other):
        """Compare the full sequence, as ``bytes``, for equality with other."""
        return bytes(self) == other

    def __lt__(self, other):
        """Implement the less-than operand on the full sequence as ``bytes``."""
        return bytes(self) < other

    def __le__(self, other):
        """Implement the less-than or equal operand on the full sequence as ``bytes``."""
        return bytes(self) <= other

    def __gt__(self, other):
        """Implement the greater-than operand on the full sequence as ``bytes``."""
        return bytes(self) > other

    def __ge__(self, other):
        """Implement the greater-than or equal operand on the full sequence as ``bytes``."""
        return bytes(self) >= other

    def __add__(self, other):
        """Return the concatenation of both operands as a ``bytes`` object.

        Returns NotImplemented if converting either operand to ``bytes``
        raises an ``UndefinedSequenceError``.
        """
        try:
            return bytes(self) + bytes(other)
        except UndefinedSequenceError:
            return NotImplemented
            # will be handled by _UndefinedSequenceData.__radd__ or
            # by _PartiallyDefinedSequenceData.__radd__

    def __radd__(self, other):
        """Return other concatenated with the full sequence as ``bytes``."""
        return other + bytes(self)

    def __mul__(self, other):
        """Return the full sequence as ``bytes``, repeated other times."""
        return other * bytes(self)

    def __contains__(self, item):
        """Return True if item is contained in the full sequence as ``bytes``."""
        return bytes(self).__contains__(item)

    def decode(self, encoding="utf-8"):
        """Decode the data as bytes using the codec registered for encoding.

        encoding
          The encoding with which to decode the bytes.
        """
        return bytes(self).decode(encoding)

    def count(self, sub, start=None, end=None):
        """Return the number of non-overlapping occurrences of sub in data[start:end].

        Optional arguments start and end are interpreted as in slice notation.
        This method behaves as the count method of Python strings.
        """
        return bytes(self).count(sub, start, end)

    def find(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).find(sub, start, end)

    def rfind(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Return -1 on failure.
        """
        return bytes(self).rfind(sub, start, end)

    def index(self, sub, start=None, end=None):
        """Return the lowest index in data where subsection sub is found.

        Return the lowest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).index(sub, start, end)

    def rindex(self, sub, start=None, end=None):
        """Return the highest index in data where subsection sub is found.

        Return the highest index in data where subsection sub is found,
        such that sub is contained within data[start:end].  Optional
        arguments start and end are interpreted as in slice notation.

        Raises ValueError when the subsection is not found.
        """
        return bytes(self).rindex(sub, start, end)

    def startswith(self, prefix, start=None, end=None):
        """Return True if data starts with the specified prefix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        prefix can also be a tuple of bytes to try.
        """
        return bytes(self).startswith(prefix, start, end)

    def endswith(self, suffix, start=None, end=None):
        """Return True if data ends with the specified suffix, False otherwise.

        With optional start, test data beginning at that position.
        With optional end, stop comparing data at that position.
        suffix can also be a tuple of bytes to try.
        """
        return bytes(self).endswith(suffix, start, end)

    def split(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.
        """
        return bytes(self).split(sep, maxsplit)

    def rsplit(self, sep=None, maxsplit=-1):
        """Return a list of the sections in the data, using sep as the delimiter.

        sep
          The delimiter according to which to split the data.
          None (the default value) means split on ASCII whitespace characters
          (space, tab, return, newline, formfeed, vertical tab).
        maxsplit
          Maximum number of splits to do.
          -1 (the default value) means no limit.

        Splitting is done starting at the end of the data and working to the front.
        """
        return bytes(self).rsplit(sep, maxsplit)

    def strip(self, chars=None):
        """Strip leading and trailing characters contained in the argument.

        If the argument is omitted or None, strip leading and trailing ASCII whitespace.
        """
        return bytes(self).strip(chars)

    def lstrip(self, chars=None):
        """Strip leading characters contained in the argument.

        If the argument is omitted or None, strip leading ASCII whitespace.
        """
        return bytes(self).lstrip(chars)

    def rstrip(self, chars=None):
        """Strip trailing characters contained in the argument.

        If the argument is omitted or None, strip trailing ASCII whitespace.
        """
        return bytes(self).rstrip(chars)

    def removeprefix(self, prefix):
        """Remove the prefix if present.

        Returns the full sequence as ``bytes``, without the leading prefix
        if the data starts with it.
        """
        data = bytes(self)
        try:
            return data.removeprefix(prefix)
        except AttributeError:
            # bytes.removeprefix needs Python 3.9+; emulate it on older versions.
            if data.startswith(prefix):
                return data[len(prefix) :]
            return data

    def removesuffix(self, suffix):
        """Remove the suffix if present.

        Returns the full sequence as ``bytes``, without the trailing suffix
        if the data ends with it.
        """
        data = bytes(self)
        try:
            return data.removesuffix(suffix)
        except AttributeError:
            # bytes.removesuffix needs Python 3.9+; emulate it on older versions.
            # The suffix must be matched at the END of the data, and an empty
            # suffix must be skipped because data[:-0] is the empty slice.
            if suffix and data.endswith(suffix):
                return data[: -len(suffix)]
            return data

    def upper(self):
        """Return a copy of data with all ASCII characters converted to uppercase."""
        return bytes(self).upper()

    def lower(self):
        """Return a copy of data with all ASCII characters converted to lowercase."""
        return bytes(self).lower()

    def isupper(self):
        """Return True if all ASCII characters in data are uppercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).isupper()

    def islower(self):
        """Return True if all ASCII characters in data are lowercase.

        If there are no cased characters, the method returns False.
        """
        return bytes(self).islower()

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        return bytes(self).replace(old, new)

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        return bytes(self).translate(table, delete)

    @property
    def defined(self):
        """Return True if the sequence is defined, False if undefined or partially defined.

        Zero-length sequences are always considered to be defined.
        """
        return True

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        length = len(self)
        if length > 0:
            return ((0, length),)
        else:
            return ()
|
jpayne@68
|
360
|
jpayne@68
|
361
|
jpayne@68
|
362 class _SeqAbstractBaseClass(ABC):
|
jpayne@68
|
363 """Abstract base class for the Seq and MutableSeq classes (PRIVATE).
|
jpayne@68
|
364
|
jpayne@68
|
365 Most users will not need to use this class. It is used internally as an
|
jpayne@68
|
366 abstract base class for Seq and MutableSeq, as most of their methods are
|
jpayne@68
|
367 identical.
|
jpayne@68
|
368 """
|
jpayne@68
|
369
|
jpayne@68
|
370 __slots__ = ("_data",)
|
jpayne@68
|
371 __array_ufunc__ = None # turn off numpy Ufuncs
|
jpayne@68
|
372
|
jpayne@68
|
373 @abstractmethod
|
jpayne@68
|
374 def __init__(self):
|
jpayne@68
|
375 pass
|
jpayne@68
|
376
|
jpayne@68
|
377 def __bytes__(self):
|
jpayne@68
|
378 return bytes(self._data)
|
jpayne@68
|
379
|
jpayne@68
|
def __repr__(self):
    """Return a (possibly truncated) representation of the sequence.

    Sequence text longer than 60 letters is abbreviated to its first 54
    letters, three dots, and its last three letters.
    """
    data = self._data
    if isinstance(data, _UndefinedSequenceData):
        return f"Seq(None, length={len(self)})"
    if isinstance(data, _PartiallyDefinedSequenceData):
        # Show each stored fragment keyed by its position.
        shown = {}
        for position, fragment in data._data.items():
            if len(fragment) > 60:
                head = fragment[:54].decode("ASCII")
                tail = fragment[-3:].decode("ASCII")
                text = f"{head}...{tail}"
            else:
                text = fragment.decode("ASCII")
            shown[position] = text
        return "Seq(%r, length=%d)" % (shown, len(self))
    if len(data) <= 60:
        text = data.decode("ASCII")
        return f"{self.__class__.__name__}('{text}')"
    # Shows the last three letters as it is often useful to see if
    # there is a stop codon at the end of a sequence.
    # Note total length is 54+3+3=60
    head = data[:54].decode("ASCII")
    tail = data[-3:].decode("ASCII")
    return f"{self.__class__.__name__}('{head}...{tail}')"
|
jpayne@68
|
406
|
jpayne@68
|
407 def __str__(self):
|
jpayne@68
|
408 """Return the full sequence as a python string."""
|
jpayne@68
|
409 return self._data.decode("ASCII")
|
jpayne@68
|
410
|
jpayne@68
|
411 def __eq__(self, other):
|
jpayne@68
|
412 """Compare the sequence to another sequence or a string.
|
jpayne@68
|
413
|
jpayne@68
|
414 Sequences are equal to each other if their sequence contents is
|
jpayne@68
|
415 identical:
|
jpayne@68
|
416
|
jpayne@68
|
417 >>> from Bio.Seq import Seq, MutableSeq
|
jpayne@68
|
418 >>> seq1 = Seq("ACGT")
|
jpayne@68
|
419 >>> seq2 = Seq("ACGT")
|
jpayne@68
|
420 >>> mutable_seq = MutableSeq("ACGT")
|
jpayne@68
|
421 >>> seq1 == seq2
|
jpayne@68
|
422 True
|
jpayne@68
|
423 >>> seq1 == mutable_seq
|
jpayne@68
|
424 True
|
jpayne@68
|
425 >>> seq1 == "ACGT"
|
jpayne@68
|
426 True
|
jpayne@68
|
427
|
jpayne@68
|
428 Note that the sequence objects themselves are not identical to each
|
jpayne@68
|
429 other:
|
jpayne@68
|
430
|
jpayne@68
|
431 >>> id(seq1) == id(seq2)
|
jpayne@68
|
432 False
|
jpayne@68
|
433 >>> seq1 is seq2
|
jpayne@68
|
434 False
|
jpayne@68
|
435
|
jpayne@68
|
436 Sequences can also be compared to strings, ``bytes``, and ``bytearray``
|
jpayne@68
|
437 objects:
|
jpayne@68
|
438
|
jpayne@68
|
439 >>> seq1 == "ACGT"
|
jpayne@68
|
440 True
|
jpayne@68
|
441 >>> seq1 == b"ACGT"
|
jpayne@68
|
442 True
|
jpayne@68
|
443 >>> seq1 == bytearray(b"ACGT")
|
jpayne@68
|
444 True
|
jpayne@68
|
445 """
|
jpayne@68
|
446 if isinstance(other, _SeqAbstractBaseClass):
|
jpayne@68
|
447 return self._data == other._data
|
jpayne@68
|
448 elif isinstance(other, str):
|
jpayne@68
|
449 return self._data == other.encode("ASCII")
|
jpayne@68
|
450 else:
|
jpayne@68
|
451 return self._data == other
|
jpayne@68
|
452
|
jpayne@68
|
453 def __lt__(self, other):
|
jpayne@68
|
454 """Implement the less-than operand."""
|
jpayne@68
|
455 if isinstance(other, _SeqAbstractBaseClass):
|
jpayne@68
|
456 return self._data < other._data
|
jpayne@68
|
457 elif isinstance(other, str):
|
jpayne@68
|
458 return self._data < other.encode("ASCII")
|
jpayne@68
|
459 else:
|
jpayne@68
|
460 return self._data < other
|
jpayne@68
|
461
|
jpayne@68
|
462 def __le__(self, other):
|
jpayne@68
|
463 """Implement the less-than or equal operand."""
|
jpayne@68
|
464 if isinstance(other, _SeqAbstractBaseClass):
|
jpayne@68
|
465 return self._data <= other._data
|
jpayne@68
|
466 elif isinstance(other, str):
|
jpayne@68
|
467 return self._data <= other.encode("ASCII")
|
jpayne@68
|
468 else:
|
jpayne@68
|
469 return self._data <= other
|
jpayne@68
|
470
|
jpayne@68
|
471 def __gt__(self, other):
|
jpayne@68
|
472 """Implement the greater-than operand."""
|
jpayne@68
|
473 if isinstance(other, _SeqAbstractBaseClass):
|
jpayne@68
|
474 return self._data > other._data
|
jpayne@68
|
475 elif isinstance(other, str):
|
jpayne@68
|
476 return self._data > other.encode("ASCII")
|
jpayne@68
|
477 else:
|
jpayne@68
|
478 return self._data > other
|
jpayne@68
|
479
|
jpayne@68
|
480 def __ge__(self, other):
|
jpayne@68
|
481 """Implement the greater-than or equal operand."""
|
jpayne@68
|
482 if isinstance(other, _SeqAbstractBaseClass):
|
jpayne@68
|
483 return self._data >= other._data
|
jpayne@68
|
484 elif isinstance(other, str):
|
jpayne@68
|
485 return self._data >= other.encode("ASCII")
|
jpayne@68
|
486 else:
|
jpayne@68
|
487 return self._data >= other
|
jpayne@68
|
488
|
jpayne@68
|
489 def __len__(self):
|
jpayne@68
|
490 """Return the length of the sequence."""
|
jpayne@68
|
491 return len(self._data)
|
jpayne@68
|
492
|
jpayne@68
|
493 def __iter__(self):
|
jpayne@68
|
494 """Return an iterable of the sequence."""
|
jpayne@68
|
495 return self._data.decode("ASCII").__iter__()
|
jpayne@68
|
496
|
jpayne@68
|
497 @overload
|
jpayne@68
|
498 def __getitem__(self, index: int) -> str:
|
jpayne@68
|
499 ...
|
jpayne@68
|
500
|
jpayne@68
|
501 @overload
|
jpayne@68
|
502 def __getitem__(self, index: slice) -> "Seq":
|
jpayne@68
|
503 ...
|
jpayne@68
|
504
|
jpayne@68
|
505 def __getitem__(self, index):
|
jpayne@68
|
506 """Return a subsequence as a single letter or as a sequence object.
|
jpayne@68
|
507
|
jpayne@68
|
508 If the index is an integer, a single letter is returned as a Python
|
jpayne@68
|
509 string:
|
jpayne@68
|
510
|
jpayne@68
|
511 >>> seq = Seq('ACTCGACGTCG')
|
jpayne@68
|
512 >>> seq[5]
|
jpayne@68
|
513 'A'
|
jpayne@68
|
514
|
jpayne@68
|
515 Otherwise, a new sequence object of the same class is returned:
|
jpayne@68
|
516
|
jpayne@68
|
517 >>> seq[5:8]
|
jpayne@68
|
518 Seq('ACG')
|
jpayne@68
|
519 >>> mutable_seq = MutableSeq('ACTCGACGTCG')
|
jpayne@68
|
520 >>> mutable_seq[5:8]
|
jpayne@68
|
521 MutableSeq('ACG')
|
jpayne@68
|
522 """
|
jpayne@68
|
523 if isinstance(index, numbers.Integral):
|
jpayne@68
|
524 # Return a single letter as a string
|
jpayne@68
|
525 return chr(self._data[index])
|
jpayne@68
|
526 else:
|
jpayne@68
|
527 # Return the (sub)sequence as another Seq/MutableSeq object
|
jpayne@68
|
528 return self.__class__(self._data[index])
|
jpayne@68
|
529
|
jpayne@68
|
530 def __add__(self, other):
|
jpayne@68
|
531 """Add a sequence or string to this sequence.
|
jpayne@68
|
532
|
jpayne@68
|
533 >>> from Bio.Seq import Seq, MutableSeq
|
jpayne@68
|
534 >>> Seq("MELKI") + "LV"
|
jpayne@68
|
535 Seq('MELKILV')
|
jpayne@68
|
536 >>> MutableSeq("MELKI") + "LV"
|
jpayne@68
|
537 MutableSeq('MELKILV')
|
jpayne@68
|
538 """
|
jpayne@68
|
539 if isinstance(other, _SeqAbstractBaseClass):
|
jpayne@68
|
540 return self.__class__(self._data + other._data)
|
jpayne@68
|
541 elif isinstance(other, str):
|
jpayne@68
|
542 return self.__class__(self._data + other.encode("ASCII"))
|
jpayne@68
|
543 else:
|
jpayne@68
|
544 # If other is a SeqRecord, then SeqRecord's __radd__ will handle
|
jpayne@68
|
545 # this. If not, returning NotImplemented will trigger a TypeError.
|
jpayne@68
|
546 return NotImplemented
|
jpayne@68
|
547
|
jpayne@68
|
548 def __radd__(self, other):
|
jpayne@68
|
549 """Add a sequence string on the left.
|
jpayne@68
|
550
|
jpayne@68
|
551 >>> from Bio.Seq import Seq, MutableSeq
|
jpayne@68
|
552 >>> "LV" + Seq("MELKI")
|
jpayne@68
|
553 Seq('LVMELKI')
|
jpayne@68
|
554 >>> "LV" + MutableSeq("MELKI")
|
jpayne@68
|
555 MutableSeq('LVMELKI')
|
jpayne@68
|
556
|
jpayne@68
|
557 Adding two sequence objects is handled via the __add__ method.
|
jpayne@68
|
558 """
|
jpayne@68
|
559 if isinstance(other, str):
|
jpayne@68
|
560 return self.__class__(other.encode("ASCII") + self._data)
|
jpayne@68
|
561 else:
|
jpayne@68
|
562 return NotImplemented
|
jpayne@68
|
563
|
jpayne@68
|
564 def __mul__(self, other):
|
jpayne@68
|
565 """Multiply sequence by integer.
|
jpayne@68
|
566
|
jpayne@68
|
567 >>> from Bio.Seq import Seq, MutableSeq
|
jpayne@68
|
568 >>> Seq('ATG') * 2
|
jpayne@68
|
569 Seq('ATGATG')
|
jpayne@68
|
570 >>> MutableSeq('ATG') * 2
|
jpayne@68
|
571 MutableSeq('ATGATG')
|
jpayne@68
|
572 """
|
jpayne@68
|
573 if not isinstance(other, numbers.Integral):
|
jpayne@68
|
574 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
|
jpayne@68
|
575 # we would like to simply write
|
jpayne@68
|
576 # data = self._data * other
|
jpayne@68
|
577 # here, but currently that causes a bug on PyPy if self._data is a
|
jpayne@68
|
578 # bytearray and other is a numpy integer. Using this workaround:
|
jpayne@68
|
579 data = self._data.__mul__(other)
|
jpayne@68
|
580 return self.__class__(data)
|
jpayne@68
|
581
|
jpayne@68
|
582 def __rmul__(self, other):
|
jpayne@68
|
583 """Multiply integer by sequence.
|
jpayne@68
|
584
|
jpayne@68
|
585 >>> from Bio.Seq import Seq
|
jpayne@68
|
586 >>> 2 * Seq('ATG')
|
jpayne@68
|
587 Seq('ATGATG')
|
jpayne@68
|
588 """
|
jpayne@68
|
589 if not isinstance(other, numbers.Integral):
|
jpayne@68
|
590 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
|
jpayne@68
|
591 # we would like to simply write
|
jpayne@68
|
592 # data = self._data * other
|
jpayne@68
|
593 # here, but currently that causes a bug on PyPy if self._data is a
|
jpayne@68
|
594 # bytearray and other is a numpy integer. Using this workaround:
|
jpayne@68
|
595 data = self._data.__mul__(other)
|
jpayne@68
|
596 return self.__class__(data)
|
jpayne@68
|
597
|
jpayne@68
|
598 def __imul__(self, other):
|
jpayne@68
|
599 """Multiply the sequence object by other and assign.
|
jpayne@68
|
600
|
jpayne@68
|
601 >>> from Bio.Seq import Seq
|
jpayne@68
|
602 >>> seq = Seq('ATG')
|
jpayne@68
|
603 >>> seq *= 2
|
jpayne@68
|
604 >>> seq
|
jpayne@68
|
605 Seq('ATGATG')
|
jpayne@68
|
606
|
jpayne@68
|
607 Note that this is different from in-place multiplication. The ``seq``
|
jpayne@68
|
608 variable is reassigned to the multiplication result, but any variable
|
jpayne@68
|
609 pointing to ``seq`` will remain unchanged:
|
jpayne@68
|
610
|
jpayne@68
|
611 >>> seq = Seq('ATG')
|
jpayne@68
|
612 >>> seq2 = seq
|
jpayne@68
|
613 >>> id(seq) == id(seq2)
|
jpayne@68
|
614 True
|
jpayne@68
|
615 >>> seq *= 2
|
jpayne@68
|
616 >>> seq
|
jpayne@68
|
617 Seq('ATGATG')
|
jpayne@68
|
618 >>> seq2
|
jpayne@68
|
619 Seq('ATG')
|
jpayne@68
|
620 >>> id(seq) == id(seq2)
|
jpayne@68
|
621 False
|
jpayne@68
|
622 """
|
jpayne@68
|
623 if not isinstance(other, numbers.Integral):
|
jpayne@68
|
624 raise TypeError(f"can't multiply {self.__class__.__name__} by non-int type")
|
jpayne@68
|
625 # we would like to simply write
|
jpayne@68
|
626 # data = self._data * other
|
jpayne@68
|
627 # here, but currently that causes a bug on PyPy if self._data is a
|
jpayne@68
|
628 # bytearray and other is a numpy integer. Using this workaround:
|
jpayne@68
|
629 data = self._data.__mul__(other)
|
jpayne@68
|
630 return self.__class__(data)
|
jpayne@68
|
631
|
jpayne@68
|
def count(self, sub, start=None, end=None):
    """Return a non-overlapping count, like that of a python string.

    Counts how often the substring argument sub occurs in the
    (sub)sequence given by [start:end], and returns that number as an
    integer. Optional arguments start and end are interpreted as in
    slice notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count("A"))
    5
    >>> print(my_seq.count("ATG"))
    1
    >>> print(my_seq.count(Seq("AT")))
    1
    >>> print(my_seq.count("AT", 2, -1))
    1

    HOWEVER, please note because the ``count`` method of Seq and MutableSeq
    objects, like that of Python strings, do a non-overlapping search, this
    may not give the answer you expect:

    >>> "AAAA".count("AA")
    2
    >>> print(Seq("AAAA").count("AA"))
    2

    For an overlapping search, use the ``count_overlap`` method:

    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    """
    # Normalize the search term to a bytes-like object first.
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.count(needle, start, end)
|
jpayne@68
|
684
|
jpayne@68
|
def count_overlap(self, sub, start=None, end=None):
    """Return the number of occurrences of sub, counting overlapping matches.

    Every position inside the slice [start:end] of the sequence at which
    ``sub`` begins is counted, so matches sharing letters are each counted.
    Optional arguments start and end are interpreted as in slice notation.

    Arguments:
     - sub - a string, Seq, MutableSeq, bytes, or bytearray object to
       look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a TypeError if ``sub`` is of any other type.

    e.g.

    >>> from Bio.Seq import Seq
    >>> print(Seq("AAAA").count_overlap("AA"))
    3
    >>> print(Seq("ATATATATA").count_overlap("ATA"))
    4
    >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1))
    1

    For a non-overlapping search, use the ``count`` method:

    >>> print(Seq("AAAA").count("AA"))
    2

    Where substrings do not overlap, ``count_overlap`` behaves the same as
    the ``count`` method:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print(my_seq.count_overlap("A"))
    5
    >>> my_seq.count_overlap("A") == my_seq.count("A")
    True
    >>> print(my_seq.count_overlap("ATG"))
    1
    >>> my_seq.count_overlap("ATG") == my_seq.count("ATG")
    True
    >>> print(my_seq.count_overlap(Seq("AT")))
    1
    >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT"))
    True
    >>> print(my_seq.count_overlap("AT", 2, -1))
    1
    >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1)
    True

    HOWEVER, do not use this method for such cases because the
    count() method is much more efficient.
    """
    if isinstance(sub, MutableSeq):
        pattern = sub._data
    elif isinstance(sub, Seq):
        pattern = bytes(sub)
    elif isinstance(sub, str):
        pattern = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        pattern = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    haystack = self._data
    hits = 0
    position = haystack.find(pattern, start, end)
    while position != -1:
        hits += 1
        # Resume one letter after the previous hit so overlaps are seen.
        position = haystack.find(pattern, position + 1, end)
    return hits
|
jpayne@68
|
757
|
jpayne@68
|
def __contains__(self, item):
    """Return True if item occurs within the sequence, and False otherwise.

    Seq and MutableSeq items are converted to bytes and strings are
    encoded as ASCII before the membership test on the sequence data; any
    other object is tested as given.

    e.g.

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_dna = Seq("ATATGAAATTTGAAAA")
    >>> "AAA" in my_dna
    True
    >>> Seq("AAA") in my_dna
    True
    >>> MutableSeq("AAA") in my_dna
    True
    """
    if isinstance(item, _SeqAbstractBaseClass):
        return bytes(item) in self._data
    if isinstance(item, str):
        return item.encode("ASCII") in self._data
    return item in self._data
|
jpayne@68
|
777
|
jpayne@68
|
def find(self, sub, start=None, end=None):
    """Return the lowest index at which subsequence sub occurs, or -1.

    With optional arguments start and end, only matches contained within
    the sequence region [start:end] are considered.

    Arguments:
     - sub - a string, Seq, MutableSeq, bytes, or bytearray object to
       search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found. Raises a TypeError if
    ``sub`` is of any other type.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.find("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.find("AUG", 4)
    15

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, _SeqAbstractBaseClass):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.find(needle, start, end)
|
jpayne@68
|
818
|
jpayne@68
|
def rfind(self, sub, start=None, end=None):
    """Return the highest index at which subsequence sub occurs, or -1.

    With optional arguments start and end, only matches contained within
    the sequence region [start:end] are considered.

    Arguments:
     - sub - a string, Seq, MutableSeq, bytes, or bytearray object to
       search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found. Raises a TypeError if
    ``sub`` is of any other type.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rfind("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rfind("AUG", end=15)
    3

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, _SeqAbstractBaseClass):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rfind(needle, start, end)
|
jpayne@68
|
859
|
jpayne@68
|
def index(self, sub, start=None, end=None):
    """Return the lowest index at which subsequence sub occurs.

    With optional arguments start and end, only matches contained within
    the sequence region [start:end] are considered.

    Arguments:
     - sub - a string, Seq, MutableSeq, bytes, or bytearray object to
       search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError
    if ``sub`` is of any other type.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.index("AUG")
    3

    The next typical start codon can then be found by starting the search
    at position 4:

    >>> my_rna.index("AUG", 4)
    15

    This method performs the same search as the ``find`` method. However,
    if the subsequence is not found, ``find`` returns -1 while ``index``
    raises a ValueError:

    >>> my_rna.index("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.find("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.index(needle, start, end)
|
jpayne@68
|
913
|
jpayne@68
|
def rindex(self, sub, start=None, end=None):
    """Return the highest index at which subsequence sub occurs.

    With optional arguments start and end, only matches contained within
    the sequence region [start:end] are considered.

    Arguments:
     - sub - a string, Seq, MutableSeq, bytes, or bytearray object to
       search for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a ValueError if the subsequence is NOT found, and a TypeError
    if ``sub`` is of any other type.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rindex("AUG")
    15

    The location of the typical start codon before that can be found by
    ending the search at position 15:

    >>> my_rna.rindex("AUG", end=15)
    3

    This method performs the same search as the ``rfind`` method. However,
    if the subsequence is not found, ``rfind`` returns -1 while ``rindex``
    raises a ValueError:

    >>> my_rna.rindex("T")
    Traceback (most recent call last):
                   ...
    ValueError: ...
    >>> my_rna.rfind("T")
    -1

    See the ``search`` method to find the locations of multiple subsequences
    at the same time.
    """
    if isinstance(sub, MutableSeq):
        needle = sub._data
    elif isinstance(sub, Seq):
        needle = bytes(sub)
    elif isinstance(sub, str):
        needle = sub.encode("ASCII")
    elif isinstance(sub, (bytes, bytearray)):
        needle = sub
    else:
        raise TypeError(
            "a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
            % type(sub)
        )
    return self._data.rindex(needle, start, end)
|
jpayne@68
|
967
|
jpayne@68
|
def search(self, subs):
    """Search the substrings subs in self and yield the index and substring found.

    This is a generator: each match is yielded as a tuple (index, substring)
    where substring is a Python string. Matches are reported in order of
    increasing index; every start position in the sequence, including the
    last one, is examined.

    Arguments:
     - subs - a list of strings, Seq, MutableSeq, bytes, or bytearray
       objects containing the substrings to search for.

    Raises a TypeError (when the generator is first advanced) if any
    element of ``subs`` is of another type.

    >>> from Bio.Seq import Seq
    >>> dna = Seq("GTCATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTTG")
    >>> matches = dna.search(["CC", Seq("ATTG"), "ATTG", Seq("CCC")])
    >>> for index, substring in matches:
    ...     print(index, substring)
    ...
    7 CC
    9 ATTG
    20 CC
    34 CC
    34 CCC
    35 CC

    A single-letter substring at the very end of the sequence is found too:

    >>> list(Seq("ACGT").search(["T"]))
    [(3, 'T')]
    """
    # Group the (de-duplicated) substrings by length so that only one slice
    # of the sequence is needed per start position and length.
    by_length = collections.defaultdict(set)
    for index, sub in enumerate(subs):
        if isinstance(sub, str):
            sub = sub.encode("ASCII")
        elif not isinstance(sub, bytes):
            if not isinstance(sub, (_SeqAbstractBaseClass, bytearray)):
                raise TypeError(
                    "subs[%d]: a Seq, MutableSeq, str, bytes, or bytearray object is required, not '%s'"
                    % (index, type(sub))
                )
            sub = bytes(sub)
        by_length[len(sub)].add(sub)
    data = self._data
    # Every index is a candidate start; stopping at len(self) - 1 would miss
    # a one-letter substring sitting on the final position.
    for start in range(len(self)):
        for length, candidates in by_length.items():
            window = data[start : start + length]
            for candidate in candidates:
                if window == candidate:
                    yield (start, candidate.decode())
                    # Candidates of one length are distinct; at most one fits.
                    break
|
jpayne@68
|
1008
|
jpayne@68
|
def startswith(self, prefix, start=None, end=None):
    """Return True if the sequence starts with the given prefix, False otherwise.

    Return True if the sequence starts with the specified prefix
    (a string, a Seq or MutableSeq object, or a bytes-like object), False
    otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    prefix can also be a tuple of such objects to try; strings, sequence
    objects and bytes-like objects may be mixed within the tuple. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.startswith("GUC")
    True
    >>> my_rna.startswith("AUG")
    False
    >>> my_rna.startswith("AUG", 3)
    True
    >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1)
    True
    >>> my_rna.startswith(("UCC", b"UCA"), 1)
    True

    An argument of any other type is handed to the underlying sequence
    data unchanged, which rejects it with a TypeError.
    """

    def _to_bytes(item):
        """Convert one prefix candidate to a bytes-like object."""
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            # Already usable as-is; must not be .encode()d like a string.
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        return item

    if isinstance(prefix, tuple):
        prefix = tuple(_to_bytes(candidate) for candidate in prefix)
    else:
        prefix = _to_bytes(prefix)
    return self._data.startswith(prefix, start, end)
|
jpayne@68
|
1039
|
jpayne@68
|
def endswith(self, suffix, start=None, end=None):
    """Return True if the sequence ends with the given suffix, False otherwise.

    Return True if the sequence ends with the specified suffix
    (a string, a Seq or MutableSeq object, or a bytes-like object), False
    otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    suffix can also be a tuple of such objects to try; strings, sequence
    objects and bytes-like objects may be mixed within the tuple. e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.endswith("UUG")
    True
    >>> my_rna.endswith("AUG")
    False
    >>> my_rna.endswith("AUG", 0, 18)
    True
    >>> my_rna.endswith(("UCC", "UCA", "UUG"))
    True
    >>> my_rna.endswith(("UCC", b"UUG"))
    True

    An argument of any other type is handed to the underlying sequence
    data unchanged, which rejects it with a TypeError.
    """

    def _to_bytes(item):
        """Convert one suffix candidate to a bytes-like object."""
        if isinstance(item, str):
            return item.encode("ASCII")
        if isinstance(item, (bytes, bytearray)):
            # Already usable as-is; must not be .encode()d like a string.
            return item
        if isinstance(item, _SeqAbstractBaseClass):
            return bytes(item)
        return item

    if isinstance(suffix, tuple):
        suffix = tuple(_to_bytes(candidate) for candidate in suffix)
    else:
        suffix = _to_bytes(suffix)
    return self._data.endswith(suffix, start, end)
|
jpayne@68
|
1070
|
jpayne@68
|
def split(self, sep=None, maxsplit=-1):
    """Return a list of subsequences when splitting the sequence by separator sep.

    The sequence is cut at each occurrence of the delimiter ``sep`` and the
    pieces are returned as a list of Seq objects. If maxsplit is given, at
    most maxsplit splits are done. If maxsplit is omitted, all splits are
    made.

    For consistency with the ``split`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.split("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')

    See also the rsplit method, which splits the sequence starting from the
    end:

    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')
    """
    if isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    elif isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    else:
        delimiter = sep
    pieces = self._data.split(delimiter, maxsplit)
    return list(map(Seq, pieces))
|
jpayne@68
|
1113
|
jpayne@68
|
def rsplit(self, sep=None, maxsplit=-1):
    """Return a list of subsequences by splitting the sequence from the right.

    The sequence is cut at each occurrence of the delimiter ``sep`` and the
    pieces are returned as a list of Seq objects. If maxsplit is given, at
    most maxsplit splits are done, working from the end of the sequence.
    If maxsplit is omitted, all splits are made.

    For consistency with the ``rsplit`` method of Python strings, any
    whitespace (tabs, spaces, newlines) is a separator if sep is None, the
    default value.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L')
    >>> for pep in my_aa.rsplit("*"):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR')
    Seq('L')
    >>> for pep in my_aa.rsplit("*", 1):
    ...     pep
    Seq('VMAIVMGR*KGAR')
    Seq('L')

    See also the split method, which splits the sequence starting from the
    beginning:

    >>> for pep in my_aa.split("*", 1):
    ...     pep
    Seq('VMAIVMGR')
    Seq('KGAR*L')
    """
    if isinstance(sep, _SeqAbstractBaseClass):
        delimiter = bytes(sep)
    elif isinstance(sep, str):
        delimiter = sep.encode("ASCII")
    else:
        delimiter = sep
    pieces = self._data.rsplit(delimiter, maxsplit)
    return list(map(Seq, pieces))
|
jpayne@68
|
1156
|
jpayne@68
|
def strip(self, chars=None, inplace=False):
    """Return a sequence object with leading and trailing ends stripped.

    With default arguments, leading and trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.strip()
    Seq('ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars`` are
    removed instead. ``chars`` is treated as a set of characters, so their
    order is not important:

    >>> Seq("ACGTACGT").strip("TGCA")
    Seq('')

    With ``inplace=False`` (the default) the sequence itself is left
    untouched and a stripped copy is returned. With ``inplace=True`` the
    sequence is stripped in-place and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.strip()
    MutableSeq('ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.strip(inplace=True)
    MutableSeq('ACGT')
    >>> seq
    MutableSeq('ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if ``strip``
    is called on a ``Seq`` object with ``inplace=True``. A ``TypeError`` is
    also raised if ``chars`` is not None, a string, a Seq, a MutableSeq, or
    a bytes-like object.

    See also the lstrip and rstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        unwanted = bytes(chars)
    elif isinstance(chars, str):
        unwanted = chars.encode("ASCII")
    else:
        unwanted = chars
    try:
        stripped = self._data.strip(unwanted)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequence data can be edited in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
|
jpayne@68
|
1210
|
jpayne@68
|
def lstrip(self, chars=None, inplace=False):
    """Return a sequence object with the leading end stripped.

    With default arguments, leading whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.lstrip()
    Seq('ACGT ')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars`` are
    removed from the leading end instead. ``chars`` is treated as a set of
    characters, so their order is not important:

    >>> Seq("ACGACGTTACG").lstrip("GCA")
    Seq('TTACG')

    With ``inplace=False`` (the default) the sequence itself is left
    untouched and a stripped copy is returned. With ``inplace=True`` the
    sequence is stripped in-place and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.lstrip()
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.lstrip(inplace=True)
    MutableSeq('ACGT ')
    >>> seq
    MutableSeq('ACGT ')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lstrip`` is called on a ``Seq`` object with ``inplace=True``. A
    ``TypeError`` is also raised if ``chars`` is not None, a string, a Seq,
    a MutableSeq, or a bytes-like object.

    See also the strip and rstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        unwanted = bytes(chars)
    elif isinstance(chars, str):
        unwanted = chars.encode("ASCII")
    else:
        unwanted = chars
    try:
        stripped = self._data.lstrip(unwanted)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequence data can be edited in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
|
jpayne@68
|
1265
|
jpayne@68
|
def rstrip(self, chars=None, inplace=False):
    """Return a sequence object with the trailing end stripped.

    With default arguments, trailing whitespace is removed:

    >>> seq = Seq(" ACGT ")
    >>> seq.rstrip()
    Seq(' ACGT')
    >>> seq
    Seq(' ACGT ')

    If ``chars`` is given and not ``None``, the characters in ``chars`` are
    removed from the trailing end instead. ``chars`` is treated as a set of
    characters, so their order is not important:

    >>> Seq("ACGACGTTACG").rstrip("GCA")
    Seq('ACGACGTT')

    With ``inplace=False`` (the default) the sequence itself is left
    untouched and a stripped copy is returned. With ``inplace=True`` the
    sequence is stripped in-place and returned.

    >>> seq = MutableSeq(" ACGT ")
    >>> seq.rstrip()
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT ')
    >>> seq.rstrip(inplace=True)
    MutableSeq(' ACGT')
    >>> seq
    MutableSeq(' ACGT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``rstrip`` is called on a ``Seq`` object with ``inplace=True``. A
    ``TypeError`` is also raised if ``chars`` is not None, a string, a Seq,
    a MutableSeq, or a bytes-like object.

    See also the strip and lstrip methods.
    """
    if isinstance(chars, _SeqAbstractBaseClass):
        unwanted = bytes(chars)
    elif isinstance(chars, str):
        unwanted = chars.encode("ASCII")
    else:
        unwanted = chars
    try:
        stripped = self._data.rstrip(unwanted)
    except TypeError:
        raise TypeError(
            "argument must be None or a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    if not inplace:
        return self.__class__(stripped)
    # Only bytearray-backed (mutable) sequence data can be edited in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = stripped
    return self
|
jpayne@68
|
1320
|
jpayne@68
|
def removeprefix(self, prefix, inplace=False):
    """Return a new Seq object with prefix (left) removed.

    This behaves like the python string method of the same name: if the
    sequence starts with ``prefix`` that part is dropped, otherwise the
    sequence is returned unchanged. ``prefix`` may be a string, a Seq or
    MutableSeq object, or a bytes-like object.

    e.g. Removing a start Codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("ATGGTGTGTGT")
    >>> my_seq
    Seq('ATGGTGTGTGT')
    >>> my_seq.removeprefix('ATG')
    Seq('GTGTGTGT')

    With ``inplace=False`` (the default) a new sequence object is returned;
    with ``inplace=True`` the sequence is modified in-place and returned.
    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removeprefix`` is called on a ``Seq`` object with ``inplace=True``.

    See also the removesuffix method.
    """
    if isinstance(prefix, _SeqAbstractBaseClass):
        lead = bytes(prefix)
    elif isinstance(prefix, str):
        lead = prefix.encode("ASCII")
    else:
        lead = prefix
    try:
        trimmed = self._data.removeprefix(lead)
    except TypeError:
        raise TypeError(
            "argument must be a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    except AttributeError:
        # Fall back for pre-Python 3.9
        trimmed = self._data
        if trimmed.startswith(lead):
            trimmed = trimmed[len(lead) :]
    if not inplace:
        return self.__class__(trimmed)
    # Only bytearray-backed (mutable) sequence data can be edited in place.
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = trimmed
    return self
|
jpayne@68
|
1362
|
jpayne@68
|
def removesuffix(self, suffix, inplace=False):
    """Return a new Seq object with suffix (right) removed.

    This behaves like the python string method of the same name: if the
    sequence ends with ``suffix`` that part is dropped, otherwise (and in
    particular for an empty suffix) the sequence is returned unchanged.
    ``suffix`` may be a string, a Seq or MutableSeq object, or a bytes-like
    object.

    e.g. Removing a stop codon:

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("GTGTGTGTTAG")
    >>> my_seq
    Seq('GTGTGTGTTAG')
    >>> stop_codon = Seq("TAG")
    >>> my_seq.removesuffix(stop_codon)
    Seq('GTGTGTGT')
    >>> my_seq.removesuffix("")
    Seq('GTGTGTGTTAG')

    With ``inplace=False`` (the default) a new sequence object is returned;
    with ``inplace=True`` the sequence is modified in-place and returned.
    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``removesuffix`` is called on a ``Seq`` object with ``inplace=True``.

    See also the removeprefix method.
    """
    if isinstance(suffix, str):
        suffix = suffix.encode("ASCII")
    elif isinstance(suffix, _SeqAbstractBaseClass):
        suffix = bytes(suffix)
    try:
        data = self._data.removesuffix(suffix)
    except TypeError:
        raise TypeError(
            "argument must be a string, Seq, MutableSeq, or bytes-like object"
        ) from None
    except AttributeError:
        # Fall back for pre-Python 3.9
        data = self._data
        # An empty suffix must be a no-op: data[:-0] is data[:0], which
        # would otherwise discard the whole sequence.
        if suffix and data.endswith(suffix):
            data = data[: -len(suffix)]
    if inplace:
        # Only bytearray-backed (mutable) sequence data can be edited in place.
        if not isinstance(self._data, bytearray):
            raise TypeError("Sequence is immutable")
        self._data[:] = data
        return self
    else:
        return self.__class__(data)
|
jpayne@68
|
1405
|
jpayne@68
|
def upper(self, inplace=False):
    """Convert the sequence to upper case.

    With the default ``inplace=False`` the sequence itself is left
    untouched and an upper-case copy of the same class is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With ``inplace=True`` the sequence is modified in place and returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``upper`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``lower`` method.
    """
    uppercased = self._data.upper()
    if not inplace:
        return self.__class__(uppercased)
    # Only bytearray-backed (mutable) sequences may be overwritten.
    if isinstance(self._data, bytearray):
        self._data[:] = uppercased
        return self
    raise TypeError("Sequence is immutable")
|
jpayne@68
|
1457
|
jpayne@68
|
def lower(self, inplace=False):
    """Convert the sequence to lower case.

    With the default ``inplace=False`` the sequence itself is left
    untouched and a lower-case copy of the same class is returned:

    >>> from Bio.Seq import Seq, MutableSeq
    >>> my_seq = Seq("VHLTPeeK*")
    >>> my_seq
    Seq('VHLTPeeK*')
    >>> my_seq.lower()
    Seq('vhltpeek*')
    >>> my_seq.upper()
    Seq('VHLTPEEK*')
    >>> my_seq
    Seq('VHLTPeeK*')

    With ``inplace=True`` the sequence is modified in place and returned:

    >>> my_seq = MutableSeq("VHLTPeeK*")
    >>> my_seq
    MutableSeq('VHLTPeeK*')
    >>> my_seq.lower()
    MutableSeq('vhltpeek*')
    >>> my_seq.upper()
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPeeK*')

    >>> my_seq.lower(inplace=True)
    MutableSeq('vhltpeek*')
    >>> my_seq
    MutableSeq('vhltpeek*')
    >>> my_seq.upper(inplace=True)
    MutableSeq('VHLTPEEK*')
    >>> my_seq
    MutableSeq('VHLTPEEK*')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``lower`` is called on a ``Seq`` object with ``inplace=True``.

    See also the ``upper`` method.
    """
    lowercased = self._data.lower()
    if not inplace:
        return self.__class__(lowercased)
    # Only bytearray-backed (mutable) sequences may be overwritten.
    if isinstance(self._data, bytearray):
        self._data[:] = lowercased
        return self
    raise TypeError("Sequence is immutable")
|
jpayne@68
|
1509
|
jpayne@68
|
def isupper(self):
    """Report whether all cased ASCII characters in the data are uppercase.

    False is returned when the data contains no cased characters at all.
    """
    sequence_data = self._data
    return sequence_data.isupper()
|
jpayne@68
|
1516
|
jpayne@68
|
def islower(self):
    """Report whether all cased ASCII characters in the data are lowercase.

    False is returned when the data contains no cased characters at all.
    """
    sequence_data = self._data
    return sequence_data.islower()
|
jpayne@68
|
1523
|
jpayne@68
|
def translate(
    self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-"
):
    """Turn a nucleotide sequence into a protein sequence by creating a new sequence object.

    This method will translate DNA or RNA sequences. It should not
    be used on protein sequences as any result will be biologically
    meaningless.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable
       object (useful for non-standard genetic codes).  This
       defaults to the "Standard" table.
     - stop_symbol - Single character string, what to use for
       terminators.  This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons (translated as the
       specified stop_symbol).  If True, translation is terminated at
       the first in frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
       this checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to the minus sign.

    A ``Seq`` object is returned if ``translate`` is called on a ``Seq``
    object; a ``MutableSeq`` object is returned if ``translate`` is called
    on a ``MutableSeq`` object.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*')
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@')
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR')

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*')
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR')

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR')

    It isn't a valid CDS under NCBI table 1, due to both the start codon
    and also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    Bio.Data.CodonTable.TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR')
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR')

    If the sequence contents cannot be converted to a string because they
    are undefined, an undefined ``Seq`` of one third the length (rounded
    down) is returned; a ``BiopythonWarning`` is issued if the length is
    not a multiple of three.

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead
    """
    try:
        data = str(self)
    except UndefinedSequenceError:
        # translating an undefined sequence yields an undefined
        # sequence with the length divided by 3
        n = len(self)
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "This may become an error in future.",
                BiopythonWarning,
            )
        return Seq(None, n // 3)

    # Reuse the string obtained above instead of converting the sequence
    # to a string a second time.
    return self.__class__(
        _translate_str(data, table, stop_symbol, to_stop, cds, gap=gap)
    )
|
jpayne@68
|
1623
|
jpayne@68
|
def complement(self, inplace=False):
    """Return the complement of the sequence, expressed as DNA.

    >>> Seq("CGA").complement()
    Seq('GCT')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").complement()
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement()
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement(inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length, so the object itself can be handed back
        return self
    if not inplace:
        return self.__class__(complemented)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
|
jpayne@68
|
1671
|
jpayne@68
|
def complement_rna(self, inplace=False):
    """Return the complement of the sequence, expressed as RNA.

    >>> Seq("CGA").complement_rna()
    Seq('GCU')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").complement_rna()
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence by default:

    >>> Seq("CGA").complement()
    Seq('GCT')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.complement_rna()
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.complement_rna(inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``complement_rna`` is called on a ``Seq`` object with ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # complement of an undefined sequence is an undefined sequence
        # of the same length, so the object itself can be handed back
        return self
    if not inplace:
        return self.__class__(complemented)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = complemented
    return self
|
jpayne@68
|
1718
|
jpayne@68
|
def reverse_complement(self, inplace=False):
    """Return the reverse complement of the sequence, expressed as DNA.

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    Any U in the sequence is treated as a T:

    >>> Seq("CGAUT").reverse_complement()
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement()
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement(inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_dna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Overwrite the whole buffer with the complemented letters in reverse.
    self._data[:] = complemented[::-1]
    return self
|
jpayne@68
|
1766
|
jpayne@68
|
def reverse_complement_rna(self, inplace=False):
    """Return the reverse complement of the sequence, expressed as RNA.

    >>> Seq("CGA").reverse_complement_rna()
    Seq('UCG')

    Any T in the sequence is treated as a U:

    >>> Seq("CGAUT").reverse_complement_rna()
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> Seq("CGA").reverse_complement()
    Seq('TCG')

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> my_seq
    MutableSeq('CGA')
    >>> my_seq.reverse_complement_rna()
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    >>> my_seq.reverse_complement_rna(inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``reverse_complement_rna`` is called on a ``Seq`` object with
    ``inplace=True``.
    """
    try:
        complemented = self._data.translate(_rna_complement_table)
    except UndefinedSequenceError:
        # reverse complement of an undefined sequence is an undefined sequence
        # of the same length
        return self
    if not inplace:
        return self.__class__(complemented[::-1])
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    # Overwrite the whole buffer with the complemented letters in reverse.
    self._data[:] = complemented[::-1]
    return self
|
jpayne@68
|
1814
|
jpayne@68
|
def transcribe(self, inplace=False):
    """Transcribe a DNA sequence into RNA and return the RNA sequence as a new Seq object.

    Following the usual convention, the sequence is interpreted as the
    coding strand of the DNA double helix, not the template strand. This
    means we can get the RNA sequence just by switching T to U.

    >>> from Bio.Seq import Seq
    >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> coding_dna.transcribe()
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence.transcribe()
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    >>> sequence.transcribe(inplace=True)
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``transcribe`` is called on a ``Seq`` object with ``inplace=True``.

    Trying to transcribe an RNA sequence has no effect.
    If you have a nucleotide sequence which might be DNA or RNA
    (or even a mixture), calling the transcribe method will ensure
    any T becomes U.

    Trying to transcribe a protein sequence will replace any
    T for Threonine with U for Selenocysteine, which has no
    biologically plausible rationale.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRT")
    >>> my_protein.transcribe()
    Seq('MAIVMGRU')
    """
    # Swap upper-case T first, then lower-case t, preserving letter case.
    upper_swapped = self._data.replace(b"T", b"U")
    rna = upper_swapped.replace(b"t", b"u")
    if not inplace:
        return self.__class__(rna)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = rna
    return self
|
jpayne@68
|
1868
|
jpayne@68
|
def back_transcribe(self, inplace=False):
    """Return the DNA sequence from an RNA sequence by creating a new Seq object.

    >>> from Bio.Seq import Seq
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    The sequence is modified in-place and returned if inplace is True:

    >>> sequence = MutableSeq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
    >>> sequence.back_transcribe()
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

    >>> sequence.back_transcribe(inplace=True)
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
    >>> sequence
    MutableSeq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``back_transcribe`` is called on a ``Seq`` object with
    ``inplace=True``.

    Trying to back-transcribe DNA has no effect. If you have a nucleotide
    sequence which might be DNA or RNA (or even a mixture), calling the
    back-transcribe method will ensure any U becomes T.

    Trying to back-transcribe a protein sequence will replace any U for
    Selenocysteine with T for Threonine, which is biologically meaningless.

    >>> from Bio.Seq import Seq
    >>> my_protein = Seq("MAIVMGRU")
    >>> my_protein.back_transcribe()
    Seq('MAIVMGRT')
    """
    # Swap upper-case U first, then lower-case u, preserving letter case.
    upper_swapped = self._data.replace(b"U", b"T")
    dna = upper_swapped.replace(b"u", b"t")
    if not inplace:
        return self.__class__(dna)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = dna
    return self
|
jpayne@68
|
1916
|
jpayne@68
|
def join(self, other):
    """Return a merge of the sequences in other, spaced by the sequence from self.

    Accepts a Seq object, MutableSeq object, or string (and iterates over
    the letters), or an iterable containing Seq, MutableSeq, or string
    objects. These arguments will be concatenated with the calling sequence
    as the spacer:

    >>> concatenated = Seq('NNNNN').join([Seq("AAA"), Seq("TTT"), Seq("PPP")])
    >>> concatenated
    Seq('AAANNNNNTTTNNNNNPPP')

    Joining the letters of a single sequence:

    >>> Seq('NNNNN').join(Seq("ACGT"))
    Seq('ANNNNNCNNNNNGNNNNNT')
    >>> Seq('NNNNN').join("ACGT")
    Seq('ANNNNNCNNNNNGNNNNNT')

    The iterable may be single-pass (for example a generator); it is
    consumed exactly once.

    A ``TypeError`` is raised if ``other`` is a SeqRecord, or if any
    element of the iterable is not a Seq object, MutableSeq object, or
    string.
    """
    if isinstance(other, _SeqAbstractBaseClass):
        return self.__class__(str(self).join(str(other)))
    elif isinstance(other, str):
        return self.__class__(str(self).join(other))

    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(other, SeqRecord):
        raise TypeError("Iterable cannot be a SeqRecord")

    # Materialise the iterable once: validating and then joining straight
    # from ``other`` would iterate it twice, so a generator or iterator
    # would already be exhausted when joined and yield an empty result.
    items = list(other)
    for item in items:
        if isinstance(item, SeqRecord):
            raise TypeError("Iterable cannot contain SeqRecords")
        elif not isinstance(item, (str, _SeqAbstractBaseClass)):
            raise TypeError(
                "Input must be an iterable of Seq objects, MutableSeq objects, or strings"
            )
    return self.__class__(str(self).join([str(item) for item in items]))
|
jpayne@68
|
1954
|
jpayne@68
|
def replace(self, old, new, inplace=False):
    """Return a copy with all occurrences of subsequence old replaced by new.

    >>> s = Seq("ACGTAACCGGTT")
    >>> t = s.replace("AC", "XYZ")
    >>> s
    Seq('ACGTAACCGGTT')
    >>> t
    Seq('XYZGTAXYZCGGTT')

    For mutable sequences, passing inplace=True will modify the sequence in place:

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ")
    >>> m
    MutableSeq('ACGTAACCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    >>> m = MutableSeq("ACGTAACCGGTT")
    >>> t = m.replace("AC", "XYZ", inplace=True)
    >>> m
    MutableSeq('XYZGTAXYZCGGTT')
    >>> t
    MutableSeq('XYZGTAXYZCGGTT')

    As ``Seq`` objects are immutable, a ``TypeError`` is raised if
    ``replace`` is called on a ``Seq`` object with ``inplace=True``.
    """

    def as_bytes(value):
        # Sequence objects and strings are converted to bytes; anything
        # else is passed through untouched for the data object to handle.
        if isinstance(value, _SeqAbstractBaseClass):
            return bytes(value)
        if isinstance(value, str):
            return value.encode("ASCII")
        return value

    old_bytes = as_bytes(old)
    new_bytes = as_bytes(new)
    replaced = self._data.replace(old_bytes, new_bytes)
    if not inplace:
        return self.__class__(replaced)
    if not isinstance(self._data, bytearray):
        raise TypeError("Sequence is immutable")
    self._data[:] = replaced
    return self
|
jpayne@68
|
1999
|
jpayne@68
|
@property
def defined(self):
    """Return True if the sequence is defined, False if undefined or partially defined.

    Sequences backed by ``bytes`` or ``bytearray`` are always defined; for
    any other kind of sequence data the answer is taken from the data
    object's own ``defined`` attribute.

    Zero-length sequences are always considered to be defined.
    """
    if not isinstance(self._data, (bytes, bytearray)):
        return self._data.defined
    return True
|
jpayne@68
|
2010
|
jpayne@68
|
@property
def defined_ranges(self):
    """Return a tuple of the ranges where the sequence contents is defined.

    The return value has the format ((start1, end1), (start2, end2), ...).

    A sequence backed by ``bytes`` or ``bytearray`` yields one range that
    spans the whole sequence, or an empty tuple if its length is zero; for
    any other kind of sequence data the ranges are taken from the data
    object's own ``defined_ranges`` attribute.
    """
    if not isinstance(self._data, (bytes, bytearray)):
        return self._data.defined_ranges
    length = len(self)
    return ((0, length),) if length > 0 else ()
|
jpayne@68
|
2025
|
jpayne@68
|
2026
|
jpayne@68
|
2027 class Seq(_SeqAbstractBaseClass):
|
jpayne@68
|
2028 """Read-only sequence object (essentially a string with biological methods).
|
jpayne@68
|
2029
|
jpayne@68
|
2030 Like normal python strings, our basic sequence object is immutable.
|
jpayne@68
|
2031 This prevents you from doing my_seq[5] = "A" for example, but does allow
|
jpayne@68
|
2032 Seq objects to be used as dictionary keys.
|
jpayne@68
|
2033
|
jpayne@68
|
2034 The Seq object provides a number of string like methods (such as count,
|
jpayne@68
|
2035 find, split and strip).
|
jpayne@68
|
2036
|
jpayne@68
|
2037 The Seq object also provides some biological methods, such as complement,
|
jpayne@68
|
2038 reverse_complement, transcribe, back_transcribe and translate (which are
|
jpayne@68
|
2039 not applicable to protein sequences).
|
jpayne@68
|
2040 """
|
jpayne@68
|
2041
|
jpayne@68
|
2042 _data: Union[bytes, SequenceDataAbstractBaseClass]
|
jpayne@68
|
2043
|
jpayne@68
|
2044 def __init__(
|
jpayne@68
|
2045 self,
|
jpayne@68
|
2046 data: Union[
|
jpayne@68
|
2047 str,
|
jpayne@68
|
2048 bytes,
|
jpayne@68
|
2049 bytearray,
|
jpayne@68
|
2050 _SeqAbstractBaseClass,
|
jpayne@68
|
2051 SequenceDataAbstractBaseClass,
|
jpayne@68
|
2052 dict,
|
jpayne@68
|
2053 None,
|
jpayne@68
|
2054 ],
|
jpayne@68
|
2055 length: Optional[int] = None,
|
jpayne@68
|
2056 ):
|
jpayne@68
|
2057 """Create a Seq object.
|
jpayne@68
|
2058
|
jpayne@68
|
2059 Arguments:
|
jpayne@68
|
2060 - data - Sequence, required (string)
|
jpayne@68
|
2061 - length - Sequence length, used only if data is None or a dictionary (integer)
|
jpayne@68
|
2062
|
jpayne@68
|
2063 You will typically use Bio.SeqIO to read in sequences from files as
|
jpayne@68
|
2064 SeqRecord objects, whose sequence will be exposed as a Seq object via
|
jpayne@68
|
2065 the seq property.
|
jpayne@68
|
2066
|
jpayne@68
|
2067 However, you can also create a Seq object directly:
|
jpayne@68
|
2068
|
jpayne@68
|
2069 >>> from Bio.Seq import Seq
|
jpayne@68
|
2070 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF")
|
jpayne@68
|
2071 >>> my_seq
|
jpayne@68
|
2072 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')
|
jpayne@68
|
2073 >>> print(my_seq)
|
jpayne@68
|
2074 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
|
jpayne@68
|
2075
|
jpayne@68
|
2076 To create a Seq object with for a sequence of known length but
|
jpayne@68
|
2077 unknown sequence contents, use None for the data argument and pass
|
jpayne@68
|
2078 the sequence length for the length argument. Trying to access the
|
jpayne@68
|
2079 sequence contents of a Seq object created in this way will raise
|
jpayne@68
|
2080 an UndefinedSequenceError:
|
jpayne@68
|
2081
|
jpayne@68
|
2082 >>> my_undefined_sequence = Seq(None, 20)
|
jpayne@68
|
2083 >>> my_undefined_sequence
|
jpayne@68
|
2084 Seq(None, length=20)
|
jpayne@68
|
2085 >>> len(my_undefined_sequence)
|
jpayne@68
|
2086 20
|
jpayne@68
|
2087 >>> print(my_undefined_sequence)
|
jpayne@68
|
2088 Traceback (most recent call last):
|
jpayne@68
|
2089 ...
|
jpayne@68
|
2090 Bio.Seq.UndefinedSequenceError: Sequence content is undefined
|
jpayne@68
|
2091
|
jpayne@68
|
2092 If the sequence contents is known for parts of the sequence only, use
|
jpayne@68
|
2093 a dictionary for the data argument to pass the known sequence segments:
|
jpayne@68
|
2094
|
jpayne@68
|
2095 >>> my_partially_defined_sequence = Seq({3: "ACGT"}, 10)
|
jpayne@68
|
2096 >>> my_partially_defined_sequence
|
jpayne@68
|
2097 Seq({3: 'ACGT'}, length=10)
|
jpayne@68
|
2098 >>> len(my_partially_defined_sequence)
|
jpayne@68
|
2099 10
|
jpayne@68
|
2100 >>> print(my_partially_defined_sequence)
|
jpayne@68
|
2101 Traceback (most recent call last):
|
jpayne@68
|
2102 ...
|
jpayne@68
|
2103 Bio.Seq.UndefinedSequenceError: Sequence content is only partially defined
|
jpayne@68
|
2104 >>> my_partially_defined_sequence[3:7]
|
jpayne@68
|
2105 Seq('ACGT')
|
jpayne@68
|
2106 >>> print(my_partially_defined_sequence[3:7])
|
jpayne@68
|
2107 ACGT
|
jpayne@68
|
2108 """
|
jpayne@68
|
2109 if data is None:
|
jpayne@68
|
2110 if length is None:
|
jpayne@68
|
2111 raise ValueError("length must not be None if data is None")
|
jpayne@68
|
2112 elif length == 0:
|
jpayne@68
|
2113 self._data = b""
|
jpayne@68
|
2114 elif length < 0:
|
jpayne@68
|
2115 raise ValueError("length must not be negative.")
|
jpayne@68
|
2116 else:
|
jpayne@68
|
2117 self._data = _UndefinedSequenceData(length)
|
jpayne@68
|
2118 elif isinstance(data, (bytes, SequenceDataAbstractBaseClass)):
|
jpayne@68
|
2119 self._data = data
|
jpayne@68
|
2120 elif isinstance(data, (bytearray, _SeqAbstractBaseClass)):
|
jpayne@68
|
2121 self._data = bytes(data)
|
jpayne@68
|
2122 elif isinstance(data, str):
|
jpayne@68
|
2123 self._data = bytes(data, encoding="ASCII")
|
jpayne@68
|
2124 elif isinstance(data, dict):
|
jpayne@68
|
2125 if length is None:
|
jpayne@68
|
2126 raise ValueError("length must not be None if data is a dictionary")
|
jpayne@68
|
2127 elif length == 0:
|
jpayne@68
|
2128 self._data = b""
|
jpayne@68
|
2129 elif length < 0:
|
jpayne@68
|
2130 raise ValueError("length must not be negative.")
|
jpayne@68
|
2131 else:
|
jpayne@68
|
2132 current = 0 # not needed here, but it keeps mypy happy
|
jpayne@68
|
2133 end = -1
|
jpayne@68
|
2134 starts = sorted(data.keys())
|
jpayne@68
|
2135 _data: Dict[int, bytes] = {}
|
jpayne@68
|
2136 for start in starts:
|
jpayne@68
|
2137 seq = data[start]
|
jpayne@68
|
2138 if isinstance(seq, str):
|
jpayne@68
|
2139 seq = bytes(seq, encoding="ASCII")
|
jpayne@68
|
2140 else:
|
jpayne@68
|
2141 try:
|
jpayne@68
|
2142 seq = bytes(seq)
|
jpayne@68
|
2143 except Exception:
|
jpayne@68
|
2144 raise ValueError("Expected bytes-like objects or strings")
|
jpayne@68
|
2145 if start < end:
|
jpayne@68
|
2146 raise ValueError("Sequence data are overlapping.")
|
jpayne@68
|
2147 elif start == end:
|
jpayne@68
|
2148 _data[current] += seq # noqa: F821
|
jpayne@68
|
2149 else:
|
jpayne@68
|
2150 _data[start] = seq
|
jpayne@68
|
2151 current = start
|
jpayne@68
|
2152 end = start + len(seq)
|
jpayne@68
|
2153 if end > length:
|
jpayne@68
|
2154 raise ValueError(
|
jpayne@68
|
2155 "Provided sequence data extend beyond sequence length."
|
jpayne@68
|
2156 )
|
jpayne@68
|
2157 elif end == length and current == 0:
|
jpayne@68
|
2158 # sequence is fully defined
|
jpayne@68
|
2159 self._data = _data[current]
|
jpayne@68
|
2160 else:
|
jpayne@68
|
2161 self._data = _PartiallyDefinedSequenceData(length, _data)
|
jpayne@68
|
2162 else:
|
jpayne@68
|
2163 raise TypeError(
|
jpayne@68
|
2164 "data should be a string, bytes, bytearray, Seq, or MutableSeq object"
|
jpayne@68
|
2165 )
|
jpayne@68
|
2166
|
jpayne@68
|
2167 def __hash__(self):
|
jpayne@68
|
2168 """Hash of the sequence as a string for comparison.
|
jpayne@68
|
2169
|
jpayne@68
|
2170 See Seq object comparison documentation (method ``__eq__`` in
|
jpayne@68
|
2171 particular) as this has changed in Biopython 1.65. Older versions
|
jpayne@68
|
2172 would hash on object identity.
|
jpayne@68
|
2173 """
|
jpayne@68
|
2174 return hash(self._data)
|
jpayne@68
|
2175
|
jpayne@68
|
2176
|
jpayne@68
|
class MutableSeq(_SeqAbstractBaseClass):
    """A sequence object whose letters can be changed in place.

    Python strings and the basic sequence object (the Seq class) are
    immutable; a MutableSeq can be edited after creation. The price for
    this is that a MutableSeq object cannot be used as a dictionary key.

    >>> from Bio.Seq import MutableSeq
    >>> my_seq = MutableSeq("ACTCGTCGTCG")
    >>> my_seq
    MutableSeq('ACTCGTCGTCG')
    >>> my_seq[5]
    'T'
    >>> my_seq[5] = "A"
    >>> my_seq
    MutableSeq('ACTCGACGTCG')
    >>> my_seq[5]
    'A'
    >>> my_seq[5:8] = "NNN"
    >>> my_seq
    MutableSeq('ACTCGNNNTCG')
    >>> len(my_seq)
    11

    Note that the MutableSeq object does not support as many string-like
    or biological methods as the Seq object.
    """

    def __init__(self, data):
        """Create a MutableSeq object.

        Arguments:
         - data - a bytearray (stored as given, without copying), a bytes
           object, a string (encoded as ASCII), a MutableSeq (its data
           are copied), or a Seq object.

        Raises a TypeError for any other type of data.
        """
        if isinstance(data, bytearray):
            self._data = data
            return
        if isinstance(data, bytes):
            self._data = bytearray(data)
            return
        if isinstance(data, str):
            self._data = bytearray(data, "ASCII")
            return
        if isinstance(data, MutableSeq):
            # Slice to get an independent copy of the other object's buffer
            self._data = data._data[:]
            return
        if isinstance(data, Seq):
            # Go through bytes() rather than relying on how a Seq
            # (sub)class stores its contents internally
            self._data = bytearray(bytes(data))
            return
        raise TypeError(
            "data should be a string, bytearray object, Seq object, or a "
            "MutableSeq object"
        )

    def __setitem__(self, index, value):
        """Replace a single letter or a subsequence by the given value.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq[0] = 'T'
        >>> my_seq
        MutableSeq('TCTCGACGTCG')
        """
        if isinstance(index, numbers.Integral):
            # Single position: store the code point of the new letter
            self._data[index] = ord(value)
            return
        # Otherwise a sub-sequence is replaced; convert the new value
        # into something the underlying bytearray accepts
        if isinstance(value, MutableSeq):
            replacement = value._data
        elif isinstance(value, Seq):
            replacement = bytes(value)
        elif isinstance(value, str):
            replacement = value.encode("ASCII")
        else:
            raise TypeError(f"received unexpected type '{type(value).__name__}'")
        self._data[index] = replacement

    def __delitem__(self, index):
        """Delete a single letter or a subsequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> del my_seq[0]
        >>> my_seq
        MutableSeq('CTCGACGTCG')
        """
        # The index may be an integer or a slice; bytearray handles both
        del self._data[index]

    def append(self, c):
        """Add a single letter to the end of the mutable sequence object.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.append('A')
        >>> my_seq
        MutableSeq('ACTCGACGTCGA')

        No return value.
        """
        letter = c.encode("ASCII")
        self._data.append(ord(letter))

    def insert(self, i, c):
        """Insert a single letter into the mutable sequence at a given index.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.insert(0,'A')
        >>> my_seq
        MutableSeq('AACTCGACGTCG')
        >>> my_seq.insert(8,'G')
        >>> my_seq
        MutableSeq('AACTCGACGGTCG')

        No return value.
        """
        letter = c.encode("ASCII")
        self._data.insert(i, ord(letter))

    def pop(self, i=(-1)):
        """Remove the letter at the given index (by default the last one).

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.pop()
        'G'
        >>> my_seq
        MutableSeq('ACTCGACGTC')
        >>> my_seq.pop()
        'C'
        >>> my_seq
        MutableSeq('ACTCGACGT')

        Returns the removed letter as a single character string.
        """
        codepoint = self._data[i]
        del self._data[i]
        return chr(codepoint)

    def remove(self, item):
        """Remove the first occurrence of a single letter from the sequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.remove('C')
        >>> my_seq
        MutableSeq('ATCGACGTCG')
        >>> my_seq.remove('A')
        >>> my_seq
        MutableSeq('TCGACGTCG')

        No return value. Raises a ValueError if the letter is not found.
        """
        target = ord(item)
        try:
            self._data.remove(target)
        except ValueError:
            # Replace the bytearray error message by one naming this class
            raise ValueError("value not found in MutableSeq") from None

    def reverse(self):
        """Reverse the mutable sequence in place.

        No return value.
        """
        self._data.reverse()

    def extend(self, other):
        """Append the letters of another sequence to this mutable sequence.

        >>> my_seq = MutableSeq('ACTCGACGTCG')
        >>> my_seq.extend('A')
        >>> my_seq
        MutableSeq('ACTCGACGTCGA')
        >>> my_seq.extend('TTT')
        >>> my_seq
        MutableSeq('ACTCGACGTCGATTT')

        No return value. Raises a TypeError if other is not a string,
        Seq, or MutableSeq object.
        """
        if isinstance(other, MutableSeq):
            addition = other._data
        elif isinstance(other, Seq):
            addition = bytes(other)
        elif isinstance(other, str):
            addition = other.encode("ASCII")
        else:
            raise TypeError("expected a string, Seq or MutableSeq")
        self._data.extend(addition)
|
jpayne@68
|
2350
|
jpayne@68
|
2351
|
jpayne@68
|
class UndefinedSequenceError(ValueError):
    """Error raised when undefined sequence contents are requested."""
|
jpayne@68
|
2354
|
jpayne@68
|
2355
|
jpayne@68
|
class _UndefinedSequenceData(SequenceDataAbstractBaseClass):
    """Sequence data of known length but unknown contents (PRIVATE).

    Only the sequence length is stored. An object of this class can be
    passed to Seq to represent a sequence whose length is known while its
    letters are not.
    ``__len__`` returns the stored length. ``__getitem__`` raises an
    UndefinedSequenceError for an integer index; for a slice it returns
    an empty bytes object if the slice is empty, and otherwise another
    _UndefinedSequenceData object of the sliced length.
    """

    __slots__ = ("_length",)

    def __init__(self, length):
        """Store the sequence length.

        It is up to the caller to make sure that the length is greater
        than zero.
        """
        self._length = length
        super().__init__()

    def __getitem__(self, key: slice) -> Union[bytes, "_UndefinedSequenceData"]:
        if not isinstance(key, slice):
            # A single letter was requested, but no letters are known
            raise UndefinedSequenceError("Sequence content is undefined")
        first, last, stride = key.indices(self._length)
        count = len(range(first, last, stride))
        if count:
            return _UndefinedSequenceData(count)
        return b""

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is undefined")

    def __add__(self, other):
        total = len(self) + len(other)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                # undefined + undefined stays undefined
                return _UndefinedSequenceData(total)
            # _PartiallyDefinedSequenceData.__radd__ will handle this
            return NotImplemented
        # The defined letters of other start right after this sequence
        return _PartiallyDefinedSequenceData(total, {len(self): other})

    def __radd__(self, other):
        segments = {0: bytes(other)}
        total = len(other) + len(self)
        return _PartiallyDefinedSequenceData(total, segments)

    def upper(self):
        """Return an upper case copy of the sequence."""
        # Changing case cannot make the contents known; the result is an
        # undefined sequence of the same length
        return _UndefinedSequenceData(self._length)

    def lower(self):
        """Return a lower case copy of the sequence."""
        # Changing case cannot make the contents known; the result is an
        # undefined sequence of the same length
        return _UndefinedSequenceData(self._length)

    def isupper(self):
        """Raise an UndefinedSequenceError.

        Whether the letters are uppercase cannot be determined, as the
        sequence contents are undefined.
        """
        raise UndefinedSequenceError("Sequence content is undefined")

    def islower(self):
        """Raise an UndefinedSequenceError.

        Whether the letters are lowercase cannot be determined, as the
        sequence contents are undefined.
        """
        raise UndefinedSequenceError("Sequence content is undefined")

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # If old and new have the same number of characters, the result
        # is an undefined sequence of unchanged length. Otherwise the
        # length of the result is unknown, and an error is raised.
        if len(old) == len(new):
            return _UndefinedSequenceData(self._length)
        raise UndefinedSequenceError("Sequence content is undefined")

    @property
    def defined(self):
        """Return False, as the sequence is not defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The contents of an _UndefinedSequenceData object are undefined
        everywhere, so this is always an empty tuple.
        """
        return ()
|
jpayne@68
|
2462
|
jpayne@68
|
2463
|
jpayne@68
|
class _PartiallyDefinedSequenceData(SequenceDataAbstractBaseClass):
    """Stores the length and the known segments of a sequence (PRIVATE).

    Objects of this class can be used to create a Seq object to represent
    sequences with a known length, but with a sequence contents that is only
    partially known.
    Calling __len__ returns the sequence length, calling __getitem__ returns
    the sequence contents if known, otherwise an UndefinedSequenceError is
    raised.

    The known segments are kept in a dictionary mapping the start position
    of each segment to a bytes-like object with its letters. Several methods
    (for example __add__) take the last item of that dictionary to be the
    segment closest to the end of the sequence, i.e. they expect the items
    to be stored in order of increasing start position.
    """

    __slots__ = ("_length", "_data")

    def __init__(self, length, data):
        """Initialize with the sequence length and defined sequence segments.

        Arguments:
         - length - total sequence length (integer)
         - data - dictionary mapping start positions to the defined
           sequence segments

        The calling function is responsible for ensuring that the length is
        greater than zero.
        """
        self._length = length
        self._data = data
        super().__init__()

    def __getitem__(
        self, key: Union[slice, int]
    ) -> Union[bytes, SequenceDataAbstractBaseClass]:
        """Return the sequence contents for an integer index or a slice.

        For a slice, returns an empty bytes object if the slice is empty,
        a bytes object if the slice is fully defined, an
        _UndefinedSequenceData object if no part of the slice is defined,
        and a _PartiallyDefinedSequenceData object otherwise.

        For an integer index, returns the letter stored at that position.
        Negative indices count from the end of the sequence. Raises an
        IndexError if the index is out of range, and an
        UndefinedSequenceError if the letter at that position is unknown.
        """
        if isinstance(key, slice):
            start, end, step = key.indices(self._length)
            size = len(range(start, end, step))
            if size == 0:
                return b""
            data = {}
            for s, d in self._data.items():
                # Express the slice in coordinates relative to the start
                # of this segment
                indices = range(-s, -s + self._length)[key]
                e: Optional[int] = indices.stop
                assert e is not None
                if step > 0:
                    if e <= 0:
                        continue
                    if indices.start < 0:
                        s = indices.start % step
                    else:
                        s = indices.start
                else:  # step < 0
                    if e < 0:
                        e = None
                    end = len(d) - 1
                    if indices.start > end:
                        s = end + (indices.start - end) % step
                    else:
                        s = indices.start
                    if s < 0:
                        continue
                start = (s - indices.start) // step
                d = d[s:e:step]
                if d:
                    data[start] = d
            if len(data) == 0:  # Fully undefined sequence
                return _UndefinedSequenceData(size)
            # merge adjacent sequence segments
            end = -1
            previous = 0  # not needed here, but it keeps flake happy
            items = data.items()
            data = {}
            for start, seq in items:
                if end == start:
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
            if len(data) == 1:
                seq = data.get(0)
                if seq is not None and len(seq) == size:
                    return seq  # Fully defined sequence; return bytes
            if step < 0:
                # use this after we drop Python 3.7:
                # data = {start: data[start] for start in reversed(data)}
                # use this as long as we support Python 3.7:
                data = {start: data[start] for start in reversed(list(data.keys()))}
            return _PartiallyDefinedSequenceData(size, data)
        # Integer index. Negative values count from the end of the
        # sequence, as for bytes objects and other Python sequences.
        index = key
        if index < 0:
            index += self._length
        if index < 0 or self._length <= index:
            raise IndexError("sequence index out of range")
        for start, seq in self._data.items():
            if start <= index < start + len(seq):
                return seq[index - start]
        raise UndefinedSequenceError("Sequence at position %d is undefined" % key)

    def __len__(self):
        return self._length

    def __bytes__(self):
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def __add__(self, other):
        length = len(self) + len(other)
        data = dict(self._data)
        items = list(self._data.items())
        # Last stored segment; if it reaches the end of this sequence, it
        # is merged with letters at the very start of other
        start, seq = items[-1]
        end = start + len(seq)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            if isinstance(other, _UndefinedSequenceData):
                # Nothing defined in other; only the length changes
                pass
            elif isinstance(other, _PartiallyDefinedSequenceData):
                other_items = list(other._data.items())
                if end == len(self):
                    other_start, other_seq = other_items.pop(0)
                    if other_start == 0:
                        data[start] += other_seq
                    else:
                        data[len(self) + other_start] = other_seq
                for other_start, other_seq in other_items:
                    data[len(self) + other_start] = other_seq
        else:
            if end == len(self):
                data[start] += other
            else:
                data[len(self)] = other
        return _PartiallyDefinedSequenceData(length, data)

    def __radd__(self, other):
        length = len(other) + len(self)
        try:
            other = bytes(other)
        except UndefinedSequenceError:
            # Nothing defined in other; shift the segments by its length
            data = {len(other) + start: seq for start, seq in self._data.items()}
        else:
            data = {0: other}
            items = list(self._data.items())
            start, seq = items.pop(0)
            if start == 0:
                # First segment directly follows other; merge the two
                data[0] += seq
            else:
                data[len(other) + start] = seq
            for start, seq in items:
                data[len(other) + start] = seq
        return _PartiallyDefinedSequenceData(length, data)

    def __mul__(self, other):
        length = self._length
        items = self._data.items()
        data = {}
        end = -1
        previous = 0  # not needed here, but it keeps flake happy
        for i in range(other):
            for start, seq in items:
                start += i * length
                if end == start:
                    # Segment continues where the previous one ended
                    data[previous] += seq
                else:
                    data[start] = seq
                    previous = start
                end = start + len(seq)
        return _PartiallyDefinedSequenceData(length * other, data)

    def upper(self):
        """Return an upper case copy of the sequence."""
        data = {start: seq.upper() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def lower(self):
        """Return a lower case copy of the sequence."""
        data = {start: seq.lower() for start, seq in self._data.items()}
        return _PartiallyDefinedSequenceData(self._length, data)

    def isupper(self):
        """Raise an UndefinedSequenceError.

        Whether all letters are uppercase cannot be determined, as the
        sequence contents are only partially defined.
        """
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def islower(self):
        """Raise an UndefinedSequenceError.

        Whether all letters are lowercase cannot be determined, as the
        sequence contents are only partially defined.
        """
        raise UndefinedSequenceError("Sequence content is only partially defined")

    def translate(self, table, delete=b""):
        """Return a copy with each character mapped by the given translation table.

          table
            Translation table, which must be a bytes object of length 256.

        All characters occurring in the optional argument delete are removed.
        The remaining characters are mapped through the given translation table.
        """
        items = self._data.items()
        data = {start: seq.translate(table, delete) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    def replace(self, old, new):
        """Return a copy with all occurrences of substring old replaced by new."""
        # Replacing substring old by new in the undefined sequence segments
        # will result in an undefined sequence segment of the same length, if
        # old and new have the same number of characters. If not, an error is
        # raised, as the correct start positions cannot be calculated reliably.
        if len(old) != len(new):
            raise UndefinedSequenceError(
                "Sequence content is only partially defined; substring \n"
                "replacement cannot be performed reliably"
            )
        items = self._data.items()
        data = {start: seq.replace(old, new) for start, seq in items}
        return _PartiallyDefinedSequenceData(self._length, data)

    @property
    def defined(self):
        """Return False, as the sequence is not fully defined and has a non-zero length."""
        return False

    @property
    def defined_ranges(self):
        """Return a tuple of the ranges where the sequence contents is defined.

        The return value has the format ((start1, end1), (start2, end2), ...).
        """
        return tuple((start, start + len(seq)) for start, seq in self._data.items())
|
jpayne@68
|
2688
|
jpayne@68
|
2689
|
jpayne@68
|
2690 # The transcribe, back_transcribe, and translate functions are
|
jpayne@68
|
2691 # user-friendly versions of the corresponding Seq/MutableSeq methods.
|
jpayne@68
|
2692 # The functions work both on Seq objects, and on strings.
|
jpayne@68
|
2693
|
jpayne@68
|
2694
|
jpayne@68
|
def transcribe(dna):
    """Transcribe a DNA sequence into RNA.

    By the usual convention the given sequence is taken to be the coding
    strand of the DNA double helix rather than the template strand, so
    the RNA sequence is obtained simply by replacing T with U.

    A string argument gives a new string object.

    A Seq or MutableSeq argument gives a new Seq object.

    e.g.

    >>> transcribe("ACTGN")
    'ACUGN'
    """
    if isinstance(dna, Seq):
        return dna.transcribe()
    if isinstance(dna, MutableSeq):
        # Convert to a Seq first and use its transcribe method
        return Seq(dna).transcribe()
    # Anything else is handled as a string; both cases are replaced
    upper_replaced = dna.replace("T", "U")
    return upper_replaced.replace("t", "u")
|
jpayne@68
|
2717
|
jpayne@68
|
2718
|
jpayne@68
|
def back_transcribe(rna):
    """Return the RNA sequence back-transcribed into DNA.

    A string argument gives a new string object.

    A Seq or MutableSeq argument gives a new Seq object.

    e.g.

    >>> back_transcribe("ACUGN")
    'ACTGN'
    """
    if isinstance(rna, Seq):
        return rna.back_transcribe()
    if isinstance(rna, MutableSeq):
        # Convert to a Seq first and use its back_transcribe method
        return Seq(rna).back_transcribe()
    # Anything else is handled as a string; both cases are replaced
    upper_replaced = rna.replace("U", "T")
    return upper_replaced.replace("u", "t")
|
jpayne@68
|
2737
|
jpayne@68
|
2738
|
jpayne@68
|
def _translate_str(
    sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None
):
    """Translate nucleotide string into a protein string (PRIVATE).

    Arguments:
     - sequence - a string; it is upper-cased before translation.
     - table - Which codon table to use?  This can be either a name (string),
       an NCBI identifier (integer), or a CodonTable object (useful for
       non-standard genetic codes).
     - stop_symbol - a single character string, what to use for terminators.
     - to_stop - boolean, should translation terminate at the first
       in frame stop codon?  If there is no in-frame stop codon
       then translation continues to the end.
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, a TranslationError is raised.
     - pos_stop - a single character string for a possible stop codon
       (e.g. TAN or NNN)
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.

    Returns a string.

    Raises ValueError or TypeError for an unusable table or gap argument,
    ValueError if to_stop is requested with a table containing dual-coding
    stop codons, and CodonTable.TranslationError for invalid codons or a
    failed cds check.

    e.g.

    >>> from Bio.Data import CodonTable
    >>> table = CodonTable.ambiguous_dna_by_id[1]
    >>> _translate_str("AAA", table)
    'K'
    >>> _translate_str("TAR", table)
    '*'
    >>> _translate_str("TAN", table)
    'X'
    >>> _translate_str("TAN", table, pos_stop="@")
    '@'
    >>> _translate_str("TA?", table)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Codon 'TA?' is invalid

    A trailing partial codon (sequence length not a multiple of three)
    triggers a BiopythonWarning and is left untranslated when cds is False;
    with cds=True it is an error.

    If **cds=True**, the start and stop codons are checked, and the start
    codon will be translated as methionine. The sequence must be a
    whole number of codons.

    >>> _translate_str("ATGCCCTAG", table, cds=True)
    'MP'
    >>> _translate_str("AAACCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: First codon 'AAA' is not a start codon
    >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
    Traceback (most recent call last):
       ...
    Bio.Data.CodonTable.TranslationError: Extra in frame stop codon 'TAG' found.
    """
    # Resolve the table argument: numeric ID, table name, or CodonTable object.
    try:
        table_id = int(table)
    except ValueError:
        # Not a number, so treat it as a table name.
        # The same table can be used for RNA or DNA.
        try:
            codon_table = CodonTable.ambiguous_generic_by_name[table]
        except KeyError:
            if not isinstance(table, str):
                raise TypeError("table argument must be integer or string") from None
            raise ValueError(
                "The Bio.Seq translate methods and function DO NOT "
                "take a character string mapping table like the python "
                "string object's translate method. "
                "Use str(my_seq).translate(...) instead."
            ) from None
    except (AttributeError, TypeError):
        # int() could not handle it at all; only a CodonTable object is OK.
        if not isinstance(table, CodonTable.CodonTable):
            raise ValueError("Bad table argument") from None
        codon_table = table
    else:
        # A table ID; the same table can be used for RNA or DNA.
        codon_table = CodonTable.ambiguous_generic_by_id[table_id]

    sequence = sequence.upper()
    forward_table = codon_table.forward_table
    stop_codons = codon_table.stop_codons
    alphabet = codon_table.nucleotide_alphabet
    if alphabet is None:
        # Assume the worst case, ambiguous DNA or RNA:
        valid_letters = set(
            IUPACData.ambiguous_dna_letters.upper()
            + IUPACData.ambiguous_rna_letters.upper()
        )
    else:
        valid_letters = set(alphabet.upper())
    n = len(sequence)

    # Tables may list a codon as both a stop and an amino acid (dual-coding).
    dual_coding = [triplet for triplet in stop_codons if triplet in forward_table]
    if dual_coding:
        c = dual_coding[0]
        if to_stop:
            raise ValueError(
                "You cannot use 'to_stop=True' with this table as it contains"
                f" {len(dual_coding)} codon(s) which can be both STOP and an"
                f" amino acid (e.g. '{c}' -> '{forward_table[c]}' or STOP)."
            )
        warnings.warn(
            f"This table contains {len(dual_coding)} codon(s) which code(s) for"
            f" both STOP and an amino acid (e.g. '{c}' -> '{forward_table[c]}'"
            " or STOP). Such codons will be translated as amino acid.",
            BiopythonWarning,
        )

    if not cds:
        amino_acids = []
        if n % 3 != 0:
            warnings.warn(
                "Partial codon, len(sequence) not a multiple of three. "
                "Explicitly trim the sequence or add trailing N before "
                "translation. This may become an error in future.",
                BiopythonWarning,
            )
    else:
        if str(sequence[:3]).upper() not in codon_table.start_codons:
            raise CodonTable.TranslationError(
                f"First codon '{sequence[:3]}' is not a start codon"
            )
        if n % 3 != 0:
            raise CodonTable.TranslationError(
                f"Sequence length {n} is not a multiple of three"
            )
        if str(sequence[-3:]).upper() not in stop_codons:
            raise CodonTable.TranslationError(
                f"Final codon '{sequence[-3:]}' is not a stop codon"
            )
        # Drop the start and stop codons; the start is emitted as M by hand
        # and the final stop is never part of the protein.
        sequence = sequence[3:-3]
        n -= 6
        amino_acids = ["M"]

    if gap is not None:
        if not isinstance(gap, str):
            raise TypeError("Gap character should be a single character string.")
        if len(gap) > 1:
            raise ValueError("Gap character should be a single character string.")

    # Walk over the complete codons only; a trailing partial codon is skipped.
    for offset in range(0, 3 * (n // 3), 3):
        codon = sequence[offset : offset + 3]
        try:
            amino_acids.append(forward_table[codon])
        except (KeyError, CodonTable.TranslationError):
            if codon in stop_codons:
                if cds:
                    raise CodonTable.TranslationError(
                        f"Extra in frame stop codon '{codon}' found."
                    ) from None
                if to_stop:
                    break
                amino_acids.append(stop_symbol)
            elif valid_letters >= set(codon):
                # Possible stop codon (e.g. NNN or TAN)
                amino_acids.append(pos_stop)
            elif gap is not None and codon == gap * 3:
                # Gapped translation
                amino_acids.append(gap)
            else:
                raise CodonTable.TranslationError(
                    f"Codon '{codon}' is invalid"
                ) from None
    return "".join(amino_acids)
|
jpayne@68
|
2916
|
jpayne@68
|
2917
|
jpayne@68
|
def translate(
    sequence, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap=None
):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object. Given a Seq or
    MutableSeq, returns a Seq object.

    Arguments:
     - table - Which codon table to use?  This can be either a name
       (string), an NCBI identifier (integer), or a CodonTable object
       (useful for non-standard genetic codes).  Defaults to the "Standard"
       table.
     - stop_symbol - Single character string, what to use for any
       terminators, defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full
       translation continuing on past any stop codons
       (translated as the specified stop_symbol).  If
       True, translation is terminated at the first in
       frame stop codon (and the stop_symbol is not
       appended to the returned protein sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True, this
       checks the sequence starts with a valid alternative start
       codon (which will be translated as methionine, M), that the
       sequence length is a multiple of three, and that there is a
       single in frame stop codon at the end (this will be excluded
       from the protein sequence, regardless of the to_stop option).
       If these tests fail, an exception is raised.
     - gap - Single character string to denote symbol used for gaps.
       Defaults to None.  For string input None means no gap character
       is recognised.  For Seq and MutableSeq input, an explicitly given
       gap is forwarded to the object's translate method; None leaves
       that method's own default in effect.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    It will however translate either DNA or RNA.

    NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous
    stop codons'. These are stop codons with unambiguous sequence but which
    have a context dependent coding as STOP or as amino acid. With these tables
    'to_stop' must be False (otherwise a ValueError is raised). The dual
    coding codons will always be translated as amino acid, except for
    'cds=True', where the last codon will be translated as STOP.

    >>> coding_dna3 = "ATGGCACGGAAGTGA"
    >>> translate(coding_dna3)
    'MARK*'

    >>> translate(coding_dna3, table=27)  # Table 27: TGA -> STOP or W
    'MARKW'

    It will however raise a BiopythonWarning (not shown).

    >>> translate(coding_dna3, table=27, cds=True)
    'MARK'

    >>> translate(coding_dna3, table=27, to_stop=True)
    Traceback (most recent call last):
       ...
    ValueError: You cannot use 'to_stop=True' with this table ...
    """
    if isinstance(sequence, Seq):
        seq_obj = sequence
    elif isinstance(sequence, MutableSeq):
        # The result for a MutableSeq is a Seq object, so convert first.
        seq_obj = Seq(sequence)
    else:
        # Assume it's a string, return a string
        return _translate_str(sequence, table, stop_symbol, to_stop, cds, gap=gap)

    if gap is None:
        # No gap requested: keep whatever default the method itself applies,
        # which preserves the behaviour callers have relied on so far.
        return seq_obj.translate(table, stop_symbol, to_stop, cds)
    # Previously an explicit gap was silently ignored for sequence objects.
    return seq_obj.translate(table, stop_symbol, to_stop, cds, gap=gap)
|
jpayne@68
|
3020
|
jpayne@68
|
3021
|
jpayne@68
|
def reverse_complement(sequence, inplace=False):
    """Return the reverse complement as a DNA sequence.

    The type of the result follows the type of the argument: a string
    gives a new string, a Seq gives a new Seq, a MutableSeq gives a new
    MutableSeq, and a SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement(my_seq)
    'TCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement(my_seq)
    Seq('TCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> reverse_complement(Seq("CGAUT"))
    Seq('AATCG')

    In contrast, ``reverse_complement_rna`` returns an RNA sequence:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement("ACGTUacgtuXYZxyz")
    'zrxZRXaacgtAACGT'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement(my_seq, inplace=True)
    MutableSeq('TCG')
    >>> my_seq
    MutableSeq('TCG')

    Strings and ``SeqRecord`` objects cannot be changed in place, so a
    ``TypeError`` is raised for them when ``inplace`` is true. ``Seq`` and
    ``MutableSeq`` arguments are handed to their own ``reverse_complement``
    method together with the ``inplace`` flag.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.reverse_complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.reverse_complement()
    # Anything else is treated as a plain string.
    if inplace:
        raise TypeError("strings are immutable")
    # Complement via the byte translation table, then reverse the text.
    complemented = sequence.encode("ASCII").translate(_dna_complement_table)
    return complemented.decode("ASCII")[::-1]
|
jpayne@68
|
3085
|
jpayne@68
|
3086
|
jpayne@68
|
def reverse_complement_rna(sequence, inplace=False):
    """Return the reverse complement as an RNA sequence.

    The type of the result follows the type of the argument: a string
    gives a new string, a Seq gives a new Seq, a MutableSeq gives a new
    MutableSeq, and a SeqRecord gives a new SeqRecord.

    >>> my_seq = "CGA"
    >>> reverse_complement_rna(my_seq)
    'UCG'
    >>> my_seq = Seq("CGA")
    >>> reverse_complement_rna(my_seq)
    Seq('UCG')
    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> reverse_complement_rna(Seq("CGAUT"))
    Seq('AAUCG')

    In contrast, ``reverse_complement`` returns a DNA sequence:

    >>> reverse_complement(Seq("CGAUT"), inplace=False)
    Seq('AATCG')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> reverse_complement_rna("ACGTUacgtuXYZxyz")
    'zrxZRXaacguAACGU'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> reverse_complement_rna(my_seq, inplace=True)
    MutableSeq('UCG')
    >>> my_seq
    MutableSeq('UCG')

    Strings and ``SeqRecord`` objects cannot be changed in place, so a
    ``TypeError`` is raised for them when ``inplace`` is true. ``Seq`` and
    ``MutableSeq`` arguments are handed to their own
    ``reverse_complement_rna`` method together with the ``inplace`` flag.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.reverse_complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.reverse_complement_rna()
    # Anything else is treated as a plain string.
    if inplace:
        raise TypeError("strings are immutable")
    # Complement via the byte translation table, then reverse the text.
    complemented = sequence.encode("ASCII").translate(_rna_complement_table)
    return complemented.decode("ASCII")[::-1]
|
jpayne@68
|
3150
|
jpayne@68
|
3151
|
jpayne@68
|
def complement(sequence, inplace=False):
    """Return the complement as a DNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement(my_seq)
    'GCT'
    >>> my_seq = Seq("CGA")
    >>> complement(my_seq)
    Seq('GCT')
    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('CGA')

    Any U in the sequence is treated as a T:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    In contrast, ``complement_rna`` returns an RNA sequence:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement("ACGTUacgtuXYZxyz")
    'TGCAAtgcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement(my_seq, inplace=True)
    MutableSeq('GCT')
    >>> my_seq
    MutableSeq('GCT')

    Strings and ``SeqRecord`` objects cannot be changed in place, so a
    ``TypeError`` is raised if ``complement`` is called on one of them
    with a true ``inplace`` value. ``Seq`` and ``MutableSeq`` arguments
    are handed to their own ``complement`` method together with the
    ``inplace`` flag.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement()
    # Assume it's a string.
    # Test truthiness (not identity with True) so that any true value is
    # rejected, matching complement_rna and the reverse_complement functions.
    if inplace:
        raise TypeError("strings are immutable")
    sequence = sequence.encode("ASCII")
    sequence = sequence.translate(_dna_complement_table)
    return sequence.decode("ASCII")
|
jpayne@68
|
3214
|
jpayne@68
|
3215
|
jpayne@68
|
def complement_rna(sequence, inplace=False):
    """Return the complement as an RNA sequence.

    If given a string, returns a new string object.
    Given a Seq object, returns a new Seq object.
    Given a MutableSeq, returns a new MutableSeq object.
    Given a SeqRecord object, returns a new SeqRecord object.

    >>> my_seq = "CGA"
    >>> complement_rna(my_seq)
    'GCU'
    >>> my_seq = Seq("CGA")
    >>> complement_rna(my_seq)
    Seq('GCU')
    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('CGA')

    Any T in the sequence is treated as a U:

    >>> complement_rna(Seq("CGAUT"))
    Seq('GCUAA')

    In contrast, ``complement`` returns a DNA sequence:

    >>> complement(Seq("CGAUT"))
    Seq('GCTAA')

    Supports both lower- and upper-case characters, and unambiguous and
    ambiguous nucleotides. All other characters are not converted:

    >>> complement_rna("ACGTUacgtuXYZxyz")
    'UGCAAugcaaXRZxrz'

    The sequence is modified in-place and returned if inplace is True:

    >>> my_seq = MutableSeq("CGA")
    >>> complement_rna(my_seq, inplace=True)
    MutableSeq('GCU')
    >>> my_seq
    MutableSeq('GCU')

    Strings and ``SeqRecord`` objects cannot be changed in place, so a
    ``TypeError`` is raised if ``complement_rna`` is called on one of them
    with a true ``inplace`` value. ``Seq`` and ``MutableSeq`` arguments
    are handed to their own ``complement_rna`` method together with the
    ``inplace`` flag.
    """
    from Bio.SeqRecord import SeqRecord  # Lazy to avoid circular imports

    if isinstance(sequence, (Seq, MutableSeq)):
        return sequence.complement_rna(inplace)
    if isinstance(sequence, SeqRecord):
        if inplace:
            raise TypeError("SeqRecords are immutable")
        return sequence.complement_rna()
    # Assume it's a string.
    if inplace:
        raise TypeError("strings are immutable")
    # Map each byte through the RNA complement table; order is preserved.
    complemented = sequence.encode("ASCII").translate(_rna_complement_table)
    return complemented.decode("ASCII")
|
jpayne@68
|
3278
|
jpayne@68
|
3279
|
jpayne@68
|
def _test():
    """Run the Bio.Seq module's doctests (PRIVATE)."""
    print("Running doctests...")
    from doctest import IGNORE_EXCEPTION_DETAIL, testmod

    # Only the exception type is compared in the doctests, not its message.
    testmod(optionflags=IGNORE_EXCEPTION_DETAIL)
    print("Done")


if __name__ == "__main__":
    _test()
|