jpayne@69
|
1 # Copyright 2000-2003 Jeff Chang.
|
jpayne@69
|
2 # Copyright 2001-2008 Brad Chapman.
|
jpayne@69
|
3 # Copyright 2005-2024 by Peter Cock.
|
jpayne@69
|
4 # Copyright 2006-2009 Michiel de Hoon.
|
jpayne@69
|
5 # All rights reserved.
|
jpayne@69
|
6 #
|
jpayne@69
|
7 # This file is part of the Biopython distribution and governed by your
|
jpayne@69
|
8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
jpayne@69
|
9 # Please see the LICENSE file that should have been included as part of this
|
jpayne@69
|
10 # package.
|
jpayne@69
|
11 """Represent a Sequence Feature holding info about a part of a sequence.
|
jpayne@69
|
12
|
jpayne@69
|
13 This is heavily modeled after the Biocorba SeqFeature objects, and
|
jpayne@69
|
14 may be pretty biased towards GenBank stuff since I'm writing it
|
jpayne@69
|
15 for the GenBank parser output...
|
jpayne@69
|
16
|
jpayne@69
|
17 What's here:
|
jpayne@69
|
18
|
jpayne@69
|
19 Base class to hold a Feature
|
jpayne@69
|
20 ----------------------------
|
jpayne@69
|
21
|
jpayne@69
|
22 Classes:
|
jpayne@69
|
23 - SeqFeature
|
jpayne@69
|
24
|
jpayne@69
|
25 Hold information about a Reference
|
jpayne@69
|
26 ----------------------------------
|
jpayne@69
|
27
|
jpayne@69
|
28 This is an attempt to create a General class to hold Reference type
|
jpayne@69
|
29 information.
|
jpayne@69
|
30
|
jpayne@69
|
31 Classes:
|
jpayne@69
|
32 - Reference
|
jpayne@69
|
33
|
jpayne@69
|
34 Specify locations of a feature on a Sequence
|
jpayne@69
|
35 --------------------------------------------
|
jpayne@69
|
36
|
jpayne@69
|
37 This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'.
|
jpayne@69
|
38 This has the advantages of allowing us to handle fuzzy stuff in case anyone
|
jpayne@69
|
39 needs it, and also be compatible with BioPerl etc and BioSQL.
|
jpayne@69
|
40
|
jpayne@69
|
41 Classes:
|
jpayne@69
|
42 - Location - abstract base class of SimpleLocation and CompoundLocation.
|
jpayne@69
|
43 - SimpleLocation - Specify the start and end location of a feature.
|
jpayne@69
|
44 - CompoundLocation - Collection of SimpleLocation objects (for joins etc).
|
jpayne@69
|
45 - Position - abstract base class of ExactPosition, WithinPosition,
|
jpayne@69
|
46 BetweenPosition, AfterPosition, OneOfPosition, UncertainPosition, and
|
jpayne@69
|
47 UnknownPosition.
|
jpayne@69
|
48 - ExactPosition - Specify the position as being exact.
|
jpayne@69
|
49 - WithinPosition - Specify a position occurring within some range.
|
jpayne@69
|
50 - BetweenPosition - Specify a position occurring between a range (OBSOLETE?).
|
jpayne@69
|
51 - BeforePosition - Specify the position as being found before some base.
|
jpayne@69
|
52 - AfterPosition - Specify the position as being found after some base.
|
jpayne@69
|
53 - OneOfPosition - Specify a position consisting of multiple alternative positions.
|
jpayne@69
|
54 - UncertainPosition - Specify a specific position which is uncertain.
|
jpayne@69
|
55 - UnknownPosition - Represents missing information like '?' in UniProt.
|
jpayne@69
|
56
|
jpayne@69
|
57
|
jpayne@69
|
58 Exceptions:
|
jpayne@69
|
59 - LocationParserError - Exception indicating a failure to parse a location
|
jpayne@69
|
60 string.
|
jpayne@69
|
61
|
jpayne@69
|
62 """
|
jpayne@69
|
63 import functools
|
jpayne@69
|
64 import re
|
jpayne@69
|
65 import warnings
|
jpayne@69
|
66 from abc import ABC, abstractmethod
|
jpayne@69
|
67
|
jpayne@69
|
68 from Bio import BiopythonDeprecationWarning
|
jpayne@69
|
69 from Bio import BiopythonParserWarning
|
jpayne@69
|
70 from Bio.Seq import MutableSeq
|
jpayne@69
|
71 from Bio.Seq import reverse_complement
|
jpayne@69
|
72 from Bio.Seq import Seq
|
jpayne@69
|
73
|
jpayne@69
|
74
|
jpayne@69
|
75 # Regular expressions for location parsing
|
jpayne@69
|
76
|
jpayne@69
|
77 _reference = r"(?:[a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)"
|
jpayne@69
|
78 _oneof_position = r"one\-of\(\d+[,\d+]+\)"
|
jpayne@69
|
79
|
jpayne@69
|
80 _oneof_location = rf"[<>]?(?:\d+|{_oneof_position})\.\.[<>]?(?:\d+|{_oneof_position})"
|
jpayne@69
|
81
|
jpayne@69
|
82 _any_location = rf"({_reference}?{_oneof_location}|complement\({_oneof_location}\)|[^,]+|complement\([^,]+\))"
|
jpayne@69
|
83
|
jpayne@69
|
84 _split = re.compile(_any_location).split
|
jpayne@69
|
85
|
jpayne@69
|
86 assert _split("123..145")[1::2] == ["123..145"]
|
jpayne@69
|
87 assert _split("123..145,200..209")[1::2] == ["123..145", "200..209"]
|
jpayne@69
|
88 assert _split("one-of(200,203)..300")[1::2] == ["one-of(200,203)..300"]
|
jpayne@69
|
89 assert _split("complement(123..145),200..209")[1::2] == [
|
jpayne@69
|
90 "complement(123..145)",
|
jpayne@69
|
91 "200..209",
|
jpayne@69
|
92 ]
|
jpayne@69
|
93 assert _split("123..145,one-of(200,203)..209")[1::2] == [
|
jpayne@69
|
94 "123..145",
|
jpayne@69
|
95 "one-of(200,203)..209",
|
jpayne@69
|
96 ]
|
jpayne@69
|
97 assert _split("123..145,one-of(200,203)..one-of(209,211),300")[1::2] == [
|
jpayne@69
|
98 "123..145",
|
jpayne@69
|
99 "one-of(200,203)..one-of(209,211)",
|
jpayne@69
|
100 "300",
|
jpayne@69
|
101 ]
|
jpayne@69
|
102 assert _split("123..145,complement(one-of(200,203)..one-of(209,211)),300")[1::2] == [
|
jpayne@69
|
103 "123..145",
|
jpayne@69
|
104 "complement(one-of(200,203)..one-of(209,211))",
|
jpayne@69
|
105 "300",
|
jpayne@69
|
106 ]
|
jpayne@69
|
107 assert _split("123..145,200..one-of(209,211),300")[1::2] == [
|
jpayne@69
|
108 "123..145",
|
jpayne@69
|
109 "200..one-of(209,211)",
|
jpayne@69
|
110 "300",
|
jpayne@69
|
111 ]
|
jpayne@69
|
112 assert _split("123..145,200..one-of(209,211)")[1::2] == [
|
jpayne@69
|
113 "123..145",
|
jpayne@69
|
114 "200..one-of(209,211)",
|
jpayne@69
|
115 ]
|
jpayne@69
|
116 assert _split(
|
jpayne@69
|
117 "complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"
|
jpayne@69
|
118 )[1::2] == [
|
jpayne@69
|
119 "complement(149815..150200)",
|
jpayne@69
|
120 "complement(293787..295573)",
|
jpayne@69
|
121 "NC_016402.1:6618..6676",
|
jpayne@69
|
122 "181647..181905",
|
jpayne@69
|
123 ]
|
jpayne@69
|
124
|
jpayne@69
|
125
|
jpayne@69
|
126 _pair_location = r"[<>]?-?\d+\.\.[<>]?-?\d+"
|
jpayne@69
|
127
|
jpayne@69
|
128 _between_location = r"\d+\^\d+"
|
jpayne@69
|
129
|
jpayne@69
|
130 _within_position = r"\(\d+\.\d+\)"
|
jpayne@69
|
131 _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (
|
jpayne@69
|
132 _within_position,
|
jpayne@69
|
133 _within_position,
|
jpayne@69
|
134 )
|
jpayne@69
|
135 _within_position = r"\((\d+)\.(\d+)\)"
|
jpayne@69
|
136 _re_within_position = re.compile(_within_position)
|
jpayne@69
|
137 assert _re_within_position.match("(3.9)")
|
jpayne@69
|
138
|
jpayne@69
|
139 _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (_oneof_position, _oneof_position)
|
jpayne@69
|
140 _oneof_position = r"one\-of\((\d+[,\d+]+)\)"
|
jpayne@69
|
141 _re_oneof_position = re.compile(_oneof_position)
|
jpayne@69
|
142 assert _re_oneof_position.match("one-of(6,9)")
|
jpayne@69
|
143 assert not _re_oneof_position.match("one-of(3)")
|
jpayne@69
|
144 assert _re_oneof_position.match("one-of(3,6)")
|
jpayne@69
|
145 assert _re_oneof_position.match("one-of(3,6,9)")
|
jpayne@69
|
146
|
jpayne@69
|
147 _solo_location = r"[<>]?\d+"
|
jpayne@69
|
148 _solo_bond = r"bond\(%s\)" % _solo_location
|
jpayne@69
|
149
|
jpayne@69
|
150 _re_location_category = re.compile(
|
jpayne@69
|
151 r"^(?P<pair>%s)|(?P<between>%s)|(?P<within>%s)|(?P<oneof>%s)|(?P<bond>%s)|(?P<solo>%s)$"
|
jpayne@69
|
152 % (
|
jpayne@69
|
153 _pair_location,
|
jpayne@69
|
154 _between_location,
|
jpayne@69
|
155 _within_location,
|
jpayne@69
|
156 _oneof_location,
|
jpayne@69
|
157 _solo_bond,
|
jpayne@69
|
158 _solo_location,
|
jpayne@69
|
159 )
|
jpayne@69
|
160 )
|
jpayne@69
|
161
|
jpayne@69
|
162
|
jpayne@69
|
163 class LocationParserError(ValueError):
|
jpayne@69
|
164 """Could not parse a feature location string."""
|
jpayne@69
|
165
|
jpayne@69
|
166
|
jpayne@69
|
167 class SeqFeature:
|
jpayne@69
|
168 """Represent a Sequence Feature on an object.
|
jpayne@69
|
169
|
jpayne@69
|
170 Attributes:
|
jpayne@69
|
171 - location - the location of the feature on the sequence (SimpleLocation)
|
jpayne@69
|
172 - type - the specified type of the feature (ie. CDS, exon, repeat...)
|
jpayne@69
|
173 - id - A string identifier for the feature.
|
jpayne@69
|
174 - qualifiers - A dictionary of qualifiers on the feature. These are
|
jpayne@69
|
175 analogous to the qualifiers from a GenBank feature table. The keys of
|
jpayne@69
|
176 the dictionary are qualifier names, the values are the qualifier
|
jpayne@69
|
177 values.
|
jpayne@69
|
178
|
jpayne@69
|
179 """
|
jpayne@69
|
180
|
jpayne@69
|
181 def __init__(
|
jpayne@69
|
182 self,
|
jpayne@69
|
183 location=None,
|
jpayne@69
|
184 type="",
|
jpayne@69
|
185 id="<unknown id>",
|
jpayne@69
|
186 qualifiers=None,
|
jpayne@69
|
187 sub_features=None,
|
jpayne@69
|
188 ):
|
jpayne@69
|
189 """Initialize a SeqFeature on a sequence.
|
jpayne@69
|
190
|
jpayne@69
|
191 location can either be a SimpleLocation (with strand argument also
|
jpayne@69
|
192 given if required), or None.
|
jpayne@69
|
193
|
jpayne@69
|
194 e.g. With no strand, on the forward strand, and on the reverse strand:
|
jpayne@69
|
195
|
jpayne@69
|
196 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
|
jpayne@69
|
197 >>> f1 = SeqFeature(SimpleLocation(5, 10), type="domain")
|
jpayne@69
|
198 >>> f1.location.strand == None
|
jpayne@69
|
199 True
|
jpayne@69
|
200 >>> f2 = SeqFeature(SimpleLocation(7, 110, strand=1), type="CDS")
|
jpayne@69
|
201 >>> f2.location.strand == +1
|
jpayne@69
|
202 True
|
jpayne@69
|
203 >>> f3 = SeqFeature(SimpleLocation(9, 108, strand=-1), type="CDS")
|
jpayne@69
|
204 >>> f3.location.strand == -1
|
jpayne@69
|
205 True
|
jpayne@69
|
206
|
jpayne@69
|
207 For exact start/end positions, an integer can be used (as shown above)
|
jpayne@69
|
208 as shorthand for the ExactPosition object. For non-exact locations, the
|
jpayne@69
|
209 SimpleLocation must be specified via the appropriate position objects.
|
jpayne@69
|
210 """
|
jpayne@69
|
211 if (
|
jpayne@69
|
212 location is not None
|
jpayne@69
|
213 and not isinstance(location, SimpleLocation)
|
jpayne@69
|
214 and not isinstance(location, CompoundLocation)
|
jpayne@69
|
215 ):
|
jpayne@69
|
216 raise TypeError(
|
jpayne@69
|
217 "SimpleLocation, CompoundLocation (or None) required for the location"
|
jpayne@69
|
218 )
|
jpayne@69
|
219 self.location = location
|
jpayne@69
|
220 self.type = type
|
jpayne@69
|
221 self.id = id
|
jpayne@69
|
222 self.qualifiers = {}
|
jpayne@69
|
223 if qualifiers is not None:
|
jpayne@69
|
224 self.qualifiers.update(qualifiers)
|
jpayne@69
|
225 if sub_features is not None:
|
jpayne@69
|
226 raise TypeError("Rather than sub_features, use a CompoundLocation")
|
jpayne@69
|
227
|
jpayne@69
|
228 def _get_strand(self):
|
jpayne@69
|
229 """Get function for the strand property (PRIVATE)."""
|
jpayne@69
|
230 warnings.warn(
|
jpayne@69
|
231 "Please use .location.strand rather than .strand",
|
jpayne@69
|
232 BiopythonDeprecationWarning,
|
jpayne@69
|
233 )
|
jpayne@69
|
234 return self.location.strand
|
jpayne@69
|
235
|
jpayne@69
|
236 def _set_strand(self, value):
|
jpayne@69
|
237 """Set function for the strand property (PRIVATE)."""
|
jpayne@69
|
238 warnings.warn(
|
jpayne@69
|
239 "Please use .location.strand rather than .strand",
|
jpayne@69
|
240 BiopythonDeprecationWarning,
|
jpayne@69
|
241 )
|
jpayne@69
|
242 try:
|
jpayne@69
|
243 self.location.strand = value
|
jpayne@69
|
244 except AttributeError:
|
jpayne@69
|
245 if self.location is None:
|
jpayne@69
|
246 if value is not None:
|
jpayne@69
|
247 raise ValueError("Can't set strand without a location.") from None
|
jpayne@69
|
248 else:
|
jpayne@69
|
249 raise
|
jpayne@69
|
250
|
jpayne@69
|
251 strand = property(
|
jpayne@69
|
252 fget=_get_strand,
|
jpayne@69
|
253 fset=_set_strand,
|
jpayne@69
|
254 doc="Alias for the location's strand (DEPRECATED).",
|
jpayne@69
|
255 )
|
jpayne@69
|
256
|
jpayne@69
|
257 def _get_ref(self):
|
jpayne@69
|
258 """Get function for the reference property (PRIVATE)."""
|
jpayne@69
|
259 warnings.warn(
|
jpayne@69
|
260 "Please use .location.ref rather than .ref",
|
jpayne@69
|
261 BiopythonDeprecationWarning,
|
jpayne@69
|
262 )
|
jpayne@69
|
263 try:
|
jpayne@69
|
264 return self.location.ref
|
jpayne@69
|
265 except AttributeError:
|
jpayne@69
|
266 return None
|
jpayne@69
|
267
|
jpayne@69
|
268 def _set_ref(self, value):
|
jpayne@69
|
269 """Set function for the reference property (PRIVATE)."""
|
jpayne@69
|
270 warnings.warn(
|
jpayne@69
|
271 "Please use .location.ref rather than .ref",
|
jpayne@69
|
272 BiopythonDeprecationWarning,
|
jpayne@69
|
273 )
|
jpayne@69
|
274 try:
|
jpayne@69
|
275 self.location.ref = value
|
jpayne@69
|
276 except AttributeError:
|
jpayne@69
|
277 if self.location is None:
|
jpayne@69
|
278 if value is not None:
|
jpayne@69
|
279 raise ValueError("Can't set ref without a location.") from None
|
jpayne@69
|
280 else:
|
jpayne@69
|
281 raise
|
jpayne@69
|
282
|
jpayne@69
|
283 ref = property(
|
jpayne@69
|
284 fget=_get_ref,
|
jpayne@69
|
285 fset=_set_ref,
|
jpayne@69
|
286 doc="Alias for the location's ref (DEPRECATED).",
|
jpayne@69
|
287 )
|
jpayne@69
|
288
|
jpayne@69
|
289 def _get_ref_db(self):
|
jpayne@69
|
290 """Get function for the database reference property (PRIVATE)."""
|
jpayne@69
|
291 warnings.warn(
|
jpayne@69
|
292 "Please use .location.ref_db rather than .ref_db",
|
jpayne@69
|
293 BiopythonDeprecationWarning,
|
jpayne@69
|
294 )
|
jpayne@69
|
295 try:
|
jpayne@69
|
296 return self.location.ref_db
|
jpayne@69
|
297 except AttributeError:
|
jpayne@69
|
298 return None
|
jpayne@69
|
299
|
jpayne@69
|
300 def _set_ref_db(self, value):
|
jpayne@69
|
301 """Set function for the database reference property (PRIVATE)."""
|
jpayne@69
|
302 warnings.warn(
|
jpayne@69
|
303 "Please use .location.ref_db rather than .ref_db",
|
jpayne@69
|
304 BiopythonDeprecationWarning,
|
jpayne@69
|
305 )
|
jpayne@69
|
306 self.location.ref_db = value
|
jpayne@69
|
307
|
jpayne@69
|
308 ref_db = property(
|
jpayne@69
|
309 fget=_get_ref_db,
|
jpayne@69
|
310 fset=_set_ref_db,
|
jpayne@69
|
311 doc="Alias for the location's ref_db (DEPRECATED).",
|
jpayne@69
|
312 )
|
jpayne@69
|
313
|
jpayne@69
|
314 def __eq__(self, other):
|
jpayne@69
|
315 """Check if two SeqFeature objects should be considered equal."""
|
jpayne@69
|
316 return (
|
jpayne@69
|
317 isinstance(other, SeqFeature)
|
jpayne@69
|
318 and self.id == other.id
|
jpayne@69
|
319 and self.type == other.type
|
jpayne@69
|
320 and self.location == other.location
|
jpayne@69
|
321 and self.qualifiers == other.qualifiers
|
jpayne@69
|
322 )
|
jpayne@69
|
323
|
jpayne@69
|
324 def __repr__(self):
|
jpayne@69
|
325 """Represent the feature as a string for debugging."""
|
jpayne@69
|
326 answer = f"{self.__class__.__name__}({self.location!r}"
|
jpayne@69
|
327 if self.type:
|
jpayne@69
|
328 answer += f", type={self.type!r}"
|
jpayne@69
|
329 if self.id and self.id != "<unknown id>":
|
jpayne@69
|
330 answer += f", id={self.id!r}"
|
jpayne@69
|
331 if self.qualifiers:
|
jpayne@69
|
332 answer += ", qualifiers=..."
|
jpayne@69
|
333 answer += ")"
|
jpayne@69
|
334 return answer
|
jpayne@69
|
335
|
jpayne@69
|
336 def __str__(self):
|
jpayne@69
|
337 """Return the full feature as a python string."""
|
jpayne@69
|
338 out = f"type: {self.type}\n"
|
jpayne@69
|
339 out += f"location: {self.location}\n"
|
jpayne@69
|
340 if self.id and self.id != "<unknown id>":
|
jpayne@69
|
341 out += f"id: {self.id}\n"
|
jpayne@69
|
342 out += "qualifiers:\n"
|
jpayne@69
|
343 for qual_key in sorted(self.qualifiers):
|
jpayne@69
|
344 out += f" Key: {qual_key}, Value: {self.qualifiers[qual_key]}\n"
|
jpayne@69
|
345 return out
|
jpayne@69
|
346
|
jpayne@69
|
347 def _shift(self, offset):
|
jpayne@69
|
348 """Return a copy of the feature with its location shifted (PRIVATE).
|
jpayne@69
|
349
|
jpayne@69
|
350 The annotation qualifiers are copied.
|
jpayne@69
|
351 """
|
jpayne@69
|
352 return SeqFeature(
|
jpayne@69
|
353 location=self.location._shift(offset),
|
jpayne@69
|
354 type=self.type,
|
jpayne@69
|
355 id=self.id,
|
jpayne@69
|
356 qualifiers=self.qualifiers.copy(),
|
jpayne@69
|
357 )
|
jpayne@69
|
358
|
jpayne@69
|
359 def _flip(self, length):
|
jpayne@69
|
360 """Return a copy of the feature with its location flipped (PRIVATE).
|
jpayne@69
|
361
|
jpayne@69
|
362 The argument length gives the length of the parent sequence. For
|
jpayne@69
|
363 example a location 0..20 (+1 strand) with parent length 30 becomes
|
jpayne@69
|
364 after flipping 10..30 (-1 strand). Strandless (None) or unknown
|
jpayne@69
|
365 strand (0) remain like that - just their end points are changed.
|
jpayne@69
|
366
|
jpayne@69
|
367 The annotation qualifiers are copied.
|
jpayne@69
|
368 """
|
jpayne@69
|
369 return SeqFeature(
|
jpayne@69
|
370 location=self.location._flip(length),
|
jpayne@69
|
371 type=self.type,
|
jpayne@69
|
372 id=self.id,
|
jpayne@69
|
373 qualifiers=self.qualifiers.copy(),
|
jpayne@69
|
374 )
|
jpayne@69
|
375
|
jpayne@69
|
376 def extract(self, parent_sequence, references=None):
|
jpayne@69
|
377 """Extract the feature's sequence from supplied parent sequence.
|
jpayne@69
|
378
|
jpayne@69
|
379 The parent_sequence can be a Seq like object or a string, and will
|
jpayne@69
|
380 generally return an object of the same type. The exception to this is
|
jpayne@69
|
381 a MutableSeq as the parent sequence will return a Seq object.
|
jpayne@69
|
382
|
jpayne@69
|
383 This should cope with complex locations including complements, joins
|
jpayne@69
|
384 and fuzzy positions. Even mixed strand features should work! This
|
jpayne@69
|
385 also covers features on protein sequences (e.g. domains), although
|
jpayne@69
|
386 here reverse strand features are not permitted. If the
|
jpayne@69
|
387 location refers to other records, they must be supplied in the
|
jpayne@69
|
388 optional dictionary references.
|
jpayne@69
|
389
|
jpayne@69
|
390 >>> from Bio.Seq import Seq
|
jpayne@69
|
391 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
|
jpayne@69
|
392 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
|
jpayne@69
|
393 >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
|
jpayne@69
|
394 >>> f.extract(seq)
|
jpayne@69
|
395 Seq('VALIVIC')
|
jpayne@69
|
396
|
jpayne@69
|
397 If the SimpleLocation is None, e.g. when parsing invalid locus
|
jpayne@69
|
398 locations in the GenBank parser, extract() will raise a ValueError.
|
jpayne@69
|
399
|
jpayne@69
|
400 >>> from Bio.Seq import Seq
|
jpayne@69
|
401 >>> from Bio.SeqFeature import SeqFeature
|
jpayne@69
|
402 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
|
jpayne@69
|
403 >>> f = SeqFeature(None, type="domain")
|
jpayne@69
|
404 >>> f.extract(seq)
|
jpayne@69
|
405 Traceback (most recent call last):
|
jpayne@69
|
406 ...
|
jpayne@69
|
407 ValueError: The feature's .location is None. Check the sequence file for a valid location.
|
jpayne@69
|
408
|
jpayne@69
|
409 Note - currently only compound features of type "join" are supported.
|
jpayne@69
|
410 """
|
jpayne@69
|
411 if self.location is None:
|
jpayne@69
|
412 raise ValueError(
|
jpayne@69
|
413 "The feature's .location is None. Check the "
|
jpayne@69
|
414 "sequence file for a valid location."
|
jpayne@69
|
415 )
|
jpayne@69
|
416 return self.location.extract(parent_sequence, references=references)
|
jpayne@69
|
417
|
jpayne@69
|
418 def translate(
|
jpayne@69
|
419 self,
|
jpayne@69
|
420 parent_sequence,
|
jpayne@69
|
421 table="Standard",
|
jpayne@69
|
422 start_offset=None,
|
jpayne@69
|
423 stop_symbol="*",
|
jpayne@69
|
424 to_stop=False,
|
jpayne@69
|
425 cds=None,
|
jpayne@69
|
426 gap=None,
|
jpayne@69
|
427 ):
|
jpayne@69
|
428 """Get a translation of the feature's sequence.
|
jpayne@69
|
429
|
jpayne@69
|
430 This method is intended for CDS or other features that code proteins
|
jpayne@69
|
431 and is a shortcut that will both extract the feature and
|
jpayne@69
|
432 translate it, taking into account the codon_start and transl_table
|
jpayne@69
|
433 qualifiers, if they are present. If they are not present the
|
jpayne@69
|
434 value of the arguments "table" and "start_offset" are used.
|
jpayne@69
|
435
|
jpayne@69
|
436 The "cds" parameter is set to "True" if the feature is of type
|
jpayne@69
|
437 "CDS" but can be overridden by giving an explicit argument.
|
jpayne@69
|
438
|
jpayne@69
|
439 The arguments stop_symbol, to_stop and gap have the same meaning
|
jpayne@69
|
440 as Seq.translate, refer to that documentation for further information.
|
jpayne@69
|
441
|
jpayne@69
|
442 Arguments:
|
jpayne@69
|
443 - parent_sequence - A DNA or RNA sequence.
|
jpayne@69
|
444 - table - Which codon table to use if there is no transl_table
|
jpayne@69
|
445 qualifier for this feature. This can be either a name
|
jpayne@69
|
446 (string), an NCBI identifier (integer), or a CodonTable
|
jpayne@69
|
447 object (useful for non-standard genetic codes). This
|
jpayne@69
|
448 defaults to the "Standard" table.
|
jpayne@69
|
449 - start_offset - offset at which the first complete codon of a
|
jpayne@69
|
450 coding feature can be found, relative to the first base of
|
jpayne@69
|
451 that feature. Has a valid value of 0, 1 or 2. NOTE: this
|
jpayne@69
|
452 uses python's 0-based numbering whereas the codon_start
|
jpayne@69
|
453 qualifier in files from NCBI use 1-based numbering.
|
jpayne@69
|
454 Will override a codon_start qualifier
|
jpayne@69
|
455
|
jpayne@69
|
456 >>> from Bio.Seq import Seq
|
jpayne@69
|
457 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
|
jpayne@69
|
458 >>> seq = Seq("GGTTACACTTACCGATAATGTCTCTGATGA")
|
jpayne@69
|
459 >>> f = SeqFeature(SimpleLocation(0, 30), type="CDS")
|
jpayne@69
|
460 >>> f.qualifiers['transl_table'] = [11]
|
jpayne@69
|
461
|
jpayne@69
|
462 Note that features of type CDS are subject to the usual
|
jpayne@69
|
463 checks at translation. But you can override this behavior
|
jpayne@69
|
464 by giving explicit arguments:
|
jpayne@69
|
465
|
jpayne@69
|
466 >>> f.translate(seq, cds=False)
|
jpayne@69
|
467 Seq('GYTYR*CL**')
|
jpayne@69
|
468
|
jpayne@69
|
469 Now use the start_offset argument to change the frame. Note
|
jpayne@69
|
470 this uses python 0-based numbering.
|
jpayne@69
|
471
|
jpayne@69
|
472 >>> f.translate(seq, start_offset=1, cds=False)
|
jpayne@69
|
473 Seq('VTLTDNVSD')
|
jpayne@69
|
474
|
jpayne@69
|
475 Alternatively use the codon_start qualifier to do the same
|
jpayne@69
|
476 thing. Note: this uses 1-based numbering, which is found
|
jpayne@69
|
477 in files from NCBI.
|
jpayne@69
|
478
|
jpayne@69
|
479 >>> f.qualifiers['codon_start'] = [2]
|
jpayne@69
|
480 >>> f.translate(seq, cds=False)
|
jpayne@69
|
481 Seq('VTLTDNVSD')
|
jpayne@69
|
482 """
|
jpayne@69
|
483 # see if this feature should be translated in a different
|
jpayne@69
|
484 # frame using the "codon_start" qualifier
|
jpayne@69
|
485 if start_offset is None:
|
jpayne@69
|
486 try:
|
jpayne@69
|
487 start_offset = int(self.qualifiers["codon_start"][0]) - 1
|
jpayne@69
|
488 except KeyError:
|
jpayne@69
|
489 start_offset = 0
|
jpayne@69
|
490
|
jpayne@69
|
491 if start_offset not in [0, 1, 2]:
|
jpayne@69
|
492 raise ValueError(
|
jpayne@69
|
493 "The start_offset must be 0, 1, or 2. "
|
jpayne@69
|
494 f"The supplied value is {start_offset}. "
|
jpayne@69
|
495 "Check the value of either the codon_start qualifier "
|
jpayne@69
|
496 "or the start_offset argument"
|
jpayne@69
|
497 )
|
jpayne@69
|
498
|
jpayne@69
|
499 feat_seq = self.extract(parent_sequence)[start_offset:]
|
jpayne@69
|
500 codon_table = self.qualifiers.get("transl_table", [table])[0]
|
jpayne@69
|
501
|
jpayne@69
|
502 if cds is None:
|
jpayne@69
|
503 cds = self.type == "CDS"
|
jpayne@69
|
504
|
jpayne@69
|
505 return feat_seq.translate(
|
jpayne@69
|
506 table=codon_table,
|
jpayne@69
|
507 stop_symbol=stop_symbol,
|
jpayne@69
|
508 to_stop=to_stop,
|
jpayne@69
|
509 cds=cds,
|
jpayne@69
|
510 gap=gap,
|
jpayne@69
|
511 )
|
jpayne@69
|
512
|
jpayne@69
|
513 def __bool__(self):
|
jpayne@69
|
514 """Boolean value of an instance of this class (True).
|
jpayne@69
|
515
|
jpayne@69
|
516 This behavior is for backwards compatibility, since until the
|
jpayne@69
|
517 __len__ method was added, a SeqFeature always evaluated as True.
|
jpayne@69
|
518
|
jpayne@69
|
519 Note that in comparison, Seq objects, strings, lists, etc, will all
|
jpayne@69
|
520 evaluate to False if they have length zero.
|
jpayne@69
|
521
|
jpayne@69
|
522 WARNING: The SeqFeature may in future evaluate to False when its
|
jpayne@69
|
523 length is zero (in order to better match normal python behavior)!
|
jpayne@69
|
524 """
|
jpayne@69
|
525 return True
|
jpayne@69
|
526
|
jpayne@69
|
527 def __len__(self):
|
jpayne@69
|
528 """Return the length of the region where the feature is located.
|
jpayne@69
|
529
|
jpayne@69
|
530 >>> from Bio.Seq import Seq
|
jpayne@69
|
531 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
|
jpayne@69
|
532 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
|
jpayne@69
|
533 >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
|
jpayne@69
|
534 >>> len(f)
|
jpayne@69
|
535 7
|
jpayne@69
|
536 >>> f.extract(seq)
|
jpayne@69
|
537 Seq('VALIVIC')
|
jpayne@69
|
538 >>> len(f.extract(seq))
|
jpayne@69
|
539 7
|
jpayne@69
|
540
|
jpayne@69
|
541 This is a proxy for taking the length of the feature's location:
|
jpayne@69
|
542
|
jpayne@69
|
543 >>> len(f.location)
|
jpayne@69
|
544 7
|
jpayne@69
|
545
|
jpayne@69
|
546 For simple features this is the same as the region spanned (end
|
jpayne@69
|
547 position minus start position using Pythonic counting). However, for
|
jpayne@69
|
548 a compound location (e.g. a CDS as the join of several exons) the
|
jpayne@69
|
549 gaps are not counted (e.g. introns). This ensures that len(f) matches
|
jpayne@69
|
550 len(f.extract(parent_seq)), and also makes sure things work properly
|
jpayne@69
|
551 with features wrapping the origin etc.
|
jpayne@69
|
552 """
|
jpayne@69
|
553 return len(self.location)
|
jpayne@69
|
554
|
jpayne@69
|
555 def __iter__(self):
|
jpayne@69
|
556 """Iterate over the parent positions within the feature.
|
jpayne@69
|
557
|
jpayne@69
|
558 The iteration order is strand aware, and can be thought of as moving
|
jpayne@69
|
559 along the feature using the parent sequence coordinates:
|
jpayne@69
|
560
|
jpayne@69
|
561 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
|
jpayne@69
|
562 >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
|
jpayne@69
|
563 >>> len(f)
|
jpayne@69
|
564 5
|
jpayne@69
|
565 >>> for i in f: print(i)
|
jpayne@69
|
566 9
|
jpayne@69
|
567 8
|
jpayne@69
|
568 7
|
jpayne@69
|
569 6
|
jpayne@69
|
570 5
|
jpayne@69
|
571 >>> list(f)
|
jpayne@69
|
572 [9, 8, 7, 6, 5]
|
jpayne@69
|
573
|
jpayne@69
|
574 This is a proxy for iterating over the location,
|
jpayne@69
|
575
|
jpayne@69
|
576 >>> list(f.location)
|
jpayne@69
|
577 [9, 8, 7, 6, 5]
|
jpayne@69
|
578 """
|
jpayne@69
|
579 return iter(self.location)
|
jpayne@69
|
580
|
jpayne@69
|
581 def __contains__(self, value):
|
jpayne@69
|
582 """Check if an integer position is within the feature.
|
jpayne@69
|
583
|
jpayne@69
|
584 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
|
jpayne@69
|
585 >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
|
jpayne@69
|
586 >>> len(f)
|
jpayne@69
|
587 5
|
jpayne@69
|
588 >>> [i for i in range(15) if i in f]
|
jpayne@69
|
589 [5, 6, 7, 8, 9]
|
jpayne@69
|
590
|
jpayne@69
|
591 For example, to see which features include a SNP position, you could
|
jpayne@69
|
592 use this:
|
jpayne@69
|
593
|
jpayne@69
|
594 >>> from Bio import SeqIO
|
jpayne@69
|
595 >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
|
jpayne@69
|
596 >>> for f in record.features:
|
jpayne@69
|
597 ... if 1750 in f:
|
jpayne@69
|
598 ... print("%s %s" % (f.type, f.location))
|
jpayne@69
|
599 source [0:154478](+)
|
jpayne@69
|
600 gene [1716:4347](-)
|
jpayne@69
|
601 tRNA join{[4310:4347](-), [1716:1751](-)}
|
jpayne@69
|
602
|
jpayne@69
|
603 Note that for a feature defined as a join of several subfeatures (e.g.
|
jpayne@69
|
604 the union of several exons) the gaps are not checked (e.g. introns).
|
jpayne@69
|
605 In this example, the tRNA location is defined in the GenBank file as
|
jpayne@69
|
606 complement(join(1717..1751,4311..4347)), so that position 1760 falls
|
jpayne@69
|
607 in the gap:
|
jpayne@69
|
608
|
jpayne@69
|
609 >>> for f in record.features:
|
jpayne@69
|
610 ... if 1760 in f:
|
jpayne@69
|
611 ... print("%s %s" % (f.type, f.location))
|
jpayne@69
|
612 source [0:154478](+)
|
jpayne@69
|
613 gene [1716:4347](-)
|
jpayne@69
|
614
|
jpayne@69
|
615 Note that additional care may be required with fuzzy locations, for
|
jpayne@69
|
616 example just before a BeforePosition:
|
jpayne@69
|
617
|
jpayne@69
|
618 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
|
jpayne@69
|
619 >>> from Bio.SeqFeature import BeforePosition
|
jpayne@69
|
620 >>> f = SeqFeature(SimpleLocation(BeforePosition(3), 8), type="domain")
|
jpayne@69
|
621 >>> len(f)
|
jpayne@69
|
622 5
|
jpayne@69
|
623 >>> [i for i in range(10) if i in f]
|
jpayne@69
|
624 [3, 4, 5, 6, 7]
|
jpayne@69
|
625
|
jpayne@69
|
626 Note that is is a proxy for testing membership on the location.
|
jpayne@69
|
627
|
jpayne@69
|
628 >>> [i for i in range(10) if i in f.location]
|
jpayne@69
|
629 [3, 4, 5, 6, 7]
|
jpayne@69
|
630 """
|
jpayne@69
|
631 return value in self.location
|
jpayne@69
|
632
|
jpayne@69
|
633
|
jpayne@69
|
634 # --- References
|
jpayne@69
|
635
|
jpayne@69
|
636
|
jpayne@69
|
637 # TODO -- Will this hold PubMed and Medline information decently?
|
jpayne@69
|
638 class Reference:
|
jpayne@69
|
639 """Represent a Generic Reference object.
|
jpayne@69
|
640
|
jpayne@69
|
641 Attributes:
|
jpayne@69
|
642 - location - A list of Location objects specifying regions of
|
jpayne@69
|
643 the sequence that the references correspond to. If no locations are
|
jpayne@69
|
644 specified, the entire sequence is assumed.
|
jpayne@69
|
645 - authors - A big old string, or a list split by author, of authors
|
jpayne@69
|
646 for the reference.
|
jpayne@69
|
647 - title - The title of the reference.
|
jpayne@69
|
648 - journal - Journal the reference was published in.
|
jpayne@69
|
649 - medline_id - A medline reference for the article.
|
jpayne@69
|
650 - pubmed_id - A pubmed reference for the article.
|
jpayne@69
|
651 - comment - A place to stick any comments about the reference.
|
jpayne@69
|
652
|
jpayne@69
|
653 """
|
jpayne@69
|
654
|
jpayne@69
|
655 def __init__(self):
|
jpayne@69
|
656 """Initialize the class."""
|
jpayne@69
|
657 self.location = []
|
jpayne@69
|
658 self.authors = ""
|
jpayne@69
|
659 self.consrtm = ""
|
jpayne@69
|
660 self.title = ""
|
jpayne@69
|
661 self.journal = ""
|
jpayne@69
|
662 self.medline_id = ""
|
jpayne@69
|
663 self.pubmed_id = ""
|
jpayne@69
|
664 self.comment = ""
|
jpayne@69
|
665
|
jpayne@69
|
666 def __str__(self):
|
jpayne@69
|
667 """Return the full Reference object as a python string."""
|
jpayne@69
|
668 out = ""
|
jpayne@69
|
669 for single_location in self.location:
|
jpayne@69
|
670 out += f"location: {single_location}\n"
|
jpayne@69
|
671 out += f"authors: {self.authors}\n"
|
jpayne@69
|
672 if self.consrtm:
|
jpayne@69
|
673 out += f"consrtm: {self.consrtm}\n"
|
jpayne@69
|
674 out += f"title: {self.title}\n"
|
jpayne@69
|
675 out += f"journal: {self.journal}\n"
|
jpayne@69
|
676 out += f"medline id: {self.medline_id}\n"
|
jpayne@69
|
677 out += f"pubmed id: {self.pubmed_id}\n"
|
jpayne@69
|
678 out += f"comment: {self.comment}\n"
|
jpayne@69
|
679 return out
|
jpayne@69
|
680
|
jpayne@69
|
681 def __repr__(self):
|
jpayne@69
|
682 """Represent the Reference object as a string for debugging."""
|
jpayne@69
|
683 # TODO - Update this is __init__ later accepts values
|
jpayne@69
|
684 return f"{self.__class__.__name__}(title={self.title!r}, ...)"
|
jpayne@69
|
685
|
jpayne@69
|
686 def __eq__(self, other):
|
jpayne@69
|
687 """Check if two Reference objects should be considered equal.
|
jpayne@69
|
688
|
jpayne@69
|
689 Note prior to Biopython 1.70 the location was not compared, as
|
jpayne@69
|
690 until then __eq__ for the SimpleLocation class was not defined.
|
jpayne@69
|
691 """
|
jpayne@69
|
692 return (
|
jpayne@69
|
693 self.authors == other.authors
|
jpayne@69
|
694 and self.consrtm == other.consrtm
|
jpayne@69
|
695 and self.title == other.title
|
jpayne@69
|
696 and self.journal == other.journal
|
jpayne@69
|
697 and self.medline_id == other.medline_id
|
jpayne@69
|
698 and self.pubmed_id == other.pubmed_id
|
jpayne@69
|
699 and self.comment == other.comment
|
jpayne@69
|
700 and self.location == other.location
|
jpayne@69
|
701 )
|
jpayne@69
|
702
|
jpayne@69
|
703
|
jpayne@69
|
704 # --- Handling feature locations
|
jpayne@69
|
705
|
jpayne@69
|
706
|
jpayne@69
|
707 class Location(ABC):
|
jpayne@69
|
708 """Abstract base class representing a location."""
|
jpayne@69
|
709
|
jpayne@69
|
710 @abstractmethod
|
jpayne@69
|
711 def __repr__(self):
|
jpayne@69
|
712 """Represent the Location object as a string for debugging."""
|
jpayne@69
|
713 return f"{self.__class__.__name__}(...)"
|
jpayne@69
|
714
|
jpayne@69
|
715 def fromstring(text, length=None, circular=False, stranded=True):
|
jpayne@69
|
716 """Create a Location object from a string.
|
jpayne@69
|
717
|
jpayne@69
|
718 This should accept any valid location string in the INSDC Feature Table
|
jpayne@69
|
719 format (https://www.insdc.org/submitting-standards/feature-table/) as
|
jpayne@69
|
720 used in GenBank, DDBJ and EMBL files.
|
jpayne@69
|
721
|
jpayne@69
|
722 Simple examples:
|
jpayne@69
|
723
|
jpayne@69
|
724 >>> Location.fromstring("123..456", 1000)
|
jpayne@69
|
725 SimpleLocation(ExactPosition(122), ExactPosition(456), strand=1)
|
jpayne@69
|
726 >>> Location.fromstring("complement(<123..>456)", 1000)
|
jpayne@69
|
727 SimpleLocation(BeforePosition(122), AfterPosition(456), strand=-1)
|
jpayne@69
|
728
|
jpayne@69
|
729 A more complex location using within positions,
|
jpayne@69
|
730
|
jpayne@69
|
731 >>> Location.fromstring("(9.10)..(20.25)", 1000)
|
jpayne@69
|
732 SimpleLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)
|
jpayne@69
|
733
|
jpayne@69
|
734 Notice how that will act as though it has overall start 8 and end 25.
|
jpayne@69
|
735
|
jpayne@69
|
736 Zero length between feature,
|
jpayne@69
|
737
|
jpayne@69
|
738 >>> Location.fromstring("123^124", 1000)
|
jpayne@69
|
739 SimpleLocation(ExactPosition(123), ExactPosition(123), strand=1)
|
jpayne@69
|
740
|
jpayne@69
|
741 The expected sequence length is needed for a special case, a between
|
jpayne@69
|
742 position at the start/end of a circular genome:
|
jpayne@69
|
743
|
jpayne@69
|
744 >>> Location.fromstring("1000^1", 1000)
|
jpayne@69
|
745 SimpleLocation(ExactPosition(1000), ExactPosition(1000), strand=1)
|
jpayne@69
|
746
|
jpayne@69
|
747 Apart from this special case, between positions P^Q must have P+1==Q,
|
jpayne@69
|
748
|
jpayne@69
|
749 >>> Location.fromstring("123^456", 1000)
|
jpayne@69
|
750 Traceback (most recent call last):
|
jpayne@69
|
751 ...
|
jpayne@69
|
752 Bio.SeqFeature.LocationParserError: invalid feature location '123^456'
|
jpayne@69
|
753
|
jpayne@69
|
754 You can optionally provide a reference name:
|
jpayne@69
|
755
|
jpayne@69
|
756 >>> Location.fromstring("AL391218.9:105173..108462", 2000000)
|
jpayne@69
|
757 SimpleLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')
|
jpayne@69
|
758
|
jpayne@69
|
759 >>> Location.fromstring("<2644..159", 2868, "circular")
|
jpayne@69
|
760 CompoundLocation([SimpleLocation(BeforePosition(2643), ExactPosition(2868), strand=1), SimpleLocation(ExactPosition(0), ExactPosition(159), strand=1)], 'join')
|
jpayne@69
|
761 """
|
jpayne@69
|
762 if text.startswith("complement("):
|
jpayne@69
|
763 if text[-1] != ")":
|
jpayne@69
|
764 raise ValueError(f"closing bracket missing in '{text}'")
|
jpayne@69
|
765 text = text[11:-1]
|
jpayne@69
|
766 strand = -1
|
jpayne@69
|
767 elif stranded:
|
jpayne@69
|
768 strand = 1
|
jpayne@69
|
769 else:
|
jpayne@69
|
770 strand = None
|
jpayne@69
|
771
|
jpayne@69
|
772 # Determine if we have a simple location or a compound location
|
jpayne@69
|
773 if text.startswith("join("):
|
jpayne@69
|
774 operator = "join"
|
jpayne@69
|
775 parts = _split(text[5:-1])[1::2]
|
jpayne@69
|
776 # assert parts[0] == "" and parts[-1] == ""
|
jpayne@69
|
777 elif text.startswith("order("):
|
jpayne@69
|
778 operator = "order"
|
jpayne@69
|
779 parts = _split(text[6:-1])[1::2]
|
jpayne@69
|
780 # assert parts[0] == "" and parts[-1] == ""
|
jpayne@69
|
781 elif text.startswith("bond("):
|
jpayne@69
|
782 operator = "bond"
|
jpayne@69
|
783 parts = _split(text[5:-1])[1::2]
|
jpayne@69
|
784 # assert parts[0] == "" and parts[-1] == ""
|
jpayne@69
|
785 else:
|
jpayne@69
|
786 loc = SimpleLocation.fromstring(text, length, circular)
|
jpayne@69
|
787 loc.strand = strand
|
jpayne@69
|
788 if strand == -1:
|
jpayne@69
|
789 loc.parts.reverse()
|
jpayne@69
|
790 return loc
|
jpayne@69
|
791 locs = []
|
jpayne@69
|
792 for part in parts:
|
jpayne@69
|
793 loc = SimpleLocation.fromstring(part, length, circular)
|
jpayne@69
|
794 if loc is None:
|
jpayne@69
|
795 break
|
jpayne@69
|
796 if loc.strand == -1:
|
jpayne@69
|
797 if strand == -1:
|
jpayne@69
|
798 raise LocationParserError("double complement in '{text}'?")
|
jpayne@69
|
799 else:
|
jpayne@69
|
800 loc.strand = strand
|
jpayne@69
|
801 locs.extend(loc.parts)
|
jpayne@69
|
802 else:
|
jpayne@69
|
803 if len(locs) == 1:
|
jpayne@69
|
804 return loc
|
jpayne@69
|
805 # Historically a join on the reverse strand has been represented
|
jpayne@69
|
806 # in Biopython with both the parent SeqFeature and its children
|
jpayne@69
|
807 # (the exons for a CDS) all given a strand of -1. Likewise, for
|
jpayne@69
|
808 # a join feature on the forward strand they all have strand +1.
|
jpayne@69
|
809 # However, we must also consider evil mixed strand examples like
|
jpayne@69
|
810 # this, join(complement(69611..69724),139856..140087,140625..140650)
|
jpayne@69
|
811 if strand == -1:
|
jpayne@69
|
812 # Whole thing was wrapped in complement(...)
|
jpayne@69
|
813 for loc in locs:
|
jpayne@69
|
814 assert loc.strand == -1
|
jpayne@69
|
815 # Reverse the backwards order used in GenBank files
|
jpayne@69
|
816 # with complement(join(...))
|
jpayne@69
|
817 locs = locs[::-1]
|
jpayne@69
|
818 return CompoundLocation(locs, operator=operator)
|
jpayne@69
|
819 # Not recognized
|
jpayne@69
|
820 if "order" in text and "join" in text:
|
jpayne@69
|
821 # See Bug 3197
|
jpayne@69
|
822 raise LocationParserError(
|
jpayne@69
|
823 f"failed to parse feature location '{text}' containing a combination of 'join' and 'order' (nested operators) are illegal"
|
jpayne@69
|
824 )
|
jpayne@69
|
825
|
jpayne@69
|
826 # See issue #937. Note that NCBI has already fixed this record.
|
jpayne@69
|
827 if ",)" in text:
|
jpayne@69
|
828 warnings.warn(
|
jpayne@69
|
829 "Dropping trailing comma in malformed feature location",
|
jpayne@69
|
830 BiopythonParserWarning,
|
jpayne@69
|
831 )
|
jpayne@69
|
832 text = text.replace(",)", ")")
|
jpayne@69
|
833 return Location.fromstring(text)
|
jpayne@69
|
834
|
jpayne@69
|
835 raise LocationParserError(f"failed to parse feature location '{text}'")
|
jpayne@69
|
836
|
jpayne@69
|
837
|
jpayne@69
|
838 class SimpleLocation(Location):
|
jpayne@69
|
839 """Specify the location of a feature along a sequence.
|
jpayne@69
|
840
|
jpayne@69
|
841 The SimpleLocation is used for simple continuous features, which can
|
jpayne@69
|
842 be described as running from a start position to and end position
|
jpayne@69
|
843 (optionally with a strand and reference information). More complex
|
jpayne@69
|
844 locations made up from several non-continuous parts (e.g. a coding
|
jpayne@69
|
845 sequence made up of several exons) are described using a SeqFeature
|
jpayne@69
|
846 with a CompoundLocation.
|
jpayne@69
|
847
|
jpayne@69
|
848 Note that the start and end location numbering follow Python's scheme,
|
jpayne@69
|
849 thus a GenBank entry of 123..150 (one based counting) becomes a location
|
jpayne@69
|
850 of [122:150] (zero based counting).
|
jpayne@69
|
851
|
jpayne@69
|
852 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
853 >>> f = SimpleLocation(122, 150)
|
jpayne@69
|
854 >>> print(f)
|
jpayne@69
|
855 [122:150]
|
jpayne@69
|
856 >>> print(f.start)
|
jpayne@69
|
857 122
|
jpayne@69
|
858 >>> print(f.end)
|
jpayne@69
|
859 150
|
jpayne@69
|
860 >>> print(f.strand)
|
jpayne@69
|
861 None
|
jpayne@69
|
862
|
jpayne@69
|
863 Note the strand defaults to None. If you are working with nucleotide
|
jpayne@69
|
864 sequences you'd want to be explicit if it is the forward strand:
|
jpayne@69
|
865
|
jpayne@69
|
866 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
867 >>> f = SimpleLocation(122, 150, strand=+1)
|
jpayne@69
|
868 >>> print(f)
|
jpayne@69
|
869 [122:150](+)
|
jpayne@69
|
870 >>> print(f.strand)
|
jpayne@69
|
871 1
|
jpayne@69
|
872
|
jpayne@69
|
873 Note that for a parent sequence of length n, the SimpleLocation
|
jpayne@69
|
874 start and end must satisfy the inequality 0 <= start <= end <= n.
|
jpayne@69
|
875 This means even for features on the reverse strand of a nucleotide
|
jpayne@69
|
876 sequence, we expect the 'start' coordinate to be less than the
|
jpayne@69
|
877 'end'.
|
jpayne@69
|
878
|
jpayne@69
|
879 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
880 >>> r = SimpleLocation(122, 150, strand=-1)
|
jpayne@69
|
881 >>> print(r)
|
jpayne@69
|
882 [122:150](-)
|
jpayne@69
|
883 >>> print(r.start)
|
jpayne@69
|
884 122
|
jpayne@69
|
885 >>> print(r.end)
|
jpayne@69
|
886 150
|
jpayne@69
|
887 >>> print(r.strand)
|
jpayne@69
|
888 -1
|
jpayne@69
|
889
|
jpayne@69
|
890 i.e. Rather than thinking of the 'start' and 'end' biologically in a
|
jpayne@69
|
891 strand aware manner, think of them as the 'left most' or 'minimum'
|
jpayne@69
|
892 boundary, and the 'right most' or 'maximum' boundary of the region
|
jpayne@69
|
893 being described. This is particularly important with compound
|
jpayne@69
|
894 locations describing non-continuous regions.
|
jpayne@69
|
895
|
jpayne@69
|
896 In the example above we have used standard exact positions, but there
|
jpayne@69
|
897 are also specialised position objects used to represent fuzzy positions
|
jpayne@69
|
898 as well, for example a GenBank location like complement(<123..150)
|
jpayne@69
|
899 would use a BeforePosition object for the start.
|
jpayne@69
|
900 """
|
jpayne@69
|
901
|
jpayne@69
|
902 def __init__(self, start, end, strand=None, ref=None, ref_db=None):
|
jpayne@69
|
903 """Initialize the class.
|
jpayne@69
|
904
|
jpayne@69
|
905 start and end arguments specify the values where the feature begins
|
jpayne@69
|
906 and ends. These can either by any of the ``*Position`` objects that
|
jpayne@69
|
907 inherit from Position, or can just be integers specifying the position.
|
jpayne@69
|
908 In the case of integers, the values are assumed to be exact and are
|
jpayne@69
|
909 converted in ExactPosition arguments. This is meant to make it easy
|
jpayne@69
|
910 to deal with non-fuzzy ends.
|
jpayne@69
|
911
|
jpayne@69
|
912 i.e. Short form:
|
jpayne@69
|
913
|
jpayne@69
|
914 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
915 >>> loc = SimpleLocation(5, 10, strand=-1)
|
jpayne@69
|
916 >>> print(loc)
|
jpayne@69
|
917 [5:10](-)
|
jpayne@69
|
918
|
jpayne@69
|
919 Explicit form:
|
jpayne@69
|
920
|
jpayne@69
|
921 >>> from Bio.SeqFeature import SimpleLocation, ExactPosition
|
jpayne@69
|
922 >>> loc = SimpleLocation(ExactPosition(5), ExactPosition(10), strand=-1)
|
jpayne@69
|
923 >>> print(loc)
|
jpayne@69
|
924 [5:10](-)
|
jpayne@69
|
925
|
jpayne@69
|
926 Other fuzzy positions are used similarly,
|
jpayne@69
|
927
|
jpayne@69
|
928 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
929 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
|
jpayne@69
|
930 >>> loc2 = SimpleLocation(BeforePosition(5), AfterPosition(10), strand=-1)
|
jpayne@69
|
931 >>> print(loc2)
|
jpayne@69
|
932 [<5:>10](-)
|
jpayne@69
|
933
|
jpayne@69
|
934 For nucleotide features you will also want to specify the strand,
|
jpayne@69
|
935 use 1 for the forward (plus) strand, -1 for the reverse (negative)
|
jpayne@69
|
936 strand, 0 for stranded but strand unknown (? in GFF3), or None for
|
jpayne@69
|
937 when the strand does not apply (dot in GFF3), e.g. features on
|
jpayne@69
|
938 proteins.
|
jpayne@69
|
939
|
jpayne@69
|
940 >>> loc = SimpleLocation(5, 10, strand=+1)
|
jpayne@69
|
941 >>> print(loc)
|
jpayne@69
|
942 [5:10](+)
|
jpayne@69
|
943 >>> print(loc.strand)
|
jpayne@69
|
944 1
|
jpayne@69
|
945
|
jpayne@69
|
946 Normally feature locations are given relative to the parent
|
jpayne@69
|
947 sequence you are working with, but an explicit accession can
|
jpayne@69
|
948 be given with the optional ref and db_ref strings:
|
jpayne@69
|
949
|
jpayne@69
|
950 >>> loc = SimpleLocation(105172, 108462, ref="AL391218.9", strand=1)
|
jpayne@69
|
951 >>> print(loc)
|
jpayne@69
|
952 AL391218.9[105172:108462](+)
|
jpayne@69
|
953 >>> print(loc.ref)
|
jpayne@69
|
954 AL391218.9
|
jpayne@69
|
955
|
jpayne@69
|
956 """
|
jpayne@69
|
957 # TODO - Check 0 <= start <= end (<= length of reference)
|
jpayne@69
|
958 if isinstance(start, Position):
|
jpayne@69
|
959 self._start = start
|
jpayne@69
|
960 elif isinstance(start, int):
|
jpayne@69
|
961 self._start = ExactPosition(start)
|
jpayne@69
|
962 else:
|
jpayne@69
|
963 raise TypeError(f"start={start!r} {type(start)}")
|
jpayne@69
|
964 if isinstance(end, Position):
|
jpayne@69
|
965 self._end = end
|
jpayne@69
|
966 elif isinstance(end, int):
|
jpayne@69
|
967 self._end = ExactPosition(end)
|
jpayne@69
|
968 else:
|
jpayne@69
|
969 raise TypeError(f"end={end!r} {type(end)}")
|
jpayne@69
|
970 if (
|
jpayne@69
|
971 isinstance(self.start, int)
|
jpayne@69
|
972 and isinstance(self.end, int)
|
jpayne@69
|
973 and self.start > self.end
|
jpayne@69
|
974 ):
|
jpayne@69
|
975 raise ValueError(
|
jpayne@69
|
976 f"End location ({self.end}) must be greater than "
|
jpayne@69
|
977 f"or equal to start location ({self.start})"
|
jpayne@69
|
978 )
|
jpayne@69
|
979 self.strand = strand
|
jpayne@69
|
980 self.ref = ref
|
jpayne@69
|
981 self.ref_db = ref_db
|
jpayne@69
|
982
|
jpayne@69
|
983 @staticmethod
|
jpayne@69
|
984 def fromstring(text, length=None, circular=False):
|
jpayne@69
|
985 """Create a SimpleLocation object from a string."""
|
jpayne@69
|
986 if text.startswith("complement("):
|
jpayne@69
|
987 text = text[11:-1]
|
jpayne@69
|
988 strand = -1
|
jpayne@69
|
989 else:
|
jpayne@69
|
990 strand = None
|
jpayne@69
|
991 # Try simple cases first for speed
|
jpayne@69
|
992 try:
|
jpayne@69
|
993 s, e = text.split("..")
|
jpayne@69
|
994 s = int(s) - 1
|
jpayne@69
|
995 e = int(e)
|
jpayne@69
|
996 except ValueError:
|
jpayne@69
|
997 pass
|
jpayne@69
|
998 else:
|
jpayne@69
|
999 if 0 <= s < e:
|
jpayne@69
|
1000 return SimpleLocation(s, e, strand)
|
jpayne@69
|
1001 # Try general case
|
jpayne@69
|
1002 try:
|
jpayne@69
|
1003 ref, text = text.split(":")
|
jpayne@69
|
1004 except ValueError:
|
jpayne@69
|
1005 ref = None
|
jpayne@69
|
1006 m = _re_location_category.match(text)
|
jpayne@69
|
1007 if m is None:
|
jpayne@69
|
1008 raise LocationParserError(f"Could not parse feature location '{text}'")
|
jpayne@69
|
1009 for key, value in m.groupdict().items():
|
jpayne@69
|
1010 if value is not None:
|
jpayne@69
|
1011 break
|
jpayne@69
|
1012 assert value == text
|
jpayne@69
|
1013 if key == "bond":
|
jpayne@69
|
1014 # e.g. bond(196)
|
jpayne@69
|
1015 warnings.warn(
|
jpayne@69
|
1016 "Dropping bond qualifier in feature location",
|
jpayne@69
|
1017 BiopythonParserWarning,
|
jpayne@69
|
1018 )
|
jpayne@69
|
1019 text = text[5:-1]
|
jpayne@69
|
1020 s_pos = Position.fromstring(text, -1)
|
jpayne@69
|
1021 e_pos = Position.fromstring(text)
|
jpayne@69
|
1022 elif key == "solo":
|
jpayne@69
|
1023 # e.g. "123"
|
jpayne@69
|
1024 s_pos = Position.fromstring(text, -1)
|
jpayne@69
|
1025 e_pos = Position.fromstring(text)
|
jpayne@69
|
1026 elif key in ("pair", "within", "oneof"):
|
jpayne@69
|
1027 s, e = text.split("..")
|
jpayne@69
|
1028 # Attempt to fix features that span the origin
|
jpayne@69
|
1029 s_pos = Position.fromstring(s, -1)
|
jpayne@69
|
1030 e_pos = Position.fromstring(e)
|
jpayne@69
|
1031 if s_pos >= e_pos:
|
jpayne@69
|
1032 # There is likely a problem with origin wrapping.
|
jpayne@69
|
1033 # Create a CompoundLocation of the wrapped feature,
|
jpayne@69
|
1034 # consisting of two SimpleLocation objects to extend to
|
jpayne@69
|
1035 # the list of feature locations.
|
jpayne@69
|
1036 if not circular:
|
jpayne@69
|
1037 raise LocationParserError(
|
jpayne@69
|
1038 f"it appears that '{text}' is a feature that spans the origin, but the sequence topology is undefined"
|
jpayne@69
|
1039 )
|
jpayne@69
|
1040 warnings.warn(
|
jpayne@69
|
1041 "Attempting to fix invalid location %r as "
|
jpayne@69
|
1042 "it looks like incorrect origin wrapping. "
|
jpayne@69
|
1043 "Please fix input file, this could have "
|
jpayne@69
|
1044 "unintended behavior." % text,
|
jpayne@69
|
1045 BiopythonParserWarning,
|
jpayne@69
|
1046 )
|
jpayne@69
|
1047
|
jpayne@69
|
1048 f1 = SimpleLocation(s_pos, length, strand)
|
jpayne@69
|
1049 f2 = SimpleLocation(0, e_pos, strand)
|
jpayne@69
|
1050
|
jpayne@69
|
1051 if strand == -1:
|
jpayne@69
|
1052 # For complementary features spanning the origin
|
jpayne@69
|
1053 return f2 + f1
|
jpayne@69
|
1054 else:
|
jpayne@69
|
1055 return f1 + f2
|
jpayne@69
|
1056 elif key == "between":
|
jpayne@69
|
1057 # A between location like "67^68" (one based counting) is a
|
jpayne@69
|
1058 # special case (note it has zero length). In python slice
|
jpayne@69
|
1059 # notation this is 67:67, a zero length slice. See Bug 2622
|
jpayne@69
|
1060 # Further more, on a circular genome of length N you can have
|
jpayne@69
|
1061 # a location N^1 meaning the junction at the origin. See Bug 3098.
|
jpayne@69
|
1062 # NOTE - We can imagine between locations like "2^4", but this
|
jpayne@69
|
1063 # is just "3". Similarly, "2^5" is just "3..4"
|
jpayne@69
|
1064 s, e = text.split("^")
|
jpayne@69
|
1065 s = int(s)
|
jpayne@69
|
1066 e = int(e)
|
jpayne@69
|
1067 if s + 1 == e or (s == length and e == 1):
|
jpayne@69
|
1068 s_pos = ExactPosition(s)
|
jpayne@69
|
1069 e_pos = s_pos
|
jpayne@69
|
1070 else:
|
jpayne@69
|
1071 raise LocationParserError(f"invalid feature location '{text}'")
|
jpayne@69
|
1072 if s_pos < 0:
|
jpayne@69
|
1073 raise LocationParserError(
|
jpayne@69
|
1074 f"negative starting position in feature location '{text}'"
|
jpayne@69
|
1075 )
|
jpayne@69
|
1076 return SimpleLocation(s_pos, e_pos, strand, ref=ref)
|
jpayne@69
|
1077
|
jpayne@69
|
1078 def _get_strand(self):
|
jpayne@69
|
1079 """Get function for the strand property (PRIVATE)."""
|
jpayne@69
|
1080 return self._strand
|
jpayne@69
|
1081
|
jpayne@69
|
1082 def _set_strand(self, value):
|
jpayne@69
|
1083 """Set function for the strand property (PRIVATE)."""
|
jpayne@69
|
1084 if value not in [+1, -1, 0, None]:
|
jpayne@69
|
1085 raise ValueError(f"Strand should be +1, -1, 0 or None, not {value!r}")
|
jpayne@69
|
1086 self._strand = value
|
jpayne@69
|
1087
|
jpayne@69
|
1088 strand = property(
|
jpayne@69
|
1089 fget=_get_strand,
|
jpayne@69
|
1090 fset=_set_strand,
|
jpayne@69
|
1091 doc="Strand of the location (+1, -1, 0 or None).",
|
jpayne@69
|
1092 )
|
jpayne@69
|
1093
|
jpayne@69
|
1094 def __str__(self):
|
jpayne@69
|
1095 """Return a representation of the SimpleLocation object (with python counting).
|
jpayne@69
|
1096
|
jpayne@69
|
1097 For the simple case this uses the python splicing syntax, [122:150]
|
jpayne@69
|
1098 (zero based counting) which GenBank would call 123..150 (one based
|
jpayne@69
|
1099 counting).
|
jpayne@69
|
1100 """
|
jpayne@69
|
1101 answer = f"[{self._start}:{self._end}]"
|
jpayne@69
|
1102 if self.ref and self.ref_db:
|
jpayne@69
|
1103 answer = f"{self.ref_db}:{self.ref}{answer}"
|
jpayne@69
|
1104 elif self.ref:
|
jpayne@69
|
1105 answer = self.ref + answer
|
jpayne@69
|
1106 # Is ref_db without ref meaningful?
|
jpayne@69
|
1107 if self.strand is None:
|
jpayne@69
|
1108 return answer
|
jpayne@69
|
1109 elif self.strand == +1:
|
jpayne@69
|
1110 return answer + "(+)"
|
jpayne@69
|
1111 elif self.strand == -1:
|
jpayne@69
|
1112 return answer + "(-)"
|
jpayne@69
|
1113 else:
|
jpayne@69
|
1114 # strand = 0, stranded but strand unknown, ? in GFF3
|
jpayne@69
|
1115 return answer + "(?)"
|
jpayne@69
|
1116
|
jpayne@69
|
1117 def __repr__(self):
|
jpayne@69
|
1118 """Represent the SimpleLocation object as a string for debugging."""
|
jpayne@69
|
1119 optional = ""
|
jpayne@69
|
1120 if self.strand is not None:
|
jpayne@69
|
1121 optional += f", strand={self.strand!r}"
|
jpayne@69
|
1122 if self.ref is not None:
|
jpayne@69
|
1123 optional += f", ref={self.ref!r}"
|
jpayne@69
|
1124 if self.ref_db is not None:
|
jpayne@69
|
1125 optional += f", ref_db={self.ref_db!r}"
|
jpayne@69
|
1126 return f"{self.__class__.__name__}({self.start!r}, {self.end!r}{optional})"
|
jpayne@69
|
1127
|
jpayne@69
|
1128 def __add__(self, other):
|
jpayne@69
|
1129 """Combine location with another SimpleLocation object, or shift it.
|
jpayne@69
|
1130
|
jpayne@69
|
1131 You can add two feature locations to make a join CompoundLocation:
|
jpayne@69
|
1132
|
jpayne@69
|
1133 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
1134 >>> f1 = SimpleLocation(5, 10)
|
jpayne@69
|
1135 >>> f2 = SimpleLocation(20, 30)
|
jpayne@69
|
1136 >>> combined = f1 + f2
|
jpayne@69
|
1137 >>> print(combined)
|
jpayne@69
|
1138 join{[5:10], [20:30]}
|
jpayne@69
|
1139
|
jpayne@69
|
1140 This is thus equivalent to:
|
jpayne@69
|
1141
|
jpayne@69
|
1142 >>> from Bio.SeqFeature import CompoundLocation
|
jpayne@69
|
1143 >>> join = CompoundLocation([f1, f2])
|
jpayne@69
|
1144 >>> print(join)
|
jpayne@69
|
1145 join{[5:10], [20:30]}
|
jpayne@69
|
1146
|
jpayne@69
|
1147 You can also use sum(...) in this way:
|
jpayne@69
|
1148
|
jpayne@69
|
1149 >>> join = sum([f1, f2])
|
jpayne@69
|
1150 >>> print(join)
|
jpayne@69
|
1151 join{[5:10], [20:30]}
|
jpayne@69
|
1152
|
jpayne@69
|
1153 Furthermore, you can combine a SimpleLocation with a CompoundLocation
|
jpayne@69
|
1154 in this way.
|
jpayne@69
|
1155
|
jpayne@69
|
1156 Separately, adding an integer will give a new SimpleLocation with
|
jpayne@69
|
1157 its start and end offset by that amount. For example:
|
jpayne@69
|
1158
|
jpayne@69
|
1159 >>> print(f1)
|
jpayne@69
|
1160 [5:10]
|
jpayne@69
|
1161 >>> print(f1 + 100)
|
jpayne@69
|
1162 [105:110]
|
jpayne@69
|
1163 >>> print(200 + f1)
|
jpayne@69
|
1164 [205:210]
|
jpayne@69
|
1165
|
jpayne@69
|
1166 This can be useful when editing annotation.
|
jpayne@69
|
1167 """
|
jpayne@69
|
1168 if isinstance(other, SimpleLocation):
|
jpayne@69
|
1169 return CompoundLocation([self, other])
|
jpayne@69
|
1170 elif isinstance(other, int):
|
jpayne@69
|
1171 return self._shift(other)
|
jpayne@69
|
1172 else:
|
jpayne@69
|
1173 # This will allow CompoundLocation's __radd__ to be called:
|
jpayne@69
|
1174 return NotImplemented
|
jpayne@69
|
1175
|
jpayne@69
|
1176 def __radd__(self, other):
|
jpayne@69
|
1177 """Return a SimpleLocation object by shifting the location by an integer amount."""
|
jpayne@69
|
1178 if isinstance(other, int):
|
jpayne@69
|
1179 return self._shift(other)
|
jpayne@69
|
1180 else:
|
jpayne@69
|
1181 return NotImplemented
|
jpayne@69
|
1182
|
jpayne@69
|
1183 def __sub__(self, other):
|
jpayne@69
|
1184 """Subtracting an integer will shift the start and end by that amount.
|
jpayne@69
|
1185
|
jpayne@69
|
1186 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
1187 >>> f1 = SimpleLocation(105, 150)
|
jpayne@69
|
1188 >>> print(f1)
|
jpayne@69
|
1189 [105:150]
|
jpayne@69
|
1190 >>> print(f1 - 100)
|
jpayne@69
|
1191 [5:50]
|
jpayne@69
|
1192
|
jpayne@69
|
1193 This can be useful when editing annotation. You can also add an integer
|
jpayne@69
|
1194 to a feature location (which shifts in the opposite direction).
|
jpayne@69
|
1195 """
|
jpayne@69
|
1196 if isinstance(other, int):
|
jpayne@69
|
1197 return self._shift(-other)
|
jpayne@69
|
1198 else:
|
jpayne@69
|
1199 return NotImplemented
|
jpayne@69
|
1200
|
jpayne@69
|
1201 def __nonzero__(self):
|
jpayne@69
|
1202 """Return True regardless of the length of the feature.
|
jpayne@69
|
1203
|
jpayne@69
|
1204 This behavior is for backwards compatibility, since until the
|
jpayne@69
|
1205 __len__ method was added, a SimpleLocation always evaluated as True.
|
jpayne@69
|
1206
|
jpayne@69
|
1207 Note that in comparison, Seq objects, strings, lists, etc, will all
|
jpayne@69
|
1208 evaluate to False if they have length zero.
|
jpayne@69
|
1209
|
jpayne@69
|
1210 WARNING: The SimpleLocation may in future evaluate to False when its
|
jpayne@69
|
1211 length is zero (in order to better match normal python behavior)!
|
jpayne@69
|
1212 """
|
jpayne@69
|
1213 return True
|
jpayne@69
|
1214
|
jpayne@69
|
1215 def __len__(self):
|
jpayne@69
|
1216 """Return the length of the region described by the SimpleLocation object.
|
jpayne@69
|
1217
|
jpayne@69
|
1218 Note that extra care may be needed for fuzzy locations, e.g.
|
jpayne@69
|
1219
|
jpayne@69
|
1220 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
1221 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
|
jpayne@69
|
1222 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
|
jpayne@69
|
1223 >>> len(loc)
|
jpayne@69
|
1224 5
|
jpayne@69
|
1225 """
|
jpayne@69
|
1226 return int(self._end) - int(self._start)
|
jpayne@69
|
1227
|
jpayne@69
|
1228 def __contains__(self, value):
|
jpayne@69
|
1229 """Check if an integer position is within the SimpleLocation object.
|
jpayne@69
|
1230
|
jpayne@69
|
1231 Note that extra care may be needed for fuzzy locations, e.g.
|
jpayne@69
|
1232
|
jpayne@69
|
1233 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
1234 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
|
jpayne@69
|
1235 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
|
jpayne@69
|
1236 >>> len(loc)
|
jpayne@69
|
1237 5
|
jpayne@69
|
1238 >>> [i for i in range(15) if i in loc]
|
jpayne@69
|
1239 [5, 6, 7, 8, 9]
|
jpayne@69
|
1240 """
|
jpayne@69
|
1241 if not isinstance(value, int):
|
jpayne@69
|
1242 raise ValueError(
|
jpayne@69
|
1243 "Currently we only support checking for integer "
|
jpayne@69
|
1244 "positions being within a SimpleLocation."
|
jpayne@69
|
1245 )
|
jpayne@69
|
1246 if value < self._start or value >= self._end:
|
jpayne@69
|
1247 return False
|
jpayne@69
|
1248 else:
|
jpayne@69
|
1249 return True
|
jpayne@69
|
1250
|
jpayne@69
|
1251 def __iter__(self):
|
jpayne@69
|
1252 """Iterate over the parent positions within the SimpleLocation object.
|
jpayne@69
|
1253
|
jpayne@69
|
1254 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
1255 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
|
jpayne@69
|
1256 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
|
jpayne@69
|
1257 >>> len(loc)
|
jpayne@69
|
1258 5
|
jpayne@69
|
1259 >>> for i in loc: print(i)
|
jpayne@69
|
1260 5
|
jpayne@69
|
1261 6
|
jpayne@69
|
1262 7
|
jpayne@69
|
1263 8
|
jpayne@69
|
1264 9
|
jpayne@69
|
1265 >>> list(loc)
|
jpayne@69
|
1266 [5, 6, 7, 8, 9]
|
jpayne@69
|
1267 >>> [i for i in range(15) if i in loc]
|
jpayne@69
|
1268 [5, 6, 7, 8, 9]
|
jpayne@69
|
1269
|
jpayne@69
|
1270 Note this is strand aware:
|
jpayne@69
|
1271
|
jpayne@69
|
1272 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10), strand = -1)
|
jpayne@69
|
1273 >>> list(loc)
|
jpayne@69
|
1274 [9, 8, 7, 6, 5]
|
jpayne@69
|
1275 """
|
jpayne@69
|
1276 if self.strand == -1:
|
jpayne@69
|
1277 yield from range(self._end - 1, self._start - 1, -1)
|
jpayne@69
|
1278 else:
|
jpayne@69
|
1279 yield from range(self._start, self._end)
|
jpayne@69
|
1280
|
jpayne@69
|
1281 def __eq__(self, other):
|
jpayne@69
|
1282 """Implement equality by comparing all the location attributes."""
|
jpayne@69
|
1283 if not isinstance(other, SimpleLocation):
|
jpayne@69
|
1284 return False
|
jpayne@69
|
1285 return (
|
jpayne@69
|
1286 self._start == other.start
|
jpayne@69
|
1287 and self._end == other.end
|
jpayne@69
|
1288 and self._strand == other.strand
|
jpayne@69
|
1289 and self.ref == other.ref
|
jpayne@69
|
1290 and self.ref_db == other.ref_db
|
jpayne@69
|
1291 )
|
jpayne@69
|
1292
|
jpayne@69
|
1293 def _shift(self, offset):
|
jpayne@69
|
1294 """Return a copy of the SimpleLocation shifted by an offset (PRIVATE).
|
jpayne@69
|
1295
|
jpayne@69
|
1296 Returns self when location is relative to an external reference.
|
jpayne@69
|
1297 """
|
jpayne@69
|
1298 # TODO - What if offset is a fuzzy position?
|
jpayne@69
|
1299 if self.ref or self.ref_db:
|
jpayne@69
|
1300 return self
|
jpayne@69
|
1301 return SimpleLocation(
|
jpayne@69
|
1302 start=self._start + offset,
|
jpayne@69
|
1303 end=self._end + offset,
|
jpayne@69
|
1304 strand=self.strand,
|
jpayne@69
|
1305 )
|
jpayne@69
|
1306
|
jpayne@69
|
1307 def _flip(self, length):
|
jpayne@69
|
1308 """Return a copy of the location after the parent is reversed (PRIVATE).
|
jpayne@69
|
1309
|
jpayne@69
|
1310 Returns self when location is relative to an external reference.
|
jpayne@69
|
1311 """
|
jpayne@69
|
1312 if self.ref or self.ref_db:
|
jpayne@69
|
1313 return self
|
jpayne@69
|
1314 # Note this will flip the start and end too!
|
jpayne@69
|
1315 if self.strand == +1:
|
jpayne@69
|
1316 flip_strand = -1
|
jpayne@69
|
1317 elif self.strand == -1:
|
jpayne@69
|
1318 flip_strand = +1
|
jpayne@69
|
1319 else:
|
jpayne@69
|
1320 # 0 or None
|
jpayne@69
|
1321 flip_strand = self.strand
|
jpayne@69
|
1322 return SimpleLocation(
|
jpayne@69
|
1323 start=self._end._flip(length),
|
jpayne@69
|
1324 end=self._start._flip(length),
|
jpayne@69
|
1325 strand=flip_strand,
|
jpayne@69
|
1326 )
|
jpayne@69
|
1327
|
jpayne@69
|
1328 @property
|
jpayne@69
|
1329 def parts(self):
|
jpayne@69
|
1330 """Read only list of sections (always one, the SimpleLocation object).
|
jpayne@69
|
1331
|
jpayne@69
|
1332 This is a convenience property allowing you to write code handling
|
jpayne@69
|
1333 both SimpleLocation objects (with one part) and more complex
|
jpayne@69
|
1334 CompoundLocation objects (with multiple parts) interchangeably.
|
jpayne@69
|
1335 """
|
jpayne@69
|
1336 return [self]
|
jpayne@69
|
1337
|
jpayne@69
|
1338 @property
|
jpayne@69
|
1339 def start(self):
|
jpayne@69
|
1340 """Start location - left most (minimum) value, regardless of strand.
|
jpayne@69
|
1341
|
jpayne@69
|
1342 Read only, returns an integer like position object, possibly a fuzzy
|
jpayne@69
|
1343 position.
|
jpayne@69
|
1344 """
|
jpayne@69
|
1345 return self._start
|
jpayne@69
|
1346
|
jpayne@69
|
1347 @property
|
jpayne@69
|
1348 def end(self):
|
jpayne@69
|
1349 """End location - right most (maximum) value, regardless of strand.
|
jpayne@69
|
1350
|
jpayne@69
|
1351 Read only, returns an integer like position object, possibly a fuzzy
|
jpayne@69
|
1352 position.
|
jpayne@69
|
1353 """
|
jpayne@69
|
1354 return self._end
|
jpayne@69
|
1355
|
jpayne@69
|
1356 def extract(self, parent_sequence, references=None):
|
jpayne@69
|
1357 """Extract the sequence from supplied parent sequence using the SimpleLocation object.
|
jpayne@69
|
1358
|
jpayne@69
|
1359 The parent_sequence can be a Seq like object or a string, and will
|
jpayne@69
|
1360 generally return an object of the same type. The exception to this is
|
jpayne@69
|
1361 a MutableSeq as the parent sequence will return a Seq object.
|
jpayne@69
|
1362 If the location refers to other records, they must be supplied
|
jpayne@69
|
1363 in the optional dictionary references.
|
jpayne@69
|
1364
|
jpayne@69
|
1365 >>> from Bio.Seq import Seq
|
jpayne@69
|
1366 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
1367 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
|
jpayne@69
|
1368 >>> feature_loc = SimpleLocation(8, 15)
|
jpayne@69
|
1369 >>> feature_loc.extract(seq)
|
jpayne@69
|
1370 Seq('VALIVIC')
|
jpayne@69
|
1371
|
jpayne@69
|
1372 """
|
jpayne@69
|
1373 if self.ref or self.ref_db:
|
jpayne@69
|
1374 if not references:
|
jpayne@69
|
1375 raise ValueError(
|
jpayne@69
|
1376 f"Feature references another sequence ({self.ref}),"
|
jpayne@69
|
1377 " references mandatory"
|
jpayne@69
|
1378 )
|
jpayne@69
|
1379 elif self.ref not in references:
|
jpayne@69
|
1380 # KeyError?
|
jpayne@69
|
1381 raise ValueError(
|
jpayne@69
|
1382 f"Feature references another sequence ({self.ref}),"
|
jpayne@69
|
1383 " not found in references"
|
jpayne@69
|
1384 )
|
jpayne@69
|
1385 parent_sequence = references[self.ref]
|
jpayne@69
|
1386 f_seq = parent_sequence[int(self.start) : int(self.end)]
|
jpayne@69
|
1387 if isinstance(f_seq, MutableSeq):
|
jpayne@69
|
1388 f_seq = Seq(f_seq)
|
jpayne@69
|
1389 if self.strand == -1:
|
jpayne@69
|
1390 f_seq = reverse_complement(f_seq)
|
jpayne@69
|
1391 return f_seq
|
jpayne@69
|
1392
|
jpayne@69
|
1393
|
jpayne@69
|
1394 FeatureLocation = SimpleLocation # OBSOLETE; for backward compatability only.
|
jpayne@69
|
1395
|
jpayne@69
|
1396
|
jpayne@69
|
1397 class CompoundLocation(Location):
|
jpayne@69
|
1398 """For handling joins etc where a feature location has several parts."""
|
jpayne@69
|
1399
|
jpayne@69
|
1400 def __init__(self, parts, operator="join"):
|
jpayne@69
|
1401 """Initialize the class.
|
jpayne@69
|
1402
|
jpayne@69
|
1403 >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
|
jpayne@69
|
1404 >>> f1 = SimpleLocation(10, 40, strand=+1)
|
jpayne@69
|
1405 >>> f2 = SimpleLocation(50, 59, strand=+1)
|
jpayne@69
|
1406 >>> f = CompoundLocation([f1, f2])
|
jpayne@69
|
1407 >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
|
jpayne@69
|
1408 True
|
jpayne@69
|
1409 >>> print(f.operator)
|
jpayne@69
|
1410 join
|
jpayne@69
|
1411 >>> 5 in f
|
jpayne@69
|
1412 False
|
jpayne@69
|
1413 >>> 15 in f
|
jpayne@69
|
1414 True
|
jpayne@69
|
1415 >>> f.strand
|
jpayne@69
|
1416 1
|
jpayne@69
|
1417
|
jpayne@69
|
1418 Notice that the strand of the compound location is computed
|
jpayne@69
|
1419 automatically - in the case of mixed strands on the sub-locations
|
jpayne@69
|
1420 the overall strand is set to None.
|
jpayne@69
|
1421
|
jpayne@69
|
1422 >>> f = CompoundLocation([SimpleLocation(3, 6, strand=+1),
|
jpayne@69
|
1423 ... SimpleLocation(10, 13, strand=-1)])
|
jpayne@69
|
1424 >>> print(f.strand)
|
jpayne@69
|
1425 None
|
jpayne@69
|
1426 >>> len(f)
|
jpayne@69
|
1427 6
|
jpayne@69
|
1428 >>> list(f)
|
jpayne@69
|
1429 [3, 4, 5, 12, 11, 10]
|
jpayne@69
|
1430
|
jpayne@69
|
1431 The example above doing list(f) iterates over the coordinates within the
|
jpayne@69
|
1432 feature. This allows you to use max and min on the location, to find the
|
jpayne@69
|
1433 range covered:
|
jpayne@69
|
1434
|
jpayne@69
|
1435 >>> min(f)
|
jpayne@69
|
1436 3
|
jpayne@69
|
1437 >>> max(f)
|
jpayne@69
|
1438 12
|
jpayne@69
|
1439
|
jpayne@69
|
1440 More generally, you can use the compound location's start and end which
|
jpayne@69
|
1441 give the full span covered, 0 <= start <= end <= full sequence length.
|
jpayne@69
|
1442
|
jpayne@69
|
1443 >>> f.start == min(f)
|
jpayne@69
|
1444 True
|
jpayne@69
|
1445 >>> f.end == max(f) + 1
|
jpayne@69
|
1446 True
|
jpayne@69
|
1447
|
jpayne@69
|
1448 This is consistent with the behavior of the SimpleLocation for a single
|
jpayne@69
|
1449 region, where again the 'start' and 'end' do not necessarily give the
|
jpayne@69
|
1450 biological start and end, but rather the 'minimal' and 'maximal'
|
jpayne@69
|
1451 coordinate boundaries.
|
jpayne@69
|
1452
|
jpayne@69
|
1453 Note that adding locations provides a more intuitive method of
|
jpayne@69
|
1454 construction:
|
jpayne@69
|
1455
|
jpayne@69
|
1456 >>> f = SimpleLocation(3, 6, strand=+1) + SimpleLocation(10, 13, strand=-1)
|
jpayne@69
|
1457 >>> len(f)
|
jpayne@69
|
1458 6
|
jpayne@69
|
1459 >>> list(f)
|
jpayne@69
|
1460 [3, 4, 5, 12, 11, 10]
|
jpayne@69
|
1461 """
|
jpayne@69
|
1462 self.operator = operator
|
jpayne@69
|
1463 self.parts = list(parts)
|
jpayne@69
|
1464 for loc in self.parts:
|
jpayne@69
|
1465 if not isinstance(loc, SimpleLocation):
|
jpayne@69
|
1466 raise ValueError(
|
jpayne@69
|
1467 "CompoundLocation should be given a list of "
|
jpayne@69
|
1468 "SimpleLocation objects, not %s" % loc.__class__
|
jpayne@69
|
1469 )
|
jpayne@69
|
1470 if len(parts) < 2:
|
jpayne@69
|
1471 raise ValueError(
|
jpayne@69
|
1472 f"CompoundLocation should have at least 2 parts, not {parts!r}"
|
jpayne@69
|
1473 )
|
jpayne@69
|
1474
|
jpayne@69
|
1475 def __str__(self):
|
jpayne@69
|
1476 """Return a representation of the CompoundLocation object (with python counting)."""
|
jpayne@69
|
1477 return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts))
|
jpayne@69
|
1478
|
jpayne@69
|
1479 def __repr__(self):
|
jpayne@69
|
1480 """Represent the CompoundLocation object as string for debugging."""
|
jpayne@69
|
1481 return f"{self.__class__.__name__}({self.parts!r}, {self.operator!r})"
|
jpayne@69
|
1482
|
jpayne@69
|
1483 def _get_strand(self):
|
jpayne@69
|
1484 """Get function for the strand property (PRIVATE)."""
|
jpayne@69
|
1485 # Historically a join on the reverse strand has been represented
|
jpayne@69
|
1486 # in Biopython with both the parent SeqFeature and its children
|
jpayne@69
|
1487 # (the exons for a CDS) all given a strand of -1. Likewise, for
|
jpayne@69
|
1488 # a join feature on the forward strand they all have strand +1.
|
jpayne@69
|
1489 # However, we must also consider evil mixed strand examples like
|
jpayne@69
|
1490 # this, join(complement(69611..69724),139856..140087,140625..140650)
|
jpayne@69
|
1491 if len({loc.strand for loc in self.parts}) == 1:
|
jpayne@69
|
1492 return self.parts[0].strand
|
jpayne@69
|
1493 else:
|
jpayne@69
|
1494 return None # i.e. mixed strands
|
jpayne@69
|
1495
|
jpayne@69
|
1496 def _set_strand(self, value):
|
jpayne@69
|
1497 """Set function for the strand property (PRIVATE)."""
|
jpayne@69
|
1498 # Should this be allowed/encouraged?
|
jpayne@69
|
1499 for loc in self.parts:
|
jpayne@69
|
1500 loc.strand = value
|
jpayne@69
|
1501
|
jpayne@69
|
1502 strand = property(
|
jpayne@69
|
1503 fget=_get_strand,
|
jpayne@69
|
1504 fset=_set_strand,
|
jpayne@69
|
1505 doc="""Overall strand of the compound location.
|
jpayne@69
|
1506
|
jpayne@69
|
1507 If all the parts have the same strand, that is returned. Otherwise
|
jpayne@69
|
1508 for mixed strands, this returns None.
|
jpayne@69
|
1509
|
jpayne@69
|
1510 >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
|
jpayne@69
|
1511 >>> f1 = SimpleLocation(15, 17, strand=1)
|
jpayne@69
|
1512 >>> f2 = SimpleLocation(20, 30, strand=-1)
|
jpayne@69
|
1513 >>> f = f1 + f2
|
jpayne@69
|
1514 >>> f1.strand
|
jpayne@69
|
1515 1
|
jpayne@69
|
1516 >>> f2.strand
|
jpayne@69
|
1517 -1
|
jpayne@69
|
1518 >>> f.strand
|
jpayne@69
|
1519 >>> f.strand is None
|
jpayne@69
|
1520 True
|
jpayne@69
|
1521
|
jpayne@69
|
1522 If you set the strand of a CompoundLocation, this is applied to
|
jpayne@69
|
1523 all the parts - use with caution:
|
jpayne@69
|
1524
|
jpayne@69
|
1525 >>> f.strand = 1
|
jpayne@69
|
1526 >>> f1.strand
|
jpayne@69
|
1527 1
|
jpayne@69
|
1528 >>> f2.strand
|
jpayne@69
|
1529 1
|
jpayne@69
|
1530 >>> f.strand
|
jpayne@69
|
1531 1
|
jpayne@69
|
1532
|
jpayne@69
|
1533 """,
|
jpayne@69
|
1534 )
|
jpayne@69
|
1535
|
jpayne@69
|
1536 def __add__(self, other):
|
jpayne@69
|
1537 """Combine locations, or shift the location by an integer offset.
|
jpayne@69
|
1538
|
jpayne@69
|
1539 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
1540 >>> f1 = SimpleLocation(15, 17) + SimpleLocation(20, 30)
|
jpayne@69
|
1541 >>> print(f1)
|
jpayne@69
|
1542 join{[15:17], [20:30]}
|
jpayne@69
|
1543
|
jpayne@69
|
1544 You can add another SimpleLocation:
|
jpayne@69
|
1545
|
jpayne@69
|
1546 >>> print(f1 + SimpleLocation(40, 50))
|
jpayne@69
|
1547 join{[15:17], [20:30], [40:50]}
|
jpayne@69
|
1548 >>> print(SimpleLocation(5, 10) + f1)
|
jpayne@69
|
1549 join{[5:10], [15:17], [20:30]}
|
jpayne@69
|
1550
|
jpayne@69
|
1551 You can also add another CompoundLocation:
|
jpayne@69
|
1552
|
jpayne@69
|
1553 >>> f2 = SimpleLocation(40, 50) + SimpleLocation(60, 70)
|
jpayne@69
|
1554 >>> print(f2)
|
jpayne@69
|
1555 join{[40:50], [60:70]}
|
jpayne@69
|
1556 >>> print(f1 + f2)
|
jpayne@69
|
1557 join{[15:17], [20:30], [40:50], [60:70]}
|
jpayne@69
|
1558
|
jpayne@69
|
1559 Also, as with the SimpleLocation, adding an integer shifts the
|
jpayne@69
|
1560 location's coordinates by that offset:
|
jpayne@69
|
1561
|
jpayne@69
|
1562 >>> print(f1 + 100)
|
jpayne@69
|
1563 join{[115:117], [120:130]}
|
jpayne@69
|
1564 >>> print(200 + f1)
|
jpayne@69
|
1565 join{[215:217], [220:230]}
|
jpayne@69
|
1566 >>> print(f1 + (-5))
|
jpayne@69
|
1567 join{[10:12], [15:25]}
|
jpayne@69
|
1568 """
|
jpayne@69
|
1569 if isinstance(other, SimpleLocation):
|
jpayne@69
|
1570 return CompoundLocation(self.parts + [other], self.operator)
|
jpayne@69
|
1571 elif isinstance(other, CompoundLocation):
|
jpayne@69
|
1572 if self.operator != other.operator:
|
jpayne@69
|
1573 # Handle join+order -> order as a special case?
|
jpayne@69
|
1574 raise ValueError(
|
jpayne@69
|
1575 f"Mixed operators {self.operator} and {other.operator}"
|
jpayne@69
|
1576 )
|
jpayne@69
|
1577 return CompoundLocation(self.parts + other.parts, self.operator)
|
jpayne@69
|
1578 elif isinstance(other, int):
|
jpayne@69
|
1579 return self._shift(other)
|
jpayne@69
|
1580 else:
|
jpayne@69
|
1581 raise NotImplementedError
|
jpayne@69
|
1582
|
jpayne@69
|
1583 def __radd__(self, other):
|
jpayne@69
|
1584 """Add a feature to the left."""
|
jpayne@69
|
1585 if isinstance(other, SimpleLocation):
|
jpayne@69
|
1586 return CompoundLocation([other] + self.parts, self.operator)
|
jpayne@69
|
1587 elif isinstance(other, int):
|
jpayne@69
|
1588 return self._shift(other)
|
jpayne@69
|
1589 else:
|
jpayne@69
|
1590 raise NotImplementedError
|
jpayne@69
|
1591
|
jpayne@69
|
1592 def __contains__(self, value):
|
jpayne@69
|
1593 """Check if an integer position is within the CompoundLocation object."""
|
jpayne@69
|
1594 for loc in self.parts:
|
jpayne@69
|
1595 if value in loc:
|
jpayne@69
|
1596 return True
|
jpayne@69
|
1597 return False
|
jpayne@69
|
1598
|
jpayne@69
|
1599 def __nonzero__(self):
|
jpayne@69
|
1600 """Return True regardless of the length of the feature.
|
jpayne@69
|
1601
|
jpayne@69
|
1602 This behavior is for backwards compatibility, since until the
|
jpayne@69
|
1603 __len__ method was added, a SimpleLocation always evaluated as True.
|
jpayne@69
|
1604
|
jpayne@69
|
1605 Note that in comparison, Seq objects, strings, lists, etc, will all
|
jpayne@69
|
1606 evaluate to False if they have length zero.
|
jpayne@69
|
1607
|
jpayne@69
|
1608 WARNING: The SimpleLocation may in future evaluate to False when its
|
jpayne@69
|
1609 length is zero (in order to better match normal python behavior)!
|
jpayne@69
|
1610 """
|
jpayne@69
|
1611 return True
|
jpayne@69
|
1612
|
jpayne@69
|
1613 def __len__(self):
|
jpayne@69
|
1614 """Return the length of the CompoundLocation object."""
|
jpayne@69
|
1615 return sum(len(loc) for loc in self.parts)
|
jpayne@69
|
1616
|
jpayne@69
|
1617 def __iter__(self):
|
jpayne@69
|
1618 """Iterate over the parent positions within the CompoundLocation object."""
|
jpayne@69
|
1619 for loc in self.parts:
|
jpayne@69
|
1620 yield from loc
|
jpayne@69
|
1621
|
jpayne@69
|
1622 def __eq__(self, other):
|
jpayne@69
|
1623 """Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation."""
|
jpayne@69
|
1624 if not isinstance(other, CompoundLocation):
|
jpayne@69
|
1625 return False
|
jpayne@69
|
1626 if len(self.parts) != len(other.parts):
|
jpayne@69
|
1627 return False
|
jpayne@69
|
1628 if self.operator != other.operator:
|
jpayne@69
|
1629 return False
|
jpayne@69
|
1630 for self_part, other_part in zip(self.parts, other.parts):
|
jpayne@69
|
1631 if self_part != other_part:
|
jpayne@69
|
1632 return False
|
jpayne@69
|
1633 return True
|
jpayne@69
|
1634
|
jpayne@69
|
1635 def _shift(self, offset):
|
jpayne@69
|
1636 """Return a copy of the CompoundLocation shifted by an offset (PRIVATE)."""
|
jpayne@69
|
1637 return CompoundLocation(
|
jpayne@69
|
1638 [loc._shift(offset) for loc in self.parts], self.operator
|
jpayne@69
|
1639 )
|
jpayne@69
|
1640
|
jpayne@69
|
1641 def _flip(self, length):
|
jpayne@69
|
1642 """Return a copy of the locations after the parent is reversed (PRIVATE).
|
jpayne@69
|
1643
|
jpayne@69
|
1644 Note that the order of the parts is NOT reversed too. Consider a CDS
|
jpayne@69
|
1645 on the forward strand with exons small, medium and large (in length).
|
jpayne@69
|
1646 Once we change the frame of reference to the reverse complement strand,
|
jpayne@69
|
1647 the start codon is still part of the small exon, and the stop codon
|
jpayne@69
|
1648 still part of the large exon - so the part order remains the same!
|
jpayne@69
|
1649
|
jpayne@69
|
1650 Here is an artificial example, were the features map to the two upper
|
jpayne@69
|
1651 case regions and the lower case runs of n are not used:
|
jpayne@69
|
1652
|
jpayne@69
|
1653 >>> from Bio.Seq import Seq
|
jpayne@69
|
1654 >>> from Bio.SeqFeature import SimpleLocation
|
jpayne@69
|
1655 >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
|
jpayne@69
|
1656 >>> small = SimpleLocation(5, 20, strand=1)
|
jpayne@69
|
1657 >>> large = SimpleLocation(28, 52, strand=1)
|
jpayne@69
|
1658 >>> location = small + large
|
jpayne@69
|
1659 >>> print(small)
|
jpayne@69
|
1660 [5:20](+)
|
jpayne@69
|
1661 >>> print(large)
|
jpayne@69
|
1662 [28:52](+)
|
jpayne@69
|
1663 >>> print(location)
|
jpayne@69
|
1664 join{[5:20](+), [28:52](+)}
|
jpayne@69
|
1665 >>> for part in location.parts:
|
jpayne@69
|
1666 ... print(len(part))
|
jpayne@69
|
1667 ...
|
jpayne@69
|
1668 15
|
jpayne@69
|
1669 24
|
jpayne@69
|
1670
|
jpayne@69
|
1671 As you can see, this is a silly example where each "exon" is a word:
|
jpayne@69
|
1672
|
jpayne@69
|
1673 >>> print(small.extract(dna).translate())
|
jpayne@69
|
1674 SILLY
|
jpayne@69
|
1675 >>> print(large.extract(dna).translate())
|
jpayne@69
|
1676 EXAMPLE*
|
jpayne@69
|
1677 >>> print(location.extract(dna).translate())
|
jpayne@69
|
1678 SILLYEXAMPLE*
|
jpayne@69
|
1679 >>> for part in location.parts:
|
jpayne@69
|
1680 ... print(part.extract(dna).translate())
|
jpayne@69
|
1681 ...
|
jpayne@69
|
1682 SILLY
|
jpayne@69
|
1683 EXAMPLE*
|
jpayne@69
|
1684
|
jpayne@69
|
1685 Now, let's look at this from the reverse strand frame of reference:
|
jpayne@69
|
1686
|
jpayne@69
|
1687 >>> flipped_dna = dna.reverse_complement()
|
jpayne@69
|
1688 >>> flipped_location = location._flip(len(dna))
|
jpayne@69
|
1689 >>> print(flipped_location.extract(flipped_dna).translate())
|
jpayne@69
|
1690 SILLYEXAMPLE*
|
jpayne@69
|
1691 >>> for part in flipped_location.parts:
|
jpayne@69
|
1692 ... print(part.extract(flipped_dna).translate())
|
jpayne@69
|
1693 ...
|
jpayne@69
|
1694 SILLY
|
jpayne@69
|
1695 EXAMPLE*
|
jpayne@69
|
1696
|
jpayne@69
|
1697 The key point here is the first part of the CompoundFeature is still the
|
jpayne@69
|
1698 small exon, while the second part is still the large exon:
|
jpayne@69
|
1699
|
jpayne@69
|
1700 >>> for part in flipped_location.parts:
|
jpayne@69
|
1701 ... print(len(part))
|
jpayne@69
|
1702 ...
|
jpayne@69
|
1703 15
|
jpayne@69
|
1704 24
|
jpayne@69
|
1705 >>> print(flipped_location)
|
jpayne@69
|
1706 join{[37:52](-), [5:29](-)}
|
jpayne@69
|
1707
|
jpayne@69
|
1708 Notice the parts are not reversed. However, there was a bug here in older
|
jpayne@69
|
1709 versions of Biopython which would have given join{[5:29](-), [37:52](-)}
|
jpayne@69
|
1710 and the translation would have wrongly been "EXAMPLE*SILLY" instead.
|
jpayne@69
|
1711
|
jpayne@69
|
1712 """
|
jpayne@69
|
1713 return CompoundLocation(
|
jpayne@69
|
1714 [loc._flip(length) for loc in self.parts], self.operator
|
jpayne@69
|
1715 )
|
jpayne@69
|
1716
|
jpayne@69
|
1717 @property
|
jpayne@69
|
1718 def start(self):
|
jpayne@69
|
1719 """Start location - left most (minimum) value, regardless of strand.
|
jpayne@69
|
1720
|
jpayne@69
|
1721 Read only, returns an integer like position object, possibly a fuzzy
|
jpayne@69
|
1722 position.
|
jpayne@69
|
1723
|
jpayne@69
|
1724 For the special case of a CompoundLocation wrapping the origin of a
|
jpayne@69
|
1725 circular genome, this will return zero.
|
jpayne@69
|
1726 """
|
jpayne@69
|
1727 return min(loc.start for loc in self.parts)
|
jpayne@69
|
1728
|
jpayne@69
|
1729 @property
|
jpayne@69
|
1730 def end(self):
|
jpayne@69
|
1731 """End location - right most (maximum) value, regardless of strand.
|
jpayne@69
|
1732
|
jpayne@69
|
1733 Read only, returns an integer like position object, possibly a fuzzy
|
jpayne@69
|
1734 position.
|
jpayne@69
|
1735
|
jpayne@69
|
1736 For the special case of a CompoundLocation wrapping the origin of
|
jpayne@69
|
1737 a circular genome this will match the genome length.
|
jpayne@69
|
1738 """
|
jpayne@69
|
1739 return max(loc.end for loc in self.parts)
|
jpayne@69
|
1740
|
jpayne@69
|
1741 @property
|
jpayne@69
|
1742 def ref(self):
|
jpayne@69
|
1743 """Not present in CompoundLocation, dummy method for API compatibility."""
|
jpayne@69
|
1744 return None
|
jpayne@69
|
1745
|
jpayne@69
|
1746 @property
|
jpayne@69
|
1747 def ref_db(self):
|
jpayne@69
|
1748 """Not present in CompoundLocation, dummy method for API compatibility."""
|
jpayne@69
|
1749 return None
|
jpayne@69
|
1750
|
jpayne@69
|
1751 def extract(self, parent_sequence, references=None):
|
jpayne@69
|
1752 """Extract the sequence from supplied parent sequence using the CompoundLocation object.
|
jpayne@69
|
1753
|
jpayne@69
|
1754 The parent_sequence can be a Seq like object or a string, and will
|
jpayne@69
|
1755 generally return an object of the same type. The exception to this is
|
jpayne@69
|
1756 a MutableSeq as the parent sequence will return a Seq object.
|
jpayne@69
|
1757 If the location refers to other records, they must be supplied
|
jpayne@69
|
1758 in the optional dictionary references.
|
jpayne@69
|
1759
|
jpayne@69
|
1760 >>> from Bio.Seq import Seq
|
jpayne@69
|
1761 >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
|
jpayne@69
|
1762 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
|
jpayne@69
|
1763 >>> fl1 = SimpleLocation(2, 8)
|
jpayne@69
|
1764 >>> fl2 = SimpleLocation(10, 15)
|
jpayne@69
|
1765 >>> fl3 = CompoundLocation([fl1,fl2])
|
jpayne@69
|
1766 >>> fl3.extract(seq)
|
jpayne@69
|
1767 Seq('QHKAMILIVIC')
|
jpayne@69
|
1768
|
jpayne@69
|
1769 """
|
jpayne@69
|
1770 # This copes with mixed strand features & all on reverse:
|
jpayne@69
|
1771 parts = [
|
jpayne@69
|
1772 loc.extract(parent_sequence, references=references) for loc in self.parts
|
jpayne@69
|
1773 ]
|
jpayne@69
|
1774 f_seq = functools.reduce(lambda x, y: x + y, parts)
|
jpayne@69
|
1775 return f_seq
|
jpayne@69
|
1776
|
jpayne@69
|
1777
|
jpayne@69
|
1778 class Position(ABC):
|
jpayne@69
|
1779 """Abstract base class representing a position."""
|
jpayne@69
|
1780
|
jpayne@69
|
1781 @abstractmethod
|
jpayne@69
|
1782 def __repr__(self):
|
jpayne@69
|
1783 """Represent the Position object as a string for debugging."""
|
jpayne@69
|
1784 return f"{self.__class__.__name__}(...)"
|
jpayne@69
|
1785
|
jpayne@69
|
1786 @staticmethod
|
jpayne@69
|
1787 def fromstring(text, offset=0):
|
jpayne@69
|
1788 """Build a Position object from the text string.
|
jpayne@69
|
1789
|
jpayne@69
|
1790 For an end position, leave offset as zero (default):
|
jpayne@69
|
1791
|
jpayne@69
|
1792 >>> Position.fromstring("5")
|
jpayne@69
|
1793 ExactPosition(5)
|
jpayne@69
|
1794
|
jpayne@69
|
1795 For a start position, set offset to minus one (for Python counting):
|
jpayne@69
|
1796
|
jpayne@69
|
1797 >>> Position.fromstring("5", -1)
|
jpayne@69
|
1798 ExactPosition(4)
|
jpayne@69
|
1799
|
jpayne@69
|
1800 This also covers fuzzy positions:
|
jpayne@69
|
1801
|
jpayne@69
|
1802 >>> p = Position.fromstring("<5")
|
jpayne@69
|
1803 >>> p
|
jpayne@69
|
1804 BeforePosition(5)
|
jpayne@69
|
1805 >>> print(p)
|
jpayne@69
|
1806 <5
|
jpayne@69
|
1807 >>> int(p)
|
jpayne@69
|
1808 5
|
jpayne@69
|
1809
|
jpayne@69
|
1810 >>> Position.fromstring(">5")
|
jpayne@69
|
1811 AfterPosition(5)
|
jpayne@69
|
1812
|
jpayne@69
|
1813 By default assumes an end position, so note the integer behavior:
|
jpayne@69
|
1814
|
jpayne@69
|
1815 >>> p = Position.fromstring("one-of(5,8,11)")
|
jpayne@69
|
1816 >>> p
|
jpayne@69
|
1817 OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
|
jpayne@69
|
1818 >>> print(p)
|
jpayne@69
|
1819 one-of(5,8,11)
|
jpayne@69
|
1820 >>> int(p)
|
jpayne@69
|
1821 11
|
jpayne@69
|
1822
|
jpayne@69
|
1823 >>> Position.fromstring("(8.10)")
|
jpayne@69
|
1824 WithinPosition(10, left=8, right=10)
|
jpayne@69
|
1825
|
jpayne@69
|
1826 Fuzzy start positions:
|
jpayne@69
|
1827
|
jpayne@69
|
1828 >>> p = Position.fromstring("<5", -1)
|
jpayne@69
|
1829 >>> p
|
jpayne@69
|
1830 BeforePosition(4)
|
jpayne@69
|
1831 >>> print(p)
|
jpayne@69
|
1832 <4
|
jpayne@69
|
1833 >>> int(p)
|
jpayne@69
|
1834 4
|
jpayne@69
|
1835
|
jpayne@69
|
1836 Notice how the integer behavior changes too!
|
jpayne@69
|
1837
|
jpayne@69
|
1838 >>> p = Position.fromstring("one-of(5,8,11)", -1)
|
jpayne@69
|
1839 >>> p
|
jpayne@69
|
1840 OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
|
jpayne@69
|
1841 >>> print(p)
|
jpayne@69
|
1842 one-of(4,7,10)
|
jpayne@69
|
1843 >>> int(p)
|
jpayne@69
|
1844 4
|
jpayne@69
|
1845
|
jpayne@69
|
1846 """
|
jpayne@69
|
1847 if offset != 0 and offset != -1:
|
jpayne@69
|
1848 raise ValueError(
|
jpayne@69
|
1849 "To convert one-based indices to zero-based indices, offset must be either 0 (for end positions) or -1 (for start positions)."
|
jpayne@69
|
1850 )
|
jpayne@69
|
1851 if text == "?":
|
jpayne@69
|
1852 return UnknownPosition()
|
jpayne@69
|
1853 if text.startswith("?"):
|
jpayne@69
|
1854 return UncertainPosition(int(text[1:]) + offset)
|
jpayne@69
|
1855 if text.startswith("<"):
|
jpayne@69
|
1856 return BeforePosition(int(text[1:]) + offset)
|
jpayne@69
|
1857 if text.startswith(">"):
|
jpayne@69
|
1858 return AfterPosition(int(text[1:]) + offset)
|
jpayne@69
|
1859 m = _re_within_position.match(text)
|
jpayne@69
|
1860 if m is not None:
|
jpayne@69
|
1861 s, e = m.groups()
|
jpayne@69
|
1862 s = int(s) + offset
|
jpayne@69
|
1863 e = int(e) + offset
|
jpayne@69
|
1864 if offset == -1:
|
jpayne@69
|
1865 default = s
|
jpayne@69
|
1866 else:
|
jpayne@69
|
1867 default = e
|
jpayne@69
|
1868 return WithinPosition(default, left=s, right=e)
|
jpayne@69
|
1869 m = _re_oneof_position.match(text)
|
jpayne@69
|
1870 if m is not None:
|
jpayne@69
|
1871 positions = m.groups()[0]
|
jpayne@69
|
1872 parts = [ExactPosition(int(pos) + offset) for pos in positions.split(",")]
|
jpayne@69
|
1873 if offset == -1:
|
jpayne@69
|
1874 default = min(int(pos) for pos in parts)
|
jpayne@69
|
1875 else:
|
jpayne@69
|
1876 default = max(int(pos) for pos in parts)
|
jpayne@69
|
1877 return OneOfPosition(default, choices=parts)
|
jpayne@69
|
1878 return ExactPosition(int(text) + offset)
|
jpayne@69
|
1879
|
jpayne@69
|
1880
|
jpayne@69
|
1881 class ExactPosition(int, Position):
|
jpayne@69
|
1882 """Specify the specific position of a boundary.
|
jpayne@69
|
1883
|
jpayne@69
|
1884 Arguments:
|
jpayne@69
|
1885 - position - The position of the boundary.
|
jpayne@69
|
1886 - extension - An optional argument which must be zero since we don't
|
jpayne@69
|
1887 have an extension. The argument is provided so that the same number
|
jpayne@69
|
1888 of arguments can be passed to all position types.
|
jpayne@69
|
1889
|
jpayne@69
|
1890 In this case, there is no fuzziness associated with the position.
|
jpayne@69
|
1891
|
jpayne@69
|
1892 >>> p = ExactPosition(5)
|
jpayne@69
|
1893 >>> p
|
jpayne@69
|
1894 ExactPosition(5)
|
jpayne@69
|
1895 >>> print(p)
|
jpayne@69
|
1896 5
|
jpayne@69
|
1897
|
jpayne@69
|
1898 >>> isinstance(p, Position)
|
jpayne@69
|
1899 True
|
jpayne@69
|
1900 >>> isinstance(p, int)
|
jpayne@69
|
1901 True
|
jpayne@69
|
1902
|
jpayne@69
|
1903 Integer comparisons and operations should work as expected:
|
jpayne@69
|
1904
|
jpayne@69
|
1905 >>> p == 5
|
jpayne@69
|
1906 True
|
jpayne@69
|
1907 >>> p < 6
|
jpayne@69
|
1908 True
|
jpayne@69
|
1909 >>> p <= 5
|
jpayne@69
|
1910 True
|
jpayne@69
|
1911 >>> p + 10
|
jpayne@69
|
1912 ExactPosition(15)
|
jpayne@69
|
1913
|
jpayne@69
|
1914 """
|
jpayne@69
|
1915
|
jpayne@69
|
1916 def __new__(cls, position, extension=0):
|
jpayne@69
|
1917 """Create an ExactPosition object."""
|
jpayne@69
|
1918 if extension != 0:
|
jpayne@69
|
1919 raise AttributeError(f"Non-zero extension {extension} for exact position.")
|
jpayne@69
|
1920 return int.__new__(cls, position)
|
jpayne@69
|
1921
|
jpayne@69
|
1922 # Must define this on Python 3.8 onwards because we redefine __repr__
|
jpayne@69
|
1923 def __str__(self):
|
jpayne@69
|
1924 """Return a representation of the ExactPosition object (with python counting)."""
|
jpayne@69
|
1925 return str(int(self))
|
jpayne@69
|
1926
|
jpayne@69
|
1927 def __repr__(self):
|
jpayne@69
|
1928 """Represent the ExactPosition object as a string for debugging."""
|
jpayne@69
|
1929 return "%s(%i)" % (self.__class__.__name__, int(self))
|
jpayne@69
|
1930
|
jpayne@69
|
1931 def __add__(self, offset):
|
jpayne@69
|
1932 """Return a copy of the position object with its location shifted (PRIVATE)."""
|
jpayne@69
|
1933 # By default preserve any subclass
|
jpayne@69
|
1934 return self.__class__(int(self) + offset)
|
jpayne@69
|
1935
|
jpayne@69
|
1936 def _flip(self, length):
|
jpayne@69
|
1937 """Return a copy of the location after the parent is reversed (PRIVATE)."""
|
jpayne@69
|
1938 # By default preserve any subclass
|
jpayne@69
|
1939 return self.__class__(length - int(self))
|
jpayne@69
|
1940
|
jpayne@69
|
1941
|
jpayne@69
|
1942 class UncertainPosition(ExactPosition):
|
jpayne@69
|
1943 """Specify a specific position which is uncertain.
|
jpayne@69
|
1944
|
jpayne@69
|
1945 This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
|
jpayne@69
|
1946 XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.
|
jpayne@69
|
1947 """
|
jpayne@69
|
1948
|
jpayne@69
|
1949
|
jpayne@69
|
1950 class UnknownPosition(Position):
|
jpayne@69
|
1951 """Specify a specific position which is unknown (has no position).
|
jpayne@69
|
1952
|
jpayne@69
|
1953 This is used in UniProt, e.g. ? or in the XML as unknown.
|
jpayne@69
|
1954 """
|
jpayne@69
|
1955
|
jpayne@69
|
1956 def __repr__(self):
|
jpayne@69
|
1957 """Represent the UnknownPosition object as a string for debugging."""
|
jpayne@69
|
1958 return f"{self.__class__.__name__}()"
|
jpayne@69
|
1959
|
jpayne@69
|
1960 def __hash__(self):
|
jpayne@69
|
1961 """Return the hash value of the UnknownPosition object."""
|
jpayne@69
|
1962 return hash(None)
|
jpayne@69
|
1963
|
jpayne@69
|
1964 def __add__(self, offset):
|
jpayne@69
|
1965 """Return a copy of the position object with its location shifted (PRIVATE)."""
|
jpayne@69
|
1966 return self
|
jpayne@69
|
1967
|
jpayne@69
|
1968 def _flip(self, length):
|
jpayne@69
|
1969 """Return a copy of the location after the parent is reversed (PRIVATE)."""
|
jpayne@69
|
1970 return self
|
jpayne@69
|
1971
|
jpayne@69
|
1972
|
jpayne@69
|
1973 class WithinPosition(int, Position):
|
jpayne@69
|
1974 """Specify the position of a boundary within some coordinates.
|
jpayne@69
|
1975
|
jpayne@69
|
1976 Arguments:
|
jpayne@69
|
1977 - position - The default integer position
|
jpayne@69
|
1978 - left - The start (left) position of the boundary
|
jpayne@69
|
1979 - right - The end (right) position of the boundary
|
jpayne@69
|
1980
|
jpayne@69
|
1981 This allows dealing with a location like ((11.14)..100). This
|
jpayne@69
|
1982 indicates that the start of the sequence is somewhere between 11
|
jpayne@69
|
1983 and 14. Since this is a start coordinate, it should act like
|
jpayne@69
|
1984 it is at position 11 (or in Python counting, 10).
|
jpayne@69
|
1985
|
jpayne@69
|
1986 >>> p = WithinPosition(10, 10, 13)
|
jpayne@69
|
1987 >>> p
|
jpayne@69
|
1988 WithinPosition(10, left=10, right=13)
|
jpayne@69
|
1989 >>> print(p)
|
jpayne@69
|
1990 (10.13)
|
jpayne@69
|
1991 >>> int(p)
|
jpayne@69
|
1992 10
|
jpayne@69
|
1993
|
jpayne@69
|
1994 Basic integer comparisons and operations should work as though
|
jpayne@69
|
1995 this were a plain integer:
|
jpayne@69
|
1996
|
jpayne@69
|
1997 >>> p == 10
|
jpayne@69
|
1998 True
|
jpayne@69
|
1999 >>> p in [9, 10, 11]
|
jpayne@69
|
2000 True
|
jpayne@69
|
2001 >>> p < 11
|
jpayne@69
|
2002 True
|
jpayne@69
|
2003 >>> p + 10
|
jpayne@69
|
2004 WithinPosition(20, left=20, right=23)
|
jpayne@69
|
2005
|
jpayne@69
|
2006 >>> isinstance(p, WithinPosition)
|
jpayne@69
|
2007 True
|
jpayne@69
|
2008 >>> isinstance(p, Position)
|
jpayne@69
|
2009 True
|
jpayne@69
|
2010 >>> isinstance(p, int)
|
jpayne@69
|
2011 True
|
jpayne@69
|
2012
|
jpayne@69
|
2013 Note this also applies for comparison to other position objects,
|
jpayne@69
|
2014 where again the integer behavior is used:
|
jpayne@69
|
2015
|
jpayne@69
|
2016 >>> p == 10
|
jpayne@69
|
2017 True
|
jpayne@69
|
2018 >>> p == ExactPosition(10)
|
jpayne@69
|
2019 True
|
jpayne@69
|
2020 >>> p == BeforePosition(10)
|
jpayne@69
|
2021 True
|
jpayne@69
|
2022 >>> p == AfterPosition(10)
|
jpayne@69
|
2023 True
|
jpayne@69
|
2024
|
jpayne@69
|
2025 If this were an end point, you would want the position to be 13
|
jpayne@69
|
2026 (the right/larger value, not the left/smaller value as above):
|
jpayne@69
|
2027
|
jpayne@69
|
2028 >>> p2 = WithinPosition(13, 10, 13)
|
jpayne@69
|
2029 >>> p2
|
jpayne@69
|
2030 WithinPosition(13, left=10, right=13)
|
jpayne@69
|
2031 >>> print(p2)
|
jpayne@69
|
2032 (10.13)
|
jpayne@69
|
2033 >>> int(p2)
|
jpayne@69
|
2034 13
|
jpayne@69
|
2035 >>> p2 == 13
|
jpayne@69
|
2036 True
|
jpayne@69
|
2037 >>> p2 == ExactPosition(13)
|
jpayne@69
|
2038 True
|
jpayne@69
|
2039
|
jpayne@69
|
2040 """
|
jpayne@69
|
2041
|
jpayne@69
|
2042 def __new__(cls, position, left, right):
|
jpayne@69
|
2043 """Create a WithinPosition object."""
|
jpayne@69
|
2044 if not (position == left or position == right):
|
jpayne@69
|
2045 raise RuntimeError(
|
jpayne@69
|
2046 "WithinPosition: %r should match left %r or "
|
jpayne@69
|
2047 "right %r" % (position, left, right)
|
jpayne@69
|
2048 )
|
jpayne@69
|
2049 obj = int.__new__(cls, position)
|
jpayne@69
|
2050 obj._left = left
|
jpayne@69
|
2051 obj._right = right
|
jpayne@69
|
2052 return obj
|
jpayne@69
|
2053
|
jpayne@69
|
2054 def __getnewargs__(self):
|
jpayne@69
|
2055 """Return the arguments accepted by __new__.
|
jpayne@69
|
2056
|
jpayne@69
|
2057 Necessary to allow pickling and unpickling of class instances.
|
jpayne@69
|
2058 """
|
jpayne@69
|
2059 return (int(self), self._left, self._right)
|
jpayne@69
|
2060
|
jpayne@69
|
2061 def __repr__(self):
|
jpayne@69
|
2062 """Represent the WithinPosition object as a string for debugging."""
|
jpayne@69
|
2063 return "%s(%i, left=%i, right=%i)" % (
|
jpayne@69
|
2064 self.__class__.__name__,
|
jpayne@69
|
2065 int(self),
|
jpayne@69
|
2066 self._left,
|
jpayne@69
|
2067 self._right,
|
jpayne@69
|
2068 )
|
jpayne@69
|
2069
|
jpayne@69
|
2070 def __str__(self):
|
jpayne@69
|
2071 """Return a representation of the WithinPosition object (with python counting)."""
|
jpayne@69
|
2072 return f"({self._left}.{self._right})"
|
jpayne@69
|
2073
|
jpayne@69
|
2074 def __add__(self, offset):
|
jpayne@69
|
2075 """Return a copy of the position object with its location shifted."""
|
jpayne@69
|
2076 return self.__class__(
|
jpayne@69
|
2077 int(self) + offset, self._left + offset, self._right + offset
|
jpayne@69
|
2078 )
|
jpayne@69
|
2079
|
jpayne@69
|
2080 def _flip(self, length):
|
jpayne@69
|
2081 """Return a copy of the location after the parent is reversed (PRIVATE)."""
|
jpayne@69
|
2082 return self.__class__(
|
jpayne@69
|
2083 length - int(self), length - self._right, length - self._left
|
jpayne@69
|
2084 )
|
jpayne@69
|
2085
|
jpayne@69
|
2086
|
jpayne@69
|
2087 class BetweenPosition(int, Position):
|
jpayne@69
|
2088 """Specify the position of a boundary between two coordinates (OBSOLETE?).
|
jpayne@69
|
2089
|
jpayne@69
|
2090 Arguments:
|
jpayne@69
|
2091 - position - The default integer position
|
jpayne@69
|
2092 - left - The start (left) position of the boundary
|
jpayne@69
|
2093 - right - The end (right) position of the boundary
|
jpayne@69
|
2094
|
jpayne@69
|
2095 This allows dealing with a position like 123^456. This
|
jpayne@69
|
2096 indicates that the start of the sequence is somewhere between
|
jpayne@69
|
2097 123 and 456. It is up to the parser to set the position argument
|
jpayne@69
|
2098 to either boundary point (depending on if this is being used as
|
jpayne@69
|
2099 a start or end of the feature). For example as a feature end:
|
jpayne@69
|
2100
|
jpayne@69
|
2101 >>> p = BetweenPosition(456, 123, 456)
|
jpayne@69
|
2102 >>> p
|
jpayne@69
|
2103 BetweenPosition(456, left=123, right=456)
|
jpayne@69
|
2104 >>> print(p)
|
jpayne@69
|
2105 (123^456)
|
jpayne@69
|
2106 >>> int(p)
|
jpayne@69
|
2107 456
|
jpayne@69
|
2108
|
jpayne@69
|
2109 Integer equality and comparison use the given position,
|
jpayne@69
|
2110
|
jpayne@69
|
2111 >>> p == 456
|
jpayne@69
|
2112 True
|
jpayne@69
|
2113 >>> p in [455, 456, 457]
|
jpayne@69
|
2114 True
|
jpayne@69
|
2115 >>> p > 300
|
jpayne@69
|
2116 True
|
jpayne@69
|
2117
|
jpayne@69
|
2118 The old legacy properties of position and extension give the
|
jpayne@69
|
2119 starting/lower/left position as an integer, and the distance
|
jpayne@69
|
2120 to the ending/higher/right position as an integer. Note that
|
jpayne@69
|
2121 the position object will act like either the left or the right
|
jpayne@69
|
2122 end-point depending on how it was created:
|
jpayne@69
|
2123
|
jpayne@69
|
2124 >>> p2 = BetweenPosition(123, left=123, right=456)
|
jpayne@69
|
2125 >>> int(p) == int(p2)
|
jpayne@69
|
2126 False
|
jpayne@69
|
2127 >>> p == 456
|
jpayne@69
|
2128 True
|
jpayne@69
|
2129 >>> p2 == 123
|
jpayne@69
|
2130 True
|
jpayne@69
|
2131
|
jpayne@69
|
2132 Note this potentially surprising behavior:
|
jpayne@69
|
2133
|
jpayne@69
|
2134 >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
|
jpayne@69
|
2135 True
|
jpayne@69
|
2136 >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
|
jpayne@69
|
2137 True
|
jpayne@69
|
2138 >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
|
jpayne@69
|
2139 True
|
jpayne@69
|
2140
|
jpayne@69
|
2141 i.e. For equality (and sorting) the position objects behave like
|
jpayne@69
|
2142 integers.
|
jpayne@69
|
2143
|
jpayne@69
|
2144 """
|
jpayne@69
|
2145
|
jpayne@69
|
2146 def __new__(cls, position, left, right):
|
jpayne@69
|
2147 """Create a new instance in BetweenPosition object."""
|
jpayne@69
|
2148 assert position == left or position == right
|
jpayne@69
|
2149 # TODO - public API for getting left/right, especially the unknown one
|
jpayne@69
|
2150 obj = int.__new__(cls, position)
|
jpayne@69
|
2151 obj._left = left
|
jpayne@69
|
2152 obj._right = right
|
jpayne@69
|
2153 return obj
|
jpayne@69
|
2154
|
jpayne@69
|
2155 def __getnewargs__(self):
|
jpayne@69
|
2156 """Return the arguments accepted by __new__.
|
jpayne@69
|
2157
|
jpayne@69
|
2158 Necessary to allow pickling and unpickling of class instances.
|
jpayne@69
|
2159 """
|
jpayne@69
|
2160 return (int(self), self._left, self._right)
|
jpayne@69
|
2161
|
jpayne@69
|
2162 def __repr__(self):
|
jpayne@69
|
2163 """Represent the BetweenPosition object as a string for debugging."""
|
jpayne@69
|
2164 return "%s(%i, left=%i, right=%i)" % (
|
jpayne@69
|
2165 self.__class__.__name__,
|
jpayne@69
|
2166 int(self),
|
jpayne@69
|
2167 self._left,
|
jpayne@69
|
2168 self._right,
|
jpayne@69
|
2169 )
|
jpayne@69
|
2170
|
jpayne@69
|
2171 def __str__(self):
|
jpayne@69
|
2172 """Return a representation of the BetweenPosition object (with python counting)."""
|
jpayne@69
|
2173 return f"({self._left}^{self._right})"
|
jpayne@69
|
2174
|
jpayne@69
|
2175 def __add__(self, offset):
|
jpayne@69
|
2176 """Return a copy of the position object with its location shifted (PRIVATE)."""
|
jpayne@69
|
2177 return self.__class__(
|
jpayne@69
|
2178 int(self) + offset, self._left + offset, self._right + offset
|
jpayne@69
|
2179 )
|
jpayne@69
|
2180
|
jpayne@69
|
2181 def _flip(self, length):
|
jpayne@69
|
2182 """Return a copy of the location after the parent is reversed (PRIVATE)."""
|
jpayne@69
|
2183 return self.__class__(
|
jpayne@69
|
2184 length - int(self), length - self._right, length - self._left
|
jpayne@69
|
2185 )
|
jpayne@69
|
2186
|
jpayne@69
|
2187
|
jpayne@69
|
2188 class BeforePosition(int, Position):
|
jpayne@69
|
2189 """Specify a position where the actual location occurs before it.
|
jpayne@69
|
2190
|
jpayne@69
|
2191 Arguments:
|
jpayne@69
|
2192 - position - The upper boundary of where the location can occur.
|
jpayne@69
|
2193 - extension - An optional argument which must be zero since we don't
|
jpayne@69
|
2194 have an extension. The argument is provided so that the same number
|
jpayne@69
|
2195 of arguments can be passed to all position types.
|
jpayne@69
|
2196
|
jpayne@69
|
2197 This is used to specify positions like (<10..100) where the location
|
jpayne@69
|
2198 occurs somewhere before position 10.
|
jpayne@69
|
2199
|
jpayne@69
|
2200 >>> p = BeforePosition(5)
|
jpayne@69
|
2201 >>> p
|
jpayne@69
|
2202 BeforePosition(5)
|
jpayne@69
|
2203 >>> print(p)
|
jpayne@69
|
2204 <5
|
jpayne@69
|
2205 >>> int(p)
|
jpayne@69
|
2206 5
|
jpayne@69
|
2207 >>> p + 10
|
jpayne@69
|
2208 BeforePosition(15)
|
jpayne@69
|
2209
|
jpayne@69
|
2210 Note this potentially surprising behavior:
|
jpayne@69
|
2211
|
jpayne@69
|
2212 >>> p == ExactPosition(5)
|
jpayne@69
|
2213 True
|
jpayne@69
|
2214 >>> p == AfterPosition(5)
|
jpayne@69
|
2215 True
|
jpayne@69
|
2216
|
jpayne@69
|
2217 Just remember that for equality and sorting the position objects act
|
jpayne@69
|
2218 like integers.
|
jpayne@69
|
2219 """
|
jpayne@69
|
2220
|
jpayne@69
|
2221 # Subclasses int so can't use __init__
|
jpayne@69
|
2222 def __new__(cls, position, extension=0):
|
jpayne@69
|
2223 """Create a new instance in BeforePosition object."""
|
jpayne@69
|
2224 if extension != 0:
|
jpayne@69
|
2225 raise AttributeError(f"Non-zero extension {extension} for exact position.")
|
jpayne@69
|
2226 return int.__new__(cls, position)
|
jpayne@69
|
2227
|
jpayne@69
|
2228 def __repr__(self):
|
jpayne@69
|
2229 """Represent the location as a string for debugging."""
|
jpayne@69
|
2230 return "%s(%i)" % (self.__class__.__name__, int(self))
|
jpayne@69
|
2231
|
jpayne@69
|
2232 def __str__(self):
|
jpayne@69
|
2233 """Return a representation of the BeforePosition object (with python counting)."""
|
jpayne@69
|
2234 return f"<{int(self)}"
|
jpayne@69
|
2235
|
jpayne@69
|
2236 def __add__(self, offset):
|
jpayne@69
|
2237 """Return a copy of the position object with its location shifted (PRIVATE)."""
|
jpayne@69
|
2238 return self.__class__(int(self) + offset)
|
jpayne@69
|
2239
|
jpayne@69
|
2240 def _flip(self, length):
|
jpayne@69
|
2241 """Return a copy of the location after the parent is reversed (PRIVATE)."""
|
jpayne@69
|
2242 return AfterPosition(length - int(self))
|
jpayne@69
|
2243
|
jpayne@69
|
2244
|
jpayne@69
|
2245 class AfterPosition(int, Position):
|
jpayne@69
|
2246 """Specify a position where the actual location is found after it.
|
jpayne@69
|
2247
|
jpayne@69
|
2248 Arguments:
|
jpayne@69
|
2249 - position - The lower boundary of where the location can occur.
|
jpayne@69
|
2250 - extension - An optional argument which must be zero since we don't
|
jpayne@69
|
2251 have an extension. The argument is provided so that the same number
|
jpayne@69
|
2252 of arguments can be passed to all position types.
|
jpayne@69
|
2253
|
jpayne@69
|
2254 This is used to specify positions like (>10..100) where the location
|
jpayne@69
|
2255 occurs somewhere after position 10.
|
jpayne@69
|
2256
|
jpayne@69
|
2257 >>> p = AfterPosition(7)
|
jpayne@69
|
2258 >>> p
|
jpayne@69
|
2259 AfterPosition(7)
|
jpayne@69
|
2260 >>> print(p)
|
jpayne@69
|
2261 >7
|
jpayne@69
|
2262 >>> int(p)
|
jpayne@69
|
2263 7
|
jpayne@69
|
2264 >>> p + 10
|
jpayne@69
|
2265 AfterPosition(17)
|
jpayne@69
|
2266
|
jpayne@69
|
2267 >>> isinstance(p, AfterPosition)
|
jpayne@69
|
2268 True
|
jpayne@69
|
2269 >>> isinstance(p, Position)
|
jpayne@69
|
2270 True
|
jpayne@69
|
2271 >>> isinstance(p, int)
|
jpayne@69
|
2272 True
|
jpayne@69
|
2273
|
jpayne@69
|
2274 Note this potentially surprising behavior:
|
jpayne@69
|
2275
|
jpayne@69
|
2276 >>> p == ExactPosition(7)
|
jpayne@69
|
2277 True
|
jpayne@69
|
2278 >>> p == BeforePosition(7)
|
jpayne@69
|
2279 True
|
jpayne@69
|
2280
|
jpayne@69
|
2281 Just remember that for equality and sorting the position objects act
|
jpayne@69
|
2282 like integers.
|
jpayne@69
|
2283 """
|
jpayne@69
|
2284
|
jpayne@69
|
2285 # Subclasses int so can't use __init__
|
jpayne@69
|
2286 def __new__(cls, position, extension=0):
|
jpayne@69
|
2287 """Create a new instance of the AfterPosition object."""
|
jpayne@69
|
2288 if extension != 0:
|
jpayne@69
|
2289 raise AttributeError(f"Non-zero extension {extension} for exact position.")
|
jpayne@69
|
2290 return int.__new__(cls, position)
|
jpayne@69
|
2291
|
jpayne@69
|
2292 def __repr__(self):
|
jpayne@69
|
2293 """Represent the location as a string for debugging."""
|
jpayne@69
|
2294 return "%s(%i)" % (self.__class__.__name__, int(self))
|
jpayne@69
|
2295
|
jpayne@69
|
2296 def __str__(self):
|
jpayne@69
|
2297 """Return a representation of the AfterPosition object (with python counting)."""
|
jpayne@69
|
2298 return f">{int(self)}"
|
jpayne@69
|
2299
|
jpayne@69
|
2300 def __add__(self, offset):
|
jpayne@69
|
2301 """Return a copy of the position object with its location shifted (PRIVATE)."""
|
jpayne@69
|
2302 return self.__class__(int(self) + offset)
|
jpayne@69
|
2303
|
jpayne@69
|
2304 def _flip(self, length):
|
jpayne@69
|
2305 """Return a copy of the location after the parent is reversed (PRIVATE)."""
|
jpayne@69
|
2306 return BeforePosition(length - int(self))
|
jpayne@69
|
2307
|
jpayne@69
|
2308
|
jpayne@69
|
2309 class OneOfPosition(int, Position):
|
jpayne@69
|
2310 """Specify a position where the location can be multiple positions.
|
jpayne@69
|
2311
|
jpayne@69
|
2312 This models the GenBank 'one-of(1888,1901)' function, and tries
|
jpayne@69
|
2313 to make this fit within the Biopython Position models. If this was
|
jpayne@69
|
2314 a start position it should act like 1888, but as an end position 1901.
|
jpayne@69
|
2315
|
jpayne@69
|
2316 >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
|
jpayne@69
|
2317 >>> p
|
jpayne@69
|
2318 OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
|
jpayne@69
|
2319 >>> int(p)
|
jpayne@69
|
2320 1888
|
jpayne@69
|
2321
|
jpayne@69
|
2322 Integer comparisons and operators act like using int(p),
|
jpayne@69
|
2323
|
jpayne@69
|
2324 >>> p == 1888
|
jpayne@69
|
2325 True
|
jpayne@69
|
2326 >>> p <= 1888
|
jpayne@69
|
2327 True
|
jpayne@69
|
2328 >>> p > 1888
|
jpayne@69
|
2329 False
|
jpayne@69
|
2330 >>> p + 100
|
jpayne@69
|
2331 OneOfPosition(1988, choices=[ExactPosition(1988), ExactPosition(2001)])
|
jpayne@69
|
2332
|
jpayne@69
|
2333 >>> isinstance(p, OneOfPosition)
|
jpayne@69
|
2334 True
|
jpayne@69
|
2335 >>> isinstance(p, Position)
|
jpayne@69
|
2336 True
|
jpayne@69
|
2337 >>> isinstance(p, int)
|
jpayne@69
|
2338 True
|
jpayne@69
|
2339
|
jpayne@69
|
2340 """
|
jpayne@69
|
2341
|
jpayne@69
|
2342 def __new__(cls, position, choices):
|
jpayne@69
|
2343 """Initialize with a set of possible positions.
|
jpayne@69
|
2344
|
jpayne@69
|
2345 choices is a list of Position derived objects, specifying possible
|
jpayne@69
|
2346 locations.
|
jpayne@69
|
2347
|
jpayne@69
|
2348 position is an integer specifying the default behavior.
|
jpayne@69
|
2349 """
|
jpayne@69
|
2350 if position not in choices:
|
jpayne@69
|
2351 raise ValueError(
|
jpayne@69
|
2352 f"OneOfPosition: {position!r} should match one of {choices!r}"
|
jpayne@69
|
2353 )
|
jpayne@69
|
2354 obj = int.__new__(cls, position)
|
jpayne@69
|
2355 obj.position_choices = choices
|
jpayne@69
|
2356 return obj
|
jpayne@69
|
2357
|
jpayne@69
|
2358 def __getnewargs__(self):
|
jpayne@69
|
2359 """Return the arguments accepted by __new__.
|
jpayne@69
|
2360
|
jpayne@69
|
2361 Necessary to allow pickling and unpickling of class instances.
|
jpayne@69
|
2362 """
|
jpayne@69
|
2363 return (int(self), self.position_choices)
|
jpayne@69
|
2364
|
jpayne@69
|
2365 def __repr__(self):
|
jpayne@69
|
2366 """Represent the OneOfPosition object as a string for debugging."""
|
jpayne@69
|
2367 return "%s(%i, choices=%r)" % (
|
jpayne@69
|
2368 self.__class__.__name__,
|
jpayne@69
|
2369 int(self),
|
jpayne@69
|
2370 self.position_choices,
|
jpayne@69
|
2371 )
|
jpayne@69
|
2372
|
jpayne@69
|
2373 def __str__(self):
|
jpayne@69
|
2374 """Return a representation of the OneOfPosition object (with python counting)."""
|
jpayne@69
|
2375 out = "one-of("
|
jpayne@69
|
2376 for position in self.position_choices:
|
jpayne@69
|
2377 out += f"{position},"
|
jpayne@69
|
2378 # replace the last comma with the closing parenthesis
|
jpayne@69
|
2379 return out[:-1] + ")"
|
jpayne@69
|
2380
|
jpayne@69
|
2381 def __add__(self, offset):
|
jpayne@69
|
2382 """Return a copy of the position object with its location shifted (PRIVATE)."""
|
jpayne@69
|
2383 return self.__class__(
|
jpayne@69
|
2384 int(self) + offset, [p + offset for p in self.position_choices]
|
jpayne@69
|
2385 )
|
jpayne@69
|
2386
|
jpayne@69
|
2387 def _flip(self, length):
|
jpayne@69
|
2388 """Return a copy of the location after the parent is reversed (PRIVATE)."""
|
jpayne@69
|
2389 return self.__class__(
|
jpayne@69
|
2390 length - int(self), [p._flip(length) for p in self.position_choices[::-1]]
|
jpayne@69
|
2391 )
|
jpayne@69
|
2392
|
jpayne@69
|
2393
|
jpayne@69
|
2394 if __name__ == "__main__":
|
jpayne@69
|
2395 from Bio._utils import run_doctest
|
jpayne@69
|
2396
|
jpayne@69
|
2397 run_doctest()
|