Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/SeqFeature.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 # Copyright 2000-2003 Jeff Chang. | |
2 # Copyright 2001-2008 Brad Chapman. | |
3 # Copyright 2005-2024 by Peter Cock. | |
4 # Copyright 2006-2009 Michiel de Hoon. | |
5 # All rights reserved. | |
6 # | |
7 # This file is part of the Biopython distribution and governed by your | |
8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
9 # Please see the LICENSE file that should have been included as part of this | |
10 # package. | |
11 """Represent a Sequence Feature holding info about a part of a sequence. | |
12 | |
13 This is heavily modeled after the Biocorba SeqFeature objects, and | |
14 may be pretty biased towards GenBank stuff since I'm writing it | |
15 for the GenBank parser output... | |
16 | |
17 What's here: | |
18 | |
19 Base class to hold a Feature | |
20 ---------------------------- | |
21 | |
22 Classes: | |
23 - SeqFeature | |
24 | |
25 Hold information about a Reference | |
26 ---------------------------------- | |
27 | |
28 This is an attempt to create a General class to hold Reference type | |
29 information. | |
30 | |
31 Classes: | |
32 - Reference | |
33 | |
34 Specify locations of a feature on a Sequence | |
35 -------------------------------------------- | |
36 | |
37 This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'. | |
38 This has the advantages of allowing us to handle fuzzy stuff in case anyone | |
39 needs it, and also be compatible with BioPerl etc and BioSQL. | |
40 | |
41 Classes: | |
42 - Location - abstract base class of SimpleLocation and CompoundLocation. | |
43 - SimpleLocation - Specify the start and end location of a feature. | |
44 - CompoundLocation - Collection of SimpleLocation objects (for joins etc). | |
45 - Position - abstract base class of ExactPosition, WithinPosition, | |
46 BetweenPosition, AfterPosition, OneOfPosition, UncertainPosition, and | |
47 UnknownPosition. | |
48 - ExactPosition - Specify the position as being exact. | |
49 - WithinPosition - Specify a position occurring within some range. | |
50 - BetweenPosition - Specify a position occurring between a range (OBSOLETE?). | |
51 - BeforePosition - Specify the position as being found before some base. | |
52 - AfterPosition - Specify the position as being found after some base. | |
53 - OneOfPosition - Specify a position consisting of multiple alternative positions. | |
54 - UncertainPosition - Specify a specific position which is uncertain. | |
55 - UnknownPosition - Represents missing information like '?' in UniProt. | |
56 | |
57 | |
58 Exceptions: | |
59 - LocationParserError - Exception indicating a failure to parse a location | |
60 string. | |
61 | |
62 """ | |
63 import functools | |
64 import re | |
65 import warnings | |
66 from abc import ABC, abstractmethod | |
67 | |
68 from Bio import BiopythonDeprecationWarning | |
69 from Bio import BiopythonParserWarning | |
70 from Bio.Seq import MutableSeq | |
71 from Bio.Seq import reverse_complement | |
72 from Bio.Seq import Seq | |
73 | |
74 | |
75 # Regular expressions for location parsing | |
76 | |
# Optional "<name>:" prefix naming another record, e.g. the "NC_016402.1:"
# in "NC_016402.1:6618..6676" (see the self-check further down).
_reference = r"(?:[a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)"
# "one-of(a,b,...)" position without capture groups; this name is rebound to
# a capturing variant below, AFTER it has been interpolated into the
# location patterns, so the statement order in this section matters.
# NOTE(review): "[,\d+]+" is a character class (comma, digit or a literal
# "+"), so it accepts more than comma separated integers, e.g.
# "one-of(1,,2)" - "(?:,\d+)+" may have been intended; confirm before changing.
_oneof_position = r"one\-of\(\d+[,\d+]+\)"

# A "start..end" pair where either end may be a plain number or a one-of().
_oneof_location = rf"[<>]?(?:\d+|{_oneof_position})\.\.[<>]?(?:\d+|{_oneof_position})"

# One item of a comma separated location list. The specific alternatives come
# first so that the comma inside "one-of(...)" is not taken as an item
# separator; "[^,]+" is the fallback for everything else.
_any_location = rf"({_reference}?{_oneof_location}|complement\({_oneof_location}\)|[^,]+|complement\([^,]+\))"

# re.split() with a single capturing group returns the captured matches at
# the odd indices of its result, which is why callers take "[1::2]".
_split = re.compile(_any_location).split

# Import-time self-checks of _split (assert statements are skipped when
# Python runs with -O).
assert _split("123..145")[1::2] == ["123..145"]
assert _split("123..145,200..209")[1::2] == ["123..145", "200..209"]
assert _split("one-of(200,203)..300")[1::2] == ["one-of(200,203)..300"]
assert _split("complement(123..145),200..209")[1::2] == [
    "complement(123..145)",
    "200..209",
]
assert _split("123..145,one-of(200,203)..209")[1::2] == [
    "123..145",
    "one-of(200,203)..209",
]
assert _split("123..145,one-of(200,203)..one-of(209,211),300")[1::2] == [
    "123..145",
    "one-of(200,203)..one-of(209,211)",
    "300",
]
assert _split("123..145,complement(one-of(200,203)..one-of(209,211)),300")[1::2] == [
    "123..145",
    "complement(one-of(200,203)..one-of(209,211))",
    "300",
]
assert _split("123..145,200..one-of(209,211),300")[1::2] == [
    "123..145",
    "200..one-of(209,211)",
    "300",
]
assert _split("123..145,200..one-of(209,211)")[1::2] == [
    "123..145",
    "200..one-of(209,211)",
]
assert _split(
    "complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"
)[1::2] == [
    "complement(149815..150200)",
    "complement(293787..295573)",
    "NC_016402.1:6618..6676",
    "181647..181905",
]


# "start..end" with optional "<" / ">" markers and an optional minus sign
# on each number.
_pair_location = r"[<>]?-?\d+\.\.[<>]?-?\d+"

# "P^Q" between-position syntax.
_between_location = r"\d+\^\d+"

# "(a.b)" within-position. The non-capturing form is interpolated into
# _within_location first; the name is then rebound to a variant whose two
# groups capture the numbers, and that variant is what gets compiled.
_within_position = r"\(\d+\.\d+\)"
_within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (
    _within_position,
    _within_position,
)
_within_position = r"\((\d+)\.(\d+)\)"
_re_within_position = re.compile(_within_position)
assert _re_within_position.match("(3.9)")

# Same rebinding trick for one-of: _oneof_location is rebuilt here (now with
# capture groups around each end) from the non-capturing _oneof_position,
# then _oneof_position is rebound so the compiled pattern captures the list.
_oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (_oneof_position, _oneof_position)
_oneof_position = r"one\-of\((\d+[,\d+]+)\)"
_re_oneof_position = re.compile(_oneof_position)
assert _re_oneof_position.match("one-of(6,9)")
assert not _re_oneof_position.match("one-of(3)")
assert _re_oneof_position.match("one-of(3,6)")
assert _re_oneof_position.match("one-of(3,6,9)")

# A single position, and a single position wrapped in "bond(...)".
_solo_location = r"[<>]?\d+"
_solo_bond = r"bond\(%s\)" % _solo_location

# Classifies a location string via named groups (pair, between, within,
# oneof, bond, solo).
# NOTE(review): "|" has the lowest precedence, so "^" belongs only to the
# "pair" alternative and "$" only to the "solo" alternative; the other
# alternatives are not anchored at the end by this pattern. Whether the code
# using this pattern separately checks that the whole string matched is not
# visible in this section - confirm before relying on it.
_re_location_category = re.compile(
    r"^(?P<pair>%s)|(?P<between>%s)|(?P<within>%s)|(?P<oneof>%s)|(?P<bond>%s)|(?P<solo>%s)$"
    % (
        _pair_location,
        _between_location,
        _within_location,
        _oneof_location,
        _solo_bond,
        _solo_location,
    )
)
161 | |
162 | |
class LocationParserError(ValueError):
    """Raised when a feature location string cannot be parsed.

    This is a ValueError subclass, so handlers written for ValueError
    also catch it.
    """
165 | |
166 | |
class SeqFeature:
    """Represent a Sequence Feature on an object.

    Attributes:
     - location - the location of the feature on the sequence (a
       SimpleLocation or CompoundLocation, or None if there is no location)
     - type - the specified type of the feature (ie. CDS, exon, repeat...)
     - id - A string identifier for the feature.
     - qualifiers - A dictionary of qualifiers on the feature. These are
       analogous to the qualifiers from a GenBank feature table. The keys of
       the dictionary are qualifier names, the values are the qualifier
       values.

    """

    def __init__(
        self,
        location=None,
        type="",
        id="<unknown id>",
        qualifiers=None,
        sub_features=None,
    ):
        """Initialize a SeqFeature on a sequence.

        location can either be a SimpleLocation (with strand argument also
        given if required), a CompoundLocation, or None. Anything else
        raises a TypeError.

        The qualifiers mapping, if given, is copied into a new dictionary.
        The sub_features argument is no longer supported; passing anything
        other than None raises a TypeError.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f1 = SeqFeature(SimpleLocation(5, 10), type="domain")
        >>> f1.location.strand == None
        True
        >>> f2 = SeqFeature(SimpleLocation(7, 110, strand=1), type="CDS")
        >>> f2.location.strand == +1
        True
        >>> f3 = SeqFeature(SimpleLocation(9, 108, strand=-1), type="CDS")
        >>> f3.location.strand == -1
        True

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        SimpleLocation must be specified via the appropriate position objects.
        """
        if location is not None and not isinstance(
            location, (SimpleLocation, CompoundLocation)
        ):
            raise TypeError(
                "SimpleLocation, CompoundLocation (or None) required for the location"
            )
        self.location = location
        self.type = type
        self.id = id
        # Always work on our own dictionary, never the caller's object.
        self.qualifiers = {}
        if qualifiers is not None:
            self.qualifiers.update(qualifiers)
        if sub_features is not None:
            raise TypeError("Rather than sub_features, use a CompoundLocation")

    def _get_strand(self):
        """Get function for the strand property (PRIVATE)."""
        warnings.warn(
            "Please use .location.strand rather than .strand",
            BiopythonDeprecationWarning,
        )
        return self.location.strand

    def _set_strand(self, value):
        """Set function for the strand property (PRIVATE)."""
        warnings.warn(
            "Please use .location.strand rather than .strand",
            BiopythonDeprecationWarning,
        )
        try:
            self.location.strand = value
        except AttributeError:
            # With no location, setting None is a harmless no-op; anything
            # else cannot be stored. Other AttributeErrors are re-raised.
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set strand without a location.") from None
            else:
                raise

    strand = property(
        fget=_get_strand,
        fset=_set_strand,
        doc="Alias for the location's strand (DEPRECATED).",
    )

    def _get_ref(self):
        """Get function for the reference property (PRIVATE)."""
        warnings.warn(
            "Please use .location.ref rather than .ref",
            BiopythonDeprecationWarning,
        )
        try:
            return self.location.ref
        except AttributeError:
            return None

    def _set_ref(self, value):
        """Set function for the reference property (PRIVATE)."""
        warnings.warn(
            "Please use .location.ref rather than .ref",
            BiopythonDeprecationWarning,
        )
        try:
            self.location.ref = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref without a location.") from None
            else:
                raise

    ref = property(
        fget=_get_ref,
        fset=_set_ref,
        doc="Alias for the location's ref (DEPRECATED).",
    )

    def _get_ref_db(self):
        """Get function for the database reference property (PRIVATE)."""
        warnings.warn(
            "Please use .location.ref_db rather than .ref_db",
            BiopythonDeprecationWarning,
        )
        try:
            return self.location.ref_db
        except AttributeError:
            return None

    def _set_ref_db(self, value):
        """Set function for the database reference property (PRIVATE).

        Mirrors the strand and ref setters: with no location, setting None
        is a no-op and setting any other value raises a ValueError.
        """
        warnings.warn(
            "Please use .location.ref_db rather than .ref_db",
            BiopythonDeprecationWarning,
        )
        try:
            self.location.ref_db = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref_db without a location.") from None
            else:
                raise

    ref_db = property(
        fget=_get_ref_db,
        fset=_set_ref_db,
        doc="Alias for the location's ref_db (DEPRECATED).",
    )

    def __eq__(self, other):
        """Check if two SeqFeature objects should be considered equal.

        The id, type, location and qualifiers must all match; comparison
        with anything that is not a SeqFeature gives False.
        """
        return (
            isinstance(other, SeqFeature)
            and self.id == other.id
            and self.type == other.type
            and self.location == other.location
            and self.qualifiers == other.qualifiers
        )

    def __repr__(self):
        """Represent the feature as a string for debugging.

        Empty type, default id and empty qualifiers are omitted; the
        qualifier values themselves are never shown.
        """
        answer = f"{self.__class__.__name__}({self.location!r}"
        if self.type:
            answer += f", type={self.type!r}"
        if self.id and self.id != "<unknown id>":
            answer += f", id={self.id!r}"
        if self.qualifiers:
            answer += ", qualifiers=..."
        answer += ")"
        return answer

    def __str__(self):
        """Return the full feature as a python string."""
        lines = [f"type: {self.type}\n", f"location: {self.location}\n"]
        if self.id and self.id != "<unknown id>":
            lines.append(f"id: {self.id}\n")
        lines.append("qualifiers:\n")
        # Sorted so the output does not depend on insertion order.
        lines.extend(
            f"    Key: {qual_key}, Value: {self.qualifiers[qual_key]}\n"
            for qual_key in sorted(self.qualifiers)
        )
        return "".join(lines)

    def _shift(self, offset):
        """Return a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied.
        """
        return SeqFeature(
            location=self.location._shift(offset),
            type=self.type,
            id=self.id,
            qualifiers=self.qualifiers.copy(),
        )

    def _flip(self, length):
        """Return a copy of the feature with its location flipped (PRIVATE).

        The argument length gives the length of the parent sequence. For
        example a location 0..20 (+1 strand) with parent length 30 becomes
        after flipping 10..30 (-1 strand). Strandless (None) or unknown
        strand (0) remain like that - just their end points are changed.

        The annotation qualifiers are copied.
        """
        return SeqFeature(
            location=self.location._flip(length),
            type=self.type,
            id=self.id,
            qualifiers=self.qualifiers.copy(),
        )

    def extract(self, parent_sequence, references=None):
        """Extract the feature's sequence from supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted. If the
        location refers to other records, they must be supplied in the
        optional dictionary references.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC')

        If the SimpleLocation is None, e.g. when parsing invalid locus
        locations in the GenBank parser, extract() will raise a ValueError.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(None, type="domain")
        >>> f.extract(seq)
        Traceback (most recent call last):
           ...
        ValueError: The feature's .location is None. Check the sequence file for a valid location.

        Note - currently only compound features of type "join" are supported.
        """
        if self.location is None:
            raise ValueError(
                "The feature's .location is None. Check the "
                "sequence file for a valid location."
            )
        return self.location.extract(parent_sequence, references=references)

    def translate(
        self,
        parent_sequence,
        table="Standard",
        start_offset=None,
        stop_symbol="*",
        to_stop=False,
        cds=None,
        gap=None,
    ):
        """Get a translation of the feature's sequence.

        This method is intended for CDS or other features that code proteins
        and is a shortcut that will both extract the feature and
        translate it, taking into account the codon_start and transl_table
        qualifiers, if they are present. If they are not present the
        value of the arguments "table" and "start_offset" are used.

        The "cds" parameter is set to "True" if the feature is of type
        "CDS" but can be overridden by giving an explicit argument.

        The arguments stop_symbol, to_stop and gap have the same meaning
        as Seq.translate, refer to that documentation for further information.

        Arguments:
         - parent_sequence - A DNA or RNA sequence.
         - table - Which codon table to use if there is no transl_table
           qualifier for this feature. This can be either a name
           (string), an NCBI identifier (integer), or a CodonTable
           object (useful for non-standard genetic codes). This
           defaults to the "Standard" table.
         - start_offset - offset at which the first complete codon of a
           coding feature can be found, relative to the first base of
           that feature. Has a valid value of 0, 1 or 2. NOTE: this
           uses python's 0-based numbering whereas the codon_start
           qualifier in files from NCBI use 1-based numbering.
           Will override a codon_start qualifier

        A ValueError is raised if the resulting start offset is not 0, 1
        or 2.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("GGTTACACTTACCGATAATGTCTCTGATGA")
        >>> f = SeqFeature(SimpleLocation(0, 30), type="CDS")
        >>> f.qualifiers['transl_table'] = [11]

        Note that features of type CDS are subject to the usual
        checks at translation. But you can override this behavior
        by giving explicit arguments:

        >>> f.translate(seq, cds=False)
        Seq('GYTYR*CL**')

        Now use the start_offset argument to change the frame. Note
        this uses python 0-based numbering.

        >>> f.translate(seq, start_offset=1, cds=False)
        Seq('VTLTDNVSD')

        Alternatively use the codon_start qualifier to do the same
        thing. Note: this uses 1-based numbering, which is found
        in files from NCBI.

        >>> f.qualifiers['codon_start'] = [2]
        >>> f.translate(seq, cds=False)
        Seq('VTLTDNVSD')
        """
        # see if this feature should be translated in a different
        # frame using the "codon_start" qualifier
        if start_offset is None:
            try:
                # codon_start is 1-based, start_offset is 0-based
                start_offset = int(self.qualifiers["codon_start"][0]) - 1
            except KeyError:
                start_offset = 0

        if start_offset not in [0, 1, 2]:
            raise ValueError(
                "The start_offset must be 0, 1, or 2. "
                f"The supplied value is {start_offset}. "
                "Check the value of either the codon_start qualifier "
                "or the start_offset argument"
            )

        feat_seq = self.extract(parent_sequence)[start_offset:]
        # A transl_table qualifier takes precedence over the table argument.
        codon_table = self.qualifiers.get("transl_table", [table])[0]

        if cds is None:
            cds = self.type == "CDS"

        return feat_seq.translate(
            table=codon_table,
            stop_symbol=stop_symbol,
            to_stop=to_stop,
            cds=cds,
            gap=gap,
        )

    def __bool__(self):
        """Boolean value of an instance of this class (True).

        This behavior is for backwards compatibility, since until the
        __len__ method was added, a SeqFeature always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SeqFeature may in future evaluate to False when its
        length is zero (in order to better match normal python behavior)!
        """
        return True

    def __len__(self):
        """Return the length of the region where the feature is located.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
        >>> len(f)
        7
        >>> f.extract(seq)
        Seq('VALIVIC')
        >>> len(f.extract(seq))
        7

        This is a proxy for taking the length of the feature's location:

        >>> len(f.location)
        7

        For simple features this is the same as the region spanned (end
        position minus start position using Pythonic counting). However, for
        a compound location (e.g. a CDS as the join of several exons) the
        gaps are not counted (e.g. introns). This ensures that len(f) matches
        len(f.extract(parent_seq)), and also makes sure things work properly
        with features wrapping the origin etc.
        """
        return len(self.location)

    def __iter__(self):
        """Iterate over the parent positions within the feature.

        The iteration order is strand aware, and can be thought of as moving
        along the feature using the parent sequence coordinates:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
        >>> len(f)
        5
        >>> for i in f: print(i)
        9
        8
        7
        6
        5
        >>> list(f)
        [9, 8, 7, 6, 5]

        This is a proxy for iterating over the location,

        >>> list(f.location)
        [9, 8, 7, 6, 5]
        """
        return iter(self.location)

    def __contains__(self, value):
        """Check if an integer position is within the feature.

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(15) if i in f]
        [5, 6, 7, 8, 9]

        For example, to see which features include a SNP position, you could
        use this:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
        >>> for f in record.features:
        ...     if 1750 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)
        tRNA join{[4310:4347](-), [1716:1751](-)}

        Note that for a feature defined as a join of several subfeatures (e.g.
        the union of several exons) the gaps are not checked (e.g. introns).
        In this example, the tRNA location is defined in the GenBank file as
        complement(join(1717..1751,4311..4347)), so that position 1760 falls
        in the gap:

        >>> for f in record.features:
        ...     if 1760 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)

        Note that additional care may be required with fuzzy locations, for
        example just before a BeforePosition:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> from Bio.SeqFeature import BeforePosition
        >>> f = SeqFeature(SimpleLocation(BeforePosition(3), 8), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(10) if i in f]
        [3, 4, 5, 6, 7]

        Note that this is a proxy for testing membership on the location.

        >>> [i for i in range(10) if i in f.location]
        [3, 4, 5, 6, 7]
        """
        return value in self.location
632 | |
633 | |
634 # --- References | |
635 | |
636 | |
637 # TODO -- Will this hold PubMed and Medline information decently? | |
class Reference:
    """Represent a Generic Reference object.

    Attributes:
     - location - A list of Location objects specifying regions of
       the sequence that the references correspond to. If no locations are
       specified, the entire sequence is assumed.
     - authors - A big old string, or a list split by author, of authors
       for the reference.
     - consrtm - A string, empty by default and only shown by str() when
       non-empty (presumably the GenBank CONSRTM consortium line - confirm).
     - title - The title of the reference.
     - journal - Journal the reference was published in.
     - medline_id - A medline reference for the article.
     - pubmed_id - A pubmed reference for the article.
     - comment - A place to stick any comments about the reference.

    """

    def __init__(self):
        """Initialize the class with an empty location list and empty strings."""
        self.location = []
        self.authors = ""
        self.consrtm = ""
        self.title = ""
        self.journal = ""
        self.medline_id = ""
        self.pubmed_id = ""
        self.comment = ""

    def __str__(self):
        """Return the full Reference object as a python string.

        One "name: value" line is produced per location and per attribute;
        the consrtm line is left out when that attribute is empty.
        """
        lines = [f"location: {single_location}\n" for single_location in self.location]
        lines.append(f"authors: {self.authors}\n")
        if self.consrtm:
            lines.append(f"consrtm: {self.consrtm}\n")
        lines.append(f"title: {self.title}\n")
        lines.append(f"journal: {self.journal}\n")
        lines.append(f"medline id: {self.medline_id}\n")
        lines.append(f"pubmed id: {self.pubmed_id}\n")
        lines.append(f"comment: {self.comment}\n")
        return "".join(lines)

    def __repr__(self):
        """Represent the Reference object as a string for debugging."""
        # TODO - Update this if __init__ later accepts values
        return f"{self.__class__.__name__}(title={self.title!r}, ...)"

    def __eq__(self, other):
        """Check if two Reference objects should be considered equal.

        All attributes, including the location list, must match. Comparing
        with an object that is not a Reference returns NotImplemented, so
        that ``ref == something_else`` evaluates to False rather than
        raising an AttributeError.

        Note prior to Biopython 1.70 the location was not compared, as
        until then __eq__ for the SimpleLocation class was not defined.
        """
        if not isinstance(other, Reference):
            return NotImplemented
        return (
            self.authors == other.authors
            and self.consrtm == other.consrtm
            and self.title == other.title
            and self.journal == other.journal
            and self.medline_id == other.medline_id
            and self.pubmed_id == other.pubmed_id
            and self.comment == other.comment
            and self.location == other.location
        )
702 | |
703 | |
704 # --- Handling feature locations | |
705 | |
706 | |
class Location(ABC):
    """Abstract base class representing a location."""

    @abstractmethod
    def __repr__(self):
        """Represent the Location object as a string for debugging."""
        return f"{self.__class__.__name__}(...)"

    @staticmethod
    def fromstring(text, length=None, circular=False, stranded=True):
        """Create a Location object from a string.

        This should accept any valid location string in the INSDC Feature Table
        format (https://www.insdc.org/submitting-standards/feature-table/) as
        used in GenBank, DDBJ and EMBL files.

        Arguments:
         - text - the location string; an outer ``complement(...)`` wrapper
           selects strand -1.
         - length - expected length of the parent sequence, passed on to
           SimpleLocation.fromstring.
         - circular - passed on to SimpleLocation.fromstring.
         - stranded - if true (default) a location without ``complement``
           gets strand +1, otherwise strand None.

        Returns the parsed location, or a CompoundLocation for join, order
        and bond expressions with more than one part.

        Raises ValueError if the closing bracket of ``complement(`` is
        missing, and LocationParserError (a ValueError subclass) if the
        location cannot be parsed.

        Simple examples:

        >>> Location.fromstring("123..456", 1000)
        SimpleLocation(ExactPosition(122), ExactPosition(456), strand=1)
        >>> Location.fromstring("complement(<123..>456)", 1000)
        SimpleLocation(BeforePosition(122), AfterPosition(456), strand=-1)

        A more complex location using within positions,

        >>> Location.fromstring("(9.10)..(20.25)", 1000)
        SimpleLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)

        Notice how that will act as though it has overall start 8 and end 25.

        Zero length between feature,

        >>> Location.fromstring("123^124", 1000)
        SimpleLocation(ExactPosition(123), ExactPosition(123), strand=1)

        The expected sequence length is needed for a special case, a between
        position at the start/end of a circular genome:

        >>> Location.fromstring("1000^1", 1000)
        SimpleLocation(ExactPosition(1000), ExactPosition(1000), strand=1)

        Apart from this special case, between positions P^Q must have P+1==Q,

        >>> Location.fromstring("123^456", 1000)
        Traceback (most recent call last):
           ...
        Bio.SeqFeature.LocationParserError: invalid feature location '123^456'

        You can optionally provide a reference name:

        >>> Location.fromstring("AL391218.9:105173..108462", 2000000)
        SimpleLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')

        >>> Location.fromstring("<2644..159", 2868, "circular")
        CompoundLocation([SimpleLocation(BeforePosition(2643), ExactPosition(2868), strand=1), SimpleLocation(ExactPosition(0), ExactPosition(159), strand=1)], 'join')
        """
        # Kept so that the trailing comma recovery at the end can re-parse
        # the complete string, including any complement(...) wrapper.
        full_text = text
        if text.startswith("complement("):
            if text[-1] != ")":
                raise ValueError(f"closing bracket missing in '{text}'")
            text = text[11:-1]
            strand = -1
        elif stranded:
            strand = 1
        else:
            strand = None

        # Determine if we have a simple location or a compound location
        if text.startswith("join("):
            operator = "join"
            parts = _split(text[5:-1])[1::2]
        elif text.startswith("order("):
            operator = "order"
            parts = _split(text[6:-1])[1::2]
        elif text.startswith("bond("):
            operator = "bond"
            parts = _split(text[5:-1])[1::2]
        else:
            loc = SimpleLocation.fromstring(text, length, circular)
            loc.strand = strand
            if strand == -1:
                loc.parts.reverse()
            return loc
        locs = []
        for part in parts:
            loc = SimpleLocation.fromstring(part, length, circular)
            if loc is None:
                # Unparsable part: leave the loop without running its
                # "else" clause and fall through to the error handling.
                break
            if loc.strand == -1:
                if strand == -1:
                    raise LocationParserError(f"double complement in '{text}'?")
            else:
                loc.strand = strand
            locs.extend(loc.parts)
        else:
            if len(locs) == 1:
                return loc
            # Historically a join on the reverse strand has been represented
            # in Biopython with both the parent SeqFeature and its children
            # (the exons for a CDS) all given a strand of -1.  Likewise, for
            # a join feature on the forward strand they all have strand +1.
            # However, we must also consider evil mixed strand examples like
            # this, join(complement(69611..69724),139856..140087,140625..140650)
            if strand == -1:
                # Whole thing was wrapped in complement(...)
                for loc in locs:
                    assert loc.strand == -1
                # Reverse the backwards order used in GenBank files
                # with complement(join(...))
                locs = locs[::-1]
            return CompoundLocation(locs, operator=operator)
        # Not recognized
        if "order" in text and "join" in text:
            # See Bug 3197
            raise LocationParserError(
                f"failed to parse feature location '{text}' containing a combination of 'join' and 'order' (nested operators) are illegal"
            )

        # See issue #937. Note that NCBI has already fixed this record.
        if ",)" in text:
            warnings.warn(
                "Dropping trailing comma in malformed feature location",
                BiopythonParserWarning,
            )
            # Retry with the caller's original options and the complete
            # string, so that neither the strand nor the length/circular
            # settings are lost on the second attempt.
            return Location.fromstring(
                full_text.replace(",)", ")"), length, circular, stranded
            )

        raise LocationParserError(f"failed to parse feature location '{text}'")
836 | |
837 | |
838 class SimpleLocation(Location): | |
839 """Specify the location of a feature along a sequence. | |
840 | |
841 The SimpleLocation is used for simple continuous features, which can | |
842 be described as running from a start position to and end position | |
843 (optionally with a strand and reference information). More complex | |
844 locations made up from several non-continuous parts (e.g. a coding | |
845 sequence made up of several exons) are described using a SeqFeature | |
846 with a CompoundLocation. | |
847 | |
848 Note that the start and end location numbering follow Python's scheme, | |
849 thus a GenBank entry of 123..150 (one based counting) becomes a location | |
850 of [122:150] (zero based counting). | |
851 | |
852 >>> from Bio.SeqFeature import SimpleLocation | |
853 >>> f = SimpleLocation(122, 150) | |
854 >>> print(f) | |
855 [122:150] | |
856 >>> print(f.start) | |
857 122 | |
858 >>> print(f.end) | |
859 150 | |
860 >>> print(f.strand) | |
861 None | |
862 | |
863 Note the strand defaults to None. If you are working with nucleotide | |
864 sequences you'd want to be explicit if it is the forward strand: | |
865 | |
866 >>> from Bio.SeqFeature import SimpleLocation | |
867 >>> f = SimpleLocation(122, 150, strand=+1) | |
868 >>> print(f) | |
869 [122:150](+) | |
870 >>> print(f.strand) | |
871 1 | |
872 | |
873 Note that for a parent sequence of length n, the SimpleLocation | |
874 start and end must satisfy the inequality 0 <= start <= end <= n. | |
875 This means even for features on the reverse strand of a nucleotide | |
876 sequence, we expect the 'start' coordinate to be less than the | |
877 'end'. | |
878 | |
879 >>> from Bio.SeqFeature import SimpleLocation | |
880 >>> r = SimpleLocation(122, 150, strand=-1) | |
881 >>> print(r) | |
882 [122:150](-) | |
883 >>> print(r.start) | |
884 122 | |
885 >>> print(r.end) | |
886 150 | |
887 >>> print(r.strand) | |
888 -1 | |
889 | |
890 i.e. Rather than thinking of the 'start' and 'end' biologically in a | |
891 strand aware manner, think of them as the 'left most' or 'minimum' | |
892 boundary, and the 'right most' or 'maximum' boundary of the region | |
893 being described. This is particularly important with compound | |
894 locations describing non-continuous regions. | |
895 | |
896 In the example above we have used standard exact positions, but there | |
897 are also specialised position objects used to represent fuzzy positions | |
898 as well, for example a GenBank location like complement(<123..150) | |
899 would use a BeforePosition object for the start. | |
900 """ | |
901 | |
def __init__(self, start, end, strand=None, ref=None, ref_db=None):
    """Initialize the class.

    ``start`` and ``end`` give the boundaries of the feature. Each may be
    any of the ``*Position`` objects (instances of Position subclasses),
    or simply an integer. Integers are treated as exact coordinates and
    are wrapped in ExactPosition objects, which keeps the common
    non-fuzzy case concise.

    i.e. Short form:

    >>> from Bio.SeqFeature import SimpleLocation
    >>> loc = SimpleLocation(5, 10, strand=-1)
    >>> print(loc)
    [5:10](-)

    Explicit form:

    >>> from Bio.SeqFeature import SimpleLocation, ExactPosition
    >>> loc = SimpleLocation(ExactPosition(5), ExactPosition(10), strand=-1)
    >>> print(loc)
    [5:10](-)

    Other fuzzy positions are used similarly,

    >>> from Bio.SeqFeature import SimpleLocation
    >>> from Bio.SeqFeature import BeforePosition, AfterPosition
    >>> loc2 = SimpleLocation(BeforePosition(5), AfterPosition(10), strand=-1)
    >>> print(loc2)
    [<5:>10](-)

    For nucleotide features you will also want to specify the strand,
    use 1 for the forward (plus) strand, -1 for the reverse (negative)
    strand, 0 for stranded but strand unknown (? in GFF3), or None for
    when the strand does not apply (dot in GFF3), e.g. features on
    proteins.

    >>> loc = SimpleLocation(5, 10, strand=+1)
    >>> print(loc)
    [5:10](+)
    >>> print(loc.strand)
    1

    Normally feature locations are given relative to the parent
    sequence you are working with, but an explicit accession can
    be given with the optional ref and ref_db strings:

    >>> loc = SimpleLocation(105172, 108462, ref="AL391218.9", strand=1)
    >>> print(loc)
    AL391218.9[105172:108462](+)
    >>> print(loc.ref)
    AL391218.9

    Raises TypeError if start or end is neither a Position nor an int,
    and ValueError if both are integers with start greater than end
    (the strand setter also raises ValueError for an invalid strand).
    """

    def _as_position(label, value):
        # Position objects pass through untouched; bare ints become exact.
        if isinstance(value, Position):
            return value
        if isinstance(value, int):
            return ExactPosition(value)
        raise TypeError(f"{label}={value!r} {type(value)}")

    # TODO - Check 0 <= start <= end (<= length of reference)
    self._start = _as_position("start", start)
    self._end = _as_position("end", end)

    # Only coordinates that behave as integers can be ordered here.
    both_integers = isinstance(self.start, int) and isinstance(self.end, int)
    if both_integers and self.start > self.end:
        raise ValueError(
            f"End location ({self.end}) must be greater than "
            f"or equal to start location ({self.start})"
        )

    self.strand = strand
    self.ref = ref
    self.ref_db = ref_db
982 | |
@staticmethod
def fromstring(text, length=None, circular=False):
    """Create a SimpleLocation object from a string.

    Arguments:
     - text - the location text, e.g. "123..150", "complement(123..150)",
       "67^68", "bond(196)" or a single position, optionally prefixed
       with "reference:".
     - length - length of the parent sequence. Used as the end of the
       first part when a location wraps the origin, and to accept the
       origin junction "N^1" of a between location.
     - circular - whether the parent sequence is circular. A start at or
       after the end is only accepted when this is true.

    Returns a SimpleLocation. When the location wraps the origin of a
    circular sequence the two halves are added together, which gives a
    CompoundLocation instead.

    Raises LocationParserError if the text is not recognised, if it
    appears to span the origin but circular is false, if a between
    location is invalid, or if the start position is negative.
    """
    # A leading "complement(" (11 characters) and the final character are
    # stripped, and the location is placed on the reverse strand.
    if text.startswith("complement("):
        text = text[11:-1]
        strand = -1
    else:
        strand = None
    # Try simple cases first for speed
    try:
        s, e = text.split("..")
        # One based inclusive start becomes a zero based python start.
        s = int(s) - 1
        e = int(e)
    except ValueError:
        # Not exactly "int..int"; fall through to the general parser.
        pass
    else:
        if 0 <= s < e:
            return SimpleLocation(s, e, strand)
    # Try general case
    try:
        # Split off an optional "reference:" prefix. Zero or several
        # colons make the unpacking fail and leave ref as None.
        ref, text = text.split(":")
    except ValueError:
        ref = None
    m = _re_location_category.match(text)
    if m is None:
        raise LocationParserError(f"Could not parse feature location '{text}'")
    # Pick the first named group that matched; its name is the category.
    for key, value in m.groupdict().items():
        if value is not None:
            break
    # Sanity check that the matched group covers the whole text.
    assert value == text
    if key == "bond":
        # e.g. bond(196)
        warnings.warn(
            "Dropping bond qualifier in feature location",
            BiopythonParserWarning,
        )
        # Strip the leading "bond(" and the closing bracket.
        text = text[5:-1]
        # NOTE(review): the -1 passed for the start is presumably the
        # one based to zero based offset - confirm in Position.fromstring.
        s_pos = Position.fromstring(text, -1)
        e_pos = Position.fromstring(text)
    elif key == "solo":
        # e.g. "123"
        s_pos = Position.fromstring(text, -1)
        e_pos = Position.fromstring(text)
    elif key in ("pair", "within", "oneof"):
        s, e = text.split("..")
        # Attempt to fix features that span the origin
        s_pos = Position.fromstring(s, -1)
        e_pos = Position.fromstring(e)
        if s_pos >= e_pos:
            # There is likely a problem with origin wrapping.
            # Create a CompoundLocation of the wrapped feature,
            # consisting of two SimpleLocation objects to extend to
            # the list of feature locations.
            if not circular:
                raise LocationParserError(
                    f"it appears that '{text}' is a feature that spans the origin, but the sequence topology is undefined"
                )
            warnings.warn(
                "Attempting to fix invalid location %r as "
                "it looks like incorrect origin wrapping. "
                "Please fix input file, this could have "
                "unintended behavior." % text,
                BiopythonParserWarning,
            )

            # NOTE(review): if length is None here, SimpleLocation's
            # __init__ raises TypeError for the end argument. Any ref
            # parsed above is also not passed to these two parts.
            f1 = SimpleLocation(s_pos, length, strand)
            f2 = SimpleLocation(0, e_pos, strand)

            if strand == -1:
                # For complementary features spanning the origin
                return f2 + f1
            else:
                return f1 + f2
    elif key == "between":
        # A between location like "67^68" (one based counting) is a
        # special case (note it has zero length). In python slice
        # notation this is 67:67, a zero length slice. See Bug 2622
        # Further more, on a circular genome of length N you can have
        # a location N^1 meaning the junction at the origin. See Bug 3098.
        # NOTE - We can imagine between locations like "2^4", but this
        # is just "3". Similarly, "2^5" is just "3..4"
        s, e = text.split("^")
        s = int(s)
        e = int(e)
        if s + 1 == e or (s == length and e == 1):
            s_pos = ExactPosition(s)
            e_pos = s_pos
        else:
            raise LocationParserError(f"invalid feature location '{text}'")
    if s_pos < 0:
        raise LocationParserError(
            f"negative starting position in feature location '{text}'"
        )
    return SimpleLocation(s_pos, e_pos, strand, ref=ref)
1077 | |
def _get_strand(self):
    """Return the stored strand (PRIVATE getter of the strand property)."""
    return self._strand


def _set_strand(self, value):
    """Validate and store the strand (PRIVATE setter of the strand property).

    Raises ValueError unless value is +1, -1, 0 or None.
    """
    allowed = (+1, -1, 0, None)
    if value in allowed:
        self._strand = value
        return
    raise ValueError(f"Strand should be +1, -1, 0 or None, not {value!r}")


strand = property(
    _get_strand,
    _set_strand,
    doc="Strand of the location (+1, -1, 0 or None).",
)
1093 | |
1094 def __str__(self): | |
1095 """Return a representation of the SimpleLocation object (with python counting). | |
1096 | |
1097 For the simple case this uses the python splicing syntax, [122:150] | |
1098 (zero based counting) which GenBank would call 123..150 (one based | |
1099 counting). | |
1100 """ | |
1101 answer = f"[{self._start}:{self._end}]" | |
1102 if self.ref and self.ref_db: | |
1103 answer = f"{self.ref_db}:{self.ref}{answer}" | |
1104 elif self.ref: | |
1105 answer = self.ref + answer | |
1106 # Is ref_db without ref meaningful? | |
1107 if self.strand is None: | |
1108 return answer | |
1109 elif self.strand == +1: | |
1110 return answer + "(+)" | |
1111 elif self.strand == -1: | |
1112 return answer + "(-)" | |
1113 else: | |
1114 # strand = 0, stranded but strand unknown, ? in GFF3 | |
1115 return answer + "(?)" | |
1116 | |
1117 def __repr__(self): | |
1118 """Represent the SimpleLocation object as a string for debugging.""" | |
1119 optional = "" | |
1120 if self.strand is not None: | |
1121 optional += f", strand={self.strand!r}" | |
1122 if self.ref is not None: | |
1123 optional += f", ref={self.ref!r}" | |
1124 if self.ref_db is not None: | |
1125 optional += f", ref_db={self.ref_db!r}" | |
1126 return f"{self.__class__.__name__}({self.start!r}, {self.end!r}{optional})" | |
1127 | |
1128 def __add__(self, other): | |
1129 """Combine location with another SimpleLocation object, or shift it. | |
1130 | |
1131 You can add two feature locations to make a join CompoundLocation: | |
1132 | |
1133 >>> from Bio.SeqFeature import SimpleLocation | |
1134 >>> f1 = SimpleLocation(5, 10) | |
1135 >>> f2 = SimpleLocation(20, 30) | |
1136 >>> combined = f1 + f2 | |
1137 >>> print(combined) | |
1138 join{[5:10], [20:30]} | |
1139 | |
1140 This is thus equivalent to: | |
1141 | |
1142 >>> from Bio.SeqFeature import CompoundLocation | |
1143 >>> join = CompoundLocation([f1, f2]) | |
1144 >>> print(join) | |
1145 join{[5:10], [20:30]} | |
1146 | |
1147 You can also use sum(...) in this way: | |
1148 | |
1149 >>> join = sum([f1, f2]) | |
1150 >>> print(join) | |
1151 join{[5:10], [20:30]} | |
1152 | |
1153 Furthermore, you can combine a SimpleLocation with a CompoundLocation | |
1154 in this way. | |
1155 | |
1156 Separately, adding an integer will give a new SimpleLocation with | |
1157 its start and end offset by that amount. For example: | |
1158 | |
1159 >>> print(f1) | |
1160 [5:10] | |
1161 >>> print(f1 + 100) | |
1162 [105:110] | |
1163 >>> print(200 + f1) | |
1164 [205:210] | |
1165 | |
1166 This can be useful when editing annotation. | |
1167 """ | |
1168 if isinstance(other, SimpleLocation): | |
1169 return CompoundLocation([self, other]) | |
1170 elif isinstance(other, int): | |
1171 return self._shift(other) | |
1172 else: | |
1173 # This will allow CompoundLocation's __radd__ to be called: | |
1174 return NotImplemented | |
1175 | |
1176 def __radd__(self, other): | |
1177 """Return a SimpleLocation object by shifting the location by an integer amount.""" | |
1178 if isinstance(other, int): | |
1179 return self._shift(other) | |
1180 else: | |
1181 return NotImplemented | |
1182 | |
1183 def __sub__(self, other): | |
1184 """Subtracting an integer will shift the start and end by that amount. | |
1185 | |
1186 >>> from Bio.SeqFeature import SimpleLocation | |
1187 >>> f1 = SimpleLocation(105, 150) | |
1188 >>> print(f1) | |
1189 [105:150] | |
1190 >>> print(f1 - 100) | |
1191 [5:50] | |
1192 | |
1193 This can be useful when editing annotation. You can also add an integer | |
1194 to a feature location (which shifts in the opposite direction). | |
1195 """ | |
1196 if isinstance(other, int): | |
1197 return self._shift(-other) | |
1198 else: | |
1199 return NotImplemented | |
1200 | |
1201 def __nonzero__(self): | |
1202 """Return True regardless of the length of the feature. | |
1203 | |
1204 This behavior is for backwards compatibility, since until the | |
1205 __len__ method was added, a SimpleLocation always evaluated as True. | |
1206 | |
1207 Note that in comparison, Seq objects, strings, lists, etc, will all | |
1208 evaluate to False if they have length zero. | |
1209 | |
1210 WARNING: The SimpleLocation may in future evaluate to False when its | |
1211 length is zero (in order to better match normal python behavior)! | |
1212 """ | |
1213 return True | |
1214 | |
1215 def __len__(self): | |
1216 """Return the length of the region described by the SimpleLocation object. | |
1217 | |
1218 Note that extra care may be needed for fuzzy locations, e.g. | |
1219 | |
1220 >>> from Bio.SeqFeature import SimpleLocation | |
1221 >>> from Bio.SeqFeature import BeforePosition, AfterPosition | |
1222 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10)) | |
1223 >>> len(loc) | |
1224 5 | |
1225 """ | |
1226 return int(self._end) - int(self._start) | |
1227 | |
1228 def __contains__(self, value): | |
1229 """Check if an integer position is within the SimpleLocation object. | |
1230 | |
1231 Note that extra care may be needed for fuzzy locations, e.g. | |
1232 | |
1233 >>> from Bio.SeqFeature import SimpleLocation | |
1234 >>> from Bio.SeqFeature import BeforePosition, AfterPosition | |
1235 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10)) | |
1236 >>> len(loc) | |
1237 5 | |
1238 >>> [i for i in range(15) if i in loc] | |
1239 [5, 6, 7, 8, 9] | |
1240 """ | |
1241 if not isinstance(value, int): | |
1242 raise ValueError( | |
1243 "Currently we only support checking for integer " | |
1244 "positions being within a SimpleLocation." | |
1245 ) | |
1246 if value < self._start or value >= self._end: | |
1247 return False | |
1248 else: | |
1249 return True | |
1250 | |
1251 def __iter__(self): | |
1252 """Iterate over the parent positions within the SimpleLocation object. | |
1253 | |
1254 >>> from Bio.SeqFeature import SimpleLocation | |
1255 >>> from Bio.SeqFeature import BeforePosition, AfterPosition | |
1256 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10)) | |
1257 >>> len(loc) | |
1258 5 | |
1259 >>> for i in loc: print(i) | |
1260 5 | |
1261 6 | |
1262 7 | |
1263 8 | |
1264 9 | |
1265 >>> list(loc) | |
1266 [5, 6, 7, 8, 9] | |
1267 >>> [i for i in range(15) if i in loc] | |
1268 [5, 6, 7, 8, 9] | |
1269 | |
1270 Note this is strand aware: | |
1271 | |
1272 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10), strand = -1) | |
1273 >>> list(loc) | |
1274 [9, 8, 7, 6, 5] | |
1275 """ | |
1276 if self.strand == -1: | |
1277 yield from range(self._end - 1, self._start - 1, -1) | |
1278 else: | |
1279 yield from range(self._start, self._end) | |
1280 | |
1281 def __eq__(self, other): | |
1282 """Implement equality by comparing all the location attributes.""" | |
1283 if not isinstance(other, SimpleLocation): | |
1284 return False | |
1285 return ( | |
1286 self._start == other.start | |
1287 and self._end == other.end | |
1288 and self._strand == other.strand | |
1289 and self.ref == other.ref | |
1290 and self.ref_db == other.ref_db | |
1291 ) | |
1292 | |
1293 def _shift(self, offset): | |
1294 """Return a copy of the SimpleLocation shifted by an offset (PRIVATE). | |
1295 | |
1296 Returns self when location is relative to an external reference. | |
1297 """ | |
1298 # TODO - What if offset is a fuzzy position? | |
1299 if self.ref or self.ref_db: | |
1300 return self | |
1301 return SimpleLocation( | |
1302 start=self._start + offset, | |
1303 end=self._end + offset, | |
1304 strand=self.strand, | |
1305 ) | |
1306 | |
1307 def _flip(self, length): | |
1308 """Return a copy of the location after the parent is reversed (PRIVATE). | |
1309 | |
1310 Returns self when location is relative to an external reference. | |
1311 """ | |
1312 if self.ref or self.ref_db: | |
1313 return self | |
1314 # Note this will flip the start and end too! | |
1315 if self.strand == +1: | |
1316 flip_strand = -1 | |
1317 elif self.strand == -1: | |
1318 flip_strand = +1 | |
1319 else: | |
1320 # 0 or None | |
1321 flip_strand = self.strand | |
1322 return SimpleLocation( | |
1323 start=self._end._flip(length), | |
1324 end=self._start._flip(length), | |
1325 strand=flip_strand, | |
1326 ) | |
1327 | |
1328 @property | |
1329 def parts(self): | |
1330 """Read only list of sections (always one, the SimpleLocation object). | |
1331 | |
1332 This is a convenience property allowing you to write code handling | |
1333 both SimpleLocation objects (with one part) and more complex | |
1334 CompoundLocation objects (with multiple parts) interchangeably. | |
1335 """ | |
1336 return [self] | |
1337 | |
1338 @property | |
1339 def start(self): | |
1340 """Start location - left most (minimum) value, regardless of strand. | |
1341 | |
1342 Read only, returns an integer like position object, possibly a fuzzy | |
1343 position. | |
1344 """ | |
1345 return self._start | |
1346 | |
1347 @property | |
1348 def end(self): | |
1349 """End location - right most (maximum) value, regardless of strand. | |
1350 | |
1351 Read only, returns an integer like position object, possibly a fuzzy | |
1352 position. | |
1353 """ | |
1354 return self._end | |
1355 | |
def extract(self, parent_sequence, references=None):
    """Extract the sequence from supplied parent sequence using the SimpleLocation object.

    The parent_sequence can be a Seq like object or a string, and will
    generally return an object of the same type. The exception to this is
    a MutableSeq as the parent sequence will return a Seq object.
    If the location refers to other records, they must be supplied
    in the optional dictionary references, keyed by the reference name.
    For a location on the reverse strand the reverse complement of the
    slice is returned.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqFeature import SimpleLocation
    >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
    >>> feature_loc = SimpleLocation(8, 15)
    >>> feature_loc.extract(seq)
    Seq('VALIVIC')

    Raises ValueError if the location refers to another sequence and
    references is empty or does not contain it.
    """
    source = parent_sequence
    if self.ref or self.ref_db:
        if not references:
            raise ValueError(
                f"Feature references another sequence ({self.ref}),"
                " references mandatory"
            )
        if self.ref not in references:
            # KeyError?
            raise ValueError(
                f"Feature references another sequence ({self.ref}),"
                " not found in references"
            )
        # The slice is taken from the referenced record instead.
        source = references[self.ref]
    fragment = source[int(self.start) : int(self.end)]
    if isinstance(fragment, MutableSeq):
        fragment = Seq(fragment)
    if self.strand == -1:
        fragment = reverse_complement(fragment)
    return fragment
1392 | |
1393 | |
FeatureLocation = SimpleLocation  # OBSOLETE; for backward compatibility only.
1395 | |
1396 | |
1397 class CompoundLocation(Location): | |
1398 """For handling joins etc where a feature location has several parts.""" | |
1399 | |
def __init__(self, parts, operator="join"):
    """Initialize the class.

    Arguments:
     - parts - an iterable of at least two SimpleLocation objects. It is
       copied into a new list, so any iterable (list, tuple, generator)
       is accepted.
     - operator - the string naming how the parts are combined
       (default "join").

    Raises ValueError if any part is not a SimpleLocation, or if there
    are fewer than two parts.

    >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
    >>> f1 = SimpleLocation(10, 40, strand=+1)
    >>> f2 = SimpleLocation(50, 59, strand=+1)
    >>> f = CompoundLocation([f1, f2])
    >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
    True
    >>> print(f.operator)
    join
    >>> 5 in f
    False
    >>> 15 in f
    True
    >>> f.strand
    1

    Notice that the strand of the compound location is computed
    automatically - in the case of mixed strands on the sub-locations
    the overall strand is set to None.

    >>> f = CompoundLocation([SimpleLocation(3, 6, strand=+1),
    ...                       SimpleLocation(10, 13, strand=-1)])
    >>> print(f.strand)
    None
    >>> len(f)
    6
    >>> list(f)
    [3, 4, 5, 12, 11, 10]

    The example above doing list(f) iterates over the coordinates within the
    feature. This allows you to use max and min on the location, to find the
    range covered:

    >>> min(f)
    3
    >>> max(f)
    12

    More generally, you can use the compound location's start and end which
    give the full span covered, 0 <= start <= end <= full sequence length.

    >>> f.start == min(f)
    True
    >>> f.end == max(f) + 1
    True

    This is consistent with the behavior of the SimpleLocation for a single
    region, where again the 'start' and 'end' do not necessarily give the
    biological start and end, but rather the 'minimal' and 'maximal'
    coordinate boundaries.

    Note that adding locations provides a more intuitive method of
    construction:

    >>> f = SimpleLocation(3, 6, strand=+1) + SimpleLocation(10, 13, strand=-1)
    >>> len(f)
    6
    >>> list(f)
    [3, 4, 5, 12, 11, 10]
    """
    self.operator = operator
    self.parts = list(parts)
    for loc in self.parts:
        if not isinstance(loc, SimpleLocation):
            raise ValueError(
                "CompoundLocation should be given a list of "
                "SimpleLocation objects, not %s" % loc.__class__
            )
    # Measure the copied list rather than the argument: the argument may
    # be a generator or other iterable that has no len() and which has
    # already been consumed by list() above.
    if len(self.parts) < 2:
        raise ValueError(
            f"CompoundLocation should have at least 2 parts, not {self.parts!r}"
        )
1474 | |
1475 def __str__(self): | |
1476 """Return a representation of the CompoundLocation object (with python counting).""" | |
1477 return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts)) | |
1478 | |
1479 def __repr__(self): | |
1480 """Represent the CompoundLocation object as string for debugging.""" | |
1481 return f"{self.__class__.__name__}({self.parts!r}, {self.operator!r})" | |
1482 | |
def _get_strand(self):
    """Return the strand shared by every part, or None (PRIVATE getter)."""
    # Historically a join on the reverse strand has been represented
    # in Biopython with both the parent SeqFeature and its children
    # (the exons for a CDS) all given a strand of -1. Likewise, for
    # a join feature on the forward strand they all have strand +1.
    # However, we must also consider evil mixed strand examples like
    # this, join(complement(69611..69724),139856..140087,140625..140650)
    distinct_strands = {part.strand for part in self.parts}
    if len(distinct_strands) != 1:
        return None  # i.e. mixed strands
    return self.parts[0].strand


def _set_strand(self, value):
    """Apply the given strand to every part (PRIVATE setter)."""
    # Should this be allowed/encouraged?
    for part in self.parts:
        part.strand = value


strand = property(
    fget=_get_strand,
    fset=_set_strand,
    doc="""Overall strand of the compound location.

    If all the parts have the same strand, that is returned. Otherwise
    for mixed strands, this returns None.

    >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
    >>> f1 = SimpleLocation(15, 17, strand=1)
    >>> f2 = SimpleLocation(20, 30, strand=-1)
    >>> f = f1 + f2
    >>> f1.strand
    1
    >>> f2.strand
    -1
    >>> f.strand
    >>> f.strand is None
    True

    If you set the strand of a CompoundLocation, this is applied to
    all the parts - use with caution:

    >>> f.strand = 1
    >>> f1.strand
    1
    >>> f2.strand
    1
    >>> f.strand
    1

    """,
)
1535 | |
def __add__(self, other):
    """Combine locations, or shift the location by an integer offset.

    >>> from Bio.SeqFeature import SimpleLocation
    >>> f1 = SimpleLocation(15, 17) + SimpleLocation(20, 30)
    >>> print(f1)
    join{[15:17], [20:30]}

    You can add another SimpleLocation:

    >>> print(f1 + SimpleLocation(40, 50))
    join{[15:17], [20:30], [40:50]}
    >>> print(SimpleLocation(5, 10) + f1)
    join{[5:10], [15:17], [20:30]}

    You can also add another CompoundLocation:

    >>> f2 = SimpleLocation(40, 50) + SimpleLocation(60, 70)
    >>> print(f2)
    join{[40:50], [60:70]}
    >>> print(f1 + f2)
    join{[15:17], [20:30], [40:50], [60:70]}

    Also, as with the SimpleLocation, adding an integer shifts the
    location's coordinates by that offset:

    >>> print(f1 + 100)
    join{[115:117], [120:130]}
    >>> print(200 + f1)
    join{[215:217], [220:230]}
    >>> print(f1 + (-5))
    join{[10:12], [15:25]}

    Raises ValueError if two CompoundLocation objects with different
    operators are added, and NotImplementedError for any other type.
    """
    if isinstance(other, SimpleLocation):
        extended = self.parts + [other]
        return CompoundLocation(extended, self.operator)
    if isinstance(other, CompoundLocation):
        if self.operator != other.operator:
            # Handle join+order -> order as a special case?
            raise ValueError(
                f"Mixed operators {self.operator} and {other.operator}"
            )
        merged = self.parts + other.parts
        return CompoundLocation(merged, self.operator)
    if isinstance(other, int):
        return self._shift(other)
    # NOTE: this raises rather than returning NotImplemented (which is
    # what SimpleLocation.__add__ does); kept as is since callers may be
    # catching NotImplementedError.
    raise NotImplementedError
1582 | |
def __radd__(self, other):
    """Add a feature to the left, or shift the location by an integer.

    Raises NotImplementedError if other is neither a SimpleLocation nor
    an integer.
    """
    if isinstance(other, SimpleLocation):
        prepended = [other] + self.parts
        return CompoundLocation(prepended, self.operator)
    if isinstance(other, int):
        return self._shift(other)
    # NOTE: raises rather than returning NotImplemented; kept as is
    # since callers may be catching NotImplementedError.
    raise NotImplementedError
1591 | |
1592 def __contains__(self, value): | |
1593 """Check if an integer position is within the CompoundLocation object.""" | |
1594 for loc in self.parts: | |
1595 if value in loc: | |
1596 return True | |
1597 return False | |
1598 | |
1599 def __nonzero__(self): | |
1600 """Return True regardless of the length of the feature. | |
1601 | |
1602 This behavior is for backwards compatibility, since until the | |
1603 __len__ method was added, a SimpleLocation always evaluated as True. | |
1604 | |
1605 Note that in comparison, Seq objects, strings, lists, etc, will all | |
1606 evaluate to False if they have length zero. | |
1607 | |
1608 WARNING: The SimpleLocation may in future evaluate to False when its | |
1609 length is zero (in order to better match normal python behavior)! | |
1610 """ | |
1611 return True | |
1612 | |
1613 def __len__(self): | |
1614 """Return the length of the CompoundLocation object.""" | |
1615 return sum(len(loc) for loc in self.parts) | |
1616 | |
1617 def __iter__(self): | |
1618 """Iterate over the parent positions within the CompoundLocation object.""" | |
1619 for loc in self.parts: | |
1620 yield from loc | |
1621 | |
1622 def __eq__(self, other): | |
1623 """Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation.""" | |
1624 if not isinstance(other, CompoundLocation): | |
1625 return False | |
1626 if len(self.parts) != len(other.parts): | |
1627 return False | |
1628 if self.operator != other.operator: | |
1629 return False | |
1630 for self_part, other_part in zip(self.parts, other.parts): | |
1631 if self_part != other_part: | |
1632 return False | |
1633 return True | |
1634 | |
def _shift(self, offset):
    """Return a copy of the CompoundLocation shifted by an offset (PRIVATE).

    Every part is shifted by the same offset; the operator is kept.
    """
    shifted_parts = []
    for part in self.parts:
        shifted_parts.append(part._shift(offset))
    return CompoundLocation(shifted_parts, self.operator)
1640 | |
def _flip(self, length):
    """Return a copy of the locations after the parent is reversed (PRIVATE).

    Note that the order of the parts is NOT reversed too. Consider a CDS
    on the forward strand with exons small, medium and large (in length).
    Once we change the frame of reference to the reverse complement strand,
    the start codon is still part of the small exon, and the stop codon
    still part of the large exon - so the part order remains the same!

    Here is an artificial example, where the features map to the two upper
    case regions and the lower case runs of n are not used:

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqFeature import SimpleLocation
    >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
    >>> small = SimpleLocation(5, 20, strand=1)
    >>> large = SimpleLocation(28, 52, strand=1)
    >>> location = small + large
    >>> print(small)
    [5:20](+)
    >>> print(large)
    [28:52](+)
    >>> print(location)
    join{[5:20](+), [28:52](+)}
    >>> for part in location.parts:
    ...     print(len(part))
    ...
    15
    24

    As you can see, this is a silly example where each "exon" is a word:

    >>> print(small.extract(dna).translate())
    SILLY
    >>> print(large.extract(dna).translate())
    EXAMPLE*
    >>> print(location.extract(dna).translate())
    SILLYEXAMPLE*
    >>> for part in location.parts:
    ...     print(part.extract(dna).translate())
    ...
    SILLY
    EXAMPLE*

    Now, let's look at this from the reverse strand frame of reference:

    >>> flipped_dna = dna.reverse_complement()
    >>> flipped_location = location._flip(len(dna))
    >>> print(flipped_location.extract(flipped_dna).translate())
    SILLYEXAMPLE*
    >>> for part in flipped_location.parts:
    ...     print(part.extract(flipped_dna).translate())
    ...
    SILLY
    EXAMPLE*

    The key point here is the first part of the CompoundLocation is still the
    small exon, while the second part is still the large exon:

    >>> for part in flipped_location.parts:
    ...     print(len(part))
    ...
    15
    24
    >>> print(flipped_location)
    join{[37:52](-), [5:29](-)}

    Notice the parts are not reversed. However, there was a bug here in older
    versions of Biopython which would have given join{[5:29](-), [37:52](-)}
    and the translation would have wrongly been "EXAMPLE*SILLY" instead.

    """
    flipped_parts = []
    for part in self.parts:
        # Each part is flipped on its own; the part order is kept.
        flipped_parts.append(part._flip(length))
    return CompoundLocation(flipped_parts, self.operator)
1716 | |
@property
def start(self):
    """Return the left-most (smallest) start of all parts, ignoring strand.

    This is a read-only property.  The value returned is the ``start`` of
    whichever part compares lowest, so it is an integer-like position
    object and may be a fuzzy position.

    For the special case of a CompoundLocation wrapping the origin of a
    circular genome, this will return zero.
    """
    part_starts = [part.start for part in self.parts]
    return min(part_starts)
1728 | |
@property
def end(self):
    """Return the right-most (largest) end of all parts, ignoring strand.

    This is a read-only property.  The value returned is the ``end`` of
    whichever part compares highest, so it is an integer-like position
    object and may be a fuzzy position.

    For the special case of a CompoundLocation wrapping the origin of a
    circular genome, this will match the genome length.
    """
    part_ends = [part.end for part in self.parts]
    return max(part_ends)
1740 | |
@property
def ref(self):
    """Always None for a CompoundLocation.

    A CompoundLocation holds no reference of its own; this dummy property
    exists purely for API compatibility.
    """
    return None
1745 | |
@property
def ref_db(self):
    """Always None for a CompoundLocation.

    A CompoundLocation holds no reference database of its own; this dummy
    property exists purely for API compatibility.
    """
    return None
1750 | |
def extract(self, parent_sequence, references=None):
    """Extract and join the sub-sequences described by this CompoundLocation.

    Every part's own ``extract`` method is called on parent_sequence (with
    the optional references dictionary passed through unchanged), and the
    resulting pieces are concatenated with ``+`` in the order of the parts.

    The parent_sequence can be a Seq like object or a string, and the
    result will generally be an object of the same type.  The exception to
    this is a MutableSeq as the parent sequence, which will return a Seq
    object.  If the location refers to other records, they must be
    supplied in the optional dictionary references.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
    >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
    >>> fl1 = SimpleLocation(2, 8)
    >>> fl2 = SimpleLocation(10, 15)
    >>> fl3 = CompoundLocation([fl1,fl2])
    >>> fl3.extract(seq)
    Seq('QHKAMILIVIC')

    """

    def _concatenate(left, right):
        return left + right

    # Each part handles its own strand, so this copes with mixed strand
    # features and with features entirely on the reverse strand.
    pieces = []
    for member in self.parts:
        pieces.append(member.extract(parent_sequence, references=references))
    return functools.reduce(_concatenate, pieces)
1776 | |
1777 | |
class Position(ABC):
    """Abstract base class for all position objects."""

    @abstractmethod
    def __repr__(self):
        """Return a debugging representation of the Position object."""
        return "%s(...)" % type(self).__name__

    @staticmethod
    def fromstring(text, offset=0):
        """Parse a text string into the matching Position object.

        The offset converts the one-based number in the text into Python
        counting.  Leave it as zero (the default) for an end position:

        >>> Position.fromstring("5")
        ExactPosition(5)

        Use minus one for a start position:

        >>> Position.fromstring("5", -1)
        ExactPosition(4)

        Any other offset raises a ValueError.

        Fuzzy positions are handled too:

        >>> p = Position.fromstring("<5")
        >>> p
        BeforePosition(5)
        >>> print(p)
        <5
        >>> int(p)
        5

        >>> Position.fromstring(">5")
        AfterPosition(5)

        With the default offset an end position is assumed, which decides
        the integer value of multi-valued positions (the largest is used):

        >>> p = Position.fromstring("one-of(5,8,11)")
        >>> p
        OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
        >>> print(p)
        one-of(5,8,11)
        >>> int(p)
        11

        >>> Position.fromstring("(8.10)")
        WithinPosition(10, left=8, right=10)

        Fuzzy start positions:

        >>> p = Position.fromstring("<5", -1)
        >>> p
        BeforePosition(4)
        >>> print(p)
        <4
        >>> int(p)
        4

        For a start position the integer value becomes the smallest choice:

        >>> p = Position.fromstring("one-of(5,8,11)", -1)
        >>> p
        OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
        >>> print(p)
        one-of(4,7,10)
        >>> int(p)
        4

        """
        if offset not in (0, -1):
            raise ValueError(
                "To convert one-based indices to zero-based indices, offset must be either 0 (for end positions) or -1 (for start positions)."
            )
        # A lone question mark carries no number at all.
        if text == "?":
            return UnknownPosition()
        # Single-character markers followed by one number, checked in order.
        prefixed_kinds = (
            ("?", UncertainPosition),
            ("<", BeforePosition),
            (">", AfterPosition),
        )
        for marker, kind in prefixed_kinds:
            if text.startswith(marker):
                return kind(int(text[1:]) + offset)
        # Start positions act like their smallest value, ends like the largest.
        anchored_at_start = offset == -1
        within = _re_within_position.match(text)
        if within is not None:
            first, second = within.groups()
            left = int(first) + offset
            right = int(second) + offset
            return WithinPosition(
                left if anchored_at_start else right, left=left, right=right
            )
        one_of = _re_oneof_position.match(text)
        if one_of is not None:
            choices = [
                ExactPosition(int(item) + offset)
                for item in one_of.group(1).split(",")
            ]
            pick = min if anchored_at_start else max
            return OneOfPosition(pick(int(choice) for choice in choices), choices=choices)
        return ExactPosition(int(text) + offset)
1879 | |
1880 | |
class ExactPosition(int, Position):
    """A boundary position that is known exactly (no fuzziness).

    Arguments:
     - position - The position of the boundary.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    >>> p = ExactPosition(5)
    >>> p
    ExactPosition(5)
    >>> print(p)
    5

    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    The object is an integer, so comparisons and arithmetic behave as you
    would expect:

    >>> p == 5
    True
    >>> p < 6
    True
    >>> p <= 5
    True
    >>> p + 10
    ExactPosition(15)

    """

    def __new__(cls, position, extension=0):
        """Create an ExactPosition object; a non-zero extension is rejected."""
        if extension != 0:
            raise AttributeError(f"Non-zero extension {extension} for exact position.")
        return int.__new__(cls, position)

    # Must define this on Python 3.8 onwards because we redefine __repr__
    def __str__(self):
        """Return the position as a plain number string (with python counting)."""
        plain = int(self)
        return str(plain)

    def __repr__(self):
        """Return a debugging representation of the ExactPosition object."""
        return f"{type(self).__name__}({int(self)})"

    def __add__(self, offset):
        """Return a copy of the position shifted by offset (PRIVATE)."""
        # Build the result via the instance's own class to preserve subclasses
        shifted = int(self) + offset
        return self.__class__(shifted)

    def _flip(self, length):
        """Return the position after a parent of the given length is reversed (PRIVATE)."""
        # Build the result via the instance's own class to preserve subclasses
        mirrored = length - int(self)
        return self.__class__(mirrored)
1940 | |
1941 | |
class UncertainPosition(ExactPosition):
    """Specify a specific position which is uncertain.

    This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
    XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.

    The class adds nothing of its own: all behavior is inherited unchanged
    from ExactPosition, so only the class name marks the uncertainty.
    """
1948 | |
1949 | |
class UnknownPosition(Position):
    """A position that is unknown, i.e. one that carries no number at all.

    This is used in UniProt, e.g. ? or in the XML as unknown.
    """

    def __repr__(self):
        """Return a debugging representation of the UnknownPosition object."""
        return type(self).__name__ + "()"

    def __hash__(self):
        """Return the hash value of the UnknownPosition object (that of None)."""
        return hash(None)

    def __add__(self, offset):
        """Return the object itself; shifting an unknown position changes nothing (PRIVATE)."""
        return self

    def _flip(self, length):
        """Return the object itself; reversing the parent changes nothing (PRIVATE)."""
        return self
1971 | |
1972 | |
class WithinPosition(int, Position):
    """A boundary that lies somewhere within a range of coordinates.

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    The position argument must equal either left or right, otherwise a
    RuntimeError is raised.

    This allows dealing with a location like ((11.14)..100). This
    indicates that the start of the sequence is somewhere between 11
    and 14. Since this is a start coordinate, it should act like
    it is at position 11 (or in Python counting, 10).

    >>> p = WithinPosition(10, 10, 13)
    >>> p
    WithinPosition(10, left=10, right=13)
    >>> print(p)
    (10.13)
    >>> int(p)
    10

    The object is an integer, so basic comparisons and operations work
    as though it were a plain integer:

    >>> p == 10
    True
    >>> p in [9, 10, 11]
    True
    >>> p < 11
    True
    >>> p + 10
    WithinPosition(20, left=20, right=23)

    >>> isinstance(p, WithinPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    The same integer behavior is used when comparing against other
    position objects:

    >>> p == 10
    True
    >>> p == ExactPosition(10)
    True
    >>> p == BeforePosition(10)
    True
    >>> p == AfterPosition(10)
    True

    If this were an end point, you would want the position to be 13
    (the right/larger value, not the left/smaller value as above):

    >>> p2 = WithinPosition(13, 10, 13)
    >>> p2
    WithinPosition(13, left=10, right=13)
    >>> print(p2)
    (10.13)
    >>> int(p2)
    13
    >>> p2 == 13
    True
    >>> p2 == ExactPosition(13)
    True

    """

    def __new__(cls, position, left, right):
        """Create a WithinPosition object."""
        if position != left and position != right:
            raise RuntimeError(
                "WithinPosition: %r should match left %r or right %r"
                % (position, left, right)
            )
        instance = int.__new__(cls, position)
        instance._left, instance._right = left, right
        return instance

    def __getnewargs__(self):
        """Return the arguments to pass to __new__.

        Defined so that instances can be pickled and unpickled.
        """
        return int(self), self._left, self._right

    def __repr__(self):
        """Return a debugging representation of the WithinPosition object."""
        fields = (type(self).__name__, int(self), self._left, self._right)
        return "%s(%i, left=%i, right=%i)" % fields

    def __str__(self):
        """Return the range as "(left.right)" (with python counting)."""
        return "({}.{})".format(self._left, self._right)

    def __add__(self, offset):
        """Return a copy of the position object with all three values shifted."""
        moved = (int(self) + offset, self._left + offset, self._right + offset)
        return self.__class__(*moved)

    def _flip(self, length):
        """Return the position after a parent of the given length is reversed (PRIVATE)."""
        # Reversal swaps the roles of the boundaries: old right becomes new left.
        new_left = length - self._right
        new_right = length - self._left
        return self.__class__(length - int(self), new_left, new_right)
2085 | |
2086 | |
class BetweenPosition(int, Position):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    The position argument must equal either left or right, otherwise an
    AssertionError is raised.

    This allows dealing with a position like 123^456. This
    indicates that the start of the sequence is somewhere between
    123 and 456. It is up to the parser to set the position argument
    to either boundary point (depending on if this is being used as
    a start or end of the feature). For example as a feature end:

    >>> p = BetweenPosition(456, 123, 456)
    >>> p
    BetweenPosition(456, left=123, right=456)
    >>> print(p)
    (123^456)
    >>> int(p)
    456

    Integer equality and comparison use the given position,

    >>> p == 456
    True
    >>> p in [455, 456, 457]
    True
    >>> p > 300
    True

    Note that the position object will act like either the left or the
    right end-point depending on how it was created:

    >>> p2 = BetweenPosition(123, left=123, right=456)
    >>> int(p) == int(p2)
    False
    >>> p == 456
    True
    >>> p2 == 123
    True

    Note this potentially surprising behavior:

    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
    True

    i.e. For equality (and sorting) the position objects behave like
    integers.

    """

    def __new__(cls, position, left, right):
        """Create a new instance in BetweenPosition object.

        Raises AssertionError if position matches neither left nor right.
        """
        # Raised explicitly rather than via an ``assert`` statement so the
        # check still runs under ``python -O``; the exception type is kept
        # as AssertionError so existing callers see no change.
        if not (position == left or position == right):
            raise AssertionError(
                "BetweenPosition: %r should match left %r or "
                "right %r" % (position, left, right)
            )
        # TODO - public API for getting left/right, especially the unknown one
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Necessary to allow pickling and unpickling of class instances.
        """
        return (int(self), self._left, self._right)

    def __repr__(self):
        """Represent the BetweenPosition object as a string for debugging."""
        return "%s(%i, left=%i, right=%i)" % (
            self.__class__.__name__,
            int(self),
            self._left,
            self._right,
        )

    def __str__(self):
        """Return the boundaries as "(left^right)" (with python counting)."""
        return f"({self._left}^{self._right})"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(
            int(self) + offset, self._left + offset, self._right + offset
        )

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # Reversal swaps the roles of the boundaries: old right becomes new left.
        return self.__class__(
            length - int(self), length - self._right, length - self._left
        )
2186 | |
2187 | |
class BeforePosition(int, Position):
    """Specify a position where the actual location occurs before it.

    Arguments:
     - position - The upper boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.

    >>> p = BeforePosition(5)
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5
    >>> p + 10
    BeforePosition(15)

    Note this potentially surprising behavior:

    >>> p == ExactPosition(5)
    True
    >>> p == AfterPosition(5)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create a new instance in BeforePosition object.

        Raises AttributeError if extension is not zero.
        """
        if extension != 0:
            # Name the actual class; the message used to say "exact position".
            raise AttributeError(
                f"Non-zero extension {extension} for {cls.__name__}."
            )
        return int.__new__(cls, position)

    def __repr__(self):
        """Represent the location as a string for debugging."""
        return "%s(%i)" % (self.__class__.__name__, int(self))

    def __str__(self):
        """Return a representation of the BeforePosition object (with python counting)."""
        return f"<{int(self)}"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE).

        "Before" on one strand is "after" on the other, so the result is an
        AfterPosition.
        """
        return AfterPosition(length - int(self))
2243 | |
2244 | |
class AfterPosition(int, Position):
    """Specify a position where the actual location is found after it.

    Arguments:
     - position - The lower boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.

    >>> p = AfterPosition(7)
    >>> p
    AfterPosition(7)
    >>> print(p)
    >7
    >>> int(p)
    7
    >>> p + 10
    AfterPosition(17)

    >>> isinstance(p, AfterPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    Note this potentially surprising behavior:

    >>> p == ExactPosition(7)
    True
    >>> p == BeforePosition(7)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create a new instance of the AfterPosition object.

        Raises AttributeError if extension is not zero.
        """
        if extension != 0:
            # Name the actual class; the message used to say "exact position".
            raise AttributeError(
                f"Non-zero extension {extension} for {cls.__name__}."
            )
        return int.__new__(cls, position)

    def __repr__(self):
        """Represent the location as a string for debugging."""
        return "%s(%i)" % (self.__class__.__name__, int(self))

    def __str__(self):
        """Return a representation of the AfterPosition object (with python counting)."""
        return f">{int(self)}"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE).

        "After" on one strand is "before" on the other, so the result is a
        BeforePosition.
        """
        return BeforePosition(length - int(self))
2307 | |
2308 | |
class OneOfPosition(int, Position):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. If this was
    a start position it should act like 1888, but as an end position 1901.

    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
    >>> p
    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
    >>> int(p)
    1888

    Integer comparisons and operators act like using int(p),

    >>> p == 1888
    True
    >>> p <= 1888
    True
    >>> p > 1888
    False
    >>> p + 100
    OneOfPosition(1988, choices=[ExactPosition(1988), ExactPosition(2001)])

    >>> isinstance(p, OneOfPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    """

    def __new__(cls, position, choices):
        """Initialize with a set of possible positions.

        choices is a list of Position derived objects, specifying possible
        locations.

        position is an integer specifying the default behavior.  It must
        compare equal to one of the choices, otherwise ValueError is raised.
        """
        if position not in choices:
            raise ValueError(
                f"OneOfPosition: {position!r} should match one of {choices!r}"
            )
        obj = int.__new__(cls, position)
        # The caller's list is stored as given (not copied).
        obj.position_choices = choices
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Necessary to allow pickling and unpickling of class instances.
        """
        return (int(self), self.position_choices)

    def __repr__(self):
        """Represent the OneOfPosition object as a string for debugging."""
        return "%s(%i, choices=%r)" % (
            self.__class__.__name__,
            int(self),
            self.position_choices,
        )

    def __str__(self):
        """Return a representation of the OneOfPosition object (with python counting)."""
        # str.join keeps the parentheses balanced even for an empty choice
        # list and avoids repeated string concatenation.
        listed = ",".join(str(choice) for choice in self.position_choices)
        return f"one-of({listed})"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(
            int(self) + offset, [p + offset for p in self.position_choices]
        )

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # The choices are flipped individually and listed in reverse order.
        return self.__class__(
            length - int(self), [p._flip(length) for p in self.position_choices[::-1]]
        )
2392 | |
2393 | |
# Script entry point: only when this file is executed directly (not when it
# is imported), import the helper from Bio._utils and call it.  Judging by
# its name, run_doctest runs the doctests embedded in the docstrings above.
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()