comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/SeqFeature.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 # Copyright 2000-2003 Jeff Chang.
2 # Copyright 2001-2008 Brad Chapman.
3 # Copyright 2005-2024 by Peter Cock.
4 # Copyright 2006-2009 Michiel de Hoon.
5 # All rights reserved.
6 #
7 # This file is part of the Biopython distribution and governed by your
8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
9 # Please see the LICENSE file that should have been included as part of this
10 # package.
11 """Represent a Sequence Feature holding info about a part of a sequence.
12
13 This is heavily modeled after the Biocorba SeqFeature objects, and
14 may be pretty biased towards GenBank stuff since I'm writing it
15 for the GenBank parser output...
16
17 What's here:
18
19 Base class to hold a Feature
20 ----------------------------
21
22 Classes:
23 - SeqFeature
24
25 Hold information about a Reference
26 ----------------------------------
27
28 This is an attempt to create a General class to hold Reference type
29 information.
30
31 Classes:
32 - Reference
33
34 Specify locations of a feature on a Sequence
35 --------------------------------------------
36
37 This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'.
38 This has the advantages of allowing us to handle fuzzy stuff in case anyone
39 needs it, and also be compatible with BioPerl etc and BioSQL.
40
41 Classes:
42 - Location - abstract base class of SimpleLocation and CompoundLocation.
43 - SimpleLocation - Specify the start and end location of a feature.
44 - CompoundLocation - Collection of SimpleLocation objects (for joins etc).
 - Position - abstract base class of ExactPosition, WithinPosition,
   BetweenPosition, BeforePosition, AfterPosition, OneOfPosition,
   UncertainPosition, and UnknownPosition.
48 - ExactPosition - Specify the position as being exact.
49 - WithinPosition - Specify a position occurring within some range.
50 - BetweenPosition - Specify a position occurring between a range (OBSOLETE?).
51 - BeforePosition - Specify the position as being found before some base.
52 - AfterPosition - Specify the position as being found after some base.
53 - OneOfPosition - Specify a position consisting of multiple alternative positions.
54 - UncertainPosition - Specify a specific position which is uncertain.
55 - UnknownPosition - Represents missing information like '?' in UniProt.
56
57
58 Exceptions:
59 - LocationParserError - Exception indicating a failure to parse a location
60 string.
61
62 """
63 import functools
64 import re
65 import warnings
66 from abc import ABC, abstractmethod
67
68 from Bio import BiopythonDeprecationWarning
69 from Bio import BiopythonParserWarning
70 from Bio.Seq import MutableSeq
71 from Bio.Seq import reverse_complement
72 from Bio.Seq import Seq
73
74
# Regular expressions for location parsing
#
# The patterns below are plain strings that get composed into larger
# expressions. Several names (_oneof_location, _oneof_position and
# _within_position) are deliberately rebound further down, AFTER the earlier
# patterns have already embedded their first value, so the statement order in
# this section matters.

# Optional reference prefix terminated by a colon, e.g. the "NC_016402.1:"
# in "NC_016402.1:6618..6676".
_reference = r"(?:[a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)"
# Group-free form of a one-of position such as "one-of(200,203)", used while
# splitting so that the splitter pattern keeps exactly one capturing group.
# NOTE(review): "[,\d+]+" is a character class (comma, digit or a literal
# "+"), not a repeated ",<digits>" group, so e.g. "one-of(33)" would also
# match - confirm whether that looseness is intended.
_oneof_position = r"one\-of\(\d+[,\d+]+\)"

# start..end where either end may be a plain number or a one-of position,
# each optionally prefixed by "<" or ">".
_oneof_location = rf"[<>]?(?:\d+|{_oneof_position})\.\.[<>]?(?:\d+|{_oneof_position})"

# One element of a comma separated location list. The specific one-of forms
# are tried first because they contain commas themselves; the generic
# "[^,]+" alternatives then pick up everything else up to the next comma.
_any_location = rf"({_reference}?{_oneof_location}|complement\({_oneof_location}\)|[^,]+|complement\([^,]+\))"

# re.split() with a single capturing group returns the text between matches
# interleaved with the captured matches, so callers take the odd indexed
# items ([1::2]) to obtain the individual locations.
_split = re.compile(_any_location).split

# Import-time self checks of the splitter (skipped when Python runs with -O).
assert _split("123..145")[1::2] == ["123..145"]
assert _split("123..145,200..209")[1::2] == ["123..145", "200..209"]
assert _split("one-of(200,203)..300")[1::2] == ["one-of(200,203)..300"]
assert _split("complement(123..145),200..209")[1::2] == [
    "complement(123..145)",
    "200..209",
]
assert _split("123..145,one-of(200,203)..209")[1::2] == [
    "123..145",
    "one-of(200,203)..209",
]
assert _split("123..145,one-of(200,203)..one-of(209,211),300")[1::2] == [
    "123..145",
    "one-of(200,203)..one-of(209,211)",
    "300",
]
assert _split("123..145,complement(one-of(200,203)..one-of(209,211)),300")[1::2] == [
    "123..145",
    "complement(one-of(200,203)..one-of(209,211))",
    "300",
]
assert _split("123..145,200..one-of(209,211),300")[1::2] == [
    "123..145",
    "200..one-of(209,211)",
    "300",
]
assert _split("123..145,200..one-of(209,211)")[1::2] == [
    "123..145",
    "200..one-of(209,211)",
]
assert _split(
    "complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"
)[1::2] == [
    "complement(149815..150200)",
    "complement(293787..295573)",
    "NC_016402.1:6618..6676",
    "181647..181905",
]


# Plain "start..end" pair; each number may carry a "<" or ">" marker and an
# optional minus sign.
_pair_location = r"[<>]?-?\d+\.\.[<>]?-?\d+"

# Position between two bases, e.g. "123^124".
_between_location = r"\d+\^\d+"

# Within position such as "(3.9)". The first definition has no capturing
# groups and is only used to build _within_location; the name is then rebound
# to a variant that captures both numbers, which is the one compiled below.
_within_position = r"\(\d+\.\d+\)"
_within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (
    _within_position,
    _within_position,
)
_within_position = r"\((\d+)\.(\d+)\)"
_re_within_position = re.compile(_within_position)
assert _re_within_position.match("(3.9)")

# Same two-step pattern for one-of positions: _oneof_location is rebuilt here
# with capturing groups around each end (still embedding the group-free
# _oneof_position), and only afterwards is _oneof_position rebound to a
# variant that captures the comma separated choices.
_oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (_oneof_position, _oneof_position)
_oneof_position = r"one\-of\((\d+[,\d+]+)\)"
_re_oneof_position = re.compile(_oneof_position)
assert _re_oneof_position.match("one-of(6,9)")
assert not _re_oneof_position.match("one-of(3)")
assert _re_oneof_position.match("one-of(3,6)")
assert _re_oneof_position.match("one-of(3,6,9)")

# Single position, optionally marked with "<" or ">", and its bond(...) form.
_solo_location = r"[<>]?\d+"
_solo_bond = r"bond\(%s\)" % _solo_location

# Classifies a location string; the named group that matched identifies the
# category (pair, between, within, oneof, bond or solo).
# NOTE(review): "|" binds more loosely than "^" and "$", so "^" applies only
# to the first alternative (pair) and "$" only to the last (solo). Every
# other alternative can therefore match a prefix of the text - confirm that
# callers verify that the whole string was consumed.
_re_location_category = re.compile(
    r"^(?P<pair>%s)|(?P<between>%s)|(?P<within>%s)|(?P<oneof>%s)|(?P<bond>%s)|(?P<solo>%s)$"
    % (
        _pair_location,
        _between_location,
        _within_location,
        _oneof_location,
        _solo_bond,
        _solo_location,
    )
)
161
162
class LocationParserError(ValueError):
    """Raised when a feature location string cannot be parsed.

    This is a ValueError subclass, so code which already catches
    ValueError will also catch this exception.
    """
165
166
class SeqFeature:
    """Represent a Sequence Feature on an object.

    Attributes:
     - location - the location of the feature on the sequence (SimpleLocation)
     - type - the specified type of the feature (ie. CDS, exon, repeat...)
     - id - A string identifier for the feature.
     - qualifiers - A dictionary of qualifiers on the feature. These are
       analogous to the qualifiers from a GenBank feature table. The keys of
       the dictionary are qualifier names, the values are the qualifier
       values.

    """

    def __init__(
        self,
        location=None,
        type="",
        id="<unknown id>",
        qualifiers=None,
        sub_features=None,
    ):
        """Initialize a SeqFeature on a sequence.

        location can either be a SimpleLocation (with strand argument also
        given if required), a CompoundLocation, or None.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f1 = SeqFeature(SimpleLocation(5, 10), type="domain")
        >>> f1.location.strand == None
        True
        >>> f2 = SeqFeature(SimpleLocation(7, 110, strand=1), type="CDS")
        >>> f2.location.strand == +1
        True
        >>> f3 = SeqFeature(SimpleLocation(9, 108, strand=-1), type="CDS")
        >>> f3.location.strand == -1
        True

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        SimpleLocation must be specified via the appropriate position objects.

        The qualifiers mapping (if given) is copied into a new dictionary.
        The sub_features argument is no longer supported; passing anything
        other than None raises a TypeError.
        """
        if location is not None and not isinstance(
            location, (SimpleLocation, CompoundLocation)
        ):
            raise TypeError(
                "SimpleLocation, CompoundLocation (or None) required for the location"
            )
        self.location = location
        self.type = type
        self.id = id
        # Always start from a fresh dict so the caller's mapping is not aliased.
        self.qualifiers = {}
        if qualifiers is not None:
            self.qualifiers.update(qualifiers)
        if sub_features is not None:
            raise TypeError("Rather than sub_features, use a CompoundLocation")

    def _get_strand(self):
        """Get function for the strand property (PRIVATE)."""
        warnings.warn(
            "Please use .location.strand rather than .strand",
            BiopythonDeprecationWarning,
        )
        return self.location.strand

    def _set_strand(self, value):
        """Set function for the strand property (PRIVATE)."""
        warnings.warn(
            "Please use .location.strand rather than .strand",
            BiopythonDeprecationWarning,
        )
        try:
            self.location.strand = value
        except AttributeError:
            # With no location, setting None is a harmless no-op; anything
            # else cannot be stored. Other AttributeErrors are propagated.
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set strand without a location.") from None
            else:
                raise

    strand = property(
        fget=_get_strand,
        fset=_set_strand,
        doc="Alias for the location's strand (DEPRECATED).",
    )

    def _get_ref(self):
        """Get function for the reference property (PRIVATE)."""
        warnings.warn(
            "Please use .location.ref rather than .ref",
            BiopythonDeprecationWarning,
        )
        try:
            return self.location.ref
        except AttributeError:
            return None

    def _set_ref(self, value):
        """Set function for the reference property (PRIVATE)."""
        warnings.warn(
            "Please use .location.ref rather than .ref",
            BiopythonDeprecationWarning,
        )
        try:
            self.location.ref = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref without a location.") from None
            else:
                raise

    ref = property(
        fget=_get_ref,
        fset=_set_ref,
        doc="Alias for the location's ref (DEPRECATED).",
    )

    def _get_ref_db(self):
        """Get function for the database reference property (PRIVATE)."""
        warnings.warn(
            "Please use .location.ref_db rather than .ref_db",
            BiopythonDeprecationWarning,
        )
        try:
            return self.location.ref_db
        except AttributeError:
            return None

    def _set_ref_db(self, value):
        """Set function for the database reference property (PRIVATE).

        Behaves like the strand and ref setters: when the feature has no
        location, setting None is silently accepted and any other value
        raises a ValueError.
        """
        warnings.warn(
            "Please use .location.ref_db rather than .ref_db",
            BiopythonDeprecationWarning,
        )
        try:
            self.location.ref_db = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref_db without a location.") from None
            else:
                raise

    ref_db = property(
        fget=_get_ref_db,
        fset=_set_ref_db,
        doc="Alias for the location's ref_db (DEPRECATED).",
    )

    def __eq__(self, other):
        """Check if two SeqFeature objects should be considered equal."""
        return (
            isinstance(other, SeqFeature)
            and self.id == other.id
            and self.type == other.type
            and self.location == other.location
            and self.qualifiers == other.qualifiers
        )

    def __repr__(self):
        """Represent the feature as a string for debugging."""
        answer = f"{self.__class__.__name__}({self.location!r}"
        if self.type:
            answer += f", type={self.type!r}"
        if self.id and self.id != "<unknown id>":
            answer += f", id={self.id!r}"
        if self.qualifiers:
            # The qualifier contents are deliberately elided.
            answer += ", qualifiers=..."
        answer += ")"
        return answer

    def __str__(self):
        """Return the full feature as a python string."""
        lines = [f"type: {self.type}\n", f"location: {self.location}\n"]
        if self.id and self.id != "<unknown id>":
            lines.append(f"id: {self.id}\n")
        lines.append("qualifiers:\n")
        # NOTE(review): the run of spaces before "Key:" may have been collapsed
        # in the copy under review - confirm the exact indent width.
        lines.extend(
            f" Key: {qual_key}, Value: {self.qualifiers[qual_key]}\n"
            for qual_key in sorted(self.qualifiers)
        )
        return "".join(lines)

    def _shift(self, offset):
        """Return a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied.
        """
        return SeqFeature(
            location=self.location._shift(offset),
            type=self.type,
            id=self.id,
            qualifiers=self.qualifiers.copy(),
        )

    def _flip(self, length):
        """Return a copy of the feature with its location flipped (PRIVATE).

        The argument length gives the length of the parent sequence. For
        example a location 0..20 (+1 strand) with parent length 30 becomes
        after flipping 10..30 (-1 strand). Strandless (None) or unknown
        strand (0) remain like that - just their end points are changed.

        The annotation qualifiers are copied.
        """
        return SeqFeature(
            location=self.location._flip(length),
            type=self.type,
            id=self.id,
            qualifiers=self.qualifiers.copy(),
        )

    def extract(self, parent_sequence, references=None):
        """Extract the feature's sequence from supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted. If the
        location refers to other records, they must be supplied in the
        optional dictionary references.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC')

        If the SimpleLocation is None, e.g. when parsing invalid locus
        locations in the GenBank parser, extract() will raise a ValueError.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(None, type="domain")
        >>> f.extract(seq)
        Traceback (most recent call last):
           ...
        ValueError: The feature's .location is None. Check the sequence file for a valid location.

        Note - currently only compound features of type "join" are supported.
        """
        if self.location is None:
            raise ValueError(
                "The feature's .location is None. Check the "
                "sequence file for a valid location."
            )
        return self.location.extract(parent_sequence, references=references)

    def translate(
        self,
        parent_sequence,
        table="Standard",
        start_offset=None,
        stop_symbol="*",
        to_stop=False,
        cds=None,
        gap=None,
    ):
        """Get a translation of the feature's sequence.

        This method is intended for CDS or other features that code proteins
        and is a shortcut that will both extract the feature and
        translate it, taking into account the codon_start and transl_table
        qualifiers, if they are present. If they are not present the
        value of the arguments "table" and "start_offset" are used.

        The "cds" parameter is set to "True" if the feature is of type
        "CDS" but can be overridden by giving an explicit argument.

        The arguments stop_symbol, to_stop and gap have the same meaning
        as Seq.translate, refer to that documentation for further information.

        Arguments:
         - parent_sequence - A DNA or RNA sequence.
         - table - Which codon table to use if there is no transl_table
           qualifier for this feature. This can be either a name
           (string), an NCBI identifier (integer), or a CodonTable
           object (useful for non-standard genetic codes). This
           defaults to the "Standard" table.
         - start_offset - offset at which the first complete codon of a
           coding feature can be found, relative to the first base of
           that feature. Has a valid value of 0, 1 or 2. NOTE: this
           uses python's 0-based numbering whereas the codon_start
           qualifier in files from NCBI use 1-based numbering.
           Will override a codon_start qualifier

        A ValueError is raised if the resulting start offset is not 0, 1
        or 2.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("GGTTACACTTACCGATAATGTCTCTGATGA")
        >>> f = SeqFeature(SimpleLocation(0, 30), type="CDS")
        >>> f.qualifiers['transl_table'] = [11]

        Note that features of type CDS are subject to the usual
        checks at translation. But you can override this behavior
        by giving explicit arguments:

        >>> f.translate(seq, cds=False)
        Seq('GYTYR*CL**')

        Now use the start_offset argument to change the frame. Note
        this uses python 0-based numbering.

        >>> f.translate(seq, start_offset=1, cds=False)
        Seq('VTLTDNVSD')

        Alternatively use the codon_start qualifier to do the same
        thing. Note: this uses 1-based numbering, which is found
        in files from NCBI.

        >>> f.qualifiers['codon_start'] = [2]
        >>> f.translate(seq, cds=False)
        Seq('VTLTDNVSD')
        """
        # see if this feature should be translated in a different
        # frame using the "codon_start" qualifier
        if start_offset is None:
            try:
                # codon_start is 1-based, start_offset is 0-based
                start_offset = int(self.qualifiers["codon_start"][0]) - 1
            except KeyError:
                start_offset = 0

        if start_offset not in [0, 1, 2]:
            raise ValueError(
                "The start_offset must be 0, 1, or 2. "
                f"The supplied value is {start_offset}. "
                "Check the value of either the codon_start qualifier "
                "or the start_offset argument"
            )

        feat_seq = self.extract(parent_sequence)[start_offset:]
        # A transl_table qualifier takes precedence over the table argument
        codon_table = self.qualifiers.get("transl_table", [table])[0]

        if cds is None:
            cds = self.type == "CDS"

        return feat_seq.translate(
            table=codon_table,
            stop_symbol=stop_symbol,
            to_stop=to_stop,
            cds=cds,
            gap=gap,
        )

    def __bool__(self):
        """Boolean value of an instance of this class (True).

        This behavior is for backwards compatibility, since until the
        __len__ method was added, a SeqFeature always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SeqFeature may in future evaluate to False when its
        length is zero (in order to better match normal python behavior)!
        """
        return True

    def __len__(self):
        """Return the length of the region where the feature is located.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
        >>> len(f)
        7
        >>> f.extract(seq)
        Seq('VALIVIC')
        >>> len(f.extract(seq))
        7

        This is a proxy for taking the length of the feature's location:

        >>> len(f.location)
        7

        For simple features this is the same as the region spanned (end
        position minus start position using Pythonic counting). However, for
        a compound location (e.g. a CDS as the join of several exons) the
        gaps are not counted (e.g. introns). This ensures that len(f) matches
        len(f.extract(parent_seq)), and also makes sure things work properly
        with features wrapping the origin etc.
        """
        return len(self.location)

    def __iter__(self):
        """Iterate over the parent positions within the feature.

        The iteration order is strand aware, and can be thought of as moving
        along the feature using the parent sequence coordinates:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
        >>> len(f)
        5
        >>> for i in f: print(i)
        9
        8
        7
        6
        5
        >>> list(f)
        [9, 8, 7, 6, 5]

        This is a proxy for iterating over the location,

        >>> list(f.location)
        [9, 8, 7, 6, 5]
        """
        return iter(self.location)

    def __contains__(self, value):
        """Check if an integer position is within the feature.

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(15) if i in f]
        [5, 6, 7, 8, 9]

        For example, to see which features include a SNP position, you could
        use this:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
        >>> for f in record.features:
        ...     if 1750 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)
        tRNA join{[4310:4347](-), [1716:1751](-)}

        Note that for a feature defined as a join of several subfeatures (e.g.
        the union of several exons) the gaps are not checked (e.g. introns).
        In this example, the tRNA location is defined in the GenBank file as
        complement(join(1717..1751,4311..4347)), so that position 1760 falls
        in the gap:

        >>> for f in record.features:
        ...     if 1760 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)

        Note that additional care may be required with fuzzy locations, for
        example just before a BeforePosition:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> from Bio.SeqFeature import BeforePosition
        >>> f = SeqFeature(SimpleLocation(BeforePosition(3), 8), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(10) if i in f]
        [3, 4, 5, 6, 7]

        Note that this is a proxy for testing membership on the location.

        >>> [i for i in range(10) if i in f.location]
        [3, 4, 5, 6, 7]
        """
        return value in self.location
632
633
634 # --- References
635
636
637 # TODO -- Will this hold PubMed and Medline information decently?
class Reference:
    """Represent a Generic Reference object.

    Attributes:
     - location - A list of Location objects specifying regions of
       the sequence that the references correspond to. If no locations are
       specified, the entire sequence is assumed.
     - authors - A big old string, or a list split by author, of authors
       for the reference.
     - consrtm - A string, empty by default (presumably the consortium
       authoring the reference - TODO confirm).
     - title - The title of the reference.
     - journal - Journal the reference was published in.
     - medline_id - A medline reference for the article.
     - pubmed_id - A pubmed reference for the article.
     - comment - A place to stick any comments about the reference.

    """

    def __init__(self):
        """Initialize the class."""
        self.location = []
        self.authors = ""
        self.consrtm = ""
        self.title = ""
        self.journal = ""
        self.medline_id = ""
        self.pubmed_id = ""
        self.comment = ""

    def __str__(self):
        """Return the full Reference object as a python string."""
        lines = [f"location: {single_location}\n" for single_location in self.location]
        lines.append(f"authors: {self.authors}\n")
        # The consortium line is only shown when it has a value
        if self.consrtm:
            lines.append(f"consrtm: {self.consrtm}\n")
        lines.append(f"title: {self.title}\n")
        lines.append(f"journal: {self.journal}\n")
        lines.append(f"medline id: {self.medline_id}\n")
        lines.append(f"pubmed id: {self.pubmed_id}\n")
        lines.append(f"comment: {self.comment}\n")
        return "".join(lines)

    def __repr__(self):
        """Represent the Reference object as a string for debugging."""
        # TODO - Update this if __init__ later accepts values
        return f"{self.__class__.__name__}(title={self.title!r}, ...)"

    def __eq__(self, other):
        """Check if two Reference objects should be considered equal.

        Comparing with an object which is not a Reference returns
        NotImplemented, so ``reference == None`` evaluates to False rather
        than raising an AttributeError.

        Note prior to Biopython 1.70 the location was not compared, as
        until then __eq__ for the SimpleLocation class was not defined.
        """
        if not isinstance(other, Reference):
            return NotImplemented
        return (
            self.authors == other.authors
            and self.consrtm == other.consrtm
            and self.title == other.title
            and self.journal == other.journal
            and self.medline_id == other.medline_id
            and self.pubmed_id == other.pubmed_id
            and self.comment == other.comment
            and self.location == other.location
        )
702
703
704 # --- Handling feature locations
705
706
class Location(ABC):
    """Abstract base class representing a location."""

    @abstractmethod
    def __repr__(self):
        """Represent the Location object as a string for debugging."""
        return f"{self.__class__.__name__}(...)"

    @staticmethod
    def fromstring(text, length=None, circular=False, stranded=True):
        """Create a Location object from a string.

        This should accept any valid location string in the INSDC Feature Table
        format (https://www.insdc.org/submitting-standards/feature-table/) as
        used in GenBank, DDBJ and EMBL files.

        Arguments:
         - text - the location string to parse.
         - length - passed on to SimpleLocation.fromstring (the expected
           length of the parent sequence, see the examples below).
         - circular - passed on to SimpleLocation.fromstring.
         - stranded - if true a location without complement(...) is given
           strand +1, otherwise strand None. A location wrapped in
           complement(...) always gets strand -1.

        Raises a ValueError if a complement( wrapper is not closed, and a
        LocationParserError (itself a ValueError) for a double complement
        or a location which cannot be parsed.

        Simple examples:

        >>> Location.fromstring("123..456", 1000)
        SimpleLocation(ExactPosition(122), ExactPosition(456), strand=1)
        >>> Location.fromstring("complement(<123..>456)", 1000)
        SimpleLocation(BeforePosition(122), AfterPosition(456), strand=-1)

        A more complex location using within positions,

        >>> Location.fromstring("(9.10)..(20.25)", 1000)
        SimpleLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)

        Notice how that will act as though it has overall start 8 and end 25.

        Zero length between feature,

        >>> Location.fromstring("123^124", 1000)
        SimpleLocation(ExactPosition(123), ExactPosition(123), strand=1)

        The expected sequence length is needed for a special case, a between
        position at the start/end of a circular genome:

        >>> Location.fromstring("1000^1", 1000)
        SimpleLocation(ExactPosition(1000), ExactPosition(1000), strand=1)

        Apart from this special case, between positions P^Q must have P+1==Q,

        >>> Location.fromstring("123^456", 1000)
        Traceback (most recent call last):
           ...
        Bio.SeqFeature.LocationParserError: invalid feature location '123^456'

        You can optionally provide a reference name:

        >>> Location.fromstring("AL391218.9:105173..108462", 2000000)
        SimpleLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')

        >>> Location.fromstring("<2644..159", 2868, "circular")
        CompoundLocation([SimpleLocation(BeforePosition(2643), ExactPosition(2868), strand=1), SimpleLocation(ExactPosition(0), ExactPosition(159), strand=1)], 'join')
        """
        if text.startswith("complement("):
            if text[-1] != ")":
                raise ValueError(f"closing bracket missing in '{text}'")
            # Strip the "complement(" prefix and the closing bracket
            text = text[11:-1]
            strand = -1
        elif stranded:
            strand = 1
        else:
            strand = None

        # Determine if we have a simple location or a compound location
        if text.startswith("join("):
            operator = "join"
            parts = _split(text[5:-1])[1::2]
        elif text.startswith("order("):
            operator = "order"
            parts = _split(text[6:-1])[1::2]
        elif text.startswith("bond("):
            operator = "bond"
            parts = _split(text[5:-1])[1::2]
        else:
            loc = SimpleLocation.fromstring(text, length, circular)
            loc.strand = strand
            if strand == -1:
                loc.parts.reverse()
            return loc
        locs = []
        for part in parts:
            loc = SimpleLocation.fromstring(part, length, circular)
            if loc is None:
                break
            if loc.strand == -1:
                # The part carries its own complement(...); that is only
                # acceptable if the whole location was not complemented too.
                if strand == -1:
                    raise LocationParserError(f"double complement in '{text}'?")
            else:
                loc.strand = strand
            locs.extend(loc.parts)
        else:
            if len(locs) == 1:
                return loc
            # Historically a join on the reverse strand has been represented
            # in Biopython with both the parent SeqFeature and its children
            # (the exons for a CDS) all given a strand of -1. Likewise, for
            # a join feature on the forward strand they all have strand +1.
            # However, we must also consider evil mixed strand examples like
            # this, join(complement(69611..69724),139856..140087,140625..140650)
            if strand == -1:
                # Whole thing was wrapped in complement(...)
                for loc in locs:
                    assert loc.strand == -1
                # Reverse the backwards order used in GenBank files
                # with complement(join(...))
                locs = locs[::-1]
            return CompoundLocation(locs, operator=operator)
        # Not recognized (only reached when the loop above ended via break)
        if "order" in text and "join" in text:
            # See Bug 3197
            raise LocationParserError(
                f"failed to parse feature location '{text}' containing a combination of 'join' and 'order' (nested operators) are illegal"
            )

        # See issue #937. Note that NCBI has already fixed this record.
        if ",)" in text:
            warnings.warn(
                "Dropping trailing comma in malformed feature location",
                BiopythonParserWarning,
            )
            repaired = text.replace(",)", ")")
            if strand == -1:
                # The complement(...) wrapper was stripped above; restore it
                # so the retry keeps the reverse strand.
                repaired = f"complement({repaired})"
            # Retry with the caller's original settings
            return Location.fromstring(repaired, length, circular, stranded)

        raise LocationParserError(f"failed to parse feature location '{text}'")
836
837
838 class SimpleLocation(Location):
839 """Specify the location of a feature along a sequence.
840
841 The SimpleLocation is used for simple continuous features, which can
842 be described as running from a start position to and end position
843 (optionally with a strand and reference information). More complex
844 locations made up from several non-continuous parts (e.g. a coding
845 sequence made up of several exons) are described using a SeqFeature
846 with a CompoundLocation.
847
848 Note that the start and end location numbering follow Python's scheme,
849 thus a GenBank entry of 123..150 (one based counting) becomes a location
850 of [122:150] (zero based counting).
851
852 >>> from Bio.SeqFeature import SimpleLocation
853 >>> f = SimpleLocation(122, 150)
854 >>> print(f)
855 [122:150]
856 >>> print(f.start)
857 122
858 >>> print(f.end)
859 150
860 >>> print(f.strand)
861 None
862
863 Note the strand defaults to None. If you are working with nucleotide
864 sequences you'd want to be explicit if it is the forward strand:
865
866 >>> from Bio.SeqFeature import SimpleLocation
867 >>> f = SimpleLocation(122, 150, strand=+1)
868 >>> print(f)
869 [122:150](+)
870 >>> print(f.strand)
871 1
872
873 Note that for a parent sequence of length n, the SimpleLocation
874 start and end must satisfy the inequality 0 <= start <= end <= n.
875 This means even for features on the reverse strand of a nucleotide
876 sequence, we expect the 'start' coordinate to be less than the
877 'end'.
878
879 >>> from Bio.SeqFeature import SimpleLocation
880 >>> r = SimpleLocation(122, 150, strand=-1)
881 >>> print(r)
882 [122:150](-)
883 >>> print(r.start)
884 122
885 >>> print(r.end)
886 150
887 >>> print(r.strand)
888 -1
889
890 i.e. Rather than thinking of the 'start' and 'end' biologically in a
891 strand aware manner, think of them as the 'left most' or 'minimum'
892 boundary, and the 'right most' or 'maximum' boundary of the region
893 being described. This is particularly important with compound
894 locations describing non-continuous regions.
895
896 In the example above we have used standard exact positions, but there
897 are also specialised position objects used to represent fuzzy positions
898 as well, for example a GenBank location like complement(<123..150)
899 would use a BeforePosition object for the start.
900 """
901
def __init__(self, start, end, strand=None, ref=None, ref_db=None):
    """Initialize the class.

    The start and end arguments give the boundaries of the feature. Each
    one may be any of the ``*Position`` objects (i.e. anything inheriting
    from Position), or simply an integer. Integers are taken to be exact
    and are wrapped in ExactPosition objects, which keeps the common case
    of non-fuzzy ends short.

    i.e. Short form:

    >>> from Bio.SeqFeature import SimpleLocation
    >>> loc = SimpleLocation(5, 10, strand=-1)
    >>> print(loc)
    [5:10](-)

    Explicit form:

    >>> from Bio.SeqFeature import SimpleLocation, ExactPosition
    >>> loc = SimpleLocation(ExactPosition(5), ExactPosition(10), strand=-1)
    >>> print(loc)
    [5:10](-)

    Other fuzzy positions are used similarly,

    >>> from Bio.SeqFeature import SimpleLocation
    >>> from Bio.SeqFeature import BeforePosition, AfterPosition
    >>> loc2 = SimpleLocation(BeforePosition(5), AfterPosition(10), strand=-1)
    >>> print(loc2)
    [<5:>10](-)

    For nucleotide features you will also want to specify the strand,
    use 1 for the forward (plus) strand, -1 for the reverse (negative)
    strand, 0 for stranded but strand unknown (? in GFF3), or None for
    when the strand does not apply (dot in GFF3), e.g. features on
    proteins.

    >>> loc = SimpleLocation(5, 10, strand=+1)
    >>> print(loc)
    [5:10](+)
    >>> print(loc.strand)
    1

    Normally feature locations are given relative to the parent
    sequence you are working with, but an explicit accession can
    be given with the optional ref and ref_db strings:

    >>> loc = SimpleLocation(105172, 108462, ref="AL391218.9", strand=1)
    >>> print(loc)
    AL391218.9[105172:108462](+)
    >>> print(loc.ref)
    AL391218.9

    Raises TypeError if start or end is neither a Position nor an integer,
    and ValueError if both are integer-like with start greater than end,
    or if the strand is not one of +1, -1, 0 or None.
    """
    # TODO - Check 0 <= start <= end (<= length of reference)
    if not isinstance(start, (Position, int)):
        raise TypeError(f"start={start!r} {type(start)}")
    # Position objects are stored untouched, bare integers become exact.
    self._start = start if isinstance(start, Position) else ExactPosition(start)

    if not isinstance(end, (Position, int)):
        raise TypeError(f"end={end!r} {type(end)}")
    self._end = end if isinstance(end, Position) else ExactPosition(end)

    # Only integer-like boundaries can be ordered; positions which are
    # not integers are exempt from this sanity check.
    both_integer_like = isinstance(self.start, int) and isinstance(self.end, int)
    if both_integer_like and self.start > self.end:
        raise ValueError(
            f"End location ({self.end}) must be greater than "
            f"or equal to start location ({self.start})"
        )

    # Assigning via the property validates the strand value.
    self.strand = strand
    self.ref = ref
    self.ref_db = ref_db
982
@staticmethod
def fromstring(text, length=None, circular=False):
    """Create a SimpleLocation object from a string.

    Arguments:
     - text - the location string using one-based inclusive counting,
       e.g. "123..150", "complement(123..150)", "123", "67^68" or
       "bond(196)", optionally prefixed with a reference and a colon.
     - length - length of the parent sequence. This is required in order
       to split a location spanning the origin of a circular sequence,
       and allows the between position "N^1" on a sequence of length N.
     - circular - set to True if the parent sequence is circular, which
       permits a location whose start is not before its end.

    Returns a SimpleLocation using zero-based Python counting. The strand
    is -1 if the text was wrapped in "complement(...)", otherwise None.
    A location spanning the origin of a circular sequence is returned as
    the sum of two SimpleLocation objects instead.

    Raises LocationParserError if the text cannot be parsed, if it looks
    like an origin spanning location but the sequence is not circular or
    its length was not given, if a between location is invalid, or if
    the start position is negative.
    """
    if text.startswith("complement("):
        # Strip the 11 character prefix and the closing bracket.
        text = text[11:-1]
        strand = -1
    else:
        strand = None
    # Try simple cases first for speed
    try:
        s, e = text.split("..")
        s = int(s) - 1
        e = int(e)
    except ValueError:
        pass
    else:
        if 0 <= s < e:
            return SimpleLocation(s, e, strand)
    # Try general case
    try:
        ref, text = text.split(":")
    except ValueError:
        # No colon (or more than one), so no external reference.
        ref = None
    m = _re_location_category.match(text)
    if m is None:
        raise LocationParserError(f"Could not parse feature location '{text}'")
    # Find the first named group which matched; it names the category.
    for key, value in m.groupdict().items():
        if value is not None:
            break
    assert value == text
    if key == "bond":
        # e.g. bond(196)
        warnings.warn(
            "Dropping bond qualifier in feature location",
            BiopythonParserWarning,
        )
        text = text[5:-1]
        s_pos = Position.fromstring(text, -1)
        e_pos = Position.fromstring(text)
    elif key == "solo":
        # e.g. "123"
        s_pos = Position.fromstring(text, -1)
        e_pos = Position.fromstring(text)
    elif key in ("pair", "within", "oneof"):
        s, e = text.split("..")
        # Attempt to fix features that span the origin
        s_pos = Position.fromstring(s, -1)
        e_pos = Position.fromstring(e)
        if s_pos >= e_pos:
            # There is likely a problem with origin wrapping.
            # Create a CompoundLocation of the wrapped feature,
            # consisting of two SimpleLocation objects to extend to
            # the list of feature locations.
            if not circular:
                raise LocationParserError(
                    f"it appears that '{text}' is a feature that spans the origin, but the sequence topology is undefined"
                )
            if length is None:
                # Without the length the first part has no end point, and
                # building it would fail with an unhelpful TypeError.
                raise LocationParserError(
                    f"it appears that '{text}' is a feature that spans the origin, but the sequence length is undefined"
                )
            warnings.warn(
                f"Attempting to fix invalid location {text!r} as "
                "it looks like incorrect origin wrapping. "
                "Please fix input file, this could have "
                "unintended behavior.",
                BiopythonParserWarning,
            )

            f1 = SimpleLocation(s_pos, length, strand)
            f2 = SimpleLocation(0, e_pos, strand)

            if strand == -1:
                # For complementary features spanning the origin
                return f2 + f1
            else:
                return f1 + f2
    elif key == "between":
        # A between location like "67^68" (one based counting) is a
        # special case (note it has zero length). In python slice
        # notation this is 67:67, a zero length slice. See Bug 2622
        # Further more, on a circular genome of length N you can have
        # a location N^1 meaning the junction at the origin. See Bug 3098.
        # NOTE - We can imagine between locations like "2^4", but this
        # is just "3". Similarly, "2^5" is just "3..4"
        s, e = text.split("^")
        s = int(s)
        e = int(e)
        if s + 1 == e or (s == length and e == 1):
            s_pos = ExactPosition(s)
            e_pos = s_pos
        else:
            raise LocationParserError(f"invalid feature location '{text}'")
    if s_pos < 0:
        raise LocationParserError(
            f"negative starting position in feature location '{text}'"
        )
    return SimpleLocation(s_pos, e_pos, strand, ref=ref)
1077
def _get_strand(self):
    """Return the stored strand; getter of the strand property (PRIVATE)."""
    return self._strand

def _set_strand(self, value):
    """Validate and store the strand; setter of the strand property (PRIVATE).

    Raises ValueError unless the value is one of +1, -1, 0 or None.
    """
    if value in (+1, -1, 0, None):
        self._strand = value
        return
    raise ValueError(f"Strand should be +1, -1, 0 or None, not {value!r}")

strand = property(
    _get_strand,
    _set_strand,
    doc="Strand of the location (+1, -1, 0 or None).",
)
1093
1094 def __str__(self):
1095 """Return a representation of the SimpleLocation object (with python counting).
1096
1097 For the simple case this uses the python splicing syntax, [122:150]
1098 (zero based counting) which GenBank would call 123..150 (one based
1099 counting).
1100 """
1101 answer = f"[{self._start}:{self._end}]"
1102 if self.ref and self.ref_db:
1103 answer = f"{self.ref_db}:{self.ref}{answer}"
1104 elif self.ref:
1105 answer = self.ref + answer
1106 # Is ref_db without ref meaningful?
1107 if self.strand is None:
1108 return answer
1109 elif self.strand == +1:
1110 return answer + "(+)"
1111 elif self.strand == -1:
1112 return answer + "(-)"
1113 else:
1114 # strand = 0, stranded but strand unknown, ? in GFF3
1115 return answer + "(?)"
1116
1117 def __repr__(self):
1118 """Represent the SimpleLocation object as a string for debugging."""
1119 optional = ""
1120 if self.strand is not None:
1121 optional += f", strand={self.strand!r}"
1122 if self.ref is not None:
1123 optional += f", ref={self.ref!r}"
1124 if self.ref_db is not None:
1125 optional += f", ref_db={self.ref_db!r}"
1126 return f"{self.__class__.__name__}({self.start!r}, {self.end!r}{optional})"
1127
def __add__(self, other):
    """Combine location with another SimpleLocation object, or shift it.

    Adding two feature locations gives a join CompoundLocation:

    >>> from Bio.SeqFeature import SimpleLocation
    >>> f1 = SimpleLocation(5, 10)
    >>> f2 = SimpleLocation(20, 30)
    >>> combined = f1 + f2
    >>> print(combined)
    join{[5:10], [20:30]}

    This is thus equivalent to:

    >>> from Bio.SeqFeature import CompoundLocation
    >>> join = CompoundLocation([f1, f2])
    >>> print(join)
    join{[5:10], [20:30]}

    You can also use sum(...) in this way:

    >>> join = sum([f1, f2])
    >>> print(join)
    join{[5:10], [20:30]}

    Furthermore, you can combine a SimpleLocation with a CompoundLocation
    in this way.

    Separately, adding an integer will give a new SimpleLocation with
    its start and end offset by that amount. For example:

    >>> print(f1)
    [5:10]
    >>> print(f1 + 100)
    [105:110]
    >>> print(200 + f1)
    [205:210]

    This can be useful when editing annotation.
    """
    if isinstance(other, SimpleLocation):
        return CompoundLocation([self, other])
    if isinstance(other, int):
        return self._shift(other)
    # Declining here lets Python try the other operand's __radd__, which
    # is how adding a CompoundLocation on the right is supported.
    return NotImplemented
1175
1176 def __radd__(self, other):
1177 """Return a SimpleLocation object by shifting the location by an integer amount."""
1178 if isinstance(other, int):
1179 return self._shift(other)
1180 else:
1181 return NotImplemented
1182
1183 def __sub__(self, other):
1184 """Subtracting an integer will shift the start and end by that amount.
1185
1186 >>> from Bio.SeqFeature import SimpleLocation
1187 >>> f1 = SimpleLocation(105, 150)
1188 >>> print(f1)
1189 [105:150]
1190 >>> print(f1 - 100)
1191 [5:50]
1192
1193 This can be useful when editing annotation. You can also add an integer
1194 to a feature location (which shifts in the opposite direction).
1195 """
1196 if isinstance(other, int):
1197 return self._shift(-other)
1198 else:
1199 return NotImplemented
1200
1201 def __nonzero__(self):
1202 """Return True regardless of the length of the feature.
1203
1204 This behavior is for backwards compatibility, since until the
1205 __len__ method was added, a SimpleLocation always evaluated as True.
1206
1207 Note that in comparison, Seq objects, strings, lists, etc, will all
1208 evaluate to False if they have length zero.
1209
1210 WARNING: The SimpleLocation may in future evaluate to False when its
1211 length is zero (in order to better match normal python behavior)!
1212 """
1213 return True
1214
1215 def __len__(self):
1216 """Return the length of the region described by the SimpleLocation object.
1217
1218 Note that extra care may be needed for fuzzy locations, e.g.
1219
1220 >>> from Bio.SeqFeature import SimpleLocation
1221 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
1222 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
1223 >>> len(loc)
1224 5
1225 """
1226 return int(self._end) - int(self._start)
1227
1228 def __contains__(self, value):
1229 """Check if an integer position is within the SimpleLocation object.
1230
1231 Note that extra care may be needed for fuzzy locations, e.g.
1232
1233 >>> from Bio.SeqFeature import SimpleLocation
1234 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
1235 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
1236 >>> len(loc)
1237 5
1238 >>> [i for i in range(15) if i in loc]
1239 [5, 6, 7, 8, 9]
1240 """
1241 if not isinstance(value, int):
1242 raise ValueError(
1243 "Currently we only support checking for integer "
1244 "positions being within a SimpleLocation."
1245 )
1246 if value < self._start or value >= self._end:
1247 return False
1248 else:
1249 return True
1250
1251 def __iter__(self):
1252 """Iterate over the parent positions within the SimpleLocation object.
1253
1254 >>> from Bio.SeqFeature import SimpleLocation
1255 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
1256 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
1257 >>> len(loc)
1258 5
1259 >>> for i in loc: print(i)
1260 5
1261 6
1262 7
1263 8
1264 9
1265 >>> list(loc)
1266 [5, 6, 7, 8, 9]
1267 >>> [i for i in range(15) if i in loc]
1268 [5, 6, 7, 8, 9]
1269
1270 Note this is strand aware:
1271
1272 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10), strand = -1)
1273 >>> list(loc)
1274 [9, 8, 7, 6, 5]
1275 """
1276 if self.strand == -1:
1277 yield from range(self._end - 1, self._start - 1, -1)
1278 else:
1279 yield from range(self._start, self._end)
1280
def __eq__(self, other):
    """Implement equality by comparing all the location attributes.

    Two locations are equal if the other is also a SimpleLocation and the
    start, end, strand, ref and ref_db all match.
    """
    if not isinstance(other, SimpleLocation):
        return False
    attribute_pairs = (
        (self._start, other.start),
        (self._end, other.end),
        (self._strand, other.strand),
        (self.ref, other.ref),
        (self.ref_db, other.ref_db),
    )
    return all(mine == theirs for mine, theirs in attribute_pairs)
1292
1293 def _shift(self, offset):
1294 """Return a copy of the SimpleLocation shifted by an offset (PRIVATE).
1295
1296 Returns self when location is relative to an external reference.
1297 """
1298 # TODO - What if offset is a fuzzy position?
1299 if self.ref or self.ref_db:
1300 return self
1301 return SimpleLocation(
1302 start=self._start + offset,
1303 end=self._end + offset,
1304 strand=self.strand,
1305 )
1306
1307 def _flip(self, length):
1308 """Return a copy of the location after the parent is reversed (PRIVATE).
1309
1310 Returns self when location is relative to an external reference.
1311 """
1312 if self.ref or self.ref_db:
1313 return self
1314 # Note this will flip the start and end too!
1315 if self.strand == +1:
1316 flip_strand = -1
1317 elif self.strand == -1:
1318 flip_strand = +1
1319 else:
1320 # 0 or None
1321 flip_strand = self.strand
1322 return SimpleLocation(
1323 start=self._end._flip(length),
1324 end=self._start._flip(length),
1325 strand=flip_strand,
1326 )
1327
@property
def parts(self):
    """Read only list of sections (always one, the SimpleLocation object).

    A convenience so that code can loop over the parts of any location,
    treating a SimpleLocation (a single part, itself) in the same way as
    a CompoundLocation (multiple parts).
    """
    sections = [self]
    return sections
1337
@property
def start(self):
    """Start location - left most (minimum) value, regardless of strand.

    Read only. The value returned is an integer like position object,
    which might be a fuzzy position.
    """
    left_most = self._start
    return left_most
1346
@property
def end(self):
    """End location - right most (maximum) value, regardless of strand.

    Read only. The value returned is an integer like position object,
    which might be a fuzzy position.
    """
    right_most = self._end
    return right_most
1355
def extract(self, parent_sequence, references=None):
    """Extract the sequence from supplied parent sequence using the SimpleLocation object.

    The parent_sequence can be a Seq like object or a string, and the
    result will generally be of the same type. The exception is a
    MutableSeq parent sequence, which gives a Seq object.
    If the location refers to another record, the optional dictionary
    references must be supplied, with the location's ref as a key.
    On the reverse strand (-1) the result is reverse complemented.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqFeature import SimpleLocation
    >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
    >>> feature_loc = SimpleLocation(8, 15)
    >>> feature_loc.extract(seq)
    Seq('VALIVIC')

    Raises ValueError if the location has a reference but no references
    were given, or the reference is not found in them.
    """
    if self.ref or self.ref_db:
        if not references:
            raise ValueError(
                f"Feature references another sequence ({self.ref}),"
                " references mandatory"
            )
        if self.ref not in references:
            # KeyError?
            raise ValueError(
                f"Feature references another sequence ({self.ref}),"
                " not found in references"
            )
        # Slice the referenced record, not the sequence passed in.
        parent_sequence = references[self.ref]
    begin = int(self.start)
    finish = int(self.end)
    extracted = parent_sequence[begin:finish]
    if isinstance(extracted, MutableSeq):
        extracted = Seq(extracted)
    if self.strand == -1:
        extracted = reverse_complement(extracted)
    return extracted
1392
1393
FeatureLocation = SimpleLocation  # OBSOLETE; for backward compatibility only.
1395
1396
class CompoundLocation(Location):
    """For handling joins etc where a feature location has several parts."""

    def __init__(self, parts, operator="join"):
        """Initialize the class.

        Arguments:
         - parts - an iterable of at least two SimpleLocation objects.
         - operator - string describing how the parts are combined,
           "join" by default.

        Raises ValueError if any part is not a SimpleLocation, or if there
        are fewer than two parts.

        >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
        >>> f1 = SimpleLocation(10, 40, strand=+1)
        >>> f2 = SimpleLocation(50, 59, strand=+1)
        >>> f = CompoundLocation([f1, f2])
        >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
        True
        >>> print(f.operator)
        join
        >>> 5 in f
        False
        >>> 15 in f
        True
        >>> f.strand
        1

        Notice that the strand of the compound location is computed
        automatically - in the case of mixed strands on the sub-locations
        the overall strand is set to None.

        >>> f = CompoundLocation([SimpleLocation(3, 6, strand=+1),
        ...                       SimpleLocation(10, 13, strand=-1)])
        >>> print(f.strand)
        None
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]

        The example above doing list(f) iterates over the coordinates within the
        feature. This allows you to use max and min on the location, to find the
        range covered:

        >>> min(f)
        3
        >>> max(f)
        12

        More generally, you can use the compound location's start and end which
        give the full span covered, 0 <= start <= end <= full sequence length.

        >>> f.start == min(f)
        True
        >>> f.end == max(f) + 1
        True

        This is consistent with the behavior of the SimpleLocation for a single
        region, where again the 'start' and 'end' do not necessarily give the
        biological start and end, but rather the 'minimal' and 'maximal'
        coordinate boundaries.

        Note that adding locations provides a more intuitive method of
        construction:

        >>> f = SimpleLocation(3, 6, strand=+1) + SimpleLocation(10, 13, strand=-1)
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]
        """
        self.operator = operator
        self.parts = list(parts)
        for loc in self.parts:
            if not isinstance(loc, SimpleLocation):
                raise ValueError(
                    "CompoundLocation should be given a list of "
                    f"SimpleLocation objects, not {loc.__class__}"
                )
        # Count the list just built rather than the argument, which may be
        # a generator or another iterable without a length.
        if len(self.parts) < 2:
            raise ValueError(
                f"CompoundLocation should have at least 2 parts, not {self.parts!r}"
            )

    def __str__(self):
        """Return a representation of the CompoundLocation object (with python counting)."""
        joined = ", ".join(str(loc) for loc in self.parts)
        return f"{self.operator}{{{joined}}}"

    def __repr__(self):
        """Represent the CompoundLocation object as string for debugging."""
        return f"{self.__class__.__name__}({self.parts!r}, {self.operator!r})"

    def _get_strand(self):
        """Get function for the strand property (PRIVATE)."""
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1. Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        if len({loc.strand for loc in self.parts}) == 1:
            return self.parts[0].strand
        else:
            return None  # i.e. mixed strands

    def _set_strand(self, value):
        """Set function for the strand property (PRIVATE)."""
        # Should this be allowed/encouraged?
        for loc in self.parts:
            loc.strand = value

    strand = property(
        fget=_get_strand,
        fset=_set_strand,
        doc="""Overall strand of the compound location.

        If all the parts have the same strand, that is returned. Otherwise
        for mixed strands, this returns None.

        >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
        >>> f1 = SimpleLocation(15, 17, strand=1)
        >>> f2 = SimpleLocation(20, 30, strand=-1)
        >>> f = f1 + f2
        >>> f1.strand
        1
        >>> f2.strand
        -1
        >>> f.strand
        >>> f.strand is None
        True

        If you set the strand of a CompoundLocation, this is applied to
        all the parts - use with caution:

        >>> f.strand = 1
        >>> f1.strand
        1
        >>> f2.strand
        1
        >>> f.strand
        1

        """,
    )

    def __add__(self, other):
        """Combine locations, or shift the location by an integer offset.

        >>> from Bio.SeqFeature import SimpleLocation
        >>> f1 = SimpleLocation(15, 17) + SimpleLocation(20, 30)
        >>> print(f1)
        join{[15:17], [20:30]}

        You can add another SimpleLocation:

        >>> print(f1 + SimpleLocation(40, 50))
        join{[15:17], [20:30], [40:50]}
        >>> print(SimpleLocation(5, 10) + f1)
        join{[5:10], [15:17], [20:30]}

        You can also add another CompoundLocation:

        >>> f2 = SimpleLocation(40, 50) + SimpleLocation(60, 70)
        >>> print(f2)
        join{[40:50], [60:70]}
        >>> print(f1 + f2)
        join{[15:17], [20:30], [40:50], [60:70]}

        Also, as with the SimpleLocation, adding an integer shifts the
        location's coordinates by that offset:

        >>> print(f1 + 100)
        join{[115:117], [120:130]}
        >>> print(200 + f1)
        join{[215:217], [220:230]}
        >>> print(f1 + (-5))
        join{[10:12], [15:25]}

        Raises ValueError when adding a CompoundLocation with a different
        operator, and NotImplementedError for any other type.
        """
        if isinstance(other, SimpleLocation):
            return CompoundLocation(self.parts + [other], self.operator)
        elif isinstance(other, CompoundLocation):
            if self.operator != other.operator:
                # Handle join+order -> order as a special case?
                raise ValueError(
                    f"Mixed operators {self.operator} and {other.operator}"
                )
            return CompoundLocation(self.parts + other.parts, self.operator)
        elif isinstance(other, int):
            return self._shift(other)
        else:
            # Raising (rather than returning NotImplemented) is the
            # long standing behavior here, kept for compatibility.
            raise NotImplementedError

    def __radd__(self, other):
        """Add a feature to the left, or shift by an integer on the left."""
        if isinstance(other, SimpleLocation):
            return CompoundLocation([other] + self.parts, self.operator)
        elif isinstance(other, int):
            return self._shift(other)
        else:
            raise NotImplementedError

    def __contains__(self, value):
        """Check if an integer position is within the CompoundLocation object."""
        return any(value in loc for loc in self.parts)

    def __bool__(self):
        """Return True regardless of the length of the feature.

        This behavior is for backwards compatibility, since until the
        __len__ method was added, a location always evaluated as True.

        Python 3 uses __bool__ for truth testing. Without it Python falls
        back on __len__, which would make a location whose parts all have
        zero length evaluate as False.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The CompoundLocation may in future evaluate to False when
        its length is zero (in order to better match normal python behavior)!
        """
        return True

    # The Python 2 spelling, which Python 3 never calls implicitly. Kept
    # as an alias for any code calling it explicitly.
    __nonzero__ = __bool__

    def __len__(self):
        """Return the length of the CompoundLocation object."""
        return sum(len(loc) for loc in self.parts)

    def __iter__(self):
        """Iterate over the parent positions within the CompoundLocation object."""
        for loc in self.parts:
            yield from loc

    def __eq__(self, other):
        """Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation."""
        if not isinstance(other, CompoundLocation):
            return False
        if len(self.parts) != len(other.parts):
            return False
        if self.operator != other.operator:
            return False
        for self_part, other_part in zip(self.parts, other.parts):
            if self_part != other_part:
                return False
        return True

    def _shift(self, offset):
        """Return a copy of the CompoundLocation shifted by an offset (PRIVATE)."""
        return CompoundLocation(
            [loc._shift(offset) for loc in self.parts], self.operator
        )

    def _flip(self, length):
        """Return a copy of the locations after the parent is reversed (PRIVATE).

        Note that the order of the parts is NOT reversed too. Consider a CDS
        on the forward strand with exons small, medium and large (in length).
        Once we change the frame of reference to the reverse complement strand,
        the start codon is still part of the small exon, and the stop codon
        still part of the large exon - so the part order remains the same!

        Here is an artificial example, where the features map to the two upper
        case regions and the lower case runs of n are not used:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SimpleLocation
        >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
        >>> small = SimpleLocation(5, 20, strand=1)
        >>> large = SimpleLocation(28, 52, strand=1)
        >>> location = small + large
        >>> print(small)
        [5:20](+)
        >>> print(large)
        [28:52](+)
        >>> print(location)
        join{[5:20](+), [28:52](+)}
        >>> for part in location.parts:
        ...     print(len(part))
        ...
        15
        24

        As you can see, this is a silly example where each "exon" is a word:

        >>> print(small.extract(dna).translate())
        SILLY
        >>> print(large.extract(dna).translate())
        EXAMPLE*
        >>> print(location.extract(dna).translate())
        SILLYEXAMPLE*
        >>> for part in location.parts:
        ...     print(part.extract(dna).translate())
        ...
        SILLY
        EXAMPLE*

        Now, let's look at this from the reverse strand frame of reference:

        >>> flipped_dna = dna.reverse_complement()
        >>> flipped_location = location._flip(len(dna))
        >>> print(flipped_location.extract(flipped_dna).translate())
        SILLYEXAMPLE*
        >>> for part in flipped_location.parts:
        ...     print(part.extract(flipped_dna).translate())
        ...
        SILLY
        EXAMPLE*

        The key point here is the first part of the CompoundLocation is still the
        small exon, while the second part is still the large exon:

        >>> for part in flipped_location.parts:
        ...     print(len(part))
        ...
        15
        24
        >>> print(flipped_location)
        join{[37:52](-), [5:29](-)}

        Notice the parts are not reversed. However, there was a bug here in older
        versions of Biopython which would have given join{[5:29](-), [37:52](-)}
        and the translation would have wrongly been "EXAMPLE*SILLY" instead.

        """
        return CompoundLocation(
            [loc._flip(length) for loc in self.parts], self.operator
        )

    @property
    def start(self):
        """Start location - left most (minimum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.

        For the special case of a CompoundLocation wrapping the origin of a
        circular genome, this will return zero.
        """
        return min(loc.start for loc in self.parts)

    @property
    def end(self):
        """End location - right most (maximum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.

        For the special case of a CompoundLocation wrapping the origin of
        a circular genome this will match the genome length.
        """
        return max(loc.end for loc in self.parts)

    @property
    def ref(self):
        """Not present in CompoundLocation, dummy method for API compatibility."""
        return None

    @property
    def ref_db(self):
        """Not present in CompoundLocation, dummy method for API compatibility."""
        return None

    def extract(self, parent_sequence, references=None):
        """Extract the sequence from supplied parent sequence using the CompoundLocation object.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.
        If the location refers to other records, they must be supplied
        in the optional dictionary references.

        Each part is extracted in turn, and the pieces are concatenated in
        the order of the parts.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> fl1 = SimpleLocation(2, 8)
        >>> fl2 = SimpleLocation(10, 15)
        >>> fl3 = CompoundLocation([fl1,fl2])
        >>> fl3.extract(seq)
        Seq('QHKAMILIVIC')

        """
        # This copes with mixed strand features & all on reverse:
        parts = [
            loc.extract(parent_sequence, references=references) for loc in self.parts
        ]
        f_seq = functools.reduce(lambda x, y: x + y, parts)
        return f_seq
1776
1777
class Position(ABC):
    """Abstract base class representing a position."""

    @abstractmethod
    def __repr__(self):
        """Represent the Position object as a string for debugging."""
        return f"{self.__class__.__name__}(...)"

    @staticmethod
    def fromstring(text, offset=0):
        """Build a Position object from the text string.

        The offset is added to the (one-based) values in the text, and
        must be either 0 or -1, otherwise a ValueError is raised.

        For an end position, leave offset as zero (default):

        >>> Position.fromstring("5")
        ExactPosition(5)

        For a start position, set offset to minus one (for Python counting):

        >>> Position.fromstring("5", -1)
        ExactPosition(4)

        This also covers fuzzy positions:

        >>> p = Position.fromstring("<5")
        >>> p
        BeforePosition(5)
        >>> print(p)
        <5
        >>> int(p)
        5

        >>> Position.fromstring(">5")
        AfterPosition(5)

        By default assumes an end position, so note the integer behavior:

        >>> p = Position.fromstring("one-of(5,8,11)")
        >>> p
        OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
        >>> print(p)
        one-of(5,8,11)
        >>> int(p)
        11

        >>> Position.fromstring("(8.10)")
        WithinPosition(10, left=8, right=10)

        Fuzzy start positions:

        >>> p = Position.fromstring("<5", -1)
        >>> p
        BeforePosition(4)
        >>> print(p)
        <4
        >>> int(p)
        4

        Notice how the integer behavior changes too!

        >>> p = Position.fromstring("one-of(5,8,11)", -1)
        >>> p
        OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
        >>> print(p)
        one-of(4,7,10)
        >>> int(p)
        4

        """
        if offset not in (0, -1):
            raise ValueError(
                "To convert one-based indices to zero-based indices, offset must be either 0 (for end positions) or -1 (for start positions)."
            )
        is_start = offset == -1
        if text == "?":
            return UnknownPosition()
        # A single leading character marks these fuzzy positions.
        if text.startswith("?"):
            return UncertainPosition(int(text[1:]) + offset)
        if text.startswith("<"):
            return BeforePosition(int(text[1:]) + offset)
        if text.startswith(">"):
            return AfterPosition(int(text[1:]) + offset)
        within = _re_within_position.match(text)
        if within is not None:
            left_text, right_text = within.groups()
            left = int(left_text) + offset
            right = int(right_text) + offset
            # A start acts like the left bound, an end like the right bound.
            return WithinPosition(left if is_start else right, left=left, right=right)
        one_of = _re_oneof_position.match(text)
        if one_of is not None:
            choices = [
                ExactPosition(int(item) + offset)
                for item in one_of.groups()[0].split(",")
            ]
            # A start acts like the smallest choice, an end like the largest.
            pick = min if is_start else max
            return OneOfPosition(pick(int(choice) for choice in choices), choices=choices)
        return ExactPosition(int(text) + offset)
1879
1880
class ExactPosition(int, Position):
    """Specify the specific position of a boundary.

    Arguments:
     - position - The position of the boundary.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.

    >>> p = ExactPosition(5)
    >>> p
    ExactPosition(5)
    >>> print(p)
    5

    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    Integer comparisons and operations should work as expected:

    >>> p == 5
    True
    >>> p < 6
    True
    >>> p <= 5
    True
    >>> p + 10
    ExactPosition(15)

    """

    def __new__(cls, position, extension=0):
        """Create an ExactPosition object.

        Raises AttributeError if given a non-zero extension.
        """
        if extension == 0:
            return int.__new__(cls, position)
        raise AttributeError(f"Non-zero extension {extension} for exact position.")

    # Must define this on Python 3.8 onwards because we redefine __repr__
    def __str__(self):
        """Return a representation of the ExactPosition object (with python counting)."""
        plain = int(self)
        return str(plain)

    def __repr__(self):
        """Represent the ExactPosition object as a string for debugging."""
        return f"{self.__class__.__name__}({int(self)})"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        shifted = int(self) + offset
        # By default preserve any subclass
        return self.__class__(shifted)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        mirrored = length - int(self)
        # By default preserve any subclass
        return self.__class__(mirrored)
1940
1941
class UncertainPosition(ExactPosition):
    """Specify a specific position which is uncertain.

    This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
    XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.

    This class defines nothing of its own, all its behavior is inherited
    from ExactPosition.
    """
1948
1949
class UnknownPosition(Position):
    """Represent a position that is unknown (it has no position value).

    UniProt uses this, e.g. ? or marked as unknown in the XML.
    """

    def __repr__(self):
        """Return a debugging string for the UnknownPosition object."""
        return "%s()" % self.__class__.__name__

    def __hash__(self):
        """Return the hash of the UnknownPosition object (that of None)."""
        return hash(None)

    def __add__(self, offset):
        """Return the position itself; the offset is ignored (PRIVATE)."""
        return self

    def _flip(self, length):
        """Return the position itself; the parent length is ignored (PRIVATE)."""
        return self
1971
1972
class WithinPosition(int, Position):
    """Specify the position of a boundary within some coordinates.

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This is for handling a location like ((11.14)..100), where the start
    of the sequence lies somewhere between 11 and 14. As this is a start
    coordinate, the object should behave as position 11 (or in Python
    counting, 10).

    >>> p = WithinPosition(10, 10, 13)
    >>> p
    WithinPosition(10, left=10, right=13)
    >>> print(p)
    (10.13)
    >>> int(p)
    10

    Basic integer comparisons and operations should work as though
    this were a plain integer:

    >>> p == 10
    True
    >>> p in [9, 10, 11]
    True
    >>> p < 11
    True
    >>> p + 10
    WithinPosition(20, left=20, right=23)

    >>> isinstance(p, WithinPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    Note this also applies for comparison to other position objects,
    where again the integer behavior is used:

    >>> p == 10
    True
    >>> p == ExactPosition(10)
    True
    >>> p == BeforePosition(10)
    True
    >>> p == AfterPosition(10)
    True

    If this were an end point, you would want the position to be 13
    (the right/larger value, not the left/smaller value as above):

    >>> p2 = WithinPosition(13, 10, 13)
    >>> p2
    WithinPosition(13, left=10, right=13)
    >>> print(p2)
    (10.13)
    >>> int(p2)
    13
    >>> p2 == 13
    True
    >>> p2 == ExactPosition(13)
    True

    """

    def __new__(cls, position, left, right):
        """Create a WithinPosition object.

        Raises RuntimeError unless position equals left or right.
        """
        if position != left and position != right:
            template = "WithinPosition: %r should match left %r or right %r"
            raise RuntimeError(template % (position, left, right))
        instance = int.__new__(cls, position)
        # int subclass, so the bounds are attached as attributes after creation
        instance._left = left
        instance._right = right
        return instance

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Needed so that instances can be pickled and unpickled.
        """
        return (int(self), self._left, self._right)

    def __repr__(self):
        """Represent the WithinPosition object as a string for debugging."""
        fields = (self.__class__.__name__, int(self), self._left, self._right)
        return "%s(%i, left=%i, right=%i)" % fields

    def __str__(self):
        """Return a representation of the WithinPosition object (with python counting)."""
        return "({}.{})".format(self._left, self._right)

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted."""
        new_position = int(self) + offset
        new_left = self._left + offset
        new_right = self._right + offset
        return self.__class__(new_position, new_left, new_right)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        new_position = length - int(self)
        # Reversal swaps the roles of the two bounds
        new_left = length - self._right
        new_right = length - self._left
        return self.__class__(new_position, new_left, new_right)
2085
2086
class BetweenPosition(int, Position):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like 123^456. This
    indicates that the start of the sequence is somewhere between
    123 and 456. It is up to the parser to set the position argument
    to either boundary point (depending on if this is being used as
    a start or end of the feature). For example as a feature end:

    >>> p = BetweenPosition(456, 123, 456)
    >>> p
    BetweenPosition(456, left=123, right=456)
    >>> print(p)
    (123^456)
    >>> int(p)
    456

    Integer equality and comparison use the given position,

    >>> p == 456
    True
    >>> p in [455, 456, 457]
    True
    >>> p > 300
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p2 = BetweenPosition(123, left=123, right=456)
    >>> int(p) == int(p2)
    False
    >>> p == 456
    True
    >>> p2 == 123
    True

    Note this potentially surprising behavior:

    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
    True

    i.e. For equality (and sorting) the position objects behave like
    integers.

    """

    def __new__(cls, position, left, right):
        """Create a new instance in BetweenPosition object.

        Raises AssertionError unless position equals left or right.
        """
        # Raised explicitly rather than via an ``assert`` statement so that
        # the check is still enforced when Python runs with -O (which strips
        # assert statements). The exception type is unchanged.
        if not (position == left or position == right):
            raise AssertionError(
                f"BetweenPosition: {position!r} should match left {left!r} "
                f"or right {right!r}"
            )
        # TODO - public API for getting left/right, especially the unknown one
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Necessary to allow pickling and unpickling of class instances.
        """
        return (int(self), self._left, self._right)

    def __repr__(self):
        """Represent the BetweenPosition object as a string for debugging."""
        return "%s(%i, left=%i, right=%i)" % (
            self.__class__.__name__,
            int(self),
            self._left,
            self._right,
        )

    def __str__(self):
        """Return a representation of the BetweenPosition object (with python counting)."""
        return f"({self._left}^{self._right})"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(
            int(self) + offset, self._left + offset, self._right + offset
        )

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # Reversal swaps the roles of the left and right bounds
        return self.__class__(
            length - int(self), length - self._right, length - self._left
        )
2186
2187
class BeforePosition(int, Position):
    """Specify a position where the actual location occurs before it.

    Arguments:
     - position - The upper boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.

    >>> p = BeforePosition(5)
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5
    >>> p + 10
    BeforePosition(15)

    Note this potentially surprising behavior:

    >>> p == ExactPosition(5)
    True
    >>> p == AfterPosition(5)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create a new instance of the BeforePosition object.

        Raises AttributeError if extension is not zero.
        """
        if extension != 0:
            # Name the actual class; the message used to say "exact position"
            raise AttributeError(
                f"Non-zero extension {extension} for {cls.__name__}."
            )
        return int.__new__(cls, position)

    def __repr__(self):
        """Represent the location as a string for debugging."""
        return "%s(%i)" % (self.__class__.__name__, int(self))

    def __str__(self):
        """Return a representation of the BeforePosition object (with python counting)."""
        return f"<{int(self)}"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE).

        The result is an AfterPosition, not a BeforePosition.
        """
        return AfterPosition(length - int(self))
2243
2244
class AfterPosition(int, Position):
    """Specify a position where the actual location is found after it.

    Arguments:
     - position - The lower boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.

    >>> p = AfterPosition(7)
    >>> p
    AfterPosition(7)
    >>> print(p)
    >7
    >>> int(p)
    7
    >>> p + 10
    AfterPosition(17)

    >>> isinstance(p, AfterPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    Note this potentially surprising behavior:

    >>> p == ExactPosition(7)
    True
    >>> p == BeforePosition(7)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create a new instance of the AfterPosition object.

        Raises AttributeError if extension is not zero.
        """
        if extension != 0:
            # Name the actual class; the message used to say "exact position"
            raise AttributeError(
                f"Non-zero extension {extension} for {cls.__name__}."
            )
        return int.__new__(cls, position)

    def __repr__(self):
        """Represent the location as a string for debugging."""
        return "%s(%i)" % (self.__class__.__name__, int(self))

    def __str__(self):
        """Return a representation of the AfterPosition object (with python counting)."""
        return f">{int(self)}"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE).

        The result is a BeforePosition, not an AfterPosition.
        """
        return BeforePosition(length - int(self))
2307
2308
class OneOfPosition(int, Position):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. If this was
    a start position it should act like 1888, but as an end position 1901.

    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
    >>> p
    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
    >>> int(p)
    1888

    Integer comparisons and operators act like using int(p),

    >>> p == 1888
    True
    >>> p <= 1888
    True
    >>> p > 1888
    False
    >>> p + 100
    OneOfPosition(1988, choices=[ExactPosition(1988), ExactPosition(2001)])

    >>> isinstance(p, OneOfPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    """

    def __new__(cls, position, choices):
        """Initialize with a set of possible positions.

        choices is a list of Position derived objects, specifying possible
        locations.

        position is an integer specifying the default behavior.

        Raises ValueError if position is not one of the choices.
        """
        if position not in choices:
            raise ValueError(
                f"OneOfPosition: {position!r} should match one of {choices!r}"
            )
        obj = int.__new__(cls, position)
        obj.position_choices = choices
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Necessary to allow pickling and unpickling of class instances.
        """
        return (int(self), self.position_choices)

    def __repr__(self):
        """Represent the OneOfPosition object as a string for debugging."""
        return "%s(%i, choices=%r)" % (
            self.__class__.__name__,
            int(self),
            self.position_choices,
        )

    def __str__(self):
        """Return a representation of the OneOfPosition object (with python counting)."""
        # Join rather than append-then-trim, so that an empty list of choices
        # gives "one-of()" instead of losing the opening parenthesis.
        choices = ",".join(f"{position}" for position in self.position_choices)
        return f"one-of({choices})"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(
            int(self) + offset, [p + offset for p in self.position_choices]
        )

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # Each choice is flipped and the order of the choices is reversed
        return self.__class__(
            length - int(self), [p._flip(length) for p in self.position_choices[::-1]]
        )
2392
2393
if __name__ == "__main__":
    # Only when this file is executed directly as a script: call the
    # run_doctest helper from Bio._utils (imported here, not at module level).
    from Bio._utils import run_doctest

    run_doctest()