annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/SeqFeature.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 # Copyright 2000-2003 Jeff Chang.
jpayne@69 2 # Copyright 2001-2008 Brad Chapman.
jpayne@69 3 # Copyright 2005-2024 by Peter Cock.
jpayne@69 4 # Copyright 2006-2009 Michiel de Hoon.
jpayne@69 5 # All rights reserved.
jpayne@69 6 #
jpayne@69 7 # This file is part of the Biopython distribution and governed by your
jpayne@69 8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@69 9 # Please see the LICENSE file that should have been included as part of this
jpayne@69 10 # package.
jpayne@69 11 """Represent a Sequence Feature holding info about a part of a sequence.
jpayne@69 12
jpayne@69 13 This is heavily modeled after the Biocorba SeqFeature objects, and
jpayne@69 14 may be pretty biased towards GenBank stuff since I'm writing it
jpayne@69 15 for the GenBank parser output...
jpayne@69 16
jpayne@69 17 What's here:
jpayne@69 18
jpayne@69 19 Base class to hold a Feature
jpayne@69 20 ----------------------------
jpayne@69 21
jpayne@69 22 Classes:
jpayne@69 23 - SeqFeature
jpayne@69 24
jpayne@69 25 Hold information about a Reference
jpayne@69 26 ----------------------------------
jpayne@69 27
jpayne@69 28 This is an attempt to create a General class to hold Reference type
jpayne@69 29 information.
jpayne@69 30
jpayne@69 31 Classes:
jpayne@69 32 - Reference
jpayne@69 33
jpayne@69 34 Specify locations of a feature on a Sequence
jpayne@69 35 --------------------------------------------
jpayne@69 36
jpayne@69 37 This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'.
jpayne@69 38 This has the advantages of allowing us to handle fuzzy stuff in case anyone
jpayne@69 39 needs it, and also be compatible with BioPerl etc and BioSQL.
jpayne@69 40
jpayne@69 41 Classes:
jpayne@69 42 - Location - abstract base class of SimpleLocation and CompoundLocation.
jpayne@69 43 - SimpleLocation - Specify the start and end location of a feature.
jpayne@69 44 - CompoundLocation - Collection of SimpleLocation objects (for joins etc).
jpayne@69 45 - Position - abstract base class of ExactPosition, WithinPosition,
jpayne@69 46 BetweenPosition, AfterPosition, OneOfPosition, UncertainPosition, and
jpayne@69 47 UnknownPosition.
jpayne@69 48 - ExactPosition - Specify the position as being exact.
jpayne@69 49 - WithinPosition - Specify a position occurring within some range.
jpayne@69 50 - BetweenPosition - Specify a position occurring between a range (OBSOLETE?).
jpayne@69 51 - BeforePosition - Specify the position as being found before some base.
jpayne@69 52 - AfterPosition - Specify the position as being found after some base.
jpayne@69 53 - OneOfPosition - Specify a position consisting of multiple alternative positions.
jpayne@69 54 - UncertainPosition - Specify a specific position which is uncertain.
jpayne@69 55 - UnknownPosition - Represents missing information like '?' in UniProt.
jpayne@69 56
jpayne@69 57
jpayne@69 58 Exceptions:
jpayne@69 59 - LocationParserError - Exception indicating a failure to parse a location
jpayne@69 60 string.
jpayne@69 61
jpayne@69 62 """
jpayne@69 63 import functools
jpayne@69 64 import re
jpayne@69 65 import warnings
jpayne@69 66 from abc import ABC, abstractmethod
jpayne@69 67
jpayne@69 68 from Bio import BiopythonDeprecationWarning
jpayne@69 69 from Bio import BiopythonParserWarning
jpayne@69 70 from Bio.Seq import MutableSeq
jpayne@69 71 from Bio.Seq import reverse_complement
jpayne@69 72 from Bio.Seq import Seq
jpayne@69 73
jpayne@69 74
jpayne@69 75 # Regular expressions for location parsing
jpayne@69 76
jpayne@69 77 _reference = r"(?:[a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)"
jpayne@69 78 _oneof_position = r"one\-of\(\d+[,\d+]+\)"
jpayne@69 79
jpayne@69 80 _oneof_location = rf"[<>]?(?:\d+|{_oneof_position})\.\.[<>]?(?:\d+|{_oneof_position})"
jpayne@69 81
jpayne@69 82 _any_location = rf"({_reference}?{_oneof_location}|complement\({_oneof_location}\)|[^,]+|complement\([^,]+\))"
jpayne@69 83
jpayne@69 84 _split = re.compile(_any_location).split
jpayne@69 85
jpayne@69 86 assert _split("123..145")[1::2] == ["123..145"]
jpayne@69 87 assert _split("123..145,200..209")[1::2] == ["123..145", "200..209"]
jpayne@69 88 assert _split("one-of(200,203)..300")[1::2] == ["one-of(200,203)..300"]
jpayne@69 89 assert _split("complement(123..145),200..209")[1::2] == [
jpayne@69 90 "complement(123..145)",
jpayne@69 91 "200..209",
jpayne@69 92 ]
jpayne@69 93 assert _split("123..145,one-of(200,203)..209")[1::2] == [
jpayne@69 94 "123..145",
jpayne@69 95 "one-of(200,203)..209",
jpayne@69 96 ]
jpayne@69 97 assert _split("123..145,one-of(200,203)..one-of(209,211),300")[1::2] == [
jpayne@69 98 "123..145",
jpayne@69 99 "one-of(200,203)..one-of(209,211)",
jpayne@69 100 "300",
jpayne@69 101 ]
jpayne@69 102 assert _split("123..145,complement(one-of(200,203)..one-of(209,211)),300")[1::2] == [
jpayne@69 103 "123..145",
jpayne@69 104 "complement(one-of(200,203)..one-of(209,211))",
jpayne@69 105 "300",
jpayne@69 106 ]
jpayne@69 107 assert _split("123..145,200..one-of(209,211),300")[1::2] == [
jpayne@69 108 "123..145",
jpayne@69 109 "200..one-of(209,211)",
jpayne@69 110 "300",
jpayne@69 111 ]
jpayne@69 112 assert _split("123..145,200..one-of(209,211)")[1::2] == [
jpayne@69 113 "123..145",
jpayne@69 114 "200..one-of(209,211)",
jpayne@69 115 ]
jpayne@69 116 assert _split(
jpayne@69 117 "complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"
jpayne@69 118 )[1::2] == [
jpayne@69 119 "complement(149815..150200)",
jpayne@69 120 "complement(293787..295573)",
jpayne@69 121 "NC_016402.1:6618..6676",
jpayne@69 122 "181647..181905",
jpayne@69 123 ]
jpayne@69 124
jpayne@69 125
jpayne@69 126 _pair_location = r"[<>]?-?\d+\.\.[<>]?-?\d+"
jpayne@69 127
jpayne@69 128 _between_location = r"\d+\^\d+"
jpayne@69 129
jpayne@69 130 _within_position = r"\(\d+\.\d+\)"
jpayne@69 131 _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (
jpayne@69 132 _within_position,
jpayne@69 133 _within_position,
jpayne@69 134 )
jpayne@69 135 _within_position = r"\((\d+)\.(\d+)\)"
jpayne@69 136 _re_within_position = re.compile(_within_position)
jpayne@69 137 assert _re_within_position.match("(3.9)")
jpayne@69 138
jpayne@69 139 _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (_oneof_position, _oneof_position)
jpayne@69 140 _oneof_position = r"one\-of\((\d+[,\d+]+)\)"
jpayne@69 141 _re_oneof_position = re.compile(_oneof_position)
jpayne@69 142 assert _re_oneof_position.match("one-of(6,9)")
jpayne@69 143 assert not _re_oneof_position.match("one-of(3)")
jpayne@69 144 assert _re_oneof_position.match("one-of(3,6)")
jpayne@69 145 assert _re_oneof_position.match("one-of(3,6,9)")
jpayne@69 146
jpayne@69 147 _solo_location = r"[<>]?\d+"
jpayne@69 148 _solo_bond = r"bond\(%s\)" % _solo_location
jpayne@69 149
jpayne@69 150 _re_location_category = re.compile(
jpayne@69 151 r"^(?P<pair>%s)|(?P<between>%s)|(?P<within>%s)|(?P<oneof>%s)|(?P<bond>%s)|(?P<solo>%s)$"
jpayne@69 152 % (
jpayne@69 153 _pair_location,
jpayne@69 154 _between_location,
jpayne@69 155 _within_location,
jpayne@69 156 _oneof_location,
jpayne@69 157 _solo_bond,
jpayne@69 158 _solo_location,
jpayne@69 159 )
jpayne@69 160 )
jpayne@69 161
jpayne@69 162
jpayne@69 163 class LocationParserError(ValueError):
jpayne@69 164 """Could not parse a feature location string."""
jpayne@69 165
jpayne@69 166
jpayne@69 167 class SeqFeature:
jpayne@69 168 """Represent a Sequence Feature on an object.
jpayne@69 169
jpayne@69 170 Attributes:
jpayne@69 171 - location - the location of the feature on the sequence (SimpleLocation)
jpayne@69 172 - type - the specified type of the feature (ie. CDS, exon, repeat...)
jpayne@69 173 - id - A string identifier for the feature.
jpayne@69 174 - qualifiers - A dictionary of qualifiers on the feature. These are
jpayne@69 175 analogous to the qualifiers from a GenBank feature table. The keys of
jpayne@69 176 the dictionary are qualifier names, the values are the qualifier
jpayne@69 177 values.
jpayne@69 178
jpayne@69 179 """
jpayne@69 180
jpayne@69 181 def __init__(
jpayne@69 182 self,
jpayne@69 183 location=None,
jpayne@69 184 type="",
jpayne@69 185 id="<unknown id>",
jpayne@69 186 qualifiers=None,
jpayne@69 187 sub_features=None,
jpayne@69 188 ):
jpayne@69 189 """Initialize a SeqFeature on a sequence.
jpayne@69 190
jpayne@69 191 location can either be a SimpleLocation (with strand argument also
jpayne@69 192 given if required), or None.
jpayne@69 193
jpayne@69 194 e.g. With no strand, on the forward strand, and on the reverse strand:
jpayne@69 195
jpayne@69 196 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
jpayne@69 197 >>> f1 = SeqFeature(SimpleLocation(5, 10), type="domain")
jpayne@69 198 >>> f1.location.strand == None
jpayne@69 199 True
jpayne@69 200 >>> f2 = SeqFeature(SimpleLocation(7, 110, strand=1), type="CDS")
jpayne@69 201 >>> f2.location.strand == +1
jpayne@69 202 True
jpayne@69 203 >>> f3 = SeqFeature(SimpleLocation(9, 108, strand=-1), type="CDS")
jpayne@69 204 >>> f3.location.strand == -1
jpayne@69 205 True
jpayne@69 206
jpayne@69 207 For exact start/end positions, an integer can be used (as shown above)
jpayne@69 208 as shorthand for the ExactPosition object. For non-exact locations, the
jpayne@69 209 SimpleLocation must be specified via the appropriate position objects.
jpayne@69 210 """
jpayne@69 211 if (
jpayne@69 212 location is not None
jpayne@69 213 and not isinstance(location, SimpleLocation)
jpayne@69 214 and not isinstance(location, CompoundLocation)
jpayne@69 215 ):
jpayne@69 216 raise TypeError(
jpayne@69 217 "SimpleLocation, CompoundLocation (or None) required for the location"
jpayne@69 218 )
jpayne@69 219 self.location = location
jpayne@69 220 self.type = type
jpayne@69 221 self.id = id
jpayne@69 222 self.qualifiers = {}
jpayne@69 223 if qualifiers is not None:
jpayne@69 224 self.qualifiers.update(qualifiers)
jpayne@69 225 if sub_features is not None:
jpayne@69 226 raise TypeError("Rather than sub_features, use a CompoundLocation")
jpayne@69 227
jpayne@69 228 def _get_strand(self):
jpayne@69 229 """Get function for the strand property (PRIVATE)."""
jpayne@69 230 warnings.warn(
jpayne@69 231 "Please use .location.strand rather than .strand",
jpayne@69 232 BiopythonDeprecationWarning,
jpayne@69 233 )
jpayne@69 234 return self.location.strand
jpayne@69 235
jpayne@69 236 def _set_strand(self, value):
jpayne@69 237 """Set function for the strand property (PRIVATE)."""
jpayne@69 238 warnings.warn(
jpayne@69 239 "Please use .location.strand rather than .strand",
jpayne@69 240 BiopythonDeprecationWarning,
jpayne@69 241 )
jpayne@69 242 try:
jpayne@69 243 self.location.strand = value
jpayne@69 244 except AttributeError:
jpayne@69 245 if self.location is None:
jpayne@69 246 if value is not None:
jpayne@69 247 raise ValueError("Can't set strand without a location.") from None
jpayne@69 248 else:
jpayne@69 249 raise
jpayne@69 250
jpayne@69 251 strand = property(
jpayne@69 252 fget=_get_strand,
jpayne@69 253 fset=_set_strand,
jpayne@69 254 doc="Alias for the location's strand (DEPRECATED).",
jpayne@69 255 )
jpayne@69 256
jpayne@69 257 def _get_ref(self):
jpayne@69 258 """Get function for the reference property (PRIVATE)."""
jpayne@69 259 warnings.warn(
jpayne@69 260 "Please use .location.ref rather than .ref",
jpayne@69 261 BiopythonDeprecationWarning,
jpayne@69 262 )
jpayne@69 263 try:
jpayne@69 264 return self.location.ref
jpayne@69 265 except AttributeError:
jpayne@69 266 return None
jpayne@69 267
jpayne@69 268 def _set_ref(self, value):
jpayne@69 269 """Set function for the reference property (PRIVATE)."""
jpayne@69 270 warnings.warn(
jpayne@69 271 "Please use .location.ref rather than .ref",
jpayne@69 272 BiopythonDeprecationWarning,
jpayne@69 273 )
jpayne@69 274 try:
jpayne@69 275 self.location.ref = value
jpayne@69 276 except AttributeError:
jpayne@69 277 if self.location is None:
jpayne@69 278 if value is not None:
jpayne@69 279 raise ValueError("Can't set ref without a location.") from None
jpayne@69 280 else:
jpayne@69 281 raise
jpayne@69 282
jpayne@69 283 ref = property(
jpayne@69 284 fget=_get_ref,
jpayne@69 285 fset=_set_ref,
jpayne@69 286 doc="Alias for the location's ref (DEPRECATED).",
jpayne@69 287 )
jpayne@69 288
jpayne@69 289 def _get_ref_db(self):
jpayne@69 290 """Get function for the database reference property (PRIVATE)."""
jpayne@69 291 warnings.warn(
jpayne@69 292 "Please use .location.ref_db rather than .ref_db",
jpayne@69 293 BiopythonDeprecationWarning,
jpayne@69 294 )
jpayne@69 295 try:
jpayne@69 296 return self.location.ref_db
jpayne@69 297 except AttributeError:
jpayne@69 298 return None
jpayne@69 299
jpayne@69 300 def _set_ref_db(self, value):
jpayne@69 301 """Set function for the database reference property (PRIVATE)."""
jpayne@69 302 warnings.warn(
jpayne@69 303 "Please use .location.ref_db rather than .ref_db",
jpayne@69 304 BiopythonDeprecationWarning,
jpayne@69 305 )
jpayne@69 306 self.location.ref_db = value
jpayne@69 307
jpayne@69 308 ref_db = property(
jpayne@69 309 fget=_get_ref_db,
jpayne@69 310 fset=_set_ref_db,
jpayne@69 311 doc="Alias for the location's ref_db (DEPRECATED).",
jpayne@69 312 )
jpayne@69 313
jpayne@69 314 def __eq__(self, other):
jpayne@69 315 """Check if two SeqFeature objects should be considered equal."""
jpayne@69 316 return (
jpayne@69 317 isinstance(other, SeqFeature)
jpayne@69 318 and self.id == other.id
jpayne@69 319 and self.type == other.type
jpayne@69 320 and self.location == other.location
jpayne@69 321 and self.qualifiers == other.qualifiers
jpayne@69 322 )
jpayne@69 323
jpayne@69 324 def __repr__(self):
jpayne@69 325 """Represent the feature as a string for debugging."""
jpayne@69 326 answer = f"{self.__class__.__name__}({self.location!r}"
jpayne@69 327 if self.type:
jpayne@69 328 answer += f", type={self.type!r}"
jpayne@69 329 if self.id and self.id != "<unknown id>":
jpayne@69 330 answer += f", id={self.id!r}"
jpayne@69 331 if self.qualifiers:
jpayne@69 332 answer += ", qualifiers=..."
jpayne@69 333 answer += ")"
jpayne@69 334 return answer
jpayne@69 335
jpayne@69 336 def __str__(self):
jpayne@69 337 """Return the full feature as a python string."""
jpayne@69 338 out = f"type: {self.type}\n"
jpayne@69 339 out += f"location: {self.location}\n"
jpayne@69 340 if self.id and self.id != "<unknown id>":
jpayne@69 341 out += f"id: {self.id}\n"
jpayne@69 342 out += "qualifiers:\n"
jpayne@69 343 for qual_key in sorted(self.qualifiers):
jpayne@69 344 out += f" Key: {qual_key}, Value: {self.qualifiers[qual_key]}\n"
jpayne@69 345 return out
jpayne@69 346
jpayne@69 347 def _shift(self, offset):
jpayne@69 348 """Return a copy of the feature with its location shifted (PRIVATE).
jpayne@69 349
jpayne@69 350 The annotation qualifiers are copied.
jpayne@69 351 """
jpayne@69 352 return SeqFeature(
jpayne@69 353 location=self.location._shift(offset),
jpayne@69 354 type=self.type,
jpayne@69 355 id=self.id,
jpayne@69 356 qualifiers=self.qualifiers.copy(),
jpayne@69 357 )
jpayne@69 358
jpayne@69 359 def _flip(self, length):
jpayne@69 360 """Return a copy of the feature with its location flipped (PRIVATE).
jpayne@69 361
jpayne@69 362 The argument length gives the length of the parent sequence. For
jpayne@69 363 example a location 0..20 (+1 strand) with parent length 30 becomes
jpayne@69 364 after flipping 10..30 (-1 strand). Strandless (None) or unknown
jpayne@69 365 strand (0) remain like that - just their end points are changed.
jpayne@69 366
jpayne@69 367 The annotation qualifiers are copied.
jpayne@69 368 """
jpayne@69 369 return SeqFeature(
jpayne@69 370 location=self.location._flip(length),
jpayne@69 371 type=self.type,
jpayne@69 372 id=self.id,
jpayne@69 373 qualifiers=self.qualifiers.copy(),
jpayne@69 374 )
jpayne@69 375
jpayne@69 376 def extract(self, parent_sequence, references=None):
jpayne@69 377 """Extract the feature's sequence from supplied parent sequence.
jpayne@69 378
jpayne@69 379 The parent_sequence can be a Seq like object or a string, and will
jpayne@69 380 generally return an object of the same type. The exception to this is
jpayne@69 381 a MutableSeq as the parent sequence will return a Seq object.
jpayne@69 382
jpayne@69 383 This should cope with complex locations including complements, joins
jpayne@69 384 and fuzzy positions. Even mixed strand features should work! This
jpayne@69 385 also covers features on protein sequences (e.g. domains), although
jpayne@69 386 here reverse strand features are not permitted. If the
jpayne@69 387 location refers to other records, they must be supplied in the
jpayne@69 388 optional dictionary references.
jpayne@69 389
jpayne@69 390 >>> from Bio.Seq import Seq
jpayne@69 391 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
jpayne@69 392 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
jpayne@69 393 >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
jpayne@69 394 >>> f.extract(seq)
jpayne@69 395 Seq('VALIVIC')
jpayne@69 396
jpayne@69 397 If the SimpleLocation is None, e.g. when parsing invalid locus
jpayne@69 398 locations in the GenBank parser, extract() will raise a ValueError.
jpayne@69 399
jpayne@69 400 >>> from Bio.Seq import Seq
jpayne@69 401 >>> from Bio.SeqFeature import SeqFeature
jpayne@69 402 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
jpayne@69 403 >>> f = SeqFeature(None, type="domain")
jpayne@69 404 >>> f.extract(seq)
jpayne@69 405 Traceback (most recent call last):
jpayne@69 406 ...
jpayne@69 407 ValueError: The feature's .location is None. Check the sequence file for a valid location.
jpayne@69 408
jpayne@69 409 Note - currently only compound features of type "join" are supported.
jpayne@69 410 """
jpayne@69 411 if self.location is None:
jpayne@69 412 raise ValueError(
jpayne@69 413 "The feature's .location is None. Check the "
jpayne@69 414 "sequence file for a valid location."
jpayne@69 415 )
jpayne@69 416 return self.location.extract(parent_sequence, references=references)
jpayne@69 417
jpayne@69 418 def translate(
jpayne@69 419 self,
jpayne@69 420 parent_sequence,
jpayne@69 421 table="Standard",
jpayne@69 422 start_offset=None,
jpayne@69 423 stop_symbol="*",
jpayne@69 424 to_stop=False,
jpayne@69 425 cds=None,
jpayne@69 426 gap=None,
jpayne@69 427 ):
jpayne@69 428 """Get a translation of the feature's sequence.
jpayne@69 429
jpayne@69 430 This method is intended for CDS or other features that code proteins
jpayne@69 431 and is a shortcut that will both extract the feature and
jpayne@69 432 translate it, taking into account the codon_start and transl_table
jpayne@69 433 qualifiers, if they are present. If they are not present the
jpayne@69 434 value of the arguments "table" and "start_offset" are used.
jpayne@69 435
jpayne@69 436 The "cds" parameter is set to "True" if the feature is of type
jpayne@69 437 "CDS" but can be overridden by giving an explicit argument.
jpayne@69 438
jpayne@69 439 The arguments stop_symbol, to_stop and gap have the same meaning
jpayne@69 440 as Seq.translate, refer to that documentation for further information.
jpayne@69 441
jpayne@69 442 Arguments:
jpayne@69 443 - parent_sequence - A DNA or RNA sequence.
jpayne@69 444 - table - Which codon table to use if there is no transl_table
jpayne@69 445 qualifier for this feature. This can be either a name
jpayne@69 446 (string), an NCBI identifier (integer), or a CodonTable
jpayne@69 447 object (useful for non-standard genetic codes). This
jpayne@69 448 defaults to the "Standard" table.
jpayne@69 449 - start_offset - offset at which the first complete codon of a
jpayne@69 450 coding feature can be found, relative to the first base of
jpayne@69 451 that feature. Has a valid value of 0, 1 or 2. NOTE: this
jpayne@69 452 uses python's 0-based numbering whereas the codon_start
jpayne@69 453 qualifier in files from NCBI use 1-based numbering.
jpayne@69 454 Will override a codon_start qualifier
jpayne@69 455
jpayne@69 456 >>> from Bio.Seq import Seq
jpayne@69 457 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
jpayne@69 458 >>> seq = Seq("GGTTACACTTACCGATAATGTCTCTGATGA")
jpayne@69 459 >>> f = SeqFeature(SimpleLocation(0, 30), type="CDS")
jpayne@69 460 >>> f.qualifiers['transl_table'] = [11]
jpayne@69 461
jpayne@69 462 Note that features of type CDS are subject to the usual
jpayne@69 463 checks at translation. But you can override this behavior
jpayne@69 464 by giving explicit arguments:
jpayne@69 465
jpayne@69 466 >>> f.translate(seq, cds=False)
jpayne@69 467 Seq('GYTYR*CL**')
jpayne@69 468
jpayne@69 469 Now use the start_offset argument to change the frame. Note
jpayne@69 470 this uses python 0-based numbering.
jpayne@69 471
jpayne@69 472 >>> f.translate(seq, start_offset=1, cds=False)
jpayne@69 473 Seq('VTLTDNVSD')
jpayne@69 474
jpayne@69 475 Alternatively use the codon_start qualifier to do the same
jpayne@69 476 thing. Note: this uses 1-based numbering, which is found
jpayne@69 477 in files from NCBI.
jpayne@69 478
jpayne@69 479 >>> f.qualifiers['codon_start'] = [2]
jpayne@69 480 >>> f.translate(seq, cds=False)
jpayne@69 481 Seq('VTLTDNVSD')
jpayne@69 482 """
jpayne@69 483 # see if this feature should be translated in a different
jpayne@69 484 # frame using the "codon_start" qualifier
jpayne@69 485 if start_offset is None:
jpayne@69 486 try:
jpayne@69 487 start_offset = int(self.qualifiers["codon_start"][0]) - 1
jpayne@69 488 except KeyError:
jpayne@69 489 start_offset = 0
jpayne@69 490
jpayne@69 491 if start_offset not in [0, 1, 2]:
jpayne@69 492 raise ValueError(
jpayne@69 493 "The start_offset must be 0, 1, or 2. "
jpayne@69 494 f"The supplied value is {start_offset}. "
jpayne@69 495 "Check the value of either the codon_start qualifier "
jpayne@69 496 "or the start_offset argument"
jpayne@69 497 )
jpayne@69 498
jpayne@69 499 feat_seq = self.extract(parent_sequence)[start_offset:]
jpayne@69 500 codon_table = self.qualifiers.get("transl_table", [table])[0]
jpayne@69 501
jpayne@69 502 if cds is None:
jpayne@69 503 cds = self.type == "CDS"
jpayne@69 504
jpayne@69 505 return feat_seq.translate(
jpayne@69 506 table=codon_table,
jpayne@69 507 stop_symbol=stop_symbol,
jpayne@69 508 to_stop=to_stop,
jpayne@69 509 cds=cds,
jpayne@69 510 gap=gap,
jpayne@69 511 )
jpayne@69 512
jpayne@69 513 def __bool__(self):
jpayne@69 514 """Boolean value of an instance of this class (True).
jpayne@69 515
jpayne@69 516 This behavior is for backwards compatibility, since until the
jpayne@69 517 __len__ method was added, a SeqFeature always evaluated as True.
jpayne@69 518
jpayne@69 519 Note that in comparison, Seq objects, strings, lists, etc, will all
jpayne@69 520 evaluate to False if they have length zero.
jpayne@69 521
jpayne@69 522 WARNING: The SeqFeature may in future evaluate to False when its
jpayne@69 523 length is zero (in order to better match normal python behavior)!
jpayne@69 524 """
jpayne@69 525 return True
jpayne@69 526
jpayne@69 527 def __len__(self):
jpayne@69 528 """Return the length of the region where the feature is located.
jpayne@69 529
jpayne@69 530 >>> from Bio.Seq import Seq
jpayne@69 531 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
jpayne@69 532 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
jpayne@69 533 >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
jpayne@69 534 >>> len(f)
jpayne@69 535 7
jpayne@69 536 >>> f.extract(seq)
jpayne@69 537 Seq('VALIVIC')
jpayne@69 538 >>> len(f.extract(seq))
jpayne@69 539 7
jpayne@69 540
jpayne@69 541 This is a proxy for taking the length of the feature's location:
jpayne@69 542
jpayne@69 543 >>> len(f.location)
jpayne@69 544 7
jpayne@69 545
jpayne@69 546 For simple features this is the same as the region spanned (end
jpayne@69 547 position minus start position using Pythonic counting). However, for
jpayne@69 548 a compound location (e.g. a CDS as the join of several exons) the
jpayne@69 549 gaps are not counted (e.g. introns). This ensures that len(f) matches
jpayne@69 550 len(f.extract(parent_seq)), and also makes sure things work properly
jpayne@69 551 with features wrapping the origin etc.
jpayne@69 552 """
jpayne@69 553 return len(self.location)
jpayne@69 554
jpayne@69 555 def __iter__(self):
jpayne@69 556 """Iterate over the parent positions within the feature.
jpayne@69 557
jpayne@69 558 The iteration order is strand aware, and can be thought of as moving
jpayne@69 559 along the feature using the parent sequence coordinates:
jpayne@69 560
jpayne@69 561 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
jpayne@69 562 >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
jpayne@69 563 >>> len(f)
jpayne@69 564 5
jpayne@69 565 >>> for i in f: print(i)
jpayne@69 566 9
jpayne@69 567 8
jpayne@69 568 7
jpayne@69 569 6
jpayne@69 570 5
jpayne@69 571 >>> list(f)
jpayne@69 572 [9, 8, 7, 6, 5]
jpayne@69 573
jpayne@69 574 This is a proxy for iterating over the location,
jpayne@69 575
jpayne@69 576 >>> list(f.location)
jpayne@69 577 [9, 8, 7, 6, 5]
jpayne@69 578 """
jpayne@69 579 return iter(self.location)
jpayne@69 580
jpayne@69 581 def __contains__(self, value):
jpayne@69 582 """Check if an integer position is within the feature.
jpayne@69 583
jpayne@69 584 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
jpayne@69 585 >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
jpayne@69 586 >>> len(f)
jpayne@69 587 5
jpayne@69 588 >>> [i for i in range(15) if i in f]
jpayne@69 589 [5, 6, 7, 8, 9]
jpayne@69 590
jpayne@69 591 For example, to see which features include a SNP position, you could
jpayne@69 592 use this:
jpayne@69 593
jpayne@69 594 >>> from Bio import SeqIO
jpayne@69 595 >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
jpayne@69 596 >>> for f in record.features:
jpayne@69 597 ... if 1750 in f:
jpayne@69 598 ... print("%s %s" % (f.type, f.location))
jpayne@69 599 source [0:154478](+)
jpayne@69 600 gene [1716:4347](-)
jpayne@69 601 tRNA join{[4310:4347](-), [1716:1751](-)}
jpayne@69 602
jpayne@69 603 Note that for a feature defined as a join of several subfeatures (e.g.
jpayne@69 604 the union of several exons) the gaps are not checked (e.g. introns).
jpayne@69 605 In this example, the tRNA location is defined in the GenBank file as
jpayne@69 606 complement(join(1717..1751,4311..4347)), so that position 1760 falls
jpayne@69 607 in the gap:
jpayne@69 608
jpayne@69 609 >>> for f in record.features:
jpayne@69 610 ... if 1760 in f:
jpayne@69 611 ... print("%s %s" % (f.type, f.location))
jpayne@69 612 source [0:154478](+)
jpayne@69 613 gene [1716:4347](-)
jpayne@69 614
jpayne@69 615 Note that additional care may be required with fuzzy locations, for
jpayne@69 616 example just before a BeforePosition:
jpayne@69 617
jpayne@69 618 >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
jpayne@69 619 >>> from Bio.SeqFeature import BeforePosition
jpayne@69 620 >>> f = SeqFeature(SimpleLocation(BeforePosition(3), 8), type="domain")
jpayne@69 621 >>> len(f)
jpayne@69 622 5
jpayne@69 623 >>> [i for i in range(10) if i in f]
jpayne@69 624 [3, 4, 5, 6, 7]
jpayne@69 625
jpayne@69 626 Note that is is a proxy for testing membership on the location.
jpayne@69 627
jpayne@69 628 >>> [i for i in range(10) if i in f.location]
jpayne@69 629 [3, 4, 5, 6, 7]
jpayne@69 630 """
jpayne@69 631 return value in self.location
jpayne@69 632
jpayne@69 633
jpayne@69 634 # --- References
jpayne@69 635
jpayne@69 636
jpayne@69 637 # TODO -- Will this hold PubMed and Medline information decently?
jpayne@69 638 class Reference:
jpayne@69 639 """Represent a Generic Reference object.
jpayne@69 640
jpayne@69 641 Attributes:
jpayne@69 642 - location - A list of Location objects specifying regions of
jpayne@69 643 the sequence that the references correspond to. If no locations are
jpayne@69 644 specified, the entire sequence is assumed.
jpayne@69 645 - authors - A big old string, or a list split by author, of authors
jpayne@69 646 for the reference.
jpayne@69 647 - title - The title of the reference.
jpayne@69 648 - journal - Journal the reference was published in.
jpayne@69 649 - medline_id - A medline reference for the article.
jpayne@69 650 - pubmed_id - A pubmed reference for the article.
jpayne@69 651 - comment - A place to stick any comments about the reference.
jpayne@69 652
jpayne@69 653 """
jpayne@69 654
jpayne@69 655 def __init__(self):
jpayne@69 656 """Initialize the class."""
jpayne@69 657 self.location = []
jpayne@69 658 self.authors = ""
jpayne@69 659 self.consrtm = ""
jpayne@69 660 self.title = ""
jpayne@69 661 self.journal = ""
jpayne@69 662 self.medline_id = ""
jpayne@69 663 self.pubmed_id = ""
jpayne@69 664 self.comment = ""
jpayne@69 665
jpayne@69 666 def __str__(self):
jpayne@69 667 """Return the full Reference object as a python string."""
jpayne@69 668 out = ""
jpayne@69 669 for single_location in self.location:
jpayne@69 670 out += f"location: {single_location}\n"
jpayne@69 671 out += f"authors: {self.authors}\n"
jpayne@69 672 if self.consrtm:
jpayne@69 673 out += f"consrtm: {self.consrtm}\n"
jpayne@69 674 out += f"title: {self.title}\n"
jpayne@69 675 out += f"journal: {self.journal}\n"
jpayne@69 676 out += f"medline id: {self.medline_id}\n"
jpayne@69 677 out += f"pubmed id: {self.pubmed_id}\n"
jpayne@69 678 out += f"comment: {self.comment}\n"
jpayne@69 679 return out
jpayne@69 680
jpayne@69 681 def __repr__(self):
jpayne@69 682 """Represent the Reference object as a string for debugging."""
jpayne@69 683 # TODO - Update this is __init__ later accepts values
jpayne@69 684 return f"{self.__class__.__name__}(title={self.title!r}, ...)"
jpayne@69 685
jpayne@69 686 def __eq__(self, other):
jpayne@69 687 """Check if two Reference objects should be considered equal.
jpayne@69 688
jpayne@69 689 Note prior to Biopython 1.70 the location was not compared, as
jpayne@69 690 until then __eq__ for the SimpleLocation class was not defined.
jpayne@69 691 """
jpayne@69 692 return (
jpayne@69 693 self.authors == other.authors
jpayne@69 694 and self.consrtm == other.consrtm
jpayne@69 695 and self.title == other.title
jpayne@69 696 and self.journal == other.journal
jpayne@69 697 and self.medline_id == other.medline_id
jpayne@69 698 and self.pubmed_id == other.pubmed_id
jpayne@69 699 and self.comment == other.comment
jpayne@69 700 and self.location == other.location
jpayne@69 701 )
jpayne@69 702
jpayne@69 703
jpayne@69 704 # --- Handling feature locations
jpayne@69 705
jpayne@69 706
jpayne@69 707 class Location(ABC):
jpayne@69 708 """Abstract base class representing a location."""
jpayne@69 709
jpayne@69 710 @abstractmethod
jpayne@69 711 def __repr__(self):
jpayne@69 712 """Represent the Location object as a string for debugging."""
jpayne@69 713 return f"{self.__class__.__name__}(...)"
jpayne@69 714
jpayne@69 715 def fromstring(text, length=None, circular=False, stranded=True):
jpayne@69 716 """Create a Location object from a string.
jpayne@69 717
jpayne@69 718 This should accept any valid location string in the INSDC Feature Table
jpayne@69 719 format (https://www.insdc.org/submitting-standards/feature-table/) as
jpayne@69 720 used in GenBank, DDBJ and EMBL files.
jpayne@69 721
jpayne@69 722 Simple examples:
jpayne@69 723
jpayne@69 724 >>> Location.fromstring("123..456", 1000)
jpayne@69 725 SimpleLocation(ExactPosition(122), ExactPosition(456), strand=1)
jpayne@69 726 >>> Location.fromstring("complement(<123..>456)", 1000)
jpayne@69 727 SimpleLocation(BeforePosition(122), AfterPosition(456), strand=-1)
jpayne@69 728
jpayne@69 729 A more complex location using within positions,
jpayne@69 730
jpayne@69 731 >>> Location.fromstring("(9.10)..(20.25)", 1000)
jpayne@69 732 SimpleLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)
jpayne@69 733
jpayne@69 734 Notice how that will act as though it has overall start 8 and end 25.
jpayne@69 735
jpayne@69 736 Zero length between feature,
jpayne@69 737
jpayne@69 738 >>> Location.fromstring("123^124", 1000)
jpayne@69 739 SimpleLocation(ExactPosition(123), ExactPosition(123), strand=1)
jpayne@69 740
jpayne@69 741 The expected sequence length is needed for a special case, a between
jpayne@69 742 position at the start/end of a circular genome:
jpayne@69 743
jpayne@69 744 >>> Location.fromstring("1000^1", 1000)
jpayne@69 745 SimpleLocation(ExactPosition(1000), ExactPosition(1000), strand=1)
jpayne@69 746
jpayne@69 747 Apart from this special case, between positions P^Q must have P+1==Q,
jpayne@69 748
jpayne@69 749 >>> Location.fromstring("123^456", 1000)
jpayne@69 750 Traceback (most recent call last):
jpayne@69 751 ...
jpayne@69 752 Bio.SeqFeature.LocationParserError: invalid feature location '123^456'
jpayne@69 753
jpayne@69 754 You can optionally provide a reference name:
jpayne@69 755
jpayne@69 756 >>> Location.fromstring("AL391218.9:105173..108462", 2000000)
jpayne@69 757 SimpleLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')
jpayne@69 758
jpayne@69 759 >>> Location.fromstring("<2644..159", 2868, "circular")
jpayne@69 760 CompoundLocation([SimpleLocation(BeforePosition(2643), ExactPosition(2868), strand=1), SimpleLocation(ExactPosition(0), ExactPosition(159), strand=1)], 'join')
jpayne@69 761 """
jpayne@69 762 if text.startswith("complement("):
jpayne@69 763 if text[-1] != ")":
jpayne@69 764 raise ValueError(f"closing bracket missing in '{text}'")
jpayne@69 765 text = text[11:-1]
jpayne@69 766 strand = -1
jpayne@69 767 elif stranded:
jpayne@69 768 strand = 1
jpayne@69 769 else:
jpayne@69 770 strand = None
jpayne@69 771
jpayne@69 772 # Determine if we have a simple location or a compound location
jpayne@69 773 if text.startswith("join("):
jpayne@69 774 operator = "join"
jpayne@69 775 parts = _split(text[5:-1])[1::2]
jpayne@69 776 # assert parts[0] == "" and parts[-1] == ""
jpayne@69 777 elif text.startswith("order("):
jpayne@69 778 operator = "order"
jpayne@69 779 parts = _split(text[6:-1])[1::2]
jpayne@69 780 # assert parts[0] == "" and parts[-1] == ""
jpayne@69 781 elif text.startswith("bond("):
jpayne@69 782 operator = "bond"
jpayne@69 783 parts = _split(text[5:-1])[1::2]
jpayne@69 784 # assert parts[0] == "" and parts[-1] == ""
jpayne@69 785 else:
jpayne@69 786 loc = SimpleLocation.fromstring(text, length, circular)
jpayne@69 787 loc.strand = strand
jpayne@69 788 if strand == -1:
jpayne@69 789 loc.parts.reverse()
jpayne@69 790 return loc
jpayne@69 791 locs = []
jpayne@69 792 for part in parts:
jpayne@69 793 loc = SimpleLocation.fromstring(part, length, circular)
jpayne@69 794 if loc is None:
jpayne@69 795 break
jpayne@69 796 if loc.strand == -1:
jpayne@69 797 if strand == -1:
jpayne@69 798 raise LocationParserError("double complement in '{text}'?")
jpayne@69 799 else:
jpayne@69 800 loc.strand = strand
jpayne@69 801 locs.extend(loc.parts)
jpayne@69 802 else:
jpayne@69 803 if len(locs) == 1:
jpayne@69 804 return loc
jpayne@69 805 # Historically a join on the reverse strand has been represented
jpayne@69 806 # in Biopython with both the parent SeqFeature and its children
jpayne@69 807 # (the exons for a CDS) all given a strand of -1. Likewise, for
jpayne@69 808 # a join feature on the forward strand they all have strand +1.
jpayne@69 809 # However, we must also consider evil mixed strand examples like
jpayne@69 810 # this, join(complement(69611..69724),139856..140087,140625..140650)
jpayne@69 811 if strand == -1:
jpayne@69 812 # Whole thing was wrapped in complement(...)
jpayne@69 813 for loc in locs:
jpayne@69 814 assert loc.strand == -1
jpayne@69 815 # Reverse the backwards order used in GenBank files
jpayne@69 816 # with complement(join(...))
jpayne@69 817 locs = locs[::-1]
jpayne@69 818 return CompoundLocation(locs, operator=operator)
jpayne@69 819 # Not recognized
jpayne@69 820 if "order" in text and "join" in text:
jpayne@69 821 # See Bug 3197
jpayne@69 822 raise LocationParserError(
jpayne@69 823 f"failed to parse feature location '{text}' containing a combination of 'join' and 'order' (nested operators) are illegal"
jpayne@69 824 )
jpayne@69 825
jpayne@69 826 # See issue #937. Note that NCBI has already fixed this record.
jpayne@69 827 if ",)" in text:
jpayne@69 828 warnings.warn(
jpayne@69 829 "Dropping trailing comma in malformed feature location",
jpayne@69 830 BiopythonParserWarning,
jpayne@69 831 )
jpayne@69 832 text = text.replace(",)", ")")
jpayne@69 833 return Location.fromstring(text)
jpayne@69 834
jpayne@69 835 raise LocationParserError(f"failed to parse feature location '{text}'")
jpayne@69 836
jpayne@69 837
jpayne@69 838 class SimpleLocation(Location):
jpayne@69 839 """Specify the location of a feature along a sequence.
jpayne@69 840
jpayne@69 841 The SimpleLocation is used for simple continuous features, which can
jpayne@69 842 be described as running from a start position to and end position
jpayne@69 843 (optionally with a strand and reference information). More complex
jpayne@69 844 locations made up from several non-continuous parts (e.g. a coding
jpayne@69 845 sequence made up of several exons) are described using a SeqFeature
jpayne@69 846 with a CompoundLocation.
jpayne@69 847
jpayne@69 848 Note that the start and end location numbering follow Python's scheme,
jpayne@69 849 thus a GenBank entry of 123..150 (one based counting) becomes a location
jpayne@69 850 of [122:150] (zero based counting).
jpayne@69 851
jpayne@69 852 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 853 >>> f = SimpleLocation(122, 150)
jpayne@69 854 >>> print(f)
jpayne@69 855 [122:150]
jpayne@69 856 >>> print(f.start)
jpayne@69 857 122
jpayne@69 858 >>> print(f.end)
jpayne@69 859 150
jpayne@69 860 >>> print(f.strand)
jpayne@69 861 None
jpayne@69 862
jpayne@69 863 Note the strand defaults to None. If you are working with nucleotide
jpayne@69 864 sequences you'd want to be explicit if it is the forward strand:
jpayne@69 865
jpayne@69 866 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 867 >>> f = SimpleLocation(122, 150, strand=+1)
jpayne@69 868 >>> print(f)
jpayne@69 869 [122:150](+)
jpayne@69 870 >>> print(f.strand)
jpayne@69 871 1
jpayne@69 872
jpayne@69 873 Note that for a parent sequence of length n, the SimpleLocation
jpayne@69 874 start and end must satisfy the inequality 0 <= start <= end <= n.
jpayne@69 875 This means even for features on the reverse strand of a nucleotide
jpayne@69 876 sequence, we expect the 'start' coordinate to be less than the
jpayne@69 877 'end'.
jpayne@69 878
jpayne@69 879 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 880 >>> r = SimpleLocation(122, 150, strand=-1)
jpayne@69 881 >>> print(r)
jpayne@69 882 [122:150](-)
jpayne@69 883 >>> print(r.start)
jpayne@69 884 122
jpayne@69 885 >>> print(r.end)
jpayne@69 886 150
jpayne@69 887 >>> print(r.strand)
jpayne@69 888 -1
jpayne@69 889
jpayne@69 890 i.e. Rather than thinking of the 'start' and 'end' biologically in a
jpayne@69 891 strand aware manner, think of them as the 'left most' or 'minimum'
jpayne@69 892 boundary, and the 'right most' or 'maximum' boundary of the region
jpayne@69 893 being described. This is particularly important with compound
jpayne@69 894 locations describing non-continuous regions.
jpayne@69 895
jpayne@69 896 In the example above we have used standard exact positions, but there
jpayne@69 897 are also specialised position objects used to represent fuzzy positions
jpayne@69 898 as well, for example a GenBank location like complement(<123..150)
jpayne@69 899 would use a BeforePosition object for the start.
jpayne@69 900 """
jpayne@69 901
jpayne@69 902 def __init__(self, start, end, strand=None, ref=None, ref_db=None):
jpayne@69 903 """Initialize the class.
jpayne@69 904
jpayne@69 905 start and end arguments specify the values where the feature begins
jpayne@69 906 and ends. These can either by any of the ``*Position`` objects that
jpayne@69 907 inherit from Position, or can just be integers specifying the position.
jpayne@69 908 In the case of integers, the values are assumed to be exact and are
jpayne@69 909 converted in ExactPosition arguments. This is meant to make it easy
jpayne@69 910 to deal with non-fuzzy ends.
jpayne@69 911
jpayne@69 912 i.e. Short form:
jpayne@69 913
jpayne@69 914 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 915 >>> loc = SimpleLocation(5, 10, strand=-1)
jpayne@69 916 >>> print(loc)
jpayne@69 917 [5:10](-)
jpayne@69 918
jpayne@69 919 Explicit form:
jpayne@69 920
jpayne@69 921 >>> from Bio.SeqFeature import SimpleLocation, ExactPosition
jpayne@69 922 >>> loc = SimpleLocation(ExactPosition(5), ExactPosition(10), strand=-1)
jpayne@69 923 >>> print(loc)
jpayne@69 924 [5:10](-)
jpayne@69 925
jpayne@69 926 Other fuzzy positions are used similarly,
jpayne@69 927
jpayne@69 928 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 929 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
jpayne@69 930 >>> loc2 = SimpleLocation(BeforePosition(5), AfterPosition(10), strand=-1)
jpayne@69 931 >>> print(loc2)
jpayne@69 932 [<5:>10](-)
jpayne@69 933
jpayne@69 934 For nucleotide features you will also want to specify the strand,
jpayne@69 935 use 1 for the forward (plus) strand, -1 for the reverse (negative)
jpayne@69 936 strand, 0 for stranded but strand unknown (? in GFF3), or None for
jpayne@69 937 when the strand does not apply (dot in GFF3), e.g. features on
jpayne@69 938 proteins.
jpayne@69 939
jpayne@69 940 >>> loc = SimpleLocation(5, 10, strand=+1)
jpayne@69 941 >>> print(loc)
jpayne@69 942 [5:10](+)
jpayne@69 943 >>> print(loc.strand)
jpayne@69 944 1
jpayne@69 945
jpayne@69 946 Normally feature locations are given relative to the parent
jpayne@69 947 sequence you are working with, but an explicit accession can
jpayne@69 948 be given with the optional ref and db_ref strings:
jpayne@69 949
jpayne@69 950 >>> loc = SimpleLocation(105172, 108462, ref="AL391218.9", strand=1)
jpayne@69 951 >>> print(loc)
jpayne@69 952 AL391218.9[105172:108462](+)
jpayne@69 953 >>> print(loc.ref)
jpayne@69 954 AL391218.9
jpayne@69 955
jpayne@69 956 """
jpayne@69 957 # TODO - Check 0 <= start <= end (<= length of reference)
jpayne@69 958 if isinstance(start, Position):
jpayne@69 959 self._start = start
jpayne@69 960 elif isinstance(start, int):
jpayne@69 961 self._start = ExactPosition(start)
jpayne@69 962 else:
jpayne@69 963 raise TypeError(f"start={start!r} {type(start)}")
jpayne@69 964 if isinstance(end, Position):
jpayne@69 965 self._end = end
jpayne@69 966 elif isinstance(end, int):
jpayne@69 967 self._end = ExactPosition(end)
jpayne@69 968 else:
jpayne@69 969 raise TypeError(f"end={end!r} {type(end)}")
jpayne@69 970 if (
jpayne@69 971 isinstance(self.start, int)
jpayne@69 972 and isinstance(self.end, int)
jpayne@69 973 and self.start > self.end
jpayne@69 974 ):
jpayne@69 975 raise ValueError(
jpayne@69 976 f"End location ({self.end}) must be greater than "
jpayne@69 977 f"or equal to start location ({self.start})"
jpayne@69 978 )
jpayne@69 979 self.strand = strand
jpayne@69 980 self.ref = ref
jpayne@69 981 self.ref_db = ref_db
jpayne@69 982
jpayne@69 983 @staticmethod
jpayne@69 984 def fromstring(text, length=None, circular=False):
jpayne@69 985 """Create a SimpleLocation object from a string."""
jpayne@69 986 if text.startswith("complement("):
jpayne@69 987 text = text[11:-1]
jpayne@69 988 strand = -1
jpayne@69 989 else:
jpayne@69 990 strand = None
jpayne@69 991 # Try simple cases first for speed
jpayne@69 992 try:
jpayne@69 993 s, e = text.split("..")
jpayne@69 994 s = int(s) - 1
jpayne@69 995 e = int(e)
jpayne@69 996 except ValueError:
jpayne@69 997 pass
jpayne@69 998 else:
jpayne@69 999 if 0 <= s < e:
jpayne@69 1000 return SimpleLocation(s, e, strand)
jpayne@69 1001 # Try general case
jpayne@69 1002 try:
jpayne@69 1003 ref, text = text.split(":")
jpayne@69 1004 except ValueError:
jpayne@69 1005 ref = None
jpayne@69 1006 m = _re_location_category.match(text)
jpayne@69 1007 if m is None:
jpayne@69 1008 raise LocationParserError(f"Could not parse feature location '{text}'")
jpayne@69 1009 for key, value in m.groupdict().items():
jpayne@69 1010 if value is not None:
jpayne@69 1011 break
jpayne@69 1012 assert value == text
jpayne@69 1013 if key == "bond":
jpayne@69 1014 # e.g. bond(196)
jpayne@69 1015 warnings.warn(
jpayne@69 1016 "Dropping bond qualifier in feature location",
jpayne@69 1017 BiopythonParserWarning,
jpayne@69 1018 )
jpayne@69 1019 text = text[5:-1]
jpayne@69 1020 s_pos = Position.fromstring(text, -1)
jpayne@69 1021 e_pos = Position.fromstring(text)
jpayne@69 1022 elif key == "solo":
jpayne@69 1023 # e.g. "123"
jpayne@69 1024 s_pos = Position.fromstring(text, -1)
jpayne@69 1025 e_pos = Position.fromstring(text)
jpayne@69 1026 elif key in ("pair", "within", "oneof"):
jpayne@69 1027 s, e = text.split("..")
jpayne@69 1028 # Attempt to fix features that span the origin
jpayne@69 1029 s_pos = Position.fromstring(s, -1)
jpayne@69 1030 e_pos = Position.fromstring(e)
jpayne@69 1031 if s_pos >= e_pos:
jpayne@69 1032 # There is likely a problem with origin wrapping.
jpayne@69 1033 # Create a CompoundLocation of the wrapped feature,
jpayne@69 1034 # consisting of two SimpleLocation objects to extend to
jpayne@69 1035 # the list of feature locations.
jpayne@69 1036 if not circular:
jpayne@69 1037 raise LocationParserError(
jpayne@69 1038 f"it appears that '{text}' is a feature that spans the origin, but the sequence topology is undefined"
jpayne@69 1039 )
jpayne@69 1040 warnings.warn(
jpayne@69 1041 "Attempting to fix invalid location %r as "
jpayne@69 1042 "it looks like incorrect origin wrapping. "
jpayne@69 1043 "Please fix input file, this could have "
jpayne@69 1044 "unintended behavior." % text,
jpayne@69 1045 BiopythonParserWarning,
jpayne@69 1046 )
jpayne@69 1047
jpayne@69 1048 f1 = SimpleLocation(s_pos, length, strand)
jpayne@69 1049 f2 = SimpleLocation(0, e_pos, strand)
jpayne@69 1050
jpayne@69 1051 if strand == -1:
jpayne@69 1052 # For complementary features spanning the origin
jpayne@69 1053 return f2 + f1
jpayne@69 1054 else:
jpayne@69 1055 return f1 + f2
jpayne@69 1056 elif key == "between":
jpayne@69 1057 # A between location like "67^68" (one based counting) is a
jpayne@69 1058 # special case (note it has zero length). In python slice
jpayne@69 1059 # notation this is 67:67, a zero length slice. See Bug 2622
jpayne@69 1060 # Further more, on a circular genome of length N you can have
jpayne@69 1061 # a location N^1 meaning the junction at the origin. See Bug 3098.
jpayne@69 1062 # NOTE - We can imagine between locations like "2^4", but this
jpayne@69 1063 # is just "3". Similarly, "2^5" is just "3..4"
jpayne@69 1064 s, e = text.split("^")
jpayne@69 1065 s = int(s)
jpayne@69 1066 e = int(e)
jpayne@69 1067 if s + 1 == e or (s == length and e == 1):
jpayne@69 1068 s_pos = ExactPosition(s)
jpayne@69 1069 e_pos = s_pos
jpayne@69 1070 else:
jpayne@69 1071 raise LocationParserError(f"invalid feature location '{text}'")
jpayne@69 1072 if s_pos < 0:
jpayne@69 1073 raise LocationParserError(
jpayne@69 1074 f"negative starting position in feature location '{text}'"
jpayne@69 1075 )
jpayne@69 1076 return SimpleLocation(s_pos, e_pos, strand, ref=ref)
jpayne@69 1077
jpayne@69 1078 def _get_strand(self):
jpayne@69 1079 """Get function for the strand property (PRIVATE)."""
jpayne@69 1080 return self._strand
jpayne@69 1081
jpayne@69 1082 def _set_strand(self, value):
jpayne@69 1083 """Set function for the strand property (PRIVATE)."""
jpayne@69 1084 if value not in [+1, -1, 0, None]:
jpayne@69 1085 raise ValueError(f"Strand should be +1, -1, 0 or None, not {value!r}")
jpayne@69 1086 self._strand = value
jpayne@69 1087
jpayne@69 1088 strand = property(
jpayne@69 1089 fget=_get_strand,
jpayne@69 1090 fset=_set_strand,
jpayne@69 1091 doc="Strand of the location (+1, -1, 0 or None).",
jpayne@69 1092 )
jpayne@69 1093
jpayne@69 1094 def __str__(self):
jpayne@69 1095 """Return a representation of the SimpleLocation object (with python counting).
jpayne@69 1096
jpayne@69 1097 For the simple case this uses the python splicing syntax, [122:150]
jpayne@69 1098 (zero based counting) which GenBank would call 123..150 (one based
jpayne@69 1099 counting).
jpayne@69 1100 """
jpayne@69 1101 answer = f"[{self._start}:{self._end}]"
jpayne@69 1102 if self.ref and self.ref_db:
jpayne@69 1103 answer = f"{self.ref_db}:{self.ref}{answer}"
jpayne@69 1104 elif self.ref:
jpayne@69 1105 answer = self.ref + answer
jpayne@69 1106 # Is ref_db without ref meaningful?
jpayne@69 1107 if self.strand is None:
jpayne@69 1108 return answer
jpayne@69 1109 elif self.strand == +1:
jpayne@69 1110 return answer + "(+)"
jpayne@69 1111 elif self.strand == -1:
jpayne@69 1112 return answer + "(-)"
jpayne@69 1113 else:
jpayne@69 1114 # strand = 0, stranded but strand unknown, ? in GFF3
jpayne@69 1115 return answer + "(?)"
jpayne@69 1116
jpayne@69 1117 def __repr__(self):
jpayne@69 1118 """Represent the SimpleLocation object as a string for debugging."""
jpayne@69 1119 optional = ""
jpayne@69 1120 if self.strand is not None:
jpayne@69 1121 optional += f", strand={self.strand!r}"
jpayne@69 1122 if self.ref is not None:
jpayne@69 1123 optional += f", ref={self.ref!r}"
jpayne@69 1124 if self.ref_db is not None:
jpayne@69 1125 optional += f", ref_db={self.ref_db!r}"
jpayne@69 1126 return f"{self.__class__.__name__}({self.start!r}, {self.end!r}{optional})"
jpayne@69 1127
jpayne@69 1128 def __add__(self, other):
jpayne@69 1129 """Combine location with another SimpleLocation object, or shift it.
jpayne@69 1130
jpayne@69 1131 You can add two feature locations to make a join CompoundLocation:
jpayne@69 1132
jpayne@69 1133 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 1134 >>> f1 = SimpleLocation(5, 10)
jpayne@69 1135 >>> f2 = SimpleLocation(20, 30)
jpayne@69 1136 >>> combined = f1 + f2
jpayne@69 1137 >>> print(combined)
jpayne@69 1138 join{[5:10], [20:30]}
jpayne@69 1139
jpayne@69 1140 This is thus equivalent to:
jpayne@69 1141
jpayne@69 1142 >>> from Bio.SeqFeature import CompoundLocation
jpayne@69 1143 >>> join = CompoundLocation([f1, f2])
jpayne@69 1144 >>> print(join)
jpayne@69 1145 join{[5:10], [20:30]}
jpayne@69 1146
jpayne@69 1147 You can also use sum(...) in this way:
jpayne@69 1148
jpayne@69 1149 >>> join = sum([f1, f2])
jpayne@69 1150 >>> print(join)
jpayne@69 1151 join{[5:10], [20:30]}
jpayne@69 1152
jpayne@69 1153 Furthermore, you can combine a SimpleLocation with a CompoundLocation
jpayne@69 1154 in this way.
jpayne@69 1155
jpayne@69 1156 Separately, adding an integer will give a new SimpleLocation with
jpayne@69 1157 its start and end offset by that amount. For example:
jpayne@69 1158
jpayne@69 1159 >>> print(f1)
jpayne@69 1160 [5:10]
jpayne@69 1161 >>> print(f1 + 100)
jpayne@69 1162 [105:110]
jpayne@69 1163 >>> print(200 + f1)
jpayne@69 1164 [205:210]
jpayne@69 1165
jpayne@69 1166 This can be useful when editing annotation.
jpayne@69 1167 """
jpayne@69 1168 if isinstance(other, SimpleLocation):
jpayne@69 1169 return CompoundLocation([self, other])
jpayne@69 1170 elif isinstance(other, int):
jpayne@69 1171 return self._shift(other)
jpayne@69 1172 else:
jpayne@69 1173 # This will allow CompoundLocation's __radd__ to be called:
jpayne@69 1174 return NotImplemented
jpayne@69 1175
jpayne@69 1176 def __radd__(self, other):
jpayne@69 1177 """Return a SimpleLocation object by shifting the location by an integer amount."""
jpayne@69 1178 if isinstance(other, int):
jpayne@69 1179 return self._shift(other)
jpayne@69 1180 else:
jpayne@69 1181 return NotImplemented
jpayne@69 1182
jpayne@69 1183 def __sub__(self, other):
jpayne@69 1184 """Subtracting an integer will shift the start and end by that amount.
jpayne@69 1185
jpayne@69 1186 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 1187 >>> f1 = SimpleLocation(105, 150)
jpayne@69 1188 >>> print(f1)
jpayne@69 1189 [105:150]
jpayne@69 1190 >>> print(f1 - 100)
jpayne@69 1191 [5:50]
jpayne@69 1192
jpayne@69 1193 This can be useful when editing annotation. You can also add an integer
jpayne@69 1194 to a feature location (which shifts in the opposite direction).
jpayne@69 1195 """
jpayne@69 1196 if isinstance(other, int):
jpayne@69 1197 return self._shift(-other)
jpayne@69 1198 else:
jpayne@69 1199 return NotImplemented
jpayne@69 1200
jpayne@69 1201 def __nonzero__(self):
jpayne@69 1202 """Return True regardless of the length of the feature.
jpayne@69 1203
jpayne@69 1204 This behavior is for backwards compatibility, since until the
jpayne@69 1205 __len__ method was added, a SimpleLocation always evaluated as True.
jpayne@69 1206
jpayne@69 1207 Note that in comparison, Seq objects, strings, lists, etc, will all
jpayne@69 1208 evaluate to False if they have length zero.
jpayne@69 1209
jpayne@69 1210 WARNING: The SimpleLocation may in future evaluate to False when its
jpayne@69 1211 length is zero (in order to better match normal python behavior)!
jpayne@69 1212 """
jpayne@69 1213 return True
jpayne@69 1214
jpayne@69 1215 def __len__(self):
jpayne@69 1216 """Return the length of the region described by the SimpleLocation object.
jpayne@69 1217
jpayne@69 1218 Note that extra care may be needed for fuzzy locations, e.g.
jpayne@69 1219
jpayne@69 1220 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 1221 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
jpayne@69 1222 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
jpayne@69 1223 >>> len(loc)
jpayne@69 1224 5
jpayne@69 1225 """
jpayne@69 1226 return int(self._end) - int(self._start)
jpayne@69 1227
jpayne@69 1228 def __contains__(self, value):
jpayne@69 1229 """Check if an integer position is within the SimpleLocation object.
jpayne@69 1230
jpayne@69 1231 Note that extra care may be needed for fuzzy locations, e.g.
jpayne@69 1232
jpayne@69 1233 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 1234 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
jpayne@69 1235 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
jpayne@69 1236 >>> len(loc)
jpayne@69 1237 5
jpayne@69 1238 >>> [i for i in range(15) if i in loc]
jpayne@69 1239 [5, 6, 7, 8, 9]
jpayne@69 1240 """
jpayne@69 1241 if not isinstance(value, int):
jpayne@69 1242 raise ValueError(
jpayne@69 1243 "Currently we only support checking for integer "
jpayne@69 1244 "positions being within a SimpleLocation."
jpayne@69 1245 )
jpayne@69 1246 if value < self._start or value >= self._end:
jpayne@69 1247 return False
jpayne@69 1248 else:
jpayne@69 1249 return True
jpayne@69 1250
jpayne@69 1251 def __iter__(self):
jpayne@69 1252 """Iterate over the parent positions within the SimpleLocation object.
jpayne@69 1253
jpayne@69 1254 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 1255 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
jpayne@69 1256 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
jpayne@69 1257 >>> len(loc)
jpayne@69 1258 5
jpayne@69 1259 >>> for i in loc: print(i)
jpayne@69 1260 5
jpayne@69 1261 6
jpayne@69 1262 7
jpayne@69 1263 8
jpayne@69 1264 9
jpayne@69 1265 >>> list(loc)
jpayne@69 1266 [5, 6, 7, 8, 9]
jpayne@69 1267 >>> [i for i in range(15) if i in loc]
jpayne@69 1268 [5, 6, 7, 8, 9]
jpayne@69 1269
jpayne@69 1270 Note this is strand aware:
jpayne@69 1271
jpayne@69 1272 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10), strand = -1)
jpayne@69 1273 >>> list(loc)
jpayne@69 1274 [9, 8, 7, 6, 5]
jpayne@69 1275 """
jpayne@69 1276 if self.strand == -1:
jpayne@69 1277 yield from range(self._end - 1, self._start - 1, -1)
jpayne@69 1278 else:
jpayne@69 1279 yield from range(self._start, self._end)
jpayne@69 1280
jpayne@69 1281 def __eq__(self, other):
jpayne@69 1282 """Implement equality by comparing all the location attributes."""
jpayne@69 1283 if not isinstance(other, SimpleLocation):
jpayne@69 1284 return False
jpayne@69 1285 return (
jpayne@69 1286 self._start == other.start
jpayne@69 1287 and self._end == other.end
jpayne@69 1288 and self._strand == other.strand
jpayne@69 1289 and self.ref == other.ref
jpayne@69 1290 and self.ref_db == other.ref_db
jpayne@69 1291 )
jpayne@69 1292
jpayne@69 1293 def _shift(self, offset):
jpayne@69 1294 """Return a copy of the SimpleLocation shifted by an offset (PRIVATE).
jpayne@69 1295
jpayne@69 1296 Returns self when location is relative to an external reference.
jpayne@69 1297 """
jpayne@69 1298 # TODO - What if offset is a fuzzy position?
jpayne@69 1299 if self.ref or self.ref_db:
jpayne@69 1300 return self
jpayne@69 1301 return SimpleLocation(
jpayne@69 1302 start=self._start + offset,
jpayne@69 1303 end=self._end + offset,
jpayne@69 1304 strand=self.strand,
jpayne@69 1305 )
jpayne@69 1306
jpayne@69 1307 def _flip(self, length):
jpayne@69 1308 """Return a copy of the location after the parent is reversed (PRIVATE).
jpayne@69 1309
jpayne@69 1310 Returns self when location is relative to an external reference.
jpayne@69 1311 """
jpayne@69 1312 if self.ref or self.ref_db:
jpayne@69 1313 return self
jpayne@69 1314 # Note this will flip the start and end too!
jpayne@69 1315 if self.strand == +1:
jpayne@69 1316 flip_strand = -1
jpayne@69 1317 elif self.strand == -1:
jpayne@69 1318 flip_strand = +1
jpayne@69 1319 else:
jpayne@69 1320 # 0 or None
jpayne@69 1321 flip_strand = self.strand
jpayne@69 1322 return SimpleLocation(
jpayne@69 1323 start=self._end._flip(length),
jpayne@69 1324 end=self._start._flip(length),
jpayne@69 1325 strand=flip_strand,
jpayne@69 1326 )
jpayne@69 1327
jpayne@69 1328 @property
jpayne@69 1329 def parts(self):
jpayne@69 1330 """Read only list of sections (always one, the SimpleLocation object).
jpayne@69 1331
jpayne@69 1332 This is a convenience property allowing you to write code handling
jpayne@69 1333 both SimpleLocation objects (with one part) and more complex
jpayne@69 1334 CompoundLocation objects (with multiple parts) interchangeably.
jpayne@69 1335 """
jpayne@69 1336 return [self]
jpayne@69 1337
jpayne@69 1338 @property
jpayne@69 1339 def start(self):
jpayne@69 1340 """Start location - left most (minimum) value, regardless of strand.
jpayne@69 1341
jpayne@69 1342 Read only, returns an integer like position object, possibly a fuzzy
jpayne@69 1343 position.
jpayne@69 1344 """
jpayne@69 1345 return self._start
jpayne@69 1346
jpayne@69 1347 @property
jpayne@69 1348 def end(self):
jpayne@69 1349 """End location - right most (maximum) value, regardless of strand.
jpayne@69 1350
jpayne@69 1351 Read only, returns an integer like position object, possibly a fuzzy
jpayne@69 1352 position.
jpayne@69 1353 """
jpayne@69 1354 return self._end
jpayne@69 1355
jpayne@69 1356 def extract(self, parent_sequence, references=None):
jpayne@69 1357 """Extract the sequence from supplied parent sequence using the SimpleLocation object.
jpayne@69 1358
jpayne@69 1359 The parent_sequence can be a Seq like object or a string, and will
jpayne@69 1360 generally return an object of the same type. The exception to this is
jpayne@69 1361 a MutableSeq as the parent sequence will return a Seq object.
jpayne@69 1362 If the location refers to other records, they must be supplied
jpayne@69 1363 in the optional dictionary references.
jpayne@69 1364
jpayne@69 1365 >>> from Bio.Seq import Seq
jpayne@69 1366 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 1367 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
jpayne@69 1368 >>> feature_loc = SimpleLocation(8, 15)
jpayne@69 1369 >>> feature_loc.extract(seq)
jpayne@69 1370 Seq('VALIVIC')
jpayne@69 1371
jpayne@69 1372 """
jpayne@69 1373 if self.ref or self.ref_db:
jpayne@69 1374 if not references:
jpayne@69 1375 raise ValueError(
jpayne@69 1376 f"Feature references another sequence ({self.ref}),"
jpayne@69 1377 " references mandatory"
jpayne@69 1378 )
jpayne@69 1379 elif self.ref not in references:
jpayne@69 1380 # KeyError?
jpayne@69 1381 raise ValueError(
jpayne@69 1382 f"Feature references another sequence ({self.ref}),"
jpayne@69 1383 " not found in references"
jpayne@69 1384 )
jpayne@69 1385 parent_sequence = references[self.ref]
jpayne@69 1386 f_seq = parent_sequence[int(self.start) : int(self.end)]
jpayne@69 1387 if isinstance(f_seq, MutableSeq):
jpayne@69 1388 f_seq = Seq(f_seq)
jpayne@69 1389 if self.strand == -1:
jpayne@69 1390 f_seq = reverse_complement(f_seq)
jpayne@69 1391 return f_seq
jpayne@69 1392
jpayne@69 1393
jpayne@69 1394 FeatureLocation = SimpleLocation # OBSOLETE; for backward compatability only.
jpayne@69 1395
jpayne@69 1396
jpayne@69 1397 class CompoundLocation(Location):
jpayne@69 1398 """For handling joins etc where a feature location has several parts."""
jpayne@69 1399
jpayne@69 1400 def __init__(self, parts, operator="join"):
jpayne@69 1401 """Initialize the class.
jpayne@69 1402
jpayne@69 1403 >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
jpayne@69 1404 >>> f1 = SimpleLocation(10, 40, strand=+1)
jpayne@69 1405 >>> f2 = SimpleLocation(50, 59, strand=+1)
jpayne@69 1406 >>> f = CompoundLocation([f1, f2])
jpayne@69 1407 >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
jpayne@69 1408 True
jpayne@69 1409 >>> print(f.operator)
jpayne@69 1410 join
jpayne@69 1411 >>> 5 in f
jpayne@69 1412 False
jpayne@69 1413 >>> 15 in f
jpayne@69 1414 True
jpayne@69 1415 >>> f.strand
jpayne@69 1416 1
jpayne@69 1417
jpayne@69 1418 Notice that the strand of the compound location is computed
jpayne@69 1419 automatically - in the case of mixed strands on the sub-locations
jpayne@69 1420 the overall strand is set to None.
jpayne@69 1421
jpayne@69 1422 >>> f = CompoundLocation([SimpleLocation(3, 6, strand=+1),
jpayne@69 1423 ... SimpleLocation(10, 13, strand=-1)])
jpayne@69 1424 >>> print(f.strand)
jpayne@69 1425 None
jpayne@69 1426 >>> len(f)
jpayne@69 1427 6
jpayne@69 1428 >>> list(f)
jpayne@69 1429 [3, 4, 5, 12, 11, 10]
jpayne@69 1430
jpayne@69 1431 The example above doing list(f) iterates over the coordinates within the
jpayne@69 1432 feature. This allows you to use max and min on the location, to find the
jpayne@69 1433 range covered:
jpayne@69 1434
jpayne@69 1435 >>> min(f)
jpayne@69 1436 3
jpayne@69 1437 >>> max(f)
jpayne@69 1438 12
jpayne@69 1439
jpayne@69 1440 More generally, you can use the compound location's start and end which
jpayne@69 1441 give the full span covered, 0 <= start <= end <= full sequence length.
jpayne@69 1442
jpayne@69 1443 >>> f.start == min(f)
jpayne@69 1444 True
jpayne@69 1445 >>> f.end == max(f) + 1
jpayne@69 1446 True
jpayne@69 1447
jpayne@69 1448 This is consistent with the behavior of the SimpleLocation for a single
jpayne@69 1449 region, where again the 'start' and 'end' do not necessarily give the
jpayne@69 1450 biological start and end, but rather the 'minimal' and 'maximal'
jpayne@69 1451 coordinate boundaries.
jpayne@69 1452
jpayne@69 1453 Note that adding locations provides a more intuitive method of
jpayne@69 1454 construction:
jpayne@69 1455
jpayne@69 1456 >>> f = SimpleLocation(3, 6, strand=+1) + SimpleLocation(10, 13, strand=-1)
jpayne@69 1457 >>> len(f)
jpayne@69 1458 6
jpayne@69 1459 >>> list(f)
jpayne@69 1460 [3, 4, 5, 12, 11, 10]
jpayne@69 1461 """
jpayne@69 1462 self.operator = operator
jpayne@69 1463 self.parts = list(parts)
jpayne@69 1464 for loc in self.parts:
jpayne@69 1465 if not isinstance(loc, SimpleLocation):
jpayne@69 1466 raise ValueError(
jpayne@69 1467 "CompoundLocation should be given a list of "
jpayne@69 1468 "SimpleLocation objects, not %s" % loc.__class__
jpayne@69 1469 )
jpayne@69 1470 if len(parts) < 2:
jpayne@69 1471 raise ValueError(
jpayne@69 1472 f"CompoundLocation should have at least 2 parts, not {parts!r}"
jpayne@69 1473 )
jpayne@69 1474
jpayne@69 1475 def __str__(self):
jpayne@69 1476 """Return a representation of the CompoundLocation object (with python counting)."""
jpayne@69 1477 return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts))
jpayne@69 1478
jpayne@69 1479 def __repr__(self):
jpayne@69 1480 """Represent the CompoundLocation object as string for debugging."""
jpayne@69 1481 return f"{self.__class__.__name__}({self.parts!r}, {self.operator!r})"
jpayne@69 1482
jpayne@69 1483 def _get_strand(self):
jpayne@69 1484 """Get function for the strand property (PRIVATE)."""
jpayne@69 1485 # Historically a join on the reverse strand has been represented
jpayne@69 1486 # in Biopython with both the parent SeqFeature and its children
jpayne@69 1487 # (the exons for a CDS) all given a strand of -1. Likewise, for
jpayne@69 1488 # a join feature on the forward strand they all have strand +1.
jpayne@69 1489 # However, we must also consider evil mixed strand examples like
jpayne@69 1490 # this, join(complement(69611..69724),139856..140087,140625..140650)
jpayne@69 1491 if len({loc.strand for loc in self.parts}) == 1:
jpayne@69 1492 return self.parts[0].strand
jpayne@69 1493 else:
jpayne@69 1494 return None # i.e. mixed strands
jpayne@69 1495
jpayne@69 1496 def _set_strand(self, value):
jpayne@69 1497 """Set function for the strand property (PRIVATE)."""
jpayne@69 1498 # Should this be allowed/encouraged?
jpayne@69 1499 for loc in self.parts:
jpayne@69 1500 loc.strand = value
jpayne@69 1501
jpayne@69 1502 strand = property(
jpayne@69 1503 fget=_get_strand,
jpayne@69 1504 fset=_set_strand,
jpayne@69 1505 doc="""Overall strand of the compound location.
jpayne@69 1506
jpayne@69 1507 If all the parts have the same strand, that is returned. Otherwise
jpayne@69 1508 for mixed strands, this returns None.
jpayne@69 1509
jpayne@69 1510 >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
jpayne@69 1511 >>> f1 = SimpleLocation(15, 17, strand=1)
jpayne@69 1512 >>> f2 = SimpleLocation(20, 30, strand=-1)
jpayne@69 1513 >>> f = f1 + f2
jpayne@69 1514 >>> f1.strand
jpayne@69 1515 1
jpayne@69 1516 >>> f2.strand
jpayne@69 1517 -1
jpayne@69 1518 >>> f.strand
jpayne@69 1519 >>> f.strand is None
jpayne@69 1520 True
jpayne@69 1521
jpayne@69 1522 If you set the strand of a CompoundLocation, this is applied to
jpayne@69 1523 all the parts - use with caution:
jpayne@69 1524
jpayne@69 1525 >>> f.strand = 1
jpayne@69 1526 >>> f1.strand
jpayne@69 1527 1
jpayne@69 1528 >>> f2.strand
jpayne@69 1529 1
jpayne@69 1530 >>> f.strand
jpayne@69 1531 1
jpayne@69 1532
jpayne@69 1533 """,
jpayne@69 1534 )
jpayne@69 1535
jpayne@69 1536 def __add__(self, other):
jpayne@69 1537 """Combine locations, or shift the location by an integer offset.
jpayne@69 1538
jpayne@69 1539 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 1540 >>> f1 = SimpleLocation(15, 17) + SimpleLocation(20, 30)
jpayne@69 1541 >>> print(f1)
jpayne@69 1542 join{[15:17], [20:30]}
jpayne@69 1543
jpayne@69 1544 You can add another SimpleLocation:
jpayne@69 1545
jpayne@69 1546 >>> print(f1 + SimpleLocation(40, 50))
jpayne@69 1547 join{[15:17], [20:30], [40:50]}
jpayne@69 1548 >>> print(SimpleLocation(5, 10) + f1)
jpayne@69 1549 join{[5:10], [15:17], [20:30]}
jpayne@69 1550
jpayne@69 1551 You can also add another CompoundLocation:
jpayne@69 1552
jpayne@69 1553 >>> f2 = SimpleLocation(40, 50) + SimpleLocation(60, 70)
jpayne@69 1554 >>> print(f2)
jpayne@69 1555 join{[40:50], [60:70]}
jpayne@69 1556 >>> print(f1 + f2)
jpayne@69 1557 join{[15:17], [20:30], [40:50], [60:70]}
jpayne@69 1558
jpayne@69 1559 Also, as with the SimpleLocation, adding an integer shifts the
jpayne@69 1560 location's coordinates by that offset:
jpayne@69 1561
jpayne@69 1562 >>> print(f1 + 100)
jpayne@69 1563 join{[115:117], [120:130]}
jpayne@69 1564 >>> print(200 + f1)
jpayne@69 1565 join{[215:217], [220:230]}
jpayne@69 1566 >>> print(f1 + (-5))
jpayne@69 1567 join{[10:12], [15:25]}
jpayne@69 1568 """
jpayne@69 1569 if isinstance(other, SimpleLocation):
jpayne@69 1570 return CompoundLocation(self.parts + [other], self.operator)
jpayne@69 1571 elif isinstance(other, CompoundLocation):
jpayne@69 1572 if self.operator != other.operator:
jpayne@69 1573 # Handle join+order -> order as a special case?
jpayne@69 1574 raise ValueError(
jpayne@69 1575 f"Mixed operators {self.operator} and {other.operator}"
jpayne@69 1576 )
jpayne@69 1577 return CompoundLocation(self.parts + other.parts, self.operator)
jpayne@69 1578 elif isinstance(other, int):
jpayne@69 1579 return self._shift(other)
jpayne@69 1580 else:
jpayne@69 1581 raise NotImplementedError
jpayne@69 1582
jpayne@69 1583 def __radd__(self, other):
jpayne@69 1584 """Add a feature to the left."""
jpayne@69 1585 if isinstance(other, SimpleLocation):
jpayne@69 1586 return CompoundLocation([other] + self.parts, self.operator)
jpayne@69 1587 elif isinstance(other, int):
jpayne@69 1588 return self._shift(other)
jpayne@69 1589 else:
jpayne@69 1590 raise NotImplementedError
jpayne@69 1591
jpayne@69 1592 def __contains__(self, value):
jpayne@69 1593 """Check if an integer position is within the CompoundLocation object."""
jpayne@69 1594 for loc in self.parts:
jpayne@69 1595 if value in loc:
jpayne@69 1596 return True
jpayne@69 1597 return False
jpayne@69 1598
jpayne@69 1599 def __nonzero__(self):
jpayne@69 1600 """Return True regardless of the length of the feature.
jpayne@69 1601
jpayne@69 1602 This behavior is for backwards compatibility, since until the
jpayne@69 1603 __len__ method was added, a SimpleLocation always evaluated as True.
jpayne@69 1604
jpayne@69 1605 Note that in comparison, Seq objects, strings, lists, etc, will all
jpayne@69 1606 evaluate to False if they have length zero.
jpayne@69 1607
jpayne@69 1608 WARNING: The SimpleLocation may in future evaluate to False when its
jpayne@69 1609 length is zero (in order to better match normal python behavior)!
jpayne@69 1610 """
jpayne@69 1611 return True
jpayne@69 1612
jpayne@69 1613 def __len__(self):
jpayne@69 1614 """Return the length of the CompoundLocation object."""
jpayne@69 1615 return sum(len(loc) for loc in self.parts)
jpayne@69 1616
jpayne@69 1617 def __iter__(self):
jpayne@69 1618 """Iterate over the parent positions within the CompoundLocation object."""
jpayne@69 1619 for loc in self.parts:
jpayne@69 1620 yield from loc
jpayne@69 1621
jpayne@69 1622 def __eq__(self, other):
jpayne@69 1623 """Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation."""
jpayne@69 1624 if not isinstance(other, CompoundLocation):
jpayne@69 1625 return False
jpayne@69 1626 if len(self.parts) != len(other.parts):
jpayne@69 1627 return False
jpayne@69 1628 if self.operator != other.operator:
jpayne@69 1629 return False
jpayne@69 1630 for self_part, other_part in zip(self.parts, other.parts):
jpayne@69 1631 if self_part != other_part:
jpayne@69 1632 return False
jpayne@69 1633 return True
jpayne@69 1634
jpayne@69 1635 def _shift(self, offset):
jpayne@69 1636 """Return a copy of the CompoundLocation shifted by an offset (PRIVATE)."""
jpayne@69 1637 return CompoundLocation(
jpayne@69 1638 [loc._shift(offset) for loc in self.parts], self.operator
jpayne@69 1639 )
jpayne@69 1640
jpayne@69 1641 def _flip(self, length):
jpayne@69 1642 """Return a copy of the locations after the parent is reversed (PRIVATE).
jpayne@69 1643
jpayne@69 1644 Note that the order of the parts is NOT reversed too. Consider a CDS
jpayne@69 1645 on the forward strand with exons small, medium and large (in length).
jpayne@69 1646 Once we change the frame of reference to the reverse complement strand,
jpayne@69 1647 the start codon is still part of the small exon, and the stop codon
jpayne@69 1648 still part of the large exon - so the part order remains the same!
jpayne@69 1649
jpayne@69 1650 Here is an artificial example, were the features map to the two upper
jpayne@69 1651 case regions and the lower case runs of n are not used:
jpayne@69 1652
jpayne@69 1653 >>> from Bio.Seq import Seq
jpayne@69 1654 >>> from Bio.SeqFeature import SimpleLocation
jpayne@69 1655 >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
jpayne@69 1656 >>> small = SimpleLocation(5, 20, strand=1)
jpayne@69 1657 >>> large = SimpleLocation(28, 52, strand=1)
jpayne@69 1658 >>> location = small + large
jpayne@69 1659 >>> print(small)
jpayne@69 1660 [5:20](+)
jpayne@69 1661 >>> print(large)
jpayne@69 1662 [28:52](+)
jpayne@69 1663 >>> print(location)
jpayne@69 1664 join{[5:20](+), [28:52](+)}
jpayne@69 1665 >>> for part in location.parts:
jpayne@69 1666 ... print(len(part))
jpayne@69 1667 ...
jpayne@69 1668 15
jpayne@69 1669 24
jpayne@69 1670
jpayne@69 1671 As you can see, this is a silly example where each "exon" is a word:
jpayne@69 1672
jpayne@69 1673 >>> print(small.extract(dna).translate())
jpayne@69 1674 SILLY
jpayne@69 1675 >>> print(large.extract(dna).translate())
jpayne@69 1676 EXAMPLE*
jpayne@69 1677 >>> print(location.extract(dna).translate())
jpayne@69 1678 SILLYEXAMPLE*
jpayne@69 1679 >>> for part in location.parts:
jpayne@69 1680 ... print(part.extract(dna).translate())
jpayne@69 1681 ...
jpayne@69 1682 SILLY
jpayne@69 1683 EXAMPLE*
jpayne@69 1684
jpayne@69 1685 Now, let's look at this from the reverse strand frame of reference:
jpayne@69 1686
jpayne@69 1687 >>> flipped_dna = dna.reverse_complement()
jpayne@69 1688 >>> flipped_location = location._flip(len(dna))
jpayne@69 1689 >>> print(flipped_location.extract(flipped_dna).translate())
jpayne@69 1690 SILLYEXAMPLE*
jpayne@69 1691 >>> for part in flipped_location.parts:
jpayne@69 1692 ... print(part.extract(flipped_dna).translate())
jpayne@69 1693 ...
jpayne@69 1694 SILLY
jpayne@69 1695 EXAMPLE*
jpayne@69 1696
jpayne@69 1697 The key point here is the first part of the CompoundFeature is still the
jpayne@69 1698 small exon, while the second part is still the large exon:
jpayne@69 1699
jpayne@69 1700 >>> for part in flipped_location.parts:
jpayne@69 1701 ... print(len(part))
jpayne@69 1702 ...
jpayne@69 1703 15
jpayne@69 1704 24
jpayne@69 1705 >>> print(flipped_location)
jpayne@69 1706 join{[37:52](-), [5:29](-)}
jpayne@69 1707
jpayne@69 1708 Notice the parts are not reversed. However, there was a bug here in older
jpayne@69 1709 versions of Biopython which would have given join{[5:29](-), [37:52](-)}
jpayne@69 1710 and the translation would have wrongly been "EXAMPLE*SILLY" instead.
jpayne@69 1711
jpayne@69 1712 """
jpayne@69 1713 return CompoundLocation(
jpayne@69 1714 [loc._flip(length) for loc in self.parts], self.operator
jpayne@69 1715 )
jpayne@69 1716
jpayne@69 1717 @property
jpayne@69 1718 def start(self):
jpayne@69 1719 """Start location - left most (minimum) value, regardless of strand.
jpayne@69 1720
jpayne@69 1721 Read only, returns an integer like position object, possibly a fuzzy
jpayne@69 1722 position.
jpayne@69 1723
jpayne@69 1724 For the special case of a CompoundLocation wrapping the origin of a
jpayne@69 1725 circular genome, this will return zero.
jpayne@69 1726 """
jpayne@69 1727 return min(loc.start for loc in self.parts)
jpayne@69 1728
jpayne@69 1729 @property
jpayne@69 1730 def end(self):
jpayne@69 1731 """End location - right most (maximum) value, regardless of strand.
jpayne@69 1732
jpayne@69 1733 Read only, returns an integer like position object, possibly a fuzzy
jpayne@69 1734 position.
jpayne@69 1735
jpayne@69 1736 For the special case of a CompoundLocation wrapping the origin of
jpayne@69 1737 a circular genome this will match the genome length.
jpayne@69 1738 """
jpayne@69 1739 return max(loc.end for loc in self.parts)
jpayne@69 1740
jpayne@69 1741 @property
jpayne@69 1742 def ref(self):
jpayne@69 1743 """Not present in CompoundLocation, dummy method for API compatibility."""
jpayne@69 1744 return None
jpayne@69 1745
jpayne@69 1746 @property
jpayne@69 1747 def ref_db(self):
jpayne@69 1748 """Not present in CompoundLocation, dummy method for API compatibility."""
jpayne@69 1749 return None
jpayne@69 1750
jpayne@69 1751 def extract(self, parent_sequence, references=None):
jpayne@69 1752 """Extract the sequence from supplied parent sequence using the CompoundLocation object.
jpayne@69 1753
jpayne@69 1754 The parent_sequence can be a Seq like object or a string, and will
jpayne@69 1755 generally return an object of the same type. The exception to this is
jpayne@69 1756 a MutableSeq as the parent sequence will return a Seq object.
jpayne@69 1757 If the location refers to other records, they must be supplied
jpayne@69 1758 in the optional dictionary references.
jpayne@69 1759
jpayne@69 1760 >>> from Bio.Seq import Seq
jpayne@69 1761 >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
jpayne@69 1762 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
jpayne@69 1763 >>> fl1 = SimpleLocation(2, 8)
jpayne@69 1764 >>> fl2 = SimpleLocation(10, 15)
jpayne@69 1765 >>> fl3 = CompoundLocation([fl1,fl2])
jpayne@69 1766 >>> fl3.extract(seq)
jpayne@69 1767 Seq('QHKAMILIVIC')
jpayne@69 1768
jpayne@69 1769 """
jpayne@69 1770 # This copes with mixed strand features & all on reverse:
jpayne@69 1771 parts = [
jpayne@69 1772 loc.extract(parent_sequence, references=references) for loc in self.parts
jpayne@69 1773 ]
jpayne@69 1774 f_seq = functools.reduce(lambda x, y: x + y, parts)
jpayne@69 1775 return f_seq
jpayne@69 1776
jpayne@69 1777
jpayne@69 1778 class Position(ABC):
jpayne@69 1779 """Abstract base class representing a position."""
jpayne@69 1780
jpayne@69 1781 @abstractmethod
jpayne@69 1782 def __repr__(self):
jpayne@69 1783 """Represent the Position object as a string for debugging."""
jpayne@69 1784 return f"{self.__class__.__name__}(...)"
jpayne@69 1785
jpayne@69 1786 @staticmethod
jpayne@69 1787 def fromstring(text, offset=0):
jpayne@69 1788 """Build a Position object from the text string.
jpayne@69 1789
jpayne@69 1790 For an end position, leave offset as zero (default):
jpayne@69 1791
jpayne@69 1792 >>> Position.fromstring("5")
jpayne@69 1793 ExactPosition(5)
jpayne@69 1794
jpayne@69 1795 For a start position, set offset to minus one (for Python counting):
jpayne@69 1796
jpayne@69 1797 >>> Position.fromstring("5", -1)
jpayne@69 1798 ExactPosition(4)
jpayne@69 1799
jpayne@69 1800 This also covers fuzzy positions:
jpayne@69 1801
jpayne@69 1802 >>> p = Position.fromstring("<5")
jpayne@69 1803 >>> p
jpayne@69 1804 BeforePosition(5)
jpayne@69 1805 >>> print(p)
jpayne@69 1806 <5
jpayne@69 1807 >>> int(p)
jpayne@69 1808 5
jpayne@69 1809
jpayne@69 1810 >>> Position.fromstring(">5")
jpayne@69 1811 AfterPosition(5)
jpayne@69 1812
jpayne@69 1813 By default assumes an end position, so note the integer behavior:
jpayne@69 1814
jpayne@69 1815 >>> p = Position.fromstring("one-of(5,8,11)")
jpayne@69 1816 >>> p
jpayne@69 1817 OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
jpayne@69 1818 >>> print(p)
jpayne@69 1819 one-of(5,8,11)
jpayne@69 1820 >>> int(p)
jpayne@69 1821 11
jpayne@69 1822
jpayne@69 1823 >>> Position.fromstring("(8.10)")
jpayne@69 1824 WithinPosition(10, left=8, right=10)
jpayne@69 1825
jpayne@69 1826 Fuzzy start positions:
jpayne@69 1827
jpayne@69 1828 >>> p = Position.fromstring("<5", -1)
jpayne@69 1829 >>> p
jpayne@69 1830 BeforePosition(4)
jpayne@69 1831 >>> print(p)
jpayne@69 1832 <4
jpayne@69 1833 >>> int(p)
jpayne@69 1834 4
jpayne@69 1835
jpayne@69 1836 Notice how the integer behavior changes too!
jpayne@69 1837
jpayne@69 1838 >>> p = Position.fromstring("one-of(5,8,11)", -1)
jpayne@69 1839 >>> p
jpayne@69 1840 OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
jpayne@69 1841 >>> print(p)
jpayne@69 1842 one-of(4,7,10)
jpayne@69 1843 >>> int(p)
jpayne@69 1844 4
jpayne@69 1845
jpayne@69 1846 """
jpayne@69 1847 if offset != 0 and offset != -1:
jpayne@69 1848 raise ValueError(
jpayne@69 1849 "To convert one-based indices to zero-based indices, offset must be either 0 (for end positions) or -1 (for start positions)."
jpayne@69 1850 )
jpayne@69 1851 if text == "?":
jpayne@69 1852 return UnknownPosition()
jpayne@69 1853 if text.startswith("?"):
jpayne@69 1854 return UncertainPosition(int(text[1:]) + offset)
jpayne@69 1855 if text.startswith("<"):
jpayne@69 1856 return BeforePosition(int(text[1:]) + offset)
jpayne@69 1857 if text.startswith(">"):
jpayne@69 1858 return AfterPosition(int(text[1:]) + offset)
jpayne@69 1859 m = _re_within_position.match(text)
jpayne@69 1860 if m is not None:
jpayne@69 1861 s, e = m.groups()
jpayne@69 1862 s = int(s) + offset
jpayne@69 1863 e = int(e) + offset
jpayne@69 1864 if offset == -1:
jpayne@69 1865 default = s
jpayne@69 1866 else:
jpayne@69 1867 default = e
jpayne@69 1868 return WithinPosition(default, left=s, right=e)
jpayne@69 1869 m = _re_oneof_position.match(text)
jpayne@69 1870 if m is not None:
jpayne@69 1871 positions = m.groups()[0]
jpayne@69 1872 parts = [ExactPosition(int(pos) + offset) for pos in positions.split(",")]
jpayne@69 1873 if offset == -1:
jpayne@69 1874 default = min(int(pos) for pos in parts)
jpayne@69 1875 else:
jpayne@69 1876 default = max(int(pos) for pos in parts)
jpayne@69 1877 return OneOfPosition(default, choices=parts)
jpayne@69 1878 return ExactPosition(int(text) + offset)
jpayne@69 1879
jpayne@69 1880
jpayne@69 1881 class ExactPosition(int, Position):
jpayne@69 1882 """Specify the specific position of a boundary.
jpayne@69 1883
jpayne@69 1884 Arguments:
jpayne@69 1885 - position - The position of the boundary.
jpayne@69 1886 - extension - An optional argument which must be zero since we don't
jpayne@69 1887 have an extension. The argument is provided so that the same number
jpayne@69 1888 of arguments can be passed to all position types.
jpayne@69 1889
jpayne@69 1890 In this case, there is no fuzziness associated with the position.
jpayne@69 1891
jpayne@69 1892 >>> p = ExactPosition(5)
jpayne@69 1893 >>> p
jpayne@69 1894 ExactPosition(5)
jpayne@69 1895 >>> print(p)
jpayne@69 1896 5
jpayne@69 1897
jpayne@69 1898 >>> isinstance(p, Position)
jpayne@69 1899 True
jpayne@69 1900 >>> isinstance(p, int)
jpayne@69 1901 True
jpayne@69 1902
jpayne@69 1903 Integer comparisons and operations should work as expected:
jpayne@69 1904
jpayne@69 1905 >>> p == 5
jpayne@69 1906 True
jpayne@69 1907 >>> p < 6
jpayne@69 1908 True
jpayne@69 1909 >>> p <= 5
jpayne@69 1910 True
jpayne@69 1911 >>> p + 10
jpayne@69 1912 ExactPosition(15)
jpayne@69 1913
jpayne@69 1914 """
jpayne@69 1915
jpayne@69 1916 def __new__(cls, position, extension=0):
jpayne@69 1917 """Create an ExactPosition object."""
jpayne@69 1918 if extension != 0:
jpayne@69 1919 raise AttributeError(f"Non-zero extension {extension} for exact position.")
jpayne@69 1920 return int.__new__(cls, position)
jpayne@69 1921
jpayne@69 1922 # Must define this on Python 3.8 onwards because we redefine __repr__
jpayne@69 1923 def __str__(self):
jpayne@69 1924 """Return a representation of the ExactPosition object (with python counting)."""
jpayne@69 1925 return str(int(self))
jpayne@69 1926
jpayne@69 1927 def __repr__(self):
jpayne@69 1928 """Represent the ExactPosition object as a string for debugging."""
jpayne@69 1929 return "%s(%i)" % (self.__class__.__name__, int(self))
jpayne@69 1930
jpayne@69 1931 def __add__(self, offset):
jpayne@69 1932 """Return a copy of the position object with its location shifted (PRIVATE)."""
jpayne@69 1933 # By default preserve any subclass
jpayne@69 1934 return self.__class__(int(self) + offset)
jpayne@69 1935
jpayne@69 1936 def _flip(self, length):
jpayne@69 1937 """Return a copy of the location after the parent is reversed (PRIVATE)."""
jpayne@69 1938 # By default preserve any subclass
jpayne@69 1939 return self.__class__(length - int(self))
jpayne@69 1940
jpayne@69 1941
jpayne@69 1942 class UncertainPosition(ExactPosition):
jpayne@69 1943 """Specify a specific position which is uncertain.
jpayne@69 1944
jpayne@69 1945 This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
jpayne@69 1946 XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.
jpayne@69 1947 """
jpayne@69 1948
jpayne@69 1949
jpayne@69 1950 class UnknownPosition(Position):
jpayne@69 1951 """Specify a specific position which is unknown (has no position).
jpayne@69 1952
jpayne@69 1953 This is used in UniProt, e.g. ? or in the XML as unknown.
jpayne@69 1954 """
jpayne@69 1955
jpayne@69 1956 def __repr__(self):
jpayne@69 1957 """Represent the UnknownPosition object as a string for debugging."""
jpayne@69 1958 return f"{self.__class__.__name__}()"
jpayne@69 1959
jpayne@69 1960 def __hash__(self):
jpayne@69 1961 """Return the hash value of the UnknownPosition object."""
jpayne@69 1962 return hash(None)
jpayne@69 1963
jpayne@69 1964 def __add__(self, offset):
jpayne@69 1965 """Return a copy of the position object with its location shifted (PRIVATE)."""
jpayne@69 1966 return self
jpayne@69 1967
jpayne@69 1968 def _flip(self, length):
jpayne@69 1969 """Return a copy of the location after the parent is reversed (PRIVATE)."""
jpayne@69 1970 return self
jpayne@69 1971
jpayne@69 1972
jpayne@69 1973 class WithinPosition(int, Position):
jpayne@69 1974 """Specify the position of a boundary within some coordinates.
jpayne@69 1975
jpayne@69 1976 Arguments:
jpayne@69 1977 - position - The default integer position
jpayne@69 1978 - left - The start (left) position of the boundary
jpayne@69 1979 - right - The end (right) position of the boundary
jpayne@69 1980
jpayne@69 1981 This allows dealing with a location like ((11.14)..100). This
jpayne@69 1982 indicates that the start of the sequence is somewhere between 11
jpayne@69 1983 and 14. Since this is a start coordinate, it should act like
jpayne@69 1984 it is at position 11 (or in Python counting, 10).
jpayne@69 1985
jpayne@69 1986 >>> p = WithinPosition(10, 10, 13)
jpayne@69 1987 >>> p
jpayne@69 1988 WithinPosition(10, left=10, right=13)
jpayne@69 1989 >>> print(p)
jpayne@69 1990 (10.13)
jpayne@69 1991 >>> int(p)
jpayne@69 1992 10
jpayne@69 1993
jpayne@69 1994 Basic integer comparisons and operations should work as though
jpayne@69 1995 this were a plain integer:
jpayne@69 1996
jpayne@69 1997 >>> p == 10
jpayne@69 1998 True
jpayne@69 1999 >>> p in [9, 10, 11]
jpayne@69 2000 True
jpayne@69 2001 >>> p < 11
jpayne@69 2002 True
jpayne@69 2003 >>> p + 10
jpayne@69 2004 WithinPosition(20, left=20, right=23)
jpayne@69 2005
jpayne@69 2006 >>> isinstance(p, WithinPosition)
jpayne@69 2007 True
jpayne@69 2008 >>> isinstance(p, Position)
jpayne@69 2009 True
jpayne@69 2010 >>> isinstance(p, int)
jpayne@69 2011 True
jpayne@69 2012
jpayne@69 2013 Note this also applies for comparison to other position objects,
jpayne@69 2014 where again the integer behavior is used:
jpayne@69 2015
jpayne@69 2016 >>> p == 10
jpayne@69 2017 True
jpayne@69 2018 >>> p == ExactPosition(10)
jpayne@69 2019 True
jpayne@69 2020 >>> p == BeforePosition(10)
jpayne@69 2021 True
jpayne@69 2022 >>> p == AfterPosition(10)
jpayne@69 2023 True
jpayne@69 2024
jpayne@69 2025 If this were an end point, you would want the position to be 13
jpayne@69 2026 (the right/larger value, not the left/smaller value as above):
jpayne@69 2027
jpayne@69 2028 >>> p2 = WithinPosition(13, 10, 13)
jpayne@69 2029 >>> p2
jpayne@69 2030 WithinPosition(13, left=10, right=13)
jpayne@69 2031 >>> print(p2)
jpayne@69 2032 (10.13)
jpayne@69 2033 >>> int(p2)
jpayne@69 2034 13
jpayne@69 2035 >>> p2 == 13
jpayne@69 2036 True
jpayne@69 2037 >>> p2 == ExactPosition(13)
jpayne@69 2038 True
jpayne@69 2039
jpayne@69 2040 """
jpayne@69 2041
jpayne@69 2042 def __new__(cls, position, left, right):
jpayne@69 2043 """Create a WithinPosition object."""
jpayne@69 2044 if not (position == left or position == right):
jpayne@69 2045 raise RuntimeError(
jpayne@69 2046 "WithinPosition: %r should match left %r or "
jpayne@69 2047 "right %r" % (position, left, right)
jpayne@69 2048 )
jpayne@69 2049 obj = int.__new__(cls, position)
jpayne@69 2050 obj._left = left
jpayne@69 2051 obj._right = right
jpayne@69 2052 return obj
jpayne@69 2053
jpayne@69 2054 def __getnewargs__(self):
jpayne@69 2055 """Return the arguments accepted by __new__.
jpayne@69 2056
jpayne@69 2057 Necessary to allow pickling and unpickling of class instances.
jpayne@69 2058 """
jpayne@69 2059 return (int(self), self._left, self._right)
jpayne@69 2060
jpayne@69 2061 def __repr__(self):
jpayne@69 2062 """Represent the WithinPosition object as a string for debugging."""
jpayne@69 2063 return "%s(%i, left=%i, right=%i)" % (
jpayne@69 2064 self.__class__.__name__,
jpayne@69 2065 int(self),
jpayne@69 2066 self._left,
jpayne@69 2067 self._right,
jpayne@69 2068 )
jpayne@69 2069
jpayne@69 2070 def __str__(self):
jpayne@69 2071 """Return a representation of the WithinPosition object (with python counting)."""
jpayne@69 2072 return f"({self._left}.{self._right})"
jpayne@69 2073
jpayne@69 2074 def __add__(self, offset):
jpayne@69 2075 """Return a copy of the position object with its location shifted."""
jpayne@69 2076 return self.__class__(
jpayne@69 2077 int(self) + offset, self._left + offset, self._right + offset
jpayne@69 2078 )
jpayne@69 2079
jpayne@69 2080 def _flip(self, length):
jpayne@69 2081 """Return a copy of the location after the parent is reversed (PRIVATE)."""
jpayne@69 2082 return self.__class__(
jpayne@69 2083 length - int(self), length - self._right, length - self._left
jpayne@69 2084 )
jpayne@69 2085
jpayne@69 2086
jpayne@69 2087 class BetweenPosition(int, Position):
jpayne@69 2088 """Specify the position of a boundary between two coordinates (OBSOLETE?).
jpayne@69 2089
jpayne@69 2090 Arguments:
jpayne@69 2091 - position - The default integer position
jpayne@69 2092 - left - The start (left) position of the boundary
jpayne@69 2093 - right - The end (right) position of the boundary
jpayne@69 2094
jpayne@69 2095 This allows dealing with a position like 123^456. This
jpayne@69 2096 indicates that the start of the sequence is somewhere between
jpayne@69 2097 123 and 456. It is up to the parser to set the position argument
jpayne@69 2098 to either boundary point (depending on if this is being used as
jpayne@69 2099 a start or end of the feature). For example as a feature end:
jpayne@69 2100
jpayne@69 2101 >>> p = BetweenPosition(456, 123, 456)
jpayne@69 2102 >>> p
jpayne@69 2103 BetweenPosition(456, left=123, right=456)
jpayne@69 2104 >>> print(p)
jpayne@69 2105 (123^456)
jpayne@69 2106 >>> int(p)
jpayne@69 2107 456
jpayne@69 2108
jpayne@69 2109 Integer equality and comparison use the given position,
jpayne@69 2110
jpayne@69 2111 >>> p == 456
jpayne@69 2112 True
jpayne@69 2113 >>> p in [455, 456, 457]
jpayne@69 2114 True
jpayne@69 2115 >>> p > 300
jpayne@69 2116 True
jpayne@69 2117
jpayne@69 2118 The old legacy properties of position and extension give the
jpayne@69 2119 starting/lower/left position as an integer, and the distance
jpayne@69 2120 to the ending/higher/right position as an integer. Note that
jpayne@69 2121 the position object will act like either the left or the right
jpayne@69 2122 end-point depending on how it was created:
jpayne@69 2123
jpayne@69 2124 >>> p2 = BetweenPosition(123, left=123, right=456)
jpayne@69 2125 >>> int(p) == int(p2)
jpayne@69 2126 False
jpayne@69 2127 >>> p == 456
jpayne@69 2128 True
jpayne@69 2129 >>> p2 == 123
jpayne@69 2130 True
jpayne@69 2131
jpayne@69 2132 Note this potentially surprising behavior:
jpayne@69 2133
jpayne@69 2134 >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
jpayne@69 2135 True
jpayne@69 2136 >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
jpayne@69 2137 True
jpayne@69 2138 >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
jpayne@69 2139 True
jpayne@69 2140
jpayne@69 2141 i.e. For equality (and sorting) the position objects behave like
jpayne@69 2142 integers.
jpayne@69 2143
jpayne@69 2144 """
jpayne@69 2145
jpayne@69 2146 def __new__(cls, position, left, right):
jpayne@69 2147 """Create a new instance in BetweenPosition object."""
jpayne@69 2148 assert position == left or position == right
jpayne@69 2149 # TODO - public API for getting left/right, especially the unknown one
jpayne@69 2150 obj = int.__new__(cls, position)
jpayne@69 2151 obj._left = left
jpayne@69 2152 obj._right = right
jpayne@69 2153 return obj
jpayne@69 2154
jpayne@69 2155 def __getnewargs__(self):
jpayne@69 2156 """Return the arguments accepted by __new__.
jpayne@69 2157
jpayne@69 2158 Necessary to allow pickling and unpickling of class instances.
jpayne@69 2159 """
jpayne@69 2160 return (int(self), self._left, self._right)
jpayne@69 2161
jpayne@69 2162 def __repr__(self):
jpayne@69 2163 """Represent the BetweenPosition object as a string for debugging."""
jpayne@69 2164 return "%s(%i, left=%i, right=%i)" % (
jpayne@69 2165 self.__class__.__name__,
jpayne@69 2166 int(self),
jpayne@69 2167 self._left,
jpayne@69 2168 self._right,
jpayne@69 2169 )
jpayne@69 2170
jpayne@69 2171 def __str__(self):
jpayne@69 2172 """Return a representation of the BetweenPosition object (with python counting)."""
jpayne@69 2173 return f"({self._left}^{self._right})"
jpayne@69 2174
jpayne@69 2175 def __add__(self, offset):
jpayne@69 2176 """Return a copy of the position object with its location shifted (PRIVATE)."""
jpayne@69 2177 return self.__class__(
jpayne@69 2178 int(self) + offset, self._left + offset, self._right + offset
jpayne@69 2179 )
jpayne@69 2180
jpayne@69 2181 def _flip(self, length):
jpayne@69 2182 """Return a copy of the location after the parent is reversed (PRIVATE)."""
jpayne@69 2183 return self.__class__(
jpayne@69 2184 length - int(self), length - self._right, length - self._left
jpayne@69 2185 )
jpayne@69 2186
jpayne@69 2187
jpayne@69 2188 class BeforePosition(int, Position):
jpayne@69 2189 """Specify a position where the actual location occurs before it.
jpayne@69 2190
jpayne@69 2191 Arguments:
jpayne@69 2192 - position - The upper boundary of where the location can occur.
jpayne@69 2193 - extension - An optional argument which must be zero since we don't
jpayne@69 2194 have an extension. The argument is provided so that the same number
jpayne@69 2195 of arguments can be passed to all position types.
jpayne@69 2196
jpayne@69 2197 This is used to specify positions like (<10..100) where the location
jpayne@69 2198 occurs somewhere before position 10.
jpayne@69 2199
jpayne@69 2200 >>> p = BeforePosition(5)
jpayne@69 2201 >>> p
jpayne@69 2202 BeforePosition(5)
jpayne@69 2203 >>> print(p)
jpayne@69 2204 <5
jpayne@69 2205 >>> int(p)
jpayne@69 2206 5
jpayne@69 2207 >>> p + 10
jpayne@69 2208 BeforePosition(15)
jpayne@69 2209
jpayne@69 2210 Note this potentially surprising behavior:
jpayne@69 2211
jpayne@69 2212 >>> p == ExactPosition(5)
jpayne@69 2213 True
jpayne@69 2214 >>> p == AfterPosition(5)
jpayne@69 2215 True
jpayne@69 2216
jpayne@69 2217 Just remember that for equality and sorting the position objects act
jpayne@69 2218 like integers.
jpayne@69 2219 """
jpayne@69 2220
jpayne@69 2221 # Subclasses int so can't use __init__
jpayne@69 2222 def __new__(cls, position, extension=0):
jpayne@69 2223 """Create a new instance in BeforePosition object."""
jpayne@69 2224 if extension != 0:
jpayne@69 2225 raise AttributeError(f"Non-zero extension {extension} for exact position.")
jpayne@69 2226 return int.__new__(cls, position)
jpayne@69 2227
jpayne@69 2228 def __repr__(self):
jpayne@69 2229 """Represent the location as a string for debugging."""
jpayne@69 2230 return "%s(%i)" % (self.__class__.__name__, int(self))
jpayne@69 2231
jpayne@69 2232 def __str__(self):
jpayne@69 2233 """Return a representation of the BeforePosition object (with python counting)."""
jpayne@69 2234 return f"<{int(self)}"
jpayne@69 2235
jpayne@69 2236 def __add__(self, offset):
jpayne@69 2237 """Return a copy of the position object with its location shifted (PRIVATE)."""
jpayne@69 2238 return self.__class__(int(self) + offset)
jpayne@69 2239
jpayne@69 2240 def _flip(self, length):
jpayne@69 2241 """Return a copy of the location after the parent is reversed (PRIVATE)."""
jpayne@69 2242 return AfterPosition(length - int(self))
jpayne@69 2243
jpayne@69 2244
jpayne@69 2245 class AfterPosition(int, Position):
jpayne@69 2246 """Specify a position where the actual location is found after it.
jpayne@69 2247
jpayne@69 2248 Arguments:
jpayne@69 2249 - position - The lower boundary of where the location can occur.
jpayne@69 2250 - extension - An optional argument which must be zero since we don't
jpayne@69 2251 have an extension. The argument is provided so that the same number
jpayne@69 2252 of arguments can be passed to all position types.
jpayne@69 2253
jpayne@69 2254 This is used to specify positions like (>10..100) where the location
jpayne@69 2255 occurs somewhere after position 10.
jpayne@69 2256
jpayne@69 2257 >>> p = AfterPosition(7)
jpayne@69 2258 >>> p
jpayne@69 2259 AfterPosition(7)
jpayne@69 2260 >>> print(p)
jpayne@69 2261 >7
jpayne@69 2262 >>> int(p)
jpayne@69 2263 7
jpayne@69 2264 >>> p + 10
jpayne@69 2265 AfterPosition(17)
jpayne@69 2266
jpayne@69 2267 >>> isinstance(p, AfterPosition)
jpayne@69 2268 True
jpayne@69 2269 >>> isinstance(p, Position)
jpayne@69 2270 True
jpayne@69 2271 >>> isinstance(p, int)
jpayne@69 2272 True
jpayne@69 2273
jpayne@69 2274 Note this potentially surprising behavior:
jpayne@69 2275
jpayne@69 2276 >>> p == ExactPosition(7)
jpayne@69 2277 True
jpayne@69 2278 >>> p == BeforePosition(7)
jpayne@69 2279 True
jpayne@69 2280
jpayne@69 2281 Just remember that for equality and sorting the position objects act
jpayne@69 2282 like integers.
jpayne@69 2283 """
jpayne@69 2284
jpayne@69 2285 # Subclasses int so can't use __init__
jpayne@69 2286 def __new__(cls, position, extension=0):
jpayne@69 2287 """Create a new instance of the AfterPosition object."""
jpayne@69 2288 if extension != 0:
jpayne@69 2289 raise AttributeError(f"Non-zero extension {extension} for exact position.")
jpayne@69 2290 return int.__new__(cls, position)
jpayne@69 2291
jpayne@69 2292 def __repr__(self):
jpayne@69 2293 """Represent the location as a string for debugging."""
jpayne@69 2294 return "%s(%i)" % (self.__class__.__name__, int(self))
jpayne@69 2295
jpayne@69 2296 def __str__(self):
jpayne@69 2297 """Return a representation of the AfterPosition object (with python counting)."""
jpayne@69 2298 return f">{int(self)}"
jpayne@69 2299
jpayne@69 2300 def __add__(self, offset):
jpayne@69 2301 """Return a copy of the position object with its location shifted (PRIVATE)."""
jpayne@69 2302 return self.__class__(int(self) + offset)
jpayne@69 2303
jpayne@69 2304 def _flip(self, length):
jpayne@69 2305 """Return a copy of the location after the parent is reversed (PRIVATE)."""
jpayne@69 2306 return BeforePosition(length - int(self))
jpayne@69 2307
jpayne@69 2308
jpayne@69 2309 class OneOfPosition(int, Position):
jpayne@69 2310 """Specify a position where the location can be multiple positions.
jpayne@69 2311
jpayne@69 2312 This models the GenBank 'one-of(1888,1901)' function, and tries
jpayne@69 2313 to make this fit within the Biopython Position models. If this was
jpayne@69 2314 a start position it should act like 1888, but as an end position 1901.
jpayne@69 2315
jpayne@69 2316 >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
jpayne@69 2317 >>> p
jpayne@69 2318 OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
jpayne@69 2319 >>> int(p)
jpayne@69 2320 1888
jpayne@69 2321
jpayne@69 2322 Integer comparisons and operators act like using int(p),
jpayne@69 2323
jpayne@69 2324 >>> p == 1888
jpayne@69 2325 True
jpayne@69 2326 >>> p <= 1888
jpayne@69 2327 True
jpayne@69 2328 >>> p > 1888
jpayne@69 2329 False
jpayne@69 2330 >>> p + 100
jpayne@69 2331 OneOfPosition(1988, choices=[ExactPosition(1988), ExactPosition(2001)])
jpayne@69 2332
jpayne@69 2333 >>> isinstance(p, OneOfPosition)
jpayne@69 2334 True
jpayne@69 2335 >>> isinstance(p, Position)
jpayne@69 2336 True
jpayne@69 2337 >>> isinstance(p, int)
jpayne@69 2338 True
jpayne@69 2339
jpayne@69 2340 """
jpayne@69 2341
jpayne@69 2342 def __new__(cls, position, choices):
jpayne@69 2343 """Initialize with a set of possible positions.
jpayne@69 2344
jpayne@69 2345 choices is a list of Position derived objects, specifying possible
jpayne@69 2346 locations.
jpayne@69 2347
jpayne@69 2348 position is an integer specifying the default behavior.
jpayne@69 2349 """
jpayne@69 2350 if position not in choices:
jpayne@69 2351 raise ValueError(
jpayne@69 2352 f"OneOfPosition: {position!r} should match one of {choices!r}"
jpayne@69 2353 )
jpayne@69 2354 obj = int.__new__(cls, position)
jpayne@69 2355 obj.position_choices = choices
jpayne@69 2356 return obj
jpayne@69 2357
jpayne@69 2358 def __getnewargs__(self):
jpayne@69 2359 """Return the arguments accepted by __new__.
jpayne@69 2360
jpayne@69 2361 Necessary to allow pickling and unpickling of class instances.
jpayne@69 2362 """
jpayne@69 2363 return (int(self), self.position_choices)
jpayne@69 2364
jpayne@69 2365 def __repr__(self):
jpayne@69 2366 """Represent the OneOfPosition object as a string for debugging."""
jpayne@69 2367 return "%s(%i, choices=%r)" % (
jpayne@69 2368 self.__class__.__name__,
jpayne@69 2369 int(self),
jpayne@69 2370 self.position_choices,
jpayne@69 2371 )
jpayne@69 2372
jpayne@69 2373 def __str__(self):
jpayne@69 2374 """Return a representation of the OneOfPosition object (with python counting)."""
jpayne@69 2375 out = "one-of("
jpayne@69 2376 for position in self.position_choices:
jpayne@69 2377 out += f"{position},"
jpayne@69 2378 # replace the last comma with the closing parenthesis
jpayne@69 2379 return out[:-1] + ")"
jpayne@69 2380
jpayne@69 2381 def __add__(self, offset):
jpayne@69 2382 """Return a copy of the position object with its location shifted (PRIVATE)."""
jpayne@69 2383 return self.__class__(
jpayne@69 2384 int(self) + offset, [p + offset for p in self.position_choices]
jpayne@69 2385 )
jpayne@69 2386
jpayne@69 2387 def _flip(self, length):
jpayne@69 2388 """Return a copy of the location after the parent is reversed (PRIVATE)."""
jpayne@69 2389 return self.__class__(
jpayne@69 2390 length - int(self), [p._flip(length) for p in self.position_choices[::-1]]
jpayne@69 2391 )
jpayne@69 2392
jpayne@69 2393
jpayne@69 2394 if __name__ == "__main__":
jpayne@69 2395 from Bio._utils import run_doctest
jpayne@69 2396
jpayne@69 2397 run_doctest()