annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/SeqFeature.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # Copyright 2000-2003 Jeff Chang.
jpayne@68 2 # Copyright 2001-2008 Brad Chapman.
jpayne@68 3 # Copyright 2005-2024 by Peter Cock.
jpayne@68 4 # Copyright 2006-2009 Michiel de Hoon.
jpayne@68 5 # All rights reserved.
jpayne@68 6 #
jpayne@68 7 # This file is part of the Biopython distribution and governed by your
jpayne@68 8 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@68 9 # Please see the LICENSE file that should have been included as part of this
jpayne@68 10 # package.
jpayne@68 11 """Represent a Sequence Feature holding info about a part of a sequence.
jpayne@68 12
jpayne@68 13 This is heavily modeled after the Biocorba SeqFeature objects, and
jpayne@68 14 may be pretty biased towards GenBank stuff since I'm writing it
jpayne@68 15 for the GenBank parser output...
jpayne@68 16
jpayne@68 17 What's here:
jpayne@68 18
jpayne@68 19 Base class to hold a Feature
jpayne@68 20 ----------------------------
jpayne@68 21
jpayne@68 22 Classes:
jpayne@68 23 - SeqFeature
jpayne@68 24
jpayne@68 25 Hold information about a Reference
jpayne@68 26 ----------------------------------
jpayne@68 27
jpayne@68 28 This is an attempt to create a General class to hold Reference type
jpayne@68 29 information.
jpayne@68 30
jpayne@68 31 Classes:
jpayne@68 32 - Reference
jpayne@68 33
jpayne@68 34 Specify locations of a feature on a Sequence
jpayne@68 35 --------------------------------------------
jpayne@68 36
jpayne@68 37 This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'.
jpayne@68 38 This has the advantages of allowing us to handle fuzzy stuff in case anyone
jpayne@68 39 needs it, and also be compatible with BioPerl etc and BioSQL.
jpayne@68 40
jpayne@68 41 Classes:
jpayne@68 42 - Location - abstract base class of SimpleLocation and CompoundLocation.
jpayne@68 43 - SimpleLocation - Specify the start and end location of a feature.
jpayne@68 44 - CompoundLocation - Collection of SimpleLocation objects (for joins etc).
jpayne@68 45 - Position - abstract base class of ExactPosition, WithinPosition,
jpayne@68 46 BetweenPosition, BeforePosition, AfterPosition, OneOfPosition,
jpayne@68 47 UncertainPosition, and UnknownPosition.
jpayne@68 48 - ExactPosition - Specify the position as being exact.
jpayne@68 49 - WithinPosition - Specify a position occurring within some range.
jpayne@68 50 - BetweenPosition - Specify a position occurring between a range (OBSOLETE?).
jpayne@68 51 - BeforePosition - Specify the position as being found before some base.
jpayne@68 52 - AfterPosition - Specify the position as being found after some base.
jpayne@68 53 - OneOfPosition - Specify a position consisting of multiple alternative positions.
jpayne@68 54 - UncertainPosition - Specify a specific position which is uncertain.
jpayne@68 55 - UnknownPosition - Represents missing information like '?' in UniProt.
jpayne@68 56
jpayne@68 57
jpayne@68 58 Exceptions:
jpayne@68 59 - LocationParserError - Exception indicating a failure to parse a location
jpayne@68 60 string.
jpayne@68 61
jpayne@68 62 """
jpayne@68 63 import functools
jpayne@68 64 import re
jpayne@68 65 import warnings
jpayne@68 66 from abc import ABC, abstractmethod
jpayne@68 67
jpayne@68 68 from Bio import BiopythonDeprecationWarning
jpayne@68 69 from Bio import BiopythonParserWarning
jpayne@68 70 from Bio.Seq import MutableSeq
jpayne@68 71 from Bio.Seq import reverse_complement
jpayne@68 72 from Bio.Seq import Seq
jpayne@68 73
jpayne@68 74
# Regular expressions for location parsing
#
# The patterns are assembled from small fragments. Several fragment names are
# deliberately re-bound further down to a variant with capture groups once the
# group-free variant has been embedded in a larger pattern.

# Optional "name:" prefix in front of a location, e.g. "NC_016402.1:".
_reference = r"(?:[a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)"

# "one-of(a,b,...)" with at least two comma separated integers.  The choices
# are matched as a repeated ",<digits>" group; a character class such as
# [,\d+] would also accept "one-of(33)", "one-of(3,,6)" or a literal "+".
_oneof_position = r"one\-of\(\d+(?:,\d+)+\)"

_oneof_location = rf"[<>]?(?:\d+|{_oneof_position})\.\.[<>]?(?:\d+|{_oneof_position})"

_any_location = rf"({_reference}?{_oneof_location}|complement\({_oneof_location}\)|[^,]+|complement\([^,]+\))"

# re.split() with a single capture group returns the separators at the even
# indices and the captured locations at the odd ones, hence the [1::2] below.
_split = re.compile(_any_location).split

# Import-time self checks of the splitter (skipped when running with -O).
assert _split("123..145")[1::2] == ["123..145"]
assert _split("123..145,200..209")[1::2] == ["123..145", "200..209"]
assert _split("one-of(200,203)..300")[1::2] == ["one-of(200,203)..300"]
assert _split("complement(123..145),200..209")[1::2] == [
    "complement(123..145)",
    "200..209",
]
assert _split("123..145,one-of(200,203)..209")[1::2] == [
    "123..145",
    "one-of(200,203)..209",
]
assert _split("123..145,one-of(200,203)..one-of(209,211),300")[1::2] == [
    "123..145",
    "one-of(200,203)..one-of(209,211)",
    "300",
]
assert _split("123..145,complement(one-of(200,203)..one-of(209,211)),300")[1::2] == [
    "123..145",
    "complement(one-of(200,203)..one-of(209,211))",
    "300",
]
assert _split("123..145,200..one-of(209,211),300")[1::2] == [
    "123..145",
    "200..one-of(209,211)",
    "300",
]
assert _split("123..145,200..one-of(209,211)")[1::2] == [
    "123..145",
    "200..one-of(209,211)",
]
assert _split(
    "complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"
)[1::2] == [
    "complement(149815..150200)",
    "complement(293787..295573)",
    "NC_016402.1:6618..6676",
    "181647..181905",
]


# start..end with optional "<" / ">" markers and optional minus signs.
_pair_location = r"[<>]?-?\d+\.\.[<>]?-?\d+"

# P^Q, a site between two positions.
_between_location = r"\d+\^\d+"

# "(a.b)" - first without capture groups for embedding in _within_location,
# then re-bound with groups for extracting the two numbers.
_within_position = r"\(\d+\.\d+\)"
_within_location = rf"([<>]?\d+|{_within_position})\.\.([<>]?\d+|{_within_position})"
_within_position = r"\((\d+)\.(\d+)\)"
_re_within_position = re.compile(_within_position)
assert _re_within_position.match("(3.9)")

# Same two-step approach for one-of: this _oneof_location captures each end,
# and the re-bound _oneof_position captures the comma separated choices.
_oneof_location = rf"([<>]?\d+|{_oneof_position})\.\.([<>]?\d+|{_oneof_position})"
_oneof_position = r"one\-of\((\d+(?:,\d+)+)\)"
_re_oneof_position = re.compile(_oneof_position)
assert _re_oneof_position.match("one-of(6,9)")
assert not _re_oneof_position.match("one-of(3)")
assert not _re_oneof_position.match("one-of(33)")
assert _re_oneof_position.match("one-of(3,6)")
assert _re_oneof_position.match("one-of(3,6,9)")

_solo_location = r"[<>]?\d+"
_solo_bond = rf"bond\({_solo_location}\)"

# One named group per location category.
# NOTE(review): because "|" has the lowest precedence, "^" applies only to the
# <pair> alternative and "$" only to <solo>.  Left unchanged here since the
# callers are outside this block - confirm whether full anchoring was intended.
_re_location_category = re.compile(
    rf"^(?P<pair>{_pair_location})"
    rf"|(?P<between>{_between_location})"
    rf"|(?P<within>{_within_location})"
    rf"|(?P<oneof>{_oneof_location})"
    rf"|(?P<bond>{_solo_bond})"
    rf"|(?P<solo>{_solo_location})$"
)
jpayne@68 161
jpayne@68 162
class LocationParserError(ValueError):
    """Signal that a feature location string could not be parsed.

    Subclasses ValueError, so code catching ValueError also catches this.
    """
jpayne@68 165
jpayne@68 166
class SeqFeature:
    """Represent a Sequence Feature on an object.

    Attributes:
     - location - the location of the feature on the sequence
       (SimpleLocation or CompoundLocation, or None)
     - type - the specified type of the feature (ie. CDS, exon, repeat...)
     - id - A string identifier for the feature.
     - qualifiers - A dictionary of qualifiers on the feature. These are
       analogous to the qualifiers from a GenBank feature table. The keys of
       the dictionary are qualifier names, the values are the qualifier
       values.

    """

    def __init__(
        self,
        location=None,
        type="",
        id="<unknown id>",
        qualifiers=None,
        sub_features=None,
    ):
        """Initialize a SeqFeature on a sequence.

        location can either be a SimpleLocation (with strand argument also
        given if required), a CompoundLocation, or None. Anything else
        raises a TypeError.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f1 = SeqFeature(SimpleLocation(5, 10), type="domain")
        >>> f1.location.strand == None
        True
        >>> f2 = SeqFeature(SimpleLocation(7, 110, strand=1), type="CDS")
        >>> f2.location.strand == +1
        True
        >>> f3 = SeqFeature(SimpleLocation(9, 108, strand=-1), type="CDS")
        >>> f3.location.strand == -1
        True

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        SimpleLocation must be specified via the appropriate position objects.

        The qualifiers argument, if given, is copied into a new dictionary.
        The sub_features argument is no longer supported; passing anything
        other than None raises a TypeError.
        """
        if location is not None and not isinstance(
            location, (SimpleLocation, CompoundLocation)
        ):
            raise TypeError(
                "SimpleLocation, CompoundLocation (or None) required for the location"
            )
        self.location = location
        self.type = type
        self.id = id
        # Always store a fresh dict so the caller's mapping is never shared.
        self.qualifiers = {}
        if qualifiers is not None:
            self.qualifiers.update(qualifiers)
        if sub_features is not None:
            raise TypeError("Rather than sub_features, use a CompoundLocation")

    def _get_strand(self):
        """Get function for the strand property (PRIVATE)."""
        warnings.warn(
            "Please use .location.strand rather than .strand",
            BiopythonDeprecationWarning,
        )
        return self.location.strand

    def _set_strand(self, value):
        """Set function for the strand property (PRIVATE)."""
        warnings.warn(
            "Please use .location.strand rather than .strand",
            BiopythonDeprecationWarning,
        )
        try:
            self.location.strand = value
        except AttributeError:
            # With no location, setting None is a harmless no-op; anything
            # else cannot be stored anywhere.
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set strand without a location.") from None
            else:
                raise

    strand = property(
        fget=_get_strand,
        fset=_set_strand,
        doc="Alias for the location's strand (DEPRECATED).",
    )

    def _get_ref(self):
        """Get function for the reference property (PRIVATE)."""
        warnings.warn(
            "Please use .location.ref rather than .ref",
            BiopythonDeprecationWarning,
        )
        try:
            return self.location.ref
        except AttributeError:
            return None

    def _set_ref(self, value):
        """Set function for the reference property (PRIVATE)."""
        warnings.warn(
            "Please use .location.ref rather than .ref",
            BiopythonDeprecationWarning,
        )
        try:
            self.location.ref = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref without a location.") from None
            else:
                raise

    ref = property(
        fget=_get_ref,
        fset=_set_ref,
        doc="Alias for the location's ref (DEPRECATED).",
    )

    def _get_ref_db(self):
        """Get function for the database reference property (PRIVATE)."""
        warnings.warn(
            "Please use .location.ref_db rather than .ref_db",
            BiopythonDeprecationWarning,
        )
        try:
            return self.location.ref_db
        except AttributeError:
            return None

    def _set_ref_db(self, value):
        """Set function for the database reference property (PRIVATE)."""
        warnings.warn(
            "Please use .location.ref_db rather than .ref_db",
            BiopythonDeprecationWarning,
        )
        try:
            self.location.ref_db = value
        except AttributeError:
            # Same handling as _set_strand and _set_ref: without a location,
            # None is accepted silently and any other value is an error.
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref_db without a location.") from None
            else:
                raise

    ref_db = property(
        fget=_get_ref_db,
        fset=_set_ref_db,
        doc="Alias for the location's ref_db (DEPRECATED).",
    )

    def __eq__(self, other):
        """Check if two SeqFeature objects should be considered equal."""
        return (
            isinstance(other, SeqFeature)
            and self.id == other.id
            and self.type == other.type
            and self.location == other.location
            and self.qualifiers == other.qualifiers
        )

    def __repr__(self):
        """Represent the feature as a string for debugging."""
        answer = f"{self.__class__.__name__}({self.location!r}"
        if self.type:
            answer += f", type={self.type!r}"
        if self.id and self.id != "<unknown id>":
            answer += f", id={self.id!r}"
        if self.qualifiers:
            # Qualifier contents are elided to keep the repr short.
            answer += ", qualifiers=..."
        answer += ")"
        return answer

    def __str__(self):
        """Return the full feature as a python string."""
        pieces = [f"type: {self.type}\n", f"location: {self.location}\n"]
        if self.id and self.id != "<unknown id>":
            pieces.append(f"id: {self.id}\n")
        pieces.append("qualifiers:\n")
        # NOTE(review): the leading whitespace of the "Key:" literal may have
        # been collapsed in this copy of the file - confirm against upstream.
        for qual_key in sorted(self.qualifiers):
            pieces.append(f" Key: {qual_key}, Value: {self.qualifiers[qual_key]}\n")
        return "".join(pieces)

    def _shift(self, offset):
        """Return a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied.
        """
        return SeqFeature(
            location=self.location._shift(offset),
            type=self.type,
            id=self.id,
            qualifiers=self.qualifiers.copy(),
        )

    def _flip(self, length):
        """Return a copy of the feature with its location flipped (PRIVATE).

        The argument length gives the length of the parent sequence. For
        example a location 0..20 (+1 strand) with parent length 30 becomes
        after flipping 10..30 (-1 strand). Strandless (None) or unknown
        strand (0) remain like that - just their end points are changed.

        The annotation qualifiers are copied.
        """
        return SeqFeature(
            location=self.location._flip(length),
            type=self.type,
            id=self.id,
            qualifiers=self.qualifiers.copy(),
        )

    def extract(self, parent_sequence, references=None):
        """Extract the feature's sequence from supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted. If the
        location refers to other records, they must be supplied in the
        optional dictionary references.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC')

        If the SimpleLocation is None, e.g. when parsing invalid locus
        locations in the GenBank parser, extract() will raise a ValueError.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(None, type="domain")
        >>> f.extract(seq)
        Traceback (most recent call last):
           ...
        ValueError: The feature's .location is None. Check the sequence file for a valid location.

        Note - currently only compound features of type "join" are supported.
        """
        if self.location is None:
            raise ValueError(
                "The feature's .location is None. Check the "
                "sequence file for a valid location."
            )
        return self.location.extract(parent_sequence, references=references)

    def translate(
        self,
        parent_sequence,
        table="Standard",
        start_offset=None,
        stop_symbol="*",
        to_stop=False,
        cds=None,
        gap=None,
    ):
        """Get a translation of the feature's sequence.

        This method is intended for CDS or other features that code proteins
        and is a shortcut that will both extract the feature and
        translate it, taking into account the codon_start and transl_table
        qualifiers, if they are present. If they are not present the
        value of the arguments "table" and "start_offset" are used.

        The "cds" parameter is set to "True" if the feature is of type
        "CDS" but can be overridden by giving an explicit argument.

        The arguments stop_symbol, to_stop and gap have the same meaning
        as Seq.translate, refer to that documentation for further information.

        Arguments:
         - parent_sequence - A DNA or RNA sequence.
         - table - Which codon table to use if there is no transl_table
           qualifier for this feature. This can be either a name
           (string), an NCBI identifier (integer), or a CodonTable
           object (useful for non-standard genetic codes).  This
           defaults to the "Standard" table.
         - start_offset - offset at which the first complete codon of a
           coding feature can be found, relative to the first base of
           that feature. Has a valid value of 0, 1 or 2. NOTE: this
           uses python's 0-based numbering whereas the codon_start
           qualifier in files from NCBI use 1-based numbering.
           Will override a codon_start qualifier

        A ValueError is raised if the resulting start offset is not
        0, 1 or 2.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("GGTTACACTTACCGATAATGTCTCTGATGA")
        >>> f = SeqFeature(SimpleLocation(0, 30), type="CDS")
        >>> f.qualifiers['transl_table'] = [11]

        Note that features of type CDS are subject to the usual
        checks at translation. But you can override this behavior
        by giving explicit arguments:

        >>> f.translate(seq, cds=False)
        Seq('GYTYR*CL**')

        Now use the start_offset argument to change the frame. Note
        this uses python 0-based numbering.

        >>> f.translate(seq, start_offset=1, cds=False)
        Seq('VTLTDNVSD')

        Alternatively use the codon_start qualifier to do the same
        thing. Note: this uses 1-based numbering, which is found
        in files from NCBI.

        >>> f.qualifiers['codon_start'] = [2]
        >>> f.translate(seq, cds=False)
        Seq('VTLTDNVSD')
        """
        # see if this feature should be translated in a different
        # frame using the "codon_start" qualifier
        if start_offset is None:
            try:
                # codon_start is 1-based, start_offset is 0-based
                start_offset = int(self.qualifiers["codon_start"][0]) - 1
            except KeyError:
                start_offset = 0

        if start_offset not in [0, 1, 2]:
            raise ValueError(
                "The start_offset must be 0, 1, or 2. "
                f"The supplied value is {start_offset}. "
                "Check the value of either the codon_start qualifier "
                "or the start_offset argument"
            )

        feat_seq = self.extract(parent_sequence)[start_offset:]
        # A transl_table qualifier takes precedence over the table argument.
        codon_table = self.qualifiers.get("transl_table", [table])[0]

        if cds is None:
            cds = self.type == "CDS"

        return feat_seq.translate(
            table=codon_table,
            stop_symbol=stop_symbol,
            to_stop=to_stop,
            cds=cds,
            gap=gap,
        )

    def __bool__(self):
        """Boolean value of an instance of this class (True).

        This behavior is for backwards compatibility, since until the
        __len__ method was added, a SeqFeature always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SeqFeature may in future evaluate to False when its
        length is zero (in order to better match normal python behavior)!
        """
        return True

    def __len__(self):
        """Return the length of the region where the feature is located.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
        >>> len(f)
        7
        >>> f.extract(seq)
        Seq('VALIVIC')
        >>> len(f.extract(seq))
        7

        This is a proxy for taking the length of the feature's location:

        >>> len(f.location)
        7

        For simple features this is the same as the region spanned (end
        position minus start position using Pythonic counting). However, for
        a compound location (e.g. a CDS as the join of several exons) the
        gaps are not counted (e.g. introns). This ensures that len(f) matches
        len(f.extract(parent_seq)), and also makes sure things work properly
        with features wrapping the origin etc.
        """
        return len(self.location)

    def __iter__(self):
        """Iterate over the parent positions within the feature.

        The iteration order is strand aware, and can be thought of as moving
        along the feature using the parent sequence coordinates:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
        >>> len(f)
        5
        >>> for i in f: print(i)
        9
        8
        7
        6
        5
        >>> list(f)
        [9, 8, 7, 6, 5]

        This is a proxy for iterating over the location,

        >>> list(f.location)
        [9, 8, 7, 6, 5]
        """
        return iter(self.location)

    def __contains__(self, value):
        """Check if an integer position is within the feature.

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(15) if i in f]
        [5, 6, 7, 8, 9]

        For example, to see which features include a SNP position, you could
        use this:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
        >>> for f in record.features:
        ...     if 1750 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)
        tRNA join{[4310:4347](-), [1716:1751](-)}

        Note that for a feature defined as a join of several subfeatures (e.g.
        the union of several exons) the gaps are not checked (e.g. introns).
        In this example, the tRNA location is defined in the GenBank file as
        complement(join(1717..1751,4311..4347)), so that position 1760 falls
        in the gap:

        >>> for f in record.features:
        ...     if 1760 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)

        Note that additional care may be required with fuzzy locations, for
        example just before a BeforePosition:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> from Bio.SeqFeature import BeforePosition
        >>> f = SeqFeature(SimpleLocation(BeforePosition(3), 8), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(10) if i in f]
        [3, 4, 5, 6, 7]

        Note that this is a proxy for testing membership on the location.

        >>> [i for i in range(10) if i in f.location]
        [3, 4, 5, 6, 7]
        """
        return value in self.location
jpayne@68 632
jpayne@68 633
jpayne@68 634 # --- References
jpayne@68 635
jpayne@68 636
jpayne@68 637 # TODO -- Will this hold PubMed and Medline information decently?
class Reference:
    """Represent a Generic Reference object.

    Attributes:
     - location - A list of Location objects specifying regions of
       the sequence that the references correspond to. If no locations are
       specified, the entire sequence is assumed.
     - authors - A big old string, or a list split by author, of authors
       for the reference.
     - consrtm - Initialised to an empty string; only shown by str() when
       it has been set.
     - title - The title of the reference.
     - journal - Journal the reference was published in.
     - medline_id - A medline reference for the article.
     - pubmed_id - A pubmed reference for the article.
     - comment - A place to stick any comments about the reference.

    """

    def __init__(self):
        """Initialize the class."""
        self.location = []
        self.authors = ""
        self.consrtm = ""
        self.title = ""
        self.journal = ""
        self.medline_id = ""
        self.pubmed_id = ""
        self.comment = ""

    def __str__(self):
        """Return the full Reference object as a python string."""
        lines = [f"location: {single_location}\n" for single_location in self.location]
        lines.append(f"authors: {self.authors}\n")
        if self.consrtm:
            lines.append(f"consrtm: {self.consrtm}\n")
        lines.append(f"title: {self.title}\n")
        lines.append(f"journal: {self.journal}\n")
        lines.append(f"medline id: {self.medline_id}\n")
        lines.append(f"pubmed id: {self.pubmed_id}\n")
        lines.append(f"comment: {self.comment}\n")
        return "".join(lines)

    def __repr__(self):
        """Represent the Reference object as a string for debugging."""
        # TODO - Update this if __init__ later accepts values
        return f"{self.__class__.__name__}(title={self.title!r}, ...)"

    def __eq__(self, other):
        """Check if two Reference objects should be considered equal.

        Comparing against anything that is not a Reference returns
        NotImplemented, so that ``==`` evaluates to False instead of
        raising an AttributeError.

        Note prior to Biopython 1.70 the location was not compared, as
        until then __eq__ for the SimpleLocation class was not defined.
        """
        if not isinstance(other, Reference):
            return NotImplemented
        return (
            self.authors == other.authors
            and self.consrtm == other.consrtm
            and self.title == other.title
            and self.journal == other.journal
            and self.medline_id == other.medline_id
            and self.pubmed_id == other.pubmed_id
            and self.comment == other.comment
            and self.location == other.location
        )
jpayne@68 702
jpayne@68 703
jpayne@68 704 # --- Handling feature locations
jpayne@68 705
jpayne@68 706
class Location(ABC):
    """Abstract base class representing a location."""

    @abstractmethod
    def __repr__(self):
        """Represent the Location object as a string for debugging."""
        return f"{self.__class__.__name__}(...)"

    @staticmethod
    def fromstring(text, length=None, circular=False, stranded=True):
        """Create a Location object from a string.

        This should accept any valid location string in the INSDC Feature Table
        format (https://www.insdc.org/submitting-standards/feature-table/) as
        used in GenBank, DDBJ and EMBL files.

        Arguments:
         - text - the location string to parse.
         - length - expected sequence length, forwarded to
           SimpleLocation.fromstring for every part.
         - circular - forwarded to SimpleLocation.fromstring; a true value
           allows locations that wrap around the origin.
         - stranded - when true (default) a location without an outer
           ``complement(...)`` gets strand +1, otherwise strand None.

        Returns the single parsed location when the text describes one
        region, otherwise a CompoundLocation built with the join/order/bond
        operator found in the text.

        Raises ValueError if ``complement(`` is not closed, and
        LocationParserError for a complement nested inside an outer
        complement.

        Simple examples:

        >>> Location.fromstring("123..456", 1000)
        SimpleLocation(ExactPosition(122), ExactPosition(456), strand=1)
        >>> Location.fromstring("complement(<123..>456)", 1000)
        SimpleLocation(BeforePosition(122), AfterPosition(456), strand=-1)

        A more complex location using within positions,

        >>> Location.fromstring("(9.10)..(20.25)", 1000)
        SimpleLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)

        Notice how that will act as though it has overall start 8 and end 25.

        Zero length between feature,

        >>> Location.fromstring("123^124", 1000)
        SimpleLocation(ExactPosition(123), ExactPosition(123), strand=1)

        The expected sequence length is needed for a special case, a between
        position at the start/end of a circular genome:

        >>> Location.fromstring("1000^1", 1000)
        SimpleLocation(ExactPosition(1000), ExactPosition(1000), strand=1)

        Apart from this special case, between positions P^Q must have P+1==Q,

        >>> Location.fromstring("123^456", 1000)
        Traceback (most recent call last):
           ...
        Bio.SeqFeature.LocationParserError: invalid feature location '123^456'

        You can optionally provide a reference name:

        >>> Location.fromstring("AL391218.9:105173..108462", 2000000)
        SimpleLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')

        >>> Location.fromstring("<2644..159", 2868, "circular")
        CompoundLocation([SimpleLocation(BeforePosition(2643), ExactPosition(2868), strand=1), SimpleLocation(ExactPosition(0), ExactPosition(159), strand=1)], 'join')
        """
        if text.startswith("complement("):
            if text[-1] != ")":
                raise ValueError(f"closing bracket missing in '{text}'")
            text = text[11:-1]
            strand = -1
        elif stranded:
            strand = 1
        else:
            strand = None

        # Determine if we have a simple location or a compound location.
        for operator in ("join", "order", "bond"):
            prefix = operator + "("
            if text.startswith(prefix):
                # _split yields separators and parts alternately; keep the parts.
                parts = _split(text[len(prefix) : -1])[1::2]
                break
        else:
            loc = SimpleLocation.fromstring(text, length, circular)
            loc.strand = strand
            if strand == -1:
                # Only has an effect when origin wrapping produced several parts.
                loc.parts.reverse()
            return loc

        locs = []
        for part in parts:
            loc = SimpleLocation.fromstring(part, length, circular)
            if loc is None:
                break
            if loc.strand == -1:
                if strand == -1:
                    raise LocationParserError(f"double complement in '{text}'?")
            else:
                loc.strand = strand
            locs.extend(loc.parts)
        else:
            if len(locs) == 1:
                return loc
            # Historically a join on the reverse strand has been represented
            # in Biopython with both the parent SeqFeature and its children
            # (the exons for a CDS) all given a strand of -1.  Likewise, for
            # a join feature on the forward strand they all have strand +1.
            # However, we must also consider evil mixed strand examples like
            # this, join(complement(69611..69724),139856..140087,140625..140650)
            if strand == -1:
                # Whole thing was wrapped in complement(...)
                for loc in locs:
                    assert loc.strand == -1
                # Reverse the backwards order used in GenBank files
                # with complement(join(...))
                locs = locs[::-1]
            return CompoundLocation(locs, operator=operator)

        # Not recognized.  Only reached when a part parsed to None above.
        if "order" in text and "join" in text:
            # See Bug 3197
            raise LocationParserError(
                f"failed to parse feature location '{text}' containing a combination of 'join' and 'order' (nested operators) are illegal"
            )

        # See issue #937. Note that NCBI has already fixed this record.
        if ",)" in text:
            warnings.warn(
                "Dropping trailing comma in malformed feature location",
                BiopythonParserWarning,
            )
            text = text.replace(",)", ")")
            # Retry with the caller's settings rather than the defaults.
            # NOTE(review): an outer complement(...) stripped above is not
            # re-applied on this retry - confirm whether that matters.
            return Location.fromstring(text, length, circular, stranded)

        raise LocationParserError(f"failed to parse feature location '{text}'")
jpayne@68 836
jpayne@68 837
jpayne@68 838 class SimpleLocation(Location):
jpayne@68 839 """Specify the location of a feature along a sequence.
jpayne@68 840
jpayne@68 841 The SimpleLocation is used for simple continuous features, which can
jpayne@68 842 be described as running from a start position to an end position
jpayne@68 843 (optionally with a strand and reference information). More complex
jpayne@68 844 locations made up from several non-continuous parts (e.g. a coding
jpayne@68 845 sequence made up of several exons) are described using a SeqFeature
jpayne@68 846 with a CompoundLocation.
jpayne@68 847
jpayne@68 848 Note that the start and end location numbering follow Python's scheme,
jpayne@68 849 thus a GenBank entry of 123..150 (one based counting) becomes a location
jpayne@68 850 of [122:150] (zero based counting).
jpayne@68 851
jpayne@68 852 >>> from Bio.SeqFeature import SimpleLocation
jpayne@68 853 >>> f = SimpleLocation(122, 150)
jpayne@68 854 >>> print(f)
jpayne@68 855 [122:150]
jpayne@68 856 >>> print(f.start)
jpayne@68 857 122
jpayne@68 858 >>> print(f.end)
jpayne@68 859 150
jpayne@68 860 >>> print(f.strand)
jpayne@68 861 None
jpayne@68 862
jpayne@68 863 Note the strand defaults to None. If you are working with nucleotide
jpayne@68 864 sequences you'd want to be explicit if it is the forward strand:
jpayne@68 865
jpayne@68 866 >>> from Bio.SeqFeature import SimpleLocation
jpayne@68 867 >>> f = SimpleLocation(122, 150, strand=+1)
jpayne@68 868 >>> print(f)
jpayne@68 869 [122:150](+)
jpayne@68 870 >>> print(f.strand)
jpayne@68 871 1
jpayne@68 872
jpayne@68 873 Note that for a parent sequence of length n, the SimpleLocation
jpayne@68 874 start and end must satisfy the inequality 0 <= start <= end <= n.
jpayne@68 875 This means even for features on the reverse strand of a nucleotide
jpayne@68 876 sequence, we expect the 'start' coordinate to be less than the
jpayne@68 877 'end'.
jpayne@68 878
jpayne@68 879 >>> from Bio.SeqFeature import SimpleLocation
jpayne@68 880 >>> r = SimpleLocation(122, 150, strand=-1)
jpayne@68 881 >>> print(r)
jpayne@68 882 [122:150](-)
jpayne@68 883 >>> print(r.start)
jpayne@68 884 122
jpayne@68 885 >>> print(r.end)
jpayne@68 886 150
jpayne@68 887 >>> print(r.strand)
jpayne@68 888 -1
jpayne@68 889
jpayne@68 890 i.e. Rather than thinking of the 'start' and 'end' biologically in a
jpayne@68 891 strand aware manner, think of them as the 'left most' or 'minimum'
jpayne@68 892 boundary, and the 'right most' or 'maximum' boundary of the region
jpayne@68 893 being described. This is particularly important with compound
jpayne@68 894 locations describing non-continuous regions.
jpayne@68 895
jpayne@68 896 In the example above we have used standard exact positions, but there
jpayne@68 897 are also specialised position objects used to represent fuzzy positions
jpayne@68 898 as well, for example a GenBank location like complement(<123..150)
jpayne@68 899 would use a BeforePosition object for the start.
jpayne@68 900 """
jpayne@68 901
def __init__(self, start, end, strand=None, ref=None, ref_db=None):
    """Initialize the class.

    The start and end arguments mark where the feature begins and ends.
    Each may be any ``*Position`` object inheriting from Position, or a
    plain integer. Integers are taken to be exact and are wrapped in an
    ExactPosition, which keeps the common non-fuzzy case short.

    i.e. Short form:

    >>> from Bio.SeqFeature import SimpleLocation
    >>> loc = SimpleLocation(5, 10, strand=-1)
    >>> print(loc)
    [5:10](-)

    Explicit form:

    >>> from Bio.SeqFeature import SimpleLocation, ExactPosition
    >>> loc = SimpleLocation(ExactPosition(5), ExactPosition(10), strand=-1)
    >>> print(loc)
    [5:10](-)

    Other fuzzy positions are used similarly,

    >>> from Bio.SeqFeature import SimpleLocation
    >>> from Bio.SeqFeature import BeforePosition, AfterPosition
    >>> loc2 = SimpleLocation(BeforePosition(5), AfterPosition(10), strand=-1)
    >>> print(loc2)
    [<5:>10](-)

    For nucleotide features you will also want to specify the strand,
    use 1 for the forward (plus) strand, -1 for the reverse (negative)
    strand, 0 for stranded but strand unknown (? in GFF3), or None for
    when the strand does not apply (dot in GFF3), e.g. features on
    proteins.

    >>> loc = SimpleLocation(5, 10, strand=+1)
    >>> print(loc)
    [5:10](+)
    >>> print(loc.strand)
    1

    Normally feature locations are given relative to the parent
    sequence you are working with, but an explicit accession can
    be given with the optional ref and ref_db strings:

    >>> loc = SimpleLocation(105172, 108462, ref="AL391218.9", strand=1)
    >>> print(loc)
    AL391218.9[105172:108462](+)
    >>> print(loc.ref)
    AL391218.9

    Raises TypeError if start or end is neither a Position nor an int,
    and ValueError if both are integer-like and start exceeds end.
    """
    # TODO - Check 0 <= start <= end (<= length of reference)
    if not isinstance(start, (Position, int)):
        raise TypeError(f"start={start!r} {type(start)}")
    self._start = start if isinstance(start, Position) else ExactPosition(start)

    if not isinstance(end, (Position, int)):
        raise TypeError(f"end={end!r} {type(end)}")
    self._end = end if isinstance(end, Position) else ExactPosition(end)

    # Only positions that are also ints can be ordered against each other.
    lower, upper = self._start, self._end
    both_integral = isinstance(lower, int) and isinstance(upper, int)
    if both_integral and lower > upper:
        raise ValueError(
            f"End location ({upper}) must be greater than "
            f"or equal to start location ({lower})"
        )

    # Assigning strand goes through the validating property setter.
    self.strand = strand
    self.ref = ref
    self.ref_db = ref_db
jpayne@68 982
@staticmethod
def fromstring(text, length=None, circular=False):
    """Create a SimpleLocation object from a string.

    Arguments:
     - text - a single location, optionally wrapped in ``complement(...)``
       and optionally prefixed with ``reference:``.
     - length - sequence length; used for the ``N^1`` between position
       and as the end of the first piece of an origin-spanning location.
     - circular - a true value allows ``start..end`` with start >= end to
       be split into two pieces around the origin.

    Returns a SimpleLocation, or the sum of two SimpleLocation objects
    for an origin-spanning location. The strand is -1 for a complement
    and None otherwise.

    Raises LocationParserError if the text cannot be parsed.
    """
    is_complement = text.startswith("complement(")
    if is_complement:
        text = text[11:-1]
    strand = -1 if is_complement else None

    # Try simple cases first for speed: plain "123..456".
    try:
        first, last = text.split("..")
        first = int(first) - 1
        last = int(last)
    except ValueError:
        pass
    else:
        if 0 <= first < last:
            return SimpleLocation(first, last, strand)

    # Try general case, peeling off an optional "reference:" prefix.
    try:
        ref, text = text.split(":")
    except ValueError:
        ref = None
    matched = _re_location_category.match(text)
    if matched is None:
        raise LocationParserError(f"Could not parse feature location '{text}'")
    # The first named group that matched tells us the location category.
    for category, captured in matched.groupdict().items():
        if captured is not None:
            break
    assert captured == text

    if category in ("bond", "solo"):
        if category == "bond":
            # e.g. bond(196)
            warnings.warn(
                "Dropping bond qualifier in feature location",
                BiopythonParserWarning,
            )
            text = text[5:-1]
        # e.g. "123" - the same text gives both ends.
        s_pos = Position.fromstring(text, -1)
        e_pos = Position.fromstring(text)
    elif category in ("pair", "within", "oneof"):
        left, right = text.split("..")
        s_pos = Position.fromstring(left, -1)
        e_pos = Position.fromstring(right)
        if s_pos >= e_pos:
            # There is likely a problem with origin wrapping.
            # Create a CompoundLocation of the wrapped feature,
            # consisting of two SimpleLocation objects to extend to
            # the list of feature locations.
            if not circular:
                raise LocationParserError(
                    f"it appears that '{text}' is a feature that spans the origin, but the sequence topology is undefined"
                )
            warnings.warn(
                "Attempting to fix invalid location %r as "
                "it looks like incorrect origin wrapping. "
                "Please fix input file, this could have "
                "unintended behavior." % text,
                BiopythonParserWarning,
            )

            up_to_origin = SimpleLocation(s_pos, length, strand)
            from_origin = SimpleLocation(0, e_pos, strand)

            # For complementary features spanning the origin the
            # pieces are combined in the opposite order.
            if strand == -1:
                return from_origin + up_to_origin
            return up_to_origin + from_origin
    elif category == "between":
        # A between location like "67^68" (one based counting) is a
        # special case (note it has zero length). In python slice
        # notation this is 67:67, a zero length slice.  See Bug 2622
        # Further more, on a circular genome of length N you can have
        # a location N^1 meaning the junction at the origin. See Bug 3098.
        # NOTE - We can imagine between locations like "2^4", but this
        # is just "3".  Similarly, "2^5" is just "3..4"
        left, right = text.split("^")
        left = int(left)
        right = int(right)
        adjacent = left + 1 == right
        at_origin = left == length and right == 1
        if not (adjacent or at_origin):
            raise LocationParserError(f"invalid feature location '{text}'")
        s_pos = ExactPosition(left)
        e_pos = s_pos

    if s_pos < 0:
        raise LocationParserError(
            f"negative starting position in feature location '{text}'"
        )
    return SimpleLocation(s_pos, e_pos, strand, ref=ref)
jpayne@68 1077
def _get_strand(self):
    """Return the stored strand value (PRIVATE getter for ``strand``)."""
    return self._strand

def _set_strand(self, value):
    """Validate and store the strand (PRIVATE setter for ``strand``).

    Raises ValueError unless value is +1, -1, 0 or None.
    """
    allowed = (+1, -1, 0, None)
    if value in allowed:
        self._strand = value
        return
    raise ValueError(f"Strand should be +1, -1, 0 or None, not {value!r}")

strand = property(
    _get_strand,
    _set_strand,
    doc="Strand of the location (+1, -1, 0 or None).",
)
jpayne@68 1093
jpayne@68 1094 def __str__(self):
jpayne@68 1095 """Return a representation of the SimpleLocation object (with python counting).
jpayne@68 1096
jpayne@68 1097 For the simple case this uses the python splicing syntax, [122:150]
jpayne@68 1098 (zero based counting) which GenBank would call 123..150 (one based
jpayne@68 1099 counting).
jpayne@68 1100 """
jpayne@68 1101 answer = f"[{self._start}:{self._end}]"
jpayne@68 1102 if self.ref and self.ref_db:
jpayne@68 1103 answer = f"{self.ref_db}:{self.ref}{answer}"
jpayne@68 1104 elif self.ref:
jpayne@68 1105 answer = self.ref + answer
jpayne@68 1106 # Is ref_db without ref meaningful?
jpayne@68 1107 if self.strand is None:
jpayne@68 1108 return answer
jpayne@68 1109 elif self.strand == +1:
jpayne@68 1110 return answer + "(+)"
jpayne@68 1111 elif self.strand == -1:
jpayne@68 1112 return answer + "(-)"
jpayne@68 1113 else:
jpayne@68 1114 # strand = 0, stranded but strand unknown, ? in GFF3
jpayne@68 1115 return answer + "(?)"
jpayne@68 1116
jpayne@68 1117 def __repr__(self):
jpayne@68 1118 """Represent the SimpleLocation object as a string for debugging."""
jpayne@68 1119 optional = ""
jpayne@68 1120 if self.strand is not None:
jpayne@68 1121 optional += f", strand={self.strand!r}"
jpayne@68 1122 if self.ref is not None:
jpayne@68 1123 optional += f", ref={self.ref!r}"
jpayne@68 1124 if self.ref_db is not None:
jpayne@68 1125 optional += f", ref_db={self.ref_db!r}"
jpayne@68 1126 return f"{self.__class__.__name__}({self.start!r}, {self.end!r}{optional})"
jpayne@68 1127
jpayne@68 1128 def __add__(self, other):
jpayne@68 1129 """Combine location with another SimpleLocation object, or shift it.
jpayne@68 1130
jpayne@68 1131 You can add two feature locations to make a join CompoundLocation:
jpayne@68 1132
jpayne@68 1133 >>> from Bio.SeqFeature import SimpleLocation
jpayne@68 1134 >>> f1 = SimpleLocation(5, 10)
jpayne@68 1135 >>> f2 = SimpleLocation(20, 30)
jpayne@68 1136 >>> combined = f1 + f2
jpayne@68 1137 >>> print(combined)
jpayne@68 1138 join{[5:10], [20:30]}
jpayne@68 1139
jpayne@68 1140 This is thus equivalent to:
jpayne@68 1141
jpayne@68 1142 >>> from Bio.SeqFeature import CompoundLocation
jpayne@68 1143 >>> join = CompoundLocation([f1, f2])
jpayne@68 1144 >>> print(join)
jpayne@68 1145 join{[5:10], [20:30]}
jpayne@68 1146
jpayne@68 1147 You can also use sum(...) in this way:
jpayne@68 1148
jpayne@68 1149 >>> join = sum([f1, f2])
jpayne@68 1150 >>> print(join)
jpayne@68 1151 join{[5:10], [20:30]}
jpayne@68 1152
jpayne@68 1153 Furthermore, you can combine a SimpleLocation with a CompoundLocation
jpayne@68 1154 in this way.
jpayne@68 1155
jpayne@68 1156 Separately, adding an integer will give a new SimpleLocation with
jpayne@68 1157 its start and end offset by that amount. For example:
jpayne@68 1158
jpayne@68 1159 >>> print(f1)
jpayne@68 1160 [5:10]
jpayne@68 1161 >>> print(f1 + 100)
jpayne@68 1162 [105:110]
jpayne@68 1163 >>> print(200 + f1)
jpayne@68 1164 [205:210]
jpayne@68 1165
jpayne@68 1166 This can be useful when editing annotation.
jpayne@68 1167 """
jpayne@68 1168 if isinstance(other, SimpleLocation):
jpayne@68 1169 return CompoundLocation([self, other])
jpayne@68 1170 elif isinstance(other, int):
jpayne@68 1171 return self._shift(other)
jpayne@68 1172 else:
jpayne@68 1173 # This will allow CompoundLocation's __radd__ to be called:
jpayne@68 1174 return NotImplemented
jpayne@68 1175
jpayne@68 1176 def __radd__(self, other):
jpayne@68 1177 """Return a SimpleLocation object by shifting the location by an integer amount."""
jpayne@68 1178 if isinstance(other, int):
jpayne@68 1179 return self._shift(other)
jpayne@68 1180 else:
jpayne@68 1181 return NotImplemented
jpayne@68 1182
jpayne@68 1183 def __sub__(self, other):
jpayne@68 1184 """Subtracting an integer will shift the start and end by that amount.
jpayne@68 1185
jpayne@68 1186 >>> from Bio.SeqFeature import SimpleLocation
jpayne@68 1187 >>> f1 = SimpleLocation(105, 150)
jpayne@68 1188 >>> print(f1)
jpayne@68 1189 [105:150]
jpayne@68 1190 >>> print(f1 - 100)
jpayne@68 1191 [5:50]
jpayne@68 1192
jpayne@68 1193 This can be useful when editing annotation. You can also add an integer
jpayne@68 1194 to a feature location (which shifts in the opposite direction).
jpayne@68 1195 """
jpayne@68 1196 if isinstance(other, int):
jpayne@68 1197 return self._shift(-other)
jpayne@68 1198 else:
jpayne@68 1199 return NotImplemented
jpayne@68 1200
jpayne@68 1201 def __nonzero__(self):
jpayne@68 1202 """Return True regardless of the length of the feature.
jpayne@68 1203
jpayne@68 1204 This behavior is for backwards compatibility, since until the
jpayne@68 1205 __len__ method was added, a SimpleLocation always evaluated as True.
jpayne@68 1206
jpayne@68 1207 Note that in comparison, Seq objects, strings, lists, etc, will all
jpayne@68 1208 evaluate to False if they have length zero.
jpayne@68 1209
jpayne@68 1210 WARNING: The SimpleLocation may in future evaluate to False when its
jpayne@68 1211 length is zero (in order to better match normal python behavior)!
jpayne@68 1212 """
jpayne@68 1213 return True
jpayne@68 1214
jpayne@68 1215 def __len__(self):
jpayne@68 1216 """Return the length of the region described by the SimpleLocation object.
jpayne@68 1217
jpayne@68 1218 Note that extra care may be needed for fuzzy locations, e.g.
jpayne@68 1219
jpayne@68 1220 >>> from Bio.SeqFeature import SimpleLocation
jpayne@68 1221 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
jpayne@68 1222 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
jpayne@68 1223 >>> len(loc)
jpayne@68 1224 5
jpayne@68 1225 """
jpayne@68 1226 return int(self._end) - int(self._start)
jpayne@68 1227
jpayne@68 1228 def __contains__(self, value):
jpayne@68 1229 """Check if an integer position is within the SimpleLocation object.
jpayne@68 1230
jpayne@68 1231 Note that extra care may be needed for fuzzy locations, e.g.
jpayne@68 1232
jpayne@68 1233 >>> from Bio.SeqFeature import SimpleLocation
jpayne@68 1234 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
jpayne@68 1235 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
jpayne@68 1236 >>> len(loc)
jpayne@68 1237 5
jpayne@68 1238 >>> [i for i in range(15) if i in loc]
jpayne@68 1239 [5, 6, 7, 8, 9]
jpayne@68 1240 """
jpayne@68 1241 if not isinstance(value, int):
jpayne@68 1242 raise ValueError(
jpayne@68 1243 "Currently we only support checking for integer "
jpayne@68 1244 "positions being within a SimpleLocation."
jpayne@68 1245 )
jpayne@68 1246 if value < self._start or value >= self._end:
jpayne@68 1247 return False
jpayne@68 1248 else:
jpayne@68 1249 return True
jpayne@68 1250
jpayne@68 1251 def __iter__(self):
jpayne@68 1252 """Iterate over the parent positions within the SimpleLocation object.
jpayne@68 1253
jpayne@68 1254 >>> from Bio.SeqFeature import SimpleLocation
jpayne@68 1255 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
jpayne@68 1256 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
jpayne@68 1257 >>> len(loc)
jpayne@68 1258 5
jpayne@68 1259 >>> for i in loc: print(i)
jpayne@68 1260 5
jpayne@68 1261 6
jpayne@68 1262 7
jpayne@68 1263 8
jpayne@68 1264 9
jpayne@68 1265 >>> list(loc)
jpayne@68 1266 [5, 6, 7, 8, 9]
jpayne@68 1267 >>> [i for i in range(15) if i in loc]
jpayne@68 1268 [5, 6, 7, 8, 9]
jpayne@68 1269
jpayne@68 1270 Note this is strand aware:
jpayne@68 1271
jpayne@68 1272 >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10), strand = -1)
jpayne@68 1273 >>> list(loc)
jpayne@68 1274 [9, 8, 7, 6, 5]
jpayne@68 1275 """
jpayne@68 1276 if self.strand == -1:
jpayne@68 1277 yield from range(self._end - 1, self._start - 1, -1)
jpayne@68 1278 else:
jpayne@68 1279 yield from range(self._start, self._end)
jpayne@68 1280
jpayne@68 1281 def __eq__(self, other):
jpayne@68 1282 """Implement equality by comparing all the location attributes."""
jpayne@68 1283 if not isinstance(other, SimpleLocation):
jpayne@68 1284 return False
jpayne@68 1285 return (
jpayne@68 1286 self._start == other.start
jpayne@68 1287 and self._end == other.end
jpayne@68 1288 and self._strand == other.strand
jpayne@68 1289 and self.ref == other.ref
jpayne@68 1290 and self.ref_db == other.ref_db
jpayne@68 1291 )
jpayne@68 1292
jpayne@68 1293 def _shift(self, offset):
jpayne@68 1294 """Return a copy of the SimpleLocation shifted by an offset (PRIVATE).
jpayne@68 1295
jpayne@68 1296 Returns self when location is relative to an external reference.
jpayne@68 1297 """
jpayne@68 1298 # TODO - What if offset is a fuzzy position?
jpayne@68 1299 if self.ref or self.ref_db:
jpayne@68 1300 return self
jpayne@68 1301 return SimpleLocation(
jpayne@68 1302 start=self._start + offset,
jpayne@68 1303 end=self._end + offset,
jpayne@68 1304 strand=self.strand,
jpayne@68 1305 )
jpayne@68 1306
jpayne@68 1307 def _flip(self, length):
jpayne@68 1308 """Return a copy of the location after the parent is reversed (PRIVATE).
jpayne@68 1309
jpayne@68 1310 Returns self when location is relative to an external reference.
jpayne@68 1311 """
jpayne@68 1312 if self.ref or self.ref_db:
jpayne@68 1313 return self
jpayne@68 1314 # Note this will flip the start and end too!
jpayne@68 1315 if self.strand == +1:
jpayne@68 1316 flip_strand = -1
jpayne@68 1317 elif self.strand == -1:
jpayne@68 1318 flip_strand = +1
jpayne@68 1319 else:
jpayne@68 1320 # 0 or None
jpayne@68 1321 flip_strand = self.strand
jpayne@68 1322 return SimpleLocation(
jpayne@68 1323 start=self._end._flip(length),
jpayne@68 1324 end=self._start._flip(length),
jpayne@68 1325 strand=flip_strand,
jpayne@68 1326 )
jpayne@68 1327
jpayne@68 1328 @property
jpayne@68 1329 def parts(self):
jpayne@68 1330 """Read only list of sections (always one, the SimpleLocation object).
jpayne@68 1331
jpayne@68 1332 This is a convenience property allowing you to write code handling
jpayne@68 1333 both SimpleLocation objects (with one part) and more complex
jpayne@68 1334 CompoundLocation objects (with multiple parts) interchangeably.
jpayne@68 1335 """
jpayne@68 1336 return [self]
jpayne@68 1337
jpayne@68 1338 @property
jpayne@68 1339 def start(self):
jpayne@68 1340 """Start location - left most (minimum) value, regardless of strand.
jpayne@68 1341
jpayne@68 1342 Read only, returns an integer like position object, possibly a fuzzy
jpayne@68 1343 position.
jpayne@68 1344 """
jpayne@68 1345 return self._start
jpayne@68 1346
jpayne@68 1347 @property
jpayne@68 1348 def end(self):
jpayne@68 1349 """End location - right most (maximum) value, regardless of strand.
jpayne@68 1350
jpayne@68 1351 Read only, returns an integer like position object, possibly a fuzzy
jpayne@68 1352 position.
jpayne@68 1353 """
jpayne@68 1354 return self._end
jpayne@68 1355
def extract(self, parent_sequence, references=None):
    """Extract the sequence from supplied parent sequence using the SimpleLocation object.

    The parent_sequence can be a Seq like object or a string, and will
    generally return an object of the same type. The exception to this is
    a MutableSeq as the parent sequence will return a Seq object.
    If the location refers to other records, they must be supplied
    in the optional dictionary references.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqFeature import SimpleLocation
    >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
    >>> feature_loc = SimpleLocation(8, 15)
    >>> feature_loc.extract(seq)
    Seq('VALIVIC')

    Raises ValueError if the location has a reference but references is
    empty or does not contain it.
    """
    if self.ref or self.ref_db:
        # The location points at another record; look it up by ref.
        if not references:
            raise ValueError(
                f"Feature references another sequence ({self.ref}),"
                " references mandatory"
            )
        if self.ref not in references:
            # KeyError?
            raise ValueError(
                f"Feature references another sequence ({self.ref}),"
                " not found in references"
            )
        parent_sequence = references[self.ref]
    window = slice(int(self.start), int(self.end))
    f_seq = parent_sequence[window]
    if isinstance(f_seq, MutableSeq):
        f_seq = Seq(f_seq)
    return reverse_complement(f_seq) if self.strand == -1 else f_seq
jpayne@68 1392
jpayne@68 1393
jpayne@68 1394 FeatureLocation = SimpleLocation # OBSOLETE; for backward compatibility only.
jpayne@68 1395
jpayne@68 1396
jpayne@68 1397 class CompoundLocation(Location):
jpayne@68 1398 """For handling joins etc where a feature location has several parts."""
jpayne@68 1399
jpayne@68 1400 def __init__(self, parts, operator="join"):
jpayne@68 1401 """Initialize the class.
jpayne@68 1402
jpayne@68 1403 >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
jpayne@68 1404 >>> f1 = SimpleLocation(10, 40, strand=+1)
jpayne@68 1405 >>> f2 = SimpleLocation(50, 59, strand=+1)
jpayne@68 1406 >>> f = CompoundLocation([f1, f2])
jpayne@68 1407 >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
jpayne@68 1408 True
jpayne@68 1409 >>> print(f.operator)
jpayne@68 1410 join
jpayne@68 1411 >>> 5 in f
jpayne@68 1412 False
jpayne@68 1413 >>> 15 in f
jpayne@68 1414 True
jpayne@68 1415 >>> f.strand
jpayne@68 1416 1
jpayne@68 1417
jpayne@68 1418 Notice that the strand of the compound location is computed
jpayne@68 1419 automatically - in the case of mixed strands on the sub-locations
jpayne@68 1420 the overall strand is set to None.
jpayne@68 1421
jpayne@68 1422 >>> f = CompoundLocation([SimpleLocation(3, 6, strand=+1),
jpayne@68 1423 ... SimpleLocation(10, 13, strand=-1)])
jpayne@68 1424 >>> print(f.strand)
jpayne@68 1425 None
jpayne@68 1426 >>> len(f)
jpayne@68 1427 6
jpayne@68 1428 >>> list(f)
jpayne@68 1429 [3, 4, 5, 12, 11, 10]
jpayne@68 1430
jpayne@68 1431 The example above doing list(f) iterates over the coordinates within the
jpayne@68 1432 feature. This allows you to use max and min on the location, to find the
jpayne@68 1433 range covered:
jpayne@68 1434
jpayne@68 1435 >>> min(f)
jpayne@68 1436 3
jpayne@68 1437 >>> max(f)
jpayne@68 1438 12
jpayne@68 1439
jpayne@68 1440 More generally, you can use the compound location's start and end which
jpayne@68 1441 give the full span covered, 0 <= start <= end <= full sequence length.
jpayne@68 1442
jpayne@68 1443 >>> f.start == min(f)
jpayne@68 1444 True
jpayne@68 1445 >>> f.end == max(f) + 1
jpayne@68 1446 True
jpayne@68 1447
jpayne@68 1448 This is consistent with the behavior of the SimpleLocation for a single
jpayne@68 1449 region, where again the 'start' and 'end' do not necessarily give the
jpayne@68 1450 biological start and end, but rather the 'minimal' and 'maximal'
jpayne@68 1451 coordinate boundaries.
jpayne@68 1452
jpayne@68 1453 Note that adding locations provides a more intuitive method of
jpayne@68 1454 construction:
jpayne@68 1455
jpayne@68 1456 >>> f = SimpleLocation(3, 6, strand=+1) + SimpleLocation(10, 13, strand=-1)
jpayne@68 1457 >>> len(f)
jpayne@68 1458 6
jpayne@68 1459 >>> list(f)
jpayne@68 1460 [3, 4, 5, 12, 11, 10]
jpayne@68 1461 """
jpayne@68 1462 self.operator = operator
jpayne@68 1463 self.parts = list(parts)
jpayne@68 1464 for loc in self.parts:
jpayne@68 1465 if not isinstance(loc, SimpleLocation):
jpayne@68 1466 raise ValueError(
jpayne@68 1467 "CompoundLocation should be given a list of "
jpayne@68 1468 "SimpleLocation objects, not %s" % loc.__class__
jpayne@68 1469 )
jpayne@68 1470 if len(parts) < 2:
jpayne@68 1471 raise ValueError(
jpayne@68 1472 f"CompoundLocation should have at least 2 parts, not {parts!r}"
jpayne@68 1473 )
jpayne@68 1474
jpayne@68 1475 def __str__(self):
jpayne@68 1476 """Return a representation of the CompoundLocation object (with python counting)."""
jpayne@68 1477 return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts))
jpayne@68 1478
jpayne@68 1479 def __repr__(self):
jpayne@68 1480 """Represent the CompoundLocation object as string for debugging."""
jpayne@68 1481 return f"{self.__class__.__name__}({self.parts!r}, {self.operator!r})"
jpayne@68 1482
def _get_strand(self):
    """Get function for the strand property (PRIVATE).

    Returns the strand shared by all the parts, or None if the parts do
    not all report the same strand.
    """
    # Historically a join on the reverse strand has been represented
    # in Biopython with both the parent SeqFeature and its children
    # (the exons for a CDS) all given a strand of -1. Likewise, for
    # a join feature on the forward strand they all have strand +1.
    # However, we must also consider evil mixed strand examples like
    # this, join(complement(69611..69724),139856..140087,140625..140650)
    distinct_strands = {part.strand for part in self.parts}
    if len(distinct_strands) != 1:
        return None  # i.e. mixed strands
    return self.parts[0].strand
jpayne@68 1495
def _set_strand(self, value):
    """Set function for the strand property (PRIVATE).

    The given value is written to the strand of every part.
    """
    # Should this be allowed/encouraged?
    for part in self.parts:
        setattr(part, "strand", value)
jpayne@68 1501
# Public read/write view over the per-part strands, built from the two
# private accessor functions defined just above.
strand = property(
    _get_strand,
    _set_strand,
    doc="""Overall strand of the compound location.

    If all the parts have the same strand, that is returned. Otherwise
    for mixed strands, this returns None.

    >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
    >>> f1 = SimpleLocation(15, 17, strand=1)
    >>> f2 = SimpleLocation(20, 30, strand=-1)
    >>> f = f1 + f2
    >>> f1.strand
    1
    >>> f2.strand
    -1
    >>> f.strand
    >>> f.strand is None
    True

    If you set the strand of a CompoundLocation, this is applied to
    all the parts - use with caution:

    >>> f.strand = 1
    >>> f1.strand
    1
    >>> f2.strand
    1
    >>> f.strand
    1

    """,
)
jpayne@68 1535
def __add__(self, other):
    """Combine locations, or shift the location by an integer offset.

    >>> from Bio.SeqFeature import SimpleLocation
    >>> f1 = SimpleLocation(15, 17) + SimpleLocation(20, 30)
    >>> print(f1)
    join{[15:17], [20:30]}

    A further SimpleLocation can be appended (or prepended):

    >>> print(f1 + SimpleLocation(40, 50))
    join{[15:17], [20:30], [40:50]}
    >>> print(SimpleLocation(5, 10) + f1)
    join{[5:10], [15:17], [20:30]}

    Two CompoundLocation objects using the same operator can be merged:

    >>> f2 = SimpleLocation(40, 50) + SimpleLocation(60, 70)
    >>> print(f2)
    join{[40:50], [60:70]}
    >>> print(f1 + f2)
    join{[15:17], [20:30], [40:50], [60:70]}

    As with the SimpleLocation, adding an integer instead shifts the
    coordinates of every part by that offset:

    >>> print(f1 + 100)
    join{[115:117], [120:130]}
    >>> print(200 + f1)
    join{[215:217], [220:230]}
    >>> print(f1 + (-5))
    join{[10:12], [15:25]}

    Raises ValueError when two compound locations with different
    operators are added, and NotImplementedError for any other type.
    """
    if isinstance(other, SimpleLocation):
        extended = self.parts + [other]
        return CompoundLocation(extended, self.operator)
    if isinstance(other, CompoundLocation):
        if self.operator == other.operator:
            merged = self.parts + other.parts
            return CompoundLocation(merged, self.operator)
        # Handle join+order -> order as a special case?
        raise ValueError(f"Mixed operators {self.operator} and {other.operator}")
    if isinstance(other, int):
        return self._shift(other)
    raise NotImplementedError
jpayne@68 1582
def __radd__(self, other):
    """Add a feature to the left.

    A SimpleLocation is prepended to the parts, an integer shifts the
    whole location, and anything else raises NotImplementedError.
    """
    if isinstance(other, SimpleLocation):
        prefixed = [other] + self.parts
        return CompoundLocation(prefixed, self.operator)
    if isinstance(other, int):
        return self._shift(other)
    raise NotImplementedError
jpayne@68 1591
def __contains__(self, value):
    """Check if an integer position is within the CompoundLocation object.

    The value is tested against each part in turn, stopping at the first hit.
    """
    return any(value in part for part in self.parts)
jpayne@68 1598
def __nonzero__(self):
    """Return True regardless of the length of the feature.

    This behavior is for backwards compatibility, since until the
    __len__ method was added, a location always evaluated as True.

    Note that in comparison, Seq objects, strings, lists, etc, will all
    evaluate to False if they have length zero.

    WARNING: The location may in future evaluate to False when its
    length is zero (in order to better match normal python behavior)!
    """
    return True

# Python 3 uses __bool__ for truth testing and ignores the Python 2 name
# __nonzero__, silently falling back to __len__. Alias it so the documented
# "always True" behavior really applies; __nonzero__ stays for old callers.
__bool__ = __nonzero__
jpayne@68 1612
def __len__(self):
    """Return the length of the CompoundLocation object (sum of its part lengths)."""
    total = 0
    for part in self.parts:
        total += len(part)
    return total
jpayne@68 1616
jpayne@68 1617 def __iter__(self):
jpayne@68 1618 """Iterate over the parent positions within the CompoundLocation object."""
jpayne@68 1619 for loc in self.parts:
jpayne@68 1620 yield from loc
jpayne@68 1621
jpayne@68 1622 def __eq__(self, other):
jpayne@68 1623 """Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation."""
jpayne@68 1624 if not isinstance(other, CompoundLocation):
jpayne@68 1625 return False
jpayne@68 1626 if len(self.parts) != len(other.parts):
jpayne@68 1627 return False
jpayne@68 1628 if self.operator != other.operator:
jpayne@68 1629 return False
jpayne@68 1630 for self_part, other_part in zip(self.parts, other.parts):
jpayne@68 1631 if self_part != other_part:
jpayne@68 1632 return False
jpayne@68 1633 return True
jpayne@68 1634
def _shift(self, offset):
    """Return a copy of the CompoundLocation shifted by an offset (PRIVATE).

    Every part is shifted individually; the operator is carried over.
    """
    shifted_parts = []
    for part in self.parts:
        shifted_parts.append(part._shift(offset))
    return CompoundLocation(shifted_parts, self.operator)
jpayne@68 1640
def _flip(self, length):
    """Return a copy of the locations after the parent is reversed (PRIVATE).

    Note that the order of the parts is NOT reversed too. Consider a CDS
    on the forward strand with exons small, medium and large (in length).
    Once we change the frame of reference to the reverse complement strand,
    the start codon is still part of the small exon, and the stop codon
    still part of the large exon - so the part order remains the same!

    Here is an artificial example, where the features map to the two upper
    case regions and the lower case runs of n are not used:

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqFeature import SimpleLocation
    >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
    >>> small = SimpleLocation(5, 20, strand=1)
    >>> large = SimpleLocation(28, 52, strand=1)
    >>> location = small + large
    >>> print(small)
    [5:20](+)
    >>> print(large)
    [28:52](+)
    >>> print(location)
    join{[5:20](+), [28:52](+)}
    >>> for part in location.parts:
    ...     print(len(part))
    ...
    15
    24

    As you can see, this is a silly example where each "exon" is a word:

    >>> print(small.extract(dna).translate())
    SILLY
    >>> print(large.extract(dna).translate())
    EXAMPLE*
    >>> print(location.extract(dna).translate())
    SILLYEXAMPLE*
    >>> for part in location.parts:
    ...     print(part.extract(dna).translate())
    ...
    SILLY
    EXAMPLE*

    Now, let's look at this from the reverse strand frame of reference:

    >>> flipped_dna = dna.reverse_complement()
    >>> flipped_location = location._flip(len(dna))
    >>> print(flipped_location.extract(flipped_dna).translate())
    SILLYEXAMPLE*
    >>> for part in flipped_location.parts:
    ...     print(part.extract(flipped_dna).translate())
    ...
    SILLY
    EXAMPLE*

    The key point here is the first part of the CompoundLocation is still the
    small exon, while the second part is still the large exon:

    >>> for part in flipped_location.parts:
    ...     print(len(part))
    ...
    15
    24
    >>> print(flipped_location)
    join{[37:52](-), [5:29](-)}

    Notice the parts are not reversed. However, there was a bug here in older
    versions of Biopython which would have given join{[5:29](-), [37:52](-)}
    and the translation would have wrongly been "EXAMPLE*SILLY" instead.

    """
    # Flip each part on its own, deliberately keeping the original part order.
    flipped_parts = []
    for part in self.parts:
        flipped_parts.append(part._flip(length))
    return CompoundLocation(flipped_parts, self.operator)
jpayne@68 1716
jpayne@68 1717 @property
jpayne@68 1718 def start(self):
jpayne@68 1719 """Start location - left most (minimum) value, regardless of strand.
jpayne@68 1720
jpayne@68 1721 Read only, returns an integer like position object, possibly a fuzzy
jpayne@68 1722 position.
jpayne@68 1723
jpayne@68 1724 For the special case of a CompoundLocation wrapping the origin of a
jpayne@68 1725 circular genome, this will return zero.
jpayne@68 1726 """
jpayne@68 1727 return min(loc.start for loc in self.parts)
jpayne@68 1728
jpayne@68 1729 @property
jpayne@68 1730 def end(self):
jpayne@68 1731 """End location - right most (maximum) value, regardless of strand.
jpayne@68 1732
jpayne@68 1733 Read only, returns an integer like position object, possibly a fuzzy
jpayne@68 1734 position.
jpayne@68 1735
jpayne@68 1736 For the special case of a CompoundLocation wrapping the origin of
jpayne@68 1737 a circular genome this will match the genome length.
jpayne@68 1738 """
jpayne@68 1739 return max(loc.end for loc in self.parts)
jpayne@68 1740
jpayne@68 1741 @property
jpayne@68 1742 def ref(self):
jpayne@68 1743 """Not present in CompoundLocation, dummy method for API compatibility."""
jpayne@68 1744 return None
jpayne@68 1745
jpayne@68 1746 @property
jpayne@68 1747 def ref_db(self):
jpayne@68 1748 """Not present in CompoundLocation, dummy method for API compatibility."""
jpayne@68 1749 return None
jpayne@68 1750
jpayne@68 1751 def extract(self, parent_sequence, references=None):
jpayne@68 1752 """Extract the sequence from supplied parent sequence using the CompoundLocation object.
jpayne@68 1753
jpayne@68 1754 The parent_sequence can be a Seq like object or a string, and will
jpayne@68 1755 generally return an object of the same type. The exception to this is
jpayne@68 1756 a MutableSeq as the parent sequence will return a Seq object.
jpayne@68 1757 If the location refers to other records, they must be supplied
jpayne@68 1758 in the optional dictionary references.
jpayne@68 1759
jpayne@68 1760 >>> from Bio.Seq import Seq
jpayne@68 1761 >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
jpayne@68 1762 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
jpayne@68 1763 >>> fl1 = SimpleLocation(2, 8)
jpayne@68 1764 >>> fl2 = SimpleLocation(10, 15)
jpayne@68 1765 >>> fl3 = CompoundLocation([fl1,fl2])
jpayne@68 1766 >>> fl3.extract(seq)
jpayne@68 1767 Seq('QHKAMILIVIC')
jpayne@68 1768
jpayne@68 1769 """
jpayne@68 1770 # This copes with mixed strand features & all on reverse:
jpayne@68 1771 parts = [
jpayne@68 1772 loc.extract(parent_sequence, references=references) for loc in self.parts
jpayne@68 1773 ]
jpayne@68 1774 f_seq = functools.reduce(lambda x, y: x + y, parts)
jpayne@68 1775 return f_seq
jpayne@68 1776
jpayne@68 1777
class Position(ABC):
    """Abstract base class representing a position."""

    @abstractmethod
    def __repr__(self):
        """Represent the Position object as a string for debugging."""
        return "%s(...)" % type(self).__name__

    @staticmethod
    def fromstring(text, offset=0):
        """Build a Position object from the text string.

        For an end position, leave offset as zero (default):

        >>> Position.fromstring("5")
        ExactPosition(5)

        For a start position, set offset to minus one (for Python counting):

        >>> Position.fromstring("5", -1)
        ExactPosition(4)

        This also covers fuzzy positions:

        >>> p = Position.fromstring("<5")
        >>> p
        BeforePosition(5)
        >>> print(p)
        <5
        >>> int(p)
        5

        >>> Position.fromstring(">5")
        AfterPosition(5)

        By default assumes an end position, so note the integer behavior:

        >>> p = Position.fromstring("one-of(5,8,11)")
        >>> p
        OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
        >>> print(p)
        one-of(5,8,11)
        >>> int(p)
        11

        >>> Position.fromstring("(8.10)")
        WithinPosition(10, left=8, right=10)

        Fuzzy start positions:

        >>> p = Position.fromstring("<5", -1)
        >>> p
        BeforePosition(4)
        >>> print(p)
        <4
        >>> int(p)
        4

        Notice how the integer behavior changes too!

        >>> p = Position.fromstring("one-of(5,8,11)", -1)
        >>> p
        OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
        >>> print(p)
        one-of(4,7,10)
        >>> int(p)
        4

        """
        if offset not in (0, -1):
            raise ValueError(
                "To convert one-based indices to zero-based indices, offset must be either 0 (for end positions) or -1 (for start positions)."
            )
        # A lone question mark is a completely unknown position.
        if text == "?":
            return UnknownPosition()
        # Single character prefixes mark uncertain / before / after positions.
        if text.startswith("?"):
            return UncertainPosition(int(text[1:]) + offset)
        if text.startswith("<"):
            return BeforePosition(int(text[1:]) + offset)
        if text.startswith(">"):
            return AfterPosition(int(text[1:]) + offset)
        within = _re_within_position.match(text)
        if within is not None:
            left, right = (int(bound) + offset for bound in within.groups())
            # A start acts like the left bound, an end like the right bound.
            default = left if offset == -1 else right
            return WithinPosition(default, left=left, right=right)
        one_of = _re_oneof_position.match(text)
        if one_of is not None:
            choices = [
                ExactPosition(int(item) + offset)
                for item in one_of.groups()[0].split(",")
            ]
            # A start acts like the smallest choice, an end like the largest.
            pick = min if offset == -1 else max
            default = pick(int(choice) for choice in choices)
            return OneOfPosition(default, choices=choices)
        return ExactPosition(int(text) + offset)
jpayne@68 1879
jpayne@68 1880
class ExactPosition(int, Position):
    """Specify the specific position of a boundary.

    Arguments:
     - position - The position of the boundary.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.

    >>> p = ExactPosition(5)
    >>> p
    ExactPosition(5)
    >>> print(p)
    5

    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    Integer comparisons and operations should work as expected:

    >>> p == 5
    True
    >>> p < 6
    True
    >>> p <= 5
    True
    >>> p + 10
    ExactPosition(15)

    """

    def __new__(cls, position, extension=0):
        """Create an ExactPosition object."""
        if extension == 0:
            return int.__new__(cls, position)
        raise AttributeError(f"Non-zero extension {extension} for exact position.")

    # Must define this on Python 3.8 onwards because we redefine __repr__
    def __str__(self):
        """Return a representation of the ExactPosition object (with python counting)."""
        plain_value = int(self)
        return str(plain_value)

    def __repr__(self):
        """Represent the ExactPosition object as a string for debugging."""
        return f"{type(self).__name__}({int(self)})"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        # By default preserve any subclass
        same_kind = type(self)
        return same_kind(int(self) + offset)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # By default preserve any subclass
        same_kind = type(self)
        return same_kind(length - int(self))
jpayne@68 1940
jpayne@68 1941
class UncertainPosition(ExactPosition):
    """Specify a specific position which is uncertain.

    UniProt uses this, e.g. ?222 for uncertain position 222, and its XML
    format marks such positions explicitly as uncertain. It does not apply
    to GenBank/EMBL.
    """
jpayne@68 1948
jpayne@68 1949
class UnknownPosition(Position):
    """Specify a specific position which is unknown (has no position).

    This is used in UniProt, e.g. ? or in the XML as unknown.
    """

    def __repr__(self):
        """Represent the UnknownPosition object as a string for debugging."""
        return "%s()" % type(self).__name__

    def __hash__(self):
        """Return the hash value of the UnknownPosition object."""
        # There is no coordinate to hash, so every instance hashes like None.
        return hash(None)

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        # Shifting an unknown position changes nothing.
        return self

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # Reversing the parent changes nothing either.
        return self
jpayne@68 1971
jpayne@68 1972
class WithinPosition(int, Position):
    """Specify the position of a boundary within some coordinates.

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a location like ((11.14)..100). This
    indicates that the start of the sequence is somewhere between 11
    and 14. Since this is a start coordinate, it should act like
    it is at position 11 (or in Python counting, 10).

    >>> p = WithinPosition(10, 10, 13)
    >>> p
    WithinPosition(10, left=10, right=13)
    >>> print(p)
    (10.13)
    >>> int(p)
    10

    Basic integer comparisons and operations should work as though
    this were a plain integer:

    >>> p == 10
    True
    >>> p in [9, 10, 11]
    True
    >>> p < 11
    True
    >>> p + 10
    WithinPosition(20, left=20, right=23)

    >>> isinstance(p, WithinPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    Note this also applies for comparison to other position objects,
    where again the integer behavior is used:

    >>> p == 10
    True
    >>> p == ExactPosition(10)
    True
    >>> p == BeforePosition(10)
    True
    >>> p == AfterPosition(10)
    True

    If this were an end point, you would want the position to be 13
    (the right/larger value, not the left/smaller value as above):

    >>> p2 = WithinPosition(13, 10, 13)
    >>> p2
    WithinPosition(13, left=10, right=13)
    >>> print(p2)
    (10.13)
    >>> int(p2)
    13
    >>> p2 == 13
    True
    >>> p2 == ExactPosition(13)
    True

    """

    def __new__(cls, position, left, right):
        """Create a WithinPosition object."""
        # The integer value must coincide with one of the two bounds.
        if position != left and position != right:
            raise RuntimeError(
                "WithinPosition: %r should match left %r or "
                "right %r" % (position, left, right)
            )
        instance = int.__new__(cls, position)
        instance._left = left
        instance._right = right
        return instance

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Necessary to allow pickling and unpickling of class instances.
        """
        return int(self), self._left, self._right

    def __repr__(self):
        """Represent the WithinPosition object as a string for debugging."""
        fields = (type(self).__name__, int(self), self._left, self._right)
        return "%s(%i, left=%i, right=%i)" % fields

    def __str__(self):
        """Return a representation of the WithinPosition object (with python counting)."""
        return "(%s.%s)" % (self._left, self._right)

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted."""
        shifted = (int(self) + offset, self._left + offset, self._right + offset)
        return type(self)(*shifted)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # Left and right swap roles once the parent is reversed.
        mirrored = (length - int(self), length - self._right, length - self._left)
        return type(self)(*mirrored)
jpayne@68 2085
jpayne@68 2086
class BetweenPosition(int, Position):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like 123^456. This
    indicates that the start of the sequence is somewhere between
    123 and 456. It is up to the parser to set the position argument
    to either boundary point (depending on if this is being used as
    a start or end of the feature). For example as a feature end:

    >>> p = BetweenPosition(456, 123, 456)
    >>> p
    BetweenPosition(456, left=123, right=456)
    >>> print(p)
    (123^456)
    >>> int(p)
    456

    Integer equality and comparison use the given position,

    >>> p == 456
    True
    >>> p in [455, 456, 457]
    True
    >>> p > 300
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p2 = BetweenPosition(123, left=123, right=456)
    >>> int(p) == int(p2)
    False
    >>> p == 456
    True
    >>> p2 == 123
    True

    Note this potentially surprising behavior:

    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
    True

    i.e. For equality (and sorting) the position objects behave like
    integers.

    """

    def __new__(cls, position, left, right):
        """Create a new instance in BetweenPosition object.

        Raises AssertionError if position matches neither left nor right.
        """
        # Raised explicitly (rather than via an assert statement) so that the
        # check still runs under "python -O"; the exception type is unchanged.
        if not (position == left or position == right):
            raise AssertionError(
                "BetweenPosition: %r should match left %r or "
                "right %r" % (position, left, right)
            )
        # TODO - public API for getting left/right, especially the unknown one
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Necessary to allow pickling and unpickling of class instances.
        """
        return (int(self), self._left, self._right)

    def __repr__(self):
        """Represent the BetweenPosition object as a string for debugging."""
        return "%s(%i, left=%i, right=%i)" % (
            self.__class__.__name__,
            int(self),
            self._left,
            self._right,
        )

    def __str__(self):
        """Return a representation of the BetweenPosition object (with python counting)."""
        return f"({self._left}^{self._right})"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(
            int(self) + offset, self._left + offset, self._right + offset
        )

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # Left and right swap roles once the parent is reversed.
        return self.__class__(
            length - int(self), length - self._right, length - self._left
        )
jpayne@68 2186
jpayne@68 2187
class BeforePosition(int, Position):
    """Specify a position where the actual location occurs before it.

    Arguments:
     - position - The upper boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.

    >>> p = BeforePosition(5)
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5
    >>> p + 10
    BeforePosition(15)

    Note this potentially surprising behavior:

    >>> p == ExactPosition(5)
    True
    >>> p == AfterPosition(5)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create a new instance in BeforePosition object."""
        if extension == 0:
            return int.__new__(cls, position)
        raise AttributeError(f"Non-zero extension {extension} for exact position.")

    def __repr__(self):
        """Represent the location as a string for debugging."""
        return f"{type(self).__name__}({int(self)})"

    def __str__(self):
        """Return a representation of the BeforePosition object (with python counting)."""
        return "<" + str(int(self))

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        same_kind = type(self)
        return same_kind(int(self) + offset)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # "Before" on one strand is "after" when seen from the other strand.
        mirrored = length - int(self)
        return AfterPosition(mirrored)
jpayne@68 2243
jpayne@68 2244
class AfterPosition(int, Position):
    """Specify a position where the actual location is found after it.

    Arguments:
     - position - The lower boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.

    >>> p = AfterPosition(7)
    >>> p
    AfterPosition(7)
    >>> print(p)
    >7
    >>> int(p)
    7
    >>> p + 10
    AfterPosition(17)

    >>> isinstance(p, AfterPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    Note this potentially surprising behavior:

    >>> p == ExactPosition(7)
    True
    >>> p == BeforePosition(7)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create a new instance of the AfterPosition object.

        Raises AttributeError if extension is non-zero.
        """
        if extension != 0:
            # Name the class actually being constructed; the message used
            # to claim the problem was with an "exact position".
            raise AttributeError(
                f"Non-zero extension {extension} for {cls.__name__}."
            )
        return int.__new__(cls, position)

    def __repr__(self):
        """Represent the location as a string for debugging."""
        return f"{self.__class__.__name__}({int(self)})"

    def __str__(self):
        """Return a representation of the AfterPosition object (with python counting)."""
        return f">{int(self)}"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted.

        The result is built with ``self.__class__`` so subclasses keep
        their own type.
        """
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE).

        The result is a BeforePosition located at ``length - int(self)``.
        """
        return BeforePosition(length - int(self))
jpayne@68 2307
jpayne@68 2308
class OneOfPosition(int, Position):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. If this was
    a start position it should act like 1888, but as an end position 1901.

    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
    >>> p
    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
    >>> int(p)
    1888

    Integer comparisons and operators act like using int(p),

    >>> p == 1888
    True
    >>> p <= 1888
    True
    >>> p > 1888
    False
    >>> p + 100
    OneOfPosition(1988, choices=[ExactPosition(1988), ExactPosition(2001)])

    >>> isinstance(p, OneOfPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    """

    def __new__(cls, position, choices):
        """Initialize with a set of possible positions.

        Arguments:
         - position - An integer specifying the default behavior; it must
           compare equal to one of the entries in choices.
         - choices - A list of Position derived objects, specifying
           possible locations.

        Raises ValueError if position does not match any of the choices.
        """
        if position not in choices:
            raise ValueError(
                f"OneOfPosition: {position!r} should match one of {choices!r}"
            )
        obj = int.__new__(cls, position)
        # The list is stored as given (it is not copied).
        obj.position_choices = choices
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Necessary to allow pickling and unpickling of class instances.
        """
        return (int(self), self.position_choices)

    def __repr__(self):
        """Represent the OneOfPosition object as a string for debugging."""
        return (
            f"{self.__class__.__name__}({int(self)}, "
            f"choices={self.position_choices!r})"
        )

    def __str__(self):
        """Return a representation of the OneOfPosition object (with python counting)."""
        # Joining avoids repeated string concatenation and, unlike trimming
        # a trailing comma, still gives a balanced "one-of()" should the
        # list of choices ever be empty.
        joined = ",".join(str(choice) for choice in self.position_choices)
        return f"one-of({joined})"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted.

        Both the default position and every choice are shifted by offset.
        """
        return self.__class__(
            int(self) + offset, [p + offset for p in self.position_choices]
        )

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE).

        Each choice is flipped individually and the order of the choices
        is reversed.
        """
        return self.__class__(
            length - int(self), [p._flip(length) for p in self.position_choices[::-1]]
        )
jpayne@68 2392
jpayne@68 2393
# When this file is executed directly (rather than imported), run the
# doctests through Biopython's run_doctest helper.
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()