annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/cbedtools.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # distutils: language = c++
jpayne@68 2 # cython: language_level=2
jpayne@68 3
jpayne@68 4 # String notes:
jpayne@68 5 #
jpayne@68 6 # Anything that goes in C++ objects should be converted to a C++ <string>
jpayne@68 7 # type, using the _cppstr() function. For example: Interval._bed.file_type,
jpayne@68 8 # or the entries in Interval._bed.fields.
jpayne@68 9 #
jpayne@68 10 # Any Python accessor methods (Interval.fields, Interval.__getitem__) should
jpayne@68 11 # then be converted to Python strings using the _pystr() function.
jpayne@68 12 #
jpayne@68 13 # Cython uses the `str` type as whatever the native Python version uses as
jpayne@68 14 # str.
jpayne@68 15
jpayne@68 16 from libcpp.string cimport string
jpayne@68 17 import numpy as np
jpayne@68 18
jpayne@68 19 # Python byte strings automatically coerce to/from C++ strings.
jpayne@68 20
jpayne@68 21 cdef _cppstr(s):
jpayne@68 22 # Use this to handle incoming strings from Python.
jpayne@68 23 #
jpayne@68 24 # C++ uses bytestrings. PY2 strings need no conversion; bare PY3 strings
jpayne@68 25 # are unicode and so must be encoded to bytestring.
jpayne@68 26 if isinstance(s, integer_types):
jpayne@68 27 s = str(s)
jpayne@68 28 if isinstance(s, unicode):
jpayne@68 29 s = s.encode('UTF-8')
jpayne@68 30 return <string> s
jpayne@68 31
jpayne@68 32 cdef _pystr(string s):
jpayne@68 33 # Use this to prepare a string for sending to Python.
jpayne@68 34 #
jpayne@68 35 # Always returns unicode.
jpayne@68 36 return s.decode('UTF-8', 'strict')
jpayne@68 37
jpayne@68 38 integer_types = (int, long, np.int64)
jpayne@68 39
jpayne@68 40
jpayne@68 41 """
jpayne@68 42 bedtools.pyx: A Cython wrapper for the BEDTools BedFile class
jpayne@68 43
jpayne@68 44 Authors: Aaron Quinlan[1], Brent Pedersen[2]
jpayne@68 45 Affl: [1] Center for Public Health Genomics, University of Virginia
jpayne@68 46 [2]
jpayne@68 47 Email: aaronquinlan at gmail dot com
jpayne@68 48 """
jpayne@68 49 from cython.operator cimport dereference as deref
jpayne@68 50 import sys
jpayne@68 51 import subprocess
jpayne@68 52 from collections import defaultdict
jpayne@68 53
jpayne@68 54 cdef dict LOOKUPS = {
jpayne@68 55 "gff": {"chrom": 0, "start": 3, "end": 4, "stop": 4, "strand": 6},
jpayne@68 56 "vcf": {"chrom": 0, "start": 1},
jpayne@68 57 "bed": {"chrom": 0, "start": 1, "end": 2, "stop": 2, "score": 4, "strand": 5}
jpayne@68 58 }
jpayne@68 59 for ktype, kdict in list(LOOKUPS.items()):
jpayne@68 60 for k, v in list(kdict.items()):
jpayne@68 61 kdict[v] = k
jpayne@68 62
jpayne@68 63 # Keys are tuples of start/start, stop/stop, start/stop, stop/start.
jpayne@68 64 # Values are which operators should return True, otherwise False
jpayne@68 65 # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5
jpayne@68 66 PROFILES_TRUE = {
jpayne@68 67 (0, 0, -1, 1): (2, 1, 5), # a == b, a >= b, a <= b
jpayne@68 68 # a ---------
jpayne@68 69 # b ---------
jpayne@68 70
jpayne@68 71 (-1, -1, -1, -1): (0, 1), # a < b, a <= b
jpayne@68 72 # a ----
jpayne@68 73 # b -----
jpayne@68 74
jpayne@68 75 (-1, -1, -1, 0): (1,), # a <= b
jpayne@68 76 # a ----
jpayne@68 77 # b ----- (book-ended)
jpayne@68 78
jpayne@68 79 (1, 1, 0, 1): (5,), # a >= b
jpayne@68 80 # a -----
jpayne@68 81 # b ---- (book-ended)
jpayne@68 82
jpayne@68 83 (1, 1, 1, 1): (4, 5), # a > b, a >= b
jpayne@68 84 # a ------
jpayne@68 85 # b ----
jpayne@68 86
jpayne@68 87 (0, 1, -1, 1): (5,), # a >= b
jpayne@68 88 # a ------------
jpayne@68 89 # b ---------
jpayne@68 90
jpayne@68 91 (1, 0, -1, 1): (5,), # a >= b
jpayne@68 92 # a -----------
jpayne@68 93 # b -------------
jpayne@68 94
jpayne@68 95 (-1, 0, -1, 1): (1,), # a <= b
jpayne@68 96 # a -------------
jpayne@68 97 # b -----------
jpayne@68 98
jpayne@68 99 (0, -1, -1, 1): (1,), # a <= b
jpayne@68 100 # a ---------
jpayne@68 101 # b ------------
jpayne@68 102
jpayne@68 103 (-1, -1, -1, 1): (1,), # a <= b
jpayne@68 104 # a -----------
jpayne@68 105 # b -----------
jpayne@68 106
jpayne@68 107 (1, 1, -1, 1): (5,), # a >= b
jpayne@68 108 # a -----------
jpayne@68 109 # b -----------
jpayne@68 110
jpayne@68 111 (1, -1, -1, 1): tuple(), # undef
jpayne@68 112 # a ----
jpayne@68 113 # b -----------
jpayne@68 114
jpayne@68 115 (-1, 1, -1, 1): tuple(), # undef
jpayne@68 116 # a -----------
jpayne@68 117 # b ----
jpayne@68 118
jpayne@68 119 (-1, 0, -1, 0): (1,), # a <= b
jpayne@68 120 # a -----------
jpayne@68 121 # b -
jpayne@68 122
jpayne@68 123 (1, 0, 0, 1): (5,), # a >= b
jpayne@68 124 # a -
jpayne@68 125 # b -----------
jpayne@68 126
jpayne@68 127 (0, 0, 0, 0): (1, 2, 5), # a == b, a <= b, a >= b
jpayne@68 128 # a -
jpayne@68 129 # b - (starts and stops are identical for all features)
jpayne@68 130 }
jpayne@68 131
jpayne@68 132
jpayne@68 133 class MalformedBedLineError(Exception):
jpayne@68 134 pass
jpayne@68 135
jpayne@68 136
jpayne@68 137 class BedToolsFileError(Exception):
jpayne@68 138 pass
jpayne@68 139
jpayne@68 140
jpayne@68 141 class Attributes(dict):
jpayne@68 142 """
jpayne@68 143 Class to map between a dict of attrs and fields[8] of a GFF Interval obj.
jpayne@68 144 """
jpayne@68 145
jpayne@68 146 def __init__(self, attr_str=""):
jpayne@68 147 attr_str = str(attr_str)
jpayne@68 148 self._attr_str = attr_str
jpayne@68 149 self.sort_keys = False
jpayne@68 150
jpayne@68 151 # in general, GFF files will have either as many '=' as ';'
jpayne@68 152 # (or ';'-1 if there's no trailing ';')
jpayne@68 153 n_semi = attr_str.count(';')
jpayne@68 154 n_eq = attr_str.count('=')
jpayne@68 155 n_quotes = attr_str.count('"')
jpayne@68 156
jpayne@68 157 if n_eq > n_semi - 1:
jpayne@68 158 self.sep, self.field_sep = (';', '=')
jpayne@68 159 else:
jpayne@68 160 self.sep, self.field_sep = (';', ' ')
jpayne@68 161
jpayne@68 162 self._quoted = {}
jpayne@68 163
jpayne@68 164 # TODO: pathological case . . . detect this as GFF:
jpayne@68 165 #
jpayne@68 166 # class_code=" "
jpayne@68 167 #
jpayne@68 168 # and this as GTF:
jpayne@68 169 #
jpayne@68 170 # class_code "="
jpayne@68 171
jpayne@68 172 # quick exit
jpayne@68 173 if attr_str == "":
jpayne@68 174 return
jpayne@68 175
jpayne@68 176 kvs = map(str.strip, attr_str.strip().split(self.sep))
jpayne@68 177 for field, value in [kv.split(self.field_sep, 1) for kv in kvs if kv]:
jpayne@68 178 if value.count('"') == 2:
jpayne@68 179 self._quoted[field] = True
jpayne@68 180 self[field] = value.replace('"', '')
jpayne@68 181
jpayne@68 182 def __str__(self):
jpayne@68 183 # stringify all items first
jpayne@68 184 items = []
jpayne@68 185 for field, val in dict.iteritems(self):
jpayne@68 186 try:
jpayne@68 187 if self._quoted[field]:
jpayne@68 188 val = '"' + str(val) + '"'
jpayne@68 189 except KeyError:
jpayne@68 190 pass
jpayne@68 191 items.append((field, val))
jpayne@68 192
jpayne@68 193 pairs = []
jpayne@68 194 if self.sort_keys:
jpayne@68 195 items.sort()
jpayne@68 196 for k, v in items:
jpayne@68 197 pairs.append(self.field_sep.join([k, v]))
jpayne@68 198
jpayne@68 199 return self.sep.join(pairs) + self.sep
jpayne@68 200
jpayne@68 201 cdef class Interval:
jpayne@68 202 """
jpayne@68 203 Class to represent a genomic interval.
jpayne@68 204
jpayne@68 205 Constructor::
jpayne@68 206
jpayne@68 207 Interval(chrom, start, end, name=".", score=".", strand=".", otherfields=None)
jpayne@68 208
jpayne@68 209 Class to represent a genomic interval of any format. Requires at least 3
jpayne@68 210 args: chrom (string), start (int), end (int).
jpayne@68 211
jpayne@68 212 `start` is *always* the 0-based start coordinate. If this Interval is to
jpayne@68 213 represent a GFF object (which uses a 1-based coordinate system), then
jpayne@68 214 subtract 1 from the 4th item in the line to get the start position in
jpayne@68 215 0-based coords for this Interval. The 1-based GFF coord will still be
jpayne@68 216 available, albeit as a string, in fields[3].
jpayne@68 217
jpayne@68 218 `otherfields` is a list of fields that don't fit into the other kwargs, and
jpayne@68 219 will be stored in the `fields` attribute of the Interval.
jpayne@68 220
jpayne@68 221 All the items in `otherfields` must be strings for proper conversion to
jpayne@68 222 C++.
jpayne@68 223
jpayne@68 224 By convention, for BED files, `otherfields` is everything past the first 6
jpayne@68 225 items in the line. This allows an Interval to represent composite features
jpayne@68 226 (e.g., a GFF line concatenated to the end of a BED line)
jpayne@68 227
jpayne@68 228 But for other formats (VCF, GFF, SAM), the entire line should be passed in
jpayne@68 229 as a list for `otherfields` so that we can always check the
jpayne@68 230 Interval.file_type and extract the fields we want, knowing that they'll be
jpayne@68 231 in the right order as passed in with `otherfields`.
jpayne@68 232
jpayne@68 233 Example usage:
jpayne@68 234
jpayne@68 235 >>> from pybedtools import Interval
jpayne@68 236 >>> i = Interval("chr1", 22, 44, strand='-')
jpayne@68 237 >>> i
jpayne@68 238 Interval(chr1:22-44)
jpayne@68 239
jpayne@68 240
jpayne@68 241 """
jpayne@68 242 def __init__(self, chrom, start, end, name=".", score=".", strand=".", otherfields=None):
jpayne@68 243 if otherfields is None:
jpayne@68 244 otherfields = []
jpayne@68 245 otherfields = [_cppstr(i) for i in otherfields]
jpayne@68 246 self._bed = new BED(
jpayne@68 247 _cppstr(chrom), start, end, _cppstr(name), _cppstr(score),
jpayne@68 248 _cppstr(strand), otherfields)
jpayne@68 249
jpayne@68 250 #self._bed.chrom = _cppstr(chrom)
jpayne@68 251 #self._bed.start = start
jpayne@68 252 #self._bed.end = end
jpayne@68 253 #self._bed.name = _cppstr(name)
jpayne@68 254 #self._bed.score = _cppstr(score)
jpayne@68 255 #self._bed.strand = _cppstr(strand)
jpayne@68 256 fields = [_cppstr(chrom), _cppstr(str(start)), _cppstr(str(end)), _cppstr(name), _cppstr(score), _cppstr(strand)]
jpayne@68 257 fields.extend(otherfields)
jpayne@68 258 self._bed.fields = fields
jpayne@68 259 self._attrs = None
jpayne@68 260
jpayne@68 261 def __copy__(self):
jpayne@68 262 return create_interval_from_list(self.fields)
jpayne@68 263
jpayne@68 264 def __hash__(self):
jpayne@68 265 return hash("\t".join(self.fields))
jpayne@68 266
jpayne@68 267 property chrom:
jpayne@68 268 """ the chromosome of the feature"""
jpayne@68 269 def __get__(self):
jpayne@68 270 return _pystr(self._bed.chrom)
jpayne@68 271
jpayne@68 272 def __set__(self, chrom):
jpayne@68 273 chrom = _cppstr(chrom)
jpayne@68 274 self._bed.chrom = chrom
jpayne@68 275 idx = LOOKUPS[self.file_type]["chrom"]
jpayne@68 276 self._bed.fields[idx] = _cppstr(chrom)
jpayne@68 277
jpayne@68 278 # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5
jpayne@68 279 def __richcmp__(self, other, int op):
jpayne@68 280 if (self.chrom != other.chrom) or (self.strand != other.strand):
jpayne@68 281 if op == 3: return True
jpayne@68 282 return False
jpayne@68 283
jpayne@68 284 def cmp(x, y):
jpayne@68 285 if x < y:
jpayne@68 286 return -1
jpayne@68 287 if x == y:
jpayne@68 288 return 0
jpayne@68 289 if x > y:
jpayne@68 290 return 1
jpayne@68 291
jpayne@68 292
jpayne@68 293 # check all 4 so that we can handle nesting and partial overlaps.
jpayne@68 294 profile = (cmp(self.start, other.start),
jpayne@68 295 cmp(self.stop, other.stop),
jpayne@68 296 cmp(self.start, other.stop),
jpayne@68 297 cmp(self.stop, other.start))
jpayne@68 298
jpayne@68 299 try:
jpayne@68 300 if PROFILES_TRUE[profile] == tuple():
jpayne@68 301 raise NotImplementedError('Features are nested -- comparison undefined')
jpayne@68 302
jpayne@68 303 if op != 3:
jpayne@68 304 if op in PROFILES_TRUE[profile]:
jpayne@68 305 return True
jpayne@68 306 return False
jpayne@68 307 else:
jpayne@68 308 if 2 in PROFILES_TRUE[profile]:
jpayne@68 309 return False
jpayne@68 310 return True
jpayne@68 311 except KeyError:
jpayne@68 312 raise ValueError('Currently unsupported comparison -- please '
jpayne@68 313 'submit a bug report')
jpayne@68 314
jpayne@68 315 property start:
jpayne@68 316 """The 0-based start of the feature."""
jpayne@68 317 def __get__(self):
jpayne@68 318 return self._bed.start
jpayne@68 319
jpayne@68 320 def __set__(self, int start):
jpayne@68 321 self._bed.start = start
jpayne@68 322 idx = LOOKUPS[self.file_type]["start"]
jpayne@68 323
jpayne@68 324 # Non-BED files should have 1-based coords in fields
jpayne@68 325 if self.file_type != 'bed':
jpayne@68 326 start += 1
jpayne@68 327 self._bed.fields[idx] = _cppstr(str(start))
jpayne@68 328
jpayne@68 329 property end:
jpayne@68 330 """The end of the feature"""
jpayne@68 331 def __get__(self):
jpayne@68 332 return self._bed.end
jpayne@68 333
jpayne@68 334 def __set__(self, int end):
jpayne@68 335 self._bed.end = end
jpayne@68 336 idx = LOOKUPS[self.file_type]["stop"]
jpayne@68 337 self._bed.fields[idx] = _cppstr(str(end))
jpayne@68 338
jpayne@68 339 property stop:
jpayne@68 340 """ the end of the feature"""
jpayne@68 341 def __get__(self):
jpayne@68 342 return self._bed.end
jpayne@68 343
jpayne@68 344 def __set__(self, int end):
jpayne@68 345 idx = LOOKUPS[self.file_type]["stop"]
jpayne@68 346 self._bed.fields[idx] = _cppstr(str(end))
jpayne@68 347 self._bed.end = end
jpayne@68 348
jpayne@68 349 property strand:
jpayne@68 350 """ the strand of the feature"""
jpayne@68 351 def __get__(self):
jpayne@68 352 return _pystr(self._bed.strand)
jpayne@68 353
jpayne@68 354 def __set__(self, strand):
jpayne@68 355 idx = LOOKUPS[self.file_type]["strand"]
jpayne@68 356 self._bed.fields[idx] = _cppstr(strand)
jpayne@68 357 self._bed.strand = _cppstr(strand)
jpayne@68 358
jpayne@68 359 property length:
jpayne@68 360 """ the length of the feature"""
jpayne@68 361 def __get__(self):
jpayne@68 362 return self._bed.end - self._bed.start
jpayne@68 363
jpayne@68 364 cpdef deparse_attrs(self):
jpayne@68 365
jpayne@68 366 if not self._attrs: return
jpayne@68 367
jpayne@68 368 if self.file_type != "gff":
jpayne@68 369 raise ValueError('Interval.attrs was not None, but this was a non-GFF Interval')
jpayne@68 370
jpayne@68 371 s = self._attrs.__str__()
jpayne@68 372 self._bed.fields[8] = _cppstr(s)
jpayne@68 373
jpayne@68 374 property fields:
jpayne@68 375 def __get__(self):
jpayne@68 376 self.deparse_attrs()
jpayne@68 377 items = []
jpayne@68 378 for i in self._bed.fields:
jpayne@68 379 if isinstance(i, int):
jpayne@68 380 items.append(i)
jpayne@68 381 else:
jpayne@68 382 items.append(_pystr(i))
jpayne@68 383 return items
jpayne@68 384
jpayne@68 385 property attrs:
jpayne@68 386 def __get__(self):
jpayne@68 387 if self._attrs is None:
jpayne@68 388 ft = _pystr(self._bed.file_type)
jpayne@68 389 if ft == 'gff':
jpayne@68 390 self._attrs = Attributes(_pystr(self._bed.fields[8]))
jpayne@68 391 else:
jpayne@68 392 self._attrs = Attributes("")
jpayne@68 393 return self._attrs
jpayne@68 394
jpayne@68 395 def __set__(self, attrs):
jpayne@68 396 self._attrs = attrs
jpayne@68 397
jpayne@68 398 # TODO: make this more robust.
jpayne@68 399 @property
jpayne@68 400 def count(self):
jpayne@68 401 return int(self.fields[-1])
jpayne@68 402
jpayne@68 403 property name:
jpayne@68 404 """
jpayne@68 405 >>> import pybedtools
jpayne@68 406 >>> vcf = pybedtools.example_bedtool('v.vcf')
jpayne@68 407 >>> [v.name for v in vcf]
jpayne@68 408 ['rs6054257', 'chr1:16', 'rs6040355', 'chr1:222', 'microsat1']
jpayne@68 409
jpayne@68 410 """
jpayne@68 411 def __get__(self):
jpayne@68 412 cdef string ftype = self._bed.file_type
jpayne@68 413 value = None
jpayne@68 414 if ftype == <string>"gff":
jpayne@68 415 """
jpayne@68 416 # TODO. allow setting a name_key in the BedTool constructor?
jpayne@68 417 if self.name_key and self.name_key in attrs:
jpayne@68 418 return attrs[self.name_key]
jpayne@68 419 """
jpayne@68 420 for key in ("ID", "Name", "gene_name", "transcript_id", \
jpayne@68 421 "gene_id", "Parent"):
jpayne@68 422 if key in self.attrs:
jpayne@68 423 value = self.attrs[key]
jpayne@68 424 break
jpayne@68 425
jpayne@68 426 elif ftype == <string>"vcf":
jpayne@68 427 s = self.fields[2]
jpayne@68 428 if s in ("", "."):
jpayne@68 429 value = "%s:%i" % (self.chrom, self.start)
jpayne@68 430 else:
jpayne@68 431 value = _pystr(s)
jpayne@68 432 elif ftype == <string>"bed":
jpayne@68 433 value = _pystr(self._bed.name)
jpayne@68 434
jpayne@68 435 return value
jpayne@68 436
jpayne@68 437 def __set__(self, value):
jpayne@68 438 cdef string ftype = self._bed.file_type
jpayne@68 439
jpayne@68 440 if ftype == <string>"gff":
jpayne@68 441 for key in ("ID", "Name", "gene_name", "transcript_id", \
jpayne@68 442 "gene_id", "Parent"):
jpayne@68 443 if not key in self.attrs:
jpayne@68 444 continue
jpayne@68 445
jpayne@68 446 # If it's incoming from Python it's unicode, so store that directly
jpayne@68 447 # in the attributes (since an Attribute object works on
jpayne@68 448 # unicode)...
jpayne@68 449 self.attrs[key] = value
jpayne@68 450 break
jpayne@68 451
jpayne@68 452 # Otherwise use _cppstr() because we're storing it in _bed.fields.
jpayne@68 453 elif ftype == <string>"vcf":
jpayne@68 454 self._bed.fields[2] = _cppstr(value)
jpayne@68 455 else:
jpayne@68 456 self._bed.name = _cppstr(value)
jpayne@68 457 self._bed.fields[3] = _cppstr(value)
jpayne@68 458
jpayne@68 459 property score:
jpayne@68 460 def __get__(self):
jpayne@68 461 return _pystr(self._bed.score)
jpayne@68 462
jpayne@68 463 def __set__(self, value):
jpayne@68 464 value = _cppstr(value)
jpayne@68 465 self._bed.score = value
jpayne@68 466 idx = LOOKUPS[self.file_type]["score"]
jpayne@68 467 self._bed.fields[idx] = value
jpayne@68 468
jpayne@68 469 property file_type:
jpayne@68 470 "bed/vcf/gff"
jpayne@68 471 def __get__(self):
jpayne@68 472 return _pystr(self._bed.file_type)
jpayne@68 473
jpayne@68 474 def __set__(self, value):
jpayne@68 475 self._bed.file_type = _cppstr(value)
jpayne@68 476
jpayne@68 477 # TODO: maybe bed.overlap_start or bed.overlap.start ??
jpayne@68 478 @property
jpayne@68 479 def o_start(self):
jpayne@68 480 return self._bed.o_start
jpayne@68 481
jpayne@68 482 @property
jpayne@68 483 def o_end(self):
jpayne@68 484 return self._bed.o_end
jpayne@68 485
jpayne@68 486 @property
jpayne@68 487 def o_amt(self):
jpayne@68 488 return self._bed.o_end - self._bed.o_start
jpayne@68 489
jpayne@68 490 def __str__(self):
jpayne@68 491 """
jpayne@68 492 Interval objects always print with a newline to mimic a line in a
jpayne@68 493 BED/GFF/VCF file
jpayne@68 494 """
jpayne@68 495 items = []
jpayne@68 496 for i in self.fields:
jpayne@68 497 if isinstance(i, int):
jpayne@68 498 i = str(i)
jpayne@68 499 items.append(i)
jpayne@68 500
jpayne@68 501 return '\t'.join(items) + '\n'
jpayne@68 502
jpayne@68 503 def __repr__(self):
jpayne@68 504 return "Interval(%s:%i-%i)" % (self.chrom, self.start, self.end)
jpayne@68 505
jpayne@68 506 def __dealloc__(self):
jpayne@68 507 del self._bed
jpayne@68 508
jpayne@68 509 def __len__(self):
jpayne@68 510 return self._bed.end - self._bed.start
jpayne@68 511
jpayne@68 512 def __getitem__(self, object key):
jpayne@68 513 cdef int i
jpayne@68 514 ftype = _pystr(self._bed.file_type)
jpayne@68 515
jpayne@68 516 self.deparse_attrs()
jpayne@68 517
jpayne@68 518 if isinstance(key, (int, long)):
jpayne@68 519 nfields = self._bed.fields.size()
jpayne@68 520 if key >= nfields:
jpayne@68 521 raise IndexError('field index out of range')
jpayne@68 522 elif key < 0:
jpayne@68 523 key = nfields + key
jpayne@68 524 return _pystr(self._bed.fields.at(key))
jpayne@68 525 elif isinstance(key, slice):
jpayne@68 526 indices = key.indices(self._bed.fields.size())
jpayne@68 527 return [_pystr(self._bed.fields.at(i)) for i in range(*indices)]
jpayne@68 528
jpayne@68 529 elif isinstance(key, str):
jpayne@68 530 if ftype == "gff":
jpayne@68 531 try:
jpayne@68 532 return self.attrs[key]
jpayne@68 533 except KeyError:
jpayne@68 534 pass
jpayne@68 535 # We don't have to convert using _pystr() because the __get__
jpayne@68 536 # methods do that already.
jpayne@68 537 return getattr(self, key)
jpayne@68 538
jpayne@68 539 def __setitem__(self, object key, object value):
jpayne@68 540 if isinstance(key, (int, long)):
jpayne@68 541 nfields = self._bed.fields.size()
jpayne@68 542 if key >= nfields:
jpayne@68 543 raise IndexError('field index out of range')
jpayne@68 544 elif key < 0:
jpayne@68 545 key = nfields + key
jpayne@68 546 self._bed.fields[key] = _cppstr(value)
jpayne@68 547
jpayne@68 548 ft = _pystr(self._bed.file_type)
jpayne@68 549 if key in LOOKUPS[ft]:
jpayne@68 550 setattr(self, LOOKUPS[ft][key], value)
jpayne@68 551
jpayne@68 552 elif isinstance(key, (basestring)):
jpayne@68 553 setattr(self, key, value)
jpayne@68 554
jpayne@68 555 cpdef append(self, object value):
jpayne@68 556 self._bed.fields.push_back(_cppstr(value))
jpayne@68 557
jpayne@68 558 def __nonzero__(self):
jpayne@68 559 return True
jpayne@68 560
jpayne@68 561
jpayne@68 562 cdef Interval create_interval(BED b):
jpayne@68 563 cdef Interval pyb = Interval.__new__(Interval)
jpayne@68 564 pyb._bed = new BED(b.chrom, b.start, b.end, b.name,
jpayne@68 565 b.score, b.strand, b.fields,
jpayne@68 566 b.o_start, b.o_end, b.bedType, b.file_type, b.status)
jpayne@68 567 pyb._bed.fields = b.fields
jpayne@68 568 return pyb
jpayne@68 569
jpayne@68 570 # TODO: optimization: Previously we had (fields[1] + fields[2]).isdigit() when
jpayne@68 571 # checking in create_interval_from_list for filetype heuruistics. Is there
jpayne@68 572 # a performance hit by checking instances?
jpayne@68 573 cdef isdigit(s):
jpayne@68 574 if isinstance(s, integer_types):
jpayne@68 575 return True
jpayne@68 576 return s.isdigit()
jpayne@68 577
jpayne@68 578
jpayne@68 579 cpdef Interval create_interval_from_list(list fields):
jpayne@68 580 """
jpayne@68 581 Create an Interval object from a list of strings.
jpayne@68 582
jpayne@68 583 Constructor::
jpayne@68 584
jpayne@68 585 create_interval_from_list(fields)
jpayne@68 586
jpayne@68 587 Given the list of strings, `fields`, automatically detects the format (BED,
jpayne@68 588 GFF, VCF, SAM) and creates a new Interval object.
jpayne@68 589
jpayne@68 590 `fields` is a list with an arbitrary number of items (it can be quite long,
jpayne@68 591 say after a -wao intersection of a BED12 and a GFF), however, the first
jpayne@68 592 fields must conform to one of the supported formats. For example, if you
jpayne@68 593 want the resulting Interval to be considered a GFF feature, then the first
jpayne@68 594 9 fields must conform to the GFF format. Similarly, if you want the
jpayne@68 595 resulting Interval to be considered a BED feature, then the first three
jpayne@68 596 fields must be chrom, start, stop.
jpayne@68 597
jpayne@68 598 Example usage:
jpayne@68 599
jpayne@68 600 >>> # Creates a BED3 feature
jpayne@68 601 >>> feature = create_interval_from_list(['chr1', '1', '100'])
jpayne@68 602
jpayne@68 603 """
jpayne@68 604
jpayne@68 605 # TODO: this function is used a lot, and is doing a bit of work. We should
jpayne@68 606 # have an optimized version that is directly provided the filetype.
jpayne@68 607
jpayne@68 608 cdef Interval pyb = Interval.__new__(Interval)
jpayne@68 609 orig_fields = fields[:]
jpayne@68 610 # BED -- though a VCF will be detected as BED if its 2nd field, id, is a
jpayne@68 611 # digit
jpayne@68 612
jpayne@68 613 # SAM
jpayne@68 614 if (
jpayne@68 615 (len(fields) >= 11)
jpayne@68 616 and isdigit(fields[1])
jpayne@68 617 and isdigit(fields[3])
jpayne@68 618 and isdigit(fields[4])
jpayne@68 619 and (fields[5] not in ['.', '+', '-'])
jpayne@68 620 ):
jpayne@68 621 # TODO: what should the stop position be? Here, it's just the start
jpayne@68 622 # plus the length of the sequence, but perhaps this should eventually
jpayne@68 623 # do CIGAR string parsing.
jpayne@68 624 if int(fields[1]) & 0x04:
jpayne@68 625 # handle unmapped reads
jpayne@68 626 chrom = _cppstr("*")
jpayne@68 627 start = 0
jpayne@68 628 stop = 0
jpayne@68 629 else:
jpayne@68 630 chrom = _cppstr(fields[2])
jpayne@68 631 start = int(fields[3]) - 1
jpayne@68 632 stop = int(fields[3]) + len(fields[9]) - 1
jpayne@68 633 name = _cppstr(fields[0])
jpayne@68 634 score = _cppstr(fields[1])
jpayne@68 635 if int(fields[1]) & 0x10:
jpayne@68 636 strand = _cppstr('-')
jpayne@68 637 else:
jpayne@68 638 strand = _cppstr('+')
jpayne@68 639
jpayne@68 640 # Fields is in SAM format
jpayne@68 641 fields[3] = str(start + 1)
jpayne@68 642
jpayne@68 643 pyb._bed = new BED(
jpayne@68 644 chrom,
jpayne@68 645 start,
jpayne@68 646 stop,
jpayne@68 647 strand,
jpayne@68 648 name,
jpayne@68 649 score,
jpayne@68 650 list_to_vector(fields))
jpayne@68 651 pyb.file_type = _cppstr('sam')
jpayne@68 652
jpayne@68 653
jpayne@68 654 elif isdigit(fields[1]) and isdigit(fields[2]):
jpayne@68 655 # if it's too short, just add some empty fields.
jpayne@68 656 if len(fields) < 7:
jpayne@68 657 fields.extend([".".encode('UTF-8')] * (6 - len(fields)))
jpayne@68 658 other_fields = []
jpayne@68 659 else:
jpayne@68 660 other_fields = fields[6:]
jpayne@68 661
jpayne@68 662 pyb._bed = new BED(
jpayne@68 663 _cppstr(fields[0]),
jpayne@68 664 int(fields[1]),
jpayne@68 665 int(fields[2]),
jpayne@68 666 _cppstr(fields[3]),
jpayne@68 667 _cppstr(fields[4]),
jpayne@68 668 _cppstr(fields[5]),
jpayne@68 669 list_to_vector(other_fields))
jpayne@68 670 pyb.file_type = _cppstr('bed')
jpayne@68 671
jpayne@68 672 # VCF
jpayne@68 673 elif isdigit(fields[1]) and not isdigit(fields[3]) and len(fields) >= 8:
jpayne@68 674 pyb._bed = new BED(
jpayne@68 675 _cppstr(fields[0]),
jpayne@68 676 int(fields[1]) - 1,
jpayne@68 677 int(fields[1]),
jpayne@68 678 _cppstr(fields[2]),
jpayne@68 679 _cppstr(fields[5]),
jpayne@68 680 _cppstr('.'),
jpayne@68 681 list_to_vector(fields))
jpayne@68 682 pyb.file_type = b'vcf'
jpayne@68 683
jpayne@68 684
jpayne@68 685 # GFF
jpayne@68 686 elif len(fields) >= 9 and isdigit(fields[3]) and isdigit(fields[4]):
jpayne@68 687 pyb._bed = new BED(
jpayne@68 688 _cppstr(fields[0]),
jpayne@68 689 int(fields[3])-1, int(fields[4]),
jpayne@68 690 _cppstr(fields[2]),
jpayne@68 691 _cppstr(fields[5]),
jpayne@68 692 _cppstr(fields[6]),
jpayne@68 693 list_to_vector(fields[7:]))
jpayne@68 694 pyb.file_type = _cppstr('gff')
jpayne@68 695 else:
jpayne@68 696 raise MalformedBedLineError('Unable to detect format from %s' % fields)
jpayne@68 697
jpayne@68 698 if pyb.start > pyb.end:
jpayne@68 699 raise MalformedBedLineError("Start is greater than stop")
jpayne@68 700 pyb._bed.fields = list_to_vector(orig_fields)
jpayne@68 701 return pyb
jpayne@68 702
jpayne@68 703 cdef vector[string] list_to_vector(list li):
jpayne@68 704 cdef vector[string] s
jpayne@68 705 cdef int i
jpayne@68 706 for i in range(len(li)):
jpayne@68 707 _s = li[i]
jpayne@68 708 s.push_back(_cppstr(_s))
jpayne@68 709 return s
jpayne@68 710
jpayne@68 711 cdef list string_vec2list(vector[string] sv):
jpayne@68 712 cdef size_t size = sv.size(), i
jpayne@68 713 return [_pystr(sv.at(i)) for i in range(size)]
jpayne@68 714
jpayne@68 715 cdef list bed_vec2list(vector[BED] bv):
jpayne@68 716 cdef size_t size = bv.size(), i
jpayne@68 717 cdef list l = []
jpayne@68 718 cdef BED b
jpayne@68 719 for i in range(size):
jpayne@68 720 b = bv.at(i)
jpayne@68 721 l.append(create_interval(b))
jpayne@68 722 return l
jpayne@68 723
jpayne@68 724
jpayne@68 725 def overlap(int s1, int s2, int e1, int e2):
jpayne@68 726 return min(e1, e2) - max(s1, s2)
jpayne@68 727
jpayne@68 728
jpayne@68 729 cdef class IntervalIterator:
jpayne@68 730 cdef object stream
jpayne@68 731 cdef int _itemtype
jpayne@68 732 def __init__(self, stream):
jpayne@68 733 self.stream = stream
jpayne@68 734
jpayne@68 735 # For speed, check int rather than call isinstance().
jpayne@68 736 # -1 is unset, 0 assumes list/tuple/iterable, and 1 is a string.
jpayne@68 737 #
jpayne@68 738 # Also assumes that all items in the iterable `stream` are the same
jpayne@68 739 # type...this seems like a reasonable assumption.
jpayne@68 740 self._itemtype = -1
jpayne@68 741
jpayne@68 742 def __dealloc__(self):
jpayne@68 743 try:
jpayne@68 744 self.stream.close()
jpayne@68 745 except AttributeError:
jpayne@68 746 pass
jpayne@68 747
jpayne@68 748 def __iter__(self):
jpayne@68 749 return self
jpayne@68 750
jpayne@68 751 def __next__(self):
jpayne@68 752 while True:
jpayne@68 753 if hasattr(self.stream, 'closed'):
jpayne@68 754 if self.stream.closed:
jpayne@68 755 raise StopIteration
jpayne@68 756 try:
jpayne@68 757 line = next(self.stream)
jpayne@68 758 except StopIteration:
jpayne@68 759 if hasattr(self.stream, 'close'):
jpayne@68 760 self.stream.close()
jpayne@68 761 raise StopIteration
jpayne@68 762
jpayne@68 763 if self._itemtype < 0:
jpayne@68 764 if isinstance(line, Interval):
jpayne@68 765 self._itemtype = 2
jpayne@68 766 elif isinstance(line, basestring):
jpayne@68 767 self._itemtype = 1
jpayne@68 768 else:
jpayne@68 769 self._itemtype = 0
jpayne@68 770
jpayne@68 771 if self._itemtype == 1:
jpayne@68 772 if line.startswith(('@', '#', 'track', 'browser')) or len(line.strip()) == 0:
jpayne@68 773 continue
jpayne@68 774 break
jpayne@68 775
jpayne@68 776 # Iterable of Interval objects
jpayne@68 777 if self._itemtype == 2:
jpayne@68 778 return line
jpayne@68 779
jpayne@68 780 # Iterable of strings, in which case we need to split
jpayne@68 781 elif self._itemtype == 1:
jpayne@68 782 fields = line.rstrip('\r\n').split('\t')
jpayne@68 783
jpayne@68 784 # Otherwise assume list/tuple/iterable of fields
jpayne@68 785 else:
jpayne@68 786 fields = list(line)
jpayne@68 787
jpayne@68 788 # TODO: optimization: create_interval_from_list should have a version
jpayne@68 789 # that accepts C++ string instances
jpayne@68 790 return create_interval_from_list(fields)
jpayne@68 791
jpayne@68 792
jpayne@68 793
jpayne@68 794 cdef class IntervalFile:
jpayne@68 795 cdef BedFile *intervalFile_ptr
jpayne@68 796 cdef bint _loaded
jpayne@68 797 cdef bint _open
jpayne@68 798 cdef string _fn
jpayne@68 799 """
jpayne@68 800 An IntervalFile provides low-level access to the BEDTools API.
jpayne@68 801
jpayne@68 802 >>> fn = pybedtools.example_filename('a.bed')
jpayne@68 803 >>> intervalfile = pybedtools.IntervalFile(fn)
jpayne@68 804
jpayne@68 805 """
jpayne@68 806 def __init__(self, intervalFile):
jpayne@68 807 self.intervalFile_ptr = new BedFile(_cppstr(intervalFile))
jpayne@68 808 self._loaded = 0
jpayne@68 809 self._open = 0
jpayne@68 810 self._fn = _cppstr(intervalFile)
jpayne@68 811
jpayne@68 812 def __dealloc__(self):
jpayne@68 813 del self.intervalFile_ptr
jpayne@68 814
jpayne@68 815 def __iter__(self):
jpayne@68 816 return self
jpayne@68 817
jpayne@68 818 def __next__(self):
jpayne@68 819 if not self._open:
jpayne@68 820 result = self.intervalFile_ptr.Open()
jpayne@68 821 if result == -1:
jpayne@68 822 raise BedToolsFileError("Error opening file")
jpayne@68 823 self._open = 1
jpayne@68 824 cdef BED b = self.intervalFile_ptr.GetNextBed()
jpayne@68 825 if b.status == BED_VALID:
jpayne@68 826 return create_interval(b)
jpayne@68 827 elif b.status == BED_INVALID:
jpayne@68 828 self.intervalFile_ptr.Close()
jpayne@68 829 raise StopIteration
jpayne@68 830 elif b.status == BED_MALFORMED:
jpayne@68 831 self.intervalFile_ptr.Close()
jpayne@68 832 raise MalformedBedLineError("malformed line: %s" % string_vec2list(b.fields))
jpayne@68 833 else:
jpayne@68 834 return next(self)
jpayne@68 835
jpayne@68 836 @property
jpayne@68 837 def fn(self):
jpayne@68 838 return _pystr(self._fn)
jpayne@68 839
jpayne@68 840 @property
jpayne@68 841 def file_type(self):
jpayne@68 842 if not self.intervalFile_ptr._typeIsKnown:
jpayne@68 843 try:
jpayne@68 844 a = next(iter(self))
jpayne@68 845 file_type = _pystr(self.intervalFile_ptr.file_type)
jpayne@68 846 self.intervalFile_ptr.Close()
jpayne@68 847 return file_type
jpayne@68 848 except MalformedBedLineError:
jpayne@68 849 # If it's a SAM, raise a meaningful exception. If not, fail.
jpayne@68 850 with open(self.fn) as fn:
jpayne@68 851 interval = create_interval_from_list(fn.readline().strip().split())
jpayne@68 852 if interval.file_type == 'sam':
jpayne@68 853 raise ValueError('IntervalFile objects do not yet natively support SAM. '
jpayne@68 854 'Please convert to BED/GFF/VCF first if you want to '
jpayne@68 855 'use the low-level API of IntervalFile')
jpayne@68 856 else:
jpayne@68 857 raise
jpayne@68 858
jpayne@68 859
jpayne@68 860 def loadIntoMap(self):
jpayne@68 861 """
jpayne@68 862 Prepares file for checking intersections. Used by other methods like all_hits()
jpayne@68 863 """
jpayne@68 864 if self._loaded:
jpayne@68 865 return
jpayne@68 866 self.intervalFile_ptr.loadBedFileIntoMap()
jpayne@68 867 self._loaded = 1
jpayne@68 868
jpayne@68 869 def rewind(self):
jpayne@68 870 """
jpayne@68 871 Jump to the beginning of the file.
jpayne@68 872 """
jpayne@68 873 if not self._open:
jpayne@68 874 self.intervalFile_ptr.Open()
jpayne@68 875 self._open = 1
jpayne@68 876 self.intervalFile_ptr.Rewind()
jpayne@68 877
jpayne@68 878 def seek(self, offset):
jpayne@68 879 """
jpayne@68 880 Jump to a specific byte offset in the file
jpayne@68 881 """
jpayne@68 882 if not self._open:
jpayne@68 883 self.intervalFile_ptr.Open()
jpayne@68 884 self._open = 1
jpayne@68 885 self.intervalFile_ptr.Seek(offset)
jpayne@68 886
jpayne@68 887
jpayne@68 888 def all_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
jpayne@68 889 """
jpayne@68 890 :Signature: `IntervalFile.all_hits(interval, same_strand=False, overlap=0.0)`
jpayne@68 891
jpayne@68 892 Search for the Interval `interval` this file and return **all**
jpayne@68 893 overlaps as a list.
jpayne@68 894
jpayne@68 895 `same_strand`, if True, will only consider hits on the same strand as `interval`.
jpayne@68 896
jpayne@68 897 `overlap` can be used to specify the fraction of overlap between
jpayne@68 898 `interval` and each feature in the IntervalFile.
jpayne@68 899
jpayne@68 900 Example usage:
jpayne@68 901
jpayne@68 902 >>> fn = pybedtools.example_filename('a.bed')
jpayne@68 903
jpayne@68 904 >>> # create an Interval to query with
jpayne@68 905 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
jpayne@68 906
jpayne@68 907 >>> # Create an IntervalFile out of a.bed
jpayne@68 908 >>> intervalfile = pybedtools.IntervalFile(fn)
jpayne@68 909
jpayne@68 910 >>> # get stranded hits
jpayne@68 911 >>> intervalfile.all_hits(i, same_strand=True)
jpayne@68 912 [Interval(chr1:1-100), Interval(chr1:100-200), Interval(chr1:900-950)]
jpayne@68 913
jpayne@68 914 """
jpayne@68 915 cdef vector[BED] vec_b
jpayne@68 916 self.loadIntoMap()
jpayne@68 917
jpayne@68 918 if same_strand == False:
jpayne@68 919 vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), overlap)
jpayne@68 920 try:
jpayne@68 921 return bed_vec2list(vec_b)
jpayne@68 922 finally:
jpayne@68 923 pass
jpayne@68 924 else:
jpayne@68 925 vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), same_strand, overlap)
jpayne@68 926 try:
jpayne@68 927 return bed_vec2list(vec_b)
jpayne@68 928 finally:
jpayne@68 929 pass
jpayne@68 930
jpayne@68 931 # search() is an alias for all_hits
jpayne@68 932 search = all_hits
jpayne@68 933
jpayne@68 934 def any_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
jpayne@68 935 """
jpayne@68 936 :Signature: `IntervalFile.any_hits(interval, same_strand=False, overlap=0.0)`
jpayne@68 937
jpayne@68 938 Return 1 if the Interval `interval` had >=1 hit in this IntervalFile, 0 otherwise.
jpayne@68 939
jpayne@68 940 `same_strand`, if True, will only consider hits on the same strand as `interval`.
jpayne@68 941
jpayne@68 942 `overlap` can be used to specify the fraction of overlap between
jpayne@68 943 `interval` and each feature in the IntervalFile.
jpayne@68 944
jpayne@68 945 Example usage:
jpayne@68 946
jpayne@68 947 >>> fn = pybedtools.example_filename('a.bed')
jpayne@68 948
jpayne@68 949 >>> # create an Interval to query with
jpayne@68 950 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
jpayne@68 951
jpayne@68 952 >>> # Create an IntervalFile out of a.bed
jpayne@68 953 >>> intervalfile = pybedtools.IntervalFile(fn)
jpayne@68 954
jpayne@68 955 >>> # any stranded hits?
jpayne@68 956 >>> intervalfile.any_hits(i, same_strand=True)
jpayne@68 957 1
jpayne@68 958
jpayne@68 959 """
jpayne@68 960 found = 0
jpayne@68 961 self.loadIntoMap()
jpayne@68 962
jpayne@68 963 if same_strand == False:
jpayne@68 964 found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), overlap)
jpayne@68 965 else:
jpayne@68 966 found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), same_strand, overlap)
jpayne@68 967
jpayne@68 968 return found
jpayne@68 969
jpayne@68 970 def count_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
jpayne@68 971 """
jpayne@68 972 :Signature: `IntervalFile.count_hits(interval, same_strand=False, overlap=0.0)`
jpayne@68 973
jpayne@68 974 Return the number of overlaps of the Interval `interval` had with this
jpayne@68 975 IntervalFile.
jpayne@68 976
jpayne@68 977 `same_strand`, if True, will only consider hits on the same strand as
jpayne@68 978 `interval`.
jpayne@68 979
jpayne@68 980 `overlap` can be used to specify the fraction of overlap between
jpayne@68 981 `interval` and each feature in the IntervalFile.
jpayne@68 982
jpayne@68 983 Example usage:
jpayne@68 984
jpayne@68 985 >>> fn = pybedtools.example_filename('a.bed')
jpayne@68 986
jpayne@68 987 >>> # create an Interval to query with
jpayne@68 988 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
jpayne@68 989
jpayne@68 990 >>> # Create an IntervalFile out of a.bed
jpayne@68 991 >>> intervalfile = pybedtools.IntervalFile(fn)
jpayne@68 992
jpayne@68 993 >>> # get number of stranded hits
jpayne@68 994 >>> intervalfile.count_hits(i, same_strand=True)
jpayne@68 995 3
jpayne@68 996
jpayne@68 997 """
jpayne@68 998 self.loadIntoMap()
jpayne@68 999
jpayne@68 1000 if same_strand == False:
jpayne@68 1001 return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), overlap)
jpayne@68 1002 else:
jpayne@68 1003 return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), same_strand, overlap)