comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/cbedtools.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 # distutils: language = c++
2 # cython: language_level=2
3
4 # String notes:
5 #
6 # Anything that goes in C++ objects should be converted to a C++ <string>
7 # type, using the _cppstr() function. For example: Interval._bed.file_type,
8 # or the entries in Interval._bed.fields.
9 #
10 # Any Python accessor methods (Interval.fields, Interval.__getitem__) should
11 # then be converted to Python strings using the _pystr() function.
12 #
13 # Cython uses the `str` type as whatever the native Python version uses as
14 # str.
15
16 from libcpp.string cimport string
17 import numpy as np
18
19 # Python byte strings automatically coerce to/from C++ strings.
20
21 cdef _cppstr(s):
22 # Use this to handle incoming strings from Python.
23 #
24 # C++ uses bytestrings. PY2 strings need no conversion; bare PY3 strings
25 # are unicode and so must be encoded to bytestring.
26 if isinstance(s, integer_types):
27 s = str(s)
28 if isinstance(s, unicode):
29 s = s.encode('UTF-8')
30 return <string> s
31
32 cdef _pystr(string s):
33 # Use this to prepare a string for sending to Python.
34 #
35 # Always returns unicode.
36 return s.decode('UTF-8', 'strict')
37
38 integer_types = (int, long, np.int64)
39
40
41 """
42 bedtools.pyx: A Cython wrapper for the BEDTools BedFile class
43
44 Authors: Aaron Quinlan[1], Brent Pedersen[2]
45 Affl: [1] Center for Public Health Genomics, University of Virginia
46 [2]
47 Email: aaronquinlan at gmail dot com
48 """
49 from cython.operator cimport dereference as deref
50 import sys
51 import subprocess
52 from collections import defaultdict
53
54 cdef dict LOOKUPS = {
55 "gff": {"chrom": 0, "start": 3, "end": 4, "stop": 4, "strand": 6},
56 "vcf": {"chrom": 0, "start": 1},
57 "bed": {"chrom": 0, "start": 1, "end": 2, "stop": 2, "score": 4, "strand": 5}
58 }
59 for ktype, kdict in list(LOOKUPS.items()):
60 for k, v in list(kdict.items()):
61 kdict[v] = k
62
63 # Keys are tuples of start/start, stop/stop, start/stop, stop/start.
64 # Values are which operators should return True, otherwise False
65 # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5
66 PROFILES_TRUE = {
67 (0, 0, -1, 1): (2, 1, 5), # a == b, a >= b, a <= b
68 # a ---------
69 # b ---------
70
71 (-1, -1, -1, -1): (0, 1), # a < b, a <= b
72 # a ----
73 # b -----
74
75 (-1, -1, -1, 0): (1,), # a <= b
76 # a ----
77 # b ----- (book-ended)
78
79 (1, 1, 0, 1): (5,), # a >= b
80 # a -----
81 # b ---- (book-ended)
82
83 (1, 1, 1, 1): (4, 5), # a > b, a >= b
84 # a ------
85 # b ----
86
87 (0, 1, -1, 1): (5,), # a >= b
88 # a ------------
89 # b ---------
90
91 (1, 0, -1, 1): (5,), # a >= b
92 # a -----------
93 # b -------------
94
95 (-1, 0, -1, 1): (1,), # a <= b
96 # a -------------
97 # b -----------
98
99 (0, -1, -1, 1): (1,), # a <= b
100 # a ---------
101 # b ------------
102
103 (-1, -1, -1, 1): (1,), # a <= b
104 # a -----------
105 # b -----------
106
107 (1, 1, -1, 1): (5,), # a >= b
108 # a -----------
109 # b -----------
110
111 (1, -1, -1, 1): tuple(), # undef
112 # a ----
113 # b -----------
114
115 (-1, 1, -1, 1): tuple(), # undef
116 # a -----------
117 # b ----
118
119 (-1, 0, -1, 0): (1,), # a <= b
120 # a -----------
121 # b -
122
123 (1, 0, 0, 1): (5,), # a >= b
124 # a -
125 # b -----------
126
127 (0, 0, 0, 0): (1, 2, 5), # a == b, a <= b, a >= b
128 # a -
129 # b - (starts and stops are identical for all features)
130 }
131
132
133 class MalformedBedLineError(Exception):
134 pass
135
136
137 class BedToolsFileError(Exception):
138 pass
139
140
141 class Attributes(dict):
142 """
143 Class to map between a dict of attrs and fields[8] of a GFF Interval obj.
144 """
145
146 def __init__(self, attr_str=""):
147 attr_str = str(attr_str)
148 self._attr_str = attr_str
149 self.sort_keys = False
150
151 # in general, GFF files will have either as many '=' as ';'
152 # (or ';'-1 if there's no trailing ';')
153 n_semi = attr_str.count(';')
154 n_eq = attr_str.count('=')
155 n_quotes = attr_str.count('"')
156
157 if n_eq > n_semi - 1:
158 self.sep, self.field_sep = (';', '=')
159 else:
160 self.sep, self.field_sep = (';', ' ')
161
162 self._quoted = {}
163
164 # TODO: pathological case . . . detect this as GFF:
165 #
166 # class_code=" "
167 #
168 # and this as GTF:
169 #
170 # class_code "="
171
172 # quick exit
173 if attr_str == "":
174 return
175
176 kvs = map(str.strip, attr_str.strip().split(self.sep))
177 for field, value in [kv.split(self.field_sep, 1) for kv in kvs if kv]:
178 if value.count('"') == 2:
179 self._quoted[field] = True
180 self[field] = value.replace('"', '')
181
182 def __str__(self):
183 # stringify all items first
184 items = []
185 for field, val in dict.iteritems(self):
186 try:
187 if self._quoted[field]:
188 val = '"' + str(val) + '"'
189 except KeyError:
190 pass
191 items.append((field, val))
192
193 pairs = []
194 if self.sort_keys:
195 items.sort()
196 for k, v in items:
197 pairs.append(self.field_sep.join([k, v]))
198
199 return self.sep.join(pairs) + self.sep
200
201 cdef class Interval:
202 """
203 Class to represent a genomic interval.
204
205 Constructor::
206
207 Interval(chrom, start, end, name=".", score=".", strand=".", otherfields=None)
208
209 Class to represent a genomic interval of any format. Requires at least 3
210 args: chrom (string), start (int), end (int).
211
212 `start` is *always* the 0-based start coordinate. If this Interval is to
213 represent a GFF object (which uses a 1-based coordinate system), then
214 subtract 1 from the 4th item in the line to get the start position in
215 0-based coords for this Interval. The 1-based GFF coord will still be
216 available, albeit as a string, in fields[3].
217
218 `otherfields` is a list of fields that don't fit into the other kwargs, and
219 will be stored in the `fields` attribute of the Interval.
220
221 All the items in `otherfields` must be strings for proper conversion to
222 C++.
223
224 By convention, for BED files, `otherfields` is everything past the first 6
225 items in the line. This allows an Interval to represent composite features
226 (e.g., a GFF line concatenated to the end of a BED line)
227
228 But for other formats (VCF, GFF, SAM), the entire line should be passed in
229 as a list for `otherfields` so that we can always check the
230 Interval.file_type and extract the fields we want, knowing that they'll be
231 in the right order as passed in with `otherfields`.
232
233 Example usage:
234
235 >>> from pybedtools import Interval
236 >>> i = Interval("chr1", 22, 44, strand='-')
237 >>> i
238 Interval(chr1:22-44)
239
240
241 """
242 def __init__(self, chrom, start, end, name=".", score=".", strand=".", otherfields=None):
243 if otherfields is None:
244 otherfields = []
245 otherfields = [_cppstr(i) for i in otherfields]
246 self._bed = new BED(
247 _cppstr(chrom), start, end, _cppstr(name), _cppstr(score),
248 _cppstr(strand), otherfields)
249
250 #self._bed.chrom = _cppstr(chrom)
251 #self._bed.start = start
252 #self._bed.end = end
253 #self._bed.name = _cppstr(name)
254 #self._bed.score = _cppstr(score)
255 #self._bed.strand = _cppstr(strand)
256 fields = [_cppstr(chrom), _cppstr(str(start)), _cppstr(str(end)), _cppstr(name), _cppstr(score), _cppstr(strand)]
257 fields.extend(otherfields)
258 self._bed.fields = fields
259 self._attrs = None
260
261 def __copy__(self):
262 return create_interval_from_list(self.fields)
263
264 def __hash__(self):
265 return hash("\t".join(self.fields))
266
267 property chrom:
268 """ the chromosome of the feature"""
269 def __get__(self):
270 return _pystr(self._bed.chrom)
271
272 def __set__(self, chrom):
273 chrom = _cppstr(chrom)
274 self._bed.chrom = chrom
275 idx = LOOKUPS[self.file_type]["chrom"]
276 self._bed.fields[idx] = _cppstr(chrom)
277
278 # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5
279 def __richcmp__(self, other, int op):
280 if (self.chrom != other.chrom) or (self.strand != other.strand):
281 if op == 3: return True
282 return False
283
284 def cmp(x, y):
285 if x < y:
286 return -1
287 if x == y:
288 return 0
289 if x > y:
290 return 1
291
292
293 # check all 4 so that we can handle nesting and partial overlaps.
294 profile = (cmp(self.start, other.start),
295 cmp(self.stop, other.stop),
296 cmp(self.start, other.stop),
297 cmp(self.stop, other.start))
298
299 try:
300 if PROFILES_TRUE[profile] == tuple():
301 raise NotImplementedError('Features are nested -- comparison undefined')
302
303 if op != 3:
304 if op in PROFILES_TRUE[profile]:
305 return True
306 return False
307 else:
308 if 2 in PROFILES_TRUE[profile]:
309 return False
310 return True
311 except KeyError:
312 raise ValueError('Currently unsupported comparison -- please '
313 'submit a bug report')
314
315 property start:
316 """The 0-based start of the feature."""
317 def __get__(self):
318 return self._bed.start
319
320 def __set__(self, int start):
321 self._bed.start = start
322 idx = LOOKUPS[self.file_type]["start"]
323
324 # Non-BED files should have 1-based coords in fields
325 if self.file_type != 'bed':
326 start += 1
327 self._bed.fields[idx] = _cppstr(str(start))
328
329 property end:
330 """The end of the feature"""
331 def __get__(self):
332 return self._bed.end
333
334 def __set__(self, int end):
335 self._bed.end = end
336 idx = LOOKUPS[self.file_type]["stop"]
337 self._bed.fields[idx] = _cppstr(str(end))
338
339 property stop:
340 """ the end of the feature"""
341 def __get__(self):
342 return self._bed.end
343
344 def __set__(self, int end):
345 idx = LOOKUPS[self.file_type]["stop"]
346 self._bed.fields[idx] = _cppstr(str(end))
347 self._bed.end = end
348
349 property strand:
350 """ the strand of the feature"""
351 def __get__(self):
352 return _pystr(self._bed.strand)
353
354 def __set__(self, strand):
355 idx = LOOKUPS[self.file_type]["strand"]
356 self._bed.fields[idx] = _cppstr(strand)
357 self._bed.strand = _cppstr(strand)
358
359 property length:
360 """ the length of the feature"""
361 def __get__(self):
362 return self._bed.end - self._bed.start
363
364 cpdef deparse_attrs(self):
365
366 if not self._attrs: return
367
368 if self.file_type != "gff":
369 raise ValueError('Interval.attrs was not None, but this was a non-GFF Interval')
370
371 s = self._attrs.__str__()
372 self._bed.fields[8] = _cppstr(s)
373
374 property fields:
375 def __get__(self):
376 self.deparse_attrs()
377 items = []
378 for i in self._bed.fields:
379 if isinstance(i, int):
380 items.append(i)
381 else:
382 items.append(_pystr(i))
383 return items
384
385 property attrs:
386 def __get__(self):
387 if self._attrs is None:
388 ft = _pystr(self._bed.file_type)
389 if ft == 'gff':
390 self._attrs = Attributes(_pystr(self._bed.fields[8]))
391 else:
392 self._attrs = Attributes("")
393 return self._attrs
394
395 def __set__(self, attrs):
396 self._attrs = attrs
397
398 # TODO: make this more robust.
399 @property
400 def count(self):
401 return int(self.fields[-1])
402
403 property name:
404 """
405 >>> import pybedtools
406 >>> vcf = pybedtools.example_bedtool('v.vcf')
407 >>> [v.name for v in vcf]
408 ['rs6054257', 'chr1:16', 'rs6040355', 'chr1:222', 'microsat1']
409
410 """
411 def __get__(self):
412 cdef string ftype = self._bed.file_type
413 value = None
414 if ftype == <string>"gff":
415 """
416 # TODO. allow setting a name_key in the BedTool constructor?
417 if self.name_key and self.name_key in attrs:
418 return attrs[self.name_key]
419 """
420 for key in ("ID", "Name", "gene_name", "transcript_id", \
421 "gene_id", "Parent"):
422 if key in self.attrs:
423 value = self.attrs[key]
424 break
425
426 elif ftype == <string>"vcf":
427 s = self.fields[2]
428 if s in ("", "."):
429 value = "%s:%i" % (self.chrom, self.start)
430 else:
431 value = _pystr(s)
432 elif ftype == <string>"bed":
433 value = _pystr(self._bed.name)
434
435 return value
436
437 def __set__(self, value):
438 cdef string ftype = self._bed.file_type
439
440 if ftype == <string>"gff":
441 for key in ("ID", "Name", "gene_name", "transcript_id", \
442 "gene_id", "Parent"):
443 if not key in self.attrs:
444 continue
445
446 # If it's incoming from Python it's unicode, so store that directly
447 # in the attributes (since an Attribute object works on
448 # unicode)...
449 self.attrs[key] = value
450 break
451
452 # Otherwise use _cppstr() because we're storing it in _bed.fields.
453 elif ftype == <string>"vcf":
454 self._bed.fields[2] = _cppstr(value)
455 else:
456 self._bed.name = _cppstr(value)
457 self._bed.fields[3] = _cppstr(value)
458
459 property score:
460 def __get__(self):
461 return _pystr(self._bed.score)
462
463 def __set__(self, value):
464 value = _cppstr(value)
465 self._bed.score = value
466 idx = LOOKUPS[self.file_type]["score"]
467 self._bed.fields[idx] = value
468
469 property file_type:
470 "bed/vcf/gff"
471 def __get__(self):
472 return _pystr(self._bed.file_type)
473
474 def __set__(self, value):
475 self._bed.file_type = _cppstr(value)
476
477 # TODO: maybe bed.overlap_start or bed.overlap.start ??
478 @property
479 def o_start(self):
480 return self._bed.o_start
481
482 @property
483 def o_end(self):
484 return self._bed.o_end
485
486 @property
487 def o_amt(self):
488 return self._bed.o_end - self._bed.o_start
489
490 def __str__(self):
491 """
492 Interval objects always print with a newline to mimic a line in a
493 BED/GFF/VCF file
494 """
495 items = []
496 for i in self.fields:
497 if isinstance(i, int):
498 i = str(i)
499 items.append(i)
500
501 return '\t'.join(items) + '\n'
502
503 def __repr__(self):
504 return "Interval(%s:%i-%i)" % (self.chrom, self.start, self.end)
505
506 def __dealloc__(self):
507 del self._bed
508
509 def __len__(self):
510 return self._bed.end - self._bed.start
511
512 def __getitem__(self, object key):
513 cdef int i
514 ftype = _pystr(self._bed.file_type)
515
516 self.deparse_attrs()
517
518 if isinstance(key, (int, long)):
519 nfields = self._bed.fields.size()
520 if key >= nfields:
521 raise IndexError('field index out of range')
522 elif key < 0:
523 key = nfields + key
524 return _pystr(self._bed.fields.at(key))
525 elif isinstance(key, slice):
526 indices = key.indices(self._bed.fields.size())
527 return [_pystr(self._bed.fields.at(i)) for i in range(*indices)]
528
529 elif isinstance(key, str):
530 if ftype == "gff":
531 try:
532 return self.attrs[key]
533 except KeyError:
534 pass
535 # We don't have to convert using _pystr() because the __get__
536 # methods do that already.
537 return getattr(self, key)
538
539 def __setitem__(self, object key, object value):
540 if isinstance(key, (int, long)):
541 nfields = self._bed.fields.size()
542 if key >= nfields:
543 raise IndexError('field index out of range')
544 elif key < 0:
545 key = nfields + key
546 self._bed.fields[key] = _cppstr(value)
547
548 ft = _pystr(self._bed.file_type)
549 if key in LOOKUPS[ft]:
550 setattr(self, LOOKUPS[ft][key], value)
551
552 elif isinstance(key, (basestring)):
553 setattr(self, key, value)
554
555 cpdef append(self, object value):
556 self._bed.fields.push_back(_cppstr(value))
557
558 def __nonzero__(self):
559 return True
560
561
562 cdef Interval create_interval(BED b):
563 cdef Interval pyb = Interval.__new__(Interval)
564 pyb._bed = new BED(b.chrom, b.start, b.end, b.name,
565 b.score, b.strand, b.fields,
566 b.o_start, b.o_end, b.bedType, b.file_type, b.status)
567 pyb._bed.fields = b.fields
568 return pyb
569
570 # TODO: optimization: Previously we had (fields[1] + fields[2]).isdigit() when
571 # checking in create_interval_from_list for filetype heuruistics. Is there
572 # a performance hit by checking instances?
573 cdef isdigit(s):
574 if isinstance(s, integer_types):
575 return True
576 return s.isdigit()
577
578
579 cpdef Interval create_interval_from_list(list fields):
580 """
581 Create an Interval object from a list of strings.
582
583 Constructor::
584
585 create_interval_from_list(fields)
586
587 Given the list of strings, `fields`, automatically detects the format (BED,
588 GFF, VCF, SAM) and creates a new Interval object.
589
590 `fields` is a list with an arbitrary number of items (it can be quite long,
591 say after a -wao intersection of a BED12 and a GFF), however, the first
592 fields must conform to one of the supported formats. For example, if you
593 want the resulting Interval to be considered a GFF feature, then the first
594 9 fields must conform to the GFF format. Similarly, if you want the
595 resulting Interval to be considered a BED feature, then the first three
596 fields must be chrom, start, stop.
597
598 Example usage:
599
600 >>> # Creates a BED3 feature
601 >>> feature = create_interval_from_list(['chr1', '1', '100'])
602
603 """
604
605 # TODO: this function is used a lot, and is doing a bit of work. We should
606 # have an optimized version that is directly provided the filetype.
607
608 cdef Interval pyb = Interval.__new__(Interval)
609 orig_fields = fields[:]
610 # BED -- though a VCF will be detected as BED if its 2nd field, id, is a
611 # digit
612
613 # SAM
614 if (
615 (len(fields) >= 11)
616 and isdigit(fields[1])
617 and isdigit(fields[3])
618 and isdigit(fields[4])
619 and (fields[5] not in ['.', '+', '-'])
620 ):
621 # TODO: what should the stop position be? Here, it's just the start
622 # plus the length of the sequence, but perhaps this should eventually
623 # do CIGAR string parsing.
624 if int(fields[1]) & 0x04:
625 # handle unmapped reads
626 chrom = _cppstr("*")
627 start = 0
628 stop = 0
629 else:
630 chrom = _cppstr(fields[2])
631 start = int(fields[3]) - 1
632 stop = int(fields[3]) + len(fields[9]) - 1
633 name = _cppstr(fields[0])
634 score = _cppstr(fields[1])
635 if int(fields[1]) & 0x10:
636 strand = _cppstr('-')
637 else:
638 strand = _cppstr('+')
639
640 # Fields is in SAM format
641 fields[3] = str(start + 1)
642
643 pyb._bed = new BED(
644 chrom,
645 start,
646 stop,
647 strand,
648 name,
649 score,
650 list_to_vector(fields))
651 pyb.file_type = _cppstr('sam')
652
653
654 elif isdigit(fields[1]) and isdigit(fields[2]):
655 # if it's too short, just add some empty fields.
656 if len(fields) < 7:
657 fields.extend([".".encode('UTF-8')] * (6 - len(fields)))
658 other_fields = []
659 else:
660 other_fields = fields[6:]
661
662 pyb._bed = new BED(
663 _cppstr(fields[0]),
664 int(fields[1]),
665 int(fields[2]),
666 _cppstr(fields[3]),
667 _cppstr(fields[4]),
668 _cppstr(fields[5]),
669 list_to_vector(other_fields))
670 pyb.file_type = _cppstr('bed')
671
672 # VCF
673 elif isdigit(fields[1]) and not isdigit(fields[3]) and len(fields) >= 8:
674 pyb._bed = new BED(
675 _cppstr(fields[0]),
676 int(fields[1]) - 1,
677 int(fields[1]),
678 _cppstr(fields[2]),
679 _cppstr(fields[5]),
680 _cppstr('.'),
681 list_to_vector(fields))
682 pyb.file_type = b'vcf'
683
684
685 # GFF
686 elif len(fields) >= 9 and isdigit(fields[3]) and isdigit(fields[4]):
687 pyb._bed = new BED(
688 _cppstr(fields[0]),
689 int(fields[3])-1, int(fields[4]),
690 _cppstr(fields[2]),
691 _cppstr(fields[5]),
692 _cppstr(fields[6]),
693 list_to_vector(fields[7:]))
694 pyb.file_type = _cppstr('gff')
695 else:
696 raise MalformedBedLineError('Unable to detect format from %s' % fields)
697
698 if pyb.start > pyb.end:
699 raise MalformedBedLineError("Start is greater than stop")
700 pyb._bed.fields = list_to_vector(orig_fields)
701 return pyb
702
703 cdef vector[string] list_to_vector(list li):
704 cdef vector[string] s
705 cdef int i
706 for i in range(len(li)):
707 _s = li[i]
708 s.push_back(_cppstr(_s))
709 return s
710
711 cdef list string_vec2list(vector[string] sv):
712 cdef size_t size = sv.size(), i
713 return [_pystr(sv.at(i)) for i in range(size)]
714
715 cdef list bed_vec2list(vector[BED] bv):
716 cdef size_t size = bv.size(), i
717 cdef list l = []
718 cdef BED b
719 for i in range(size):
720 b = bv.at(i)
721 l.append(create_interval(b))
722 return l
723
724
725 def overlap(int s1, int s2, int e1, int e2):
726 return min(e1, e2) - max(s1, s2)
727
728
729 cdef class IntervalIterator:
730 cdef object stream
731 cdef int _itemtype
732 def __init__(self, stream):
733 self.stream = stream
734
735 # For speed, check int rather than call isinstance().
736 # -1 is unset, 0 assumes list/tuple/iterable, and 1 is a string.
737 #
738 # Also assumes that all items in the iterable `stream` are the same
739 # type...this seems like a reasonable assumption.
740 self._itemtype = -1
741
742 def __dealloc__(self):
743 try:
744 self.stream.close()
745 except AttributeError:
746 pass
747
748 def __iter__(self):
749 return self
750
751 def __next__(self):
752 while True:
753 if hasattr(self.stream, 'closed'):
754 if self.stream.closed:
755 raise StopIteration
756 try:
757 line = next(self.stream)
758 except StopIteration:
759 if hasattr(self.stream, 'close'):
760 self.stream.close()
761 raise StopIteration
762
763 if self._itemtype < 0:
764 if isinstance(line, Interval):
765 self._itemtype = 2
766 elif isinstance(line, basestring):
767 self._itemtype = 1
768 else:
769 self._itemtype = 0
770
771 if self._itemtype == 1:
772 if line.startswith(('@', '#', 'track', 'browser')) or len(line.strip()) == 0:
773 continue
774 break
775
776 # Iterable of Interval objects
777 if self._itemtype == 2:
778 return line
779
780 # Iterable of strings, in which case we need to split
781 elif self._itemtype == 1:
782 fields = line.rstrip('\r\n').split('\t')
783
784 # Otherwise assume list/tuple/iterable of fields
785 else:
786 fields = list(line)
787
788 # TODO: optimization: create_interval_from_list should have a version
789 # that accepts C++ string instances
790 return create_interval_from_list(fields)
791
792
793
794 cdef class IntervalFile:
795 cdef BedFile *intervalFile_ptr
796 cdef bint _loaded
797 cdef bint _open
798 cdef string _fn
799 """
800 An IntervalFile provides low-level access to the BEDTools API.
801
802 >>> fn = pybedtools.example_filename('a.bed')
803 >>> intervalfile = pybedtools.IntervalFile(fn)
804
805 """
806 def __init__(self, intervalFile):
807 self.intervalFile_ptr = new BedFile(_cppstr(intervalFile))
808 self._loaded = 0
809 self._open = 0
810 self._fn = _cppstr(intervalFile)
811
812 def __dealloc__(self):
813 del self.intervalFile_ptr
814
815 def __iter__(self):
816 return self
817
818 def __next__(self):
819 if not self._open:
820 result = self.intervalFile_ptr.Open()
821 if result == -1:
822 raise BedToolsFileError("Error opening file")
823 self._open = 1
824 cdef BED b = self.intervalFile_ptr.GetNextBed()
825 if b.status == BED_VALID:
826 return create_interval(b)
827 elif b.status == BED_INVALID:
828 self.intervalFile_ptr.Close()
829 raise StopIteration
830 elif b.status == BED_MALFORMED:
831 self.intervalFile_ptr.Close()
832 raise MalformedBedLineError("malformed line: %s" % string_vec2list(b.fields))
833 else:
834 return next(self)
835
836 @property
837 def fn(self):
838 return _pystr(self._fn)
839
840 @property
841 def file_type(self):
842 if not self.intervalFile_ptr._typeIsKnown:
843 try:
844 a = next(iter(self))
845 file_type = _pystr(self.intervalFile_ptr.file_type)
846 self.intervalFile_ptr.Close()
847 return file_type
848 except MalformedBedLineError:
849 # If it's a SAM, raise a meaningful exception. If not, fail.
850 with open(self.fn) as fn:
851 interval = create_interval_from_list(fn.readline().strip().split())
852 if interval.file_type == 'sam':
853 raise ValueError('IntervalFile objects do not yet natively support SAM. '
854 'Please convert to BED/GFF/VCF first if you want to '
855 'use the low-level API of IntervalFile')
856 else:
857 raise
858
859
860 def loadIntoMap(self):
861 """
862 Prepares file for checking intersections. Used by other methods like all_hits()
863 """
864 if self._loaded:
865 return
866 self.intervalFile_ptr.loadBedFileIntoMap()
867 self._loaded = 1
868
869 def rewind(self):
870 """
871 Jump to the beginning of the file.
872 """
873 if not self._open:
874 self.intervalFile_ptr.Open()
875 self._open = 1
876 self.intervalFile_ptr.Rewind()
877
878 def seek(self, offset):
879 """
880 Jump to a specific byte offset in the file
881 """
882 if not self._open:
883 self.intervalFile_ptr.Open()
884 self._open = 1
885 self.intervalFile_ptr.Seek(offset)
886
887
888 def all_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
889 """
890 :Signature: `IntervalFile.all_hits(interval, same_strand=False, overlap=0.0)`
891
892 Search for the Interval `interval` this file and return **all**
893 overlaps as a list.
894
895 `same_strand`, if True, will only consider hits on the same strand as `interval`.
896
897 `overlap` can be used to specify the fraction of overlap between
898 `interval` and each feature in the IntervalFile.
899
900 Example usage:
901
902 >>> fn = pybedtools.example_filename('a.bed')
903
904 >>> # create an Interval to query with
905 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
906
907 >>> # Create an IntervalFile out of a.bed
908 >>> intervalfile = pybedtools.IntervalFile(fn)
909
910 >>> # get stranded hits
911 >>> intervalfile.all_hits(i, same_strand=True)
912 [Interval(chr1:1-100), Interval(chr1:100-200), Interval(chr1:900-950)]
913
914 """
915 cdef vector[BED] vec_b
916 self.loadIntoMap()
917
918 if same_strand == False:
919 vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), overlap)
920 try:
921 return bed_vec2list(vec_b)
922 finally:
923 pass
924 else:
925 vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), same_strand, overlap)
926 try:
927 return bed_vec2list(vec_b)
928 finally:
929 pass
930
931 # search() is an alias for all_hits
932 search = all_hits
933
934 def any_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
935 """
936 :Signature: `IntervalFile.any_hits(interval, same_strand=False, overlap=0.0)`
937
938 Return 1 if the Interval `interval` had >=1 hit in this IntervalFile, 0 otherwise.
939
940 `same_strand`, if True, will only consider hits on the same strand as `interval`.
941
942 `overlap` can be used to specify the fraction of overlap between
943 `interval` and each feature in the IntervalFile.
944
945 Example usage:
946
947 >>> fn = pybedtools.example_filename('a.bed')
948
949 >>> # create an Interval to query with
950 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
951
952 >>> # Create an IntervalFile out of a.bed
953 >>> intervalfile = pybedtools.IntervalFile(fn)
954
955 >>> # any stranded hits?
956 >>> intervalfile.any_hits(i, same_strand=True)
957 1
958
959 """
960 found = 0
961 self.loadIntoMap()
962
963 if same_strand == False:
964 found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), overlap)
965 else:
966 found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), same_strand, overlap)
967
968 return found
969
970 def count_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
971 """
972 :Signature: `IntervalFile.count_hits(interval, same_strand=False, overlap=0.0)`
973
974 Return the number of overlaps of the Interval `interval` had with this
975 IntervalFile.
976
977 `same_strand`, if True, will only consider hits on the same strand as
978 `interval`.
979
980 `overlap` can be used to specify the fraction of overlap between
981 `interval` and each feature in the IntervalFile.
982
983 Example usage:
984
985 >>> fn = pybedtools.example_filename('a.bed')
986
987 >>> # create an Interval to query with
988 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
989
990 >>> # Create an IntervalFile out of a.bed
991 >>> intervalfile = pybedtools.IntervalFile(fn)
992
993 >>> # get number of stranded hits
994 >>> intervalfile.count_hits(i, same_strand=True)
995 3
996
997 """
998 self.loadIntoMap()
999
1000 if same_strand == False:
1001 return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), overlap)
1002 else:
1003 return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), same_strand, overlap)