jpayne@68
|
1 # distutils: language = c++
|
jpayne@68
|
2 # cython: language_level=2
|
jpayne@68
|
3
|
jpayne@68
|
4 # String notes:
|
jpayne@68
|
5 #
|
jpayne@68
|
6 # Anything that goes in C++ objects should be converted to a C++ <string>
|
jpayne@68
|
7 # type, using the _cppstr() function. For example: Interval._bed.file_type,
|
jpayne@68
|
8 # or the entries in Interval._bed.fields.
|
jpayne@68
|
9 #
|
jpayne@68
|
10 # Any Python accessor methods (Interval.fields, Interval.__getitem__) should
|
jpayne@68
|
11 # then be converted to Python strings using the _pystr() function.
|
jpayne@68
|
12 #
|
jpayne@68
|
13 # Cython uses the `str` type as whatever the native Python version uses as
|
jpayne@68
|
14 # str.
|
jpayne@68
|
15
|
jpayne@68
|
16 from libcpp.string cimport string
|
jpayne@68
|
17 import numpy as np
|
jpayne@68
|
18
|
jpayne@68
|
19 # Python byte strings automatically coerce to/from C++ strings.
|
jpayne@68
|
20
|
jpayne@68
|
21 cdef _cppstr(s):
|
jpayne@68
|
22 # Use this to handle incoming strings from Python.
|
jpayne@68
|
23 #
|
jpayne@68
|
24 # C++ uses bytestrings. PY2 strings need no conversion; bare PY3 strings
|
jpayne@68
|
25 # are unicode and so must be encoded to bytestring.
|
jpayne@68
|
26 if isinstance(s, integer_types):
|
jpayne@68
|
27 s = str(s)
|
jpayne@68
|
28 if isinstance(s, unicode):
|
jpayne@68
|
29 s = s.encode('UTF-8')
|
jpayne@68
|
30 return <string> s
|
jpayne@68
|
31
|
jpayne@68
|
32 cdef _pystr(string s):
|
jpayne@68
|
33 # Use this to prepare a string for sending to Python.
|
jpayne@68
|
34 #
|
jpayne@68
|
35 # Always returns unicode.
|
jpayne@68
|
36 return s.decode('UTF-8', 'strict')
|
jpayne@68
|
37
|
jpayne@68
|
38 integer_types = (int, long, np.int64)
|
jpayne@68
|
39
|
jpayne@68
|
40
|
jpayne@68
|
41 """
|
jpayne@68
|
42 bedtools.pyx: A Cython wrapper for the BEDTools BedFile class
|
jpayne@68
|
43
|
jpayne@68
|
44 Authors: Aaron Quinlan[1], Brent Pedersen[2]
|
jpayne@68
|
45 Affl: [1] Center for Public Health Genomics, University of Virginia
|
jpayne@68
|
46 [2]
|
jpayne@68
|
47 Email: aaronquinlan at gmail dot com
|
jpayne@68
|
48 """
|
jpayne@68
|
49 from cython.operator cimport dereference as deref
|
jpayne@68
|
50 import sys
|
jpayne@68
|
51 import subprocess
|
jpayne@68
|
52 from collections import defaultdict
|
jpayne@68
|
53
|
jpayne@68
|
54 cdef dict LOOKUPS = {
|
jpayne@68
|
55 "gff": {"chrom": 0, "start": 3, "end": 4, "stop": 4, "strand": 6},
|
jpayne@68
|
56 "vcf": {"chrom": 0, "start": 1},
|
jpayne@68
|
57 "bed": {"chrom": 0, "start": 1, "end": 2, "stop": 2, "score": 4, "strand": 5}
|
jpayne@68
|
58 }
|
jpayne@68
|
59 for ktype, kdict in list(LOOKUPS.items()):
|
jpayne@68
|
60 for k, v in list(kdict.items()):
|
jpayne@68
|
61 kdict[v] = k
|
jpayne@68
|
62
|
jpayne@68
|
63 # Keys are tuples of start/start, stop/stop, start/stop, stop/start.
|
jpayne@68
|
64 # Values are which operators should return True, otherwise False
|
jpayne@68
|
65 # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5
|
jpayne@68
|
66 PROFILES_TRUE = {
|
jpayne@68
|
67 (0, 0, -1, 1): (2, 1, 5), # a == b, a >= b, a <= b
|
jpayne@68
|
68 # a ---------
|
jpayne@68
|
69 # b ---------
|
jpayne@68
|
70
|
jpayne@68
|
71 (-1, -1, -1, -1): (0, 1), # a < b, a <= b
|
jpayne@68
|
72 # a ----
|
jpayne@68
|
73 # b -----
|
jpayne@68
|
74
|
jpayne@68
|
75 (-1, -1, -1, 0): (1,), # a <= b
|
jpayne@68
|
76 # a ----
|
jpayne@68
|
77 # b ----- (book-ended)
|
jpayne@68
|
78
|
jpayne@68
|
79 (1, 1, 0, 1): (5,), # a >= b
|
jpayne@68
|
80 # a -----
|
jpayne@68
|
81 # b ---- (book-ended)
|
jpayne@68
|
82
|
jpayne@68
|
83 (1, 1, 1, 1): (4, 5), # a > b, a >= b
|
jpayne@68
|
84 # a ------
|
jpayne@68
|
85 # b ----
|
jpayne@68
|
86
|
jpayne@68
|
87 (0, 1, -1, 1): (5,), # a >= b
|
jpayne@68
|
88 # a ------------
|
jpayne@68
|
89 # b ---------
|
jpayne@68
|
90
|
jpayne@68
|
91 (1, 0, -1, 1): (5,), # a >= b
|
jpayne@68
|
92 # a -----------
|
jpayne@68
|
93 # b -------------
|
jpayne@68
|
94
|
jpayne@68
|
95 (-1, 0, -1, 1): (1,), # a <= b
|
jpayne@68
|
96 # a -------------
|
jpayne@68
|
97 # b -----------
|
jpayne@68
|
98
|
jpayne@68
|
99 (0, -1, -1, 1): (1,), # a <= b
|
jpayne@68
|
100 # a ---------
|
jpayne@68
|
101 # b ------------
|
jpayne@68
|
102
|
jpayne@68
|
103 (-1, -1, -1, 1): (1,), # a <= b
|
jpayne@68
|
104 # a -----------
|
jpayne@68
|
105 # b -----------
|
jpayne@68
|
106
|
jpayne@68
|
107 (1, 1, -1, 1): (5,), # a >= b
|
jpayne@68
|
108 # a -----------
|
jpayne@68
|
109 # b -----------
|
jpayne@68
|
110
|
jpayne@68
|
111 (1, -1, -1, 1): tuple(), # undef
|
jpayne@68
|
112 # a ----
|
jpayne@68
|
113 # b -----------
|
jpayne@68
|
114
|
jpayne@68
|
115 (-1, 1, -1, 1): tuple(), # undef
|
jpayne@68
|
116 # a -----------
|
jpayne@68
|
117 # b ----
|
jpayne@68
|
118
|
jpayne@68
|
119 (-1, 0, -1, 0): (1,), # a <= b
|
jpayne@68
|
120 # a -----------
|
jpayne@68
|
121 # b -
|
jpayne@68
|
122
|
jpayne@68
|
123 (1, 0, 0, 1): (5,), # a >= b
|
jpayne@68
|
124 # a -
|
jpayne@68
|
125 # b -----------
|
jpayne@68
|
126
|
jpayne@68
|
127 (0, 0, 0, 0): (1, 2, 5), # a == b, a <= b, a >= b
|
jpayne@68
|
128 # a -
|
jpayne@68
|
129 # b - (starts and stops are identical for all features)
|
jpayne@68
|
130 }
|
jpayne@68
|
131
|
jpayne@68
|
132
|
jpayne@68
|
133 class MalformedBedLineError(Exception):
|
jpayne@68
|
134 pass
|
jpayne@68
|
135
|
jpayne@68
|
136
|
jpayne@68
|
137 class BedToolsFileError(Exception):
|
jpayne@68
|
138 pass
|
jpayne@68
|
139
|
jpayne@68
|
140
|
jpayne@68
|
141 class Attributes(dict):
|
jpayne@68
|
142 """
|
jpayne@68
|
143 Class to map between a dict of attrs and fields[8] of a GFF Interval obj.
|
jpayne@68
|
144 """
|
jpayne@68
|
145
|
jpayne@68
|
146 def __init__(self, attr_str=""):
|
jpayne@68
|
147 attr_str = str(attr_str)
|
jpayne@68
|
148 self._attr_str = attr_str
|
jpayne@68
|
149 self.sort_keys = False
|
jpayne@68
|
150
|
jpayne@68
|
151 # in general, GFF files will have either as many '=' as ';'
|
jpayne@68
|
152 # (or ';'-1 if there's no trailing ';')
|
jpayne@68
|
153 n_semi = attr_str.count(';')
|
jpayne@68
|
154 n_eq = attr_str.count('=')
|
jpayne@68
|
155 n_quotes = attr_str.count('"')
|
jpayne@68
|
156
|
jpayne@68
|
157 if n_eq > n_semi - 1:
|
jpayne@68
|
158 self.sep, self.field_sep = (';', '=')
|
jpayne@68
|
159 else:
|
jpayne@68
|
160 self.sep, self.field_sep = (';', ' ')
|
jpayne@68
|
161
|
jpayne@68
|
162 self._quoted = {}
|
jpayne@68
|
163
|
jpayne@68
|
164 # TODO: pathological case . . . detect this as GFF:
|
jpayne@68
|
165 #
|
jpayne@68
|
166 # class_code=" "
|
jpayne@68
|
167 #
|
jpayne@68
|
168 # and this as GTF:
|
jpayne@68
|
169 #
|
jpayne@68
|
170 # class_code "="
|
jpayne@68
|
171
|
jpayne@68
|
172 # quick exit
|
jpayne@68
|
173 if attr_str == "":
|
jpayne@68
|
174 return
|
jpayne@68
|
175
|
jpayne@68
|
176 kvs = map(str.strip, attr_str.strip().split(self.sep))
|
jpayne@68
|
177 for field, value in [kv.split(self.field_sep, 1) for kv in kvs if kv]:
|
jpayne@68
|
178 if value.count('"') == 2:
|
jpayne@68
|
179 self._quoted[field] = True
|
jpayne@68
|
180 self[field] = value.replace('"', '')
|
jpayne@68
|
181
|
jpayne@68
|
182 def __str__(self):
|
jpayne@68
|
183 # stringify all items first
|
jpayne@68
|
184 items = []
|
jpayne@68
|
185 for field, val in dict.iteritems(self):
|
jpayne@68
|
186 try:
|
jpayne@68
|
187 if self._quoted[field]:
|
jpayne@68
|
188 val = '"' + str(val) + '"'
|
jpayne@68
|
189 except KeyError:
|
jpayne@68
|
190 pass
|
jpayne@68
|
191 items.append((field, val))
|
jpayne@68
|
192
|
jpayne@68
|
193 pairs = []
|
jpayne@68
|
194 if self.sort_keys:
|
jpayne@68
|
195 items.sort()
|
jpayne@68
|
196 for k, v in items:
|
jpayne@68
|
197 pairs.append(self.field_sep.join([k, v]))
|
jpayne@68
|
198
|
jpayne@68
|
199 return self.sep.join(pairs) + self.sep
|
jpayne@68
|
200
|
jpayne@68
|
201 cdef class Interval:
|
jpayne@68
|
202 """
|
jpayne@68
|
203 Class to represent a genomic interval.
|
jpayne@68
|
204
|
jpayne@68
|
205 Constructor::
|
jpayne@68
|
206
|
jpayne@68
|
207 Interval(chrom, start, end, name=".", score=".", strand=".", otherfields=None)
|
jpayne@68
|
208
|
jpayne@68
|
209 Class to represent a genomic interval of any format. Requires at least 3
|
jpayne@68
|
210 args: chrom (string), start (int), end (int).
|
jpayne@68
|
211
|
jpayne@68
|
212 `start` is *always* the 0-based start coordinate. If this Interval is to
|
jpayne@68
|
213 represent a GFF object (which uses a 1-based coordinate system), then
|
jpayne@68
|
214 subtract 1 from the 4th item in the line to get the start position in
|
jpayne@68
|
215 0-based coords for this Interval. The 1-based GFF coord will still be
|
jpayne@68
|
216 available, albeit as a string, in fields[3].
|
jpayne@68
|
217
|
jpayne@68
|
218 `otherfields` is a list of fields that don't fit into the other kwargs, and
|
jpayne@68
|
219 will be stored in the `fields` attribute of the Interval.
|
jpayne@68
|
220
|
jpayne@68
|
221 All the items in `otherfields` must be strings for proper conversion to
|
jpayne@68
|
222 C++.
|
jpayne@68
|
223
|
jpayne@68
|
224 By convention, for BED files, `otherfields` is everything past the first 6
|
jpayne@68
|
225 items in the line. This allows an Interval to represent composite features
|
jpayne@68
|
226 (e.g., a GFF line concatenated to the end of a BED line)
|
jpayne@68
|
227
|
jpayne@68
|
228 But for other formats (VCF, GFF, SAM), the entire line should be passed in
|
jpayne@68
|
229 as a list for `otherfields` so that we can always check the
|
jpayne@68
|
230 Interval.file_type and extract the fields we want, knowing that they'll be
|
jpayne@68
|
231 in the right order as passed in with `otherfields`.
|
jpayne@68
|
232
|
jpayne@68
|
233 Example usage:
|
jpayne@68
|
234
|
jpayne@68
|
235 >>> from pybedtools import Interval
|
jpayne@68
|
236 >>> i = Interval("chr1", 22, 44, strand='-')
|
jpayne@68
|
237 >>> i
|
jpayne@68
|
238 Interval(chr1:22-44)
|
jpayne@68
|
239
|
jpayne@68
|
240
|
jpayne@68
|
241 """
|
jpayne@68
|
242 def __init__(self, chrom, start, end, name=".", score=".", strand=".", otherfields=None):
|
jpayne@68
|
243 if otherfields is None:
|
jpayne@68
|
244 otherfields = []
|
jpayne@68
|
245 otherfields = [_cppstr(i) for i in otherfields]
|
jpayne@68
|
246 self._bed = new BED(
|
jpayne@68
|
247 _cppstr(chrom), start, end, _cppstr(name), _cppstr(score),
|
jpayne@68
|
248 _cppstr(strand), otherfields)
|
jpayne@68
|
249
|
jpayne@68
|
250 #self._bed.chrom = _cppstr(chrom)
|
jpayne@68
|
251 #self._bed.start = start
|
jpayne@68
|
252 #self._bed.end = end
|
jpayne@68
|
253 #self._bed.name = _cppstr(name)
|
jpayne@68
|
254 #self._bed.score = _cppstr(score)
|
jpayne@68
|
255 #self._bed.strand = _cppstr(strand)
|
jpayne@68
|
256 fields = [_cppstr(chrom), _cppstr(str(start)), _cppstr(str(end)), _cppstr(name), _cppstr(score), _cppstr(strand)]
|
jpayne@68
|
257 fields.extend(otherfields)
|
jpayne@68
|
258 self._bed.fields = fields
|
jpayne@68
|
259 self._attrs = None
|
jpayne@68
|
260
|
jpayne@68
|
261 def __copy__(self):
|
jpayne@68
|
262 return create_interval_from_list(self.fields)
|
jpayne@68
|
263
|
jpayne@68
|
264 def __hash__(self):
|
jpayne@68
|
265 return hash("\t".join(self.fields))
|
jpayne@68
|
266
|
jpayne@68
|
267 property chrom:
|
jpayne@68
|
268 """ the chromosome of the feature"""
|
jpayne@68
|
269 def __get__(self):
|
jpayne@68
|
270 return _pystr(self._bed.chrom)
|
jpayne@68
|
271
|
jpayne@68
|
272 def __set__(self, chrom):
|
jpayne@68
|
273 chrom = _cppstr(chrom)
|
jpayne@68
|
274 self._bed.chrom = chrom
|
jpayne@68
|
275 idx = LOOKUPS[self.file_type]["chrom"]
|
jpayne@68
|
276 self._bed.fields[idx] = _cppstr(chrom)
|
jpayne@68
|
277
|
jpayne@68
|
278 # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5
|
jpayne@68
|
279 def __richcmp__(self, other, int op):
|
jpayne@68
|
280 if (self.chrom != other.chrom) or (self.strand != other.strand):
|
jpayne@68
|
281 if op == 3: return True
|
jpayne@68
|
282 return False
|
jpayne@68
|
283
|
jpayne@68
|
284 def cmp(x, y):
|
jpayne@68
|
285 if x < y:
|
jpayne@68
|
286 return -1
|
jpayne@68
|
287 if x == y:
|
jpayne@68
|
288 return 0
|
jpayne@68
|
289 if x > y:
|
jpayne@68
|
290 return 1
|
jpayne@68
|
291
|
jpayne@68
|
292
|
jpayne@68
|
293 # check all 4 so that we can handle nesting and partial overlaps.
|
jpayne@68
|
294 profile = (cmp(self.start, other.start),
|
jpayne@68
|
295 cmp(self.stop, other.stop),
|
jpayne@68
|
296 cmp(self.start, other.stop),
|
jpayne@68
|
297 cmp(self.stop, other.start))
|
jpayne@68
|
298
|
jpayne@68
|
299 try:
|
jpayne@68
|
300 if PROFILES_TRUE[profile] == tuple():
|
jpayne@68
|
301 raise NotImplementedError('Features are nested -- comparison undefined')
|
jpayne@68
|
302
|
jpayne@68
|
303 if op != 3:
|
jpayne@68
|
304 if op in PROFILES_TRUE[profile]:
|
jpayne@68
|
305 return True
|
jpayne@68
|
306 return False
|
jpayne@68
|
307 else:
|
jpayne@68
|
308 if 2 in PROFILES_TRUE[profile]:
|
jpayne@68
|
309 return False
|
jpayne@68
|
310 return True
|
jpayne@68
|
311 except KeyError:
|
jpayne@68
|
312 raise ValueError('Currently unsupported comparison -- please '
|
jpayne@68
|
313 'submit a bug report')
|
jpayne@68
|
314
|
jpayne@68
|
315 property start:
|
jpayne@68
|
316 """The 0-based start of the feature."""
|
jpayne@68
|
317 def __get__(self):
|
jpayne@68
|
318 return self._bed.start
|
jpayne@68
|
319
|
jpayne@68
|
320 def __set__(self, int start):
|
jpayne@68
|
321 self._bed.start = start
|
jpayne@68
|
322 idx = LOOKUPS[self.file_type]["start"]
|
jpayne@68
|
323
|
jpayne@68
|
324 # Non-BED files should have 1-based coords in fields
|
jpayne@68
|
325 if self.file_type != 'bed':
|
jpayne@68
|
326 start += 1
|
jpayne@68
|
327 self._bed.fields[idx] = _cppstr(str(start))
|
jpayne@68
|
328
|
jpayne@68
|
329 property end:
|
jpayne@68
|
330 """The end of the feature"""
|
jpayne@68
|
331 def __get__(self):
|
jpayne@68
|
332 return self._bed.end
|
jpayne@68
|
333
|
jpayne@68
|
334 def __set__(self, int end):
|
jpayne@68
|
335 self._bed.end = end
|
jpayne@68
|
336 idx = LOOKUPS[self.file_type]["stop"]
|
jpayne@68
|
337 self._bed.fields[idx] = _cppstr(str(end))
|
jpayne@68
|
338
|
jpayne@68
|
339 property stop:
|
jpayne@68
|
340 """ the end of the feature"""
|
jpayne@68
|
341 def __get__(self):
|
jpayne@68
|
342 return self._bed.end
|
jpayne@68
|
343
|
jpayne@68
|
344 def __set__(self, int end):
|
jpayne@68
|
345 idx = LOOKUPS[self.file_type]["stop"]
|
jpayne@68
|
346 self._bed.fields[idx] = _cppstr(str(end))
|
jpayne@68
|
347 self._bed.end = end
|
jpayne@68
|
348
|
jpayne@68
|
349 property strand:
|
jpayne@68
|
350 """ the strand of the feature"""
|
jpayne@68
|
351 def __get__(self):
|
jpayne@68
|
352 return _pystr(self._bed.strand)
|
jpayne@68
|
353
|
jpayne@68
|
354 def __set__(self, strand):
|
jpayne@68
|
355 idx = LOOKUPS[self.file_type]["strand"]
|
jpayne@68
|
356 self._bed.fields[idx] = _cppstr(strand)
|
jpayne@68
|
357 self._bed.strand = _cppstr(strand)
|
jpayne@68
|
358
|
jpayne@68
|
359 property length:
|
jpayne@68
|
360 """ the length of the feature"""
|
jpayne@68
|
361 def __get__(self):
|
jpayne@68
|
362 return self._bed.end - self._bed.start
|
jpayne@68
|
363
|
jpayne@68
|
364 cpdef deparse_attrs(self):
|
jpayne@68
|
365
|
jpayne@68
|
366 if not self._attrs: return
|
jpayne@68
|
367
|
jpayne@68
|
368 if self.file_type != "gff":
|
jpayne@68
|
369 raise ValueError('Interval.attrs was not None, but this was a non-GFF Interval')
|
jpayne@68
|
370
|
jpayne@68
|
371 s = self._attrs.__str__()
|
jpayne@68
|
372 self._bed.fields[8] = _cppstr(s)
|
jpayne@68
|
373
|
jpayne@68
|
374 property fields:
|
jpayne@68
|
375 def __get__(self):
|
jpayne@68
|
376 self.deparse_attrs()
|
jpayne@68
|
377 items = []
|
jpayne@68
|
378 for i in self._bed.fields:
|
jpayne@68
|
379 if isinstance(i, int):
|
jpayne@68
|
380 items.append(i)
|
jpayne@68
|
381 else:
|
jpayne@68
|
382 items.append(_pystr(i))
|
jpayne@68
|
383 return items
|
jpayne@68
|
384
|
jpayne@68
|
385 property attrs:
|
jpayne@68
|
386 def __get__(self):
|
jpayne@68
|
387 if self._attrs is None:
|
jpayne@68
|
388 ft = _pystr(self._bed.file_type)
|
jpayne@68
|
389 if ft == 'gff':
|
jpayne@68
|
390 self._attrs = Attributes(_pystr(self._bed.fields[8]))
|
jpayne@68
|
391 else:
|
jpayne@68
|
392 self._attrs = Attributes("")
|
jpayne@68
|
393 return self._attrs
|
jpayne@68
|
394
|
jpayne@68
|
395 def __set__(self, attrs):
|
jpayne@68
|
396 self._attrs = attrs
|
jpayne@68
|
397
|
jpayne@68
|
398 # TODO: make this more robust.
|
jpayne@68
|
399 @property
|
jpayne@68
|
400 def count(self):
|
jpayne@68
|
401 return int(self.fields[-1])
|
jpayne@68
|
402
|
jpayne@68
|
403 property name:
|
jpayne@68
|
404 """
|
jpayne@68
|
405 >>> import pybedtools
|
jpayne@68
|
406 >>> vcf = pybedtools.example_bedtool('v.vcf')
|
jpayne@68
|
407 >>> [v.name for v in vcf]
|
jpayne@68
|
408 ['rs6054257', 'chr1:16', 'rs6040355', 'chr1:222', 'microsat1']
|
jpayne@68
|
409
|
jpayne@68
|
410 """
|
jpayne@68
|
411 def __get__(self):
|
jpayne@68
|
412 cdef string ftype = self._bed.file_type
|
jpayne@68
|
413 value = None
|
jpayne@68
|
414 if ftype == <string>"gff":
|
jpayne@68
|
415 """
|
jpayne@68
|
416 # TODO. allow setting a name_key in the BedTool constructor?
|
jpayne@68
|
417 if self.name_key and self.name_key in attrs:
|
jpayne@68
|
418 return attrs[self.name_key]
|
jpayne@68
|
419 """
|
jpayne@68
|
420 for key in ("ID", "Name", "gene_name", "transcript_id", \
|
jpayne@68
|
421 "gene_id", "Parent"):
|
jpayne@68
|
422 if key in self.attrs:
|
jpayne@68
|
423 value = self.attrs[key]
|
jpayne@68
|
424 break
|
jpayne@68
|
425
|
jpayne@68
|
426 elif ftype == <string>"vcf":
|
jpayne@68
|
427 s = self.fields[2]
|
jpayne@68
|
428 if s in ("", "."):
|
jpayne@68
|
429 value = "%s:%i" % (self.chrom, self.start)
|
jpayne@68
|
430 else:
|
jpayne@68
|
431 value = _pystr(s)
|
jpayne@68
|
432 elif ftype == <string>"bed":
|
jpayne@68
|
433 value = _pystr(self._bed.name)
|
jpayne@68
|
434
|
jpayne@68
|
435 return value
|
jpayne@68
|
436
|
jpayne@68
|
437 def __set__(self, value):
|
jpayne@68
|
438 cdef string ftype = self._bed.file_type
|
jpayne@68
|
439
|
jpayne@68
|
440 if ftype == <string>"gff":
|
jpayne@68
|
441 for key in ("ID", "Name", "gene_name", "transcript_id", \
|
jpayne@68
|
442 "gene_id", "Parent"):
|
jpayne@68
|
443 if not key in self.attrs:
|
jpayne@68
|
444 continue
|
jpayne@68
|
445
|
jpayne@68
|
446 # If it's incoming from Python it's unicode, so store that directly
|
jpayne@68
|
447 # in the attributes (since an Attribute object works on
|
jpayne@68
|
448 # unicode)...
|
jpayne@68
|
449 self.attrs[key] = value
|
jpayne@68
|
450 break
|
jpayne@68
|
451
|
jpayne@68
|
452 # Otherwise use _cppstr() because we're storing it in _bed.fields.
|
jpayne@68
|
453 elif ftype == <string>"vcf":
|
jpayne@68
|
454 self._bed.fields[2] = _cppstr(value)
|
jpayne@68
|
455 else:
|
jpayne@68
|
456 self._bed.name = _cppstr(value)
|
jpayne@68
|
457 self._bed.fields[3] = _cppstr(value)
|
jpayne@68
|
458
|
jpayne@68
|
459 property score:
|
jpayne@68
|
460 def __get__(self):
|
jpayne@68
|
461 return _pystr(self._bed.score)
|
jpayne@68
|
462
|
jpayne@68
|
463 def __set__(self, value):
|
jpayne@68
|
464 value = _cppstr(value)
|
jpayne@68
|
465 self._bed.score = value
|
jpayne@68
|
466 idx = LOOKUPS[self.file_type]["score"]
|
jpayne@68
|
467 self._bed.fields[idx] = value
|
jpayne@68
|
468
|
jpayne@68
|
469 property file_type:
|
jpayne@68
|
470 "bed/vcf/gff"
|
jpayne@68
|
471 def __get__(self):
|
jpayne@68
|
472 return _pystr(self._bed.file_type)
|
jpayne@68
|
473
|
jpayne@68
|
474 def __set__(self, value):
|
jpayne@68
|
475 self._bed.file_type = _cppstr(value)
|
jpayne@68
|
476
|
jpayne@68
|
477 # TODO: maybe bed.overlap_start or bed.overlap.start ??
|
jpayne@68
|
478 @property
|
jpayne@68
|
479 def o_start(self):
|
jpayne@68
|
480 return self._bed.o_start
|
jpayne@68
|
481
|
jpayne@68
|
482 @property
|
jpayne@68
|
483 def o_end(self):
|
jpayne@68
|
484 return self._bed.o_end
|
jpayne@68
|
485
|
jpayne@68
|
486 @property
|
jpayne@68
|
487 def o_amt(self):
|
jpayne@68
|
488 return self._bed.o_end - self._bed.o_start
|
jpayne@68
|
489
|
jpayne@68
|
490 def __str__(self):
|
jpayne@68
|
491 """
|
jpayne@68
|
492 Interval objects always print with a newline to mimic a line in a
|
jpayne@68
|
493 BED/GFF/VCF file
|
jpayne@68
|
494 """
|
jpayne@68
|
495 items = []
|
jpayne@68
|
496 for i in self.fields:
|
jpayne@68
|
497 if isinstance(i, int):
|
jpayne@68
|
498 i = str(i)
|
jpayne@68
|
499 items.append(i)
|
jpayne@68
|
500
|
jpayne@68
|
501 return '\t'.join(items) + '\n'
|
jpayne@68
|
502
|
jpayne@68
|
503 def __repr__(self):
|
jpayne@68
|
504 return "Interval(%s:%i-%i)" % (self.chrom, self.start, self.end)
|
jpayne@68
|
505
|
jpayne@68
|
506 def __dealloc__(self):
|
jpayne@68
|
507 del self._bed
|
jpayne@68
|
508
|
jpayne@68
|
509 def __len__(self):
|
jpayne@68
|
510 return self._bed.end - self._bed.start
|
jpayne@68
|
511
|
jpayne@68
|
512 def __getitem__(self, object key):
|
jpayne@68
|
513 cdef int i
|
jpayne@68
|
514 ftype = _pystr(self._bed.file_type)
|
jpayne@68
|
515
|
jpayne@68
|
516 self.deparse_attrs()
|
jpayne@68
|
517
|
jpayne@68
|
518 if isinstance(key, (int, long)):
|
jpayne@68
|
519 nfields = self._bed.fields.size()
|
jpayne@68
|
520 if key >= nfields:
|
jpayne@68
|
521 raise IndexError('field index out of range')
|
jpayne@68
|
522 elif key < 0:
|
jpayne@68
|
523 key = nfields + key
|
jpayne@68
|
524 return _pystr(self._bed.fields.at(key))
|
jpayne@68
|
525 elif isinstance(key, slice):
|
jpayne@68
|
526 indices = key.indices(self._bed.fields.size())
|
jpayne@68
|
527 return [_pystr(self._bed.fields.at(i)) for i in range(*indices)]
|
jpayne@68
|
528
|
jpayne@68
|
529 elif isinstance(key, str):
|
jpayne@68
|
530 if ftype == "gff":
|
jpayne@68
|
531 try:
|
jpayne@68
|
532 return self.attrs[key]
|
jpayne@68
|
533 except KeyError:
|
jpayne@68
|
534 pass
|
jpayne@68
|
535 # We don't have to convert using _pystr() because the __get__
|
jpayne@68
|
536 # methods do that already.
|
jpayne@68
|
537 return getattr(self, key)
|
jpayne@68
|
538
|
jpayne@68
|
539 def __setitem__(self, object key, object value):
|
jpayne@68
|
540 if isinstance(key, (int, long)):
|
jpayne@68
|
541 nfields = self._bed.fields.size()
|
jpayne@68
|
542 if key >= nfields:
|
jpayne@68
|
543 raise IndexError('field index out of range')
|
jpayne@68
|
544 elif key < 0:
|
jpayne@68
|
545 key = nfields + key
|
jpayne@68
|
546 self._bed.fields[key] = _cppstr(value)
|
jpayne@68
|
547
|
jpayne@68
|
548 ft = _pystr(self._bed.file_type)
|
jpayne@68
|
549 if key in LOOKUPS[ft]:
|
jpayne@68
|
550 setattr(self, LOOKUPS[ft][key], value)
|
jpayne@68
|
551
|
jpayne@68
|
552 elif isinstance(key, (basestring)):
|
jpayne@68
|
553 setattr(self, key, value)
|
jpayne@68
|
554
|
jpayne@68
|
555 cpdef append(self, object value):
|
jpayne@68
|
556 self._bed.fields.push_back(_cppstr(value))
|
jpayne@68
|
557
|
jpayne@68
|
558 def __nonzero__(self):
|
jpayne@68
|
559 return True
|
jpayne@68
|
560
|
jpayne@68
|
561
|
jpayne@68
|
562 cdef Interval create_interval(BED b):
|
jpayne@68
|
563 cdef Interval pyb = Interval.__new__(Interval)
|
jpayne@68
|
564 pyb._bed = new BED(b.chrom, b.start, b.end, b.name,
|
jpayne@68
|
565 b.score, b.strand, b.fields,
|
jpayne@68
|
566 b.o_start, b.o_end, b.bedType, b.file_type, b.status)
|
jpayne@68
|
567 pyb._bed.fields = b.fields
|
jpayne@68
|
568 return pyb
|
jpayne@68
|
569
|
jpayne@68
|
570 # TODO: optimization: Previously we had (fields[1] + fields[2]).isdigit() when
|
jpayne@68
|
571 # checking in create_interval_from_list for filetype heuruistics. Is there
|
jpayne@68
|
572 # a performance hit by checking instances?
|
jpayne@68
|
573 cdef isdigit(s):
|
jpayne@68
|
574 if isinstance(s, integer_types):
|
jpayne@68
|
575 return True
|
jpayne@68
|
576 return s.isdigit()
|
jpayne@68
|
577
|
jpayne@68
|
578
|
jpayne@68
|
579 cpdef Interval create_interval_from_list(list fields):
|
jpayne@68
|
580 """
|
jpayne@68
|
581 Create an Interval object from a list of strings.
|
jpayne@68
|
582
|
jpayne@68
|
583 Constructor::
|
jpayne@68
|
584
|
jpayne@68
|
585 create_interval_from_list(fields)
|
jpayne@68
|
586
|
jpayne@68
|
587 Given the list of strings, `fields`, automatically detects the format (BED,
|
jpayne@68
|
588 GFF, VCF, SAM) and creates a new Interval object.
|
jpayne@68
|
589
|
jpayne@68
|
590 `fields` is a list with an arbitrary number of items (it can be quite long,
|
jpayne@68
|
591 say after a -wao intersection of a BED12 and a GFF), however, the first
|
jpayne@68
|
592 fields must conform to one of the supported formats. For example, if you
|
jpayne@68
|
593 want the resulting Interval to be considered a GFF feature, then the first
|
jpayne@68
|
594 9 fields must conform to the GFF format. Similarly, if you want the
|
jpayne@68
|
595 resulting Interval to be considered a BED feature, then the first three
|
jpayne@68
|
596 fields must be chrom, start, stop.
|
jpayne@68
|
597
|
jpayne@68
|
598 Example usage:
|
jpayne@68
|
599
|
jpayne@68
|
600 >>> # Creates a BED3 feature
|
jpayne@68
|
601 >>> feature = create_interval_from_list(['chr1', '1', '100'])
|
jpayne@68
|
602
|
jpayne@68
|
603 """
|
jpayne@68
|
604
|
jpayne@68
|
605 # TODO: this function is used a lot, and is doing a bit of work. We should
|
jpayne@68
|
606 # have an optimized version that is directly provided the filetype.
|
jpayne@68
|
607
|
jpayne@68
|
608 cdef Interval pyb = Interval.__new__(Interval)
|
jpayne@68
|
609 orig_fields = fields[:]
|
jpayne@68
|
610 # BED -- though a VCF will be detected as BED if its 2nd field, id, is a
|
jpayne@68
|
611 # digit
|
jpayne@68
|
612
|
jpayne@68
|
613 # SAM
|
jpayne@68
|
614 if (
|
jpayne@68
|
615 (len(fields) >= 11)
|
jpayne@68
|
616 and isdigit(fields[1])
|
jpayne@68
|
617 and isdigit(fields[3])
|
jpayne@68
|
618 and isdigit(fields[4])
|
jpayne@68
|
619 and (fields[5] not in ['.', '+', '-'])
|
jpayne@68
|
620 ):
|
jpayne@68
|
621 # TODO: what should the stop position be? Here, it's just the start
|
jpayne@68
|
622 # plus the length of the sequence, but perhaps this should eventually
|
jpayne@68
|
623 # do CIGAR string parsing.
|
jpayne@68
|
624 if int(fields[1]) & 0x04:
|
jpayne@68
|
625 # handle unmapped reads
|
jpayne@68
|
626 chrom = _cppstr("*")
|
jpayne@68
|
627 start = 0
|
jpayne@68
|
628 stop = 0
|
jpayne@68
|
629 else:
|
jpayne@68
|
630 chrom = _cppstr(fields[2])
|
jpayne@68
|
631 start = int(fields[3]) - 1
|
jpayne@68
|
632 stop = int(fields[3]) + len(fields[9]) - 1
|
jpayne@68
|
633 name = _cppstr(fields[0])
|
jpayne@68
|
634 score = _cppstr(fields[1])
|
jpayne@68
|
635 if int(fields[1]) & 0x10:
|
jpayne@68
|
636 strand = _cppstr('-')
|
jpayne@68
|
637 else:
|
jpayne@68
|
638 strand = _cppstr('+')
|
jpayne@68
|
639
|
jpayne@68
|
640 # Fields is in SAM format
|
jpayne@68
|
641 fields[3] = str(start + 1)
|
jpayne@68
|
642
|
jpayne@68
|
643 pyb._bed = new BED(
|
jpayne@68
|
644 chrom,
|
jpayne@68
|
645 start,
|
jpayne@68
|
646 stop,
|
jpayne@68
|
647 strand,
|
jpayne@68
|
648 name,
|
jpayne@68
|
649 score,
|
jpayne@68
|
650 list_to_vector(fields))
|
jpayne@68
|
651 pyb.file_type = _cppstr('sam')
|
jpayne@68
|
652
|
jpayne@68
|
653
|
jpayne@68
|
654 elif isdigit(fields[1]) and isdigit(fields[2]):
|
jpayne@68
|
655 # if it's too short, just add some empty fields.
|
jpayne@68
|
656 if len(fields) < 7:
|
jpayne@68
|
657 fields.extend([".".encode('UTF-8')] * (6 - len(fields)))
|
jpayne@68
|
658 other_fields = []
|
jpayne@68
|
659 else:
|
jpayne@68
|
660 other_fields = fields[6:]
|
jpayne@68
|
661
|
jpayne@68
|
662 pyb._bed = new BED(
|
jpayne@68
|
663 _cppstr(fields[0]),
|
jpayne@68
|
664 int(fields[1]),
|
jpayne@68
|
665 int(fields[2]),
|
jpayne@68
|
666 _cppstr(fields[3]),
|
jpayne@68
|
667 _cppstr(fields[4]),
|
jpayne@68
|
668 _cppstr(fields[5]),
|
jpayne@68
|
669 list_to_vector(other_fields))
|
jpayne@68
|
670 pyb.file_type = _cppstr('bed')
|
jpayne@68
|
671
|
jpayne@68
|
672 # VCF
|
jpayne@68
|
673 elif isdigit(fields[1]) and not isdigit(fields[3]) and len(fields) >= 8:
|
jpayne@68
|
674 pyb._bed = new BED(
|
jpayne@68
|
675 _cppstr(fields[0]),
|
jpayne@68
|
676 int(fields[1]) - 1,
|
jpayne@68
|
677 int(fields[1]),
|
jpayne@68
|
678 _cppstr(fields[2]),
|
jpayne@68
|
679 _cppstr(fields[5]),
|
jpayne@68
|
680 _cppstr('.'),
|
jpayne@68
|
681 list_to_vector(fields))
|
jpayne@68
|
682 pyb.file_type = b'vcf'
|
jpayne@68
|
683
|
jpayne@68
|
684
|
jpayne@68
|
685 # GFF
|
jpayne@68
|
686 elif len(fields) >= 9 and isdigit(fields[3]) and isdigit(fields[4]):
|
jpayne@68
|
687 pyb._bed = new BED(
|
jpayne@68
|
688 _cppstr(fields[0]),
|
jpayne@68
|
689 int(fields[3])-1, int(fields[4]),
|
jpayne@68
|
690 _cppstr(fields[2]),
|
jpayne@68
|
691 _cppstr(fields[5]),
|
jpayne@68
|
692 _cppstr(fields[6]),
|
jpayne@68
|
693 list_to_vector(fields[7:]))
|
jpayne@68
|
694 pyb.file_type = _cppstr('gff')
|
jpayne@68
|
695 else:
|
jpayne@68
|
696 raise MalformedBedLineError('Unable to detect format from %s' % fields)
|
jpayne@68
|
697
|
jpayne@68
|
698 if pyb.start > pyb.end:
|
jpayne@68
|
699 raise MalformedBedLineError("Start is greater than stop")
|
jpayne@68
|
700 pyb._bed.fields = list_to_vector(orig_fields)
|
jpayne@68
|
701 return pyb
|
jpayne@68
|
702
|
jpayne@68
|
703 cdef vector[string] list_to_vector(list li):
|
jpayne@68
|
704 cdef vector[string] s
|
jpayne@68
|
705 cdef int i
|
jpayne@68
|
706 for i in range(len(li)):
|
jpayne@68
|
707 _s = li[i]
|
jpayne@68
|
708 s.push_back(_cppstr(_s))
|
jpayne@68
|
709 return s
|
jpayne@68
|
710
|
jpayne@68
|
711 cdef list string_vec2list(vector[string] sv):
|
jpayne@68
|
712 cdef size_t size = sv.size(), i
|
jpayne@68
|
713 return [_pystr(sv.at(i)) for i in range(size)]
|
jpayne@68
|
714
|
jpayne@68
|
715 cdef list bed_vec2list(vector[BED] bv):
|
jpayne@68
|
716 cdef size_t size = bv.size(), i
|
jpayne@68
|
717 cdef list l = []
|
jpayne@68
|
718 cdef BED b
|
jpayne@68
|
719 for i in range(size):
|
jpayne@68
|
720 b = bv.at(i)
|
jpayne@68
|
721 l.append(create_interval(b))
|
jpayne@68
|
722 return l
|
jpayne@68
|
723
|
jpayne@68
|
724
|
jpayne@68
|
725 def overlap(int s1, int s2, int e1, int e2):
|
jpayne@68
|
726 return min(e1, e2) - max(s1, s2)
|
jpayne@68
|
727
|
jpayne@68
|
728
|
jpayne@68
|
729 cdef class IntervalIterator:
|
jpayne@68
|
730 cdef object stream
|
jpayne@68
|
731 cdef int _itemtype
|
jpayne@68
|
732 def __init__(self, stream):
|
jpayne@68
|
733 self.stream = stream
|
jpayne@68
|
734
|
jpayne@68
|
735 # For speed, check int rather than call isinstance().
|
jpayne@68
|
736 # -1 is unset, 0 assumes list/tuple/iterable, and 1 is a string.
|
jpayne@68
|
737 #
|
jpayne@68
|
738 # Also assumes that all items in the iterable `stream` are the same
|
jpayne@68
|
739 # type...this seems like a reasonable assumption.
|
jpayne@68
|
740 self._itemtype = -1
|
jpayne@68
|
741
|
jpayne@68
|
742 def __dealloc__(self):
|
jpayne@68
|
743 try:
|
jpayne@68
|
744 self.stream.close()
|
jpayne@68
|
745 except AttributeError:
|
jpayne@68
|
746 pass
|
jpayne@68
|
747
|
jpayne@68
|
748 def __iter__(self):
|
jpayne@68
|
749 return self
|
jpayne@68
|
750
|
jpayne@68
|
751 def __next__(self):
|
jpayne@68
|
752 while True:
|
jpayne@68
|
753 if hasattr(self.stream, 'closed'):
|
jpayne@68
|
754 if self.stream.closed:
|
jpayne@68
|
755 raise StopIteration
|
jpayne@68
|
756 try:
|
jpayne@68
|
757 line = next(self.stream)
|
jpayne@68
|
758 except StopIteration:
|
jpayne@68
|
759 if hasattr(self.stream, 'close'):
|
jpayne@68
|
760 self.stream.close()
|
jpayne@68
|
761 raise StopIteration
|
jpayne@68
|
762
|
jpayne@68
|
763 if self._itemtype < 0:
|
jpayne@68
|
764 if isinstance(line, Interval):
|
jpayne@68
|
765 self._itemtype = 2
|
jpayne@68
|
766 elif isinstance(line, basestring):
|
jpayne@68
|
767 self._itemtype = 1
|
jpayne@68
|
768 else:
|
jpayne@68
|
769 self._itemtype = 0
|
jpayne@68
|
770
|
jpayne@68
|
771 if self._itemtype == 1:
|
jpayne@68
|
772 if line.startswith(('@', '#', 'track', 'browser')) or len(line.strip()) == 0:
|
jpayne@68
|
773 continue
|
jpayne@68
|
774 break
|
jpayne@68
|
775
|
jpayne@68
|
776 # Iterable of Interval objects
|
jpayne@68
|
777 if self._itemtype == 2:
|
jpayne@68
|
778 return line
|
jpayne@68
|
779
|
jpayne@68
|
780 # Iterable of strings, in which case we need to split
|
jpayne@68
|
781 elif self._itemtype == 1:
|
jpayne@68
|
782 fields = line.rstrip('\r\n').split('\t')
|
jpayne@68
|
783
|
jpayne@68
|
784 # Otherwise assume list/tuple/iterable of fields
|
jpayne@68
|
785 else:
|
jpayne@68
|
786 fields = list(line)
|
jpayne@68
|
787
|
jpayne@68
|
788 # TODO: optimization: create_interval_from_list should have a version
|
jpayne@68
|
789 # that accepts C++ string instances
|
jpayne@68
|
790 return create_interval_from_list(fields)
|
jpayne@68
|
791
|
jpayne@68
|
792
|
jpayne@68
|
793
|
jpayne@68
|
794 cdef class IntervalFile:
|
jpayne@68
|
795 cdef BedFile *intervalFile_ptr
|
jpayne@68
|
796 cdef bint _loaded
|
jpayne@68
|
797 cdef bint _open
|
jpayne@68
|
798 cdef string _fn
|
jpayne@68
|
799 """
|
jpayne@68
|
800 An IntervalFile provides low-level access to the BEDTools API.
|
jpayne@68
|
801
|
jpayne@68
|
802 >>> fn = pybedtools.example_filename('a.bed')
|
jpayne@68
|
803 >>> intervalfile = pybedtools.IntervalFile(fn)
|
jpayne@68
|
804
|
jpayne@68
|
805 """
|
jpayne@68
|
806 def __init__(self, intervalFile):
|
jpayne@68
|
807 self.intervalFile_ptr = new BedFile(_cppstr(intervalFile))
|
jpayne@68
|
808 self._loaded = 0
|
jpayne@68
|
809 self._open = 0
|
jpayne@68
|
810 self._fn = _cppstr(intervalFile)
|
jpayne@68
|
811
|
jpayne@68
|
812 def __dealloc__(self):
|
jpayne@68
|
813 del self.intervalFile_ptr
|
jpayne@68
|
814
|
jpayne@68
|
815 def __iter__(self):
|
jpayne@68
|
816 return self
|
jpayne@68
|
817
|
jpayne@68
|
818 def __next__(self):
|
jpayne@68
|
819 if not self._open:
|
jpayne@68
|
820 result = self.intervalFile_ptr.Open()
|
jpayne@68
|
821 if result == -1:
|
jpayne@68
|
822 raise BedToolsFileError("Error opening file")
|
jpayne@68
|
823 self._open = 1
|
jpayne@68
|
824 cdef BED b = self.intervalFile_ptr.GetNextBed()
|
jpayne@68
|
825 if b.status == BED_VALID:
|
jpayne@68
|
826 return create_interval(b)
|
jpayne@68
|
827 elif b.status == BED_INVALID:
|
jpayne@68
|
828 self.intervalFile_ptr.Close()
|
jpayne@68
|
829 raise StopIteration
|
jpayne@68
|
830 elif b.status == BED_MALFORMED:
|
jpayne@68
|
831 self.intervalFile_ptr.Close()
|
jpayne@68
|
832 raise MalformedBedLineError("malformed line: %s" % string_vec2list(b.fields))
|
jpayne@68
|
833 else:
|
jpayne@68
|
834 return next(self)
|
jpayne@68
|
835
|
jpayne@68
|
836 @property
|
jpayne@68
|
837 def fn(self):
|
jpayne@68
|
838 return _pystr(self._fn)
|
jpayne@68
|
839
|
jpayne@68
|
840 @property
|
jpayne@68
|
841 def file_type(self):
|
jpayne@68
|
842 if not self.intervalFile_ptr._typeIsKnown:
|
jpayne@68
|
843 try:
|
jpayne@68
|
844 a = next(iter(self))
|
jpayne@68
|
845 file_type = _pystr(self.intervalFile_ptr.file_type)
|
jpayne@68
|
846 self.intervalFile_ptr.Close()
|
jpayne@68
|
847 return file_type
|
jpayne@68
|
848 except MalformedBedLineError:
|
jpayne@68
|
849 # If it's a SAM, raise a meaningful exception. If not, fail.
|
jpayne@68
|
850 with open(self.fn) as fn:
|
jpayne@68
|
851 interval = create_interval_from_list(fn.readline().strip().split())
|
jpayne@68
|
852 if interval.file_type == 'sam':
|
jpayne@68
|
853 raise ValueError('IntervalFile objects do not yet natively support SAM. '
|
jpayne@68
|
854 'Please convert to BED/GFF/VCF first if you want to '
|
jpayne@68
|
855 'use the low-level API of IntervalFile')
|
jpayne@68
|
856 else:
|
jpayne@68
|
857 raise
|
jpayne@68
|
858
|
jpayne@68
|
859
|
jpayne@68
|
860 def loadIntoMap(self):
|
jpayne@68
|
861 """
|
jpayne@68
|
862 Prepares file for checking intersections. Used by other methods like all_hits()
|
jpayne@68
|
863 """
|
jpayne@68
|
864 if self._loaded:
|
jpayne@68
|
865 return
|
jpayne@68
|
866 self.intervalFile_ptr.loadBedFileIntoMap()
|
jpayne@68
|
867 self._loaded = 1
|
jpayne@68
|
868
|
jpayne@68
|
869 def rewind(self):
|
jpayne@68
|
870 """
|
jpayne@68
|
871 Jump to the beginning of the file.
|
jpayne@68
|
872 """
|
jpayne@68
|
873 if not self._open:
|
jpayne@68
|
874 self.intervalFile_ptr.Open()
|
jpayne@68
|
875 self._open = 1
|
jpayne@68
|
876 self.intervalFile_ptr.Rewind()
|
jpayne@68
|
877
|
jpayne@68
|
878 def seek(self, offset):
|
jpayne@68
|
879 """
|
jpayne@68
|
880 Jump to a specific byte offset in the file
|
jpayne@68
|
881 """
|
jpayne@68
|
882 if not self._open:
|
jpayne@68
|
883 self.intervalFile_ptr.Open()
|
jpayne@68
|
884 self._open = 1
|
jpayne@68
|
885 self.intervalFile_ptr.Seek(offset)
|
jpayne@68
|
886
|
jpayne@68
|
887
|
jpayne@68
|
888 def all_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
|
jpayne@68
|
889 """
|
jpayne@68
|
890 :Signature: `IntervalFile.all_hits(interval, same_strand=False, overlap=0.0)`
|
jpayne@68
|
891
|
jpayne@68
|
892 Search for the Interval `interval` this file and return **all**
|
jpayne@68
|
893 overlaps as a list.
|
jpayne@68
|
894
|
jpayne@68
|
895 `same_strand`, if True, will only consider hits on the same strand as `interval`.
|
jpayne@68
|
896
|
jpayne@68
|
897 `overlap` can be used to specify the fraction of overlap between
|
jpayne@68
|
898 `interval` and each feature in the IntervalFile.
|
jpayne@68
|
899
|
jpayne@68
|
900 Example usage:
|
jpayne@68
|
901
|
jpayne@68
|
902 >>> fn = pybedtools.example_filename('a.bed')
|
jpayne@68
|
903
|
jpayne@68
|
904 >>> # create an Interval to query with
|
jpayne@68
|
905 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
|
jpayne@68
|
906
|
jpayne@68
|
907 >>> # Create an IntervalFile out of a.bed
|
jpayne@68
|
908 >>> intervalfile = pybedtools.IntervalFile(fn)
|
jpayne@68
|
909
|
jpayne@68
|
910 >>> # get stranded hits
|
jpayne@68
|
911 >>> intervalfile.all_hits(i, same_strand=True)
|
jpayne@68
|
912 [Interval(chr1:1-100), Interval(chr1:100-200), Interval(chr1:900-950)]
|
jpayne@68
|
913
|
jpayne@68
|
914 """
|
jpayne@68
|
915 cdef vector[BED] vec_b
|
jpayne@68
|
916 self.loadIntoMap()
|
jpayne@68
|
917
|
jpayne@68
|
918 if same_strand == False:
|
jpayne@68
|
919 vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), overlap)
|
jpayne@68
|
920 try:
|
jpayne@68
|
921 return bed_vec2list(vec_b)
|
jpayne@68
|
922 finally:
|
jpayne@68
|
923 pass
|
jpayne@68
|
924 else:
|
jpayne@68
|
925 vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), same_strand, overlap)
|
jpayne@68
|
926 try:
|
jpayne@68
|
927 return bed_vec2list(vec_b)
|
jpayne@68
|
928 finally:
|
jpayne@68
|
929 pass
|
jpayne@68
|
930
|
jpayne@68
|
931 # search() is an alias for all_hits
|
jpayne@68
|
932 search = all_hits
|
jpayne@68
|
933
|
jpayne@68
|
934 def any_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
|
jpayne@68
|
935 """
|
jpayne@68
|
936 :Signature: `IntervalFile.any_hits(interval, same_strand=False, overlap=0.0)`
|
jpayne@68
|
937
|
jpayne@68
|
938 Return 1 if the Interval `interval` had >=1 hit in this IntervalFile, 0 otherwise.
|
jpayne@68
|
939
|
jpayne@68
|
940 `same_strand`, if True, will only consider hits on the same strand as `interval`.
|
jpayne@68
|
941
|
jpayne@68
|
942 `overlap` can be used to specify the fraction of overlap between
|
jpayne@68
|
943 `interval` and each feature in the IntervalFile.
|
jpayne@68
|
944
|
jpayne@68
|
945 Example usage:
|
jpayne@68
|
946
|
jpayne@68
|
947 >>> fn = pybedtools.example_filename('a.bed')
|
jpayne@68
|
948
|
jpayne@68
|
949 >>> # create an Interval to query with
|
jpayne@68
|
950 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
|
jpayne@68
|
951
|
jpayne@68
|
952 >>> # Create an IntervalFile out of a.bed
|
jpayne@68
|
953 >>> intervalfile = pybedtools.IntervalFile(fn)
|
jpayne@68
|
954
|
jpayne@68
|
955 >>> # any stranded hits?
|
jpayne@68
|
956 >>> intervalfile.any_hits(i, same_strand=True)
|
jpayne@68
|
957 1
|
jpayne@68
|
958
|
jpayne@68
|
959 """
|
jpayne@68
|
960 found = 0
|
jpayne@68
|
961 self.loadIntoMap()
|
jpayne@68
|
962
|
jpayne@68
|
963 if same_strand == False:
|
jpayne@68
|
964 found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), overlap)
|
jpayne@68
|
965 else:
|
jpayne@68
|
966 found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), same_strand, overlap)
|
jpayne@68
|
967
|
jpayne@68
|
968 return found
|
jpayne@68
|
969
|
jpayne@68
|
970 def count_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
|
jpayne@68
|
971 """
|
jpayne@68
|
972 :Signature: `IntervalFile.count_hits(interval, same_strand=False, overlap=0.0)`
|
jpayne@68
|
973
|
jpayne@68
|
974 Return the number of overlaps of the Interval `interval` had with this
|
jpayne@68
|
975 IntervalFile.
|
jpayne@68
|
976
|
jpayne@68
|
977 `same_strand`, if True, will only consider hits on the same strand as
|
jpayne@68
|
978 `interval`.
|
jpayne@68
|
979
|
jpayne@68
|
980 `overlap` can be used to specify the fraction of overlap between
|
jpayne@68
|
981 `interval` and each feature in the IntervalFile.
|
jpayne@68
|
982
|
jpayne@68
|
983 Example usage:
|
jpayne@68
|
984
|
jpayne@68
|
985 >>> fn = pybedtools.example_filename('a.bed')
|
jpayne@68
|
986
|
jpayne@68
|
987 >>> # create an Interval to query with
|
jpayne@68
|
988 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
|
jpayne@68
|
989
|
jpayne@68
|
990 >>> # Create an IntervalFile out of a.bed
|
jpayne@68
|
991 >>> intervalfile = pybedtools.IntervalFile(fn)
|
jpayne@68
|
992
|
jpayne@68
|
993 >>> # get number of stranded hits
|
jpayne@68
|
994 >>> intervalfile.count_hits(i, same_strand=True)
|
jpayne@68
|
995 3
|
jpayne@68
|
996
|
jpayne@68
|
997 """
|
jpayne@68
|
998 self.loadIntoMap()
|
jpayne@68
|
999
|
jpayne@68
|
1000 if same_strand == False:
|
jpayne@68
|
1001 return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), overlap)
|
jpayne@68
|
1002 else:
|
jpayne@68
|
1003 return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), same_strand, overlap)
|