Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/cbedtools.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 # distutils: language = c++ | |
2 # cython: language_level=2 | |
3 | |
4 # String notes: | |
5 # | |
6 # Anything that goes in C++ objects should be converted to a C++ <string> | |
7 # type, using the _cppstr() function. For example: Interval._bed.file_type, | |
8 # or the entries in Interval._bed.fields. | |
9 # | |
10 # Any Python accessor methods (Interval.fields, Interval.__getitem__) should | |
11 # then be converted to Python strings using the _pystr() function. | |
12 # | |
13 # Cython uses the `str` type as whatever the native Python version uses as | |
14 # str. | |
15 | |
16 from libcpp.string cimport string | |
17 import numpy as np | |
18 | |
19 # Python byte strings automatically coerce to/from C++ strings. | |
20 | |
21 cdef _cppstr(s): | |
22 # Use this to handle incoming strings from Python. | |
23 # | |
24 # C++ uses bytestrings. PY2 strings need no conversion; bare PY3 strings | |
25 # are unicode and so must be encoded to bytestring. | |
26 if isinstance(s, integer_types): | |
27 s = str(s) | |
28 if isinstance(s, unicode): | |
29 s = s.encode('UTF-8') | |
30 return <string> s | |
31 | |
32 cdef _pystr(string s): | |
33 # Use this to prepare a string for sending to Python. | |
34 # | |
35 # Always returns unicode. | |
36 return s.decode('UTF-8', 'strict') | |
37 | |
38 integer_types = (int, long, np.int64) | |
39 | |
40 | |
41 """ | |
42 bedtools.pyx: A Cython wrapper for the BEDTools BedFile class | |
43 | |
44 Authors: Aaron Quinlan[1], Brent Pedersen[2] | |
45 Affl: [1] Center for Public Health Genomics, University of Virginia | |
46 [2] | |
47 Email: aaronquinlan at gmail dot com | |
48 """ | |
49 from cython.operator cimport dereference as deref | |
50 import sys | |
51 import subprocess | |
52 from collections import defaultdict | |
53 | |
54 cdef dict LOOKUPS = { | |
55 "gff": {"chrom": 0, "start": 3, "end": 4, "stop": 4, "strand": 6}, | |
56 "vcf": {"chrom": 0, "start": 1}, | |
57 "bed": {"chrom": 0, "start": 1, "end": 2, "stop": 2, "score": 4, "strand": 5} | |
58 } | |
59 for ktype, kdict in list(LOOKUPS.items()): | |
60 for k, v in list(kdict.items()): | |
61 kdict[v] = k | |
62 | |
63 # Keys are tuples of start/start, stop/stop, start/stop, stop/start. | |
64 # Values are which operators should return True, otherwise False | |
65 # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5 | |
66 PROFILES_TRUE = { | |
67 (0, 0, -1, 1): (2, 1, 5), # a == b, a >= b, a <= b | |
68 # a --------- | |
69 # b --------- | |
70 | |
71 (-1, -1, -1, -1): (0, 1), # a < b, a <= b | |
72 # a ---- | |
73 # b ----- | |
74 | |
75 (-1, -1, -1, 0): (1,), # a <= b | |
76 # a ---- | |
77 # b ----- (book-ended) | |
78 | |
79 (1, 1, 0, 1): (5,), # a >= b | |
80 # a ----- | |
81 # b ---- (book-ended) | |
82 | |
83 (1, 1, 1, 1): (4, 5), # a > b, a >= b | |
84 # a ------ | |
85 # b ---- | |
86 | |
87 (0, 1, -1, 1): (5,), # a >= b | |
88 # a ------------ | |
89 # b --------- | |
90 | |
91 (1, 0, -1, 1): (5,), # a >= b | |
92 # a ----------- | |
93 # b ------------- | |
94 | |
95 (-1, 0, -1, 1): (1,), # a <= b | |
96 # a ------------- | |
97 # b ----------- | |
98 | |
99 (0, -1, -1, 1): (1,), # a <= b | |
100 # a --------- | |
101 # b ------------ | |
102 | |
103 (-1, -1, -1, 1): (1,), # a <= b | |
104 # a ----------- | |
105 # b ----------- | |
106 | |
107 (1, 1, -1, 1): (5,), # a >= b | |
108 # a ----------- | |
109 # b ----------- | |
110 | |
111 (1, -1, -1, 1): tuple(), # undef | |
112 # a ---- | |
113 # b ----------- | |
114 | |
115 (-1, 1, -1, 1): tuple(), # undef | |
116 # a ----------- | |
117 # b ---- | |
118 | |
119 (-1, 0, -1, 0): (1,), # a <= b | |
120 # a ----------- | |
121 # b - | |
122 | |
123 (1, 0, 0, 1): (5,), # a >= b | |
124 # a - | |
125 # b ----------- | |
126 | |
127 (0, 0, 0, 0): (1, 2, 5), # a == b, a <= b, a >= b | |
128 # a - | |
129 # b - (starts and stops are identical for all features) | |
130 } | |
131 | |
132 | |
133 class MalformedBedLineError(Exception): | |
134 pass | |
135 | |
136 | |
137 class BedToolsFileError(Exception): | |
138 pass | |
139 | |
140 | |
141 class Attributes(dict): | |
142 """ | |
143 Class to map between a dict of attrs and fields[8] of a GFF Interval obj. | |
144 """ | |
145 | |
146 def __init__(self, attr_str=""): | |
147 attr_str = str(attr_str) | |
148 self._attr_str = attr_str | |
149 self.sort_keys = False | |
150 | |
151 # in general, GFF files will have either as many '=' as ';' | |
152 # (or ';'-1 if there's no trailing ';') | |
153 n_semi = attr_str.count(';') | |
154 n_eq = attr_str.count('=') | |
155 n_quotes = attr_str.count('"') | |
156 | |
157 if n_eq > n_semi - 1: | |
158 self.sep, self.field_sep = (';', '=') | |
159 else: | |
160 self.sep, self.field_sep = (';', ' ') | |
161 | |
162 self._quoted = {} | |
163 | |
164 # TODO: pathological case . . . detect this as GFF: | |
165 # | |
166 # class_code=" " | |
167 # | |
168 # and this as GTF: | |
169 # | |
170 # class_code "=" | |
171 | |
172 # quick exit | |
173 if attr_str == "": | |
174 return | |
175 | |
176 kvs = map(str.strip, attr_str.strip().split(self.sep)) | |
177 for field, value in [kv.split(self.field_sep, 1) for kv in kvs if kv]: | |
178 if value.count('"') == 2: | |
179 self._quoted[field] = True | |
180 self[field] = value.replace('"', '') | |
181 | |
182 def __str__(self): | |
183 # stringify all items first | |
184 items = [] | |
185 for field, val in dict.iteritems(self): | |
186 try: | |
187 if self._quoted[field]: | |
188 val = '"' + str(val) + '"' | |
189 except KeyError: | |
190 pass | |
191 items.append((field, val)) | |
192 | |
193 pairs = [] | |
194 if self.sort_keys: | |
195 items.sort() | |
196 for k, v in items: | |
197 pairs.append(self.field_sep.join([k, v])) | |
198 | |
199 return self.sep.join(pairs) + self.sep | |
200 | |
201 cdef class Interval: | |
202 """ | |
203 Class to represent a genomic interval. | |
204 | |
205 Constructor:: | |
206 | |
207 Interval(chrom, start, end, name=".", score=".", strand=".", otherfields=None) | |
208 | |
209 Class to represent a genomic interval of any format. Requires at least 3 | |
210 args: chrom (string), start (int), end (int). | |
211 | |
212 `start` is *always* the 0-based start coordinate. If this Interval is to | |
213 represent a GFF object (which uses a 1-based coordinate system), then | |
214 subtract 1 from the 4th item in the line to get the start position in | |
215 0-based coords for this Interval. The 1-based GFF coord will still be | |
216 available, albeit as a string, in fields[3]. | |
217 | |
218 `otherfields` is a list of fields that don't fit into the other kwargs, and | |
219 will be stored in the `fields` attribute of the Interval. | |
220 | |
221 All the items in `otherfields` must be strings for proper conversion to | |
222 C++. | |
223 | |
224 By convention, for BED files, `otherfields` is everything past the first 6 | |
225 items in the line. This allows an Interval to represent composite features | |
226 (e.g., a GFF line concatenated to the end of a BED line) | |
227 | |
228 But for other formats (VCF, GFF, SAM), the entire line should be passed in | |
229 as a list for `otherfields` so that we can always check the | |
230 Interval.file_type and extract the fields we want, knowing that they'll be | |
231 in the right order as passed in with `otherfields`. | |
232 | |
233 Example usage: | |
234 | |
235 >>> from pybedtools import Interval | |
236 >>> i = Interval("chr1", 22, 44, strand='-') | |
237 >>> i | |
238 Interval(chr1:22-44) | |
239 | |
240 | |
241 """ | |
242 def __init__(self, chrom, start, end, name=".", score=".", strand=".", otherfields=None): | |
243 if otherfields is None: | |
244 otherfields = [] | |
245 otherfields = [_cppstr(i) for i in otherfields] | |
246 self._bed = new BED( | |
247 _cppstr(chrom), start, end, _cppstr(name), _cppstr(score), | |
248 _cppstr(strand), otherfields) | |
249 | |
250 #self._bed.chrom = _cppstr(chrom) | |
251 #self._bed.start = start | |
252 #self._bed.end = end | |
253 #self._bed.name = _cppstr(name) | |
254 #self._bed.score = _cppstr(score) | |
255 #self._bed.strand = _cppstr(strand) | |
256 fields = [_cppstr(chrom), _cppstr(str(start)), _cppstr(str(end)), _cppstr(name), _cppstr(score), _cppstr(strand)] | |
257 fields.extend(otherfields) | |
258 self._bed.fields = fields | |
259 self._attrs = None | |
260 | |
261 def __copy__(self): | |
262 return create_interval_from_list(self.fields) | |
263 | |
264 def __hash__(self): | |
265 return hash("\t".join(self.fields)) | |
266 | |
267 property chrom: | |
268 """ the chromosome of the feature""" | |
269 def __get__(self): | |
270 return _pystr(self._bed.chrom) | |
271 | |
272 def __set__(self, chrom): | |
273 chrom = _cppstr(chrom) | |
274 self._bed.chrom = chrom | |
275 idx = LOOKUPS[self.file_type]["chrom"] | |
276 self._bed.fields[idx] = _cppstr(chrom) | |
277 | |
278 # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5 | |
279 def __richcmp__(self, other, int op): | |
280 if (self.chrom != other.chrom) or (self.strand != other.strand): | |
281 if op == 3: return True | |
282 return False | |
283 | |
284 def cmp(x, y): | |
285 if x < y: | |
286 return -1 | |
287 if x == y: | |
288 return 0 | |
289 if x > y: | |
290 return 1 | |
291 | |
292 | |
293 # check all 4 so that we can handle nesting and partial overlaps. | |
294 profile = (cmp(self.start, other.start), | |
295 cmp(self.stop, other.stop), | |
296 cmp(self.start, other.stop), | |
297 cmp(self.stop, other.start)) | |
298 | |
299 try: | |
300 if PROFILES_TRUE[profile] == tuple(): | |
301 raise NotImplementedError('Features are nested -- comparison undefined') | |
302 | |
303 if op != 3: | |
304 if op in PROFILES_TRUE[profile]: | |
305 return True | |
306 return False | |
307 else: | |
308 if 2 in PROFILES_TRUE[profile]: | |
309 return False | |
310 return True | |
311 except KeyError: | |
312 raise ValueError('Currently unsupported comparison -- please ' | |
313 'submit a bug report') | |
314 | |
315 property start: | |
316 """The 0-based start of the feature.""" | |
317 def __get__(self): | |
318 return self._bed.start | |
319 | |
320 def __set__(self, int start): | |
321 self._bed.start = start | |
322 idx = LOOKUPS[self.file_type]["start"] | |
323 | |
324 # Non-BED files should have 1-based coords in fields | |
325 if self.file_type != 'bed': | |
326 start += 1 | |
327 self._bed.fields[idx] = _cppstr(str(start)) | |
328 | |
329 property end: | |
330 """The end of the feature""" | |
331 def __get__(self): | |
332 return self._bed.end | |
333 | |
334 def __set__(self, int end): | |
335 self._bed.end = end | |
336 idx = LOOKUPS[self.file_type]["stop"] | |
337 self._bed.fields[idx] = _cppstr(str(end)) | |
338 | |
339 property stop: | |
340 """ the end of the feature""" | |
341 def __get__(self): | |
342 return self._bed.end | |
343 | |
344 def __set__(self, int end): | |
345 idx = LOOKUPS[self.file_type]["stop"] | |
346 self._bed.fields[idx] = _cppstr(str(end)) | |
347 self._bed.end = end | |
348 | |
349 property strand: | |
350 """ the strand of the feature""" | |
351 def __get__(self): | |
352 return _pystr(self._bed.strand) | |
353 | |
354 def __set__(self, strand): | |
355 idx = LOOKUPS[self.file_type]["strand"] | |
356 self._bed.fields[idx] = _cppstr(strand) | |
357 self._bed.strand = _cppstr(strand) | |
358 | |
359 property length: | |
360 """ the length of the feature""" | |
361 def __get__(self): | |
362 return self._bed.end - self._bed.start | |
363 | |
364 cpdef deparse_attrs(self): | |
365 | |
366 if not self._attrs: return | |
367 | |
368 if self.file_type != "gff": | |
369 raise ValueError('Interval.attrs was not None, but this was a non-GFF Interval') | |
370 | |
371 s = self._attrs.__str__() | |
372 self._bed.fields[8] = _cppstr(s) | |
373 | |
374 property fields: | |
375 def __get__(self): | |
376 self.deparse_attrs() | |
377 items = [] | |
378 for i in self._bed.fields: | |
379 if isinstance(i, int): | |
380 items.append(i) | |
381 else: | |
382 items.append(_pystr(i)) | |
383 return items | |
384 | |
385 property attrs: | |
386 def __get__(self): | |
387 if self._attrs is None: | |
388 ft = _pystr(self._bed.file_type) | |
389 if ft == 'gff': | |
390 self._attrs = Attributes(_pystr(self._bed.fields[8])) | |
391 else: | |
392 self._attrs = Attributes("") | |
393 return self._attrs | |
394 | |
395 def __set__(self, attrs): | |
396 self._attrs = attrs | |
397 | |
398 # TODO: make this more robust. | |
399 @property | |
400 def count(self): | |
401 return int(self.fields[-1]) | |
402 | |
403 property name: | |
404 """ | |
405 >>> import pybedtools | |
406 >>> vcf = pybedtools.example_bedtool('v.vcf') | |
407 >>> [v.name for v in vcf] | |
408 ['rs6054257', 'chr1:16', 'rs6040355', 'chr1:222', 'microsat1'] | |
409 | |
410 """ | |
411 def __get__(self): | |
412 cdef string ftype = self._bed.file_type | |
413 value = None | |
414 if ftype == <string>"gff": | |
415 """ | |
416 # TODO. allow setting a name_key in the BedTool constructor? | |
417 if self.name_key and self.name_key in attrs: | |
418 return attrs[self.name_key] | |
419 """ | |
420 for key in ("ID", "Name", "gene_name", "transcript_id", \ | |
421 "gene_id", "Parent"): | |
422 if key in self.attrs: | |
423 value = self.attrs[key] | |
424 break | |
425 | |
426 elif ftype == <string>"vcf": | |
427 s = self.fields[2] | |
428 if s in ("", "."): | |
429 value = "%s:%i" % (self.chrom, self.start) | |
430 else: | |
431 value = _pystr(s) | |
432 elif ftype == <string>"bed": | |
433 value = _pystr(self._bed.name) | |
434 | |
435 return value | |
436 | |
437 def __set__(self, value): | |
438 cdef string ftype = self._bed.file_type | |
439 | |
440 if ftype == <string>"gff": | |
441 for key in ("ID", "Name", "gene_name", "transcript_id", \ | |
442 "gene_id", "Parent"): | |
443 if not key in self.attrs: | |
444 continue | |
445 | |
446 # If it's incoming from Python it's unicode, so store that directly | |
447 # in the attributes (since an Attribute object works on | |
448 # unicode)... | |
449 self.attrs[key] = value | |
450 break | |
451 | |
452 # Otherwise use _cppstr() because we're storing it in _bed.fields. | |
453 elif ftype == <string>"vcf": | |
454 self._bed.fields[2] = _cppstr(value) | |
455 else: | |
456 self._bed.name = _cppstr(value) | |
457 self._bed.fields[3] = _cppstr(value) | |
458 | |
459 property score: | |
460 def __get__(self): | |
461 return _pystr(self._bed.score) | |
462 | |
463 def __set__(self, value): | |
464 value = _cppstr(value) | |
465 self._bed.score = value | |
466 idx = LOOKUPS[self.file_type]["score"] | |
467 self._bed.fields[idx] = value | |
468 | |
469 property file_type: | |
470 "bed/vcf/gff" | |
471 def __get__(self): | |
472 return _pystr(self._bed.file_type) | |
473 | |
474 def __set__(self, value): | |
475 self._bed.file_type = _cppstr(value) | |
476 | |
477 # TODO: maybe bed.overlap_start or bed.overlap.start ?? | |
478 @property | |
479 def o_start(self): | |
480 return self._bed.o_start | |
481 | |
482 @property | |
483 def o_end(self): | |
484 return self._bed.o_end | |
485 | |
486 @property | |
487 def o_amt(self): | |
488 return self._bed.o_end - self._bed.o_start | |
489 | |
490 def __str__(self): | |
491 """ | |
492 Interval objects always print with a newline to mimic a line in a | |
493 BED/GFF/VCF file | |
494 """ | |
495 items = [] | |
496 for i in self.fields: | |
497 if isinstance(i, int): | |
498 i = str(i) | |
499 items.append(i) | |
500 | |
501 return '\t'.join(items) + '\n' | |
502 | |
503 def __repr__(self): | |
504 return "Interval(%s:%i-%i)" % (self.chrom, self.start, self.end) | |
505 | |
506 def __dealloc__(self): | |
507 del self._bed | |
508 | |
509 def __len__(self): | |
510 return self._bed.end - self._bed.start | |
511 | |
512 def __getitem__(self, object key): | |
513 cdef int i | |
514 ftype = _pystr(self._bed.file_type) | |
515 | |
516 self.deparse_attrs() | |
517 | |
518 if isinstance(key, (int, long)): | |
519 nfields = self._bed.fields.size() | |
520 if key >= nfields: | |
521 raise IndexError('field index out of range') | |
522 elif key < 0: | |
523 key = nfields + key | |
524 return _pystr(self._bed.fields.at(key)) | |
525 elif isinstance(key, slice): | |
526 indices = key.indices(self._bed.fields.size()) | |
527 return [_pystr(self._bed.fields.at(i)) for i in range(*indices)] | |
528 | |
529 elif isinstance(key, str): | |
530 if ftype == "gff": | |
531 try: | |
532 return self.attrs[key] | |
533 except KeyError: | |
534 pass | |
535 # We don't have to convert using _pystr() because the __get__ | |
536 # methods do that already. | |
537 return getattr(self, key) | |
538 | |
539 def __setitem__(self, object key, object value): | |
540 if isinstance(key, (int, long)): | |
541 nfields = self._bed.fields.size() | |
542 if key >= nfields: | |
543 raise IndexError('field index out of range') | |
544 elif key < 0: | |
545 key = nfields + key | |
546 self._bed.fields[key] = _cppstr(value) | |
547 | |
548 ft = _pystr(self._bed.file_type) | |
549 if key in LOOKUPS[ft]: | |
550 setattr(self, LOOKUPS[ft][key], value) | |
551 | |
552 elif isinstance(key, (basestring)): | |
553 setattr(self, key, value) | |
554 | |
555 cpdef append(self, object value): | |
556 self._bed.fields.push_back(_cppstr(value)) | |
557 | |
558 def __nonzero__(self): | |
559 return True | |
560 | |
561 | |
562 cdef Interval create_interval(BED b): | |
563 cdef Interval pyb = Interval.__new__(Interval) | |
564 pyb._bed = new BED(b.chrom, b.start, b.end, b.name, | |
565 b.score, b.strand, b.fields, | |
566 b.o_start, b.o_end, b.bedType, b.file_type, b.status) | |
567 pyb._bed.fields = b.fields | |
568 return pyb | |
569 | |
570 # TODO: optimization: Previously we had (fields[1] + fields[2]).isdigit() when | |
571 # checking in create_interval_from_list for filetype heuruistics. Is there | |
572 # a performance hit by checking instances? | |
573 cdef isdigit(s): | |
574 if isinstance(s, integer_types): | |
575 return True | |
576 return s.isdigit() | |
577 | |
578 | |
579 cpdef Interval create_interval_from_list(list fields): | |
580 """ | |
581 Create an Interval object from a list of strings. | |
582 | |
583 Constructor:: | |
584 | |
585 create_interval_from_list(fields) | |
586 | |
587 Given the list of strings, `fields`, automatically detects the format (BED, | |
588 GFF, VCF, SAM) and creates a new Interval object. | |
589 | |
590 `fields` is a list with an arbitrary number of items (it can be quite long, | |
591 say after a -wao intersection of a BED12 and a GFF), however, the first | |
592 fields must conform to one of the supported formats. For example, if you | |
593 want the resulting Interval to be considered a GFF feature, then the first | |
594 9 fields must conform to the GFF format. Similarly, if you want the | |
595 resulting Interval to be considered a BED feature, then the first three | |
596 fields must be chrom, start, stop. | |
597 | |
598 Example usage: | |
599 | |
600 >>> # Creates a BED3 feature | |
601 >>> feature = create_interval_from_list(['chr1', '1', '100']) | |
602 | |
603 """ | |
604 | |
605 # TODO: this function is used a lot, and is doing a bit of work. We should | |
606 # have an optimized version that is directly provided the filetype. | |
607 | |
608 cdef Interval pyb = Interval.__new__(Interval) | |
609 orig_fields = fields[:] | |
610 # BED -- though a VCF will be detected as BED if its 2nd field, id, is a | |
611 # digit | |
612 | |
613 # SAM | |
614 if ( | |
615 (len(fields) >= 11) | |
616 and isdigit(fields[1]) | |
617 and isdigit(fields[3]) | |
618 and isdigit(fields[4]) | |
619 and (fields[5] not in ['.', '+', '-']) | |
620 ): | |
621 # TODO: what should the stop position be? Here, it's just the start | |
622 # plus the length of the sequence, but perhaps this should eventually | |
623 # do CIGAR string parsing. | |
624 if int(fields[1]) & 0x04: | |
625 # handle unmapped reads | |
626 chrom = _cppstr("*") | |
627 start = 0 | |
628 stop = 0 | |
629 else: | |
630 chrom = _cppstr(fields[2]) | |
631 start = int(fields[3]) - 1 | |
632 stop = int(fields[3]) + len(fields[9]) - 1 | |
633 name = _cppstr(fields[0]) | |
634 score = _cppstr(fields[1]) | |
635 if int(fields[1]) & 0x10: | |
636 strand = _cppstr('-') | |
637 else: | |
638 strand = _cppstr('+') | |
639 | |
640 # Fields is in SAM format | |
641 fields[3] = str(start + 1) | |
642 | |
643 pyb._bed = new BED( | |
644 chrom, | |
645 start, | |
646 stop, | |
647 strand, | |
648 name, | |
649 score, | |
650 list_to_vector(fields)) | |
651 pyb.file_type = _cppstr('sam') | |
652 | |
653 | |
654 elif isdigit(fields[1]) and isdigit(fields[2]): | |
655 # if it's too short, just add some empty fields. | |
656 if len(fields) < 7: | |
657 fields.extend([".".encode('UTF-8')] * (6 - len(fields))) | |
658 other_fields = [] | |
659 else: | |
660 other_fields = fields[6:] | |
661 | |
662 pyb._bed = new BED( | |
663 _cppstr(fields[0]), | |
664 int(fields[1]), | |
665 int(fields[2]), | |
666 _cppstr(fields[3]), | |
667 _cppstr(fields[4]), | |
668 _cppstr(fields[5]), | |
669 list_to_vector(other_fields)) | |
670 pyb.file_type = _cppstr('bed') | |
671 | |
672 # VCF | |
673 elif isdigit(fields[1]) and not isdigit(fields[3]) and len(fields) >= 8: | |
674 pyb._bed = new BED( | |
675 _cppstr(fields[0]), | |
676 int(fields[1]) - 1, | |
677 int(fields[1]), | |
678 _cppstr(fields[2]), | |
679 _cppstr(fields[5]), | |
680 _cppstr('.'), | |
681 list_to_vector(fields)) | |
682 pyb.file_type = b'vcf' | |
683 | |
684 | |
685 # GFF | |
686 elif len(fields) >= 9 and isdigit(fields[3]) and isdigit(fields[4]): | |
687 pyb._bed = new BED( | |
688 _cppstr(fields[0]), | |
689 int(fields[3])-1, int(fields[4]), | |
690 _cppstr(fields[2]), | |
691 _cppstr(fields[5]), | |
692 _cppstr(fields[6]), | |
693 list_to_vector(fields[7:])) | |
694 pyb.file_type = _cppstr('gff') | |
695 else: | |
696 raise MalformedBedLineError('Unable to detect format from %s' % fields) | |
697 | |
698 if pyb.start > pyb.end: | |
699 raise MalformedBedLineError("Start is greater than stop") | |
700 pyb._bed.fields = list_to_vector(orig_fields) | |
701 return pyb | |
702 | |
703 cdef vector[string] list_to_vector(list li): | |
704 cdef vector[string] s | |
705 cdef int i | |
706 for i in range(len(li)): | |
707 _s = li[i] | |
708 s.push_back(_cppstr(_s)) | |
709 return s | |
710 | |
711 cdef list string_vec2list(vector[string] sv): | |
712 cdef size_t size = sv.size(), i | |
713 return [_pystr(sv.at(i)) for i in range(size)] | |
714 | |
715 cdef list bed_vec2list(vector[BED] bv): | |
716 cdef size_t size = bv.size(), i | |
717 cdef list l = [] | |
718 cdef BED b | |
719 for i in range(size): | |
720 b = bv.at(i) | |
721 l.append(create_interval(b)) | |
722 return l | |
723 | |
724 | |
725 def overlap(int s1, int s2, int e1, int e2): | |
726 return min(e1, e2) - max(s1, s2) | |
727 | |
728 | |
729 cdef class IntervalIterator: | |
730 cdef object stream | |
731 cdef int _itemtype | |
732 def __init__(self, stream): | |
733 self.stream = stream | |
734 | |
735 # For speed, check int rather than call isinstance(). | |
736 # -1 is unset, 0 assumes list/tuple/iterable, and 1 is a string. | |
737 # | |
738 # Also assumes that all items in the iterable `stream` are the same | |
739 # type...this seems like a reasonable assumption. | |
740 self._itemtype = -1 | |
741 | |
742 def __dealloc__(self): | |
743 try: | |
744 self.stream.close() | |
745 except AttributeError: | |
746 pass | |
747 | |
748 def __iter__(self): | |
749 return self | |
750 | |
751 def __next__(self): | |
752 while True: | |
753 if hasattr(self.stream, 'closed'): | |
754 if self.stream.closed: | |
755 raise StopIteration | |
756 try: | |
757 line = next(self.stream) | |
758 except StopIteration: | |
759 if hasattr(self.stream, 'close'): | |
760 self.stream.close() | |
761 raise StopIteration | |
762 | |
763 if self._itemtype < 0: | |
764 if isinstance(line, Interval): | |
765 self._itemtype = 2 | |
766 elif isinstance(line, basestring): | |
767 self._itemtype = 1 | |
768 else: | |
769 self._itemtype = 0 | |
770 | |
771 if self._itemtype == 1: | |
772 if line.startswith(('@', '#', 'track', 'browser')) or len(line.strip()) == 0: | |
773 continue | |
774 break | |
775 | |
776 # Iterable of Interval objects | |
777 if self._itemtype == 2: | |
778 return line | |
779 | |
780 # Iterable of strings, in which case we need to split | |
781 elif self._itemtype == 1: | |
782 fields = line.rstrip('\r\n').split('\t') | |
783 | |
784 # Otherwise assume list/tuple/iterable of fields | |
785 else: | |
786 fields = list(line) | |
787 | |
788 # TODO: optimization: create_interval_from_list should have a version | |
789 # that accepts C++ string instances | |
790 return create_interval_from_list(fields) | |
791 | |
792 | |
793 | |
794 cdef class IntervalFile: | |
795 cdef BedFile *intervalFile_ptr | |
796 cdef bint _loaded | |
797 cdef bint _open | |
798 cdef string _fn | |
799 """ | |
800 An IntervalFile provides low-level access to the BEDTools API. | |
801 | |
802 >>> fn = pybedtools.example_filename('a.bed') | |
803 >>> intervalfile = pybedtools.IntervalFile(fn) | |
804 | |
805 """ | |
806 def __init__(self, intervalFile): | |
807 self.intervalFile_ptr = new BedFile(_cppstr(intervalFile)) | |
808 self._loaded = 0 | |
809 self._open = 0 | |
810 self._fn = _cppstr(intervalFile) | |
811 | |
812 def __dealloc__(self): | |
813 del self.intervalFile_ptr | |
814 | |
815 def __iter__(self): | |
816 return self | |
817 | |
818 def __next__(self): | |
819 if not self._open: | |
820 result = self.intervalFile_ptr.Open() | |
821 if result == -1: | |
822 raise BedToolsFileError("Error opening file") | |
823 self._open = 1 | |
824 cdef BED b = self.intervalFile_ptr.GetNextBed() | |
825 if b.status == BED_VALID: | |
826 return create_interval(b) | |
827 elif b.status == BED_INVALID: | |
828 self.intervalFile_ptr.Close() | |
829 raise StopIteration | |
830 elif b.status == BED_MALFORMED: | |
831 self.intervalFile_ptr.Close() | |
832 raise MalformedBedLineError("malformed line: %s" % string_vec2list(b.fields)) | |
833 else: | |
834 return next(self) | |
835 | |
836 @property | |
837 def fn(self): | |
838 return _pystr(self._fn) | |
839 | |
840 @property | |
841 def file_type(self): | |
842 if not self.intervalFile_ptr._typeIsKnown: | |
843 try: | |
844 a = next(iter(self)) | |
845 file_type = _pystr(self.intervalFile_ptr.file_type) | |
846 self.intervalFile_ptr.Close() | |
847 return file_type | |
848 except MalformedBedLineError: | |
849 # If it's a SAM, raise a meaningful exception. If not, fail. | |
850 with open(self.fn) as fn: | |
851 interval = create_interval_from_list(fn.readline().strip().split()) | |
852 if interval.file_type == 'sam': | |
853 raise ValueError('IntervalFile objects do not yet natively support SAM. ' | |
854 'Please convert to BED/GFF/VCF first if you want to ' | |
855 'use the low-level API of IntervalFile') | |
856 else: | |
857 raise | |
858 | |
859 | |
860 def loadIntoMap(self): | |
861 """ | |
862 Prepares file for checking intersections. Used by other methods like all_hits() | |
863 """ | |
864 if self._loaded: | |
865 return | |
866 self.intervalFile_ptr.loadBedFileIntoMap() | |
867 self._loaded = 1 | |
868 | |
869 def rewind(self): | |
870 """ | |
871 Jump to the beginning of the file. | |
872 """ | |
873 if not self._open: | |
874 self.intervalFile_ptr.Open() | |
875 self._open = 1 | |
876 self.intervalFile_ptr.Rewind() | |
877 | |
878 def seek(self, offset): | |
879 """ | |
880 Jump to a specific byte offset in the file | |
881 """ | |
882 if not self._open: | |
883 self.intervalFile_ptr.Open() | |
884 self._open = 1 | |
885 self.intervalFile_ptr.Seek(offset) | |
886 | |
887 | |
888 def all_hits(self, Interval interval, bool same_strand=False, float overlap=0.0): | |
889 """ | |
890 :Signature: `IntervalFile.all_hits(interval, same_strand=False, overlap=0.0)` | |
891 | |
892 Search for the Interval `interval` this file and return **all** | |
893 overlaps as a list. | |
894 | |
895 `same_strand`, if True, will only consider hits on the same strand as `interval`. | |
896 | |
897 `overlap` can be used to specify the fraction of overlap between | |
898 `interval` and each feature in the IntervalFile. | |
899 | |
900 Example usage: | |
901 | |
902 >>> fn = pybedtools.example_filename('a.bed') | |
903 | |
904 >>> # create an Interval to query with | |
905 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+') | |
906 | |
907 >>> # Create an IntervalFile out of a.bed | |
908 >>> intervalfile = pybedtools.IntervalFile(fn) | |
909 | |
910 >>> # get stranded hits | |
911 >>> intervalfile.all_hits(i, same_strand=True) | |
912 [Interval(chr1:1-100), Interval(chr1:100-200), Interval(chr1:900-950)] | |
913 | |
914 """ | |
915 cdef vector[BED] vec_b | |
916 self.loadIntoMap() | |
917 | |
918 if same_strand == False: | |
919 vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), overlap) | |
920 try: | |
921 return bed_vec2list(vec_b) | |
922 finally: | |
923 pass | |
924 else: | |
925 vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), same_strand, overlap) | |
926 try: | |
927 return bed_vec2list(vec_b) | |
928 finally: | |
929 pass | |
930 | |
931 # search() is an alias for all_hits | |
932 search = all_hits | |
933 | |
934 def any_hits(self, Interval interval, bool same_strand=False, float overlap=0.0): | |
935 """ | |
936 :Signature: `IntervalFile.any_hits(interval, same_strand=False, overlap=0.0)` | |
937 | |
938 Return 1 if the Interval `interval` had >=1 hit in this IntervalFile, 0 otherwise. | |
939 | |
940 `same_strand`, if True, will only consider hits on the same strand as `interval`. | |
941 | |
942 `overlap` can be used to specify the fraction of overlap between | |
943 `interval` and each feature in the IntervalFile. | |
944 | |
945 Example usage: | |
946 | |
947 >>> fn = pybedtools.example_filename('a.bed') | |
948 | |
949 >>> # create an Interval to query with | |
950 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+') | |
951 | |
952 >>> # Create an IntervalFile out of a.bed | |
953 >>> intervalfile = pybedtools.IntervalFile(fn) | |
954 | |
955 >>> # any stranded hits? | |
956 >>> intervalfile.any_hits(i, same_strand=True) | |
957 1 | |
958 | |
959 """ | |
960 found = 0 | |
961 self.loadIntoMap() | |
962 | |
963 if same_strand == False: | |
964 found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), overlap) | |
965 else: | |
966 found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), same_strand, overlap) | |
967 | |
968 return found | |
969 | |
970 def count_hits(self, Interval interval, bool same_strand=False, float overlap=0.0): | |
971 """ | |
972 :Signature: `IntervalFile.count_hits(interval, same_strand=False, overlap=0.0)` | |
973 | |
974 Return the number of overlaps of the Interval `interval` had with this | |
975 IntervalFile. | |
976 | |
977 `same_strand`, if True, will only consider hits on the same strand as | |
978 `interval`. | |
979 | |
980 `overlap` can be used to specify the fraction of overlap between | |
981 `interval` and each feature in the IntervalFile. | |
982 | |
983 Example usage: | |
984 | |
985 >>> fn = pybedtools.example_filename('a.bed') | |
986 | |
987 >>> # create an Interval to query with | |
988 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+') | |
989 | |
990 >>> # Create an IntervalFile out of a.bed | |
991 >>> intervalfile = pybedtools.IntervalFile(fn) | |
992 | |
993 >>> # get number of stranded hits | |
994 >>> intervalfile.count_hits(i, same_strand=True) | |
995 3 | |
996 | |
997 """ | |
998 self.loadIntoMap() | |
999 | |
1000 if same_strand == False: | |
1001 return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), overlap) | |
1002 else: | |
1003 return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), same_strand, overlap) |