csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/cbedtools.pyx annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/cbedtools.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children

rev	line source
jpayne@68	1 # distutils: language = c++
jpayne@68	2 # cython: language_level=2
jpayne@68	3
jpayne@68	4 # String notes:
jpayne@68	5 #
jpayne@68	6 # Anything that goes in C++ objects should be converted to a C++ <string>
jpayne@68	7 # type, using the _cppstr() function. For example: Interval._bed.file_type,
jpayne@68	8 # or the entries in Interval._bed.fields.
jpayne@68	9 #
jpayne@68	10 # Any Python accessor methods (Interval.fields, Interval.__getitem__) should
jpayne@68	11 # then be converted to Python strings using the _pystr() function.
jpayne@68	12 #
jpayne@68	13 # Cython uses the `str` type as whatever the native Python version uses as
jpayne@68	14 # str.
jpayne@68	15
jpayne@68	16 from libcpp.string cimport string
jpayne@68	17 import numpy as np
jpayne@68	18
jpayne@68	19 # Python byte strings automatically coerce to/from C++ strings.
jpayne@68	20
jpayne@68	21 cdef _cppstr(s):
jpayne@68	22 # Use this to handle incoming strings from Python.
jpayne@68	23 #
jpayne@68	24 # C++ uses bytestrings. PY2 strings need no conversion; bare PY3 strings
jpayne@68	25 # are unicode and so must be encoded to bytestring.
jpayne@68	26 if isinstance(s, integer_types):
jpayne@68	27 s = str(s)
jpayne@68	28 if isinstance(s, unicode):
jpayne@68	29 s = s.encode('UTF-8')
jpayne@68	30 return <string> s
jpayne@68	31
jpayne@68	32 cdef _pystr(string s):
jpayne@68	33 # Use this to prepare a string for sending to Python.
jpayne@68	34 #
jpayne@68	35 # Always returns unicode.
jpayne@68	36 return s.decode('UTF-8', 'strict')
jpayne@68	37
jpayne@68	38 integer_types = (int, long, np.int64)
jpayne@68	39
jpayne@68	40
jpayne@68	41 """
jpayne@68	42 bedtools.pyx: A Cython wrapper for the BEDTools BedFile class
jpayne@68	43
jpayne@68	44 Authors: Aaron Quinlan[1], Brent Pedersen[2]
jpayne@68	45 Affl: [1] Center for Public Health Genomics, University of Virginia
jpayne@68	46 [2]
jpayne@68	47 Email: aaronquinlan at gmail dot com
jpayne@68	48 """
jpayne@68	49 from cython.operator cimport dereference as deref
jpayne@68	50 import sys
jpayne@68	51 import subprocess
jpayne@68	52 from collections import defaultdict
jpayne@68	53
jpayne@68	54 cdef dict LOOKUPS = {
jpayne@68	55 "gff": {"chrom": 0, "start": 3, "end": 4, "stop": 4, "strand": 6},
jpayne@68	56 "vcf": {"chrom": 0, "start": 1},
jpayne@68	57 "bed": {"chrom": 0, "start": 1, "end": 2, "stop": 2, "score": 4, "strand": 5}
jpayne@68	58 }
jpayne@68	59 for ktype, kdict in list(LOOKUPS.items()):
jpayne@68	60 for k, v in list(kdict.items()):
jpayne@68	61 kdict[v] = k
jpayne@68	62
jpayne@68	63 # Keys are tuples of start/start, stop/stop, start/stop, stop/start.
jpayne@68	64 # Values are which operators should return True, otherwise False
jpayne@68	65 # < 0 \| <= 1 \| == 2 \| != 3 \| > 4 \| >= 5
jpayne@68	66 PROFILES_TRUE = {
jpayne@68	67 (0, 0, -1, 1): (2, 1, 5), # a == b, a >= b, a <= b
jpayne@68	68 # a ---------
jpayne@68	69 # b ---------
jpayne@68	70
jpayne@68	71 (-1, -1, -1, -1): (0, 1), # a < b, a <= b
jpayne@68	72 # a ----
jpayne@68	73 # b -----
jpayne@68	74
jpayne@68	75 (-1, -1, -1, 0): (1,), # a <= b
jpayne@68	76 # a ----
jpayne@68	77 # b ----- (book-ended)
jpayne@68	78
jpayne@68	79 (1, 1, 0, 1): (5,), # a >= b
jpayne@68	80 # a -----
jpayne@68	81 # b ---- (book-ended)
jpayne@68	82
jpayne@68	83 (1, 1, 1, 1): (4, 5), # a > b, a >= b
jpayne@68	84 # a ------
jpayne@68	85 # b ----
jpayne@68	86
jpayne@68	87 (0, 1, -1, 1): (5,), # a >= b
jpayne@68	88 # a ------------
jpayne@68	89 # b ---------
jpayne@68	90
jpayne@68	91 (1, 0, -1, 1): (5,), # a >= b
jpayne@68	92 # a -----------
jpayne@68	93 # b -------------
jpayne@68	94
jpayne@68	95 (-1, 0, -1, 1): (1,), # a <= b
jpayne@68	96 # a -------------
jpayne@68	97 # b -----------
jpayne@68	98
jpayne@68	99 (0, -1, -1, 1): (1,), # a <= b
jpayne@68	100 # a ---------
jpayne@68	101 # b ------------
jpayne@68	102
jpayne@68	103 (-1, -1, -1, 1): (1,), # a <= b
jpayne@68	104 # a -----------
jpayne@68	105 # b -----------
jpayne@68	106
jpayne@68	107 (1, 1, -1, 1): (5,), # a >= b
jpayne@68	108 # a -----------
jpayne@68	109 # b -----------
jpayne@68	110
jpayne@68	111 (1, -1, -1, 1): tuple(), # undef
jpayne@68	112 # a ----
jpayne@68	113 # b -----------
jpayne@68	114
jpayne@68	115 (-1, 1, -1, 1): tuple(), # undef
jpayne@68	116 # a -----------
jpayne@68	117 # b ----
jpayne@68	118
jpayne@68	119 (-1, 0, -1, 0): (1,), # a <= b
jpayne@68	120 # a -----------
jpayne@68	121 # b -
jpayne@68	122
jpayne@68	123 (1, 0, 0, 1): (5,), # a >= b
jpayne@68	124 # a -
jpayne@68	125 # b -----------
jpayne@68	126
jpayne@68	127 (0, 0, 0, 0): (1, 2, 5), # a == b, a <= b, a >= b
jpayne@68	128 # a -
jpayne@68	129 # b - (starts and stops are identical for all features)
jpayne@68	130 }
jpayne@68	131
jpayne@68	132
jpayne@68	133 class MalformedBedLineError(Exception):
jpayne@68	134 pass
jpayne@68	135
jpayne@68	136
jpayne@68	137 class BedToolsFileError(Exception):
jpayne@68	138 pass
jpayne@68	139
jpayne@68	140
jpayne@68	141 class Attributes(dict):
jpayne@68	142 """
jpayne@68	143 Class to map between a dict of attrs and fields[8] of a GFF Interval obj.
jpayne@68	144 """
jpayne@68	145
jpayne@68	146 def __init__(self, attr_str=""):
jpayne@68	147 attr_str = str(attr_str)
jpayne@68	148 self._attr_str = attr_str
jpayne@68	149 self.sort_keys = False
jpayne@68	150
jpayne@68	151 # in general, GFF files will have either as many '=' as ';'
jpayne@68	152 # (or ';'-1 if there's no trailing ';')
jpayne@68	153 n_semi = attr_str.count(';')
jpayne@68	154 n_eq = attr_str.count('=')
jpayne@68	155 n_quotes = attr_str.count('"')
jpayne@68	156
jpayne@68	157 if n_eq > n_semi - 1:
jpayne@68	158 self.sep, self.field_sep = (';', '=')
jpayne@68	159 else:
jpayne@68	160 self.sep, self.field_sep = (';', ' ')
jpayne@68	161
jpayne@68	162 self._quoted = {}
jpayne@68	163
jpayne@68	164 # TODO: pathological case . . . detect this as GFF:
jpayne@68	165 #
jpayne@68	166 # class_code=" "
jpayne@68	167 #
jpayne@68	168 # and this as GTF:
jpayne@68	169 #
jpayne@68	170 # class_code "="
jpayne@68	171
jpayne@68	172 # quick exit
jpayne@68	173 if attr_str == "":
jpayne@68	174 return
jpayne@68	175
jpayne@68	176 kvs = map(str.strip, attr_str.strip().split(self.sep))
jpayne@68	177 for field, value in [kv.split(self.field_sep, 1) for kv in kvs if kv]:
jpayne@68	178 if value.count('"') == 2:
jpayne@68	179 self._quoted[field] = True
jpayne@68	180 self[field] = value.replace('"', '')
jpayne@68	181
jpayne@68	182 def __str__(self):
jpayne@68	183 # stringify all items first
jpayne@68	184 items = []
jpayne@68	185 for field, val in dict.iteritems(self):
jpayne@68	186 try:
jpayne@68	187 if self._quoted[field]:
jpayne@68	188 val = '"' + str(val) + '"'
jpayne@68	189 except KeyError:
jpayne@68	190 pass
jpayne@68	191 items.append((field, val))
jpayne@68	192
jpayne@68	193 pairs = []
jpayne@68	194 if self.sort_keys:
jpayne@68	195 items.sort()
jpayne@68	196 for k, v in items:
jpayne@68	197 pairs.append(self.field_sep.join([k, v]))
jpayne@68	198
jpayne@68	199 return self.sep.join(pairs) + self.sep
jpayne@68	200
jpayne@68	201 cdef class Interval:
jpayne@68	202 """
jpayne@68	203 Class to represent a genomic interval.
jpayne@68	204
jpayne@68	205 Constructor::
jpayne@68	206
jpayne@68	207 Interval(chrom, start, end, name=".", score=".", strand=".", otherfields=None)
jpayne@68	208
jpayne@68	209 Class to represent a genomic interval of any format. Requires at least 3
jpayne@68	210 args: chrom (string), start (int), end (int).
jpayne@68	211
jpayne@68	212 `start` is always the 0-based start coordinate. If this Interval is to
jpayne@68	213 represent a GFF object (which uses a 1-based coordinate system), then
jpayne@68	214 subtract 1 from the 4th item in the line to get the start position in
jpayne@68	215 0-based coords for this Interval. The 1-based GFF coord will still be
jpayne@68	216 available, albeit as a string, in fields[3].
jpayne@68	217
jpayne@68	218 `otherfields` is a list of fields that don't fit into the other kwargs, and
jpayne@68	219 will be stored in the `fields` attribute of the Interval.
jpayne@68	220
jpayne@68	221 All the items in `otherfields` must be strings for proper conversion to
jpayne@68	222 C++.
jpayne@68	223
jpayne@68	224 By convention, for BED files, `otherfields` is everything past the first 6
jpayne@68	225 items in the line. This allows an Interval to represent composite features
jpayne@68	226 (e.g., a GFF line concatenated to the end of a BED line)
jpayne@68	227
jpayne@68	228 But for other formats (VCF, GFF, SAM), the entire line should be passed in
jpayne@68	229 as a list for `otherfields` so that we can always check the
jpayne@68	230 Interval.file_type and extract the fields we want, knowing that they'll be
jpayne@68	231 in the right order as passed in with `otherfields`.
jpayne@68	232
jpayne@68	233 Example usage:
jpayne@68	234
jpayne@68	235 >>> from pybedtools import Interval
jpayne@68	236 >>> i = Interval("chr1", 22, 44, strand='-')
jpayne@68	237 >>> i
jpayne@68	238 Interval(chr1:22-44)
jpayne@68	239
jpayne@68	240
jpayne@68	241 """
jpayne@68	242 def __init__(self, chrom, start, end, name=".", score=".", strand=".", otherfields=None):
jpayne@68	243 if otherfields is None:
jpayne@68	244 otherfields = []
jpayne@68	245 otherfields = [_cppstr(i) for i in otherfields]
jpayne@68	246 self._bed = new BED(
jpayne@68	247 _cppstr(chrom), start, end, _cppstr(name), _cppstr(score),
jpayne@68	248 _cppstr(strand), otherfields)
jpayne@68	249
jpayne@68	250 #self._bed.chrom = _cppstr(chrom)
jpayne@68	251 #self._bed.start = start
jpayne@68	252 #self._bed.end = end
jpayne@68	253 #self._bed.name = _cppstr(name)
jpayne@68	254 #self._bed.score = _cppstr(score)
jpayne@68	255 #self._bed.strand = _cppstr(strand)
jpayne@68	256 fields = [_cppstr(chrom), _cppstr(str(start)), _cppstr(str(end)), _cppstr(name), _cppstr(score), _cppstr(strand)]
jpayne@68	257 fields.extend(otherfields)
jpayne@68	258 self._bed.fields = fields
jpayne@68	259 self._attrs = None
jpayne@68	260
jpayne@68	261 def __copy__(self):
jpayne@68	262 return create_interval_from_list(self.fields)
jpayne@68	263
jpayne@68	264 def __hash__(self):
jpayne@68	265 return hash("\t".join(self.fields))
jpayne@68	266
jpayne@68	267 property chrom:
jpayne@68	268 """ the chromosome of the feature"""
jpayne@68	269 def __get__(self):
jpayne@68	270 return _pystr(self._bed.chrom)
jpayne@68	271
jpayne@68	272 def __set__(self, chrom):
jpayne@68	273 chrom = _cppstr(chrom)
jpayne@68	274 self._bed.chrom = chrom
jpayne@68	275 idx = LOOKUPS[self.file_type]["chrom"]
jpayne@68	276 self._bed.fields[idx] = _cppstr(chrom)
jpayne@68	277
jpayne@68	278 # < 0 \| <= 1 \| == 2 \| != 3 \| > 4 \| >= 5
jpayne@68	279 def __richcmp__(self, other, int op):
jpayne@68	280 if (self.chrom != other.chrom) or (self.strand != other.strand):
jpayne@68	281 if op == 3: return True
jpayne@68	282 return False
jpayne@68	283
jpayne@68	284 def cmp(x, y):
jpayne@68	285 if x < y:
jpayne@68	286 return -1
jpayne@68	287 if x == y:
jpayne@68	288 return 0
jpayne@68	289 if x > y:
jpayne@68	290 return 1
jpayne@68	291
jpayne@68	292
jpayne@68	293 # check all 4 so that we can handle nesting and partial overlaps.
jpayne@68	294 profile = (cmp(self.start, other.start),
jpayne@68	295 cmp(self.stop, other.stop),
jpayne@68	296 cmp(self.start, other.stop),
jpayne@68	297 cmp(self.stop, other.start))
jpayne@68	298
jpayne@68	299 try:
jpayne@68	300 if PROFILES_TRUE[profile] == tuple():
jpayne@68	301 raise NotImplementedError('Features are nested -- comparison undefined')
jpayne@68	302
jpayne@68	303 if op != 3:
jpayne@68	304 if op in PROFILES_TRUE[profile]:
jpayne@68	305 return True
jpayne@68	306 return False
jpayne@68	307 else:
jpayne@68	308 if 2 in PROFILES_TRUE[profile]:
jpayne@68	309 return False
jpayne@68	310 return True
jpayne@68	311 except KeyError:
jpayne@68	312 raise ValueError('Currently unsupported comparison -- please '
jpayne@68	313 'submit a bug report')
jpayne@68	314
jpayne@68	315 property start:
jpayne@68	316 """The 0-based start of the feature."""
jpayne@68	317 def __get__(self):
jpayne@68	318 return self._bed.start
jpayne@68	319
jpayne@68	320 def __set__(self, int start):
jpayne@68	321 self._bed.start = start
jpayne@68	322 idx = LOOKUPS[self.file_type]["start"]
jpayne@68	323
jpayne@68	324 # Non-BED files should have 1-based coords in fields
jpayne@68	325 if self.file_type != 'bed':
jpayne@68	326 start += 1
jpayne@68	327 self._bed.fields[idx] = _cppstr(str(start))
jpayne@68	328
jpayne@68	329 property end:
jpayne@68	330 """The end of the feature"""
jpayne@68	331 def __get__(self):
jpayne@68	332 return self._bed.end
jpayne@68	333
jpayne@68	334 def __set__(self, int end):
jpayne@68	335 self._bed.end = end
jpayne@68	336 idx = LOOKUPS[self.file_type]["stop"]
jpayne@68	337 self._bed.fields[idx] = _cppstr(str(end))
jpayne@68	338
jpayne@68	339 property stop:
jpayne@68	340 """ the end of the feature"""
jpayne@68	341 def __get__(self):
jpayne@68	342 return self._bed.end
jpayne@68	343
jpayne@68	344 def __set__(self, int end):
jpayne@68	345 idx = LOOKUPS[self.file_type]["stop"]
jpayne@68	346 self._bed.fields[idx] = _cppstr(str(end))
jpayne@68	347 self._bed.end = end
jpayne@68	348
jpayne@68	349 property strand:
jpayne@68	350 """ the strand of the feature"""
jpayne@68	351 def __get__(self):
jpayne@68	352 return _pystr(self._bed.strand)
jpayne@68	353
jpayne@68	354 def __set__(self, strand):
jpayne@68	355 idx = LOOKUPS[self.file_type]["strand"]
jpayne@68	356 self._bed.fields[idx] = _cppstr(strand)
jpayne@68	357 self._bed.strand = _cppstr(strand)
jpayne@68	358
jpayne@68	359 property length:
jpayne@68	360 """ the length of the feature"""
jpayne@68	361 def __get__(self):
jpayne@68	362 return self._bed.end - self._bed.start
jpayne@68	363
jpayne@68	364 cpdef deparse_attrs(self):
jpayne@68	365
jpayne@68	366 if not self._attrs: return
jpayne@68	367
jpayne@68	368 if self.file_type != "gff":
jpayne@68	369 raise ValueError('Interval.attrs was not None, but this was a non-GFF Interval')
jpayne@68	370
jpayne@68	371 s = self._attrs.__str__()
jpayne@68	372 self._bed.fields[8] = _cppstr(s)
jpayne@68	373
jpayne@68	374 property fields:
jpayne@68	375 def __get__(self):
jpayne@68	376 self.deparse_attrs()
jpayne@68	377 items = []
jpayne@68	378 for i in self._bed.fields:
jpayne@68	379 if isinstance(i, int):
jpayne@68	380 items.append(i)
jpayne@68	381 else:
jpayne@68	382 items.append(_pystr(i))
jpayne@68	383 return items
jpayne@68	384
jpayne@68	385 property attrs:
jpayne@68	386 def __get__(self):
jpayne@68	387 if self._attrs is None:
jpayne@68	388 ft = _pystr(self._bed.file_type)
jpayne@68	389 if ft == 'gff':
jpayne@68	390 self._attrs = Attributes(_pystr(self._bed.fields[8]))
jpayne@68	391 else:
jpayne@68	392 self._attrs = Attributes("")
jpayne@68	393 return self._attrs
jpayne@68	394
jpayne@68	395 def __set__(self, attrs):
jpayne@68	396 self._attrs = attrs
jpayne@68	397
jpayne@68	398 # TODO: make this more robust.
jpayne@68	399 @property
jpayne@68	400 def count(self):
jpayne@68	401 return int(self.fields[-1])
jpayne@68	402
jpayne@68	403 property name:
jpayne@68	404 """
jpayne@68	405 >>> import pybedtools
jpayne@68	406 >>> vcf = pybedtools.example_bedtool('v.vcf')
jpayne@68	407 >>> [v.name for v in vcf]
jpayne@68	408 ['rs6054257', 'chr1:16', 'rs6040355', 'chr1:222', 'microsat1']
jpayne@68	409
jpayne@68	410 """
jpayne@68	411 def __get__(self):
jpayne@68	412 cdef string ftype = self._bed.file_type
jpayne@68	413 value = None
jpayne@68	414 if ftype == <string>"gff":
jpayne@68	415 """
jpayne@68	416 # TODO. allow setting a name_key in the BedTool constructor?
jpayne@68	417 if self.name_key and self.name_key in attrs:
jpayne@68	418 return attrs[self.name_key]
jpayne@68	419 """
jpayne@68	420 for key in ("ID", "Name", "gene_name", "transcript_id", \
jpayne@68	421 "gene_id", "Parent"):
jpayne@68	422 if key in self.attrs:
jpayne@68	423 value = self.attrs[key]
jpayne@68	424 break
jpayne@68	425
jpayne@68	426 elif ftype == <string>"vcf":
jpayne@68	427 s = self.fields[2]
jpayne@68	428 if s in ("", "."):
jpayne@68	429 value = "%s:%i" % (self.chrom, self.start)
jpayne@68	430 else:
jpayne@68	431 value = _pystr(s)
jpayne@68	432 elif ftype == <string>"bed":
jpayne@68	433 value = _pystr(self._bed.name)
jpayne@68	434
jpayne@68	435 return value
jpayne@68	436
jpayne@68	437 def __set__(self, value):
jpayne@68	438 cdef string ftype = self._bed.file_type
jpayne@68	439
jpayne@68	440 if ftype == <string>"gff":
jpayne@68	441 for key in ("ID", "Name", "gene_name", "transcript_id", \
jpayne@68	442 "gene_id", "Parent"):
jpayne@68	443 if not key in self.attrs:
jpayne@68	444 continue
jpayne@68	445
jpayne@68	446 # If it's incoming from Python it's unicode, so store that directly
jpayne@68	447 # in the attributes (since an Attribute object works on
jpayne@68	448 # unicode)...
jpayne@68	449 self.attrs[key] = value
jpayne@68	450 break
jpayne@68	451
jpayne@68	452 # Otherwise use _cppstr() because we're storing it in _bed.fields.
jpayne@68	453 elif ftype == <string>"vcf":
jpayne@68	454 self._bed.fields[2] = _cppstr(value)
jpayne@68	455 else:
jpayne@68	456 self._bed.name = _cppstr(value)
jpayne@68	457 self._bed.fields[3] = _cppstr(value)
jpayne@68	458
jpayne@68	459 property score:
jpayne@68	460 def __get__(self):
jpayne@68	461 return _pystr(self._bed.score)
jpayne@68	462
jpayne@68	463 def __set__(self, value):
jpayne@68	464 value = _cppstr(value)
jpayne@68	465 self._bed.score = value
jpayne@68	466 idx = LOOKUPS[self.file_type]["score"]
jpayne@68	467 self._bed.fields[idx] = value
jpayne@68	468
jpayne@68	469 property file_type:
jpayne@68	470 "bed/vcf/gff"
jpayne@68	471 def __get__(self):
jpayne@68	472 return _pystr(self._bed.file_type)
jpayne@68	473
jpayne@68	474 def __set__(self, value):
jpayne@68	475 self._bed.file_type = _cppstr(value)
jpayne@68	476
jpayne@68	477 # TODO: maybe bed.overlap_start or bed.overlap.start ??
jpayne@68	478 @property
jpayne@68	479 def o_start(self):
jpayne@68	480 return self._bed.o_start
jpayne@68	481
jpayne@68	482 @property
jpayne@68	483 def o_end(self):
jpayne@68	484 return self._bed.o_end
jpayne@68	485
jpayne@68	486 @property
jpayne@68	487 def o_amt(self):
jpayne@68	488 return self._bed.o_end - self._bed.o_start
jpayne@68	489
jpayne@68	490 def __str__(self):
jpayne@68	491 """
jpayne@68	492 Interval objects always print with a newline to mimic a line in a
jpayne@68	493 BED/GFF/VCF file
jpayne@68	494 """
jpayne@68	495 items = []
jpayne@68	496 for i in self.fields:
jpayne@68	497 if isinstance(i, int):
jpayne@68	498 i = str(i)
jpayne@68	499 items.append(i)
jpayne@68	500
jpayne@68	501 return '\t'.join(items) + '\n'
jpayne@68	502
jpayne@68	503 def __repr__(self):
jpayne@68	504 return "Interval(%s:%i-%i)" % (self.chrom, self.start, self.end)
jpayne@68	505
jpayne@68	506 def __dealloc__(self):
jpayne@68	507 del self._bed
jpayne@68	508
jpayne@68	509 def __len__(self):
jpayne@68	510 return self._bed.end - self._bed.start
jpayne@68	511
jpayne@68	512 def __getitem__(self, object key):
jpayne@68	513 cdef int i
jpayne@68	514 ftype = _pystr(self._bed.file_type)
jpayne@68	515
jpayne@68	516 self.deparse_attrs()
jpayne@68	517
jpayne@68	518 if isinstance(key, (int, long)):
jpayne@68	519 nfields = self._bed.fields.size()
jpayne@68	520 if key >= nfields:
jpayne@68	521 raise IndexError('field index out of range')
jpayne@68	522 elif key < 0:
jpayne@68	523 key = nfields + key
jpayne@68	524 return _pystr(self._bed.fields.at(key))
jpayne@68	525 elif isinstance(key, slice):
jpayne@68	526 indices = key.indices(self._bed.fields.size())
jpayne@68	527 return [_pystr(self._bed.fields.at(i)) for i in range(*indices)]
jpayne@68	528
jpayne@68	529 elif isinstance(key, str):
jpayne@68	530 if ftype == "gff":
jpayne@68	531 try:
jpayne@68	532 return self.attrs[key]
jpayne@68	533 except KeyError:
jpayne@68	534 pass
jpayne@68	535 # We don't have to convert using _pystr() because the __get__
jpayne@68	536 # methods do that already.
jpayne@68	537 return getattr(self, key)
jpayne@68	538
jpayne@68	539 def __setitem__(self, object key, object value):
jpayne@68	540 if isinstance(key, (int, long)):
jpayne@68	541 nfields = self._bed.fields.size()
jpayne@68	542 if key >= nfields:
jpayne@68	543 raise IndexError('field index out of range')
jpayne@68	544 elif key < 0:
jpayne@68	545 key = nfields + key
jpayne@68	546 self._bed.fields[key] = _cppstr(value)
jpayne@68	547
jpayne@68	548 ft = _pystr(self._bed.file_type)
jpayne@68	549 if key in LOOKUPS[ft]:
jpayne@68	550 setattr(self, LOOKUPS[ft][key], value)
jpayne@68	551
jpayne@68	552 elif isinstance(key, (basestring)):
jpayne@68	553 setattr(self, key, value)
jpayne@68	554
jpayne@68	555 cpdef append(self, object value):
jpayne@68	556 self._bed.fields.push_back(_cppstr(value))
jpayne@68	557
jpayne@68	558 def __nonzero__(self):
jpayne@68	559 return True
jpayne@68	560
jpayne@68	561
jpayne@68	562 cdef Interval create_interval(BED b):
jpayne@68	563 cdef Interval pyb = Interval.__new__(Interval)
jpayne@68	564 pyb._bed = new BED(b.chrom, b.start, b.end, b.name,
jpayne@68	565 b.score, b.strand, b.fields,
jpayne@68	566 b.o_start, b.o_end, b.bedType, b.file_type, b.status)
jpayne@68	567 pyb._bed.fields = b.fields
jpayne@68	568 return pyb
jpayne@68	569
jpayne@68	570 # TODO: optimization: Previously we had (fields[1] + fields[2]).isdigit() when
jpayne@68	571 # checking in create_interval_from_list for filetype heuruistics. Is there
jpayne@68	572 # a performance hit by checking instances?
jpayne@68	573 cdef isdigit(s):
jpayne@68	574 if isinstance(s, integer_types):
jpayne@68	575 return True
jpayne@68	576 return s.isdigit()
jpayne@68	577
jpayne@68	578
jpayne@68	579 cpdef Interval create_interval_from_list(list fields):
jpayne@68	580 """
jpayne@68	581 Create an Interval object from a list of strings.
jpayne@68	582
jpayne@68	583 Constructor::
jpayne@68	584
jpayne@68	585 create_interval_from_list(fields)
jpayne@68	586
jpayne@68	587 Given the list of strings, `fields`, automatically detects the format (BED,
jpayne@68	588 GFF, VCF, SAM) and creates a new Interval object.
jpayne@68	589
jpayne@68	590 `fields` is a list with an arbitrary number of items (it can be quite long,
jpayne@68	591 say after a -wao intersection of a BED12 and a GFF), however, the first
jpayne@68	592 fields must conform to one of the supported formats. For example, if you
jpayne@68	593 want the resulting Interval to be considered a GFF feature, then the first
jpayne@68	594 9 fields must conform to the GFF format. Similarly, if you want the
jpayne@68	595 resulting Interval to be considered a BED feature, then the first three
jpayne@68	596 fields must be chrom, start, stop.
jpayne@68	597
jpayne@68	598 Example usage:
jpayne@68	599
jpayne@68	600 >>> # Creates a BED3 feature
jpayne@68	601 >>> feature = create_interval_from_list(['chr1', '1', '100'])
jpayne@68	602
jpayne@68	603 """
jpayne@68	604
jpayne@68	605 # TODO: this function is used a lot, and is doing a bit of work. We should
jpayne@68	606 # have an optimized version that is directly provided the filetype.
jpayne@68	607
jpayne@68	608 cdef Interval pyb = Interval.__new__(Interval)
jpayne@68	609 orig_fields = fields[:]
jpayne@68	610 # BED -- though a VCF will be detected as BED if its 2nd field, id, is a
jpayne@68	611 # digit
jpayne@68	612
jpayne@68	613 # SAM
jpayne@68	614 if (
jpayne@68	615 (len(fields) >= 11)
jpayne@68	616 and isdigit(fields[1])
jpayne@68	617 and isdigit(fields[3])
jpayne@68	618 and isdigit(fields[4])
jpayne@68	619 and (fields[5] not in ['.', '+', '-'])
jpayne@68	620 ):
jpayne@68	621 # TODO: what should the stop position be? Here, it's just the start
jpayne@68	622 # plus the length of the sequence, but perhaps this should eventually
jpayne@68	623 # do CIGAR string parsing.
jpayne@68	624 if int(fields[1]) & 0x04:
jpayne@68	625 # handle unmapped reads
jpayne@68	626 chrom = _cppstr("*")
jpayne@68	627 start = 0
jpayne@68	628 stop = 0
jpayne@68	629 else:
jpayne@68	630 chrom = _cppstr(fields[2])
jpayne@68	631 start = int(fields[3]) - 1
jpayne@68	632 stop = int(fields[3]) + len(fields[9]) - 1
jpayne@68	633 name = _cppstr(fields[0])
jpayne@68	634 score = _cppstr(fields[1])
jpayne@68	635 if int(fields[1]) & 0x10:
jpayne@68	636 strand = _cppstr('-')
jpayne@68	637 else:
jpayne@68	638 strand = _cppstr('+')
jpayne@68	639
jpayne@68	640 # Fields is in SAM format
jpayne@68	641 fields[3] = str(start + 1)
jpayne@68	642
jpayne@68	643 pyb._bed = new BED(
jpayne@68	644 chrom,
jpayne@68	645 start,
jpayne@68	646 stop,
jpayne@68	647 strand,
jpayne@68	648 name,
jpayne@68	649 score,
jpayne@68	650 list_to_vector(fields))
jpayne@68	651 pyb.file_type = _cppstr('sam')
jpayne@68	652
jpayne@68	653
jpayne@68	654 elif isdigit(fields[1]) and isdigit(fields[2]):
jpayne@68	655 # if it's too short, just add some empty fields.
jpayne@68	656 if len(fields) < 7:
jpayne@68	657 fields.extend([".".encode('UTF-8')] * (6 - len(fields)))
jpayne@68	658 other_fields = []
jpayne@68	659 else:
jpayne@68	660 other_fields = fields[6:]
jpayne@68	661
jpayne@68	662 pyb._bed = new BED(
jpayne@68	663 _cppstr(fields[0]),
jpayne@68	664 int(fields[1]),
jpayne@68	665 int(fields[2]),
jpayne@68	666 _cppstr(fields[3]),
jpayne@68	667 _cppstr(fields[4]),
jpayne@68	668 _cppstr(fields[5]),
jpayne@68	669 list_to_vector(other_fields))
jpayne@68	670 pyb.file_type = _cppstr('bed')
jpayne@68	671
jpayne@68	672 # VCF
jpayne@68	673 elif isdigit(fields[1]) and not isdigit(fields[3]) and len(fields) >= 8:
jpayne@68	674 pyb._bed = new BED(
jpayne@68	675 _cppstr(fields[0]),
jpayne@68	676 int(fields[1]) - 1,
jpayne@68	677 int(fields[1]),
jpayne@68	678 _cppstr(fields[2]),
jpayne@68	679 _cppstr(fields[5]),
jpayne@68	680 _cppstr('.'),
jpayne@68	681 list_to_vector(fields))
jpayne@68	682 pyb.file_type = b'vcf'
jpayne@68	683
jpayne@68	684
jpayne@68	685 # GFF
jpayne@68	686 elif len(fields) >= 9 and isdigit(fields[3]) and isdigit(fields[4]):
jpayne@68	687 pyb._bed = new BED(
jpayne@68	688 _cppstr(fields[0]),
jpayne@68	689 int(fields[3])-1, int(fields[4]),
jpayne@68	690 _cppstr(fields[2]),
jpayne@68	691 _cppstr(fields[5]),
jpayne@68	692 _cppstr(fields[6]),
jpayne@68	693 list_to_vector(fields[7:]))
jpayne@68	694 pyb.file_type = _cppstr('gff')
jpayne@68	695 else:
jpayne@68	696 raise MalformedBedLineError('Unable to detect format from %s' % fields)
jpayne@68	697
jpayne@68	698 if pyb.start > pyb.end:
jpayne@68	699 raise MalformedBedLineError("Start is greater than stop")
jpayne@68	700 pyb._bed.fields = list_to_vector(orig_fields)
jpayne@68	701 return pyb
jpayne@68	702
jpayne@68	703 cdef vector[string] list_to_vector(list li):
jpayne@68	704 cdef vector[string] s
jpayne@68	705 cdef int i
jpayne@68	706 for i in range(len(li)):
jpayne@68	707 _s = li[i]
jpayne@68	708 s.push_back(_cppstr(_s))
jpayne@68	709 return s
jpayne@68	710
jpayne@68	711 cdef list string_vec2list(vector[string] sv):
jpayne@68	712 cdef size_t size = sv.size(), i
jpayne@68	713 return [_pystr(sv.at(i)) for i in range(size)]
jpayne@68	714
jpayne@68	715 cdef list bed_vec2list(vector[BED] bv):
jpayne@68	716 cdef size_t size = bv.size(), i
jpayne@68	717 cdef list l = []
jpayne@68	718 cdef BED b
jpayne@68	719 for i in range(size):
jpayne@68	720 b = bv.at(i)
jpayne@68	721 l.append(create_interval(b))
jpayne@68	722 return l
jpayne@68	723
jpayne@68	724
jpayne@68	725 def overlap(int s1, int s2, int e1, int e2):
jpayne@68	726 return min(e1, e2) - max(s1, s2)
jpayne@68	727
jpayne@68	728
jpayne@68	729 cdef class IntervalIterator:
jpayne@68	730 cdef object stream
jpayne@68	731 cdef int _itemtype
jpayne@68	732 def __init__(self, stream):
jpayne@68	733 self.stream = stream
jpayne@68	734
jpayne@68	735 # For speed, check int rather than call isinstance().
jpayne@68	736 # -1 is unset, 0 assumes list/tuple/iterable, and 1 is a string.
jpayne@68	737 #
jpayne@68	738 # Also assumes that all items in the iterable `stream` are the same
jpayne@68	739 # type...this seems like a reasonable assumption.
jpayne@68	740 self._itemtype = -1
jpayne@68	741
jpayne@68	742 def __dealloc__(self):
jpayne@68	743 try:
jpayne@68	744 self.stream.close()
jpayne@68	745 except AttributeError:
jpayne@68	746 pass
jpayne@68	747
jpayne@68	748 def __iter__(self):
jpayne@68	749 return self
jpayne@68	750
jpayne@68	751 def __next__(self):
jpayne@68	752 while True:
jpayne@68	753 if hasattr(self.stream, 'closed'):
jpayne@68	754 if self.stream.closed:
jpayne@68	755 raise StopIteration
jpayne@68	756 try:
jpayne@68	757 line = next(self.stream)
jpayne@68	758 except StopIteration:
jpayne@68	759 if hasattr(self.stream, 'close'):
jpayne@68	760 self.stream.close()
jpayne@68	761 raise StopIteration
jpayne@68	762
jpayne@68	763 if self._itemtype < 0:
jpayne@68	764 if isinstance(line, Interval):
jpayne@68	765 self._itemtype = 2
jpayne@68	766 elif isinstance(line, basestring):
jpayne@68	767 self._itemtype = 1
jpayne@68	768 else:
jpayne@68	769 self._itemtype = 0
jpayne@68	770
jpayne@68	771 if self._itemtype == 1:
jpayne@68	772 if line.startswith(('@', '#', 'track', 'browser')) or len(line.strip()) == 0:
jpayne@68	773 continue
jpayne@68	774 break
jpayne@68	775
jpayne@68	776 # Iterable of Interval objects
jpayne@68	777 if self._itemtype == 2:
jpayne@68	778 return line
jpayne@68	779
jpayne@68	780 # Iterable of strings, in which case we need to split
jpayne@68	781 elif self._itemtype == 1:
jpayne@68	782 fields = line.rstrip('\r\n').split('\t')
jpayne@68	783
jpayne@68	784 # Otherwise assume list/tuple/iterable of fields
jpayne@68	785 else:
jpayne@68	786 fields = list(line)
jpayne@68	787
jpayne@68	788 # TODO: optimization: create_interval_from_list should have a version
jpayne@68	789 # that accepts C++ string instances
jpayne@68	790 return create_interval_from_list(fields)
jpayne@68	791
jpayne@68	792
jpayne@68	793
jpayne@68	794 cdef class IntervalFile:
jpayne@68	795 cdef BedFile *intervalFile_ptr
jpayne@68	796 cdef bint _loaded
jpayne@68	797 cdef bint _open
jpayne@68	798 cdef string _fn
jpayne@68	799 """
jpayne@68	800 An IntervalFile provides low-level access to the BEDTools API.
jpayne@68	801
jpayne@68	802 >>> fn = pybedtools.example_filename('a.bed')
jpayne@68	803 >>> intervalfile = pybedtools.IntervalFile(fn)
jpayne@68	804
jpayne@68	805 """
jpayne@68	806 def __init__(self, intervalFile):
jpayne@68	807 self.intervalFile_ptr = new BedFile(_cppstr(intervalFile))
jpayne@68	808 self._loaded = 0
jpayne@68	809 self._open = 0
jpayne@68	810 self._fn = _cppstr(intervalFile)
jpayne@68	811
jpayne@68	812 def __dealloc__(self):
jpayne@68	813 del self.intervalFile_ptr
jpayne@68	814
jpayne@68	815 def __iter__(self):
jpayne@68	816 return self
jpayne@68	817
jpayne@68	818 def __next__(self):
jpayne@68	819 if not self._open:
jpayne@68	820 result = self.intervalFile_ptr.Open()
jpayne@68	821 if result == -1:
jpayne@68	822 raise BedToolsFileError("Error opening file")
jpayne@68	823 self._open = 1
jpayne@68	824 cdef BED b = self.intervalFile_ptr.GetNextBed()
jpayne@68	825 if b.status == BED_VALID:
jpayne@68	826 return create_interval(b)
jpayne@68	827 elif b.status == BED_INVALID:
jpayne@68	828 self.intervalFile_ptr.Close()
jpayne@68	829 raise StopIteration
jpayne@68	830 elif b.status == BED_MALFORMED:
jpayne@68	831 self.intervalFile_ptr.Close()
jpayne@68	832 raise MalformedBedLineError("malformed line: %s" % string_vec2list(b.fields))
jpayne@68	833 else:
jpayne@68	834 return next(self)
jpayne@68	835
jpayne@68	836 @property
jpayne@68	837 def fn(self):
jpayne@68	838 return _pystr(self._fn)
jpayne@68	839
jpayne@68	840 @property
jpayne@68	841 def file_type(self):
jpayne@68	842 if not self.intervalFile_ptr._typeIsKnown:
jpayne@68	843 try:
jpayne@68	844 a = next(iter(self))
jpayne@68	845 file_type = _pystr(self.intervalFile_ptr.file_type)
jpayne@68	846 self.intervalFile_ptr.Close()
jpayne@68	847 return file_type
jpayne@68	848 except MalformedBedLineError:
jpayne@68	849 # If it's a SAM, raise a meaningful exception. If not, fail.
jpayne@68	850 with open(self.fn) as fn:
jpayne@68	851 interval = create_interval_from_list(fn.readline().strip().split())
jpayne@68	852 if interval.file_type == 'sam':
jpayne@68	853 raise ValueError('IntervalFile objects do not yet natively support SAM. '
jpayne@68	854 'Please convert to BED/GFF/VCF first if you want to '
jpayne@68	855 'use the low-level API of IntervalFile')
jpayne@68	856 else:
jpayne@68	857 raise
jpayne@68	858
jpayne@68	859
jpayne@68	860 def loadIntoMap(self):
jpayne@68	861 """
jpayne@68	862 Prepares file for checking intersections. Used by other methods like all_hits()
jpayne@68	863 """
jpayne@68	864 if self._loaded:
jpayne@68	865 return
jpayne@68	866 self.intervalFile_ptr.loadBedFileIntoMap()
jpayne@68	867 self._loaded = 1
jpayne@68	868
jpayne@68	869 def rewind(self):
jpayne@68	870 """
jpayne@68	871 Jump to the beginning of the file.
jpayne@68	872 """
jpayne@68	873 if not self._open:
jpayne@68	874 self.intervalFile_ptr.Open()
jpayne@68	875 self._open = 1
jpayne@68	876 self.intervalFile_ptr.Rewind()
jpayne@68	877
jpayne@68	878 def seek(self, offset):
jpayne@68	879 """
jpayne@68	880 Jump to a specific byte offset in the file
jpayne@68	881 """
jpayne@68	882 if not self._open:
jpayne@68	883 self.intervalFile_ptr.Open()
jpayne@68	884 self._open = 1
jpayne@68	885 self.intervalFile_ptr.Seek(offset)
jpayne@68	886
jpayne@68	887
jpayne@68	888 def all_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
jpayne@68	889 """
jpayne@68	890 :Signature: `IntervalFile.all_hits(interval, same_strand=False, overlap=0.0)`
jpayne@68	891
jpayne@68	892 Search for the Interval `interval` this file and return all
jpayne@68	893 overlaps as a list.
jpayne@68	894
jpayne@68	895 `same_strand`, if True, will only consider hits on the same strand as `interval`.
jpayne@68	896
jpayne@68	897 `overlap` can be used to specify the fraction of overlap between
jpayne@68	898 `interval` and each feature in the IntervalFile.
jpayne@68	899
jpayne@68	900 Example usage:
jpayne@68	901
jpayne@68	902 >>> fn = pybedtools.example_filename('a.bed')
jpayne@68	903
jpayne@68	904 >>> # create an Interval to query with
jpayne@68	905 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
jpayne@68	906
jpayne@68	907 >>> # Create an IntervalFile out of a.bed
jpayne@68	908 >>> intervalfile = pybedtools.IntervalFile(fn)
jpayne@68	909
jpayne@68	910 >>> # get stranded hits
jpayne@68	911 >>> intervalfile.all_hits(i, same_strand=True)
jpayne@68	912 [Interval(chr1:1-100), Interval(chr1:100-200), Interval(chr1:900-950)]
jpayne@68	913
jpayne@68	914 """
jpayne@68	915 cdef vector[BED] vec_b
jpayne@68	916 self.loadIntoMap()
jpayne@68	917
jpayne@68	918 if same_strand == False:
jpayne@68	919 vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), overlap)
jpayne@68	920 try:
jpayne@68	921 return bed_vec2list(vec_b)
jpayne@68	922 finally:
jpayne@68	923 pass
jpayne@68	924 else:
jpayne@68	925 vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), same_strand, overlap)
jpayne@68	926 try:
jpayne@68	927 return bed_vec2list(vec_b)
jpayne@68	928 finally:
jpayne@68	929 pass
jpayne@68	930
jpayne@68	931 # search() is an alias for all_hits
jpayne@68	932 search = all_hits
jpayne@68	933
jpayne@68	934 def any_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
jpayne@68	935 """
jpayne@68	936 :Signature: `IntervalFile.any_hits(interval, same_strand=False, overlap=0.0)`
jpayne@68	937
jpayne@68	938 Return 1 if the Interval `interval` had >=1 hit in this IntervalFile, 0 otherwise.
jpayne@68	939
jpayne@68	940 `same_strand`, if True, will only consider hits on the same strand as `interval`.
jpayne@68	941
jpayne@68	942 `overlap` can be used to specify the fraction of overlap between
jpayne@68	943 `interval` and each feature in the IntervalFile.
jpayne@68	944
jpayne@68	945 Example usage:
jpayne@68	946
jpayne@68	947 >>> fn = pybedtools.example_filename('a.bed')
jpayne@68	948
jpayne@68	949 >>> # create an Interval to query with
jpayne@68	950 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
jpayne@68	951
jpayne@68	952 >>> # Create an IntervalFile out of a.bed
jpayne@68	953 >>> intervalfile = pybedtools.IntervalFile(fn)
jpayne@68	954
jpayne@68	955 >>> # any stranded hits?
jpayne@68	956 >>> intervalfile.any_hits(i, same_strand=True)
jpayne@68	957 1
jpayne@68	958
jpayne@68	959 """
jpayne@68	960 found = 0
jpayne@68	961 self.loadIntoMap()
jpayne@68	962
jpayne@68	963 if same_strand == False:
jpayne@68	964 found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), overlap)
jpayne@68	965 else:
jpayne@68	966 found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), same_strand, overlap)
jpayne@68	967
jpayne@68	968 return found
jpayne@68	969
jpayne@68	970 def count_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
jpayne@68	971 """
jpayne@68	972 :Signature: `IntervalFile.count_hits(interval, same_strand=False, overlap=0.0)`
jpayne@68	973
jpayne@68	974 Return the number of overlaps of the Interval `interval` had with this
jpayne@68	975 IntervalFile.
jpayne@68	976
jpayne@68	977 `same_strand`, if True, will only consider hits on the same strand as
jpayne@68	978 `interval`.
jpayne@68	979
jpayne@68	980 `overlap` can be used to specify the fraction of overlap between
jpayne@68	981 `interval` and each feature in the IntervalFile.
jpayne@68	982
jpayne@68	983 Example usage:
jpayne@68	984
jpayne@68	985 >>> fn = pybedtools.example_filename('a.bed')
jpayne@68	986
jpayne@68	987 >>> # create an Interval to query with
jpayne@68	988 >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
jpayne@68	989
jpayne@68	990 >>> # Create an IntervalFile out of a.bed
jpayne@68	991 >>> intervalfile = pybedtools.IntervalFile(fn)
jpayne@68	992
jpayne@68	993 >>> # get number of stranded hits
jpayne@68	994 >>> intervalfile.count_hits(i, same_strand=True)
jpayne@68	995 3
jpayne@68	996
jpayne@68	997 """
jpayne@68	998 self.loadIntoMap()
jpayne@68	999
jpayne@68	1000 if same_strand == False:
jpayne@68	1001 return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), overlap)
jpayne@68	1002 else:
jpayne@68	1003 return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), same_strand, overlap)

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/cbedtools.pyx @ 68:5028fdace37b