Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/cbedtools.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
cdef _cppstr(s):
    # Normalise a value coming from Python into a C++ ``string``.
    #
    # Integer-like values (anything matching ``integer_types``) are
    # stringified first.  Unicode text is then UTF-8 encoded, because the C++
    # side only stores byte strings; values that already are byte strings are
    # handed over unchanged.
    text = str(s) if isinstance(s, integer_types) else s
    if isinstance(text, unicode):
        text = text.encode('UTF-8')
    return <string> text
# Per file type, the column index (position in ``Interval.fields``) that backs
# each named Interval attribute.  "end" and "stop" are two names for the same
# column.
#
# NOTE(review): only "bed" carries a "score" entry, while the ``score`` setter
# on Interval indexes ``LOOKUPS[self.file_type]["score"]`` for whatever file
# type the Interval has -- confirm whether gff/vcf are meant to raise KeyError
# there.
cdef dict LOOKUPS = {
    "gff": {"chrom": 0, "start": 3, "end": 4, "stop": 4, "strand": 6},
    "vcf": {"chrom": 0, "start": 1},
    "bed": {"chrom": 0, "start": 1, "end": 2, "stop": 2, "score": 4, "strand": 5}
}
# Add the reverse direction (column index -> attribute name) to the very same
# dicts, so a positional assignment can be routed to the matching attribute.
# ``list(...)`` snapshots the items because the dict is modified while looping.
# Where two names share a column ("end"/"stop"), the reverse entry ends up
# holding whichever name is visited last.
for ktype, kdict in list(LOOKUPS.items()):
    for k, v in list(kdict.items()):
        kdict[v] = k
class Attributes(dict):
    """
    Class to map between a dict of attrs and fields[8] of a GFF Interval obj.

    The attribute string is split on ';' into key/value pairs.  The key/value
    separator is guessed from the string: '=' (GFF style) when there are about
    as many '=' as ';', otherwise a single space (GTF style).

    Instance attributes set by the constructor:

    * ``sep`` / ``field_sep`` -- pair separator and key/value separator used
      for both parsing and serialising.
    * ``sort_keys`` -- when True, ``str()`` emits the pairs sorted by key
      instead of in dict order (default False).
    * ``_quoted`` -- records which fields had a double-quoted value in the
      input so that the quotes are restored by ``str()``.
    * ``_attr_str`` -- the original, unparsed attribute string.

    Raises ValueError (from tuple unpacking) if a non-empty pair does not
    contain the key/value separator.
    """

    def __init__(self, attr_str=""):
        attr_str = str(attr_str)
        self._attr_str = attr_str
        self.sort_keys = False

        # in general, GFF files will have either as many '=' as ';'
        # (or ';'-1 if there's no trailing ';')
        n_semi = attr_str.count(';')
        n_eq = attr_str.count('=')

        if n_eq > n_semi - 1:
            self.sep, self.field_sep = (';', '=')
        else:
            self.sep, self.field_sep = (';', ' ')

        self._quoted = {}

        # TODO: pathological case . . . detect this as GFF:
        #
        #   class_code=" "
        #
        # and this as GTF:
        #
        #   class_code "="

        # quick exit
        if attr_str == "":
            return

        for kv in attr_str.strip().split(self.sep):
            kv = kv.strip()
            if not kv:
                # empty chunk, e.g. after a trailing ';'
                continue
            field, value = kv.split(self.field_sep, 1)
            if value.count('"') == 2:
                self._quoted[field] = True
            self[field] = value.replace('"', '')

    def __str__(self):
        """
        Serialise back to an attribute string, always ending with ``sep``.

        Every value is passed through ``str()`` so that values assigned as
        non-strings (e.g. ``attrs['score'] = 5``) serialise instead of making
        the join fail; originally-quoted fields get their quotes back.
        """
        items = []
        for field, val in self.items():
            val = str(val)
            if self._quoted.get(field):
                val = '"' + val + '"'
            items.append((field, val))

        if self.sort_keys:
            items.sort()

        pairs = [self.field_sep.join(item) for item in items]
        return self.sep.join(pairs) + self.sep
def __init__(self, chrom, start, end, name=".", score=".", strand=".", otherfields=None):
    # Everything handed to the C++ BED object has to be a C++ string, so each
    # textual argument is converted exactly once and reused below.
    extra = [] if otherfields is None else [_cppstr(item) for item in otherfields]
    c_chrom = _cppstr(chrom)
    c_name = _cppstr(name)
    c_score = _cppstr(score)
    c_strand = _cppstr(strand)

    self._bed = new BED(c_chrom, start, end, c_name, c_score, c_strand, extra)

    # `fields` mirrors a full line: the six leading BED columns (start and
    # end stringified) followed by whatever was passed as `otherfields`.
    leading = [c_chrom, _cppstr(str(start)), _cppstr(str(end)),
               c_name, c_score, c_strand]
    self._bed.fields = leading + extra

    # Attributes are parsed lazily on first access of `attrs`.
    self._attrs = None
# < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5
def __richcmp__(self, other, int op):
    # Rich comparison of two intervals, driven by the PROFILES_TRUE table.
    #
    # Intervals on different chromosomes or strands only satisfy `!=`.
    # Otherwise the relative position is summarised as four three-way
    # comparisons (start/start, stop/stop, start/stop, stop/start) and the
    # table says which operator codes hold for that arrangement.
    #
    # Raises NotImplementedError when the table marks the arrangement as
    # undefined (nested features) and ValueError when the arrangement is not
    # in the table at all.
    if (self.chrom != other.chrom) or (self.strand != other.strand):
        return op == 3

    def sign(lhs, rhs):
        # -1, 0 or 1 for lhs <, ==, > rhs
        return (lhs > rhs) - (lhs < rhs)

    # check all 4 so that we can handle nesting and partial overlaps.
    profile = (
        sign(self.start, other.start),
        sign(self.stop, other.stop),
        sign(self.start, other.stop),
        sign(self.stop, other.start),
    )

    try:
        true_ops = PROFILES_TRUE[profile]
    except KeyError:
        raise ValueError('Currently unsupported comparison -- please '
                         'submit a bug report')

    if true_ops == tuple():
        raise NotImplementedError('Features are nested -- comparison undefined')

    # `!=` is not listed in the table; it is the negation of `==` (code 2).
    if op == 3:
        return 2 not in true_ops
    return op in true_ops
cpdef deparse_attrs(self):
    # Serialise the cached attribute mapping back into fields[8] so that the
    # raw fields reflect edits made through `Interval.attrs`.
    #
    # Raises ValueError if a non-empty attribute mapping is cached on an
    # Interval whose file type is not "gff".

    # Nothing cached yet -> nothing to write back.
    # NOTE(review): this test is also true for an *empty* mapping, so an
    # attrs object whose keys were all removed is not written back either --
    # confirm that this is intended.
    if not self._attrs: return

    if self.file_type != "gff":
        raise ValueError('Interval.attrs was not None, but this was a non-GFF Interval')

    s = self._attrs.__str__()
    self._bed.fields[8] = _cppstr(s)
+ if self.name_key and self.name_key in attrs: + return attrs[self.name_key] + """ + for key in ("ID", "Name", "gene_name", "transcript_id", \ + "gene_id", "Parent"): + if key in self.attrs: + value = self.attrs[key] + break + + elif ftype == <string>"vcf": + s = self.fields[2] + if s in ("", "."): + value = "%s:%i" % (self.chrom, self.start) + else: + value = _pystr(s) + elif ftype == <string>"bed": + value = _pystr(self._bed.name) + + return value + + def __set__(self, value): + cdef string ftype = self._bed.file_type + + if ftype == <string>"gff": + for key in ("ID", "Name", "gene_name", "transcript_id", \ + "gene_id", "Parent"): + if not key in self.attrs: + continue + + # If it's incoming from Python it's unicode, so store that directly + # in the attributes (since an Attribute object works on + # unicode)... + self.attrs[key] = value + break + + # Otherwise use _cppstr() because we're storing it in _bed.fields. + elif ftype == <string>"vcf": + self._bed.fields[2] = _cppstr(value) + else: + self._bed.name = _cppstr(value) + self._bed.fields[3] = _cppstr(value) + + property score: + def __get__(self): + return _pystr(self._bed.score) + + def __set__(self, value): + value = _cppstr(value) + self._bed.score = value + idx = LOOKUPS[self.file_type]["score"] + self._bed.fields[idx] = value + + property file_type: + "bed/vcf/gff" + def __get__(self): + return _pystr(self._bed.file_type) + + def __set__(self, value): + self._bed.file_type = _cppstr(value) + + # TODO: maybe bed.overlap_start or bed.overlap.start ?? 
def __str__(self):
    """
    Return the fields joined by tabs and terminated by a newline, so that
    printing an Interval mimics a line in a BED/GFF/VCF file.
    """
    columns = [str(f) if isinstance(f, int) else f for f in self.fields]
    return '\t'.join(columns) + '\n'
cpdef Interval create_interval_from_list(list fields):
    """
    Create an Interval object from a list of strings.

    Constructor::

        create_interval_from_list(fields)

    Given the list of strings, `fields`, automatically detects the format (BED,
    GFF, VCF, SAM) and creates a new Interval object.

    `fields` is a list with an arbitrary number of items (it can be quite long,
    say after a -wao intersection of a BED12 and a GFF), however, the first
    fields must conform to one of the supported formats.  For example, if you
    want the resulting Interval to be considered a GFF feature, then the first
    9 fields must conform to the GFF format.  Similarly, if you want the
    resulting Interval to be considered a BED feature, then the first three
    fields must be chrom, start, stop.

    The list passed in is not modified; padding and rewriting happen on a
    private copy, and the Interval's `fields` are built from the input as
    given.

    Raises MalformedBedLineError if no format can be detected or if the
    resulting start is greater than the end.

    Example usage:

        >>> # Creates a BED3 feature
        >>> feature = create_interval_from_list(['chr1', '1', '100'])

    """

    # TODO: this function is used a lot, and is doing a bit of work.  We should
    # have an optimized version that is directly provided the filetype.

    cdef Interval pyb = Interval.__new__(Interval)

    # Keep the untouched input for the final `fields` vector and work on a
    # copy: the branches below pad the list (BED) or overwrite an entry (SAM),
    # which must not leak back into the caller's list.
    orig_fields = fields
    fields = fields[:]

    # BED -- though a VCF will be detected as BED if its 2nd field, id, is a
    # digit

    # SAM
    if (
        (len(fields) >= 11)
        and isdigit(fields[1])
        and isdigit(fields[3])
        and isdigit(fields[4])
        and (fields[5] not in ['.', '+', '-'])
    ):
        # TODO: what should the stop position be?  Here, it's just the start
        # plus the length of the sequence, but perhaps this should eventually
        # do CIGAR string parsing.
        if int(fields[1]) & 0x04:
            # handle unmapped reads
            chrom = _cppstr("*")
            start = 0
            stop = 0
        else:
            chrom = _cppstr(fields[2])
            start = int(fields[3]) - 1
            stop = int(fields[3]) + len(fields[9]) - 1
        name = _cppstr(fields[0])
        score = _cppstr(fields[1])
        if int(fields[1]) & 0x10:
            strand = _cppstr('-')
        else:
            strand = _cppstr('+')

        # Fields is in SAM format
        fields[3] = str(start + 1)

        # NOTE(review): the arguments here are passed as (strand, name,
        # score), whereas every other branch and Interval.__init__ pass
        # (name, score, strand) -- confirm against the BED constructor.
        pyb._bed = new BED(
            chrom,
            start,
            stop,
            strand,
            name,
            score,
            list_to_vector(fields))
        pyb.file_type = _cppstr('sam')

    elif isdigit(fields[1]) and isdigit(fields[2]):
        # if it's too short, just add some empty fields.
        if len(fields) < 7:
            fields.extend([".".encode('UTF-8')] * (6 - len(fields)))
            other_fields = []
        else:
            other_fields = fields[6:]

        pyb._bed = new BED(
            _cppstr(fields[0]),
            int(fields[1]),
            int(fields[2]),
            _cppstr(fields[3]),
            _cppstr(fields[4]),
            _cppstr(fields[5]),
            list_to_vector(other_fields))
        pyb.file_type = _cppstr('bed')

    # VCF
    elif isdigit(fields[1]) and not isdigit(fields[3]) and len(fields) >= 8:
        pyb._bed = new BED(
            _cppstr(fields[0]),
            int(fields[1]) - 1,
            int(fields[1]),
            _cppstr(fields[2]),
            _cppstr(fields[5]),
            _cppstr('.'),
            list_to_vector(fields))
        pyb.file_type = b'vcf'

    # GFF
    elif len(fields) >= 9 and isdigit(fields[3]) and isdigit(fields[4]):
        pyb._bed = new BED(
            _cppstr(fields[0]),
            int(fields[3])-1, int(fields[4]),
            _cppstr(fields[2]),
            _cppstr(fields[5]),
            _cppstr(fields[6]),
            list_to_vector(fields[7:]))
        pyb.file_type = _cppstr('gff')
    else:
        raise MalformedBedLineError('Unable to detect format from %s' % fields)

    if pyb.start > pyb.end:
        raise MalformedBedLineError("Start is greater than stop")

    # The Interval always exposes the line exactly as it was passed in.
    pyb._bed.fields = list_to_vector(orig_fields)
    return pyb
def __next__(self):
    # Return the next Interval from the wrapped stream.
    #
    # The kind of item the stream yields is detected once, on the first item
    # (2: Interval objects, 1: text lines, 0: any other iterable of fields),
    # and cached in `_itemtype`.  For text lines, header/comment lines and
    # blank lines are skipped.
    while True:
        # A stream that reports itself as closed ends the iteration.
        if hasattr(self.stream, 'closed') and self.stream.closed:
            raise StopIteration

        try:
            item = next(self.stream)
        except StopIteration:
            # Exhausted: close the stream if it can be closed.
            if hasattr(self.stream, 'close'):
                self.stream.close()
            raise StopIteration

        if self._itemtype < 0:
            if isinstance(item, Interval):
                self._itemtype = 2
            elif isinstance(item, basestring):
                self._itemtype = 1
            else:
                self._itemtype = 0

        # Only text lines can be headers or blanks; anything else is used
        # as it comes.
        if self._itemtype != 1:
            break
        if not item.startswith(('@', '#', 'track', 'browser')) and len(item.strip()) != 0:
            break

    # Iterable of Interval objects
    if self._itemtype == 2:
        return item

    if self._itemtype == 1:
        # Iterable of strings, in which case we need to split
        fields = item.rstrip('\r\n').split('\t')
    else:
        # Otherwise assume list/tuple/iterable of fields
        fields = list(item)

    # TODO: optimization: create_interval_from_list should have a version
    # that accepts C++ string instances
    return create_interval_from_list(fields)
@property
def file_type(self):
    """
    The file type reported by the underlying BedFile, as a Python string.

    If the type is not known yet, one record is read to let the BedFile
    detect it and the file is closed again.  If the type is already known it
    is returned directly (this case used to fall through and return None).

    Raises ValueError if the first line looks like SAM, which IntervalFile
    does not support natively; any other MalformedBedLineError is re-raised.
    """
    if self.intervalFile_ptr._typeIsKnown:
        return _pystr(self.intervalFile_ptr.file_type)

    try:
        # Reading one record is only done for its side effect of letting
        # the BedFile determine the file type.
        next(iter(self))
        file_type = _pystr(self.intervalFile_ptr.file_type)
        self.intervalFile_ptr.Close()
        return file_type
    except MalformedBedLineError:
        # If it's a SAM, raise a meaningful exception.  If not, fail.
        with open(self.fn) as fn:
            interval = create_interval_from_list(fn.readline().strip().split())
        if interval.file_type == 'sam':
            raise ValueError('IntervalFile objects do not yet natively support SAM. '
                             'Please convert to BED/GFF/VCF first if you want to '
                             'use the low-level API of IntervalFile')
        else:
            raise
def all_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
    """
    :Signature: `IntervalFile.all_hits(interval, same_strand=False, overlap=0.0)`

    Search for the Interval `interval` in this file and return **all**
    overlaps as a list of Interval objects.

    `same_strand`, if True, will only consider hits on the same strand as
    `interval`.

    `overlap` can be used to specify the fraction of overlap between
    `interval` and each feature in the IntervalFile.

    The file is loaded into the lookup map on first use.

    Example usage:

    >>> fn = pybedtools.example_filename('a.bed')

    >>> # create an Interval to query with
    >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')

    >>> # Create an IntervalFile out of a.bed
    >>> intervalfile = pybedtools.IntervalFile(fn)

    >>> # get stranded hits
    >>> intervalfile.all_hits(i, same_strand=True)
    [Interval(chr1:1-100), Interval(chr1:100-200), Interval(chr1:900-950)]

    """
    cdef vector[BED] vec_b
    self.loadIntoMap()

    # The strand-aware overload is only used when strandedness was asked for.
    if same_strand:
        vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), same_strand, overlap)
    else:
        vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), overlap)
    return bed_vec2list(vec_b)
def count_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
    """
    :Signature: `IntervalFile.count_hits(interval, same_strand=False, overlap=0.0)`

    Return the number of overlaps the Interval `interval` has with this
    IntervalFile.

    `same_strand`, if True, will only consider hits on the same strand as
    `interval`.

    `overlap` can be used to specify the fraction of overlap between
    `interval` and each feature in the IntervalFile.

    The file is loaded into the lookup map on first use.

    Example usage:

    >>> fn = pybedtools.example_filename('a.bed')

    >>> # create an Interval to query with
    >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')

    >>> # Create an IntervalFile out of a.bed
    >>> intervalfile = pybedtools.IntervalFile(fn)

    >>> # get number of stranded hits
    >>> intervalfile.count_hits(i, same_strand=True)
    3

    """
    self.loadIntoMap()

    # The strand-aware overload is only used when strandedness was asked for.
    if same_strand:
        return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), same_strand, overlap)
    return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), overlap)