Mercurial > repos > rliterman > csp2

diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/cbedtools.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author: jpayne
date: Tue, 18 Mar 2025 16:23:26 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/cbedtools.pyx	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,1003 @@
+# distutils: language = c++
+# cython: language_level=2
+
+# String notes:
+#
+#   Anything that goes in C++ objects should be converted to a C++ <string>
+#   type, using the _cppstr() function.  For example: Interval._bed.file_type,
+#   or the entries in Interval._bed.fields.
+#
+#   Any Python accessor methods (Interval.fields, Interval.__getitem__) should
+#   then be converted to Python strings using the _pystr() function.
+#
+#   Cython uses the `str` type as whatever the native Python version uses as
+#   str.
+
+from libcpp.string cimport string
+import numpy as np
+
+# Python byte strings automatically coerce to/from C++ strings.
+
+cdef _cppstr(s):
+    # Convert an incoming Python value into a C++ <string>.
+    #
+    # Anything listed in `integer_types` is first rendered with str().
+    # Unicode text is then encoded to UTF-8 bytes; byte strings are passed
+    # through unchanged because they coerce to C++ strings directly.
+    value = str(s) if isinstance(s, integer_types) else s
+    if isinstance(value, unicode):
+        return <string> (value.encode('UTF-8'))
+    return <string> value
+
+cdef _pystr(string s):
+    # Use this to prepare a C++ string for sending to Python.
+    #
+    # Always returns unicode: the bytes are decoded as UTF-8 in 'strict'
+    # mode, so invalid UTF-8 raises rather than being silently replaced.
+    return s.decode('UTF-8', 'strict')
+
+# Types that _cppstr() and isdigit() treat as integers.  np.int64 is listed
+# explicitly in addition to the builtin int/long.
+integer_types = (int, long, np.int64)
+
+
+"""
+    bedtools.pyx: A Cython wrapper for the BEDTools BedFile class
+
+    Authors: Aaron Quinlan[1], Brent Pedersen[2]
+    Affl:    [1] Center for Public Health Genomics, University of Virginia
+             [2]
+    Email:  aaronquinlan at gmail dot com
+"""
+from cython.operator cimport dereference as deref
+import sys
+import subprocess
+from collections import defaultdict
+
+# Per-format map from attribute name to the column index of that attribute
+# in a split line.  The loop below adds the reverse mapping (column index ->
+# attribute name) to the same dicts, so each table can be queried both ways.
+# Where two names share a column ("end"/"stop"), the reverse entry ends up
+# holding whichever name is visited last.
+cdef dict LOOKUPS = {
+    "gff":  {"chrom": 0, "start": 3, "end": 4, "stop": 4, "strand": 6},
+    "vcf":  {"chrom": 0, "start": 1},
+    "bed":  {"chrom": 0, "start": 1, "end": 2, "stop": 2, "score": 4, "strand": 5}
+}
+for ktype, kdict in list(LOOKUPS.items()):
+    # Iterate over a snapshot because kdict is modified inside the loop.
+    for k, v in list(kdict.items()):
+        kdict[v] = k
+
+# Lookup table used by Interval.__richcmp__ to compare two features a and b.
+#
+# Keys are 4-tuples of three-way comparisons (-1, 0 or 1), in this order:
+#   (a.start vs b.start, a.stop vs b.stop, a.start vs b.stop, a.stop vs b.start)
+#
+# Values are the rich-comparison op codes that should return True for that
+# profile; every other operator returns False.  An empty tuple marks a
+# profile (nested features) for which the comparison is undefined.
+#
+# Op codes:
+# < 0 | <= 1 | == 2 | != 3 |  > 4 | >= 5
+PROFILES_TRUE = {
+                (0, 0, -1, 1): (2, 1, 5),  # a == b, a >= b, a <= b
+                # a  ---------
+                # b  ---------
+
+                (-1, -1, -1, -1): (0, 1),  # a < b, a <= b
+                # a ----
+                # b       -----
+
+                (-1, -1, -1, 0): (1,),  # a <= b
+                # a ----
+                # b     -----  (book-ended)
+
+                (1, 1, 0, 1): (5,),  # a >= b
+                # a     -----
+                # b ----      (book-ended)
+
+                (1, 1, 1, 1): (4, 5), # a > b, a >= b
+                # a       ------
+                # b ----
+
+                (0, 1, -1, 1): (5,),  # a >= b
+                # a  ------------
+                # b  ---------
+
+                (1, 0, -1, 1): (5,),  # a >= b
+                # a   -----------
+                # b -------------
+
+                (-1, 0, -1, 1): (1,),  # a <= b
+                # a -------------
+                # b   -----------
+
+                (0, -1, -1, 1): (1,), # a <= b
+                # a  ---------
+                # b  ------------
+
+                (-1, -1, -1, 1): (1,), # a <= b
+                # a -----------
+                # b        -----------
+
+                (1, 1, -1, 1): (5,),  # a >= b
+                # a        -----------
+                # b -----------
+
+                (1, -1, -1, 1): tuple(), # undef
+                # a    ----
+                # b -----------
+
+                (-1, 1, -1, 1): tuple(), # undef
+                # a -----------
+                # b    ----
+
+                (-1, 0, -1, 0): (1,),  # a <= b
+                # a -----------
+                # b           -
+
+                (1, 0, 0, 1): (5,),  # a >= b
+                # a           -
+                # b -----------
+
+                (0, 0, 0, 0): (1, 2, 5),  # a == b, a <= b, a >= b
+                # a -
+                # b -  (starts and stops are identical for all features)
+            }
+
+
+class MalformedBedLineError(Exception):
+    # Raised in this module when a line's format cannot be detected, when a
+    # feature's start is greater than its stop, or when the underlying file
+    # reader reports a malformed record.
+    pass
+
+
+class BedToolsFileError(Exception):
+    # Raised by IntervalFile.__next__ when the underlying file cannot be
+    # opened.
+    pass
+
+
+class Attributes(dict):
+    """
+    Class to map between a dict of attrs and fields[8] of a GFF Interval obj.
+
+    The attribute string is split on ';' into key/value pairs.  The key/value
+    separator is guessed from the string: '=' (GFF style) when there are at
+    least as many '=' as ';', otherwise a single space (GTF style).  Values
+    that were wrapped in double quotes are stored without the quotes and are
+    re-quoted when the object is converted back to a string.
+
+    Set `sort_keys` to True to emit the pairs in sorted order from __str__.
+    """
+
+    def __init__(self, attr_str=""):
+        attr_str = str(attr_str)
+        self._attr_str = attr_str
+        self.sort_keys = False
+
+        # in general, GFF files will have either as many '=' as ';'
+        # (or ';'-1 if there's no trailing ';')
+        n_semi = attr_str.count(';')
+        n_eq = attr_str.count('=')
+
+        if n_eq > n_semi - 1:
+            self.sep, self.field_sep = (';', '=')
+        else:
+            self.sep, self.field_sep = (';', ' ')
+
+        # Records which keys had a double-quoted value in the input.
+        self._quoted = {}
+
+        # TODO: pathological case . . . detect this as GFF:
+        #
+        #   class_code=" "
+        #
+        # and this as GTF:
+        #
+        #   class_code "="
+
+        # quick exit
+        if attr_str == "":
+            return
+
+        kvs = map(str.strip, attr_str.strip().split(self.sep))
+        # Empty chunks (e.g. after a trailing ';') are skipped.  A chunk that
+        # lacks the key/value separator raises ValueError on unpacking.
+        for field, value in [kv.split(self.field_sep, 1) for kv in kvs if kv]:
+            if value.count('"') == 2:
+                self._quoted[field] = True
+            self[field] = value.replace('"', '')
+
+    def __str__(self):
+        """
+        Serialize back to an attribute string, always ending with a
+        trailing separator.
+        """
+        # Stringify all items first: values assigned after parsing may be
+        # ints or other non-string objects, which str.join() would reject.
+        items = []
+        for field, val in self.items():
+            val = str(val)
+            if self._quoted.get(field):
+                val = '"' + val + '"'
+            items.append((field, val))
+
+        if self.sort_keys:
+            items.sort()
+
+        pairs = [self.field_sep.join([k, v]) for k, v in items]
+        return self.sep.join(pairs) + self.sep
+
+cdef class Interval:
+    """
+    Class to represent a genomic interval.
+
+    Constructor::
+
+        Interval(chrom, start, end, name=".", score=".", strand=".", otherfields=None)
+
+    Class to represent a genomic interval of any format.  Requires at least 3
+    args: chrom (string), start (int), end (int).
+
+    `start` is *always* the 0-based start coordinate.  If this Interval is to
+    represent a GFF object (which uses a 1-based coordinate system), then
+    subtract 1 from the 4th item in the line to get the start position in
+    0-based coords for this Interval.  The 1-based GFF coord will still be
+    available, albeit as a string, in fields[3].
+
+    `otherfields` is a list of fields that don't fit into the other kwargs, and
+    will be stored in the `fields` attribute of the Interval.
+
+    All the items in `otherfields` must be strings for proper conversion to
+    C++.
+
+    By convention, for BED files, `otherfields` is everything past the first 6
+    items in the line.  This allows an Interval to represent composite features
+    (e.g., a GFF line concatenated to the end of a BED line)
+
+    But for other formats (VCF, GFF, SAM), the entire line should be passed in
+    as a list for `otherfields` so that we can always check the
+    Interval.file_type and extract the fields we want, knowing that they'll be
+    in the right order as passed in with `otherfields`.
+
+    Example usage:
+
+        >>> from pybedtools import Interval
+        >>> i = Interval("chr1", 22, 44, strand='-')
+        >>> i
+        Interval(chr1:22-44)
+
+
+    """
+    def __init__(self, chrom, start, end, name=".", score=".", strand=".", otherfields=None):
+        if otherfields is None:
+            otherfields = []
+        otherfields = [_cppstr(i) for i in otherfields]
+        self._bed = new BED(
+            _cppstr(chrom), start, end, _cppstr(name), _cppstr(score),
+            _cppstr(strand), otherfields)
+
+        # The full field list is the six standard columns followed by
+        # whatever was passed in `otherfields`.
+        fields = [_cppstr(chrom), _cppstr(str(start)), _cppstr(str(end)), _cppstr(name), _cppstr(score), _cppstr(strand)]
+        fields.extend(otherfields)
+        self._bed.fields = fields
+        self._attrs = None
+
+    def __copy__(self):
+        """Return a new Interval built from this one's fields."""
+        return create_interval_from_list(self.fields)
+
+    def __hash__(self):
+        """Hash on the tab-joined fields."""
+        return hash("\t".join(self.fields))
+
+    property chrom:
+        """ the chromosome of the feature"""
+        def __get__(self):
+            return _pystr(self._bed.chrom)
+
+        def __set__(self, chrom):
+            # Resolve the column first so that a failed lookup leaves the
+            # interval untouched.
+            idx = LOOKUPS[self.file_type]["chrom"]
+            chrom = _cppstr(chrom)
+            self._bed.chrom = chrom
+            self._bed.fields[idx] = chrom
+
+    # < 0 | <= 1 | == 2 | != 3 |  > 4 | >= 5
+    def __richcmp__(self, other, int op):
+        """
+        Compare two Intervals using the PROFILES_TRUE table.
+
+        Features on different chromosomes or strands only satisfy `!=`.
+        Raises NotImplementedError for nested features and ValueError for a
+        start/stop profile that is missing from the table.
+        """
+        # Comparing against a foreign type used to fail with AttributeError;
+        # defer to Python's default handling instead.
+        if not isinstance(other, Interval):
+            return NotImplemented
+
+        if (self.chrom != other.chrom) or (self.strand != other.strand):
+            return op == 3
+
+        def cmp(x, y):
+            if x < y:
+                return -1
+            if x == y:
+                return 0
+            if x > y:
+                return 1
+
+        # check all 4 so that we can handle nesting and partial overlaps.
+        profile = (cmp(self.start, other.start),
+                   cmp(self.stop, other.stop),
+                   cmp(self.start, other.stop),
+                   cmp(self.stop, other.start))
+
+        try:
+            true_ops = PROFILES_TRUE[profile]
+        except KeyError:
+            raise ValueError('Currently unsupported comparison -- please '
+                             'submit a bug report')
+
+        if true_ops == tuple():
+            raise NotImplementedError('Features are nested -- comparison undefined')
+
+        # `!=` is not stored in the table; it is the negation of `==`.
+        if op == 3:
+            return 2 not in true_ops
+        return op in true_ops
+
+    property start:
+        """The 0-based start of the feature."""
+        def __get__(self):
+            return self._bed.start
+
+        def __set__(self, int start):
+            idx = LOOKUPS[self.file_type]["start"]
+            self._bed.start = start
+
+            # Non-BED files should have 1-based coords in fields
+            if self.file_type != 'bed':
+                start += 1
+            self._bed.fields[idx] = _cppstr(str(start))
+
+    property end:
+        """The end of the feature"""
+        def __get__(self):
+            return self._bed.end
+
+        def __set__(self, int end):
+            # Look up the column before mutating anything (see `stop`).
+            idx = LOOKUPS[self.file_type]["stop"]
+            self._bed.fields[idx] = _cppstr(str(end))
+            self._bed.end = end
+
+    property stop:
+        """ the end of the feature"""
+        def __get__(self):
+            return self._bed.end
+
+        def __set__(self, int end):
+            idx = LOOKUPS[self.file_type]["stop"]
+            self._bed.fields[idx] = _cppstr(str(end))
+            self._bed.end = end
+
+    property strand:
+        """ the strand of the feature"""
+        def __get__(self):
+            return _pystr(self._bed.strand)
+
+        def __set__(self, strand):
+            idx = LOOKUPS[self.file_type]["strand"]
+            self._bed.fields[idx] = _cppstr(strand)
+            self._bed.strand = _cppstr(strand)
+
+    property length:
+        """ the length of the feature"""
+        def __get__(self):
+            return self._bed.end - self._bed.start
+
+    cpdef deparse_attrs(self):
+        """
+        Write the cached Attributes object back into fields[8].
+
+        Does nothing when no attributes are cached (or the cached object is
+        empty).  Raises ValueError when attributes are present on a non-GFF
+        Interval.
+        """
+
+        if not self._attrs: return
+
+        if self.file_type != "gff":
+            raise ValueError('Interval.attrs was not None, but this was a non-GFF Interval')
+
+        s = self._attrs.__str__()
+        self._bed.fields[8] = _cppstr(s)
+
+    property fields:
+        """All fields of the line as a list of Python strings."""
+        def __get__(self):
+            # Make sure edits made through `attrs` are reflected first.
+            self.deparse_attrs()
+            items = []
+            for i in self._bed.fields:
+                if isinstance(i, int):
+                    items.append(i)
+                else:
+                    items.append(_pystr(i))
+            return items
+
+    property attrs:
+        """
+        Attributes parsed lazily from fields[8] for GFF features; an empty
+        Attributes object for every other file type.
+        """
+        def __get__(self):
+            if self._attrs is None:
+                ft = _pystr(self._bed.file_type)
+                if ft == 'gff':
+                    self._attrs = Attributes(_pystr(self._bed.fields[8]))
+                else:
+                    self._attrs = Attributes("")
+            return self._attrs
+
+        def __set__(self, attrs):
+            self._attrs = attrs
+
+    # TODO: make this more robust.
+    @property
+    def count(self):
+        """The last field converted to an int."""
+        return int(self.fields[-1])
+
+    property name:
+        """
+        >>> import pybedtools
+        >>> vcf = pybedtools.example_bedtool('v.vcf')
+        >>> [v.name for v in vcf]
+        ['rs6054257', 'chr1:16', 'rs6040355', 'chr1:222', 'microsat1']
+
+        """
+        def __get__(self):
+            cdef string ftype = self._bed.file_type
+            value = None
+            if ftype == <string>"gff":
+                """
+                # TODO. allow setting a name_key in the BedTool constructor?
+                if self.name_key and self.name_key in attrs:
+                    return attrs[self.name_key]
+                """
+                # The first of these keys present in the attributes wins.
+                for key in ("ID", "Name", "gene_name", "transcript_id", \
+                            "gene_id", "Parent"):
+                    if key in self.attrs:
+                        value = self.attrs[key]
+                        break
+
+            elif ftype == <string>"vcf":
+                s = self.fields[2]
+                if s in ("", "."):
+                    value = "%s:%i" % (self.chrom, self.start)
+                else:
+                    # `self.fields` already yields Python strings, so no
+                    # further conversion through _pystr() is needed.
+                    value = s
+            elif ftype == <string>"bed":
+                value = _pystr(self._bed.name)
+
+            return value
+
+        def __set__(self, value):
+            cdef string ftype = self._bed.file_type
+
+            if ftype == <string>"gff":
+                for key in ("ID", "Name", "gene_name", "transcript_id", \
+                            "gene_id", "Parent"):
+                    if not key in self.attrs:
+                        continue
+
+                    # If it's incoming from Python it's unicode, so store that directly
+                    # in the attributes (since an Attribute object works on
+                    # unicode)...
+                    self.attrs[key] = value
+                    break
+
+            # Otherwise use _cppstr() because we're storing it in _bed.fields.
+            elif ftype == <string>"vcf":
+                self._bed.fields[2] = _cppstr(value)
+            else:
+                self._bed.name = _cppstr(value)
+                self._bed.fields[3] = _cppstr(value)
+
+    property score:
+        """The score of the feature, as a string."""
+        def __get__(self):
+            return _pystr(self._bed.score)
+
+        def __set__(self, value):
+            # LOOKUPS only defines a "score" column for BED; look it up
+            # before mutating so a KeyError leaves the interval unchanged.
+            idx = LOOKUPS[self.file_type]["score"]
+            value = _cppstr(value)
+            self._bed.score = value
+            self._bed.fields[idx] = value
+
+    property file_type:
+        "bed/vcf/gff"
+        def __get__(self):
+            return _pystr(self._bed.file_type)
+
+        def __set__(self, value):
+            self._bed.file_type = _cppstr(value)
+
+    # TODO: maybe bed.overlap_start or bed.overlap.start ??
+    @property
+    def o_start(self):
+        return self._bed.o_start
+
+    @property
+    def o_end(self):
+        return self._bed.o_end
+
+    @property
+    def o_amt(self):
+        return self._bed.o_end - self._bed.o_start
+
+    def __str__(self):
+        """
+        Interval objects always print with a newline to mimic a line in a
+        BED/GFF/VCF file
+        """
+        items = []
+        for i in self.fields:
+            if isinstance(i, int):
+                i = str(i)
+            items.append(i)
+
+        return '\t'.join(items) + '\n'
+
+    def __repr__(self):
+        return "Interval(%s:%i-%i)" % (self.chrom, self.start, self.end)
+
+    def __dealloc__(self):
+        del self._bed
+
+    def __len__(self):
+        return self._bed.end - self._bed.start
+
+    def __getitem__(self, object key):
+        """
+        Look up a field by integer index or slice, or an attribute by name.
+
+        For GFF features a string key is tried against `attrs` first and
+        falls back to an attribute of the Interval itself.  Raises IndexError
+        for an integer index outside the field list.
+        """
+        cdef int i
+        ftype = _pystr(self._bed.file_type)
+
+        self.deparse_attrs()
+
+        if isinstance(key, (int, long)):
+            nfields = self._bed.fields.size()
+            if key < 0:
+                key = nfields + key
+            # Check after normalizing so that indices below -nfields are
+            # rejected as well.
+            if key < 0 or key >= nfields:
+                raise IndexError('field index out of range')
+            return _pystr(self._bed.fields.at(key))
+        elif isinstance(key, slice):
+            indices = key.indices(self._bed.fields.size())
+            return [_pystr(self._bed.fields.at(i)) for i in range(*indices)]
+
+        elif isinstance(key, str):
+            if ftype == "gff":
+                try:
+                    return self.attrs[key]
+                except KeyError:
+                    pass
+            # We don't have to convert using _pystr() because the __get__
+            # methods do that already.
+            return getattr(self, key)
+
+    def __setitem__(self, object key, object value):
+        """
+        Set a field by integer index, or an attribute by name.
+
+        When the index corresponds to a named column for this file type, the
+        matching attribute is updated too.  Raises IndexError for an integer
+        index outside the field list.
+        """
+        if isinstance(key, (int, long)):
+            nfields = self._bed.fields.size()
+            if key < 0:
+                key = nfields + key
+            # The vector is written with unchecked indexing below, so an
+            # index below -nfields must be rejected here.
+            if key < 0 or key >= nfields:
+                raise IndexError('field index out of range')
+            self._bed.fields[key] = _cppstr(value)
+
+            ft = _pystr(self._bed.file_type)
+            if key in LOOKUPS[ft]:
+                setattr(self, LOOKUPS[ft][key], value)
+
+        elif isinstance(key, (basestring)):
+            setattr(self, key, value)
+
+    cpdef append(self, object value):
+        """Append `value` as a new last field."""
+        self._bed.fields.push_back(_cppstr(value))
+
+    def __nonzero__(self):
+        # Always truthy, even though __len__ can return 0.
+        return True
+
+
+cdef Interval create_interval(BED b):
+    # Wrap a C++ BED record in a new Python-level Interval.
+    #
+    # Interval.__new__ is used so that Interval.__init__ does not run; the
+    # wrapped object is a fresh heap-allocated BED built from the members of
+    # `b`, which Interval.__dealloc__ later deletes.
+    cdef Interval pyb = Interval.__new__(Interval)
+    pyb._bed = new BED(b.chrom, b.start, b.end, b.name,
+                       b.score, b.strand, b.fields,
+                       b.o_start, b.o_end, b.bedType, b.file_type, b.status)
+    pyb._bed.fields = b.fields
+    return pyb
+
+# TODO: optimization: Previously we had (fields[1] + fields[2]).isdigit() when
+# checking in create_interval_from_list for filetype heuristics. Is there
+# a performance hit by checking instances?
+cdef isdigit(s):
+    # True for anything in `integer_types`; otherwise defer to the value's
+    # own isdigit() method.
+    return True if isinstance(s, integer_types) else s.isdigit()
+
+
+cpdef Interval create_interval_from_list(list fields):
+    """
+    Create an Interval object from a list of strings.
+
+    Constructor::
+
+        create_interval_from_list(fields)
+
+    Given the list of strings, `fields`, automatically detects the format (BED,
+    GFF, VCF, SAM) and creates a new Interval object.
+
+    `fields` is a list with an arbitrary number of items (it can be quite long,
+    say after a -wao intersection of a BED12 and a GFF), however, the first
+    fields must conform to one of the supported formats.  For example, if you
+    want the resulting Interval to be considered a GFF feature, then the first
+    9 fields must conform to the GFF format.  Similarly, if you want the
+    resulting Interval to be considered a BED feature, then the first three
+    fields must be chrom, start, stop.
+
+    The list passed in is not modified.
+
+    Raises MalformedBedLineError when no format matches or when the detected
+    start is greater than the stop.  Lists too short to index may raise
+    IndexError instead.
+
+    Example usage:
+
+        >>> # Creates a BED3 feature
+        >>> feature = create_interval_from_list(['chr1', '1', '100'])
+
+    """
+
+    # TODO: this function is used a lot, and is doing a bit of work. We should
+    # have an optimized version that is directly provided the filetype.
+
+    cdef Interval pyb = Interval.__new__(Interval)
+    orig_fields = fields[:]
+
+    # Work on a private copy: some branches below pad or rewrite the list,
+    # and the caller's list must not change as a side effect.
+    fields = fields[:]
+
+    # BED -- though a VCF will be detected as BED if its 2nd field, id, is a
+    # digit
+
+    # SAM
+    if (
+        (len(fields) >= 11)
+        and isdigit(fields[1])
+        and isdigit(fields[3])
+        and isdigit(fields[4])
+        and (fields[5] not in ['.', '+', '-'])
+    ):
+        # TODO: what should the stop position be?  Here, it's just the start
+        # plus the length of the sequence, but perhaps this should eventually
+        # do CIGAR string parsing.
+        flag = int(fields[1])
+        if flag & 0x04:
+            # handle unmapped reads
+            chrom = _cppstr("*")
+            start = 0
+            stop = 0
+        else:
+            chrom = _cppstr(fields[2])
+            start = int(fields[3]) - 1
+            stop = int(fields[3]) + len(fields[9]) - 1
+        name = _cppstr(fields[0])
+        score = _cppstr(fields[1])
+        if flag & 0x10:
+            strand = _cppstr('-')
+        else:
+            strand = _cppstr('+')
+
+        # Fields is in SAM format
+        fields[3] = str(start + 1)
+
+        # NOTE(review): every other BED(...) call in this file passes
+        # (name, score, strand) in that order; this branch used to pass
+        # (strand, name, score), which put the strand into the name slot.
+        # Confirm against the BED constructor declaration.
+        pyb._bed = new BED(
+            chrom,
+            start,
+            stop,
+            name,
+            score,
+            strand,
+            list_to_vector(fields))
+        pyb.file_type = _cppstr('sam')
+
+
+    elif isdigit(fields[1]) and isdigit(fields[2]):
+        # if it's too short, just add some empty fields (a non-positive
+        # multiplier yields an empty list, so longer lines are unaffected).
+        fields.extend(["."] * (6 - len(fields)))
+        other_fields = fields[6:]
+
+        pyb._bed = new BED(
+            _cppstr(fields[0]),
+            int(fields[1]),
+            int(fields[2]),
+            _cppstr(fields[3]),
+            _cppstr(fields[4]),
+            _cppstr(fields[5]),
+            list_to_vector(other_fields))
+        pyb.file_type = _cppstr('bed')
+
+    # VCF
+    elif isdigit(fields[1]) and not isdigit(fields[3]) and len(fields) >= 8:
+        # The feature spans the single 1-based position in fields[1].
+        pyb._bed = new BED(
+            _cppstr(fields[0]),
+            int(fields[1]) - 1,
+            int(fields[1]),
+            _cppstr(fields[2]),
+            _cppstr(fields[5]),
+            _cppstr('.'),
+            list_to_vector(fields))
+        pyb.file_type = _cppstr('vcf')
+
+
+    # GFF
+    elif len(fields) >= 9 and isdigit(fields[3]) and isdigit(fields[4]):
+        # GFF starts are 1-based, so shift to the 0-based start used here.
+        pyb._bed = new BED(
+            _cppstr(fields[0]),
+            int(fields[3])-1, int(fields[4]),
+            _cppstr(fields[2]),
+            _cppstr(fields[5]),
+            _cppstr(fields[6]),
+            list_to_vector(fields[7:]))
+        pyb.file_type = _cppstr('gff')
+    else:
+        raise MalformedBedLineError('Unable to detect format from %s' % fields)
+
+    if pyb.start > pyb.end:
+        raise MalformedBedLineError("Start is greater than stop")
+
+    # Whatever the branch stored, the Interval's fields are the original,
+    # unpadded input.
+    pyb._bed.fields = list_to_vector(orig_fields)
+    return pyb
+
+cdef vector[string] list_to_vector(list li):
+    # Build a C++ vector of strings from a Python list, converting each item
+    # with _cppstr().
+    cdef vector[string] converted
+    for item in li:
+        converted.push_back(_cppstr(item))
+    return converted
+
+cdef list string_vec2list(vector[string] sv):
+    # Convert a C++ vector of strings into a list of Python (unicode)
+    # strings via _pystr().
+    cdef size_t size = sv.size(), i
+    return [_pystr(sv.at(i)) for i in range(size)]
+
+cdef list bed_vec2list(vector[BED] bv):
+    # Convert a C++ vector of BED records into a list of Interval objects.
+    cdef size_t n_items = bv.size(), idx
+    cdef list intervals = []
+    for idx in range(n_items):
+        intervals.append(create_interval(bv.at(idx)))
+    return intervals
+
+
+def overlap(int s1, int s2, int e1, int e2):
+    """
+    Return the signed overlap between the intervals (s1, e1) and (s2, e2).
+
+    Note the argument order: both starts come first, then both ends.  The
+    result is the smaller end minus the larger start, so it is positive when
+    the intervals overlap and zero or negative when they do not (assuming
+    each start is not greater than its end).
+    """
+    return min(e1, e2) - max(s1, s2)
+
+
+cdef class IntervalIterator:
+    # Wraps an iterator of lines, field sequences or Interval objects and
+    # yields Interval objects.
+    cdef object stream
+    cdef int _itemtype
+    def __init__(self, stream):
+        self.stream = stream
+
+        # Item kind, cached as an int so isinstance() runs only once:
+        # -1 is unset, 0 assumes list/tuple/iterable, 1 is a string and 2 is
+        # an Interval.
+        #
+        # All items in `stream` are assumed to be of the same kind.
+        self._itemtype = -1
+
+    def __dealloc__(self):
+        # Best effort: not every stream has a close() method.
+        try:
+            self.stream.close()
+        except AttributeError:
+            pass
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        source = self.stream
+        while True:
+            # A stream that is already closed is treated as exhausted.
+            if hasattr(source, 'closed') and source.closed:
+                raise StopIteration
+            try:
+                line = next(source)
+            except StopIteration:
+                # Close the stream on exhaustion when it supports closing.
+                if hasattr(source, 'close'):
+                    source.close()
+                raise StopIteration
+
+            # Detect the item kind from the first item seen.
+            if self._itemtype < 0:
+                if isinstance(line, Interval):
+                    self._itemtype = 2
+                elif isinstance(line, basestring):
+                    self._itemtype = 1
+                else:
+                    self._itemtype = 0
+
+            # Header, comment, track/browser and blank lines are skipped, but
+            # only when the items are strings.
+            skippable = (
+                self._itemtype == 1
+                and (line.startswith(('@', '#', 'track', 'browser'))
+                     or len(line.strip()) == 0)
+            )
+            if not skippable:
+                break
+
+        # Iterable of Interval objects: hand them through untouched
+        if self._itemtype == 2:
+            return line
+
+        if self._itemtype == 1:
+            # Iterable of strings: split the line on tabs
+            fields = line.rstrip('\r\n').split('\t')
+        else:
+            # Otherwise assume list/tuple/iterable of fields
+            fields = list(line)
+
+        # TODO: optimization: create_interval_from_list should have a version
+        # that accepts C++ string instances
+        return create_interval_from_list(fields)
+
+
+
+cdef class IntervalFile:
+    """
+    An IntervalFile provides low-level access to the BEDTools API.
+
+    >>> fn = pybedtools.example_filename('a.bed')
+    >>> intervalfile = pybedtools.IntervalFile(fn)
+
+    """
+    cdef BedFile *intervalFile_ptr
+    cdef bint _loaded
+    cdef bint _open
+    cdef string _fn
+
+    def __init__(self, intervalFile):
+        self.intervalFile_ptr = new BedFile(_cppstr(intervalFile))
+        self._loaded = 0
+        self._open = 0
+        self._fn = _cppstr(intervalFile)
+
+    def __dealloc__(self):
+        del self.intervalFile_ptr
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """
+        Return the next valid feature as an Interval.
+
+        The file is opened on first use.  Raises BedToolsFileError if it
+        cannot be opened and MalformedBedLineError on a malformed record;
+        in the latter case, and at the end of iteration, the file is closed.
+        """
+        cdef BED b
+        if not self._open:
+            result = self.intervalFile_ptr.Open()
+            if result == -1:
+                raise BedToolsFileError("Error opening file")
+            self._open = 1
+
+        # Loop instead of recursing: records with any other status are
+        # skipped, and a long run of them must not exhaust the recursion
+        # limit.
+        while True:
+            b = self.intervalFile_ptr.GetNextBed()
+            if b.status == BED_VALID:
+                return create_interval(b)
+            if b.status == BED_INVALID:
+                self.intervalFile_ptr.Close()
+                raise StopIteration
+            if b.status == BED_MALFORMED:
+                self.intervalFile_ptr.Close()
+                raise MalformedBedLineError("malformed line: %s" % string_vec2list(b.fields))
+
+    @property
+    def fn(self):
+        return _pystr(self._fn)
+
+    @property
+    def file_type(self):
+        """
+        The file type as reported by the underlying file object.
+
+        If the type is not known yet, one feature is read to detect it and
+        the file is closed again.  Raises ValueError when the first line
+        looks like SAM, which IntervalFile does not support natively.
+        """
+        # Previously this fell through and returned None once the type was
+        # known.  NOTE(review): assumes file_type is populated whenever
+        # _typeIsKnown is set -- confirm against the BedFile implementation.
+        if self.intervalFile_ptr._typeIsKnown:
+            return _pystr(self.intervalFile_ptr.file_type)
+
+        try:
+            next(iter(self))
+            file_type = _pystr(self.intervalFile_ptr.file_type)
+            self.intervalFile_ptr.Close()
+            return file_type
+        except MalformedBedLineError:
+            # If it's a SAM, raise a meaningful exception.  If not, fail.
+            with open(self.fn) as fn:
+                interval = create_interval_from_list(fn.readline().strip().split())
+            if interval.file_type == 'sam':
+                raise ValueError('IntervalFile objects do not yet natively support SAM. '
+                                 'Please convert to BED/GFF/VCF first if you want to '
+                                 'use the low-level API of IntervalFile')
+            else:
+                raise
+
+
+    def loadIntoMap(self):
+        """
+        Prepares file for checking intersections.  Used by other methods like all_hits()
+        """
+        if self._loaded:
+            return
+        self.intervalFile_ptr.loadBedFileIntoMap()
+        self._loaded = 1
+
+    def rewind(self):
+        """
+        Jump to the beginning of the file.
+        """
+        if not self._open:
+            self.intervalFile_ptr.Open()
+            self._open = 1
+        self.intervalFile_ptr.Rewind()
+
+    def seek(self, offset):
+        """
+        Jump to a specific byte offset in the file
+        """
+        if not self._open:
+            self.intervalFile_ptr.Open()
+            self._open = 1
+        self.intervalFile_ptr.Seek(offset)
+
+
+    def all_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
+        """
+        :Signature: `IntervalFile.all_hits(interval, same_strand=False, overlap=0.0)`
+
+        Search for the Interval `interval` in this file and return **all**
+        overlaps as a list.
+
+        `same_strand`, if True, will only consider hits on the same strand as `interval`.
+
+        `overlap` can be used to specify the fraction of overlap between
+        `interval` and each feature in the IntervalFile.
+
+        Example usage:
+
+        >>> fn = pybedtools.example_filename('a.bed')
+
+        >>> # create an Interval to query with
+        >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
+
+        >>> # Create an IntervalFile out of a.bed
+        >>> intervalfile = pybedtools.IntervalFile(fn)
+
+        >>> # get stranded hits
+        >>> intervalfile.all_hits(i, same_strand=True)
+        [Interval(chr1:1-100), Interval(chr1:100-200), Interval(chr1:900-950)]
+
+        """
+        cdef vector[BED] vec_b
+        self.loadIntoMap()
+
+        if not same_strand:
+            vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), overlap)
+        else:
+            vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), same_strand, overlap)
+        return bed_vec2list(vec_b)
+
+    # search() is an alias for all_hits
+    search = all_hits
+
+    def any_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
+        """
+        :Signature: `IntervalFile.any_hits(interval, same_strand=False, overlap=0.0)`
+
+        Return 1 if the Interval `interval` had >=1 hit in this IntervalFile, 0 otherwise.
+
+        `same_strand`, if True, will only consider hits on the same strand as `interval`.
+
+        `overlap` can be used to specify the fraction of overlap between
+        `interval` and each feature in the IntervalFile.
+
+        Example usage:
+
+        >>> fn = pybedtools.example_filename('a.bed')
+
+        >>> # create an Interval to query with
+        >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
+
+        >>> # Create an IntervalFile out of a.bed
+        >>> intervalfile = pybedtools.IntervalFile(fn)
+
+        >>> # any stranded hits?
+        >>> intervalfile.any_hits(i, same_strand=True)
+        1
+
+        """
+        self.loadIntoMap()
+
+        if not same_strand:
+            return self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), overlap)
+        return self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), same_strand, overlap)
+
+    def count_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
+        """
+        :Signature: `IntervalFile.count_hits(interval, same_strand=False, overlap=0.0)`
+
+        Return the number of overlaps the Interval `interval` had with this
+        IntervalFile.
+
+        `same_strand`, if True, will only consider hits on the same strand as
+        `interval`.
+
+        `overlap` can be used to specify the fraction of overlap between
+        `interval` and each feature in the IntervalFile.
+
+        Example usage:
+
+        >>> fn = pybedtools.example_filename('a.bed')
+
+        >>> # create an Interval to query with
+        >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
+
+        >>> # Create an IntervalFile out of a.bed
+        >>> intervalfile = pybedtools.IntervalFile(fn)
+
+        >>> # get number of stranded hits
+        >>> intervalfile.count_hits(i, same_strand=True)
+        3
+
+        """
+        self.loadIntoMap()
+
+        if not same_strand:
+            return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), overlap)
+        return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), same_strand, overlap)
author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children