jpayne@68: # distutils: language = c++ jpayne@68: # cython: language_level=2 jpayne@68: jpayne@68: # String notes: jpayne@68: # jpayne@68: # Anything that goes in C++ objects should be converted to a C++ jpayne@68: # type, using the _cppstr() function. For example: Interval._bed.file_type, jpayne@68: # or the entries in Interval._bed.fields. jpayne@68: # jpayne@68: # Any Python accessor methods (Interval.fields, Interval.__getitem__) should jpayne@68: # then be converted to Python strings using the _pystr() function. jpayne@68: # jpayne@68: # Cython uses the `str` type as whatever the native Python version uses as jpayne@68: # str. jpayne@68: jpayne@68: from libcpp.string cimport string jpayne@68: import numpy as np jpayne@68: jpayne@68: # Python byte strings automatically coerce to/from C++ strings. jpayne@68: jpayne@68: cdef _cppstr(s): jpayne@68: # Use this to handle incoming strings from Python. jpayne@68: # jpayne@68: # C++ uses bytestrings. PY2 strings need no conversion; bare PY3 strings jpayne@68: # are unicode and so must be encoded to bytestring. jpayne@68: if isinstance(s, integer_types): jpayne@68: s = str(s) jpayne@68: if isinstance(s, unicode): jpayne@68: s = s.encode('UTF-8') jpayne@68: return s jpayne@68: jpayne@68: cdef _pystr(string s): jpayne@68: # Use this to prepare a string for sending to Python. jpayne@68: # jpayne@68: # Always returns unicode. jpayne@68: return s.decode('UTF-8', 'strict') jpayne@68: jpayne@68: integer_types = (int, long, np.int64) jpayne@68: jpayne@68: jpayne@68: """ jpayne@68: bedtools.pyx: A Cython wrapper for the BEDTools BedFile class jpayne@68: jpayne@68: Authors: Aaron Quinlan[1], Brent Pedersen[2] jpayne@68: Affl: [1] Center for Public Health Genomics, University of Virginia jpayne@68: [2] jpayne@68: Email: aaronquinlan at gmail dot com jpayne@68: """ jpayne@68: from cython.operator cimport dereference as deref jpayne@68: import sys jpayne@68: import subprocess jpayne@68: from collections import defaultdict jpayne@68: jpayne@68: cdef dict LOOKUPS = { jpayne@68: "gff": {"chrom": 0, "start": 3, "end": 4, "stop": 4, "strand": 6}, jpayne@68: "vcf": {"chrom": 0, "start": 1}, jpayne@68: "bed": {"chrom": 0, "start": 1, "end": 2, "stop": 2, "score": 4, "strand": 5} jpayne@68: } jpayne@68: for ktype, kdict in list(LOOKUPS.items()): jpayne@68: for k, v in list(kdict.items()): jpayne@68: kdict[v] = k jpayne@68: jpayne@68: # Keys are tuples of start/start, stop/stop, start/stop, stop/start. jpayne@68: # Values are which operators should return True, otherwise False jpayne@68: # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5 jpayne@68: PROFILES_TRUE = { jpayne@68: (0, 0, -1, 1): (2, 1, 5), # a == b, a >= b, a <= b jpayne@68: # a --------- jpayne@68: # b --------- jpayne@68: jpayne@68: (-1, -1, -1, -1): (0, 1), # a < b, a <= b jpayne@68: # a ---- jpayne@68: # b ----- jpayne@68: jpayne@68: (-1, -1, -1, 0): (1,), # a <= b jpayne@68: # a ---- jpayne@68: # b ----- (book-ended) jpayne@68: jpayne@68: (1, 1, 0, 1): (5,), # a >= b jpayne@68: # a ----- jpayne@68: # b ---- (book-ended) jpayne@68: jpayne@68: (1, 1, 1, 1): (4, 5), # a > b, a >= b jpayne@68: # a ------ jpayne@68: # b ---- jpayne@68: jpayne@68: (0, 1, -1, 1): (5,), # a >= b jpayne@68: # a ------------ jpayne@68: # b --------- jpayne@68: jpayne@68: (1, 0, -1, 1): (5,), # a >= b jpayne@68: # a ----------- jpayne@68: # b ------------- jpayne@68: jpayne@68: (-1, 0, -1, 1): (1,), # a <= b jpayne@68: # a ------------- jpayne@68: # b ----------- jpayne@68: jpayne@68: (0, -1, -1, 1): (1,), # a <= b jpayne@68: # a --------- jpayne@68: # b ------------ jpayne@68: jpayne@68: (-1, -1, -1, 1): (1,), # a <= b jpayne@68: # a ----------- jpayne@68: # b ----------- jpayne@68: jpayne@68: (1, 1, -1, 1): (5,), # a >= b jpayne@68: # a ----------- jpayne@68: # b ----------- jpayne@68: jpayne@68: (1, -1, -1, 1): tuple(), # undef jpayne@68: # a ---- jpayne@68: # b ----------- jpayne@68: jpayne@68: (-1, 1, -1, 1): tuple(), # undef jpayne@68: # a ----------- jpayne@68: # b ---- jpayne@68: jpayne@68: (-1, 0, -1, 0): (1,), # a <= b jpayne@68: # a ----------- jpayne@68: # b - jpayne@68: jpayne@68: (1, 0, 0, 1): (5,), # a >= b jpayne@68: # a - jpayne@68: # b ----------- jpayne@68: jpayne@68: (0, 0, 0, 0): (1, 2, 5), # a == b, a <= b, a >= b jpayne@68: # a - jpayne@68: # b - (starts and stops are identical for all features) jpayne@68: } jpayne@68: jpayne@68: jpayne@68: class MalformedBedLineError(Exception): jpayne@68: pass jpayne@68: jpayne@68: jpayne@68: class BedToolsFileError(Exception): jpayne@68: pass jpayne@68: jpayne@68: jpayne@68: class Attributes(dict): jpayne@68: """ jpayne@68: Class to map between a dict of attrs and fields[8] of a GFF Interval obj. jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, attr_str=""): jpayne@68: attr_str = str(attr_str) jpayne@68: self._attr_str = attr_str jpayne@68: self.sort_keys = False jpayne@68: jpayne@68: # in general, GFF files will have either as many '=' as ';' jpayne@68: # (or ';'-1 if there's no trailing ';') jpayne@68: n_semi = attr_str.count(';') jpayne@68: n_eq = attr_str.count('=') jpayne@68: n_quotes = attr_str.count('"') jpayne@68: jpayne@68: if n_eq > n_semi - 1: jpayne@68: self.sep, self.field_sep = (';', '=') jpayne@68: else: jpayne@68: self.sep, self.field_sep = (';', ' ') jpayne@68: jpayne@68: self._quoted = {} jpayne@68: jpayne@68: # TODO: pathological case . . . detect this as GFF: jpayne@68: # jpayne@68: # class_code=" " jpayne@68: # jpayne@68: # and this as GTF: jpayne@68: # jpayne@68: # class_code "=" jpayne@68: jpayne@68: # quick exit jpayne@68: if attr_str == "": jpayne@68: return jpayne@68: jpayne@68: kvs = map(str.strip, attr_str.strip().split(self.sep)) jpayne@68: for field, value in [kv.split(self.field_sep, 1) for kv in kvs if kv]: jpayne@68: if value.count('"') == 2: jpayne@68: self._quoted[field] = True jpayne@68: self[field] = value.replace('"', '') jpayne@68: jpayne@68: def __str__(self): jpayne@68: # stringify all items first jpayne@68: items = [] jpayne@68: for field, val in dict.iteritems(self): jpayne@68: try: jpayne@68: if self._quoted[field]: jpayne@68: val = '"' + str(val) + '"' jpayne@68: except KeyError: jpayne@68: pass jpayne@68: items.append((field, val)) jpayne@68: jpayne@68: pairs = [] jpayne@68: if self.sort_keys: jpayne@68: items.sort() jpayne@68: for k, v in items: jpayne@68: pairs.append(self.field_sep.join([k, v])) jpayne@68: jpayne@68: return self.sep.join(pairs) + self.sep jpayne@68: jpayne@68: cdef class Interval: jpayne@68: """ jpayne@68: Class to represent a genomic interval. jpayne@68: jpayne@68: Constructor:: jpayne@68: jpayne@68: Interval(chrom, start, end, name=".", score=".", strand=".", otherfields=None) jpayne@68: jpayne@68: Class to represent a genomic interval of any format. Requires at least 3 jpayne@68: args: chrom (string), start (int), end (int). jpayne@68: jpayne@68: `start` is *always* the 0-based start coordinate. If this Interval is to jpayne@68: represent a GFF object (which uses a 1-based coordinate system), then jpayne@68: subtract 1 from the 4th item in the line to get the start position in jpayne@68: 0-based coords for this Interval. The 1-based GFF coord will still be jpayne@68: available, albeit as a string, in fields[3]. jpayne@68: jpayne@68: `otherfields` is a list of fields that don't fit into the other kwargs, and jpayne@68: will be stored in the `fields` attribute of the Interval. jpayne@68: jpayne@68: All the items in `otherfields` must be strings for proper conversion to jpayne@68: C++. jpayne@68: jpayne@68: By convention, for BED files, `otherfields` is everything past the first 6 jpayne@68: items in the line. This allows an Interval to represent composite features jpayne@68: (e.g., a GFF line concatenated to the end of a BED line) jpayne@68: jpayne@68: But for other formats (VCF, GFF, SAM), the entire line should be passed in jpayne@68: as a list for `otherfields` so that we can always check the jpayne@68: Interval.file_type and extract the fields we want, knowing that they'll be jpayne@68: in the right order as passed in with `otherfields`. jpayne@68: jpayne@68: Example usage: jpayne@68: jpayne@68: >>> from pybedtools import Interval jpayne@68: >>> i = Interval("chr1", 22, 44, strand='-') jpayne@68: >>> i jpayne@68: Interval(chr1:22-44) jpayne@68: jpayne@68: jpayne@68: """ jpayne@68: def __init__(self, chrom, start, end, name=".", score=".", strand=".", otherfields=None): jpayne@68: if otherfields is None: jpayne@68: otherfields = [] jpayne@68: otherfields = [_cppstr(i) for i in otherfields] jpayne@68: self._bed = new BED( jpayne@68: _cppstr(chrom), start, end, _cppstr(name), _cppstr(score), jpayne@68: _cppstr(strand), otherfields) jpayne@68: jpayne@68: #self._bed.chrom = _cppstr(chrom) jpayne@68: #self._bed.start = start jpayne@68: #self._bed.end = end jpayne@68: #self._bed.name = _cppstr(name) jpayne@68: #self._bed.score = _cppstr(score) jpayne@68: #self._bed.strand = _cppstr(strand) jpayne@68: fields = [_cppstr(chrom), _cppstr(str(start)), _cppstr(str(end)), _cppstr(name), _cppstr(score), _cppstr(strand)] jpayne@68: fields.extend(otherfields) jpayne@68: self._bed.fields = fields jpayne@68: self._attrs = None jpayne@68: jpayne@68: def __copy__(self): jpayne@68: return create_interval_from_list(self.fields) jpayne@68: jpayne@68: def __hash__(self): jpayne@68: return hash("\t".join(self.fields)) jpayne@68: jpayne@68: property chrom: jpayne@68: """ the chromosome of the feature""" jpayne@68: def __get__(self): jpayne@68: return _pystr(self._bed.chrom) jpayne@68: jpayne@68: def __set__(self, chrom): jpayne@68: chrom = _cppstr(chrom) jpayne@68: self._bed.chrom = chrom jpayne@68: idx = LOOKUPS[self.file_type]["chrom"] jpayne@68: self._bed.fields[idx] = _cppstr(chrom) jpayne@68: jpayne@68: # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5 jpayne@68: def __richcmp__(self, other, int op): jpayne@68: if (self.chrom != other.chrom) or (self.strand != other.strand): jpayne@68: if op == 3: return True jpayne@68: return False jpayne@68: jpayne@68: def cmp(x, y): jpayne@68: if x < y: jpayne@68: return -1 jpayne@68: if x == y: jpayne@68: return 0 jpayne@68: if x > y: jpayne@68: return 1 jpayne@68: jpayne@68: jpayne@68: # check all 4 so that we can handle nesting and partial overlaps. jpayne@68: profile = (cmp(self.start, other.start), jpayne@68: cmp(self.stop, other.stop), jpayne@68: cmp(self.start, other.stop), jpayne@68: cmp(self.stop, other.start)) jpayne@68: jpayne@68: try: jpayne@68: if PROFILES_TRUE[profile] == tuple(): jpayne@68: raise NotImplementedError('Features are nested -- comparison undefined') jpayne@68: jpayne@68: if op != 3: jpayne@68: if op in PROFILES_TRUE[profile]: jpayne@68: return True jpayne@68: return False jpayne@68: else: jpayne@68: if 2 in PROFILES_TRUE[profile]: jpayne@68: return False jpayne@68: return True jpayne@68: except KeyError: jpayne@68: raise ValueError('Currently unsupported comparison -- please ' jpayne@68: 'submit a bug report') jpayne@68: jpayne@68: property start: jpayne@68: """The 0-based start of the feature.""" jpayne@68: def __get__(self): jpayne@68: return self._bed.start jpayne@68: jpayne@68: def __set__(self, int start): jpayne@68: self._bed.start = start jpayne@68: idx = LOOKUPS[self.file_type]["start"] jpayne@68: jpayne@68: # Non-BED files should have 1-based coords in fields jpayne@68: if self.file_type != 'bed': jpayne@68: start += 1 jpayne@68: self._bed.fields[idx] = _cppstr(str(start)) jpayne@68: jpayne@68: property end: jpayne@68: """The end of the feature""" jpayne@68: def __get__(self): jpayne@68: return self._bed.end jpayne@68: jpayne@68: def __set__(self, int end): jpayne@68: self._bed.end = end jpayne@68: idx = LOOKUPS[self.file_type]["stop"] jpayne@68: self._bed.fields[idx] = _cppstr(str(end)) jpayne@68: jpayne@68: property stop: jpayne@68: """ the end of the feature""" jpayne@68: def __get__(self): jpayne@68: return self._bed.end jpayne@68: jpayne@68: def __set__(self, int end): jpayne@68: idx = LOOKUPS[self.file_type]["stop"] jpayne@68: self._bed.fields[idx] = _cppstr(str(end)) jpayne@68: self._bed.end = end jpayne@68: jpayne@68: property strand: jpayne@68: """ the strand of the feature""" jpayne@68: def __get__(self): jpayne@68: return _pystr(self._bed.strand) jpayne@68: jpayne@68: def __set__(self, strand): jpayne@68: idx = LOOKUPS[self.file_type]["strand"] jpayne@68: self._bed.fields[idx] = _cppstr(strand) jpayne@68: self._bed.strand = _cppstr(strand) jpayne@68: jpayne@68: property length: jpayne@68: """ the length of the feature""" jpayne@68: def __get__(self): jpayne@68: return self._bed.end - self._bed.start jpayne@68: jpayne@68: cpdef deparse_attrs(self): jpayne@68: jpayne@68: if not self._attrs: return jpayne@68: jpayne@68: if self.file_type != "gff": jpayne@68: raise ValueError('Interval.attrs was not None, but this was a non-GFF Interval') jpayne@68: jpayne@68: s = self._attrs.__str__() jpayne@68: self._bed.fields[8] = _cppstr(s) jpayne@68: jpayne@68: property fields: jpayne@68: def __get__(self): jpayne@68: self.deparse_attrs() jpayne@68: items = [] jpayne@68: for i in self._bed.fields: jpayne@68: if isinstance(i, int): jpayne@68: items.append(i) jpayne@68: else: jpayne@68: items.append(_pystr(i)) jpayne@68: return items jpayne@68: jpayne@68: property attrs: jpayne@68: def __get__(self): jpayne@68: if self._attrs is None: jpayne@68: ft = _pystr(self._bed.file_type) jpayne@68: if ft == 'gff': jpayne@68: self._attrs = Attributes(_pystr(self._bed.fields[8])) jpayne@68: else: jpayne@68: self._attrs = Attributes("") jpayne@68: return self._attrs jpayne@68: jpayne@68: def __set__(self, attrs): jpayne@68: self._attrs = attrs jpayne@68: jpayne@68: # TODO: make this more robust. jpayne@68: @property jpayne@68: def count(self): jpayne@68: return int(self.fields[-1]) jpayne@68: jpayne@68: property name: jpayne@68: """ jpayne@68: >>> import pybedtools jpayne@68: >>> vcf = pybedtools.example_bedtool('v.vcf') jpayne@68: >>> [v.name for v in vcf] jpayne@68: ['rs6054257', 'chr1:16', 'rs6040355', 'chr1:222', 'microsat1'] jpayne@68: jpayne@68: """ jpayne@68: def __get__(self): jpayne@68: cdef string ftype = self._bed.file_type jpayne@68: value = None jpayne@68: if ftype == "gff": jpayne@68: """ jpayne@68: # TODO. allow setting a name_key in the BedTool constructor? jpayne@68: if self.name_key and self.name_key in attrs: jpayne@68: return attrs[self.name_key] jpayne@68: """ jpayne@68: for key in ("ID", "Name", "gene_name", "transcript_id", \ jpayne@68: "gene_id", "Parent"): jpayne@68: if key in self.attrs: jpayne@68: value = self.attrs[key] jpayne@68: break jpayne@68: jpayne@68: elif ftype == "vcf": jpayne@68: s = self.fields[2] jpayne@68: if s in ("", "."): jpayne@68: value = "%s:%i" % (self.chrom, self.start) jpayne@68: else: jpayne@68: value = _pystr(s) jpayne@68: elif ftype == "bed": jpayne@68: value = _pystr(self._bed.name) jpayne@68: jpayne@68: return value jpayne@68: jpayne@68: def __set__(self, value): jpayne@68: cdef string ftype = self._bed.file_type jpayne@68: jpayne@68: if ftype == "gff": jpayne@68: for key in ("ID", "Name", "gene_name", "transcript_id", \ jpayne@68: "gene_id", "Parent"): jpayne@68: if not key in self.attrs: jpayne@68: continue jpayne@68: jpayne@68: # If it's incoming from Python it's unicode, so store that directly jpayne@68: # in the attributes (since an Attribute object works on jpayne@68: # unicode)... jpayne@68: self.attrs[key] = value jpayne@68: break jpayne@68: jpayne@68: # Otherwise use _cppstr() because we're storing it in _bed.fields. jpayne@68: elif ftype == "vcf": jpayne@68: self._bed.fields[2] = _cppstr(value) jpayne@68: else: jpayne@68: self._bed.name = _cppstr(value) jpayne@68: self._bed.fields[3] = _cppstr(value) jpayne@68: jpayne@68: property score: jpayne@68: def __get__(self): jpayne@68: return _pystr(self._bed.score) jpayne@68: jpayne@68: def __set__(self, value): jpayne@68: value = _cppstr(value) jpayne@68: self._bed.score = value jpayne@68: idx = LOOKUPS[self.file_type]["score"] jpayne@68: self._bed.fields[idx] = value jpayne@68: jpayne@68: property file_type: jpayne@68: "bed/vcf/gff" jpayne@68: def __get__(self): jpayne@68: return _pystr(self._bed.file_type) jpayne@68: jpayne@68: def __set__(self, value): jpayne@68: self._bed.file_type = _cppstr(value) jpayne@68: jpayne@68: # TODO: maybe bed.overlap_start or bed.overlap.start ?? jpayne@68: @property jpayne@68: def o_start(self): jpayne@68: return self._bed.o_start jpayne@68: jpayne@68: @property jpayne@68: def o_end(self): jpayne@68: return self._bed.o_end jpayne@68: jpayne@68: @property jpayne@68: def o_amt(self): jpayne@68: return self._bed.o_end - self._bed.o_start jpayne@68: jpayne@68: def __str__(self): jpayne@68: """ jpayne@68: Interval objects always print with a newline to mimic a line in a jpayne@68: BED/GFF/VCF file jpayne@68: """ jpayne@68: items = [] jpayne@68: for i in self.fields: jpayne@68: if isinstance(i, int): jpayne@68: i = str(i) jpayne@68: items.append(i) jpayne@68: jpayne@68: return '\t'.join(items) + '\n' jpayne@68: jpayne@68: def __repr__(self): jpayne@68: return "Interval(%s:%i-%i)" % (self.chrom, self.start, self.end) jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: del self._bed jpayne@68: jpayne@68: def __len__(self): jpayne@68: return self._bed.end - self._bed.start jpayne@68: jpayne@68: def __getitem__(self, object key): jpayne@68: cdef int i jpayne@68: ftype = _pystr(self._bed.file_type) jpayne@68: jpayne@68: self.deparse_attrs() jpayne@68: jpayne@68: if isinstance(key, (int, long)): jpayne@68: nfields = self._bed.fields.size() jpayne@68: if key >= nfields: jpayne@68: raise IndexError('field index out of range') jpayne@68: elif key < 0: jpayne@68: key = nfields + key jpayne@68: return _pystr(self._bed.fields.at(key)) jpayne@68: elif isinstance(key, slice): jpayne@68: indices = key.indices(self._bed.fields.size()) jpayne@68: return [_pystr(self._bed.fields.at(i)) for i in range(*indices)] jpayne@68: jpayne@68: elif isinstance(key, str): jpayne@68: if ftype == "gff": jpayne@68: try: jpayne@68: return self.attrs[key] jpayne@68: except KeyError: jpayne@68: pass jpayne@68: # We don't have to convert using _pystr() because the __get__ jpayne@68: # methods do that already. jpayne@68: return getattr(self, key) jpayne@68: jpayne@68: def __setitem__(self, object key, object value): jpayne@68: if isinstance(key, (int, long)): jpayne@68: nfields = self._bed.fields.size() jpayne@68: if key >= nfields: jpayne@68: raise IndexError('field index out of range') jpayne@68: elif key < 0: jpayne@68: key = nfields + key jpayne@68: self._bed.fields[key] = _cppstr(value) jpayne@68: jpayne@68: ft = _pystr(self._bed.file_type) jpayne@68: if key in LOOKUPS[ft]: jpayne@68: setattr(self, LOOKUPS[ft][key], value) jpayne@68: jpayne@68: elif isinstance(key, (basestring)): jpayne@68: setattr(self, key, value) jpayne@68: jpayne@68: cpdef append(self, object value): jpayne@68: self._bed.fields.push_back(_cppstr(value)) jpayne@68: jpayne@68: def __nonzero__(self): jpayne@68: return True jpayne@68: jpayne@68: jpayne@68: cdef Interval create_interval(BED b): jpayne@68: cdef Interval pyb = Interval.__new__(Interval) jpayne@68: pyb._bed = new BED(b.chrom, b.start, b.end, b.name, jpayne@68: b.score, b.strand, b.fields, jpayne@68: b.o_start, b.o_end, b.bedType, b.file_type, b.status) jpayne@68: pyb._bed.fields = b.fields jpayne@68: return pyb jpayne@68: jpayne@68: # TODO: optimization: Previously we had (fields[1] + fields[2]).isdigit() when jpayne@68: # checking in create_interval_from_list for filetype heuruistics. Is there jpayne@68: # a performance hit by checking instances? jpayne@68: cdef isdigit(s): jpayne@68: if isinstance(s, integer_types): jpayne@68: return True jpayne@68: return s.isdigit() jpayne@68: jpayne@68: jpayne@68: cpdef Interval create_interval_from_list(list fields): jpayne@68: """ jpayne@68: Create an Interval object from a list of strings. jpayne@68: jpayne@68: Constructor:: jpayne@68: jpayne@68: create_interval_from_list(fields) jpayne@68: jpayne@68: Given the list of strings, `fields`, automatically detects the format (BED, jpayne@68: GFF, VCF, SAM) and creates a new Interval object. jpayne@68: jpayne@68: `fields` is a list with an arbitrary number of items (it can be quite long, jpayne@68: say after a -wao intersection of a BED12 and a GFF), however, the first jpayne@68: fields must conform to one of the supported formats. For example, if you jpayne@68: want the resulting Interval to be considered a GFF feature, then the first jpayne@68: 9 fields must conform to the GFF format. Similarly, if you want the jpayne@68: resulting Interval to be considered a BED feature, then the first three jpayne@68: fields must be chrom, start, stop. jpayne@68: jpayne@68: Example usage: jpayne@68: jpayne@68: >>> # Creates a BED3 feature jpayne@68: >>> feature = create_interval_from_list(['chr1', '1', '100']) jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: # TODO: this function is used a lot, and is doing a bit of work. We should jpayne@68: # have an optimized version that is directly provided the filetype. jpayne@68: jpayne@68: cdef Interval pyb = Interval.__new__(Interval) jpayne@68: orig_fields = fields[:] jpayne@68: # BED -- though a VCF will be detected as BED if its 2nd field, id, is a jpayne@68: # digit jpayne@68: jpayne@68: # SAM jpayne@68: if ( jpayne@68: (len(fields) >= 11) jpayne@68: and isdigit(fields[1]) jpayne@68: and isdigit(fields[3]) jpayne@68: and isdigit(fields[4]) jpayne@68: and (fields[5] not in ['.', '+', '-']) jpayne@68: ): jpayne@68: # TODO: what should the stop position be? Here, it's just the start jpayne@68: # plus the length of the sequence, but perhaps this should eventually jpayne@68: # do CIGAR string parsing. jpayne@68: if int(fields[1]) & 0x04: jpayne@68: # handle unmapped reads jpayne@68: chrom = _cppstr("*") jpayne@68: start = 0 jpayne@68: stop = 0 jpayne@68: else: jpayne@68: chrom = _cppstr(fields[2]) jpayne@68: start = int(fields[3]) - 1 jpayne@68: stop = int(fields[3]) + len(fields[9]) - 1 jpayne@68: name = _cppstr(fields[0]) jpayne@68: score = _cppstr(fields[1]) jpayne@68: if int(fields[1]) & 0x10: jpayne@68: strand = _cppstr('-') jpayne@68: else: jpayne@68: strand = _cppstr('+') jpayne@68: jpayne@68: # Fields is in SAM format jpayne@68: fields[3] = str(start + 1) jpayne@68: jpayne@68: pyb._bed = new BED( jpayne@68: chrom, jpayne@68: start, jpayne@68: stop, jpayne@68: strand, jpayne@68: name, jpayne@68: score, jpayne@68: list_to_vector(fields)) jpayne@68: pyb.file_type = _cppstr('sam') jpayne@68: jpayne@68: jpayne@68: elif isdigit(fields[1]) and isdigit(fields[2]): jpayne@68: # if it's too short, just add some empty fields. jpayne@68: if len(fields) < 7: jpayne@68: fields.extend([".".encode('UTF-8')] * (6 - len(fields))) jpayne@68: other_fields = [] jpayne@68: else: jpayne@68: other_fields = fields[6:] jpayne@68: jpayne@68: pyb._bed = new BED( jpayne@68: _cppstr(fields[0]), jpayne@68: int(fields[1]), jpayne@68: int(fields[2]), jpayne@68: _cppstr(fields[3]), jpayne@68: _cppstr(fields[4]), jpayne@68: _cppstr(fields[5]), jpayne@68: list_to_vector(other_fields)) jpayne@68: pyb.file_type = _cppstr('bed') jpayne@68: jpayne@68: # VCF jpayne@68: elif isdigit(fields[1]) and not isdigit(fields[3]) and len(fields) >= 8: jpayne@68: pyb._bed = new BED( jpayne@68: _cppstr(fields[0]), jpayne@68: int(fields[1]) - 1, jpayne@68: int(fields[1]), jpayne@68: _cppstr(fields[2]), jpayne@68: _cppstr(fields[5]), jpayne@68: _cppstr('.'), jpayne@68: list_to_vector(fields)) jpayne@68: pyb.file_type = b'vcf' jpayne@68: jpayne@68: jpayne@68: # GFF jpayne@68: elif len(fields) >= 9 and isdigit(fields[3]) and isdigit(fields[4]): jpayne@68: pyb._bed = new BED( jpayne@68: _cppstr(fields[0]), jpayne@68: int(fields[3])-1, int(fields[4]), jpayne@68: _cppstr(fields[2]), jpayne@68: _cppstr(fields[5]), jpayne@68: _cppstr(fields[6]), jpayne@68: list_to_vector(fields[7:])) jpayne@68: pyb.file_type = _cppstr('gff') jpayne@68: else: jpayne@68: raise MalformedBedLineError('Unable to detect format from %s' % fields) jpayne@68: jpayne@68: if pyb.start > pyb.end: jpayne@68: raise MalformedBedLineError("Start is greater than stop") jpayne@68: pyb._bed.fields = list_to_vector(orig_fields) jpayne@68: return pyb jpayne@68: jpayne@68: cdef vector[string] list_to_vector(list li): jpayne@68: cdef vector[string] s jpayne@68: cdef int i jpayne@68: for i in range(len(li)): jpayne@68: _s = li[i] jpayne@68: s.push_back(_cppstr(_s)) jpayne@68: return s jpayne@68: jpayne@68: cdef list string_vec2list(vector[string] sv): jpayne@68: cdef size_t size = sv.size(), i jpayne@68: return [_pystr(sv.at(i)) for i in range(size)] jpayne@68: jpayne@68: cdef list bed_vec2list(vector[BED] bv): jpayne@68: cdef size_t size = bv.size(), i jpayne@68: cdef list l = [] jpayne@68: cdef BED b jpayne@68: for i in range(size): jpayne@68: b = bv.at(i) jpayne@68: l.append(create_interval(b)) jpayne@68: return l jpayne@68: jpayne@68: jpayne@68: def overlap(int s1, int s2, int e1, int e2): jpayne@68: return min(e1, e2) - max(s1, s2) jpayne@68: jpayne@68: jpayne@68: cdef class IntervalIterator: jpayne@68: cdef object stream jpayne@68: cdef int _itemtype jpayne@68: def __init__(self, stream): jpayne@68: self.stream = stream jpayne@68: jpayne@68: # For speed, check int rather than call isinstance(). jpayne@68: # -1 is unset, 0 assumes list/tuple/iterable, and 1 is a string. jpayne@68: # jpayne@68: # Also assumes that all items in the iterable `stream` are the same jpayne@68: # type...this seems like a reasonable assumption. jpayne@68: self._itemtype = -1 jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: try: jpayne@68: self.stream.close() jpayne@68: except AttributeError: jpayne@68: pass jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: def __next__(self): jpayne@68: while True: jpayne@68: if hasattr(self.stream, 'closed'): jpayne@68: if self.stream.closed: jpayne@68: raise StopIteration jpayne@68: try: jpayne@68: line = next(self.stream) jpayne@68: except StopIteration: jpayne@68: if hasattr(self.stream, 'close'): jpayne@68: self.stream.close() jpayne@68: raise StopIteration jpayne@68: jpayne@68: if self._itemtype < 0: jpayne@68: if isinstance(line, Interval): jpayne@68: self._itemtype = 2 jpayne@68: elif isinstance(line, basestring): jpayne@68: self._itemtype = 1 jpayne@68: else: jpayne@68: self._itemtype = 0 jpayne@68: jpayne@68: if self._itemtype == 1: jpayne@68: if line.startswith(('@', '#', 'track', 'browser')) or len(line.strip()) == 0: jpayne@68: continue jpayne@68: break jpayne@68: jpayne@68: # Iterable of Interval objects jpayne@68: if self._itemtype == 2: jpayne@68: return line jpayne@68: jpayne@68: # Iterable of strings, in which case we need to split jpayne@68: elif self._itemtype == 1: jpayne@68: fields = line.rstrip('\r\n').split('\t') jpayne@68: jpayne@68: # Otherwise assume list/tuple/iterable of fields jpayne@68: else: jpayne@68: fields = list(line) jpayne@68: jpayne@68: # TODO: optimization: create_interval_from_list should have a version jpayne@68: # that accepts C++ string instances jpayne@68: return create_interval_from_list(fields) jpayne@68: jpayne@68: jpayne@68: jpayne@68: cdef class IntervalFile: jpayne@68: cdef BedFile *intervalFile_ptr jpayne@68: cdef bint _loaded jpayne@68: cdef bint _open jpayne@68: cdef string _fn jpayne@68: """ jpayne@68: An IntervalFile provides low-level access to the BEDTools API. jpayne@68: jpayne@68: >>> fn = pybedtools.example_filename('a.bed') jpayne@68: >>> intervalfile = pybedtools.IntervalFile(fn) jpayne@68: jpayne@68: """ jpayne@68: def __init__(self, intervalFile): jpayne@68: self.intervalFile_ptr = new BedFile(_cppstr(intervalFile)) jpayne@68: self._loaded = 0 jpayne@68: self._open = 0 jpayne@68: self._fn = _cppstr(intervalFile) jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: del self.intervalFile_ptr jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: def __next__(self): jpayne@68: if not self._open: jpayne@68: result = self.intervalFile_ptr.Open() jpayne@68: if result == -1: jpayne@68: raise BedToolsFileError("Error opening file") jpayne@68: self._open = 1 jpayne@68: cdef BED b = self.intervalFile_ptr.GetNextBed() jpayne@68: if b.status == BED_VALID: jpayne@68: return create_interval(b) jpayne@68: elif b.status == BED_INVALID: jpayne@68: self.intervalFile_ptr.Close() jpayne@68: raise StopIteration jpayne@68: elif b.status == BED_MALFORMED: jpayne@68: self.intervalFile_ptr.Close() jpayne@68: raise MalformedBedLineError("malformed line: %s" % string_vec2list(b.fields)) jpayne@68: else: jpayne@68: return next(self) jpayne@68: jpayne@68: @property jpayne@68: def fn(self): jpayne@68: return _pystr(self._fn) jpayne@68: jpayne@68: @property jpayne@68: def file_type(self): jpayne@68: if not self.intervalFile_ptr._typeIsKnown: jpayne@68: try: jpayne@68: a = next(iter(self)) jpayne@68: file_type = _pystr(self.intervalFile_ptr.file_type) jpayne@68: self.intervalFile_ptr.Close() jpayne@68: return file_type jpayne@68: except MalformedBedLineError: jpayne@68: # If it's a SAM, raise a meaningful exception. If not, fail. jpayne@68: with open(self.fn) as fn: jpayne@68: interval = create_interval_from_list(fn.readline().strip().split()) jpayne@68: if interval.file_type == 'sam': jpayne@68: raise ValueError('IntervalFile objects do not yet natively support SAM. ' jpayne@68: 'Please convert to BED/GFF/VCF first if you want to ' jpayne@68: 'use the low-level API of IntervalFile') jpayne@68: else: jpayne@68: raise jpayne@68: jpayne@68: jpayne@68: def loadIntoMap(self): jpayne@68: """ jpayne@68: Prepares file for checking intersections. Used by other methods like all_hits() jpayne@68: """ jpayne@68: if self._loaded: jpayne@68: return jpayne@68: self.intervalFile_ptr.loadBedFileIntoMap() jpayne@68: self._loaded = 1 jpayne@68: jpayne@68: def rewind(self): jpayne@68: """ jpayne@68: Jump to the beginning of the file. jpayne@68: """ jpayne@68: if not self._open: jpayne@68: self.intervalFile_ptr.Open() jpayne@68: self._open = 1 jpayne@68: self.intervalFile_ptr.Rewind() jpayne@68: jpayne@68: def seek(self, offset): jpayne@68: """ jpayne@68: Jump to a specific byte offset in the file jpayne@68: """ jpayne@68: if not self._open: jpayne@68: self.intervalFile_ptr.Open() jpayne@68: self._open = 1 jpayne@68: self.intervalFile_ptr.Seek(offset) jpayne@68: jpayne@68: jpayne@68: def all_hits(self, Interval interval, bool same_strand=False, float overlap=0.0): jpayne@68: """ jpayne@68: :Signature: `IntervalFile.all_hits(interval, same_strand=False, overlap=0.0)` jpayne@68: jpayne@68: Search for the Interval `interval` this file and return **all** jpayne@68: overlaps as a list. jpayne@68: jpayne@68: `same_strand`, if True, will only consider hits on the same strand as `interval`. jpayne@68: jpayne@68: `overlap` can be used to specify the fraction of overlap between jpayne@68: `interval` and each feature in the IntervalFile. jpayne@68: jpayne@68: Example usage: jpayne@68: jpayne@68: >>> fn = pybedtools.example_filename('a.bed') jpayne@68: jpayne@68: >>> # create an Interval to query with jpayne@68: >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+') jpayne@68: jpayne@68: >>> # Create an IntervalFile out of a.bed jpayne@68: >>> intervalfile = pybedtools.IntervalFile(fn) jpayne@68: jpayne@68: >>> # get stranded hits jpayne@68: >>> intervalfile.all_hits(i, same_strand=True) jpayne@68: [Interval(chr1:1-100), Interval(chr1:100-200), Interval(chr1:900-950)] jpayne@68: jpayne@68: """ jpayne@68: cdef vector[BED] vec_b jpayne@68: self.loadIntoMap() jpayne@68: jpayne@68: if same_strand == False: jpayne@68: vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), overlap) jpayne@68: try: jpayne@68: return bed_vec2list(vec_b) jpayne@68: finally: jpayne@68: pass jpayne@68: else: jpayne@68: vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), same_strand, overlap) jpayne@68: try: jpayne@68: return bed_vec2list(vec_b) jpayne@68: finally: jpayne@68: pass jpayne@68: jpayne@68: # search() is an alias for all_hits jpayne@68: search = all_hits jpayne@68: jpayne@68: def any_hits(self, Interval interval, bool same_strand=False, float overlap=0.0): jpayne@68: """ jpayne@68: :Signature: `IntervalFile.any_hits(interval, same_strand=False, overlap=0.0)` jpayne@68: jpayne@68: Return 1 if the Interval `interval` had >=1 hit in this IntervalFile, 0 otherwise. jpayne@68: jpayne@68: `same_strand`, if True, will only consider hits on the same strand as `interval`. jpayne@68: jpayne@68: `overlap` can be used to specify the fraction of overlap between jpayne@68: `interval` and each feature in the IntervalFile. jpayne@68: jpayne@68: Example usage: jpayne@68: jpayne@68: >>> fn = pybedtools.example_filename('a.bed') jpayne@68: jpayne@68: >>> # create an Interval to query with jpayne@68: >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+') jpayne@68: jpayne@68: >>> # Create an IntervalFile out of a.bed jpayne@68: >>> intervalfile = pybedtools.IntervalFile(fn) jpayne@68: jpayne@68: >>> # any stranded hits? jpayne@68: >>> intervalfile.any_hits(i, same_strand=True) jpayne@68: 1 jpayne@68: jpayne@68: """ jpayne@68: found = 0 jpayne@68: self.loadIntoMap() jpayne@68: jpayne@68: if same_strand == False: jpayne@68: found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), overlap) jpayne@68: else: jpayne@68: found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), same_strand, overlap) jpayne@68: jpayne@68: return found jpayne@68: jpayne@68: def count_hits(self, Interval interval, bool same_strand=False, float overlap=0.0): jpayne@68: """ jpayne@68: :Signature: `IntervalFile.count_hits(interval, same_strand=False, overlap=0.0)` jpayne@68: jpayne@68: Return the number of overlaps of the Interval `interval` had with this jpayne@68: IntervalFile. jpayne@68: jpayne@68: `same_strand`, if True, will only consider hits on the same strand as jpayne@68: `interval`. jpayne@68: jpayne@68: `overlap` can be used to specify the fraction of overlap between jpayne@68: `interval` and each feature in the IntervalFile. jpayne@68: jpayne@68: Example usage: jpayne@68: jpayne@68: >>> fn = pybedtools.example_filename('a.bed') jpayne@68: jpayne@68: >>> # create an Interval to query with jpayne@68: >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+') jpayne@68: jpayne@68: >>> # Create an IntervalFile out of a.bed jpayne@68: >>> intervalfile = pybedtools.IntervalFile(fn) jpayne@68: jpayne@68: >>> # get number of stranded hits jpayne@68: >>> intervalfile.count_hits(i, same_strand=True) jpayne@68: 3 jpayne@68: jpayne@68: """ jpayne@68: self.loadIntoMap() jpayne@68: jpayne@68: if same_strand == False: jpayne@68: return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), overlap) jpayne@68: else: jpayne@68: return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), same_strand, overlap)