jpayne@69: # cython: language_level=3 jpayne@69: # cython: embedsignature=True jpayne@69: # cython: profile=True jpayne@69: ############################################################################### jpayne@69: ############################################################################### jpayne@69: # Cython wrapper for access to tabix indexed files in bgzf format jpayne@69: ############################################################################### jpayne@69: # The principal classes and functions defined in this module are: jpayne@69: # jpayne@69: # class TabixFile class wrapping tabix indexed files in bgzf format jpayne@69: # jpayne@69: # class asTuple Parser class for tuples jpayne@69: # class asGTF Parser class for GTF formatted rows jpayne@69: # class asGFF3 Parser class for GFF3 formatted rows jpayne@69: # class asBed Parser class for Bed formatted rows jpayne@69: # class asVCF Parser class for VCF formatted rows jpayne@69: # jpayne@69: # class tabix_generic_iterator Streamed iterator of bgzf formatted files jpayne@69: # jpayne@69: # Additionally this module defines several additional classes that are part jpayne@69: # of the internal API. 
These are: jpayne@69: # jpayne@69: # class Parser base class for parsers of tab-separated rows jpayne@69: # class tabix_file_iterator jpayne@69: # class TabixIterator iterator class over rows in bgzf file jpayne@69: # class EmptyIterator jpayne@69: # jpayne@69: # For backwards compatibility, the following classes are also defined: jpayne@69: # jpayne@69: # class Tabixfile equivalent to TabixFile jpayne@69: # jpayne@69: ############################################################################### jpayne@69: # jpayne@69: # The MIT License jpayne@69: # jpayne@69: # Copyright (c) 2015 Andreas Heger jpayne@69: # jpayne@69: # Permission is hereby granted, free of charge, to any person obtaining a jpayne@69: # copy of this software and associated documentation files (the "Software"), jpayne@69: # to deal in the Software without restriction, including without limitation jpayne@69: # the rights to use, copy, modify, merge, publish, distribute, sublicense, jpayne@69: # and/or sell copies of the Software, and to permit persons to whom the jpayne@69: # Software is furnished to do so, subject to the following conditions: jpayne@69: # jpayne@69: # The above copyright notice and this permission notice shall be included in jpayne@69: # all copies or substantial portions of the Software. jpayne@69: # jpayne@69: # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR jpayne@69: # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, jpayne@69: # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL jpayne@69: # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER jpayne@69: # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING jpayne@69: # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER jpayne@69: # DEALINGS IN THE SOFTWARE. 
#
###############################################################################
import os
import sys

from libc.stdio cimport printf, fprintf, stderr
from libc.string cimport strerror
from libc.errno cimport errno
from posix.unistd cimport dup

from cpython cimport PyErr_SetString, PyBytes_Check, \
    PyUnicode_Check, PyBytes_FromStringAndSize, \
    PyObject_AsFileDescriptor

cimport pysam.libctabixproxies as ctabixproxies

from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
    BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
    tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
    tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
    tbx_destroy, hisremote, region_list, hts_getline, \
    TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
    no_compression, bcf, bcf_index_build2

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size


cdef class Parser:
    '''Abstract base class for parsers of tab-separated rows.

    A parser is a functor: calling it with a raw ``char *`` buffer and
    its length returns a parsed row object.  Subclasses implement the
    :meth:`parse` method.  *encoding* is used by the row proxies when
    decoding byte fields into python strings.
    '''

    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def set_encoding(self, encoding):
        # change the encoding used when decoding fields
        self.encoding = encoding

    def get_encoding(self):
        # return the encoding currently in use
        return self.encoding

    cdef parse(self, char * buffer, int length):
        # to be implemented by subclasses
        raise NotImplementedError(
            'parse method of %s not implemented' % str(self))

    def __call__(self, char * buffer, int length):
        return self.parse(buffer, length)


cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    A field in a row is accessed by numeric index.
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.TupleProxy r
        r = ctabixproxies.TupleProxy(self.encoding)
        # need to copy - there were some
        # persistence issues with "present"
        r.copy(buffer, len)
        return r


cdef class asGFF3(Parser):
    '''converts a :term:`tabix row` into a GFF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    .. note::
        NOTE(review): the GFF3 specification defines column 2 as
        *source* and column 3 as *type* (feature); the table above has
        them the other way around -- confirm against the field order
        of :class:`ctabixproxies.GFF3Proxy`.
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GFF3Proxy r
        r = ctabixproxies.GFF3Proxy(self.encoding)
        # copy the buffer so the proxy owns its own data
        r.copy(buffer, len)
        return r
cdef class asGTF(Parser):
    '''converts a :term:`tabix row` into a GTF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    GTF formatted entries also define the following fields that
    are derived from the attributes field:

    +--------------------+------------------------------+
    |*Name*              |*Content*                     |
    +--------------------+------------------------------+
    |gene_id             |the gene identifier           |
    +--------------------+------------------------------+
    |transcript_id       |the transcript identifier     |
    +--------------------+------------------------------+

    .. note::
        NOTE(review): the GTF specification defines column 2 as
        *source* and column 3 as *feature*; the table above has them
        swapped -- confirm against the field order of
        :class:`ctabixproxies.GTFProxy`.
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GTFProxy r
        r = ctabixproxies.GTFProxy(self.encoding)
        # copy the buffer so the proxy owns its own data
        r.copy(buffer, len)
        return r


cdef class asBed(Parser):
    '''converts a :term:`tabix row` into a bed record
    with the following fields:

    +-----------+-----------+------------------------------------------+
    |*Column*   |*Field*    |*Contents*                                |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |1          |contig     |contig                                    |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |2          |start      |genomic start coordinate (zero-based)     |
    +-----------+-----------+------------------------------------------+
    |3          |end        |genomic end coordinate plus one           |
    |           |           |(zero-based)                              |
    +-----------+-----------+------------------------------------------+
    |4          |name       |name of feature.                          |
    +-----------+-----------+------------------------------------------+
    |5          |score      |score of feature                          |
    +-----------+-----------+------------------------------------------+
    |6          |strand     |strand of feature                         |
    +-----------+-----------+------------------------------------------+
    |7          |thickStart |thickStart                                |
    +-----------+-----------+------------------------------------------+
    |8          |thickEnd   |thickEnd                                  |
    +-----------+-----------+------------------------------------------+
    |9          |itemRGB    |itemRGB                                   |
    +-----------+-----------+------------------------------------------+
    |10         |blockCount |number of bocks                           |
    +-----------+-----------+------------------------------------------+
    |11         |blockSizes |',' separated string of block sizes       |
    +-----------+-----------+------------------------------------------+
    |12         |blockStarts|',' separated string of block genomic     |
    |           |           |start positions                           |
    +-----------+-----------+------------------------------------------+

    Only the first three fields are required. Additional
    fields are optional, but if one is defined, all the preceding
    need to be defined as well.

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.BedProxy r
        r = ctabixproxies.BedProxy(self.encoding)
        # copy the buffer so the proxy owns its own data
        r.copy(buffer, len)
        return r


cdef class asVCF(Parser):
    '''converts a :term:`tabix row` into a VCF record with
    the following fields:

    +----------+---------+------------------------------------+
    |*Column*  |*Field*  |*Contents*                          |
    |          |         |                                    |
    +----------+---------+------------------------------------+
    |1         |contig   |chromosome                          |
    +----------+---------+------------------------------------+
    |2         |pos      |chromosomal position, zero-based    |
    +----------+---------+------------------------------------+
    |3         |id       |id                                  |
    +----------+---------+------------------------------------+
    |4         |ref      |reference allele                    |
    +----------+---------+------------------------------------+
    |5         |alt      |alternate alleles                   |
    +----------+---------+------------------------------------+
    |6         |qual     |quality                             |
    +----------+---------+------------------------------------+
    |7         |filter   |filter                              |
    +----------+---------+------------------------------------+
    |8         |info     |info                                |
    +----------+---------+------------------------------------+
    |9         |format   |format specifier.                   |
    +----------+---------+------------------------------------+

    Access to genotypes is via index::

        contig = vcf.contig
        first_sample_genotype = vcf[0]
        second_sample_genotype = vcf[1]

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.VCFProxy r
        r = ctabixproxies.VCFProxy(self.encoding)
        # copy the buffer so the proxy owns its own data
        r.copy(buffer, len)
        return r
| jpayne@69: +----------+---------+------------------------------------+ jpayne@69: jpayne@69: Access to genotypes is via index:: jpayne@69: jpayne@69: contig = vcf.contig jpayne@69: first_sample_genotype = vcf[0] jpayne@69: second_sample_genotype = vcf[1] jpayne@69: jpayne@69: ''' jpayne@69: cdef parse(self, char * buffer, int len): jpayne@69: cdef ctabixproxies.VCFProxy r jpayne@69: r = ctabixproxies.VCFProxy(self.encoding) jpayne@69: r.copy(buffer, len) jpayne@69: return r jpayne@69: jpayne@69: jpayne@69: cdef class TabixFile: jpayne@69: """Random access to bgzf formatted files that jpayne@69: have been indexed by :term:`tabix`. jpayne@69: jpayne@69: The file is automatically opened. The index file of file jpayne@69: ```` is expected to be called ``.tbi`` jpayne@69: by default (see parameter `index`). jpayne@69: jpayne@69: Parameters jpayne@69: ---------- jpayne@69: jpayne@69: filename : string jpayne@69: Filename of bgzf file to be opened. jpayne@69: jpayne@69: index : string jpayne@69: The filename of the index. If not set, the default is to jpayne@69: assume that the index is called ``filename.tbi`` jpayne@69: jpayne@69: mode : char jpayne@69: The file opening mode. Currently, only ``r`` is permitted. jpayne@69: jpayne@69: parser : :class:`pysam.Parser` jpayne@69: jpayne@69: sets the default parser for this tabix file. If `parser` jpayne@69: is None, the results are returned as an unparsed string. jpayne@69: Otherwise, `parser` is assumed to be a functor that will return jpayne@69: parsed data (see for example :class:`~pysam.asTuple` and jpayne@69: :class:`~pysam.asGTF`). jpayne@69: jpayne@69: encoding : string jpayne@69: jpayne@69: The encoding passed to the parser jpayne@69: jpayne@69: threads: integer jpayne@69: Number of threads to use for decompressing Tabix files. jpayne@69: (Default=1) jpayne@69: jpayne@69: jpayne@69: Raises jpayne@69: ------ jpayne@69: jpayne@69: ValueError jpayne@69: if index file is missing. 
jpayne@69: jpayne@69: IOError jpayne@69: if file could not be opened jpayne@69: """ jpayne@69: def __cinit__(self, jpayne@69: filename, jpayne@69: mode='r', jpayne@69: parser=None, jpayne@69: index=None, jpayne@69: encoding="ascii", jpayne@69: threads=1, jpayne@69: *args, jpayne@69: **kwargs ): jpayne@69: jpayne@69: self.htsfile = NULL jpayne@69: self.is_remote = False jpayne@69: self.is_stream = False jpayne@69: self.parser = parser jpayne@69: self.threads = threads jpayne@69: self._open(filename, mode, index, *args, **kwargs) jpayne@69: self.encoding = encoding jpayne@69: jpayne@69: def _open( self, jpayne@69: filename, jpayne@69: mode='r', jpayne@69: index=None, jpayne@69: threads=1, jpayne@69: ): jpayne@69: '''open a :term:`tabix file` for reading.''' jpayne@69: jpayne@69: if mode != 'r': jpayne@69: raise ValueError("invalid file opening mode `%s`" % mode) jpayne@69: jpayne@69: if self.htsfile != NULL: jpayne@69: self.close() jpayne@69: self.htsfile = NULL jpayne@69: self.threads=threads jpayne@69: jpayne@69: filename_index = index or (filename + ".tbi") jpayne@69: # encode all the strings to pass to tabix jpayne@69: self.filename = encode_filename(filename) jpayne@69: self.filename_index = encode_filename(filename_index) jpayne@69: jpayne@69: self.is_stream = self.filename == b'-' jpayne@69: self.is_remote = hisremote(self.filename) jpayne@69: jpayne@69: if not self.is_remote: jpayne@69: if not os.path.exists(filename): jpayne@69: raise IOError("file `%s` not found" % filename) jpayne@69: jpayne@69: if not os.path.exists(filename_index): jpayne@69: raise IOError("index `%s` not found" % filename_index) jpayne@69: jpayne@69: # open file jpayne@69: cdef char *cfilename = self.filename jpayne@69: cdef char *cfilename_index = self.filename_index jpayne@69: with nogil: jpayne@69: self.htsfile = hts_open(cfilename, 'r') jpayne@69: jpayne@69: if self.htsfile == NULL: jpayne@69: raise IOError("could not open file `%s`" % filename) jpayne@69: jpayne@69: #if 
self.htsfile.format.category != region_list: jpayne@69: # raise ValueError("file does not contain region data") jpayne@69: jpayne@69: with nogil: jpayne@69: self.index = tbx_index_load2(cfilename, cfilename_index) jpayne@69: jpayne@69: if self.index == NULL: jpayne@69: raise IOError("could not open index for `%s`" % filename) jpayne@69: jpayne@69: if not self.is_stream: jpayne@69: self.start_offset = self.tell() jpayne@69: jpayne@69: def _dup(self): jpayne@69: '''return a copy of this tabix file. jpayne@69: jpayne@69: The file is being re-opened. jpayne@69: ''' jpayne@69: return TabixFile(self.filename, jpayne@69: mode="r", jpayne@69: threads=self.threads, jpayne@69: parser=self.parser, jpayne@69: index=self.filename_index, jpayne@69: encoding=self.encoding) jpayne@69: jpayne@69: def fetch(self, jpayne@69: reference=None, jpayne@69: start=None, jpayne@69: end=None, jpayne@69: region=None, jpayne@69: parser=None, jpayne@69: multiple_iterators=False): jpayne@69: '''fetch one or more rows in a :term:`region` using 0-based jpayne@69: indexing. The region is specified by :term:`reference`, jpayne@69: *start* and *end*. Alternatively, a samtools :term:`region` jpayne@69: string can be supplied. jpayne@69: jpayne@69: Without *reference* or *region* all entries will be fetched. jpayne@69: jpayne@69: If only *reference* is set, all reads matching on *reference* jpayne@69: will be fetched. jpayne@69: jpayne@69: If *parser* is None, the default parser will be used for jpayne@69: parsing. jpayne@69: jpayne@69: Set *multiple_iterators* to true if you will be using multiple jpayne@69: iterators on the same file at the same time. The iterator jpayne@69: returned will receive its own copy of a filehandle to the file jpayne@69: effectively re-opening the file. Re-opening a file creates jpayne@69: some overhead, so beware. 
jpayne@69: jpayne@69: ''' jpayne@69: if not self.is_open(): jpayne@69: raise ValueError("I/O operation on closed file") jpayne@69: jpayne@69: # convert coordinates to region string, which is one-based jpayne@69: if reference: jpayne@69: if end is not None: jpayne@69: if end < 0: jpayne@69: raise ValueError("end out of range (%i)" % end) jpayne@69: if start is None: jpayne@69: start = 0 jpayne@69: jpayne@69: if start < 0: jpayne@69: raise ValueError("start out of range (%i)" % end) jpayne@69: elif start > end: jpayne@69: raise ValueError( jpayne@69: 'start (%i) >= end (%i)' % (start, end)) jpayne@69: elif start == end: jpayne@69: return EmptyIterator() jpayne@69: else: jpayne@69: region = '%s:%i-%i' % (reference, start + 1, end) jpayne@69: elif start is not None: jpayne@69: if start < 0: jpayne@69: raise ValueError("start out of range (%i)" % end) jpayne@69: region = '%s:%i' % (reference, start + 1) jpayne@69: else: jpayne@69: region = reference jpayne@69: jpayne@69: # get iterator jpayne@69: cdef hts_itr_t * itr jpayne@69: cdef char *cstr jpayne@69: cdef TabixFile fileobj jpayne@69: jpayne@69: # reopen the same file if necessary jpayne@69: if multiple_iterators: jpayne@69: fileobj = self._dup() jpayne@69: else: jpayne@69: fileobj = self jpayne@69: jpayne@69: if region is None: jpayne@69: # without region or reference - iterate from start jpayne@69: with nogil: jpayne@69: itr = tbx_itr_queryi(fileobj.index, jpayne@69: HTS_IDX_START, jpayne@69: 0, jpayne@69: 0) jpayne@69: else: jpayne@69: s = force_bytes(region, encoding=fileobj.encoding) jpayne@69: cstr = s jpayne@69: with nogil: jpayne@69: itr = tbx_itr_querys(fileobj.index, cstr) jpayne@69: jpayne@69: if itr == NULL: jpayne@69: if region is None: jpayne@69: if len(self.contigs) > 0: jpayne@69: # when accessing a tabix file created prior tabix 1.0 jpayne@69: # the full-file iterator is empty. 
jpayne@69: raise ValueError( jpayne@69: "could not create iterator, possible " jpayne@69: "tabix version mismatch") jpayne@69: else: jpayne@69: # possible reason is that the file is empty - jpayne@69: # return an empty iterator jpayne@69: return EmptyIterator() jpayne@69: else: jpayne@69: raise ValueError( jpayne@69: "could not create iterator for region '%s'" % jpayne@69: region) jpayne@69: jpayne@69: # use default parser if no parser is specified jpayne@69: if parser is None: jpayne@69: parser = fileobj.parser jpayne@69: jpayne@69: cdef TabixIterator a jpayne@69: if parser is None: jpayne@69: a = TabixIterator(encoding=fileobj.encoding) jpayne@69: else: jpayne@69: parser.set_encoding(fileobj.encoding) jpayne@69: a = TabixIteratorParsed(parser) jpayne@69: jpayne@69: a.tabixfile = fileobj jpayne@69: a.iterator = itr jpayne@69: jpayne@69: return a jpayne@69: jpayne@69: ############################################################### jpayne@69: ############################################################### jpayne@69: ############################################################### jpayne@69: ## properties jpayne@69: ############################################################### jpayne@69: property header: jpayne@69: '''the file header. jpayne@69: jpayne@69: The file header consists of the lines at the beginning of a jpayne@69: file that are prefixed by the comment character ``#``. jpayne@69: jpayne@69: .. note:: jpayne@69: The header is returned as an iterator presenting lines jpayne@69: without the newline character. 
jpayne@69: ''' jpayne@69: jpayne@69: def __get__(self): jpayne@69: jpayne@69: cdef char *cfilename = self.filename jpayne@69: cdef char *cfilename_index = self.filename_index jpayne@69: jpayne@69: cdef kstring_t buffer jpayne@69: buffer.l = buffer.m = 0 jpayne@69: buffer.s = NULL jpayne@69: jpayne@69: cdef htsFile * fp = NULL jpayne@69: cdef int KS_SEP_LINE = 2 jpayne@69: cdef tbx_t * tbx = NULL jpayne@69: lines = [] jpayne@69: with nogil: jpayne@69: fp = hts_open(cfilename, 'r') jpayne@69: jpayne@69: if fp == NULL: jpayne@69: raise OSError("could not open {} for reading header".format(self.filename)) jpayne@69: jpayne@69: with nogil: jpayne@69: tbx = tbx_index_load2(cfilename, cfilename_index) jpayne@69: jpayne@69: if tbx == NULL: jpayne@69: raise OSError("could not load .tbi/.csi index of {}".format(self.filename)) jpayne@69: jpayne@69: while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0: jpayne@69: if not buffer.l or buffer.s[0] != tbx.conf.meta_char: jpayne@69: break jpayne@69: lines.append(force_str(buffer.s, self.encoding)) jpayne@69: jpayne@69: with nogil: jpayne@69: hts_close(fp) jpayne@69: free(buffer.s) jpayne@69: jpayne@69: return lines jpayne@69: jpayne@69: property contigs: jpayne@69: '''list of chromosome names''' jpayne@69: def __get__(self): jpayne@69: cdef const char ** sequences jpayne@69: cdef int nsequences jpayne@69: jpayne@69: with nogil: jpayne@69: sequences = tbx_seqnames(self.index, &nsequences) jpayne@69: cdef int x jpayne@69: result = [] jpayne@69: for x from 0 <= x < nsequences: jpayne@69: result.append(force_str(sequences[x])) jpayne@69: jpayne@69: # htslib instructions: jpayne@69: # only free container, not the sequences themselves jpayne@69: free(sequences) jpayne@69: jpayne@69: return result jpayne@69: jpayne@69: def close(self): jpayne@69: ''' jpayne@69: closes the :class:`pysam.TabixFile`.''' jpayne@69: if self.htsfile != NULL: jpayne@69: hts_close(self.htsfile) jpayne@69: self.htsfile = NULL jpayne@69: if self.index != NULL: 
cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """

    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def __iter__(self):
        # reset the line buffer; memory is (re)allocated lazily by
        # tbx_itr_next through the kstring API.
        # NOTE(review): calling __iter__ again after iteration has
        # started drops the previous buffer.s pointer without freeing
        # it -- confirm callers iterate only once per iterator.
        self.buffer.s = NULL
        self.buffer.l = 0
        self.buffer.m = 0

        return self

    cdef int __cnext__(self):
        '''iterate to next element.

        Return -5 if file has been closed when this function
        was called.
        '''
        if self.tabixfile.htsfile == NULL:
            return -5

        cdef int retval

        # loop to skip over comment lines (first byte == '#')
        while 1:
            with nogil:
                retval = tbx_itr_next(
                    self.tabixfile.htsfile,
                    self.tabixfile.index,
                    self.iterator,
                    &self.buffer)

            if retval < 0:
                # end of iteration or error - propagate to caller
                break

            if self.buffer.s[0] != b'#':
                # found a data line
                break

        return retval

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """

        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        return charptr_to_str(self.buffer.s, self.encoding)

    def __dealloc__(self):
        if self.iterator != NULL:
            tbx_itr_destroy(self.iterator)
        if self.buffer.s != NULL:
            free(self.buffer.s)


class EmptyIterator:
    '''empty iterator'''

    def __iter__(self):
        return self

    def __next__(self):
        raise StopIteration()


cdef class TabixIteratorParsed(TabixIterator):
    """iterates over mapped reads in a region.

    The *parser* determines the encoding.

    Returns parsed data.
    """

    def __init__(self, Parser parser):
        super().__init__()
        self.parser = parser

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """

        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        # hand the raw buffer to the parser, which copies it
        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)
cdef class GZIterator:
    def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
        '''iterate line-by-line through gzip (or bgzip)
        compressed file.
        '''
        if not os.path.exists(filename):
            raise IOError("No such file or directory: %s" % filename)

        filename = encode_filename(filename)
        cdef char *cfilename = filename
        with nogil:
            self.gzipfile = bgzf_open(cfilename, "r")
        self._filename = filename
        self.kstream = ks_init(self.gzipfile)
        self.encoding = encoding

        # pre-allocate the line buffer; kstring grows it as needed
        self.buffer.l = 0
        self.buffer.m = 0
        self.buffer.s = <char*>malloc(buffer_size)

    def __dealloc__(self):
        '''close file.'''
        if self.gzipfile != NULL:
            bgzf_close(self.gzipfile)
            self.gzipfile = NULL
        if self.buffer.s != NULL:
            free(self.buffer.s)
        if self.kstream != NULL:
            ks_destroy(self.kstream)

    def __iter__(self):
        return self

    cdef int __cnext__(self):
        '''read the next line into self.buffer.

        Returns the delimiter that terminated the line, or -1 at
        end of file.
        '''
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            if retval < 0:
                break
            else:
                return dret

        return -1

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        return force_str(self.buffer.s, self.encoding)


cdef class GZIteratorHead(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.
    '''

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        if self.buffer.s[0] == b'#':
            # return the comment line as raw bytes
            return self.buffer.s
        else:
            # first non-comment line terminates the header
            raise StopIteration


cdef class GZIteratorParsed(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning parsed rows.
    '''

    def __init__(self, parser):
        # NOTE(review): this override does not call
        # GZIterator.__init__, so no file is opened here -- confirm
        # how instances are constructed by callers.
        self.parser = parser

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration

        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)


def tabix_compress(filename_in,
                   filename_out,
                   force=False):
    '''compress *filename_in* writing the output to *filename_out*.

    Raise an IOError if *filename_out* already exists, unless *force*
    is set.
    '''

    if not force and os.path.exists(filename_out):
        raise IOError(
            "Filename '%s' already exists, use *force* to "
            "overwrite" % filename_out)

    cdef int WINDOW_SIZE
    cdef int c, r
    cdef void * buffer
    cdef BGZF * fp
    cdef int fd_src
    cdef bint is_empty = True
    cdef int O_RDONLY
    O_RDONLY = os.O_RDONLY

    WINDOW_SIZE = 64 * 1024

    fn = encode_filename(filename_out)
    cdef char *cfn = fn
    with nogil:
        fp = bgzf_open(cfn, "w")
    if fp == NULL:
        raise IOError("could not open '%s' for writing" % filename_out)

    fn = encode_filename(filename_in)
    fd_src = open(fn, O_RDONLY)
    # BUGFIX: open(2) signals failure with -1, not 0 (0 is a valid
    # file descriptor); the old check `fd_src == 0` never fired.
    if fd_src < 0:
        bgzf_close(fp)
        raise IOError("could not open '%s' for reading" % filename_in)

    buffer = malloc(WINDOW_SIZE)
    c = 1

    # copy the input in WINDOW_SIZE chunks through the bgzf writer
    while c > 0:
        with nogil:
            c = read(fd_src, buffer, WINDOW_SIZE)
            if c > 0:
                is_empty = False
            r = bgzf_write(fp, buffer, c)
        if r < 0:
            free(buffer)
            # BUGFIX: release the output handle and the source fd on
            # the error path (previously leaked).
            bgzf_close(fp)
            close(fd_src)
            raise IOError("writing failed")

    free(buffer)
    r = bgzf_close(fp)
    if r < 0:
        raise IOError("error %i when writing to file %s" % (r, filename_out))

    r = close(fd_src)
    # an empty file will return with -1, thus ignore this.
    if r < 0:
        if not (r == -1 and is_empty):
            raise IOError("error %i when closing file %s" % (r, filename_in))
def tabix_index(filename,
                force=False,
                seq_col=None,
                start_col=None,
                end_col=None,
                preset=None,
                meta_char="#",
                int line_skip=0,
                zerobased=False,
                int min_shift=-1,
                index=None,
                keep_original=False,
                csi=False,
                ):
    '''index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless *force* is set.

    The index will be built from coordinates in columns *seq_col*,
    *start_col* and *end_col*.

    The contents of *filename* have to be sorted by contig and
    position - the method does not check if the file is sorted.

    Column indices are 0-based. Note that this is different from the
    tabix command line utility where column indices start at 1.

    Coordinates in the file are assumed to be 1-based unless
    *zerobased* is set.

    If *preset* is provided, the column coordinates are taken from a
    preset. Valid values for preset are "gff", "bed", "sam", "vcf",
    psltbl", "pileup".

    Lines beginning with *meta_char* and the first *line_skip* lines
    will be skipped.

    If *filename* is not detected as a gzip file it will be automatically
    compressed. The original file will be removed and only the compressed
    file will be retained.

    By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
    is greater than zero and/or *csi* is True, creates a CSI index with a
    minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).

    *index* controls the filename which should be used for creating the index.
    If not set, the default is to append ``.tbi`` to *filename*.

    When automatically compressing files, if *keep_original* is set the
    uncompressed file will not be deleted.

    returns the filename of the compressed data

    '''

    if preset is None and \
       (seq_col is None or start_col is None or end_col is None):
        raise ValueError(
            "neither preset nor seq_col,start_col and end_col given")

    fn = encode_filename(filename)
    cdef char *cfn = fn

    # open once only to sniff the file format/compression
    cdef htsFile *fp = hts_open(cfn, "r")
    if fp == NULL:
        raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))

    cdef htsFormat fmt = hts_get_format(fp)[0]
    hts_close(fp)

    if fmt.compression == no_compression:
        # not bgzf-compressed yet: compress first, then index the
        # compressed copy
        tabix_compress(filename, filename + ".gz", force=force)
        if not keep_original:
            os.unlink(filename)
        filename += ".gz"
        fn = encode_filename(filename)
        cfn = fn

    # columns (1-based):
    #   preset-code, contig, start, end, metachar for
    #   comments, lines to ignore at beginning
    # 0 is a missing column
    preset2conf = {
        'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
        'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
        'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
        'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
        'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
    }

    conf_data = None
    if preset == "bcf" or fmt.format == bcf:
        # BCF files get a CSI index built by bcf_index_build2 below
        csi = True
    elif preset:
        try:
            conf_data = preset2conf[preset]
        except KeyError:
            raise KeyError(
                "unknown preset '%s', valid presets are '%s'" %
                (preset, ",".join(preset2conf.keys())))
    else:
        if end_col is None:
            end_col = -1

        preset = 0
        # tabix internally works with 0-based coordinates and
        # open/closed intervals.  When using a preset, conversion is
        # automatically taken care of.  Otherwise, the coordinates are
        # assumed to be 1-based closed intervals and -1 is subtracted
        # from the start coordinate. To avoid doing this, set the
        # TI_FLAG_UCSC=0x10000 flag:
        if zerobased:
            preset = preset | TBX_UCSC

        conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)

    cdef tbx_conf_t conf
    if conf_data:
        conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
    # NOTE(review): if preset == "bcf" but the file is NOT BCF format,
    # conf_data stays None and tbx_index_build2 below is called with an
    # uninitialized conf -- confirm this combination cannot be reached
    # by callers.

    if csi or min_shift > 0:
        suffix = ".csi"
        if min_shift <= 0: min_shift = 14
    else:
        suffix = ".tbi"
        min_shift = 0

    index = index or filename + suffix
    fn_index = encode_filename(index)

    if not force and os.path.exists(index):
        raise IOError(
            "filename '%s' already exists, use *force* to overwrite" % index)

    cdef char *fnidx = fn_index
    cdef int retval = 0

    if csi and fmt.format == bcf:
        with nogil:
            retval = bcf_index_build2(cfn, fnidx, min_shift)
    else:
        with nogil:
            retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)

    if retval != 0:
        raise OSError("building of index for {} failed".format(filename))

    return filename

# #########################################################
# cdef class
# NOTE(review): a commented-out legacy class (tabix_file_iterator_old, based
# on fdopen()/getline()) and a commented-out buildGzipError() helper stood
# here; both were dead code kept only in comments and have been condensed to
# this note — see version control history if they are ever needed again.

#########################################################
#########################################################
#########################################################
## Iterators for parsing through unindexed files.
#########################################################


cdef class tabix_file_iterator:
    '''iterate over a compressed or uncompressed ``infile``.

    Lines are read through BGZF on a duplicated file descriptor, comment
    lines (``#``) and empty lines are skipped, and every remaining line is
    handed to *parser* to produce the yielded value.
    '''

    def __cinit__(self,
                  infile,
                  Parser parser,
                  int buffer_size=65536):

        if infile.closed:
            raise ValueError("I/O operation on closed file.")

        self.infile = infile

        cdef int fd = PyObject_AsFileDescriptor(infile)
        if fd == -1:
            raise ValueError("I/O operation on closed file.")

        # dup() so closing the BGZF handle does not close the caller's fd.
        self.duplicated_fd = dup(fd)

        # From the manual:
        # gzopen can be used to read a file which is not in gzip format;
        # in this case gzread will directly read from the file without decompression.
        # When reading, this will be detected automatically by looking
        # for the magic two-byte gzip header.
        self.fh = bgzf_dopen(self.duplicated_fd, 'r')

        if self.fh == NULL:
            raise IOError('%s' % strerror(errno))

        self.kstream = ks_init(self.fh)

        self.buffer.s = malloc(buffer_size)
        #if self.buffer == NULL:
        #    raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
        #self.size = buffer_size
        self.parser = parser

    def __iter__(self):
        return self

    cdef __cnext__(self):
        # Fetch the next parsed record, or raise StopIteration at EOF/error.

        cdef char * b
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                # read up to the next newline into self.buffer
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            if retval < 0:
                break
                #raise IOError('gzip error: %s' % buildGzipError( self.fh ))

            b = self.buffer.s

            # skip comments
            if (b[0] == b'#'):
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # gzgets terminates at \n, no need to test

            # parser creates a copy
            return self.parser.parse(b, self.buffer.l)

        raise StopIteration

    def __dealloc__(self):
        # release buffer, kstream and BGZF handle acquired in __cinit__
        free(self.buffer.s)
        ks_destroy(self.kstream)
        bgzf_close(self.fh)

    def __next__(self):
        return self.__cnext__()


class tabix_generic_iterator:
    '''iterate over ``infile``.

    Permits the use of file-like objects for example from the gzip module.
    '''
    def __init__(self, infile, parser):

        self.infile = infile
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")
        self.parser = parser

    def __iter__(self):
        return self

    # cython version - required for python 3
    def __next__(self):

        cdef char * b
        cdef char * cpy
        cdef size_t nbytes

        encoding = self.parser.get_encoding()

        # note that GzipFile.close() does not close the file
        # reading is still possible.
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")

        while 1:

            line = self.infile.readline()
            if not line:
                break

            s = force_bytes(line, encoding)
            b = s
            nbytes = len(line)
            # bytes objects are NUL-terminated at the C level
            assert b[nbytes] == b'\0'

            # skip comments
            if b[0] == b'#':
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # make sure that entry is complete
            if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
                raise ValueError("incomplete line at %s" % line)

            # keep a Python reference alive while handing the raw pointer on
            bytes_cpy = b
            cpy = bytes_cpy

            return self.parser(cpy, nbytes)

        raise StopIteration


def tabix_iterator(infile, parser):
    """return an iterator over all entries in a file.

    Results are returned parsed as specified by the *parser*. If
    *parser* is None, the results are returned as an unparsed string.
    Otherwise, *parser* is assumed to be a functor that will return
    parsed data (see for example :class:`~pysam.asTuple` and
    :class:`~pysam.asGTF`).

    """
    return tabix_generic_iterator(infile, parser)


cdef class Tabixfile(TabixFile):
    """Tabixfile is deprecated: use TabixFile instead"""
    pass


# public API of this module
__all__ = [
    "tabix_index",
    "tabix_compress",
    "TabixFile",
    "Tabixfile",
    "asTuple",
    "asGTF",
    "asGFF3",
    "asVCF",
    "asBed",
    "GZIterator",
    "GZIteratorHead",
    "tabix_iterator",
    "tabix_generic_iterator",
    "tabix_file_iterator",
]