jpayne@69: # cython: language_level=3 jpayne@69: # cython: embedsignature=True jpayne@69: # cython: profile=True jpayne@69: ############################################################################### jpayne@69: ############################################################################### jpayne@69: # Cython wrapper for access to tabix indexed files in bgzf format jpayne@69: ############################################################################### jpayne@69: # The principal classes and functions defined in this module are: jpayne@69: # jpayne@69: # class TabixFile class wrapping tabix indexed files in bgzf format jpayne@69: # jpayne@69: # class asTuple Parser class for tuples jpayne@69: # class asGTF Parser class for GTF formatted rows jpayne@69: # class asGFF3 Parser class for GFF3 formatted rows jpayne@69: # class asBed Parser class for Bed formatted rows jpayne@69: # class asVCF Parser class for VCF formatted rows jpayne@69: # jpayne@69: # class tabix_generic_iterator Streamed iterator of bgzf formatted files jpayne@69: # jpayne@69: # Additionally this module defines several additional classes that are part jpayne@69: # of the internal API. 
These are: jpayne@69: # jpayne@69: # class Parser base class for parsers of tab-separated rows jpayne@69: # class tabix_file_iterator jpayne@69: # class TabixIterator iterator class over rows in bgzf file jpayne@69: # class EmptyIterator jpayne@69: # jpayne@69: # For backwards compatibility, the following classes are also defined: jpayne@69: # jpayne@69: # class Tabixfile equivalent to TabixFile jpayne@69: # jpayne@69: ############################################################################### jpayne@69: # jpayne@69: # The MIT License jpayne@69: # jpayne@69: # Copyright (c) 2015 Andreas Heger jpayne@69: # jpayne@69: # Permission is hereby granted, free of charge, to any person obtaining a jpayne@69: # copy of this software and associated documentation files (the "Software"), jpayne@69: # to deal in the Software without restriction, including without limitation jpayne@69: # the rights to use, copy, modify, merge, publish, distribute, sublicense, jpayne@69: # and/or sell copies of the Software, and to permit persons to whom the jpayne@69: # Software is furnished to do so, subject to the following conditions: jpayne@69: # jpayne@69: # The above copyright notice and this permission notice shall be included in jpayne@69: # all copies or substantial portions of the Software. jpayne@69: # jpayne@69: # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR jpayne@69: # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, jpayne@69: # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL jpayne@69: # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER jpayne@69: # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING jpayne@69: # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER jpayne@69: # DEALINGS IN THE SOFTWARE. 
#
###############################################################################
import os
import sys

from libc.stdio cimport printf, fprintf, stderr
from libc.string cimport strerror
from libc.errno cimport errno
from posix.unistd cimport dup

from cpython cimport PyErr_SetString, PyBytes_Check, \
    PyUnicode_Check, PyBytes_FromStringAndSize, \
    PyObject_AsFileDescriptor

cimport pysam.libctabixproxies as ctabixproxies

from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
    BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
    tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
    tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
    tbx_destroy, hisremote, region_list, hts_getline, \
    TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
    no_compression, bcf, bcf_index_build2

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size


cdef class Parser:
    '''Abstract base class for parsers of tab-separated rows.

    A parser is a functor: calling it with a raw ``char *`` buffer and
    its length returns a parsed row object.  Subclasses implement the
    :meth:`parse` method.  *encoding* is used by the row proxies when
    decoding byte fields into python strings.
    '''

    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def set_encoding(self, encoding):
        # change the encoding used when decoding fields
        self.encoding = encoding

    def get_encoding(self):
        # return the encoding currently in use
        return self.encoding

    cdef parse(self, char * buffer, int length):
        # to be implemented by subclasses
        raise NotImplementedError(
            'parse method of %s not implemented' % str(self))

    def __call__(self, char * buffer, int length):
        return self.parse(buffer, length)


cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    A field in a row is accessed by numeric index.
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.TupleProxy r
        r = ctabixproxies.TupleProxy(self.encoding)
        # need to copy - there were some
        # persistence issues with "present"
        r.copy(buffer, len)
        return r


cdef class asGFF3(Parser):
    '''converts a :term:`tabix row` into a GFF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    .. note::
        NOTE(review): the GFF3 specification defines column 2 as
        *source* and column 3 as *type* (feature); the table above has
        them the other way around -- confirm against the field order
        of :class:`ctabixproxies.GFF3Proxy`.
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GFF3Proxy r
        r = ctabixproxies.GFF3Proxy(self.encoding)
        # copy the buffer so the proxy owns its own data
        r.copy(buffer, len)
        return r
cdef class asGTF(Parser):
    '''converts a :term:`tabix row` into a GTF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    GTF formatted entries also define the following fields that
    are derived from the attributes field:

    +--------------------+------------------------------+
    |*Name*              |*Content*                     |
    +--------------------+------------------------------+
    |gene_id             |the gene identifier           |
    +--------------------+------------------------------+
    |transcript_id       |the transcript identifier     |
    +--------------------+------------------------------+

    .. note::
        NOTE(review): the GTF specification defines column 2 as
        *source* and column 3 as *feature*; the table above has them
        swapped -- confirm against the field order of
        :class:`ctabixproxies.GTFProxy`.
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GTFProxy r
        r = ctabixproxies.GTFProxy(self.encoding)
        # copy the buffer so the proxy owns its own data
        r.copy(buffer, len)
        return r


cdef class asBed(Parser):
    '''converts a :term:`tabix row` into a bed record
    with the following fields:

    +-----------+-----------+------------------------------------------+
    |*Column*   |*Field*    |*Contents*                                |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |1          |contig     |contig                                    |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |2          |start      |genomic start coordinate (zero-based)     |
    +-----------+-----------+------------------------------------------+
    |3          |end        |genomic end coordinate plus one           |
    |           |           |(zero-based)                              |
    +-----------+-----------+------------------------------------------+
    |4          |name       |name of feature.                          |
    +-----------+-----------+------------------------------------------+
    |5          |score      |score of feature                          |
    +-----------+-----------+------------------------------------------+
    |6          |strand     |strand of feature                         |
    +-----------+-----------+------------------------------------------+
    |7          |thickStart |thickStart                                |
    +-----------+-----------+------------------------------------------+
    |8          |thickEnd   |thickEnd                                  |
    +-----------+-----------+------------------------------------------+
    |9          |itemRGB    |itemRGB                                   |
    +-----------+-----------+------------------------------------------+
    |10         |blockCount |number of bocks                           |
    +-----------+-----------+------------------------------------------+
    |11         |blockSizes |',' separated string of block sizes       |
    +-----------+-----------+------------------------------------------+
    |12         |blockStarts|',' separated string of block genomic     |
    |           |           |start positions                           |
    +-----------+-----------+------------------------------------------+

    Only the first three fields are required. Additional
    fields are optional, but if one is defined, all the preceding
    need to be defined as well.

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.BedProxy r
        r = ctabixproxies.BedProxy(self.encoding)
        # copy the buffer so the proxy owns its own data
        r.copy(buffer, len)
        return r


cdef class asVCF(Parser):
    '''converts a :term:`tabix row` into a VCF record with
    the following fields:

    +----------+---------+------------------------------------+
    |*Column*  |*Field*  |*Contents*                          |
    |          |         |                                    |
    +----------+---------+------------------------------------+
    |1         |contig   |chromosome                          |
    +----------+---------+------------------------------------+
    |2         |pos      |chromosomal position, zero-based    |
    +----------+---------+------------------------------------+
    |3         |id       |id                                  |
    +----------+---------+------------------------------------+
    |4         |ref      |reference allele                    |
    +----------+---------+------------------------------------+
    |5         |alt      |alternate alleles                   |
    +----------+---------+------------------------------------+
    |6         |qual     |quality                             |
    +----------+---------+------------------------------------+
    |7         |filter   |filter                              |
    +----------+---------+------------------------------------+
    |8         |info     |info                                |
    +----------+---------+------------------------------------+
    |9         |format   |format specifier.                   |
    +----------+---------+------------------------------------+

    Access to genotypes is via index::

        contig = vcf.contig
        first_sample_genotype = vcf[0]
        second_sample_genotype = vcf[1]

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.VCFProxy r
        r = ctabixproxies.VCFProxy(self.encoding)
        # copy the buffer so the proxy owns its own data
        r.copy(buffer, len)
        return r
| jpayne@69: +----------+---------+------------------------------------+ jpayne@69: jpayne@69: Access to genotypes is via index:: jpayne@69: jpayne@69: contig = vcf.contig jpayne@69: first_sample_genotype = vcf[0] jpayne@69: second_sample_genotype = vcf[1] jpayne@69: jpayne@69: ''' jpayne@69: cdef parse(self, char * buffer, int len): jpayne@69: cdef ctabixproxies.VCFProxy r jpayne@69: r = ctabixproxies.VCFProxy(self.encoding) jpayne@69: r.copy(buffer, len) jpayne@69: return r jpayne@69: jpayne@69: jpayne@69: cdef class TabixFile: jpayne@69: """Random access to bgzf formatted files that jpayne@69: have been indexed by :term:`tabix`. jpayne@69: jpayne@69: The file is automatically opened. The index file of file jpayne@69: ```` is expected to be called ``.tbi`` jpayne@69: by default (see parameter `index`). jpayne@69: jpayne@69: Parameters jpayne@69: ---------- jpayne@69: jpayne@69: filename : string jpayne@69: Filename of bgzf file to be opened. jpayne@69: jpayne@69: index : string jpayne@69: The filename of the index. If not set, the default is to jpayne@69: assume that the index is called ``filename.tbi`` jpayne@69: jpayne@69: mode : char jpayne@69: The file opening mode. Currently, only ``r`` is permitted. jpayne@69: jpayne@69: parser : :class:`pysam.Parser` jpayne@69: jpayne@69: sets the default parser for this tabix file. If `parser` jpayne@69: is None, the results are returned as an unparsed string. jpayne@69: Otherwise, `parser` is assumed to be a functor that will return jpayne@69: parsed data (see for example :class:`~pysam.asTuple` and jpayne@69: :class:`~pysam.asGTF`). jpayne@69: jpayne@69: encoding : string jpayne@69: jpayne@69: The encoding passed to the parser jpayne@69: jpayne@69: threads: integer jpayne@69: Number of threads to use for decompressing Tabix files. jpayne@69: (Default=1) jpayne@69: jpayne@69: jpayne@69: Raises jpayne@69: ------ jpayne@69: jpayne@69: ValueError jpayne@69: if index file is missing. 
jpayne@69: jpayne@69: IOError jpayne@69: if file could not be opened jpayne@69: """ jpayne@69: def __cinit__(self, jpayne@69: filename, jpayne@69: mode='r', jpayne@69: parser=None, jpayne@69: index=None, jpayne@69: encoding="ascii", jpayne@69: threads=1, jpayne@69: *args, jpayne@69: **kwargs ): jpayne@69: jpayne@69: self.htsfile = NULL jpayne@69: self.is_remote = False jpayne@69: self.is_stream = False jpayne@69: self.parser = parser jpayne@69: self.threads = threads jpayne@69: self._open(filename, mode, index, *args, **kwargs) jpayne@69: self.encoding = encoding jpayne@69: jpayne@69: def _open( self, jpayne@69: filename, jpayne@69: mode='r', jpayne@69: index=None, jpayne@69: threads=1, jpayne@69: ): jpayne@69: '''open a :term:`tabix file` for reading.''' jpayne@69: jpayne@69: if mode != 'r': jpayne@69: raise ValueError("invalid file opening mode `%s`" % mode) jpayne@69: jpayne@69: if self.htsfile != NULL: jpayne@69: self.close() jpayne@69: self.htsfile = NULL jpayne@69: self.threads=threads jpayne@69: jpayne@69: filename_index = index or (filename + ".tbi") jpayne@69: # encode all the strings to pass to tabix jpayne@69: self.filename = encode_filename(filename) jpayne@69: self.filename_index = encode_filename(filename_index) jpayne@69: jpayne@69: self.is_stream = self.filename == b'-' jpayne@69: self.is_remote = hisremote(self.filename) jpayne@69: jpayne@69: if not self.is_remote: jpayne@69: if not os.path.exists(filename): jpayne@69: raise IOError("file `%s` not found" % filename) jpayne@69: jpayne@69: if not os.path.exists(filename_index): jpayne@69: raise IOError("index `%s` not found" % filename_index) jpayne@69: jpayne@69: # open file jpayne@69: cdef char *cfilename = self.filename jpayne@69: cdef char *cfilename_index = self.filename_index jpayne@69: with nogil: jpayne@69: self.htsfile = hts_open(cfilename, 'r') jpayne@69: jpayne@69: if self.htsfile == NULL: jpayne@69: raise IOError("could not open file `%s`" % filename) jpayne@69: jpayne@69: #if 
self.htsfile.format.category != region_list: jpayne@69: # raise ValueError("file does not contain region data") jpayne@69: jpayne@69: with nogil: jpayne@69: self.index = tbx_index_load2(cfilename, cfilename_index) jpayne@69: jpayne@69: if self.index == NULL: jpayne@69: raise IOError("could not open index for `%s`" % filename) jpayne@69: jpayne@69: if not self.is_stream: jpayne@69: self.start_offset = self.tell() jpayne@69: jpayne@69: def _dup(self): jpayne@69: '''return a copy of this tabix file. jpayne@69: jpayne@69: The file is being re-opened. jpayne@69: ''' jpayne@69: return TabixFile(self.filename, jpayne@69: mode="r", jpayne@69: threads=self.threads, jpayne@69: parser=self.parser, jpayne@69: index=self.filename_index, jpayne@69: encoding=self.encoding) jpayne@69: jpayne@69: def fetch(self, jpayne@69: reference=None, jpayne@69: start=None, jpayne@69: end=None, jpayne@69: region=None, jpayne@69: parser=None, jpayne@69: multiple_iterators=False): jpayne@69: '''fetch one or more rows in a :term:`region` using 0-based jpayne@69: indexing. The region is specified by :term:`reference`, jpayne@69: *start* and *end*. Alternatively, a samtools :term:`region` jpayne@69: string can be supplied. jpayne@69: jpayne@69: Without *reference* or *region* all entries will be fetched. jpayne@69: jpayne@69: If only *reference* is set, all reads matching on *reference* jpayne@69: will be fetched. jpayne@69: jpayne@69: If *parser* is None, the default parser will be used for jpayne@69: parsing. jpayne@69: jpayne@69: Set *multiple_iterators* to true if you will be using multiple jpayne@69: iterators on the same file at the same time. The iterator jpayne@69: returned will receive its own copy of a filehandle to the file jpayne@69: effectively re-opening the file. Re-opening a file creates jpayne@69: some overhead, so beware. 
jpayne@69: jpayne@69: ''' jpayne@69: if not self.is_open(): jpayne@69: raise ValueError("I/O operation on closed file") jpayne@69: jpayne@69: # convert coordinates to region string, which is one-based jpayne@69: if reference: jpayne@69: if end is not None: jpayne@69: if end < 0: jpayne@69: raise ValueError("end out of range (%i)" % end) jpayne@69: if start is None: jpayne@69: start = 0 jpayne@69: jpayne@69: if start < 0: jpayne@69: raise ValueError("start out of range (%i)" % end) jpayne@69: elif start > end: jpayne@69: raise ValueError( jpayne@69: 'start (%i) >= end (%i)' % (start, end)) jpayne@69: elif start == end: jpayne@69: return EmptyIterator() jpayne@69: else: jpayne@69: region = '%s:%i-%i' % (reference, start + 1, end) jpayne@69: elif start is not None: jpayne@69: if start < 0: jpayne@69: raise ValueError("start out of range (%i)" % end) jpayne@69: region = '%s:%i' % (reference, start + 1) jpayne@69: else: jpayne@69: region = reference jpayne@69: jpayne@69: # get iterator jpayne@69: cdef hts_itr_t * itr jpayne@69: cdef char *cstr jpayne@69: cdef TabixFile fileobj jpayne@69: jpayne@69: # reopen the same file if necessary jpayne@69: if multiple_iterators: jpayne@69: fileobj = self._dup() jpayne@69: else: jpayne@69: fileobj = self jpayne@69: jpayne@69: if region is None: jpayne@69: # without region or reference - iterate from start jpayne@69: with nogil: jpayne@69: itr = tbx_itr_queryi(fileobj.index, jpayne@69: HTS_IDX_START, jpayne@69: 0, jpayne@69: 0) jpayne@69: else: jpayne@69: s = force_bytes(region, encoding=fileobj.encoding) jpayne@69: cstr = s jpayne@69: with nogil: jpayne@69: itr = tbx_itr_querys(fileobj.index, cstr) jpayne@69: jpayne@69: if itr == NULL: jpayne@69: if region is None: jpayne@69: if len(self.contigs) > 0: jpayne@69: # when accessing a tabix file created prior tabix 1.0 jpayne@69: # the full-file iterator is empty. 
jpayne@69: raise ValueError( jpayne@69: "could not create iterator, possible " jpayne@69: "tabix version mismatch") jpayne@69: else: jpayne@69: # possible reason is that the file is empty - jpayne@69: # return an empty iterator jpayne@69: return EmptyIterator() jpayne@69: else: jpayne@69: raise ValueError( jpayne@69: "could not create iterator for region '%s'" % jpayne@69: region) jpayne@69: jpayne@69: # use default parser if no parser is specified jpayne@69: if parser is None: jpayne@69: parser = fileobj.parser jpayne@69: jpayne@69: cdef TabixIterator a jpayne@69: if parser is None: jpayne@69: a = TabixIterator(encoding=fileobj.encoding) jpayne@69: else: jpayne@69: parser.set_encoding(fileobj.encoding) jpayne@69: a = TabixIteratorParsed(parser) jpayne@69: jpayne@69: a.tabixfile = fileobj jpayne@69: a.iterator = itr jpayne@69: jpayne@69: return a jpayne@69: jpayne@69: ############################################################### jpayne@69: ############################################################### jpayne@69: ############################################################### jpayne@69: ## properties jpayne@69: ############################################################### jpayne@69: property header: jpayne@69: '''the file header. jpayne@69: jpayne@69: The file header consists of the lines at the beginning of a jpayne@69: file that are prefixed by the comment character ``#``. jpayne@69: jpayne@69: .. note:: jpayne@69: The header is returned as an iterator presenting lines jpayne@69: without the newline character. 
jpayne@69: ''' jpayne@69: jpayne@69: def __get__(self): jpayne@69: jpayne@69: cdef char *cfilename = self.filename jpayne@69: cdef char *cfilename_index = self.filename_index jpayne@69: jpayne@69: cdef kstring_t buffer jpayne@69: buffer.l = buffer.m = 0 jpayne@69: buffer.s = NULL jpayne@69: jpayne@69: cdef htsFile * fp = NULL jpayne@69: cdef int KS_SEP_LINE = 2 jpayne@69: cdef tbx_t * tbx = NULL jpayne@69: lines = [] jpayne@69: with nogil: jpayne@69: fp = hts_open(cfilename, 'r') jpayne@69: jpayne@69: if fp == NULL: jpayne@69: raise OSError("could not open {} for reading header".format(self.filename)) jpayne@69: jpayne@69: with nogil: jpayne@69: tbx = tbx_index_load2(cfilename, cfilename_index) jpayne@69: jpayne@69: if tbx == NULL: jpayne@69: raise OSError("could not load .tbi/.csi index of {}".format(self.filename)) jpayne@69: jpayne@69: while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0: jpayne@69: if not buffer.l or buffer.s[0] != tbx.conf.meta_char: jpayne@69: break jpayne@69: lines.append(force_str(buffer.s, self.encoding)) jpayne@69: jpayne@69: with nogil: jpayne@69: hts_close(fp) jpayne@69: free(buffer.s) jpayne@69: jpayne@69: return lines jpayne@69: jpayne@69: property contigs: jpayne@69: '''list of chromosome names''' jpayne@69: def __get__(self): jpayne@69: cdef const char ** sequences jpayne@69: cdef int nsequences jpayne@69: jpayne@69: with nogil: jpayne@69: sequences = tbx_seqnames(self.index, &nsequences) jpayne@69: cdef int x jpayne@69: result = [] jpayne@69: for x from 0 <= x < nsequences: jpayne@69: result.append(force_str(sequences[x])) jpayne@69: jpayne@69: # htslib instructions: jpayne@69: # only free container, not the sequences themselves jpayne@69: free(sequences) jpayne@69: jpayne@69: return result jpayne@69: jpayne@69: def close(self): jpayne@69: ''' jpayne@69: closes the :class:`pysam.TabixFile`.''' jpayne@69: if self.htsfile != NULL: jpayne@69: hts_close(self.htsfile) jpayne@69: self.htsfile = NULL jpayne@69: if self.index != NULL: 
cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """

    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def __iter__(self):
        # reset the line buffer; memory is (re)allocated lazily by
        # tbx_itr_next through the kstring API.
        # NOTE(review): calling __iter__ again after iteration has
        # started drops the previous buffer.s pointer without freeing
        # it -- confirm callers iterate only once per iterator.
        self.buffer.s = NULL
        self.buffer.l = 0
        self.buffer.m = 0

        return self

    cdef int __cnext__(self):
        '''iterate to next element.

        Return -5 if file has been closed when this function
        was called.
        '''
        if self.tabixfile.htsfile == NULL:
            return -5

        cdef int retval

        # loop to skip over comment lines (first byte == '#')
        while 1:
            with nogil:
                retval = tbx_itr_next(
                    self.tabixfile.htsfile,
                    self.tabixfile.index,
                    self.iterator,
                    &self.buffer)

            if retval < 0:
                # end of iteration or error - propagate to caller
                break

            if self.buffer.s[0] != b'#':
                # found a data line
                break

        return retval

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """

        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        return charptr_to_str(self.buffer.s, self.encoding)

    def __dealloc__(self):
        if self.iterator != NULL:
            tbx_itr_destroy(self.iterator)
        if self.buffer.s != NULL:
            free(self.buffer.s)


class EmptyIterator:
    '''empty iterator'''

    def __iter__(self):
        return self

    def __next__(self):
        raise StopIteration()


cdef class TabixIteratorParsed(TabixIterator):
    """iterates over mapped reads in a region.

    The *parser* determines the encoding.

    Returns parsed data.
    """

    def __init__(self, Parser parser):
        super().__init__()
        self.parser = parser

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """

        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        # hand the raw buffer to the parser, which copies it
        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)
cdef class GZIterator:
    def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
        '''iterate line-by-line through gzip (or bgzip)
        compressed file.
        '''
        if not os.path.exists(filename):
            raise IOError("No such file or directory: %s" % filename)

        filename = encode_filename(filename)
        cdef char *cfilename = filename
        with nogil:
            self.gzipfile = bgzf_open(cfilename, "r")
        self._filename = filename
        self.kstream = ks_init(self.gzipfile)
        self.encoding = encoding

        # pre-allocate the line buffer; kstring grows it as needed
        self.buffer.l = 0
        self.buffer.m = 0
        self.buffer.s = <char*>malloc(buffer_size)

    def __dealloc__(self):
        '''close file.'''
        if self.gzipfile != NULL:
            bgzf_close(self.gzipfile)
            self.gzipfile = NULL
        if self.buffer.s != NULL:
            free(self.buffer.s)
        if self.kstream != NULL:
            ks_destroy(self.kstream)

    def __iter__(self):
        return self

    cdef int __cnext__(self):
        '''read the next line into self.buffer.

        Returns the delimiter that terminated the line, or -1 at
        end of file.
        '''
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            if retval < 0:
                break
            else:
                return dret

        return -1

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        return force_str(self.buffer.s, self.encoding)


cdef class GZIteratorHead(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.
    '''

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        if self.buffer.s[0] == b'#':
            # return the comment line as raw bytes
            return self.buffer.s
        else:
            # first non-comment line terminates the header
            raise StopIteration


cdef class GZIteratorParsed(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning parsed rows.
    '''

    def __init__(self, parser):
        # NOTE(review): this override does not call
        # GZIterator.__init__, so no file is opened here -- confirm
        # how instances are constructed by callers.
        self.parser = parser

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration

        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)


def tabix_compress(filename_in,
                   filename_out,
                   force=False):
    '''compress *filename_in* writing the output to *filename_out*.

    Raise an IOError if *filename_out* already exists, unless *force*
    is set.
    '''

    if not force and os.path.exists(filename_out):
        raise IOError(
            "Filename '%s' already exists, use *force* to "
            "overwrite" % filename_out)

    cdef int WINDOW_SIZE
    cdef int c, r
    cdef void * buffer
    cdef BGZF * fp
    cdef int fd_src
    cdef bint is_empty = True
    cdef int O_RDONLY
    O_RDONLY = os.O_RDONLY

    WINDOW_SIZE = 64 * 1024

    fn = encode_filename(filename_out)
    cdef char *cfn = fn
    with nogil:
        fp = bgzf_open(cfn, "w")
    if fp == NULL:
        raise IOError("could not open '%s' for writing" % filename_out)

    fn = encode_filename(filename_in)
    fd_src = open(fn, O_RDONLY)
    # BUGFIX: open(2) signals failure with -1, not 0 (0 is a valid
    # file descriptor); the old check `fd_src == 0` never fired.
    if fd_src < 0:
        bgzf_close(fp)
        raise IOError("could not open '%s' for reading" % filename_in)

    buffer = malloc(WINDOW_SIZE)
    c = 1

    # copy the input in WINDOW_SIZE chunks through the bgzf writer
    while c > 0:
        with nogil:
            c = read(fd_src, buffer, WINDOW_SIZE)
            if c > 0:
                is_empty = False
            r = bgzf_write(fp, buffer, c)
        if r < 0:
            free(buffer)
            # BUGFIX: release the output handle and the source fd on
            # the error path (previously leaked).
            bgzf_close(fp)
            close(fd_src)
            raise IOError("writing failed")

    free(buffer)
    r = bgzf_close(fp)
    if r < 0:
        raise IOError("error %i when writing to file %s" % (r, filename_out))

    r = close(fd_src)
    # an empty file will return with -1, thus ignore this.
    if r < 0:
        if not (r == -1 and is_empty):
            raise IOError("error %i when closing file %s" % (r, filename_in))
def tabix_index(filename,
                force=False,
                seq_col=None,
                start_col=None,
                end_col=None,
                preset=None,
                meta_char="#",
                int line_skip=0,
                zerobased=False,
                int min_shift=-1,
                index=None,
                keep_original=False,
                csi=False,
                ):
    '''index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless *force* is set.

    The index will be built from coordinates in columns *seq_col*,
    *start_col* and *end_col*.

    The contents of *filename* have to be sorted by contig and
    position - the method does not check if the file is sorted.

    Column indices are 0-based. Note that this is different from the
    tabix command line utility where column indices start at 1.

    Coordinates in the file are assumed to be 1-based unless
    *zerobased* is set.

    If *preset* is provided, the column coordinates are taken from a
    preset. Valid values for preset are "gff", "bed", "sam", "vcf",
    psltbl", "pileup".

    Lines beginning with *meta_char* and the first *line_skip* lines
    will be skipped.

    If *filename* is not detected as a gzip file it will be automatically
    compressed. The original file will be removed and only the compressed
    file will be retained.

    By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
    is greater than zero and/or *csi* is True, creates a CSI index with a
    minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).

    *index* controls the filename which should be used for creating the index.
    If not set, the default is to append ``.tbi`` to *filename*.

    When automatically compressing files, if *keep_original* is set the
    uncompressed file will not be deleted.

    returns the filename of the compressed data

    '''

    if preset is None and \
       (seq_col is None or start_col is None or end_col is None):
        raise ValueError(
            "neither preset nor seq_col,start_col and end_col given")

    fn = encode_filename(filename)
    cdef char *cfn = fn

    # open once only to sniff the file format/compression
    cdef htsFile *fp = hts_open(cfn, "r")
    if fp == NULL:
        raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))

    cdef htsFormat fmt = hts_get_format(fp)[0]
    hts_close(fp)

    if fmt.compression == no_compression:
        # not bgzf-compressed yet: compress first, then index the
        # compressed copy
        tabix_compress(filename, filename + ".gz", force=force)
        if not keep_original:
            os.unlink(filename)
        filename += ".gz"
        fn = encode_filename(filename)
        cfn = fn

    # columns (1-based):
    #   preset-code, contig, start, end, metachar for
    #   comments, lines to ignore at beginning
    # 0 is a missing column
    preset2conf = {
        'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
        'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
        'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
        'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
        'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
    }

    conf_data = None
    if preset == "bcf" or fmt.format == bcf:
        # BCF files get a CSI index built by bcf_index_build2 below
        csi = True
    elif preset:
        try:
            conf_data = preset2conf[preset]
        except KeyError:
            raise KeyError(
                "unknown preset '%s', valid presets are '%s'" %
                (preset, ",".join(preset2conf.keys())))
    else:
        if end_col is None:
            end_col = -1

        preset = 0
        # tabix internally works with 0-based coordinates and
        # open/closed intervals.  When using a preset, conversion is
        # automatically taken care of.  Otherwise, the coordinates are
        # assumed to be 1-based closed intervals and -1 is subtracted
        # from the start coordinate. To avoid doing this, set the
        # TI_FLAG_UCSC=0x10000 flag:
        if zerobased:
            preset = preset | TBX_UCSC

        conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)

    cdef tbx_conf_t conf
    if conf_data:
        conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
    # NOTE(review): if preset == "bcf" but the file is NOT BCF format,
    # conf_data stays None and tbx_index_build2 below is called with an
    # uninitialized conf -- confirm this combination cannot be reached
    # by callers.

    if csi or min_shift > 0:
        suffix = ".csi"
        if min_shift <= 0: min_shift = 14
    else:
        suffix = ".tbi"
        min_shift = 0

    index = index or filename + suffix
    fn_index = encode_filename(index)

    if not force and os.path.exists(index):
        raise IOError(
            "filename '%s' already exists, use *force* to overwrite" % index)

    cdef char *fnidx = fn_index
    cdef int retval = 0

    if csi and fmt.format == bcf:
        with nogil:
            retval = bcf_index_build2(cfn, fnidx, min_shift)
    else:
        with nogil:
            retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)

    if retval != 0:
        raise OSError("building of index for {} failed".format(filename))

    return filename

# #########################################################
# cdef class
# NOTE(review): a commented-out legacy class (tabix_file_iterator_old, based
# on fdopen()/getline()) and a commented-out buildGzipError() helper stood
# here; both were dead code kept only in comments and have been condensed to
# this note — see version control history if they are ever needed again.

#########################################################
#########################################################
#########################################################
## Iterators for parsing through unindexed files.
#########################################################


cdef class tabix_file_iterator:
    '''iterate over a compressed or uncompressed ``infile``.

    Lines are read through BGZF on a duplicated file descriptor, comment
    lines (``#``) and empty lines are skipped, and every remaining line is
    handed to *parser* to produce the yielded value.
    '''

    def __cinit__(self,
                  infile,
                  Parser parser,
                  int buffer_size=65536):

        if infile.closed:
            raise ValueError("I/O operation on closed file.")

        self.infile = infile

        cdef int fd = PyObject_AsFileDescriptor(infile)
        if fd == -1:
            raise ValueError("I/O operation on closed file.")

        # dup() so closing the BGZF handle does not close the caller's fd.
        self.duplicated_fd = dup(fd)

        # From the manual:
        # gzopen can be used to read a file which is not in gzip format;
        # in this case gzread will directly read from the file without decompression.
        # When reading, this will be detected automatically by looking
        # for the magic two-byte gzip header.
        self.fh = bgzf_dopen(self.duplicated_fd, 'r')

        if self.fh == NULL:
            raise IOError('%s' % strerror(errno))

        self.kstream = ks_init(self.fh)

        self.buffer.s = malloc(buffer_size)
        #if self.buffer == NULL:
        #    raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
        #self.size = buffer_size
        self.parser = parser

    def __iter__(self):
        return self

    cdef __cnext__(self):
        # Fetch the next parsed record, or raise StopIteration at EOF/error.

        cdef char * b
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                # read up to the next newline into self.buffer
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            if retval < 0:
                break
                #raise IOError('gzip error: %s' % buildGzipError( self.fh ))

            b = self.buffer.s

            # skip comments
            if (b[0] == b'#'):
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # gzgets terminates at \n, no need to test

            # parser creates a copy
            return self.parser.parse(b, self.buffer.l)

        raise StopIteration

    def __dealloc__(self):
        # release buffer, kstream and BGZF handle acquired in __cinit__
        free(self.buffer.s)
        ks_destroy(self.kstream)
        bgzf_close(self.fh)

    def __next__(self):
        return self.__cnext__()


class tabix_generic_iterator:
    '''iterate over ``infile``.

    Permits the use of file-like objects for example from the gzip module.
    '''
    def __init__(self, infile, parser):

        self.infile = infile
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")
        self.parser = parser

    def __iter__(self):
        return self

    # cython version - required for python 3
    def __next__(self):

        cdef char * b
        cdef char * cpy
        cdef size_t nbytes

        encoding = self.parser.get_encoding()

        # note that GzipFile.close() does not close the file
        # reading is still possible.
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")

        while 1:

            line = self.infile.readline()
            if not line:
                break

            s = force_bytes(line, encoding)
            b = s
            nbytes = len(line)
            # bytes objects are NUL-terminated at the C level
            assert b[nbytes] == b'\0'

            # skip comments
            if b[0] == b'#':
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # make sure that entry is complete
            if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
                raise ValueError("incomplete line at %s" % line)

            # keep a Python reference alive while handing the raw pointer on
            bytes_cpy = b
            cpy = bytes_cpy

            return self.parser(cpy, nbytes)

        raise StopIteration


def tabix_iterator(infile, parser):
    """return an iterator over all entries in a file.

    Results are returned parsed as specified by the *parser*. If
    *parser* is None, the results are returned as an unparsed string.
    Otherwise, *parser* is assumed to be a functor that will return
    parsed data (see for example :class:`~pysam.asTuple` and
    :class:`~pysam.asGTF`).

    """
    return tabix_generic_iterator(infile, parser)


cdef class Tabixfile(TabixFile):
    """Tabixfile is deprecated: use TabixFile instead"""
    pass


# public API of this module
__all__ = [
    "tabix_index",
    "tabix_compress",
    "TabixFile",
    "Tabixfile",
    "asTuple",
    "asGTF",
    "asGFF3",
    "asVCF",
    "asBed",
    "GZIterator",
    "GZIteratorHead",
    "tabix_iterator",
    "tabix_generic_iterator",
    "tabix_file_iterator",
]