Mercurial > repos > rliterman > csp2

diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author: jpayne
date: Tue, 18 Mar 2025 17:55:14 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx	Tue Mar 18 17:55:14 2025 -0400
@@ -0,0 +1,1271 @@
+# cython: language_level=3
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+# Cython wrapper for access to tabix indexed files in bgzf format
+###############################################################################
+# The principal classes and functions defined in this module are:
+#
+# class TabixFile  class wrapping tabix indexed files in bgzf format
+#
+# class asTuple  Parser class for tuples
+# class asGTF    Parser class for GTF formatted rows
+# class asGFF3   Parser class for GFF3 formatted rows
+# class asBed    Parser class for Bed formatted rows
+# class asVCF    Parser class for VCF formatted rows
+#
+# class tabix_generic_iterator  Streamed iterator of bgzf formatted files
+#
+# Additionally this module defines several additional classes that are part
+# of the internal API. These are:
+#
+# class Parser                base class for parsers of tab-separated rows
+# class tabix_file_iterator
+# class TabixIterator         iterator class over rows in bgzf file
+# class EmptyIterator
+#
+# For backwards compatibility, the following classes are also defined:
+#
+# class Tabixfile   equivalent to TabixFile
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+import os
+import sys
+
+from libc.stdio cimport printf, fprintf, stderr
+from libc.string cimport strerror
+from libc.errno cimport errno
+from posix.unistd cimport dup
+
+from cpython cimport PyErr_SetString, PyBytes_Check, \
+    PyUnicode_Check, PyBytes_FromStringAndSize, \
+    PyObject_AsFileDescriptor
+
+cimport pysam.libctabixproxies as ctabixproxies
+
+from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
+    BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
+    tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
+    tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
+    tbx_destroy, hisremote, region_list, hts_getline, \
+    TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
+    no_compression, bcf, bcf_index_build2
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+cdef class Parser:
+
+    def __init__(self, encoding="ascii"):
+        self.encoding = encoding
+
+    def set_encoding(self, encoding):
+        self.encoding = encoding
+
+    def get_encoding(self):
+        return self.encoding
+
+    cdef parse(self, char * buffer, int length):
+        raise NotImplementedError(
+            'parse method of %s not implemented' % str(self))
+
+    def __call__(self, char * buffer, int length):
+        return self.parse(buffer, length)
+
+
+cdef class asTuple(Parser):
+    '''converts a :term:`tabix row` into a python tuple.
+
+    A field in a row is accessed by numeric index.
+    ''' 
+    cdef parse(self, char * buffer, int len):
+        cdef ctabixproxies.TupleProxy r
+        r = ctabixproxies.TupleProxy(self.encoding)
+        # need to copy - there were some
+        # persistence issues with "present"
+        r.copy(buffer, len)
+        return r
+
+
+cdef class asGFF3(Parser):
+    '''converts a :term:`tabix row` into a GFF record with the following
+    fields:
+   
+    +----------+----------+-------------------------------+
+    |*Column*  |*Name*    |*Content*                      |
+    +----------+----------+-------------------------------+
+    |1         |contig    |the chromosome name            |
+    +----------+----------+-------------------------------+
+    |2         |feature   |The feature type               |
+    +----------+----------+-------------------------------+
+    |3         |source    |The feature source             |
+    +----------+----------+-------------------------------+
+    |4         |start     |genomic start coordinate       |
+    |          |          |(0-based)                      |
+    +----------+----------+-------------------------------+
+    |5         |end       |genomic end coordinate         |
+    |          |          |(0-based)                      |
+    +----------+----------+-------------------------------+
+    |6         |score     |feature score                  |
+    +----------+----------+-------------------------------+
+    |7         |strand    |strand                         |
+    +----------+----------+-------------------------------+
+    |8         |frame     |frame                          |
+    +----------+----------+-------------------------------+
+    |9         |attributes|the attribute field            |
+    +----------+----------+-------------------------------+
+
+    ''' 
+    cdef parse(self, char * buffer, int len):
+        cdef ctabixproxies.GFF3Proxy r
+        r = ctabixproxies.GFF3Proxy(self.encoding)
+        r.copy(buffer, len)
+        return r
+
+
+cdef class asGTF(Parser):
+    '''converts a :term:`tabix row` into a GTF record with the following
+    fields:
+   
+    +----------+----------+-------------------------------+
+    |*Column*  |*Name*    |*Content*                      |
+    +----------+----------+-------------------------------+
+    |1         |contig    |the chromosome name            |
+    +----------+----------+-------------------------------+
+    |2         |feature   |The feature type               |
+    +----------+----------+-------------------------------+
+    |3         |source    |The feature source             |
+    +----------+----------+-------------------------------+
+    |4         |start     |genomic start coordinate       |
+    |          |          |(0-based)                      |
+    +----------+----------+-------------------------------+
+    |5         |end       |genomic end coordinate         |
+    |          |          |(0-based)                      |
+    +----------+----------+-------------------------------+
+    |6         |score     |feature score                  |
+    +----------+----------+-------------------------------+
+    |7         |strand    |strand                         |
+    +----------+----------+-------------------------------+
+    |8         |frame     |frame                          |
+    +----------+----------+-------------------------------+
+    |9         |attributes|the attribute field            |
+    +----------+----------+-------------------------------+
+
+    GTF formatted entries also define the following fields that
+    are derived from the attributes field:
+
+    +--------------------+------------------------------+
+    |*Name*              |*Content*                     |
+    +--------------------+------------------------------+
+    |gene_id             |the gene identifier           |
+    +--------------------+------------------------------+
+    |transcript_id       |the transcript identifier     |
+    +--------------------+------------------------------+
+
+    ''' 
+    cdef parse(self, char * buffer, int len):
+        cdef ctabixproxies.GTFProxy r
+        r = ctabixproxies.GTFProxy(self.encoding)
+        r.copy(buffer, len)
+        return r
+    
+
+cdef class asBed(Parser):
+    '''converts a :term:`tabix row` into a bed record
+    with the following fields:
+
+    +-----------+-----------+------------------------------------------+
+    |*Column*   |*Field*    |*Contents*                                |
+    |           |           |                                          |
+    +-----------+-----------+------------------------------------------+
+    |1          |contig     |contig                                    |
+    |           |           |                                          |
+    +-----------+-----------+------------------------------------------+
+    |2          |start      |genomic start coordinate (zero-based)     |
+    +-----------+-----------+------------------------------------------+
+    |3          |end        |genomic end coordinate plus one           |
+    |           |           |(zero-based)                              |
+    +-----------+-----------+------------------------------------------+
+    |4          |name       |name of feature.                          |
+    +-----------+-----------+------------------------------------------+
+    |5          |score      |score of feature                          |
+    +-----------+-----------+------------------------------------------+
+    |6          |strand     |strand of feature                         |
+    +-----------+-----------+------------------------------------------+
+    |7          |thickStart |thickStart                                |
+    +-----------+-----------+------------------------------------------+
+    |8          |thickEnd   |thickEnd                                  |
+    +-----------+-----------+------------------------------------------+
+    |9          |itemRGB    |itemRGB                                   |
+    +-----------+-----------+------------------------------------------+
+    |10         |blockCount |number of bocks                           |
+    +-----------+-----------+------------------------------------------+
+    |11         |blockSizes |',' separated string of block sizes       |
+    +-----------+-----------+------------------------------------------+
+    |12         |blockStarts|',' separated string of block genomic     |
+    |           |           |start positions                           |
+    +-----------+-----------+------------------------------------------+
+
+    Only the first three fields are required. Additional
+    fields are optional, but if one is defined, all the preceding
+    need to be defined as well.
+
+    ''' 
+    cdef parse(self, char * buffer, int len):
+        cdef ctabixproxies.BedProxy r
+        r = ctabixproxies.BedProxy(self.encoding)
+        r.copy(buffer, len)
+        return r
+
+
+cdef class asVCF(Parser): 
+    '''converts a :term:`tabix row` into a VCF record with
+    the following fields:
+    
+    +----------+---------+------------------------------------+
+    |*Column*  |*Field*  |*Contents*                          |
+    |          |         |                                    |
+    +----------+---------+------------------------------------+
+    |1         |contig   |chromosome                          |
+    +----------+---------+------------------------------------+
+    |2         |pos      |chromosomal position, zero-based    |
+    +----------+---------+------------------------------------+
+    |3         |id       |id                                  |
+    +----------+---------+------------------------------------+
+    |4         |ref      |reference allele                    |
+    +----------+---------+------------------------------------+
+    |5         |alt      |alternate alleles                   |
+    +----------+---------+------------------------------------+
+    |6         |qual     |quality                             |
+    +----------+---------+------------------------------------+
+    |7         |filter   |filter                              |
+    +----------+---------+------------------------------------+
+    |8         |info     |info                                |
+    +----------+---------+------------------------------------+
+    |9         |format   |format specifier.                   |
+    +----------+---------+------------------------------------+
+
+    Access to genotypes is via index::
+
+        contig = vcf.contig
+        first_sample_genotype = vcf[0]
+        second_sample_genotype = vcf[1]
+
+    '''
+    cdef parse(self, char * buffer, int len):
+        cdef ctabixproxies.VCFProxy r
+        r = ctabixproxies.VCFProxy(self.encoding)
+        r.copy(buffer, len)
+        return r
+
+
+cdef class TabixFile:
+    """Random access to bgzf formatted files that
+    have been indexed by :term:`tabix`.
+
+    The file is automatically opened. The index file of file
+    ``<filename>`` is expected to be called ``<filename>.tbi``
+    by default (see parameter `index`).
+    
+    Parameters
+    ----------
+    
+    filename : string
+        Filename of bgzf file to be opened.
+
+    index : string
+        The filename of the index. If not set, the default is to
+        assume that the index is called ``filename.tbi``
+
+    mode : char
+        The file opening mode. Currently, only ``r`` is permitted.
+        
+    parser : :class:`pysam.Parser`
+    
+        sets the default parser for this tabix file. If `parser`
+        is None, the results are returned as an unparsed string.
+        Otherwise, `parser` is assumed to be a functor that will return
+        parsed data (see for example :class:`~pysam.asTuple` and
+        :class:`~pysam.asGTF`).
+
+    encoding : string
+
+        The encoding passed to the parser
+
+    threads: integer
+        Number of threads to use for decompressing Tabix files.
+        (Default=1)
+
+
+    Raises
+    ------
+    
+    ValueError
+        if index file is missing.
+
+    IOError
+        if file could not be opened
+    """
+    def __cinit__(self,
+                  filename,
+                  mode='r',
+                  parser=None,
+                  index=None,
+                  encoding="ascii",
+                  threads=1,
+                  *args,
+                  **kwargs ):
+
+        self.htsfile = NULL
+        self.is_remote = False
+        self.is_stream = False
+        self.parser = parser
+        self.threads = threads
+        self._open(filename, mode, index, *args, **kwargs)
+        self.encoding = encoding
+
+    def _open( self,
+               filename,
+               mode='r',
+               index=None,
+               threads=1,
+              ):
+        '''open a :term:`tabix file` for reading.'''
+
+        if mode != 'r':
+            raise ValueError("invalid file opening mode `%s`" % mode)
+
+        if self.htsfile != NULL:
+            self.close()
+        self.htsfile = NULL
+        self.threads=threads
+
+        filename_index = index or (filename + ".tbi")
+        # encode all the strings to pass to tabix
+        self.filename = encode_filename(filename)
+        self.filename_index = encode_filename(filename_index)
+
+        self.is_stream = self.filename == b'-'
+        self.is_remote = hisremote(self.filename)
+
+        if not self.is_remote:
+            if not os.path.exists(filename):
+                raise IOError("file `%s` not found" % filename)
+
+            if not os.path.exists(filename_index):
+                raise IOError("index `%s` not found" % filename_index)
+
+        # open file
+        cdef char *cfilename = self.filename
+        cdef char *cfilename_index = self.filename_index
+        with nogil:
+            self.htsfile = hts_open(cfilename, 'r')
+
+        if self.htsfile == NULL:
+            raise IOError("could not open file `%s`" % filename)
+        
+        #if self.htsfile.format.category != region_list:
+        #    raise ValueError("file does not contain region data")
+
+        with nogil:
+            self.index = tbx_index_load2(cfilename, cfilename_index)
+
+        if self.index == NULL:
+            raise IOError("could not open index for `%s`" % filename)
+
+        if not self.is_stream:
+            self.start_offset = self.tell()
+
+    def _dup(self):
+        '''return a copy of this tabix file.
+        
+        The file is being re-opened.
+        '''
+        return TabixFile(self.filename,
+                         mode="r",
+                         threads=self.threads,
+                         parser=self.parser,
+                         index=self.filename_index,
+                         encoding=self.encoding)
+
+    def fetch(self, 
+              reference=None,
+              start=None, 
+              end=None, 
+              region=None,
+              parser=None,
+              multiple_iterators=False):
+        '''fetch one or more rows in a :term:`region` using 0-based
+        indexing. The region is specified by :term:`reference`,
+        *start* and *end*. Alternatively, a samtools :term:`region`
+        string can be supplied.
+
+        Without *reference* or *region* all entries will be fetched. 
+        
+        If only *reference* is set, all reads matching on *reference*
+        will be fetched.
+
+        If *parser* is None, the default parser will be used for
+        parsing.
+        
+        Set *multiple_iterators* to true if you will be using multiple
+        iterators on the same file at the same time. The iterator
+        returned will receive its own copy of a filehandle to the file
+        effectively re-opening the file. Re-opening a file creates
+        some overhead, so beware.
+
+        '''
+        if not self.is_open():
+            raise ValueError("I/O operation on closed file")
+
+        # convert coordinates to region string, which is one-based
+        if reference:
+            if end is not None:
+                if end < 0:
+                    raise ValueError("end out of range (%i)" % end)
+                if start is None:
+                    start = 0
+                    
+                if start < 0:
+                    raise ValueError("start out of range (%i)" % end)
+                elif start > end:
+                    raise ValueError(
+                        'start (%i) >= end (%i)' % (start, end))
+                elif start == end:
+                    return EmptyIterator()
+                else:
+                    region = '%s:%i-%i' % (reference, start + 1, end)
+            elif start is not None:
+                if start < 0:
+                    raise ValueError("start out of range (%i)" % end)
+                region = '%s:%i' % (reference, start + 1)
+            else:
+                region = reference
+
+        # get iterator
+        cdef hts_itr_t * itr
+        cdef char *cstr
+        cdef TabixFile fileobj
+
+        # reopen the same file if necessary
+        if multiple_iterators:
+            fileobj = self._dup()
+        else:
+            fileobj = self
+
+        if region is None:
+            # without region or reference - iterate from start
+            with nogil:
+                itr = tbx_itr_queryi(fileobj.index,
+                                     HTS_IDX_START,
+                                     0,
+                                     0)
+        else:
+            s = force_bytes(region, encoding=fileobj.encoding)
+            cstr = s
+            with nogil:
+                itr = tbx_itr_querys(fileobj.index, cstr)
+
+        if itr == NULL:
+            if region is None:
+                if len(self.contigs) > 0:
+                    # when accessing a tabix file created prior tabix 1.0
+                    # the full-file iterator is empty.
+                    raise ValueError(
+                        "could not create iterator, possible "
+                        "tabix version mismatch")
+                else:
+                    # possible reason is that the file is empty -
+                    # return an empty iterator
+                    return EmptyIterator()
+            else:
+                raise ValueError(
+                    "could not create iterator for region '%s'" %
+                    region)
+            
+        # use default parser if no parser is specified
+        if parser is None:
+            parser = fileobj.parser
+
+        cdef TabixIterator a
+        if parser is None: 
+            a = TabixIterator(encoding=fileobj.encoding)
+        else:
+            parser.set_encoding(fileobj.encoding)
+            a = TabixIteratorParsed(parser)
+
+        a.tabixfile = fileobj
+        a.iterator = itr
+
+        return a
+
+    ###############################################################
+    ###############################################################
+    ###############################################################
+    ## properties
+    ###############################################################
+    property header:
+        '''the file header.
+
+        The file header consists of the lines at the beginning of a
+        file that are prefixed by the comment character ``#``.
+       
+        .. note::
+            The header is returned as an iterator presenting lines
+            without the newline character.
+        '''
+        
+        def __get__(self):
+
+            cdef char *cfilename = self.filename
+            cdef char *cfilename_index = self.filename_index
+            
+            cdef kstring_t buffer
+            buffer.l = buffer.m = 0
+            buffer.s = NULL
+            
+            cdef htsFile * fp = NULL
+            cdef int KS_SEP_LINE = 2
+            cdef tbx_t * tbx = NULL
+            lines = []
+            with nogil:
+                fp = hts_open(cfilename, 'r')
+                
+            if fp == NULL:
+                raise OSError("could not open {} for reading header".format(self.filename))
+
+            with nogil:
+                tbx = tbx_index_load2(cfilename, cfilename_index)
+                
+            if tbx == NULL:
+                raise OSError("could not load .tbi/.csi index of {}".format(self.filename))
+
+            while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
+                if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
+                    break
+                lines.append(force_str(buffer.s, self.encoding))
+
+            with nogil:
+                hts_close(fp)
+                free(buffer.s)
+
+            return lines
+
+    property contigs:
+        '''list of chromosome names'''
+        def __get__(self):
+            cdef const char ** sequences
+            cdef int nsequences
+            
+            with nogil:
+                sequences = tbx_seqnames(self.index, &nsequences)
+            cdef int x
+            result = []
+            for x from 0 <= x < nsequences:
+                result.append(force_str(sequences[x]))
+            
+            # htslib instructions:
+            # only free container, not the sequences themselves
+            free(sequences)
+
+            return result
+            
+    def close(self):
+        '''
+        closes the :class:`pysam.TabixFile`.'''
+        if self.htsfile != NULL:
+            hts_close(self.htsfile)
+            self.htsfile = NULL
+        if self.index != NULL:
+            tbx_destroy(self.index)
+            self.index = NULL
+
+    def __dealloc__( self ):
+        # remember: dealloc cannot call other python methods
+        # note: no doc string
+        # note: __del__ is not called.
+        if self.htsfile != NULL:
+            hts_close(self.htsfile)
+            self.htsfile = NULL
+        if self.index != NULL:
+            tbx_destroy(self.index)
+
+
+cdef class TabixIterator:
+    """iterates over rows in *tabixfile* in region
+    given by *tid*, *start* and *end*.
+    """
+
+    def __init__(self, encoding="ascii"):
+        self.encoding = encoding
+    
+    def __iter__(self):
+        self.buffer.s = NULL
+        self.buffer.l = 0
+        self.buffer.m = 0
+
+        return self 
+
+    cdef int __cnext__(self):
+        '''iterate to next element.
+        
+        Return -5 if file has been closed when this function
+        was called.
+        '''
+        if self.tabixfile.htsfile == NULL:
+            return -5
+
+        cdef int retval
+
+        while 1:
+            with nogil:
+                retval = tbx_itr_next(
+                    self.tabixfile.htsfile,
+                    self.tabixfile.index,
+                    self.iterator,
+                    &self.buffer)
+
+            if retval < 0:
+                break
+
+            if self.buffer.s[0] != b'#':
+                break
+
+        return retval
+
+    def __next__(self): 
+        """python version of next().
+
+        pyrex uses this non-standard name instead of next()
+        """
+        
+        cdef int retval = self.__cnext__()
+        if retval == -5:
+            raise IOError("iteration on closed file")
+        elif retval < 0:
+            raise StopIteration
+
+        return charptr_to_str(self.buffer.s, self.encoding)
+
+    def __dealloc__(self):
+        if <void*>self.iterator != NULL:
+            tbx_itr_destroy(self.iterator)
+        if self.buffer.s != NULL:
+            free(self.buffer.s)
+
+
+class EmptyIterator:
+    '''empty iterator'''
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise StopIteration()
+
+
+cdef class TabixIteratorParsed(TabixIterator):
+    """iterates over mapped reads in a region.
+
+    The *parser* determines the encoding.
+
+    Returns parsed data.
+    """
+
+    def __init__(self, Parser parser):
+        super().__init__()
+        self.parser = parser
+
+    def __next__(self): 
+        """python version of next().
+
+        pyrex uses this non-standard name instead of next()
+        """
+        
+        cdef int retval = self.__cnext__()
+        if retval == -5:
+            raise IOError("iteration on closed file")
+        elif retval < 0:
+            raise StopIteration
+
+        return self.parser.parse(self.buffer.s,
+                                 self.buffer.l)
+
+
+cdef class GZIterator:
+    def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
+        '''iterate line-by-line through gzip (or bgzip)
+        compressed file.
+        '''
+        if not os.path.exists(filename):
+            raise IOError("No such file or directory: %s" % filename)
+
+        filename = encode_filename(filename)
+        cdef char *cfilename = filename
+        with nogil:
+            self.gzipfile = bgzf_open(cfilename, "r")
+        self._filename = filename
+        self.kstream = ks_init(self.gzipfile)
+        self.encoding = encoding
+
+        self.buffer.l = 0
+        self.buffer.m = 0
+        self.buffer.s = <char*>malloc(buffer_size)
+
+    def __dealloc__(self):
+        '''close file.'''
+        if self.gzipfile != NULL:
+            bgzf_close(self.gzipfile)
+            self.gzipfile = NULL
+        if self.buffer.s != NULL:
+            free(self.buffer.s)
+        if self.kstream != NULL:
+            ks_destroy(self.kstream)
+
+    def __iter__(self):
+        return self
+
+    cdef int __cnext__(self):
+        cdef int dret = 0
+        cdef int retval = 0
+        while 1:
+            with nogil:
+                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
+            
+            if retval < 0: 
+                break
+
+            return dret
+        return -1
+
+    def __next__(self):
+        """python version of next().
+        """
+        cdef int retval = self.__cnext__()
+        if retval < 0:
+            raise StopIteration
+        return force_str(self.buffer.s, self.encoding)
+
+
+cdef class GZIteratorHead(GZIterator):
+    '''iterate line-by-line through gzip (or bgzip)
+    compressed file returning comments at top of file.
+    '''
+
+    def __next__(self):
+        """python version of next().
+        """
+        cdef int retval = self.__cnext__()
+        if retval < 0:
+            raise StopIteration
+        if self.buffer.s[0] == b'#':
+            return self.buffer.s
+        else:
+            raise StopIteration
+
+
+cdef class GZIteratorParsed(GZIterator):
+    '''iterate line-by-line through gzip (or bgzip)
+    compressed file returning comments at top of file.
+    '''
+
+    def __init__(self, parser):
+        self.parser = parser
+
+    def __next__(self):
+        """python version of next().
+        """
+        cdef int retval = self.__cnext__()
+        if retval < 0:
+            raise StopIteration
+
+        return self.parser.parse(self.buffer.s,
+                                 self.buffer.l)
+
+
+def tabix_compress(filename_in, 
+                   filename_out,
+                   force=False):
+    '''compress *filename_in* writing the output to *filename_out*.
+    
+    Raise an IOError if *filename_out* already exists, unless *force*
+    is set.
+    '''
+
+    if not force and os.path.exists(filename_out):
+        raise IOError(
+            "Filename '%s' already exists, use *force* to "
+            "overwrite" % filename_out)
+
+    cdef int WINDOW_SIZE
+    cdef int c, r
+    cdef void * buffer
+    cdef BGZF * fp
+    cdef int fd_src
+    cdef bint is_empty = True
+    cdef int O_RDONLY
+    O_RDONLY = os.O_RDONLY
+
+    WINDOW_SIZE = 64 * 1024
+
+    fn = encode_filename(filename_out)
+    cdef char *cfn = fn
+    with nogil:
+        fp = bgzf_open(cfn, "w")
+    if fp == NULL:
+        raise IOError("could not open '%s' for writing" % filename_out)
+
+    fn = encode_filename(filename_in)
+    fd_src = open(fn, O_RDONLY)
+    if fd_src == 0:
+        raise IOError("could not open '%s' for reading" % filename_in)
+
+    buffer = malloc(WINDOW_SIZE)
+    c = 1
+    
+    while c > 0:
+        with nogil:
+            c = read(fd_src, buffer, WINDOW_SIZE)
+            if c > 0:
+                is_empty = False
+            r = bgzf_write(fp, buffer, c)
+        if r < 0:
+            free(buffer)
+            raise IOError("writing failed")
+        
+    free(buffer)
+    r = bgzf_close(fp)
+    if r < 0:
+        raise IOError("error %i when writing to file %s" % (r, filename_out))
+
+    r = close(fd_src)
+    # an empty file will return with -1, thus ignore this.
+    if r < 0:
+        if not (r == -1 and is_empty):
+            raise IOError("error %i when closing file %s" % (r, filename_in))
+
+
+def tabix_index(filename,
+                force=False,
+                seq_col=None,
+                start_col=None,
+                end_col=None,
+                preset=None,
+                meta_char="#",
+                int line_skip=0,
+                zerobased=False,
+                int min_shift=-1,
+                index=None,
+                keep_original=False,
+                csi=False,
+                ):
+    '''index tab-separated *filename* using tabix.
+
+    An existing index will not be overwritten unless *force* is set.
+
+    The index will be built from coordinates in columns *seq_col*,
+    *start_col* and *end_col*.
+
+    The contents of *filename* have to be sorted by contig and
+    position - the method does not check if the file is sorted.
+
+    Column indices are 0-based. Note that this is different from the
+    tabix command line utility where column indices start at 1.
+    
+    Coordinates in the file are assumed to be 1-based unless
+    *zerobased* is set.
+
+    If *preset* is provided, the column coordinates are taken from a
+    preset. Valid values for preset are "gff", "bed", "sam", "vcf",
+    psltbl", "pileup".
+    
+    Lines beginning with *meta_char* and the first *line_skip* lines
+    will be skipped.
+
+    If *filename* is not detected as a gzip file it will be automatically
+    compressed. The original file will be removed and only the compressed
+    file will be retained.
+
+    By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
+    is greater than zero and/or *csi* is True, creates a CSI index with a
+    minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).
+
+    *index* controls the filename which should be used for creating the index.
+    If not set, the default is to append ``.tbi`` to *filename*.
+
+    When automatically compressing files, if *keep_original* is set the
+    uncompressed file will not be deleted.
+
+    returns the filename of the compressed data
+
+    '''
+    
+    if preset is None and \
+       (seq_col is None or start_col is None or end_col is None):
+        raise ValueError(
+            "neither preset nor seq_col,start_col and end_col given")
+
+    fn = encode_filename(filename)
+    cdef char *cfn = fn
+
+    cdef htsFile *fp = hts_open(cfn, "r")
+    if fp == NULL:
+        raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))
+
+    cdef htsFormat fmt = hts_get_format(fp)[0]
+    hts_close(fp)
+
+    if fmt.compression == no_compression:
+        tabix_compress(filename, filename + ".gz", force=force)
+        if not keep_original:
+            os.unlink(filename)
+        filename += ".gz"
+        fn = encode_filename(filename)
+        cfn = fn
+
+    # columns (1-based):
+    #   preset-code, contig, start, end, metachar for
+    #     comments, lines to ignore at beginning
+    # 0 is a missing column
+    preset2conf = {
+        'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
+        'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
+        'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
+        'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
+        'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
+        }
+    
+    conf_data = None
+    if preset == "bcf" or fmt.format == bcf:
+        csi = True
+    elif preset:
+        try:
+            conf_data = preset2conf[preset]
+        except KeyError:
+            raise KeyError(
+                "unknown preset '%s', valid presets are '%s'" %
+                (preset, ",".join(preset2conf.keys())))
+    else:
+        if end_col is None:
+            end_col = -1
+            
+        preset = 0
+        # tabix internally works with 0-based coordinates and
+        # open/closed intervals.  When using a preset, conversion is
+        # automatically taken care of.  Otherwise, the coordinates are
+        # assumed to be 1-based closed intervals and -1 is subtracted
+        # from the start coordinate. To avoid doing this, set the
+        # TI_FLAG_UCSC=0x10000 flag:
+        if zerobased:
+            preset = preset | TBX_UCSC
+
+        conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)
+
+    cdef tbx_conf_t conf
+    if conf_data:
+        conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
+
+    if csi or min_shift > 0:
+        suffix = ".csi"
+        if min_shift <= 0: min_shift = 14
+    else:
+        suffix = ".tbi"
+        min_shift = 0
+
+    index = index or filename + suffix    
+    fn_index = encode_filename(index)
+
+    if not force and os.path.exists(index):
+        raise IOError(
+            "filename '%s' already exists, use *force* to overwrite" % index)
+    
+    cdef char *fnidx = fn_index
+    cdef int retval = 0
+
+    if csi and fmt.format == bcf:
+        with nogil:
+            retval = bcf_index_build2(cfn, fnidx, min_shift)
+    else:
+        with nogil:
+            retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)
+            
+    if retval != 0:
+        raise OSError("building of index for {} failed".format(filename))
+    
+    return filename
+
+# #########################################################
+# cdef class tabix_file_iterator_old:
+#     '''iterate over ``infile``.
+
+#     This iterator is not safe. If the :meth:`__next__()` method is called 
+#     after ``infile`` is closed, the result is undefined (see ``fclose()``).
+
+#     The iterator might either raise a StopIteration or segfault.
+#     '''
+
+
+#     def __cinit__(self, 
+#                   infile, 
+#                   Parser parser,
+#                   int buffer_size = 65536 ):
+
+#         cdef int fd = PyObject_AsFileDescriptor( infile )
+#         if fd == -1: raise ValueError( "I/O operation on closed file." )
+#         self.infile = fdopen( fd, 'r')
+
+#         if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
+
+#         self.buffer = <char*>malloc( buffer_size )        
+#         self.size = buffer_size
+#         self.parser = parser
+
+#     def __iter__(self):
+#         return self
+
+#     cdef __cnext__(self):
+
+#         cdef char * b
+#         cdef size_t nbytes
+#         b = self.buffer
+
+#         while not feof( self.infile ):
+#             nbytes = getline( &b, &self.size, self.infile)
+
+#             # stop at first error or eof
+#             if (nbytes == -1): break
+#             # skip comments
+#             if (b[0] == '#'): continue
+
+#             # skip empty lines
+#             if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
+
+#             # make sure that entry is complete
+#             if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
+#                 result = b
+#                 raise ValueError( "incomplete line at %s" % result )
+
+#             # make sure that this goes fully through C
+#             # otherwise buffer is copied to/from a
+#             # Python object causing segfaults as
+#             # the wrong memory is freed
+#             return self.parser.parse( b, nbytes )
+
+#         raise StopIteration
+
+#     def __dealloc__(self):
+#         free(self.buffer)
+
+#     def __next__(self):
+#         return self.__cnext__()
+
+#########################################################
+#########################################################
+#########################################################
+## Iterators for parsing through unindexed files.
+#########################################################
+# cdef buildGzipError(void *gzfp):
+#     cdef int errnum = 0
+#     cdef char *s = gzerror(gzfp, &errnum)
+#     return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
+
+
+cdef class tabix_file_iterator:
+    '''iterate over a compressed or uncompressed ``infile``.
+    '''
+
+    def __cinit__(self, 
+                  infile, 
+                  Parser parser,
+                  int buffer_size=65536):
+
+        if infile.closed:
+            raise ValueError("I/O operation on closed file.")
+
+        self.infile = infile
+
+        cdef int fd = PyObject_AsFileDescriptor(infile)
+        if fd == -1:
+            raise ValueError("I/O operation on closed file.")
+
+        self.duplicated_fd = dup(fd)
+
+        # From the manual:
+        # gzopen can be used to read a file which is not in gzip format; 
+        # in this case gzread will directly read from the file without decompression. 
+        # When reading, this will be detected automatically by looking 
+        # for the magic two-byte gzip header. 
+        self.fh = bgzf_dopen(self.duplicated_fd, 'r')
+
+        if self.fh == NULL: 
+            raise IOError('%s' % strerror(errno))
+
+        self.kstream = ks_init(self.fh) 
+        
+        self.buffer.s = <char*>malloc(buffer_size)
+        #if self.buffer == NULL:
+        #    raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
+        #self.size = buffer_size
+        self.parser = parser
+
+    def __iter__(self):
+        return self
+
+    cdef __cnext__(self):
+
+        cdef char * b
+        cdef int dret = 0
+        cdef int retval = 0
+        while 1:
+            with nogil:
+                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
+            
+            if retval < 0: 
+                break
+                #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
+
+            b = self.buffer.s
+            
+            # skip comments
+            if (b[0] == b'#'):
+                continue
+
+            # skip empty lines
+            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
+                continue
+
+            # gzgets terminates at \n, no need to test
+
+            # parser creates a copy
+            return self.parser.parse(b, self.buffer.l)
+
+        raise StopIteration
+
+    def __dealloc__(self):
+        free(self.buffer.s)
+        ks_destroy(self.kstream)
+        bgzf_close(self.fh)
+        
+    def __next__(self):
+        return self.__cnext__()
+
+
+class tabix_generic_iterator:
+    '''iterate over ``infile``.
+    
+    Permits the use of file-like objects for example from the gzip module.
+    '''
+    def __init__(self, infile, parser):
+
+        self.infile = infile
+        if self.infile.closed:
+            raise ValueError("I/O operation on closed file.")
+        self.parser = parser
+
+    def __iter__(self):
+        return self
+
+    # cython version - required for python 3
+    def __next__(self):
+        
+        cdef char * b
+        cdef char * cpy
+        cdef size_t nbytes
+
+        encoding = self.parser.get_encoding()
+
+        # note that GzipFile.close() does not close the file
+        # reading is still possible.
+        if self.infile.closed:
+            raise ValueError("I/O operation on closed file.")
+
+        while 1:
+
+            line = self.infile.readline()
+            if not line:
+                break
+            
+            s = force_bytes(line, encoding)
+            b = s
+            nbytes = len(line)
+            assert b[nbytes] == b'\0'
+
+            # skip comments
+            if b[0] == b'#':
+                continue
+
+            # skip empty lines
+            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
+                continue
+            
+            # make sure that entry is complete
+            if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
+                raise ValueError("incomplete line at %s" % line)
+            
+            bytes_cpy = <bytes> b
+            cpy = <char *> bytes_cpy
+
+            return self.parser(cpy, nbytes)            
+
+        raise StopIteration
+
+
+def tabix_iterator(infile, parser):
+    """return an iterator over all entries in a file.
+    
+    Results are returned parsed as specified by the *parser*. If
+    *parser* is None, the results are returned as an unparsed string.
+    Otherwise, *parser* is assumed to be a functor that will return
+    parsed data (see for example :class:`~pysam.asTuple` and
+    :class:`~pysam.asGTF`).
+
+    """
+    return tabix_generic_iterator(infile, parser)
+        
+    
+cdef class Tabixfile(TabixFile):
+    """Tabixfile is deprecated: use TabixFile instead"""
+    pass
+
+
+__all__ = [
+    "tabix_index", 
+    "tabix_compress",
+    "TabixFile",
+    "Tabixfile",
+    "asTuple",
+    "asGTF",
+    "asGFF3",
+    "asVCF",
+    "asBed",
+    "GZIterator",
+    "GZIteratorHead",
+    "tabix_iterator", 
+    "tabix_generic_iterator", 
+    "tabix_file_iterator", 
+]
author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children