Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,1271 @@ +# cython: language_level=3 +# cython: embedsignature=True +# cython: profile=True +############################################################################### +############################################################################### +# Cython wrapper for access to tabix indexed files in bgzf format +############################################################################### +# The principal classes and functions defined in this module are: +# +# class TabixFile class wrapping tabix indexed files in bgzf format +# +# class asTuple Parser class for tuples +# class asGTF Parser class for GTF formatted rows +# class asGFF3 Parser class for GFF3 formatted rows +# class asBed Parser class for Bed formatted rows +# class asVCF Parser class for VCF formatted rows +# +# class tabix_generic_iterator Streamed iterator of bgzf formatted files +# +# Additionally this module defines several additional classes that are part +# of the internal API. These are: +# +# class Parser base class for parsers of tab-separated rows +# class tabix_file_iterator +# class TabixIterator iterator class over rows in bgzf file +# class EmptyIterator +# +# For backwards compatibility, the following classes are also defined: +# +# class Tabixfile equivalent to TabixFile +# +############################################################################### +# +# The MIT License +# +# Copyright (c) 2015 Andreas Heger +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### +import os +import sys + +from libc.stdio cimport printf, fprintf, stderr +from libc.string cimport strerror +from libc.errno cimport errno +from posix.unistd cimport dup + +from cpython cimport PyErr_SetString, PyBytes_Check, \ + PyUnicode_Check, PyBytes_FromStringAndSize, \ + PyObject_AsFileDescriptor + +cimport pysam.libctabixproxies as ctabixproxies + +from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ + BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \ + tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \ + tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \ + tbx_destroy, hisremote, region_list, hts_getline, \ + TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \ + no_compression, bcf, bcf_index_build2 + +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str +from pysam.libcutils cimport encode_filename, from_string_and_size + +cdef class Parser: + + def __init__(self, encoding="ascii"): + self.encoding = encoding + + def set_encoding(self, encoding): + self.encoding = encoding + + def get_encoding(self): + return self.encoding + + cdef parse(self, char * buffer, int length): + raise NotImplementedError( + 'parse method of %s not implemented' % str(self)) + + def __call__(self, char * buffer, int length): + return self.parse(buffer, length) + + +cdef class asTuple(Parser): + '''converts a :term:`tabix row` into a python tuple. + + A field in a row is accessed by numeric index. + ''' + cdef parse(self, char * buffer, int len): + cdef ctabixproxies.TupleProxy r + r = ctabixproxies.TupleProxy(self.encoding) + # need to copy - there were some + # persistence issues with "present" + r.copy(buffer, len) + return r + + +cdef class asGFF3(Parser): + '''converts a :term:`tabix row` into a GFF record with the following + fields: + + +----------+----------+-------------------------------+ + |*Column* |*Name* |*Content* | + +----------+----------+-------------------------------+ + |1 |contig |the chromosome name | + +----------+----------+-------------------------------+ + |2 |feature |The feature type | + +----------+----------+-------------------------------+ + |3 |source |The feature source | + +----------+----------+-------------------------------+ + |4 |start |genomic start coordinate | + | | |(0-based) | + +----------+----------+-------------------------------+ + |5 |end |genomic end coordinate | + | | |(0-based) | + +----------+----------+-------------------------------+ + |6 |score |feature score | + +----------+----------+-------------------------------+ + |7 |strand |strand | + +----------+----------+-------------------------------+ + |8 |frame |frame | + +----------+----------+-------------------------------+ + |9 |attributes|the attribute field | + +----------+----------+-------------------------------+ + + ''' + cdef parse(self, char * buffer, int len): + cdef ctabixproxies.GFF3Proxy r + r = ctabixproxies.GFF3Proxy(self.encoding) + r.copy(buffer, len) + return r + + +cdef class asGTF(Parser): + '''converts a :term:`tabix row` into a GTF record with the following + fields: + + +----------+----------+-------------------------------+ + |*Column* |*Name* |*Content* | + +----------+----------+-------------------------------+ + |1 |contig |the chromosome name | + +----------+----------+-------------------------------+ + |2 |feature |The feature type | + +----------+----------+-------------------------------+ + |3 |source |The feature source | + +----------+----------+-------------------------------+ + |4 |start |genomic start coordinate | + | | |(0-based) | + +----------+----------+-------------------------------+ + |5 |end |genomic end coordinate | + | | |(0-based) | + +----------+----------+-------------------------------+ + |6 |score |feature score | + +----------+----------+-------------------------------+ + |7 |strand |strand | + +----------+----------+-------------------------------+ + |8 |frame |frame | + +----------+----------+-------------------------------+ + |9 |attributes|the attribute field | + +----------+----------+-------------------------------+ + + GTF formatted entries also define the following fields that + are derived from the attributes field: + + +--------------------+------------------------------+ + |*Name* |*Content* | + +--------------------+------------------------------+ + |gene_id |the gene identifier | + +--------------------+------------------------------+ + |transcript_id |the transcript identifier | + +--------------------+------------------------------+ + + ''' + cdef parse(self, char * buffer, int len): + cdef ctabixproxies.GTFProxy r + r = ctabixproxies.GTFProxy(self.encoding) + r.copy(buffer, len) + return r + + +cdef class asBed(Parser): + '''converts a :term:`tabix row` into a bed record + with the following fields: + + +-----------+-----------+------------------------------------------+ + |*Column* |*Field* |*Contents* | + | | | | + +-----------+-----------+------------------------------------------+ + |1 |contig |contig | + | | | | + +-----------+-----------+------------------------------------------+ + |2 |start |genomic start coordinate (zero-based) | + +-----------+-----------+------------------------------------------+ + |3 |end |genomic end coordinate plus one | + | | |(zero-based) | + +-----------+-----------+------------------------------------------+ + |4 |name |name of feature. | + +-----------+-----------+------------------------------------------+ + |5 |score |score of feature | + +-----------+-----------+------------------------------------------+ + |6 |strand |strand of feature | + +-----------+-----------+------------------------------------------+ + |7 |thickStart |thickStart | + +-----------+-----------+------------------------------------------+ + |8 |thickEnd |thickEnd | + +-----------+-----------+------------------------------------------+ + |9 |itemRGB |itemRGB | + +-----------+-----------+------------------------------------------+ + |10 |blockCount |number of bocks | + +-----------+-----------+------------------------------------------+ + |11 |blockSizes |',' separated string of block sizes | + +-----------+-----------+------------------------------------------+ + |12 |blockStarts|',' separated string of block genomic | + | | |start positions | + +-----------+-----------+------------------------------------------+ + + Only the first three fields are required. Additional + fields are optional, but if one is defined, all the preceding + need to be defined as well. + + ''' + cdef parse(self, char * buffer, int len): + cdef ctabixproxies.BedProxy r + r = ctabixproxies.BedProxy(self.encoding) + r.copy(buffer, len) + return r + + +cdef class asVCF(Parser): + '''converts a :term:`tabix row` into a VCF record with + the following fields: + + +----------+---------+------------------------------------+ + |*Column* |*Field* |*Contents* | + | | | | + +----------+---------+------------------------------------+ + |1 |contig |chromosome | + +----------+---------+------------------------------------+ + |2 |pos |chromosomal position, zero-based | + +----------+---------+------------------------------------+ + |3 |id |id | + +----------+---------+------------------------------------+ + |4 |ref |reference allele | + +----------+---------+------------------------------------+ + |5 |alt |alternate alleles | + +----------+---------+------------------------------------+ + |6 |qual |quality | + +----------+---------+------------------------------------+ + |7 |filter |filter | + +----------+---------+------------------------------------+ + |8 |info |info | + +----------+---------+------------------------------------+ + |9 |format |format specifier. | + +----------+---------+------------------------------------+ + + Access to genotypes is via index:: + + contig = vcf.contig + first_sample_genotype = vcf[0] + second_sample_genotype = vcf[1] + + ''' + cdef parse(self, char * buffer, int len): + cdef ctabixproxies.VCFProxy r + r = ctabixproxies.VCFProxy(self.encoding) + r.copy(buffer, len) + return r + + +cdef class TabixFile: + """Random access to bgzf formatted files that + have been indexed by :term:`tabix`. + + The file is automatically opened. The index file of file + ``<filename>`` is expected to be called ``<filename>.tbi`` + by default (see parameter `index`). + + Parameters + ---------- + + filename : string + Filename of bgzf file to be opened. + + index : string + The filename of the index. If not set, the default is to + assume that the index is called ``filename.tbi`` + + mode : char + The file opening mode. Currently, only ``r`` is permitted. + + parser : :class:`pysam.Parser` + + sets the default parser for this tabix file. If `parser` + is None, the results are returned as an unparsed string. + Otherwise, `parser` is assumed to be a functor that will return + parsed data (see for example :class:`~pysam.asTuple` and + :class:`~pysam.asGTF`). + + encoding : string + + The encoding passed to the parser + + threads: integer + Number of threads to use for decompressing Tabix files. + (Default=1) + + + Raises + ------ + + ValueError + if index file is missing. + + IOError + if file could not be opened + """ + def __cinit__(self, + filename, + mode='r', + parser=None, + index=None, + encoding="ascii", + threads=1, + *args, + **kwargs ): + + self.htsfile = NULL + self.is_remote = False + self.is_stream = False + self.parser = parser + self.threads = threads + self._open(filename, mode, index, *args, **kwargs) + self.encoding = encoding + + def _open( self, + filename, + mode='r', + index=None, + threads=1, + ): + '''open a :term:`tabix file` for reading.''' + + if mode != 'r': + raise ValueError("invalid file opening mode `%s`" % mode) + + if self.htsfile != NULL: + self.close() + self.htsfile = NULL + self.threads=threads + + filename_index = index or (filename + ".tbi") + # encode all the strings to pass to tabix + self.filename = encode_filename(filename) + self.filename_index = encode_filename(filename_index) + + self.is_stream = self.filename == b'-' + self.is_remote = hisremote(self.filename) + + if not self.is_remote: + if not os.path.exists(filename): + raise IOError("file `%s` not found" % filename) + + if not os.path.exists(filename_index): + raise IOError("index `%s` not found" % filename_index) + + # open file + cdef char *cfilename = self.filename + cdef char *cfilename_index = self.filename_index + with nogil: + self.htsfile = hts_open(cfilename, 'r') + + if self.htsfile == NULL: + raise IOError("could not open file `%s`" % filename) + + #if self.htsfile.format.category != region_list: + # raise ValueError("file does not contain region data") + + with nogil: + self.index = tbx_index_load2(cfilename, cfilename_index) + + if self.index == NULL: + raise IOError("could not open index for `%s`" % filename) + + if not self.is_stream: + self.start_offset = self.tell() + + def _dup(self): + '''return a copy of this tabix file. + + The file is being re-opened. + ''' + return TabixFile(self.filename, + mode="r", + threads=self.threads, + parser=self.parser, + index=self.filename_index, + encoding=self.encoding) + + def fetch(self, + reference=None, + start=None, + end=None, + region=None, + parser=None, + multiple_iterators=False): + '''fetch one or more rows in a :term:`region` using 0-based + indexing. The region is specified by :term:`reference`, + *start* and *end*. Alternatively, a samtools :term:`region` + string can be supplied. + + Without *reference* or *region* all entries will be fetched. + + If only *reference* is set, all reads matching on *reference* + will be fetched. + + If *parser* is None, the default parser will be used for + parsing. + + Set *multiple_iterators* to true if you will be using multiple + iterators on the same file at the same time. The iterator + returned will receive its own copy of a filehandle to the file + effectively re-opening the file. Re-opening a file creates + some overhead, so beware. + + ''' + if not self.is_open(): + raise ValueError("I/O operation on closed file") + + # convert coordinates to region string, which is one-based + if reference: + if end is not None: + if end < 0: + raise ValueError("end out of range (%i)" % end) + if start is None: + start = 0 + + if start < 0: + raise ValueError("start out of range (%i)" % end) + elif start > end: + raise ValueError( + 'start (%i) >= end (%i)' % (start, end)) + elif start == end: + return EmptyIterator() + else: + region = '%s:%i-%i' % (reference, start + 1, end) + elif start is not None: + if start < 0: + raise ValueError("start out of range (%i)" % end) + region = '%s:%i' % (reference, start + 1) + else: + region = reference + + # get iterator + cdef hts_itr_t * itr + cdef char *cstr + cdef TabixFile fileobj + + # reopen the same file if necessary + if multiple_iterators: + fileobj = self._dup() + else: + fileobj = self + + if region is None: + # without region or reference - iterate from start + with nogil: + itr = tbx_itr_queryi(fileobj.index, + HTS_IDX_START, + 0, + 0) + else: + s = force_bytes(region, encoding=fileobj.encoding) + cstr = s + with nogil: + itr = tbx_itr_querys(fileobj.index, cstr) + + if itr == NULL: + if region is None: + if len(self.contigs) > 0: + # when accessing a tabix file created prior tabix 1.0 + # the full-file iterator is empty. + raise ValueError( + "could not create iterator, possible " + "tabix version mismatch") + else: + # possible reason is that the file is empty - + # return an empty iterator + return EmptyIterator() + else: + raise ValueError( + "could not create iterator for region '%s'" % + region) + + # use default parser if no parser is specified + if parser is None: + parser = fileobj.parser + + cdef TabixIterator a + if parser is None: + a = TabixIterator(encoding=fileobj.encoding) + else: + parser.set_encoding(fileobj.encoding) + a = TabixIteratorParsed(parser) + + a.tabixfile = fileobj + a.iterator = itr + + return a + + ############################################################### + ############################################################### + ############################################################### + ## properties + ############################################################### + property header: + '''the file header. + + The file header consists of the lines at the beginning of a + file that are prefixed by the comment character ``#``. + + .. note:: + The header is returned as an iterator presenting lines + without the newline character. + ''' + + def __get__(self): + + cdef char *cfilename = self.filename + cdef char *cfilename_index = self.filename_index + + cdef kstring_t buffer + buffer.l = buffer.m = 0 + buffer.s = NULL + + cdef htsFile * fp = NULL + cdef int KS_SEP_LINE = 2 + cdef tbx_t * tbx = NULL + lines = [] + with nogil: + fp = hts_open(cfilename, 'r') + + if fp == NULL: + raise OSError("could not open {} for reading header".format(self.filename)) + + with nogil: + tbx = tbx_index_load2(cfilename, cfilename_index) + + if tbx == NULL: + raise OSError("could not load .tbi/.csi index of {}".format(self.filename)) + + while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0: + if not buffer.l or buffer.s[0] != tbx.conf.meta_char: + break + lines.append(force_str(buffer.s, self.encoding)) + + with nogil: + hts_close(fp) + free(buffer.s) + + return lines + + property contigs: + '''list of chromosome names''' + def __get__(self): + cdef const char ** sequences + cdef int nsequences + + with nogil: + sequences = tbx_seqnames(self.index, &nsequences) + cdef int x + result = [] + for x from 0 <= x < nsequences: + result.append(force_str(sequences[x])) + + # htslib instructions: + # only free container, not the sequences themselves + free(sequences) + + return result + + def close(self): + ''' + closes the :class:`pysam.TabixFile`.''' + if self.htsfile != NULL: + hts_close(self.htsfile) + self.htsfile = NULL + if self.index != NULL: + tbx_destroy(self.index) + self.index = NULL + + def __dealloc__( self ): + # remember: dealloc cannot call other python methods + # note: no doc string + # note: __del__ is not called. + if self.htsfile != NULL: + hts_close(self.htsfile) + self.htsfile = NULL + if self.index != NULL: + tbx_destroy(self.index) + + +cdef class TabixIterator: + """iterates over rows in *tabixfile* in region + given by *tid*, *start* and *end*. + """ + + def __init__(self, encoding="ascii"): + self.encoding = encoding + + def __iter__(self): + self.buffer.s = NULL + self.buffer.l = 0 + self.buffer.m = 0 + + return self + + cdef int __cnext__(self): + '''iterate to next element. + + Return -5 if file has been closed when this function + was called. + ''' + if self.tabixfile.htsfile == NULL: + return -5 + + cdef int retval + + while 1: + with nogil: + retval = tbx_itr_next( + self.tabixfile.htsfile, + self.tabixfile.index, + self.iterator, + &self.buffer) + + if retval < 0: + break + + if self.buffer.s[0] != b'#': + break + + return retval + + def __next__(self): + """python version of next(). + + pyrex uses this non-standard name instead of next() + """ + + cdef int retval = self.__cnext__() + if retval == -5: + raise IOError("iteration on closed file") + elif retval < 0: + raise StopIteration + + return charptr_to_str(self.buffer.s, self.encoding) + + def __dealloc__(self): + if <void*>self.iterator != NULL: + tbx_itr_destroy(self.iterator) + if self.buffer.s != NULL: + free(self.buffer.s) + + +class EmptyIterator: + '''empty iterator''' + + def __iter__(self): + return self + + def __next__(self): + raise StopIteration() + + +cdef class TabixIteratorParsed(TabixIterator): + """iterates over mapped reads in a region. + + The *parser* determines the encoding. + + Returns parsed data. + """ + + def __init__(self, Parser parser): + super().__init__() + self.parser = parser + + def __next__(self): + """python version of next(). + + pyrex uses this non-standard name instead of next() + """ + + cdef int retval = self.__cnext__() + if retval == -5: + raise IOError("iteration on closed file") + elif retval < 0: + raise StopIteration + + return self.parser.parse(self.buffer.s, + self.buffer.l) + + +cdef class GZIterator: + def __init__(self, filename, int buffer_size=65536, encoding="ascii"): + '''iterate line-by-line through gzip (or bgzip) + compressed file. + ''' + if not os.path.exists(filename): + raise IOError("No such file or directory: %s" % filename) + + filename = encode_filename(filename) + cdef char *cfilename = filename + with nogil: + self.gzipfile = bgzf_open(cfilename, "r") + self._filename = filename + self.kstream = ks_init(self.gzipfile) + self.encoding = encoding + + self.buffer.l = 0 + self.buffer.m = 0 + self.buffer.s = <char*>malloc(buffer_size) + + def __dealloc__(self): + '''close file.''' + if self.gzipfile != NULL: + bgzf_close(self.gzipfile) + self.gzipfile = NULL + if self.buffer.s != NULL: + free(self.buffer.s) + if self.kstream != NULL: + ks_destroy(self.kstream) + + def __iter__(self): + return self + + cdef int __cnext__(self): + cdef int dret = 0 + cdef int retval = 0 + while 1: + with nogil: + retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret) + + if retval < 0: + break + + return dret + return -1 + + def __next__(self): + """python version of next(). + """ + cdef int retval = self.__cnext__() + if retval < 0: + raise StopIteration + return force_str(self.buffer.s, self.encoding) + + +cdef class GZIteratorHead(GZIterator): + '''iterate line-by-line through gzip (or bgzip) + compressed file returning comments at top of file. + ''' + + def __next__(self): + """python version of next(). + """ + cdef int retval = self.__cnext__() + if retval < 0: + raise StopIteration + if self.buffer.s[0] == b'#': + return self.buffer.s + else: + raise StopIteration + + +cdef class GZIteratorParsed(GZIterator): + '''iterate line-by-line through gzip (or bgzip) + compressed file returning comments at top of file. + ''' + + def __init__(self, parser): + self.parser = parser + + def __next__(self): + """python version of next(). + """ + cdef int retval = self.__cnext__() + if retval < 0: + raise StopIteration + + return self.parser.parse(self.buffer.s, + self.buffer.l) + + +def tabix_compress(filename_in, + filename_out, + force=False): + '''compress *filename_in* writing the output to *filename_out*. + + Raise an IOError if *filename_out* already exists, unless *force* + is set. + ''' + + if not force and os.path.exists(filename_out): + raise IOError( + "Filename '%s' already exists, use *force* to " + "overwrite" % filename_out) + + cdef int WINDOW_SIZE + cdef int c, r + cdef void * buffer + cdef BGZF * fp + cdef int fd_src + cdef bint is_empty = True + cdef int O_RDONLY + O_RDONLY = os.O_RDONLY + + WINDOW_SIZE = 64 * 1024 + + fn = encode_filename(filename_out) + cdef char *cfn = fn + with nogil: + fp = bgzf_open(cfn, "w") + if fp == NULL: + raise IOError("could not open '%s' for writing" % filename_out) + + fn = encode_filename(filename_in) + fd_src = open(fn, O_RDONLY) + if fd_src == 0: + raise IOError("could not open '%s' for reading" % filename_in) + + buffer = malloc(WINDOW_SIZE) + c = 1 + + while c > 0: + with nogil: + c = read(fd_src, buffer, WINDOW_SIZE) + if c > 0: + is_empty = False + r = bgzf_write(fp, buffer, c) + if r < 0: + free(buffer) + raise IOError("writing failed") + + free(buffer) + r = bgzf_close(fp) + if r < 0: + raise IOError("error %i when writing to file %s" % (r, filename_out)) + + r = close(fd_src) + # an empty file will return with -1, thus ignore this. + if r < 0: + if not (r == -1 and is_empty): + raise IOError("error %i when closing file %s" % (r, filename_in)) + + +def tabix_index(filename, + force=False, + seq_col=None, + start_col=None, + end_col=None, + preset=None, + meta_char="#", + int line_skip=0, + zerobased=False, + int min_shift=-1, + index=None, + keep_original=False, + csi=False, + ): + '''index tab-separated *filename* using tabix. + + An existing index will not be overwritten unless *force* is set. + + The index will be built from coordinates in columns *seq_col*, + *start_col* and *end_col*. + + The contents of *filename* have to be sorted by contig and + position - the method does not check if the file is sorted. + + Column indices are 0-based. Note that this is different from the + tabix command line utility where column indices start at 1. + + Coordinates in the file are assumed to be 1-based unless + *zerobased* is set. + + If *preset* is provided, the column coordinates are taken from a + preset. Valid values for preset are "gff", "bed", "sam", "vcf", + psltbl", "pileup". + + Lines beginning with *meta_char* and the first *line_skip* lines + will be skipped. + + If *filename* is not detected as a gzip file it will be automatically + compressed. The original file will be removed and only the compressed + file will be retained. + + By default or when *min_shift* is 0, creates a TBI index. If *min_shift* + is greater than zero and/or *csi* is True, creates a CSI index with a + minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set). + + *index* controls the filename which should be used for creating the index. + If not set, the default is to append ``.tbi`` to *filename*. + + When automatically compressing files, if *keep_original* is set the + uncompressed file will not be deleted. + + returns the filename of the compressed data + + ''' + + if preset is None and \ + (seq_col is None or start_col is None or end_col is None): + raise ValueError( + "neither preset nor seq_col,start_col and end_col given") + + fn = encode_filename(filename) + cdef char *cfn = fn + + cdef htsFile *fp = hts_open(cfn, "r") + if fp == NULL: + raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno)))) + + cdef htsFormat fmt = hts_get_format(fp)[0] + hts_close(fp) + + if fmt.compression == no_compression: + tabix_compress(filename, filename + ".gz", force=force) + if not keep_original: + os.unlink(filename) + filename += ".gz" + fn = encode_filename(filename) + cfn = fn + + # columns (1-based): + # preset-code, contig, start, end, metachar for + # comments, lines to ignore at beginning + # 0 is a missing column + preset2conf = { + 'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0), + 'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0), + 'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0), + 'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0), + 'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0), + } + + conf_data = None + if preset == "bcf" or fmt.format == bcf: + csi = True + elif preset: + try: + conf_data = preset2conf[preset] + except KeyError: + raise KeyError( + "unknown preset '%s', valid presets are '%s'" % + (preset, ",".join(preset2conf.keys()))) + else: + if end_col is None: + end_col = -1 + + preset = 0 + # tabix internally works with 0-based coordinates and + # open/closed intervals. When using a preset, conversion is + # automatically taken care of. Otherwise, the coordinates are + # assumed to be 1-based closed intervals and -1 is subtracted + # from the start coordinate. To avoid doing this, set the + # TI_FLAG_UCSC=0x10000 flag: + if zerobased: + preset = preset | TBX_UCSC + + conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip) + + cdef tbx_conf_t conf + if conf_data: + conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data + + if csi or min_shift > 0: + suffix = ".csi" + if min_shift <= 0: min_shift = 14 + else: + suffix = ".tbi" + min_shift = 0 + + index = index or filename + suffix + fn_index = encode_filename(index) + + if not force and os.path.exists(index): + raise IOError( + "filename '%s' already exists, use *force* to overwrite" % index) + + cdef char *fnidx = fn_index + cdef int retval = 0 + + if csi and fmt.format == bcf: + with nogil: + retval = bcf_index_build2(cfn, fnidx, min_shift) + else: + with nogil: + retval = tbx_index_build2(cfn, fnidx, min_shift, &conf) + + if retval != 0: + raise OSError("building of index for {} failed".format(filename)) + + return filename + +# ######################################################### +# cdef class tabix_file_iterator_old: +# '''iterate over ``infile``. + +# This iterator is not safe. If the :meth:`__next__()` method is called +# after ``infile`` is closed, the result is undefined (see ``fclose()``). + +# The iterator might either raise a StopIteration or segfault. +# ''' + + +# def __cinit__(self, +# infile, +# Parser parser, +# int buffer_size = 65536 ): + +# cdef int fd = PyObject_AsFileDescriptor( infile ) +# if fd == -1: raise ValueError( "I/O operation on closed file." ) +# self.infile = fdopen( fd, 'r') + +# if self.infile == NULL: raise ValueError( "I/O operation on closed file." ) + +# self.buffer = <char*>malloc( buffer_size ) +# self.size = buffer_size +# self.parser = parser + +# def __iter__(self): +# return self + +# cdef __cnext__(self): + +# cdef char * b +# cdef size_t nbytes +# b = self.buffer + +# while not feof( self.infile ): +# nbytes = getline( &b, &self.size, self.infile) + +# # stop at first error or eof +# if (nbytes == -1): break +# # skip comments +# if (b[0] == '#'): continue + +# # skip empty lines +# if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue + +# # make sure that entry is complete +# if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': +# result = b +# raise ValueError( "incomplete line at %s" % result ) + +# # make sure that this goes fully through C +# # otherwise buffer is copied to/from a +# # Python object causing segfaults as +# # the wrong memory is freed +# return self.parser.parse( b, nbytes ) + +# raise StopIteration + +# def __dealloc__(self): +# free(self.buffer) + +# def __next__(self): +# return self.__cnext__() + +######################################################### +######################################################### +######################################################### +## Iterators for parsing through unindexed files. +######################################################### +# cdef buildGzipError(void *gzfp): +# cdef int errnum = 0 +# cdef char *s = gzerror(gzfp, &errnum) +# return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s) + + +cdef class tabix_file_iterator: + '''iterate over a compressed or uncompressed ``infile``. + ''' + + def __cinit__(self, + infile, + Parser parser, + int buffer_size=65536): + + if infile.closed: + raise ValueError("I/O operation on closed file.") + + self.infile = infile + + cdef int fd = PyObject_AsFileDescriptor(infile) + if fd == -1: + raise ValueError("I/O operation on closed file.") + + self.duplicated_fd = dup(fd) + + # From the manual: + # gzopen can be used to read a file which is not in gzip format; + # in this case gzread will directly read from the file without decompression. + # When reading, this will be detected automatically by looking + # for the magic two-byte gzip header. + self.fh = bgzf_dopen(self.duplicated_fd, 'r') + + if self.fh == NULL: + raise IOError('%s' % strerror(errno)) + + self.kstream = ks_init(self.fh) + + self.buffer.s = <char*>malloc(buffer_size) + #if self.buffer == NULL: + # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size) + #self.size = buffer_size + self.parser = parser + + def __iter__(self): + return self + + cdef __cnext__(self): + + cdef char * b + cdef int dret = 0 + cdef int retval = 0 + while 1: + with nogil: + retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret) + + if retval < 0: + break + #raise IOError('gzip error: %s' % buildGzipError( self.fh )) + + b = self.buffer.s + + # skip comments + if (b[0] == b'#'): + continue + + # skip empty lines + if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r': + continue + + # gzgets terminates at \n, no need to test + + # parser creates a copy + return self.parser.parse(b, self.buffer.l) + + raise StopIteration + + def __dealloc__(self): + free(self.buffer.s) + ks_destroy(self.kstream) + bgzf_close(self.fh) + + def __next__(self): + return self.__cnext__() + + +class tabix_generic_iterator: + '''iterate over ``infile``. + + Permits the use of file-like objects for example from the gzip module. + ''' + def __init__(self, infile, parser): + + self.infile = infile + if self.infile.closed: + raise ValueError("I/O operation on closed file.") + self.parser = parser + + def __iter__(self): + return self + + # cython version - required for python 3 + def __next__(self): + + cdef char * b + cdef char * cpy + cdef size_t nbytes + + encoding = self.parser.get_encoding() + + # note that GzipFile.close() does not close the file + # reading is still possible. + if self.infile.closed: + raise ValueError("I/O operation on closed file.") + + while 1: + + line = self.infile.readline() + if not line: + break + + s = force_bytes(line, encoding) + b = s + nbytes = len(line) + assert b[nbytes] == b'\0' + + # skip comments + if b[0] == b'#': + continue + + # skip empty lines + if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r': + continue + + # make sure that entry is complete + if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r': + raise ValueError("incomplete line at %s" % line) + + bytes_cpy = <bytes> b + cpy = <char *> bytes_cpy + + return self.parser(cpy, nbytes) + + raise StopIteration + + +def tabix_iterator(infile, parser): + """return an iterator over all entries in a file. + + Results are returned parsed as specified by the *parser*. If + *parser* is None, the results are returned as an unparsed string. + Otherwise, *parser* is assumed to be a functor that will return + parsed data (see for example :class:`~pysam.asTuple` and + :class:`~pysam.asGTF`). + + """ + return tabix_generic_iterator(infile, parser) + + +cdef class Tabixfile(TabixFile): + """Tabixfile is deprecated: use TabixFile instead""" + pass + + +__all__ = [ + "tabix_index", + "tabix_compress", + "TabixFile", + "Tabixfile", + "asTuple", + "asGTF", + "asGFF3", + "asVCF", + "asBed", + "GZIterator", + "GZIteratorHead", + "tabix_iterator", + "tabix_generic_iterator", + "tabix_file_iterator", +]