CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author: jpayne
date:   Tue, 18 Mar 2025 17:55:14 -0400
# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
###############################################################################
###############################################################################
# Cython wrapper for access to tabix indexed files in bgzf format
###############################################################################
#
# The principal classes and functions defined in this module are:
#
# class TabixFile  class wrapping tabix indexed files in bgzf format
#
# class asTuple  Parser class for tuples
# class asGTF    Parser class for GTF formatted rows
# class asGFF3   Parser class for GFF3 formatted rows
# class asBed    Parser class for Bed formatted rows
# class asVCF    Parser class for VCF formatted rows
#
# class tabix_generic_iterator  Streamed iterator of bgzf formatted files
#
# Additionally this module defines several additional classes that are part
# of the internal API. These are:
#
# class Parser                 base class for parsers of tab-separated rows
# class tabix_file_iterator
# class TabixIterator          iterator class over rows in bgzf file
# class EmptyIterator
#
# For backwards compatibility, the following classes are also defined:
#
# class Tabixfile   equivalent to TabixFile
#
###############################################################################
#
# The MIT License
#
# Copyright (c) 2015 Andreas Heger
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
import os
import sys

from libc.stdio cimport printf, fprintf, stderr
from libc.string cimport strerror
from libc.errno cimport errno
from posix.unistd cimport dup

from cpython cimport PyErr_SetString, PyBytes_Check, \
    PyUnicode_Check, PyBytes_FromStringAndSize, \
    PyObject_AsFileDescriptor

cimport pysam.libctabixproxies as ctabixproxies

from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
    BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
    tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
    tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
    tbx_destroy, hisremote, region_list, hts_getline, \
    TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
    no_compression, bcf, bcf_index_build2

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size


cdef class Parser:

    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def set_encoding(self, encoding):
        self.encoding = encoding

    def get_encoding(self):
        return self.encoding

    cdef parse(self, char * buffer, int length):
        raise NotImplementedError(
            'parse method of %s not implemented' % str(self))

    def __call__(self, char * buffer, int length):
        return self.parse(buffer, length)


cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    A field in a row is accessed by numeric index.
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.TupleProxy r
        r = ctabixproxies.TupleProxy(self.encoding)
        # need to copy - there were some
        # persistence issues with "present"
        r.copy(buffer, len)
        return r


cdef class asGFF3(Parser):
    '''converts a :term:`tabix row` into a GFF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GFF3Proxy r
        r = ctabixproxies.GFF3Proxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class asGTF(Parser):
    '''converts a :term:`tabix row` into a GTF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    GTF formatted entries also define the following fields that
    are derived from the attributes field:

    +--------------------+------------------------------+
    |*Name*              |*Content*                     |
    +--------------------+------------------------------+
    |gene_id             |the gene identifier           |
    +--------------------+------------------------------+
    |transcript_id       |the transcript identifier     |
    +--------------------+------------------------------+

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GTFProxy r
        r = ctabixproxies.GTFProxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class asBed(Parser):
    '''converts a :term:`tabix row` into a bed record with the following
    fields:

    +-----------+-----------+------------------------------------------+
    |*Column*   |*Field*    |*Contents*                                |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |1          |contig     |contig                                    |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |2          |start      |genomic start coordinate (zero-based)     |
    +-----------+-----------+------------------------------------------+
    |3          |end        |genomic end coordinate plus one           |
    |           |           |(zero-based)                              |
    +-----------+-----------+------------------------------------------+
    |4          |name       |name of feature.                          |
    +-----------+-----------+------------------------------------------+
    |5          |score      |score of feature                          |
    +-----------+-----------+------------------------------------------+
    |6          |strand     |strand of feature                         |
    +-----------+-----------+------------------------------------------+
    |7          |thickStart |thickStart                                |
    +-----------+-----------+------------------------------------------+
    |8          |thickEnd   |thickEnd                                  |
    +-----------+-----------+------------------------------------------+
    |9          |itemRGB    |itemRGB                                   |
    +-----------+-----------+------------------------------------------+
    |10         |blockCount |number of blocks                          |
    +-----------+-----------+------------------------------------------+
    |11         |blockSizes |',' separated string of block sizes       |
    +-----------+-----------+------------------------------------------+
    |12         |blockStarts|',' separated string of block genomic     |
    |           |           |start positions                           |
    +-----------+-----------+------------------------------------------+

    Only the first three fields are required. Additional fields are
    optional, but if one is defined, all the preceding need to be
    defined as well.
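
    Example (an illustrative sketch; ``regions.bed.gz`` is a
    hypothetical bgzip-compressed, tabix-indexed file)::

        tbx = pysam.TabixFile("regions.bed.gz", parser=pysam.asBed())
        for bed in tbx.fetch("chr1", 0, 100000):
            print(bed.contig, bed.start, bed.end)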
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.BedProxy r
        r = ctabixproxies.BedProxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class asVCF(Parser):
    '''converts a :term:`tabix row` into a VCF record with the following
    fields:

    +----------+---------+------------------------------------+
    |*Column*  |*Field*  |*Contents*                          |
    |          |         |                                    |
    +----------+---------+------------------------------------+
    |1         |contig   |chromosome                          |
    +----------+---------+------------------------------------+
    |2         |pos      |chromosomal position, zero-based    |
    +----------+---------+------------------------------------+
    |3         |id       |id                                  |
    +----------+---------+------------------------------------+
    |4         |ref      |reference allele                    |
    +----------+---------+------------------------------------+
    |5         |alt      |alternate alleles                   |
    +----------+---------+------------------------------------+
    |6         |qual     |quality                             |
    +----------+---------+------------------------------------+
    |7         |filter   |filter                              |
    +----------+---------+------------------------------------+
    |8         |info     |info                                |
    +----------+---------+------------------------------------+
    |9         |format   |format specifier.                   |
    +----------+---------+------------------------------------+

    Access to genotypes is via index::

        contig = vcf.contig
        first_sample_genotype = vcf[0]
        second_sample_genotype = vcf[1]

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.VCFProxy r
        r = ctabixproxies.VCFProxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class TabixFile:
    """Random access to bgzf formatted files that
    have been indexed by :term:`tabix`.

    The file is automatically opened. The index file of file
    ``<filename>`` is expected to be called ``<filename>.tbi``
    by default (see parameter `index`).

    Parameters
    ----------

    filename : string
        Filename of bgzf file to be opened.

    index : string
        The filename of the index. If not set, the default is to
        assume that the index is called ``filename.tbi``

    mode : char
        The file opening mode. Currently, only ``r`` is permitted.

    parser : :class:`pysam.Parser`
        sets the default parser for this tabix file. If `parser`
        is None, the results are returned as an unparsed string.
        Otherwise, `parser` is assumed to be a functor that will return
        parsed data (see for example :class:`~pysam.asTuple` and
        :class:`~pysam.asGTF`).

    encoding : string
        The encoding passed to the parser

    threads : integer
        Number of threads to use for decompressing Tabix files.
        (Default=1)

    Raises
    ------

    ValueError
        if index file is missing.
    IOError
        if file could not be opened
    """
    def __cinit__(self,
                  filename,
                  mode='r',
                  parser=None,
                  index=None,
                  encoding="ascii",
                  threads=1,
                  *args,
                  **kwargs):

        self.htsfile = NULL
        self.is_remote = False
        self.is_stream = False
        self.parser = parser
        self.threads = threads
        self._open(filename, mode, index, *args, **kwargs)
        self.encoding = encoding

    def _open(self,
              filename,
              mode='r',
              index=None,
              threads=1,
              ):
        '''open a :term:`tabix file` for reading.'''

        if mode != 'r':
            raise ValueError("invalid file opening mode `%s`" % mode)

        if self.htsfile != NULL:
            self.close()
        self.htsfile = NULL
        self.threads = threads

        filename_index = index or (filename + ".tbi")
        # encode all the strings to pass to tabix
        self.filename = encode_filename(filename)
        self.filename_index = encode_filename(filename_index)

        self.is_stream = self.filename == b'-'
        self.is_remote = hisremote(self.filename)

        if not self.is_remote:
            if not os.path.exists(filename):
                raise IOError("file `%s` not found" % filename)

            if not os.path.exists(filename_index):
                raise IOError("index `%s` not found" % filename_index)

        # open file
        cdef char *cfilename = self.filename
        cdef char *cfilename_index = self.filename_index
        with nogil:
            self.htsfile = hts_open(cfilename, 'r')

        if self.htsfile == NULL:
            raise IOError("could not open file `%s`" % filename)

        #if self.htsfile.format.category != region_list:
        #    raise ValueError("file does not contain region data")

        with nogil:
            self.index = tbx_index_load2(cfilename, cfilename_index)

        if self.index == NULL:
            raise IOError("could not open index for `%s`" % filename)

        if not self.is_stream:
            self.start_offset = self.tell()

    def _dup(self):
        '''return a copy of this tabix file.

        The file is being re-opened.
        '''
        return TabixFile(self.filename,
                         mode="r",
                         threads=self.threads,
                         parser=self.parser,
                         index=self.filename_index,
                         encoding=self.encoding)

    def fetch(self,
              reference=None,
              start=None,
              end=None,
              region=None,
              parser=None,
              multiple_iterators=False):
        '''fetch one or more rows in a :term:`region` using 0-based
        indexing. The region is specified by :term:`reference`,
        *start* and *end*. Alternatively, a samtools :term:`region`
        string can be supplied.

        Without *reference* or *region* all entries will be fetched.

        If only *reference* is set, all reads matching on *reference*
        will be fetched.

        If *parser* is None, the default parser will be used for
        parsing.

        Set *multiple_iterators* to true if you will be using multiple
        iterators on the same file at the same time. The iterator
        returned will receive its own copy of a filehandle to the file
        effectively re-opening the file. Re-opening a file creates some
        overhead, so beware.
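
        Example (an illustrative sketch; ``example.gtf.gz`` is a
        hypothetical bgzip-compressed, tabix-indexed file)::

            tbx = pysam.TabixFile("example.gtf.gz")
            for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asGTF()):
                print(row.feature, row.start, row.end)
            tbx.close()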
        '''
        if not self.is_open():
            raise ValueError("I/O operation on closed file")

        # convert coordinates to region string, which is one-based
        if reference:
            if end is not None:
                if end < 0:
                    raise ValueError("end out of range (%i)" % end)
                if start is None:
                    start = 0

                if start < 0:
                    raise ValueError("start out of range (%i)" % start)
                elif start > end:
                    raise ValueError(
                        'start (%i) > end (%i)' % (start, end))
                elif start == end:
                    return EmptyIterator()
                else:
                    region = '%s:%i-%i' % (reference, start + 1, end)
            elif start is not None:
                if start < 0:
                    raise ValueError("start out of range (%i)" % start)
                region = '%s:%i' % (reference, start + 1)
            else:
                region = reference

        # get iterator
        cdef hts_itr_t * itr
        cdef char *cstr
        cdef TabixFile fileobj

        # reopen the same file if necessary
        if multiple_iterators:
            fileobj = self._dup()
        else:
            fileobj = self

        if region is None:
            # without region or reference - iterate from start
            with nogil:
                itr = tbx_itr_queryi(fileobj.index,
                                     HTS_IDX_START,
                                     0,
                                     0)
        else:
            s = force_bytes(region, encoding=fileobj.encoding)
            cstr = s
            with nogil:
                itr = tbx_itr_querys(fileobj.index, cstr)

        if itr == NULL:
            if region is None:
                if len(self.contigs) > 0:
                    # when accessing a tabix file created prior to tabix 1.0
                    # the full-file iterator is empty.
                    raise ValueError(
                        "could not create iterator, possible "
                        "tabix version mismatch")
                else:
                    # possible reason is that the file is empty -
                    # return an empty iterator
                    return EmptyIterator()
            else:
                raise ValueError(
                    "could not create iterator for region '%s'" % region)

        # use default parser if no parser is specified
        if parser is None:
            parser = fileobj.parser

        cdef TabixIterator a
        if parser is None:
            a = TabixIterator(encoding=fileobj.encoding)
        else:
            parser.set_encoding(fileobj.encoding)
            a = TabixIteratorParsed(parser)

        a.tabixfile = fileobj
        a.iterator = itr
        return a

    ###############################################################
    ###############################################################
    ###############################################################
    ## properties
    ###############################################################
    property header:
        '''the file header.

        The file header consists of the lines at the beginning of a
        file that are prefixed by the comment character ``#``.

        .. note::
            The header is returned as a list of lines without the
            newline character.
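
        Example (illustrative sketch; ``example.vcf.gz`` is a
        hypothetical indexed file)::

            tbx = pysam.TabixFile("example.vcf.gz")
            for line in tbx.header:
                print(line)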
        '''
        def __get__(self):
            cdef char *cfilename = self.filename
            cdef char *cfilename_index = self.filename_index

            cdef kstring_t buffer
            buffer.l = buffer.m = 0
            buffer.s = NULL

            cdef htsFile * fp = NULL
            cdef int KS_SEP_LINE = 2
            cdef tbx_t * tbx = NULL
            lines = []

            with nogil:
                fp = hts_open(cfilename, 'r')
            if fp == NULL:
                raise OSError("could not open {} for reading header".format(self.filename))

            with nogil:
                tbx = tbx_index_load2(cfilename, cfilename_index)
            if tbx == NULL:
                raise OSError("could not load .tbi/.csi index of {}".format(self.filename))

            while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
                if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
                    break
                lines.append(force_str(buffer.s, self.encoding))

            with nogil:
                hts_close(fp)
                free(buffer.s)

            return lines

    property contigs:
        '''list of chromosome names'''
        def __get__(self):
            cdef const char ** sequences
            cdef int nsequences

            with nogil:
                sequences = tbx_seqnames(self.index, &nsequences)
            cdef int x
            result = []
            for x from 0 <= x < nsequences:
                result.append(force_str(sequences[x]))

            # htslib instructions:
            # only free container, not the sequences themselves
            free(sequences)

            return result

    def close(self):
        '''closes the :class:`pysam.TabixFile`.'''
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)
            self.index = NULL

    def __dealloc__(self):
        # remember: dealloc cannot call other python methods
        # note: no doc string
        # note: __del__ is not called.
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)


cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """

    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def __iter__(self):
        self.buffer.s = NULL
        self.buffer.l = 0
        self.buffer.m = 0

        return self

    cdef int __cnext__(self):
        '''iterate to next element.

        Return -5 if file has been closed when this function
        was called.
        '''
        if self.tabixfile.htsfile == NULL:
            return -5

        cdef int retval

        while 1:
            with nogil:
                retval = tbx_itr_next(
                    self.tabixfile.htsfile,
                    self.tabixfile.index,
                    self.iterator,
                    &self.buffer)
            if retval < 0:
                break

            if self.buffer.s[0] != b'#':
                break

        return retval

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        return charptr_to_str(self.buffer.s, self.encoding)

    def __dealloc__(self):
        if <void*>self.iterator != NULL:
            tbx_itr_destroy(self.iterator)
        if self.buffer.s != NULL:
            free(self.buffer.s)


class EmptyIterator:
    '''empty iterator'''

    def __iter__(self):
        return self

    def __next__(self):
        raise StopIteration()


cdef class TabixIteratorParsed(TabixIterator):
    """iterates over parsed rows in a region.

    The *parser* determines the encoding.

    Returns parsed data.
    """

    def __init__(self, Parser parser):
        super().__init__()
        self.parser = parser

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)


cdef class GZIterator:
    def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
        '''iterate line-by-line through gzip (or bgzip)
        compressed file.
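
        Example (illustrative sketch; ``data.txt.gz`` is a hypothetical
        gzip-compressed file)::

            for line in GZIterator("data.txt.gz"):
                print(line)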
        '''
        if not os.path.exists(filename):
            raise IOError("No such file or directory: %s" % filename)

        filename = encode_filename(filename)
        cdef char *cfilename = filename
        with nogil:
            self.gzipfile = bgzf_open(cfilename, "r")
        self._filename = filename
        self.kstream = ks_init(self.gzipfile)
        self.encoding = encoding

        self.buffer.l = 0
        self.buffer.m = 0
        self.buffer.s = <char*>malloc(buffer_size)

    def __dealloc__(self):
        '''close file.'''
        if self.gzipfile != NULL:
            bgzf_close(self.gzipfile)
            self.gzipfile = NULL
        if self.buffer.s != NULL:
            free(self.buffer.s)
        if self.kstream != NULL:
            ks_destroy(self.kstream)

    def __iter__(self):
        return self

    cdef int __cnext__(self):
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n',
                                     &self.buffer, &dret)

            if retval < 0:
                break

            return dret
        return -1

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        return force_str(self.buffer.s, self.encoding)


cdef class GZIteratorHead(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.
    '''

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        if self.buffer.s[0] == b'#':
            return self.buffer.s
        else:
            raise StopIteration


cdef class GZIteratorParsed(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning parsed rows.
    '''

    def __init__(self, parser):
        self.parser = parser

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration

        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)


def tabix_compress(filename_in,
                   filename_out,
                   force=False):
    '''compress *filename_in* writing the output to *filename_out*.

    Raise an IOError if *filename_out* already exists, unless *force*
    is set.
    '''

    if not force and os.path.exists(filename_out):
        raise IOError(
            "Filename '%s' already exists, use *force* to "
            "overwrite" % filename_out)

    cdef int WINDOW_SIZE
    cdef int c, r
    cdef void * buffer
    cdef BGZF * fp
    cdef int fd_src
    cdef bint is_empty = True
    cdef int O_RDONLY
    O_RDONLY = os.O_RDONLY
    WINDOW_SIZE = 64 * 1024

    fn = encode_filename(filename_out)
    cdef char *cfn = fn
    with nogil:
        fp = bgzf_open(cfn, "w")
    if fp == NULL:
        raise IOError("could not open '%s' for writing" % filename_out)

    fn = encode_filename(filename_in)
    fd_src = open(fn, O_RDONLY)
    if fd_src == 0:
        raise IOError("could not open '%s' for reading" % filename_in)

    buffer = malloc(WINDOW_SIZE)
    c = 1

    while c > 0:
        with nogil:
            c = read(fd_src, buffer, WINDOW_SIZE)
            if c > 0:
                is_empty = False
            r = bgzf_write(fp, buffer, c)
        if r < 0:
            free(buffer)
            raise IOError("writing failed")

    free(buffer)
    r = bgzf_close(fp)
    if r < 0:
        raise IOError("error %i when writing to file %s" % (r, filename_out))

    r = close(fd_src)
    # an empty file will return with -1, thus ignore this.
    if r < 0:
        if not (r == -1 and is_empty):
            raise IOError("error %i when closing file %s" % (r, filename_in))


def tabix_index(filename,
                force=False,
                seq_col=None,
                start_col=None,
                end_col=None,
                preset=None,
                meta_char="#",
                int line_skip=0,
                zerobased=False,
                int min_shift=-1,
                index=None,
                keep_original=False,
                csi=False,
                ):
    '''index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless *force* is set.

    The index will be built from coordinates in columns *seq_col*,
    *start_col* and *end_col*.
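
    For example (an illustrative sketch; ``genes.txt`` is a hypothetical
    coordinate-sorted, tab-separated file)::

        fn = pysam.tabix_index("genes.txt", seq_col=0, start_col=1,
                               end_col=2)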
    The contents of *filename* have to be sorted by contig and
    position - the method does not check if the file is sorted.

    Column indices are 0-based. Note that this is different from the
    tabix command line utility where column indices start at 1.

    Coordinates in the file are assumed to be 1-based unless
    *zerobased* is set.

    If *preset* is provided, the column coordinates are taken from a
    preset. Valid values for *preset* are "gff", "bed", "sam", "vcf"
    and "psltbl"; "bcf" is handled separately and forces a CSI index.

    Lines beginning with *meta_char* and the first *line_skip* lines
    will be skipped.

    If *filename* is not detected as a gzip file it will be
    automatically compressed. The original file will be removed and
    only the compressed file will be retained.

    By default or when *min_shift* is 0, creates a TBI index. If
    *min_shift* is greater than zero and/or *csi* is True, creates a
    CSI index with a minimal interval size of 1<<*min_shift* (1<<14
    if only *csi* is set).

    *index* controls the filename which should be used for creating
    the index. If not set, the default is to append ``.tbi`` to
    *filename*.

    When automatically compressing files, if *keep_original* is set
    the uncompressed file will not be deleted.

    returns the filename of the compressed data

    '''
    if preset is None and \
       (seq_col is None or start_col is None or end_col is None):
        raise ValueError(
            "neither preset nor seq_col, start_col and end_col given")

    fn = encode_filename(filename)
    cdef char *cfn = fn

    cdef htsFile *fp = hts_open(cfn, "r")
    if fp == NULL:
        raise IOError("Could not open file '%s': %s" %
                      (filename, force_str(strerror(errno))))
    cdef htsFormat fmt = hts_get_format(fp)[0]
    hts_close(fp)

    if fmt.compression == no_compression:
        tabix_compress(filename, filename + ".gz", force=force)
        if not keep_original:
            os.unlink(filename)
        filename += ".gz"
        fn = encode_filename(filename)
        cfn = fn

    # columns (1-based):
    #   preset-code, contig, start, end, metachar for
    #   comments, lines to ignore at beginning
    # 0 is a missing column
    preset2conf = {
        'gff': (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
        'bed': (TBX_UCSC, 1, 2, 3, ord('#'), 0),
        'psltbl': (TBX_UCSC, 15, 17, 18, ord('#'), 0),
        'sam': (TBX_SAM, 3, 4, 0, ord('@'), 0),
        'vcf': (TBX_VCF, 1, 2, 0, ord('#'), 0),
    }

    conf_data = None
    if preset == "bcf" or fmt.format == bcf:
        csi = True
    elif preset:
        try:
            conf_data = preset2conf[preset]
        except KeyError:
            raise KeyError(
                "unknown preset '%s', valid presets are '%s'" %
                (preset, ",".join(preset2conf.keys())))
    else:
        if end_col is None:
            end_col = -1

        preset = 0
        # tabix internally works with 0-based coordinates and
        # open/closed intervals. When using a preset, conversion is
        # automatically taken care of. Otherwise, the coordinates are
        # assumed to be 1-based closed intervals and -1 is subtracted
        # from the start coordinate.
        # To avoid doing this, set the
        # TI_FLAG_UCSC=0x10000 flag:
        if zerobased:
            preset = preset | TBX_UCSC

        conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1,
                     ord(meta_char), line_skip)

    cdef tbx_conf_t conf
    if conf_data:
        conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data

    if csi or min_shift > 0:
        suffix = ".csi"
        if min_shift <= 0:
            min_shift = 14
    else:
        suffix = ".tbi"
        min_shift = 0

    index = index or filename + suffix
    fn_index = encode_filename(index)

    if not force and os.path.exists(index):
        raise IOError(
            "filename '%s' already exists, use *force* to overwrite" % index)

    cdef char *fnidx = fn_index
    cdef int retval = 0

    if csi and fmt.format == bcf:
        with nogil:
            retval = bcf_index_build2(cfn, fnidx, min_shift)
    else:
        with nogil:
            retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)

    if retval != 0:
        raise OSError("building of index for {} failed".format(filename))

    return filename

# #########################################################
# cdef class tabix_file_iterator_old:
#     '''iterate over ``infile``.
#
#     This iterator is not safe. If the :meth:`__next__()` method is called
#     after ``infile`` is closed, the result is undefined (see ``fclose()``).
#
#     The iterator might either raise a StopIteration or segfault.
#     '''
#
#     def __cinit__(self,
#                   infile,
#                   Parser parser,
#                   int buffer_size = 65536 ):
#
#         cdef int fd = PyObject_AsFileDescriptor( infile )
#         if fd == -1: raise ValueError( "I/O operation on closed file." )
#         self.infile = fdopen( fd, 'r')
#
#         if self.infile == NULL:
#             raise ValueError( "I/O operation on closed file." )
#
#         self.buffer = <char*>malloc( buffer_size )
#         self.size = buffer_size
#         self.parser = parser
#
#     def __iter__(self):
#         return self
#
#     cdef __cnext__(self):
#
#         cdef char * b
#         cdef size_t nbytes
#         b = self.buffer
#
#         while not feof( self.infile ):
#             nbytes = getline( &b, &self.size, self.infile)
#
#             # stop at first error or eof
#             if (nbytes == -1): break
#             # skip comments
#             if (b[0] == '#'): continue
#
#             # skip empty lines
#             if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
#
#             # make sure that entry is complete
#             if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
#                 result = b
#                 raise ValueError( "incomplete line at %s" % result )
#
#             # make sure that this goes fully through C
#             # otherwise buffer is copied to/from a
#             # Python object causing segfaults as
#             # the wrong memory is freed
#             return self.parser.parse( b, nbytes )
#
#         raise StopIteration
#
#     def __dealloc__(self):
#         free(self.buffer)
#
#     def __next__(self):
#         return self.__cnext__()

#########################################################
#########################################################
#########################################################
## Iterators for parsing through unindexed files.
#########################################################
# cdef buildGzipError(void *gzfp):
#     cdef int errnum = 0
#     cdef char *s = gzerror(gzfp, &errnum)
#     return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)


cdef class tabix_file_iterator:
    '''iterate over a compressed or uncompressed ``infile``.
    '''

    def __cinit__(self,
                  infile,
                  Parser parser,
                  int buffer_size=65536):

        if infile.closed:
            raise ValueError("I/O operation on closed file.")

        self.infile = infile

        cdef int fd = PyObject_AsFileDescriptor(infile)
        if fd == -1:
            raise ValueError("I/O operation on closed file.")

        self.duplicated_fd = dup(fd)

        # From the manual:
        # gzopen can be used to read a file which is not in gzip format;
        # in this case gzread will directly read from the file without
        # decompression.
        # When reading, this will be detected automatically by looking
        # for the magic two-byte gzip header.
        self.fh = bgzf_dopen(self.duplicated_fd, 'r')

        if self.fh == NULL:
            raise IOError('%s' % strerror(errno))

        self.kstream = ks_init(self.fh)

        self.buffer.s = <char*>malloc(buffer_size)
        #if self.buffer == NULL:
        #    raise MemoryError("tabix_file_iterator: could not allocate %i bytes" % buffer_size)
        #self.size = buffer_size
        self.parser = parser

    def __iter__(self):
        return self

    cdef __cnext__(self):

        cdef char * b
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n',
                                     &self.buffer, &dret)

            if retval < 0:
                break
                #raise IOError('gzip error: %s' % buildGzipError( self.fh ))

            b = self.buffer.s

            # skip comments
            if (b[0] == b'#'):
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # gzgets terminates at \n, no need to test

            # parser creates a copy
            return self.parser.parse(b, self.buffer.l)

        raise StopIteration

    def __dealloc__(self):
        free(self.buffer.s)
        ks_destroy(self.kstream)
        bgzf_close(self.fh)

    def __next__(self):
        return self.__cnext__()


class tabix_generic_iterator:
    '''iterate over ``infile``.

    Permits the use of file-like objects for example from the gzip module.
    '''
    def __init__(self, infile, parser):

        self.infile = infile
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")
        self.parser = parser

    def __iter__(self):
        return self

    # cython version - required for python 3
    def __next__(self):

        cdef char * b
        cdef char * cpy
        cdef size_t nbytes

        encoding = self.parser.get_encoding()

        # note that GzipFile.close() does not close the file
        # reading is still possible.
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")

        while 1:

            line = self.infile.readline()
            if not line:
                break

            s = force_bytes(line, encoding)
            b = s
            nbytes = len(line)
            assert b[nbytes] == b'\0'

            # skip comments
            if b[0] == b'#':
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # make sure that entry is complete
            if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
                raise ValueError("incomplete line at %s" % line)

            bytes_cpy = <bytes> b
            cpy = <char *> bytes_cpy

            return self.parser(cpy, nbytes)

        raise StopIteration


def tabix_iterator(infile, parser):
    """return an iterator over all entries in a file.

    Results are returned parsed as specified by the *parser*. If
    *parser* is None, the results are returned as an unparsed string.
    Otherwise, *parser* is assumed to be a functor that will return
    parsed data (see for example :class:`~pysam.asTuple` and
    :class:`~pysam.asGTF`).
    """
    return tabix_generic_iterator(infile, parser)


cdef class Tabixfile(TabixFile):
    """Tabixfile is deprecated: use TabixFile instead"""
    pass


__all__ = [
    "tabix_index",
    "tabix_compress",
    "TabixFile",
    "Tabixfile",
    "asTuple",
    "asGTF",
    "asGFF3",
    "asVCF",
    "asBed",
    "GZIterator",
    "GZIteratorHead",
    "tabix_iterator",
    "tabix_generic_iterator",
    "tabix_file_iterator",
]
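
# End-to-end usage sketch (illustrative only; ``variants.vcf`` is a
# hypothetical coordinate-sorted input file):
#
#     import pysam
#
#     # compress (if needed) and index; returns the name of the
#     # compressed file, e.g. "variants.vcf.gz"
#     fn = pysam.tabix_index("variants.vcf", preset="vcf", force=True)
#
#     # random access through the index
#     tbx = pysam.TabixFile(fn)
#     for vcf in tbx.fetch("chr1", 10000, 20000, parser=pysam.asVCF()):
#         print(vcf.contig, vcf.pos, vcf.ref, vcf.alt)
#     tbx.close()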