csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children

rev	line source
jpayne@68	1 # cython: language_level=3
jpayne@68	2 # cython: embedsignature=True
jpayne@68	3 # cython: profile=True
jpayne@68	4 ###############################################################################
jpayne@68	5 ###############################################################################
jpayne@68	6 # Cython wrapper for access to tabix indexed files in bgzf format
jpayne@68	7 ###############################################################################
jpayne@68	8 # The principal classes and functions defined in this module are:
jpayne@68	9 #
jpayne@68	10 # class TabixFile class wrapping tabix indexed files in bgzf format
jpayne@68	11 #
jpayne@68	12 # class asTuple Parser class for tuples
jpayne@68	13 # class asGTF Parser class for GTF formatted rows
jpayne@68	14 # class asGFF3 Parser class for GFF3 formatted rows
jpayne@68	15 # class asBed Parser class for Bed formatted rows
jpayne@68	16 # class asVCF Parser class for VCF formatted rows
jpayne@68	17 #
jpayne@68	18 # class tabix_generic_iterator Streamed iterator of bgzf formatted files
jpayne@68	19 #
jpayne@68	20 # Additionally this module defines several additional classes that are part
jpayne@68	21 # of the internal API. These are:
jpayne@68	22 #
jpayne@68	23 # class Parser base class for parsers of tab-separated rows
jpayne@68	24 # class tabix_file_iterator
jpayne@68	25 # class TabixIterator iterator class over rows in bgzf file
jpayne@68	26 # class EmptyIterator
jpayne@68	27 #
jpayne@68	28 # For backwards compatibility, the following classes are also defined:
jpayne@68	29 #
jpayne@68	30 # class Tabixfile equivalent to TabixFile
jpayne@68	31 #
jpayne@68	32 ###############################################################################
jpayne@68	33 #
jpayne@68	34 # The MIT License
jpayne@68	35 #
jpayne@68	36 # Copyright (c) 2015 Andreas Heger
jpayne@68	37 #
jpayne@68	38 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@68	39 # copy of this software and associated documentation files (the "Software"),
jpayne@68	40 # to deal in the Software without restriction, including without limitation
jpayne@68	41 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@68	42 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@68	43 # Software is furnished to do so, subject to the following conditions:
jpayne@68	44 #
jpayne@68	45 # The above copyright notice and this permission notice shall be included in
jpayne@68	46 # all copies or substantial portions of the Software.
jpayne@68	47 #
jpayne@68	48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@68	49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@68	50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@68	51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@68	52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@68	53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@68	54 # DEALINGS IN THE SOFTWARE.
jpayne@68	55 #
jpayne@68	56 ###############################################################################
jpayne@68	57 import os
jpayne@68	58 import sys
jpayne@68	59
jpayne@68	60 from libc.stdio cimport printf, fprintf, stderr
jpayne@68	61 from libc.string cimport strerror
jpayne@68	62 from libc.errno cimport errno
jpayne@68	63 from posix.unistd cimport dup
jpayne@68	64
jpayne@68	65 from cpython cimport PyErr_SetString, PyBytes_Check, \
jpayne@68	66 PyUnicode_Check, PyBytes_FromStringAndSize, \
jpayne@68	67 PyObject_AsFileDescriptor
jpayne@68	68
jpayne@68	69 cimport pysam.libctabixproxies as ctabixproxies
jpayne@68	70
jpayne@68	71 from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
jpayne@68	72 BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
jpayne@68	73 tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
jpayne@68	74 tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
jpayne@68	75 tbx_destroy, hisremote, region_list, hts_getline, \
jpayne@68	76 TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
jpayne@68	77 no_compression, bcf, bcf_index_build2
jpayne@68	78
jpayne@68	79 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
jpayne@68	80 from pysam.libcutils cimport encode_filename, from_string_and_size
jpayne@68	81
jpayne@68	82 cdef class Parser:
jpayne@68	83
jpayne@68	84 def __init__(self, encoding="ascii"):
jpayne@68	85 self.encoding = encoding
jpayne@68	86
jpayne@68	87 def set_encoding(self, encoding):
jpayne@68	88 self.encoding = encoding
jpayne@68	89
jpayne@68	90 def get_encoding(self):
jpayne@68	91 return self.encoding
jpayne@68	92
jpayne@68	93 cdef parse(self, char * buffer, int length):
jpayne@68	94 raise NotImplementedError(
jpayne@68	95 'parse method of %s not implemented' % str(self))
jpayne@68	96
jpayne@68	97 def __call__(self, char * buffer, int length):
jpayne@68	98 return self.parse(buffer, length)
jpayne@68	99
jpayne@68	100
jpayne@68	101 cdef class asTuple(Parser):
jpayne@68	102 '''converts a :term:`tabix row` into a python tuple.
jpayne@68	103
jpayne@68	104 A field in a row is accessed by numeric index.
jpayne@68	105 '''
jpayne@68	106 cdef parse(self, char * buffer, int len):
jpayne@68	107 cdef ctabixproxies.TupleProxy r
jpayne@68	108 r = ctabixproxies.TupleProxy(self.encoding)
jpayne@68	109 # need to copy - there were some
jpayne@68	110 # persistence issues with "present"
jpayne@68	111 r.copy(buffer, len)
jpayne@68	112 return r
jpayne@68	113
jpayne@68	114
jpayne@68	115 cdef class asGFF3(Parser):
jpayne@68	116 '''converts a :term:`tabix row` into a GFF record with the following
jpayne@68	117 fields:
jpayne@68	118
jpayne@68	119 +----------+----------+-------------------------------+
jpayne@68	120 \|Column \|Name \|Content \|
jpayne@68	121 +----------+----------+-------------------------------+
jpayne@68	122 \|1 \|contig \|the chromosome name \|
jpayne@68	123 +----------+----------+-------------------------------+
jpayne@68	124 \|2 \|feature \|The feature type \|
jpayne@68	125 +----------+----------+-------------------------------+
jpayne@68	126 \|3 \|source \|The feature source \|
jpayne@68	127 +----------+----------+-------------------------------+
jpayne@68	128 \|4 \|start \|genomic start coordinate \|
jpayne@68	129 \| \| \|(0-based) \|
jpayne@68	130 +----------+----------+-------------------------------+
jpayne@68	131 \|5 \|end \|genomic end coordinate \|
jpayne@68	132 \| \| \|(0-based) \|
jpayne@68	133 +----------+----------+-------------------------------+
jpayne@68	134 \|6 \|score \|feature score \|
jpayne@68	135 +----------+----------+-------------------------------+
jpayne@68	136 \|7 \|strand \|strand \|
jpayne@68	137 +----------+----------+-------------------------------+
jpayne@68	138 \|8 \|frame \|frame \|
jpayne@68	139 +----------+----------+-------------------------------+
jpayne@68	140 \|9 \|attributes\|the attribute field \|
jpayne@68	141 +----------+----------+-------------------------------+
jpayne@68	142
jpayne@68	143 '''
jpayne@68	144 cdef parse(self, char * buffer, int len):
jpayne@68	145 cdef ctabixproxies.GFF3Proxy r
jpayne@68	146 r = ctabixproxies.GFF3Proxy(self.encoding)
jpayne@68	147 r.copy(buffer, len)
jpayne@68	148 return r
jpayne@68	149
jpayne@68	150
jpayne@68	151 cdef class asGTF(Parser):
jpayne@68	152 '''converts a :term:`tabix row` into a GTF record with the following
jpayne@68	153 fields:
jpayne@68	154
jpayne@68	155 +----------+----------+-------------------------------+
jpayne@68	156 \|Column \|Name \|Content \|
jpayne@68	157 +----------+----------+-------------------------------+
jpayne@68	158 \|1 \|contig \|the chromosome name \|
jpayne@68	159 +----------+----------+-------------------------------+
jpayne@68	160 \|2 \|feature \|The feature type \|
jpayne@68	161 +----------+----------+-------------------------------+
jpayne@68	162 \|3 \|source \|The feature source \|
jpayne@68	163 +----------+----------+-------------------------------+
jpayne@68	164 \|4 \|start \|genomic start coordinate \|
jpayne@68	165 \| \| \|(0-based) \|
jpayne@68	166 +----------+----------+-------------------------------+
jpayne@68	167 \|5 \|end \|genomic end coordinate \|
jpayne@68	168 \| \| \|(0-based) \|
jpayne@68	169 +----------+----------+-------------------------------+
jpayne@68	170 \|6 \|score \|feature score \|
jpayne@68	171 +----------+----------+-------------------------------+
jpayne@68	172 \|7 \|strand \|strand \|
jpayne@68	173 +----------+----------+-------------------------------+
jpayne@68	174 \|8 \|frame \|frame \|
jpayne@68	175 +----------+----------+-------------------------------+
jpayne@68	176 \|9 \|attributes\|the attribute field \|
jpayne@68	177 +----------+----------+-------------------------------+
jpayne@68	178
jpayne@68	179 GTF formatted entries also define the following fields that
jpayne@68	180 are derived from the attributes field:
jpayne@68	181
jpayne@68	182 +--------------------+------------------------------+
jpayne@68	183 \|Name \|Content \|
jpayne@68	184 +--------------------+------------------------------+
jpayne@68	185 \|gene_id \|the gene identifier \|
jpayne@68	186 +--------------------+------------------------------+
jpayne@68	187 \|transcript_id \|the transcript identifier \|
jpayne@68	188 +--------------------+------------------------------+
jpayne@68	189
jpayne@68	190 '''
jpayne@68	191 cdef parse(self, char * buffer, int len):
jpayne@68	192 cdef ctabixproxies.GTFProxy r
jpayne@68	193 r = ctabixproxies.GTFProxy(self.encoding)
jpayne@68	194 r.copy(buffer, len)
jpayne@68	195 return r
jpayne@68	196
jpayne@68	197
jpayne@68	198 cdef class asBed(Parser):
jpayne@68	199 '''converts a :term:`tabix row` into a bed record
jpayne@68	200 with the following fields:
jpayne@68	201
jpayne@68	202 +-----------+-----------+------------------------------------------+
jpayne@68	203 \|Column \|Field \|Contents \|
jpayne@68	204 \| \| \| \|
jpayne@68	205 +-----------+-----------+------------------------------------------+
jpayne@68	206 \|1 \|contig \|contig \|
jpayne@68	207 \| \| \| \|
jpayne@68	208 +-----------+-----------+------------------------------------------+
jpayne@68	209 \|2 \|start \|genomic start coordinate (zero-based) \|
jpayne@68	210 +-----------+-----------+------------------------------------------+
jpayne@68	211 \|3 \|end \|genomic end coordinate plus one \|
jpayne@68	212 \| \| \|(zero-based) \|
jpayne@68	213 +-----------+-----------+------------------------------------------+
jpayne@68	214 \|4 \|name \|name of feature. \|
jpayne@68	215 +-----------+-----------+------------------------------------------+
jpayne@68	216 \|5 \|score \|score of feature \|
jpayne@68	217 +-----------+-----------+------------------------------------------+
jpayne@68	218 \|6 \|strand \|strand of feature \|
jpayne@68	219 +-----------+-----------+------------------------------------------+
jpayne@68	220 \|7 \|thickStart \|thickStart \|
jpayne@68	221 +-----------+-----------+------------------------------------------+
jpayne@68	222 \|8 \|thickEnd \|thickEnd \|
jpayne@68	223 +-----------+-----------+------------------------------------------+
jpayne@68	224 \|9 \|itemRGB \|itemRGB \|
jpayne@68	225 +-----------+-----------+------------------------------------------+
jpayne@68	226 \|10 \|blockCount \|number of bocks \|
jpayne@68	227 +-----------+-----------+------------------------------------------+
jpayne@68	228 \|11 \|blockSizes \|',' separated string of block sizes \|
jpayne@68	229 +-----------+-----------+------------------------------------------+
jpayne@68	230 \|12 \|blockStarts\|',' separated string of block genomic \|
jpayne@68	231 \| \| \|start positions \|
jpayne@68	232 +-----------+-----------+------------------------------------------+
jpayne@68	233
jpayne@68	234 Only the first three fields are required. Additional
jpayne@68	235 fields are optional, but if one is defined, all the preceding
jpayne@68	236 need to be defined as well.
jpayne@68	237
jpayne@68	238 '''
jpayne@68	239 cdef parse(self, char * buffer, int len):
jpayne@68	240 cdef ctabixproxies.BedProxy r
jpayne@68	241 r = ctabixproxies.BedProxy(self.encoding)
jpayne@68	242 r.copy(buffer, len)
jpayne@68	243 return r
jpayne@68	244
jpayne@68	245
jpayne@68	246 cdef class asVCF(Parser):
jpayne@68	247 '''converts a :term:`tabix row` into a VCF record with
jpayne@68	248 the following fields:
jpayne@68	249
jpayne@68	250 +----------+---------+------------------------------------+
jpayne@68	251 \|Column \|Field \|Contents \|
jpayne@68	252 \| \| \| \|
jpayne@68	253 +----------+---------+------------------------------------+
jpayne@68	254 \|1 \|contig \|chromosome \|
jpayne@68	255 +----------+---------+------------------------------------+
jpayne@68	256 \|2 \|pos \|chromosomal position, zero-based \|
jpayne@68	257 +----------+---------+------------------------------------+
jpayne@68	258 \|3 \|id \|id \|
jpayne@68	259 +----------+---------+------------------------------------+
jpayne@68	260 \|4 \|ref \|reference allele \|
jpayne@68	261 +----------+---------+------------------------------------+
jpayne@68	262 \|5 \|alt \|alternate alleles \|
jpayne@68	263 +----------+---------+------------------------------------+
jpayne@68	264 \|6 \|qual \|quality \|
jpayne@68	265 +----------+---------+------------------------------------+
jpayne@68	266 \|7 \|filter \|filter \|
jpayne@68	267 +----------+---------+------------------------------------+
jpayne@68	268 \|8 \|info \|info \|
jpayne@68	269 +----------+---------+------------------------------------+
jpayne@68	270 \|9 \|format \|format specifier. \|
jpayne@68	271 +----------+---------+------------------------------------+
jpayne@68	272
jpayne@68	273 Access to genotypes is via index::
jpayne@68	274
jpayne@68	275 contig = vcf.contig
jpayne@68	276 first_sample_genotype = vcf[0]
jpayne@68	277 second_sample_genotype = vcf[1]
jpayne@68	278
jpayne@68	279 '''
jpayne@68	280 cdef parse(self, char * buffer, int len):
jpayne@68	281 cdef ctabixproxies.VCFProxy r
jpayne@68	282 r = ctabixproxies.VCFProxy(self.encoding)
jpayne@68	283 r.copy(buffer, len)
jpayne@68	284 return r
jpayne@68	285
jpayne@68	286
jpayne@68	287 cdef class TabixFile:
jpayne@68	288 """Random access to bgzf formatted files that
jpayne@68	289 have been indexed by :term:`tabix`.
jpayne@68	290
jpayne@68	291 The file is automatically opened. The index file of file
jpayne@68	292 ``<filename>`` is expected to be called ``<filename>.tbi``
jpayne@68	293 by default (see parameter `index`).
jpayne@68	294
jpayne@68	295 Parameters
jpayne@68	296 ----------
jpayne@68	297
jpayne@68	298 filename : string
jpayne@68	299 Filename of bgzf file to be opened.
jpayne@68	300
jpayne@68	301 index : string
jpayne@68	302 The filename of the index. If not set, the default is to
jpayne@68	303 assume that the index is called ``filename.tbi``
jpayne@68	304
jpayne@68	305 mode : char
jpayne@68	306 The file opening mode. Currently, only ``r`` is permitted.
jpayne@68	307
jpayne@68	308 parser : :class:`pysam.Parser`
jpayne@68	309
jpayne@68	310 sets the default parser for this tabix file. If `parser`
jpayne@68	311 is None, the results are returned as an unparsed string.
jpayne@68	312 Otherwise, `parser` is assumed to be a functor that will return
jpayne@68	313 parsed data (see for example :class:`~pysam.asTuple` and
jpayne@68	314 :class:`~pysam.asGTF`).
jpayne@68	315
jpayne@68	316 encoding : string
jpayne@68	317
jpayne@68	318 The encoding passed to the parser
jpayne@68	319
jpayne@68	320 threads: integer
jpayne@68	321 Number of threads to use for decompressing Tabix files.
jpayne@68	322 (Default=1)
jpayne@68	323
jpayne@68	324
jpayne@68	325 Raises
jpayne@68	326 ------
jpayne@68	327
jpayne@68	328 ValueError
jpayne@68	329 if index file is missing.
jpayne@68	330
jpayne@68	331 IOError
jpayne@68	332 if file could not be opened
jpayne@68	333 """
jpayne@68	334 def __cinit__(self,
jpayne@68	335 filename,
jpayne@68	336 mode='r',
jpayne@68	337 parser=None,
jpayne@68	338 index=None,
jpayne@68	339 encoding="ascii",
jpayne@68	340 threads=1,
jpayne@68	341 *args,
jpayne@68	342 **kwargs ):
jpayne@68	343
jpayne@68	344 self.htsfile = NULL
jpayne@68	345 self.is_remote = False
jpayne@68	346 self.is_stream = False
jpayne@68	347 self.parser = parser
jpayne@68	348 self.threads = threads
jpayne@68	349 self._open(filename, mode, index, args, *kwargs)
jpayne@68	350 self.encoding = encoding
jpayne@68	351
jpayne@68	352 def _open( self,
jpayne@68	353 filename,
jpayne@68	354 mode='r',
jpayne@68	355 index=None,
jpayne@68	356 threads=1,
jpayne@68	357 ):
jpayne@68	358 '''open a :term:`tabix file` for reading.'''
jpayne@68	359
jpayne@68	360 if mode != 'r':
jpayne@68	361 raise ValueError("invalid file opening mode `%s`" % mode)
jpayne@68	362
jpayne@68	363 if self.htsfile != NULL:
jpayne@68	364 self.close()
jpayne@68	365 self.htsfile = NULL
jpayne@68	366 self.threads=threads
jpayne@68	367
jpayne@68	368 filename_index = index or (filename + ".tbi")
jpayne@68	369 # encode all the strings to pass to tabix
jpayne@68	370 self.filename = encode_filename(filename)
jpayne@68	371 self.filename_index = encode_filename(filename_index)
jpayne@68	372
jpayne@68	373 self.is_stream = self.filename == b'-'
jpayne@68	374 self.is_remote = hisremote(self.filename)
jpayne@68	375
jpayne@68	376 if not self.is_remote:
jpayne@68	377 if not os.path.exists(filename):
jpayne@68	378 raise IOError("file `%s` not found" % filename)
jpayne@68	379
jpayne@68	380 if not os.path.exists(filename_index):
jpayne@68	381 raise IOError("index `%s` not found" % filename_index)
jpayne@68	382
jpayne@68	383 # open file
jpayne@68	384 cdef char *cfilename = self.filename
jpayne@68	385 cdef char *cfilename_index = self.filename_index
jpayne@68	386 with nogil:
jpayne@68	387 self.htsfile = hts_open(cfilename, 'r')
jpayne@68	388
jpayne@68	389 if self.htsfile == NULL:
jpayne@68	390 raise IOError("could not open file `%s`" % filename)
jpayne@68	391
jpayne@68	392 #if self.htsfile.format.category != region_list:
jpayne@68	393 # raise ValueError("file does not contain region data")
jpayne@68	394
jpayne@68	395 with nogil:
jpayne@68	396 self.index = tbx_index_load2(cfilename, cfilename_index)
jpayne@68	397
jpayne@68	398 if self.index == NULL:
jpayne@68	399 raise IOError("could not open index for `%s`" % filename)
jpayne@68	400
jpayne@68	401 if not self.is_stream:
jpayne@68	402 self.start_offset = self.tell()
jpayne@68	403
jpayne@68	404 def _dup(self):
jpayne@68	405 '''return a copy of this tabix file.
jpayne@68	406
jpayne@68	407 The file is being re-opened.
jpayne@68	408 '''
jpayne@68	409 return TabixFile(self.filename,
jpayne@68	410 mode="r",
jpayne@68	411 threads=self.threads,
jpayne@68	412 parser=self.parser,
jpayne@68	413 index=self.filename_index,
jpayne@68	414 encoding=self.encoding)
jpayne@68	415
jpayne@68	416 def fetch(self,
jpayne@68	417 reference=None,
jpayne@68	418 start=None,
jpayne@68	419 end=None,
jpayne@68	420 region=None,
jpayne@68	421 parser=None,
jpayne@68	422 multiple_iterators=False):
jpayne@68	423 '''fetch one or more rows in a :term:`region` using 0-based
jpayne@68	424 indexing. The region is specified by :term:`reference`,
jpayne@68	425 start and end. Alternatively, a samtools :term:`region`
jpayne@68	426 string can be supplied.
jpayne@68	427
jpayne@68	428 Without reference or region all entries will be fetched.
jpayne@68	429
jpayne@68	430 If only reference is set, all reads matching on reference
jpayne@68	431 will be fetched.
jpayne@68	432
jpayne@68	433 If parser is None, the default parser will be used for
jpayne@68	434 parsing.
jpayne@68	435
jpayne@68	436 Set multiple_iterators to true if you will be using multiple
jpayne@68	437 iterators on the same file at the same time. The iterator
jpayne@68	438 returned will receive its own copy of a filehandle to the file
jpayne@68	439 effectively re-opening the file. Re-opening a file creates
jpayne@68	440 some overhead, so beware.
jpayne@68	441
jpayne@68	442 '''
jpayne@68	443 if not self.is_open():
jpayne@68	444 raise ValueError("I/O operation on closed file")
jpayne@68	445
jpayne@68	446 # convert coordinates to region string, which is one-based
jpayne@68	447 if reference:
jpayne@68	448 if end is not None:
jpayne@68	449 if end < 0:
jpayne@68	450 raise ValueError("end out of range (%i)" % end)
jpayne@68	451 if start is None:
jpayne@68	452 start = 0
jpayne@68	453
jpayne@68	454 if start < 0:
jpayne@68	455 raise ValueError("start out of range (%i)" % end)
jpayne@68	456 elif start > end:
jpayne@68	457 raise ValueError(
jpayne@68	458 'start (%i) >= end (%i)' % (start, end))
jpayne@68	459 elif start == end:
jpayne@68	460 return EmptyIterator()
jpayne@68	461 else:
jpayne@68	462 region = '%s:%i-%i' % (reference, start + 1, end)
jpayne@68	463 elif start is not None:
jpayne@68	464 if start < 0:
jpayne@68	465 raise ValueError("start out of range (%i)" % end)
jpayne@68	466 region = '%s:%i' % (reference, start + 1)
jpayne@68	467 else:
jpayne@68	468 region = reference
jpayne@68	469
jpayne@68	470 # get iterator
jpayne@68	471 cdef hts_itr_t * itr
jpayne@68	472 cdef char *cstr
jpayne@68	473 cdef TabixFile fileobj
jpayne@68	474
jpayne@68	475 # reopen the same file if necessary
jpayne@68	476 if multiple_iterators:
jpayne@68	477 fileobj = self._dup()
jpayne@68	478 else:
jpayne@68	479 fileobj = self
jpayne@68	480
jpayne@68	481 if region is None:
jpayne@68	482 # without region or reference - iterate from start
jpayne@68	483 with nogil:
jpayne@68	484 itr = tbx_itr_queryi(fileobj.index,
jpayne@68	485 HTS_IDX_START,
jpayne@68	486 0,
jpayne@68	487 0)
jpayne@68	488 else:
jpayne@68	489 s = force_bytes(region, encoding=fileobj.encoding)
jpayne@68	490 cstr = s
jpayne@68	491 with nogil:
jpayne@68	492 itr = tbx_itr_querys(fileobj.index, cstr)
jpayne@68	493
jpayne@68	494 if itr == NULL:
jpayne@68	495 if region is None:
jpayne@68	496 if len(self.contigs) > 0:
jpayne@68	497 # when accessing a tabix file created prior tabix 1.0
jpayne@68	498 # the full-file iterator is empty.
jpayne@68	499 raise ValueError(
jpayne@68	500 "could not create iterator, possible "
jpayne@68	501 "tabix version mismatch")
jpayne@68	502 else:
jpayne@68	503 # possible reason is that the file is empty -
jpayne@68	504 # return an empty iterator
jpayne@68	505 return EmptyIterator()
jpayne@68	506 else:
jpayne@68	507 raise ValueError(
jpayne@68	508 "could not create iterator for region '%s'" %
jpayne@68	509 region)
jpayne@68	510
jpayne@68	511 # use default parser if no parser is specified
jpayne@68	512 if parser is None:
jpayne@68	513 parser = fileobj.parser
jpayne@68	514
jpayne@68	515 cdef TabixIterator a
jpayne@68	516 if parser is None:
jpayne@68	517 a = TabixIterator(encoding=fileobj.encoding)
jpayne@68	518 else:
jpayne@68	519 parser.set_encoding(fileobj.encoding)
jpayne@68	520 a = TabixIteratorParsed(parser)
jpayne@68	521
jpayne@68	522 a.tabixfile = fileobj
jpayne@68	523 a.iterator = itr
jpayne@68	524
jpayne@68	525 return a
jpayne@68	526
jpayne@68	527 ###############################################################
jpayne@68	528 ###############################################################
jpayne@68	529 ###############################################################
jpayne@68	530 ## properties
jpayne@68	531 ###############################################################
jpayne@68	532 property header:
jpayne@68	533 '''the file header.
jpayne@68	534
jpayne@68	535 The file header consists of the lines at the beginning of a
jpayne@68	536 file that are prefixed by the comment character ``#``.
jpayne@68	537
jpayne@68	538 .. note::
jpayne@68	539 The header is returned as an iterator presenting lines
jpayne@68	540 without the newline character.
jpayne@68	541 '''
jpayne@68	542
jpayne@68	543 def __get__(self):
jpayne@68	544
jpayne@68	545 cdef char *cfilename = self.filename
jpayne@68	546 cdef char *cfilename_index = self.filename_index
jpayne@68	547
jpayne@68	548 cdef kstring_t buffer
jpayne@68	549 buffer.l = buffer.m = 0
jpayne@68	550 buffer.s = NULL
jpayne@68	551
jpayne@68	552 cdef htsFile * fp = NULL
jpayne@68	553 cdef int KS_SEP_LINE = 2
jpayne@68	554 cdef tbx_t * tbx = NULL
jpayne@68	555 lines = []
jpayne@68	556 with nogil:
jpayne@68	557 fp = hts_open(cfilename, 'r')
jpayne@68	558
jpayne@68	559 if fp == NULL:
jpayne@68	560 raise OSError("could not open {} for reading header".format(self.filename))
jpayne@68	561
jpayne@68	562 with nogil:
jpayne@68	563 tbx = tbx_index_load2(cfilename, cfilename_index)
jpayne@68	564
jpayne@68	565 if tbx == NULL:
jpayne@68	566 raise OSError("could not load .tbi/.csi index of {}".format(self.filename))
jpayne@68	567
jpayne@68	568 while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
jpayne@68	569 if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
jpayne@68	570 break
jpayne@68	571 lines.append(force_str(buffer.s, self.encoding))
jpayne@68	572
jpayne@68	573 with nogil:
jpayne@68	574 hts_close(fp)
jpayne@68	575 free(buffer.s)
jpayne@68	576
jpayne@68	577 return lines
jpayne@68	578
jpayne@68	579 property contigs:
jpayne@68	580 '''list of chromosome names'''
jpayne@68	581 def __get__(self):
jpayne@68	582 cdef const char ** sequences
jpayne@68	583 cdef int nsequences
jpayne@68	584
jpayne@68	585 with nogil:
jpayne@68	586 sequences = tbx_seqnames(self.index, &nsequences)
jpayne@68	587 cdef int x
jpayne@68	588 result = []
jpayne@68	589 for x from 0 <= x < nsequences:
jpayne@68	590 result.append(force_str(sequences[x]))
jpayne@68	591
jpayne@68	592 # htslib instructions:
jpayne@68	593 # only free container, not the sequences themselves
jpayne@68	594 free(sequences)
jpayne@68	595
jpayne@68	596 return result
jpayne@68	597
jpayne@68	598 def close(self):
jpayne@68	599 '''
jpayne@68	600 closes the :class:`pysam.TabixFile`.'''
jpayne@68	601 if self.htsfile != NULL:
jpayne@68	602 hts_close(self.htsfile)
jpayne@68	603 self.htsfile = NULL
jpayne@68	604 if self.index != NULL:
jpayne@68	605 tbx_destroy(self.index)
jpayne@68	606 self.index = NULL
jpayne@68	607
jpayne@68	608 def __dealloc__( self ):
jpayne@68	609 # remember: dealloc cannot call other python methods
jpayne@68	610 # note: no doc string
jpayne@68	611 # note: __del__ is not called.
jpayne@68	612 if self.htsfile != NULL:
jpayne@68	613 hts_close(self.htsfile)
jpayne@68	614 self.htsfile = NULL
jpayne@68	615 if self.index != NULL:
jpayne@68	616 tbx_destroy(self.index)
jpayne@68	617
jpayne@68	618
jpayne@68	619 cdef class TabixIterator:
jpayne@68	620 """iterates over rows in tabixfile in region
jpayne@68	621 given by tid, start and end.
jpayne@68	622 """
jpayne@68	623
jpayne@68	624 def __init__(self, encoding="ascii"):
jpayne@68	625 self.encoding = encoding
jpayne@68	626
jpayne@68	627 def __iter__(self):
jpayne@68	628 self.buffer.s = NULL
jpayne@68	629 self.buffer.l = 0
jpayne@68	630 self.buffer.m = 0
jpayne@68	631
jpayne@68	632 return self
jpayne@68	633
jpayne@68	634 cdef int __cnext__(self):
jpayne@68	635 '''iterate to next element.
jpayne@68	636
jpayne@68	637 Return -5 if file has been closed when this function
jpayne@68	638 was called.
jpayne@68	639 '''
jpayne@68	640 if self.tabixfile.htsfile == NULL:
jpayne@68	641 return -5
jpayne@68	642
jpayne@68	643 cdef int retval
jpayne@68	644
jpayne@68	645 while 1:
jpayne@68	646 with nogil:
jpayne@68	647 retval = tbx_itr_next(
jpayne@68	648 self.tabixfile.htsfile,
jpayne@68	649 self.tabixfile.index,
jpayne@68	650 self.iterator,
jpayne@68	651 &self.buffer)
jpayne@68	652
jpayne@68	653 if retval < 0:
jpayne@68	654 break
jpayne@68	655
jpayne@68	656 if self.buffer.s[0] != b'#':
jpayne@68	657 break
jpayne@68	658
jpayne@68	659 return retval
jpayne@68	660
jpayne@68	661 def __next__(self):
jpayne@68	662 """python version of next().
jpayne@68	663
jpayne@68	664 pyrex uses this non-standard name instead of next()
jpayne@68	665 """
jpayne@68	666
jpayne@68	667 cdef int retval = self.__cnext__()
jpayne@68	668 if retval == -5:
jpayne@68	669 raise IOError("iteration on closed file")
jpayne@68	670 elif retval < 0:
jpayne@68	671 raise StopIteration
jpayne@68	672
jpayne@68	673 return charptr_to_str(self.buffer.s, self.encoding)
jpayne@68	674
jpayne@68	675 def __dealloc__(self):
jpayne@68	676 if <void*>self.iterator != NULL:
jpayne@68	677 tbx_itr_destroy(self.iterator)
jpayne@68	678 if self.buffer.s != NULL:
jpayne@68	679 free(self.buffer.s)
jpayne@68	680
jpayne@68	681
jpayne@68	682 class EmptyIterator:
jpayne@68	683 '''empty iterator'''
jpayne@68	684
jpayne@68	685 def __iter__(self):
jpayne@68	686 return self
jpayne@68	687
jpayne@68	688 def __next__(self):
jpayne@68	689 raise StopIteration()
jpayne@68	690
jpayne@68	691
jpayne@68	692 cdef class TabixIteratorParsed(TabixIterator):
jpayne@68	693 """iterates over mapped reads in a region.
jpayne@68	694
jpayne@68	695 The parser determines the encoding.
jpayne@68	696
jpayne@68	697 Returns parsed data.
jpayne@68	698 """
jpayne@68	699
jpayne@68	700 def __init__(self, Parser parser):
jpayne@68	701 super().__init__()
jpayne@68	702 self.parser = parser
jpayne@68	703
jpayne@68	704 def __next__(self):
jpayne@68	705 """python version of next().
jpayne@68	706
jpayne@68	707 pyrex uses this non-standard name instead of next()
jpayne@68	708 """
jpayne@68	709
jpayne@68	710 cdef int retval = self.__cnext__()
jpayne@68	711 if retval == -5:
jpayne@68	712 raise IOError("iteration on closed file")
jpayne@68	713 elif retval < 0:
jpayne@68	714 raise StopIteration
jpayne@68	715
jpayne@68	716 return self.parser.parse(self.buffer.s,
jpayne@68	717 self.buffer.l)
jpayne@68	718
jpayne@68	719
jpayne@68	720 cdef class GZIterator:
jpayne@68	721 def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
jpayne@68	722 '''iterate line-by-line through gzip (or bgzip)
jpayne@68	723 compressed file.
jpayne@68	724 '''
jpayne@68	725 if not os.path.exists(filename):
jpayne@68	726 raise IOError("No such file or directory: %s" % filename)
jpayne@68	727
jpayne@68	728 filename = encode_filename(filename)
jpayne@68	729 cdef char *cfilename = filename
jpayne@68	730 with nogil:
jpayne@68	731 self.gzipfile = bgzf_open(cfilename, "r")
jpayne@68	732 self._filename = filename
jpayne@68	733 self.kstream = ks_init(self.gzipfile)
jpayne@68	734 self.encoding = encoding
jpayne@68	735
jpayne@68	736 self.buffer.l = 0
jpayne@68	737 self.buffer.m = 0
jpayne@68	738 self.buffer.s = <char*>malloc(buffer_size)
jpayne@68	739
jpayne@68	740 def __dealloc__(self):
jpayne@68	741 '''close file.'''
jpayne@68	742 if self.gzipfile != NULL:
jpayne@68	743 bgzf_close(self.gzipfile)
jpayne@68	744 self.gzipfile = NULL
jpayne@68	745 if self.buffer.s != NULL:
jpayne@68	746 free(self.buffer.s)
jpayne@68	747 if self.kstream != NULL:
jpayne@68	748 ks_destroy(self.kstream)
jpayne@68	749
jpayne@68	750 def __iter__(self):
jpayne@68	751 return self
jpayne@68	752
jpayne@68	753 cdef int __cnext__(self):
jpayne@68	754 cdef int dret = 0
jpayne@68	755 cdef int retval = 0
jpayne@68	756 while 1:
jpayne@68	757 with nogil:
jpayne@68	758 retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
jpayne@68	759
jpayne@68	760 if retval < 0:
jpayne@68	761 break
jpayne@68	762
jpayne@68	763 return dret
jpayne@68	764 return -1
jpayne@68	765
jpayne@68	766 def __next__(self):
jpayne@68	767 """python version of next().
jpayne@68	768 """
jpayne@68	769 cdef int retval = self.__cnext__()
jpayne@68	770 if retval < 0:
jpayne@68	771 raise StopIteration
jpayne@68	772 return force_str(self.buffer.s, self.encoding)
jpayne@68	773
jpayne@68	774
jpayne@68	775 cdef class GZIteratorHead(GZIterator):
jpayne@68	776 '''iterate line-by-line through gzip (or bgzip)
jpayne@68	777 compressed file returning comments at top of file.
jpayne@68	778 '''
jpayne@68	779
jpayne@68	780 def __next__(self):
jpayne@68	781 """python version of next().
jpayne@68	782 """
jpayne@68	783 cdef int retval = self.__cnext__()
jpayne@68	784 if retval < 0:
jpayne@68	785 raise StopIteration
jpayne@68	786 if self.buffer.s[0] == b'#':
jpayne@68	787 return self.buffer.s
jpayne@68	788 else:
jpayne@68	789 raise StopIteration
jpayne@68	790
jpayne@68	791
jpayne@68	792 cdef class GZIteratorParsed(GZIterator):
jpayne@68	793 '''iterate line-by-line through gzip (or bgzip)
jpayne@68	794 compressed file returning comments at top of file.
jpayne@68	795 '''
jpayne@68	796
jpayne@68	797 def __init__(self, parser):
jpayne@68	798 self.parser = parser
jpayne@68	799
jpayne@68	800 def __next__(self):
jpayne@68	801 """python version of next().
jpayne@68	802 """
jpayne@68	803 cdef int retval = self.__cnext__()
jpayne@68	804 if retval < 0:
jpayne@68	805 raise StopIteration
jpayne@68	806
jpayne@68	807 return self.parser.parse(self.buffer.s,
jpayne@68	808 self.buffer.l)
jpayne@68	809
jpayne@68	810
jpayne@68	811 def tabix_compress(filename_in,
jpayne@68	812 filename_out,
jpayne@68	813 force=False):
jpayne@68	814 '''compress filename_in writing the output to filename_out.
jpayne@68	815
jpayne@68	816 Raise an IOError if filename_out already exists, unless force
jpayne@68	817 is set.
jpayne@68	818 '''
jpayne@68	819
jpayne@68	820 if not force and os.path.exists(filename_out):
jpayne@68	821 raise IOError(
jpayne@68	822 "Filename '%s' already exists, use force to "
jpayne@68	823 "overwrite" % filename_out)
jpayne@68	824
jpayne@68	825 cdef int WINDOW_SIZE
jpayne@68	826 cdef int c, r
jpayne@68	827 cdef void * buffer
jpayne@68	828 cdef BGZF * fp
jpayne@68	829 cdef int fd_src
jpayne@68	830 cdef bint is_empty = True
jpayne@68	831 cdef int O_RDONLY
jpayne@68	832 O_RDONLY = os.O_RDONLY
jpayne@68	833
jpayne@68	834 WINDOW_SIZE = 64 * 1024
jpayne@68	835
jpayne@68	836 fn = encode_filename(filename_out)
jpayne@68	837 cdef char *cfn = fn
jpayne@68	838 with nogil:
jpayne@68	839 fp = bgzf_open(cfn, "w")
jpayne@68	840 if fp == NULL:
jpayne@68	841 raise IOError("could not open '%s' for writing" % filename_out)
jpayne@68	842
jpayne@68	843 fn = encode_filename(filename_in)
jpayne@68	844 fd_src = open(fn, O_RDONLY)
jpayne@68	845 if fd_src == 0:
jpayne@68	846 raise IOError("could not open '%s' for reading" % filename_in)
jpayne@68	847
jpayne@68	848 buffer = malloc(WINDOW_SIZE)
jpayne@68	849 c = 1
jpayne@68	850
jpayne@68	851 while c > 0:
jpayne@68	852 with nogil:
jpayne@68	853 c = read(fd_src, buffer, WINDOW_SIZE)
jpayne@68	854 if c > 0:
jpayne@68	855 is_empty = False
jpayne@68	856 r = bgzf_write(fp, buffer, c)
jpayne@68	857 if r < 0:
jpayne@68	858 free(buffer)
jpayne@68	859 raise IOError("writing failed")
jpayne@68	860
jpayne@68	861 free(buffer)
jpayne@68	862 r = bgzf_close(fp)
jpayne@68	863 if r < 0:
jpayne@68	864 raise IOError("error %i when writing to file %s" % (r, filename_out))
jpayne@68	865
jpayne@68	866 r = close(fd_src)
jpayne@68	867 # an empty file will return with -1, thus ignore this.
jpayne@68	868 if r < 0:
jpayne@68	869 if not (r == -1 and is_empty):
jpayne@68	870 raise IOError("error %i when closing file %s" % (r, filename_in))
jpayne@68	871
jpayne@68	872
jpayne@68	873 def tabix_index(filename,
jpayne@68	874 force=False,
jpayne@68	875 seq_col=None,
jpayne@68	876 start_col=None,
jpayne@68	877 end_col=None,
jpayne@68	878 preset=None,
jpayne@68	879 meta_char="#",
jpayne@68	880 int line_skip=0,
jpayne@68	881 zerobased=False,
jpayne@68	882 int min_shift=-1,
jpayne@68	883 index=None,
jpayne@68	884 keep_original=False,
jpayne@68	885 csi=False,
jpayne@68	886 ):
jpayne@68	887 '''index tab-separated filename using tabix.
jpayne@68	888
jpayne@68	889 An existing index will not be overwritten unless force is set.
jpayne@68	890
jpayne@68	891 The index will be built from coordinates in columns seq_col,
jpayne@68	892 start_col and end_col.
jpayne@68	893
jpayne@68	894 The contents of filename have to be sorted by contig and
jpayne@68	895 position - the method does not check if the file is sorted.
jpayne@68	896
jpayne@68	897 Column indices are 0-based. Note that this is different from the
jpayne@68	898 tabix command line utility where column indices start at 1.
jpayne@68	899
jpayne@68	900 Coordinates in the file are assumed to be 1-based unless
jpayne@68	901 zerobased is set.
jpayne@68	902
jpayne@68	903 If preset is provided, the column coordinates are taken from a
jpayne@68	904 preset. Valid values for preset are "gff", "bed", "sam", "vcf",
jpayne@68	905 psltbl", "pileup".
jpayne@68	906
jpayne@68	907 Lines beginning with meta_char and the first line_skip lines
jpayne@68	908 will be skipped.
jpayne@68	909
jpayne@68	910 If filename is not detected as a gzip file it will be automatically
jpayne@68	911 compressed. The original file will be removed and only the compressed
jpayne@68	912 file will be retained.
jpayne@68	913
jpayne@68	914 By default or when min_shift is 0, creates a TBI index. If min_shift
jpayne@68	915 is greater than zero and/or csi is True, creates a CSI index with a
jpayne@68	916 minimal interval size of 1<<min_shift (1<<14 if only csi is set).
jpayne@68	917
jpayne@68	918 index controls the filename which should be used for creating the index.
jpayne@68	919 If not set, the default is to append ``.tbi`` to filename.
jpayne@68	920
jpayne@68	921 When automatically compressing files, if keep_original is set the
jpayne@68	922 uncompressed file will not be deleted.
jpayne@68	923
jpayne@68	924 returns the filename of the compressed data
jpayne@68	925
jpayne@68	926 '''
jpayne@68	927
jpayne@68	928 if preset is None and \
jpayne@68	929 (seq_col is None or start_col is None or end_col is None):
jpayne@68	930 raise ValueError(
jpayne@68	931 "neither preset nor seq_col,start_col and end_col given")
jpayne@68	932
jpayne@68	933 fn = encode_filename(filename)
jpayne@68	934 cdef char *cfn = fn
jpayne@68	935
jpayne@68	936 cdef htsFile *fp = hts_open(cfn, "r")
jpayne@68	937 if fp == NULL:
jpayne@68	938 raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))
jpayne@68	939
jpayne@68	940 cdef htsFormat fmt = hts_get_format(fp)[0]
jpayne@68	941 hts_close(fp)
jpayne@68	942
jpayne@68	943 if fmt.compression == no_compression:
jpayne@68	944 tabix_compress(filename, filename + ".gz", force=force)
jpayne@68	945 if not keep_original:
jpayne@68	946 os.unlink(filename)
jpayne@68	947 filename += ".gz"
jpayne@68	948 fn = encode_filename(filename)
jpayne@68	949 cfn = fn
jpayne@68	950
jpayne@68	951 # columns (1-based):
jpayne@68	952 # preset-code, contig, start, end, metachar for
jpayne@68	953 # comments, lines to ignore at beginning
jpayne@68	954 # 0 is a missing column
jpayne@68	955 preset2conf = {
jpayne@68	956 'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
jpayne@68	957 'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
jpayne@68	958 'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
jpayne@68	959 'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
jpayne@68	960 'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
jpayne@68	961 }
jpayne@68	962
jpayne@68	963 conf_data = None
jpayne@68	964 if preset == "bcf" or fmt.format == bcf:
jpayne@68	965 csi = True
jpayne@68	966 elif preset:
jpayne@68	967 try:
jpayne@68	968 conf_data = preset2conf[preset]
jpayne@68	969 except KeyError:
jpayne@68	970 raise KeyError(
jpayne@68	971 "unknown preset '%s', valid presets are '%s'" %
jpayne@68	972 (preset, ",".join(preset2conf.keys())))
jpayne@68	973 else:
jpayne@68	974 if end_col is None:
jpayne@68	975 end_col = -1
jpayne@68	976
jpayne@68	977 preset = 0
jpayne@68	978 # tabix internally works with 0-based coordinates and
jpayne@68	979 # open/closed intervals. When using a preset, conversion is
jpayne@68	980 # automatically taken care of. Otherwise, the coordinates are
jpayne@68	981 # assumed to be 1-based closed intervals and -1 is subtracted
jpayne@68	982 # from the start coordinate. To avoid doing this, set the
jpayne@68	983 # TI_FLAG_UCSC=0x10000 flag:
jpayne@68	984 if zerobased:
jpayne@68	985 preset = preset \| TBX_UCSC
jpayne@68	986
jpayne@68	987 conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)
jpayne@68	988
jpayne@68	989 cdef tbx_conf_t conf
jpayne@68	990 if conf_data:
jpayne@68	991 conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
jpayne@68	992
jpayne@68	993 if csi or min_shift > 0:
jpayne@68	994 suffix = ".csi"
jpayne@68	995 if min_shift <= 0: min_shift = 14
jpayne@68	996 else:
jpayne@68	997 suffix = ".tbi"
jpayne@68	998 min_shift = 0
jpayne@68	999
jpayne@68	1000 index = index or filename + suffix
jpayne@68	1001 fn_index = encode_filename(index)
jpayne@68	1002
jpayne@68	1003 if not force and os.path.exists(index):
jpayne@68	1004 raise IOError(
jpayne@68	1005 "filename '%s' already exists, use force to overwrite" % index)
jpayne@68	1006
jpayne@68	1007 cdef char *fnidx = fn_index
jpayne@68	1008 cdef int retval = 0
jpayne@68	1009
jpayne@68	1010 if csi and fmt.format == bcf:
jpayne@68	1011 with nogil:
jpayne@68	1012 retval = bcf_index_build2(cfn, fnidx, min_shift)
jpayne@68	1013 else:
jpayne@68	1014 with nogil:
jpayne@68	1015 retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)
jpayne@68	1016
jpayne@68	1017 if retval != 0:
jpayne@68	1018 raise OSError("building of index for {} failed".format(filename))
jpayne@68	1019
jpayne@68	1020 return filename
jpayne@68	1021
jpayne@68	1022 # #########################################################
jpayne@68	1023 # cdef class tabix_file_iterator_old:
jpayne@68	1024 # '''iterate over ``infile``.
jpayne@68	1025
jpayne@68	1026 # This iterator is not safe. If the :meth:`__next__()` method is called
jpayne@68	1027 # after ``infile`` is closed, the result is undefined (see ``fclose()``).
jpayne@68	1028
jpayne@68	1029 # The iterator might either raise a StopIteration or segfault.
jpayne@68	1030 # '''
jpayne@68	1031
jpayne@68	1032
jpayne@68	1033 # def __cinit__(self,
jpayne@68	1034 # infile,
jpayne@68	1035 # Parser parser,
jpayne@68	1036 # int buffer_size = 65536 ):
jpayne@68	1037
jpayne@68	1038 # cdef int fd = PyObject_AsFileDescriptor( infile )
jpayne@68	1039 # if fd == -1: raise ValueError( "I/O operation on closed file." )
jpayne@68	1040 # self.infile = fdopen( fd, 'r')
jpayne@68	1041
jpayne@68	1042 # if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
jpayne@68	1043
jpayne@68	1044 # self.buffer = <char*>malloc( buffer_size )
jpayne@68	1045 # self.size = buffer_size
jpayne@68	1046 # self.parser = parser
jpayne@68	1047
jpayne@68	1048 # def __iter__(self):
jpayne@68	1049 # return self
jpayne@68	1050
jpayne@68	1051 # cdef __cnext__(self):
jpayne@68	1052
jpayne@68	1053 # cdef char * b
jpayne@68	1054 # cdef size_t nbytes
jpayne@68	1055 # b = self.buffer
jpayne@68	1056
jpayne@68	1057 # while not feof( self.infile ):
jpayne@68	1058 # nbytes = getline( &b, &self.size, self.infile)
jpayne@68	1059
jpayne@68	1060 # # stop at first error or eof
jpayne@68	1061 # if (nbytes == -1): break
jpayne@68	1062 # # skip comments
jpayne@68	1063 # if (b[0] == '#'): continue
jpayne@68	1064
jpayne@68	1065 # # skip empty lines
jpayne@68	1066 # if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
jpayne@68	1067
jpayne@68	1068 # # make sure that entry is complete
jpayne@68	1069 # if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
jpayne@68	1070 # result = b
jpayne@68	1071 # raise ValueError( "incomplete line at %s" % result )
jpayne@68	1072
jpayne@68	1073 # # make sure that this goes fully through C
jpayne@68	1074 # # otherwise buffer is copied to/from a
jpayne@68	1075 # # Python object causing segfaults as
jpayne@68	1076 # # the wrong memory is freed
jpayne@68	1077 # return self.parser.parse( b, nbytes )
jpayne@68	1078
jpayne@68	1079 # raise StopIteration
jpayne@68	1080
jpayne@68	1081 # def __dealloc__(self):
jpayne@68	1082 # free(self.buffer)
jpayne@68	1083
jpayne@68	1084 # def __next__(self):
jpayne@68	1085 # return self.__cnext__()
jpayne@68	1086
jpayne@68	1087 #########################################################
jpayne@68	1088 #########################################################
jpayne@68	1089 #########################################################
jpayne@68	1090 ## Iterators for parsing through unindexed files.
jpayne@68	1091 #########################################################
jpayne@68	1092 # cdef buildGzipError(void *gzfp):
jpayne@68	1093 # cdef int errnum = 0
jpayne@68	1094 # cdef char *s = gzerror(gzfp, &errnum)
jpayne@68	1095 # return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
jpayne@68	1096
jpayne@68	1097
jpayne@68	1098 cdef class tabix_file_iterator:
jpayne@68	1099 '''iterate over a compressed or uncompressed ``infile``.
jpayne@68	1100 '''
jpayne@68	1101
jpayne@68	1102 def __cinit__(self,
jpayne@68	1103 infile,
jpayne@68	1104 Parser parser,
jpayne@68	1105 int buffer_size=65536):
jpayne@68	1106
jpayne@68	1107 if infile.closed:
jpayne@68	1108 raise ValueError("I/O operation on closed file.")
jpayne@68	1109
jpayne@68	1110 self.infile = infile
jpayne@68	1111
jpayne@68	1112 cdef int fd = PyObject_AsFileDescriptor(infile)
jpayne@68	1113 if fd == -1:
jpayne@68	1114 raise ValueError("I/O operation on closed file.")
jpayne@68	1115
jpayne@68	1116 self.duplicated_fd = dup(fd)
jpayne@68	1117
jpayne@68	1118 # From the manual:
jpayne@68	1119 # gzopen can be used to read a file which is not in gzip format;
jpayne@68	1120 # in this case gzread will directly read from the file without decompression.
jpayne@68	1121 # When reading, this will be detected automatically by looking
jpayne@68	1122 # for the magic two-byte gzip header.
jpayne@68	1123 self.fh = bgzf_dopen(self.duplicated_fd, 'r')
jpayne@68	1124
jpayne@68	1125 if self.fh == NULL:
jpayne@68	1126 raise IOError('%s' % strerror(errno))
jpayne@68	1127
jpayne@68	1128 self.kstream = ks_init(self.fh)
jpayne@68	1129
jpayne@68	1130 self.buffer.s = <char*>malloc(buffer_size)
jpayne@68	1131 #if self.buffer == NULL:
jpayne@68	1132 # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
jpayne@68	1133 #self.size = buffer_size
jpayne@68	1134 self.parser = parser
jpayne@68	1135
jpayne@68	1136 def __iter__(self):
jpayne@68	1137 return self
jpayne@68	1138
jpayne@68	1139 cdef __cnext__(self):
jpayne@68	1140
jpayne@68	1141 cdef char * b
jpayne@68	1142 cdef int dret = 0
jpayne@68	1143 cdef int retval = 0
jpayne@68	1144 while 1:
jpayne@68	1145 with nogil:
jpayne@68	1146 retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
jpayne@68	1147
jpayne@68	1148 if retval < 0:
jpayne@68	1149 break
jpayne@68	1150 #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
jpayne@68	1151
jpayne@68	1152 b = self.buffer.s
jpayne@68	1153
jpayne@68	1154 # skip comments
jpayne@68	1155 if (b[0] == b'#'):
jpayne@68	1156 continue
jpayne@68	1157
jpayne@68	1158 # skip empty lines
jpayne@68	1159 if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
jpayne@68	1160 continue
jpayne@68	1161
jpayne@68	1162 # gzgets terminates at \n, no need to test
jpayne@68	1163
jpayne@68	1164 # parser creates a copy
jpayne@68	1165 return self.parser.parse(b, self.buffer.l)
jpayne@68	1166
jpayne@68	1167 raise StopIteration
jpayne@68	1168
jpayne@68	1169 def __dealloc__(self):
jpayne@68	1170 free(self.buffer.s)
jpayne@68	1171 ks_destroy(self.kstream)
jpayne@68	1172 bgzf_close(self.fh)
jpayne@68	1173
jpayne@68	1174 def __next__(self):
jpayne@68	1175 return self.__cnext__()
jpayne@68	1176
jpayne@68	1177
jpayne@68	1178 class tabix_generic_iterator:
jpayne@68	1179 '''iterate over ``infile``.
jpayne@68	1180
jpayne@68	1181 Permits the use of file-like objects for example from the gzip module.
jpayne@68	1182 '''
jpayne@68	1183 def __init__(self, infile, parser):
jpayne@68	1184
jpayne@68	1185 self.infile = infile
jpayne@68	1186 if self.infile.closed:
jpayne@68	1187 raise ValueError("I/O operation on closed file.")
jpayne@68	1188 self.parser = parser
jpayne@68	1189
jpayne@68	1190 def __iter__(self):
jpayne@68	1191 return self
jpayne@68	1192
jpayne@68	1193 # cython version - required for python 3
jpayne@68	1194 def __next__(self):
jpayne@68	1195
jpayne@68	1196 cdef char * b
jpayne@68	1197 cdef char * cpy
jpayne@68	1198 cdef size_t nbytes
jpayne@68	1199
jpayne@68	1200 encoding = self.parser.get_encoding()
jpayne@68	1201
jpayne@68	1202 # note that GzipFile.close() does not close the file
jpayne@68	1203 # reading is still possible.
jpayne@68	1204 if self.infile.closed:
jpayne@68	1205 raise ValueError("I/O operation on closed file.")
jpayne@68	1206
jpayne@68	1207 while 1:
jpayne@68	1208
jpayne@68	1209 line = self.infile.readline()
jpayne@68	1210 if not line:
jpayne@68	1211 break
jpayne@68	1212
jpayne@68	1213 s = force_bytes(line, encoding)
jpayne@68	1214 b = s
jpayne@68	1215 nbytes = len(line)
jpayne@68	1216 assert b[nbytes] == b'\0'
jpayne@68	1217
jpayne@68	1218 # skip comments
jpayne@68	1219 if b[0] == b'#':
jpayne@68	1220 continue
jpayne@68	1221
jpayne@68	1222 # skip empty lines
jpayne@68	1223 if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
jpayne@68	1224 continue
jpayne@68	1225
jpayne@68	1226 # make sure that entry is complete
jpayne@68	1227 if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
jpayne@68	1228 raise ValueError("incomplete line at %s" % line)
jpayne@68	1229
jpayne@68	1230 bytes_cpy = <bytes> b
jpayne@68	1231 cpy = <char *> bytes_cpy
jpayne@68	1232
jpayne@68	1233 return self.parser(cpy, nbytes)
jpayne@68	1234
jpayne@68	1235 raise StopIteration
jpayne@68	1236
jpayne@68	1237
jpayne@68	1238 def tabix_iterator(infile, parser):
jpayne@68	1239 """return an iterator over all entries in a file.
jpayne@68	1240
jpayne@68	1241 Results are returned parsed as specified by the parser. If
jpayne@68	1242 parser is None, the results are returned as an unparsed string.
jpayne@68	1243 Otherwise, parser is assumed to be a functor that will return
jpayne@68	1244 parsed data (see for example :class:`~pysam.asTuple` and
jpayne@68	1245 :class:`~pysam.asGTF`).
jpayne@68	1246
jpayne@68	1247 """
jpayne@68	1248 return tabix_generic_iterator(infile, parser)
jpayne@68	1249
jpayne@68	1250
jpayne@68	1251 cdef class Tabixfile(TabixFile):
jpayne@68	1252 """Tabixfile is deprecated: use TabixFile instead"""
jpayne@68	1253 pass
jpayne@68	1254
jpayne@68	1255
jpayne@68	1256 __all__ = [
jpayne@68	1257 "tabix_index",
jpayne@68	1258 "tabix_compress",
jpayne@68	1259 "TabixFile",
jpayne@68	1260 "Tabixfile",
jpayne@68	1261 "asTuple",
jpayne@68	1262 "asGTF",
jpayne@68	1263 "asGFF3",
jpayne@68	1264 "asVCF",
jpayne@68	1265 "asBed",
jpayne@68	1266 "GZIterator",
jpayne@68	1267 "GZIteratorHead",
jpayne@68	1268 "tabix_iterator",
jpayne@68	1269 "tabix_generic_iterator",
jpayne@68	1270 "tabix_file_iterator",
jpayne@68	1271 ]

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 68:5028fdace37b