csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

rev	line source
jpayne@69	1 # cython: language_level=3
jpayne@69	2 # cython: embedsignature=True
jpayne@69	3 # cython: profile=True
jpayne@69	4 ###############################################################################
jpayne@69	5 ###############################################################################
jpayne@69	6 # Cython wrapper for access to tabix indexed files in bgzf format
jpayne@69	7 ###############################################################################
jpayne@69	8 # The principal classes and functions defined in this module are:
jpayne@69	9 #
jpayne@69	10 # class TabixFile class wrapping tabix indexed files in bgzf format
jpayne@69	11 #
jpayne@69	12 # class asTuple Parser class for tuples
jpayne@69	13 # class asGTF Parser class for GTF formatted rows
jpayne@69	14 # class asGFF3 Parser class for GFF3 formatted rows
jpayne@69	15 # class asBed Parser class for Bed formatted rows
jpayne@69	16 # class asVCF Parser class for VCF formatted rows
jpayne@69	17 #
jpayne@69	18 # class tabix_generic_iterator Streamed iterator of bgzf formatted files
jpayne@69	19 #
jpayne@69	20 # Additionally this module defines several additional classes that are part
jpayne@69	21 # of the internal API. These are:
jpayne@69	22 #
jpayne@69	23 # class Parser base class for parsers of tab-separated rows
jpayne@69	24 # class tabix_file_iterator
jpayne@69	25 # class TabixIterator iterator class over rows in bgzf file
jpayne@69	26 # class EmptyIterator
jpayne@69	27 #
jpayne@69	28 # For backwards compatibility, the following classes are also defined:
jpayne@69	29 #
jpayne@69	30 # class Tabixfile equivalent to TabixFile
jpayne@69	31 #
jpayne@69	32 ###############################################################################
jpayne@69	33 #
jpayne@69	34 # The MIT License
jpayne@69	35 #
jpayne@69	36 # Copyright (c) 2015 Andreas Heger
jpayne@69	37 #
jpayne@69	38 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@69	39 # copy of this software and associated documentation files (the "Software"),
jpayne@69	40 # to deal in the Software without restriction, including without limitation
jpayne@69	41 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@69	42 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@69	43 # Software is furnished to do so, subject to the following conditions:
jpayne@69	44 #
jpayne@69	45 # The above copyright notice and this permission notice shall be included in
jpayne@69	46 # all copies or substantial portions of the Software.
jpayne@69	47 #
jpayne@69	48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@69	49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@69	50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@69	51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@69	52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@69	53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@69	54 # DEALINGS IN THE SOFTWARE.
jpayne@69	55 #
jpayne@69	56 ###############################################################################
jpayne@69	57 import os
jpayne@69	58 import sys
jpayne@69	59
jpayne@69	60 from libc.stdio cimport printf, fprintf, stderr
jpayne@69	61 from libc.string cimport strerror
jpayne@69	62 from libc.errno cimport errno
jpayne@69	63 from posix.unistd cimport dup
jpayne@69	64
jpayne@69	65 from cpython cimport PyErr_SetString, PyBytes_Check, \
jpayne@69	66 PyUnicode_Check, PyBytes_FromStringAndSize, \
jpayne@69	67 PyObject_AsFileDescriptor
jpayne@69	68
jpayne@69	69 cimport pysam.libctabixproxies as ctabixproxies
jpayne@69	70
jpayne@69	71 from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
jpayne@69	72 BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
jpayne@69	73 tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
jpayne@69	74 tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
jpayne@69	75 tbx_destroy, hisremote, region_list, hts_getline, \
jpayne@69	76 TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
jpayne@69	77 no_compression, bcf, bcf_index_build2
jpayne@69	78
jpayne@69	79 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
jpayne@69	80 from pysam.libcutils cimport encode_filename, from_string_and_size
jpayne@69	81
jpayne@69	82 cdef class Parser:
jpayne@69	83
jpayne@69	84 def __init__(self, encoding="ascii"):
jpayne@69	85 self.encoding = encoding
jpayne@69	86
jpayne@69	87 def set_encoding(self, encoding):
jpayne@69	88 self.encoding = encoding
jpayne@69	89
jpayne@69	90 def get_encoding(self):
jpayne@69	91 return self.encoding
jpayne@69	92
jpayne@69	93 cdef parse(self, char * buffer, int length):
jpayne@69	94 raise NotImplementedError(
jpayne@69	95 'parse method of %s not implemented' % str(self))
jpayne@69	96
jpayne@69	97 def __call__(self, char * buffer, int length):
jpayne@69	98 return self.parse(buffer, length)
jpayne@69	99
jpayne@69	100
jpayne@69	101 cdef class asTuple(Parser):
jpayne@69	102 '''converts a :term:`tabix row` into a python tuple.
jpayne@69	103
jpayne@69	104 A field in a row is accessed by numeric index.
jpayne@69	105 '''
jpayne@69	106 cdef parse(self, char * buffer, int len):
jpayne@69	107 cdef ctabixproxies.TupleProxy r
jpayne@69	108 r = ctabixproxies.TupleProxy(self.encoding)
jpayne@69	109 # need to copy - there were some
jpayne@69	110 # persistence issues with "present"
jpayne@69	111 r.copy(buffer, len)
jpayne@69	112 return r
jpayne@69	113
jpayne@69	114
jpayne@69	115 cdef class asGFF3(Parser):
jpayne@69	116 '''converts a :term:`tabix row` into a GFF record with the following
jpayne@69	117 fields:
jpayne@69	118
jpayne@69	119 +----------+----------+-------------------------------+
jpayne@69	120 \|Column \|Name \|Content \|
jpayne@69	121 +----------+----------+-------------------------------+
jpayne@69	122 \|1 \|contig \|the chromosome name \|
jpayne@69	123 +----------+----------+-------------------------------+
jpayne@69	124 \|2 \|feature \|The feature type \|
jpayne@69	125 +----------+----------+-------------------------------+
jpayne@69	126 \|3 \|source \|The feature source \|
jpayne@69	127 +----------+----------+-------------------------------+
jpayne@69	128 \|4 \|start \|genomic start coordinate \|
jpayne@69	129 \| \| \|(0-based) \|
jpayne@69	130 +----------+----------+-------------------------------+
jpayne@69	131 \|5 \|end \|genomic end coordinate \|
jpayne@69	132 \| \| \|(0-based) \|
jpayne@69	133 +----------+----------+-------------------------------+
jpayne@69	134 \|6 \|score \|feature score \|
jpayne@69	135 +----------+----------+-------------------------------+
jpayne@69	136 \|7 \|strand \|strand \|
jpayne@69	137 +----------+----------+-------------------------------+
jpayne@69	138 \|8 \|frame \|frame \|
jpayne@69	139 +----------+----------+-------------------------------+
jpayne@69	140 \|9 \|attributes\|the attribute field \|
jpayne@69	141 +----------+----------+-------------------------------+
jpayne@69	142
jpayne@69	143 '''
jpayne@69	144 cdef parse(self, char * buffer, int len):
jpayne@69	145 cdef ctabixproxies.GFF3Proxy r
jpayne@69	146 r = ctabixproxies.GFF3Proxy(self.encoding)
jpayne@69	147 r.copy(buffer, len)
jpayne@69	148 return r
jpayne@69	149
jpayne@69	150
jpayne@69	151 cdef class asGTF(Parser):
jpayne@69	152 '''converts a :term:`tabix row` into a GTF record with the following
jpayne@69	153 fields:
jpayne@69	154
jpayne@69	155 +----------+----------+-------------------------------+
jpayne@69	156 \|Column \|Name \|Content \|
jpayne@69	157 +----------+----------+-------------------------------+
jpayne@69	158 \|1 \|contig \|the chromosome name \|
jpayne@69	159 +----------+----------+-------------------------------+
jpayne@69	160 \|2 \|feature \|The feature type \|
jpayne@69	161 +----------+----------+-------------------------------+
jpayne@69	162 \|3 \|source \|The feature source \|
jpayne@69	163 +----------+----------+-------------------------------+
jpayne@69	164 \|4 \|start \|genomic start coordinate \|
jpayne@69	165 \| \| \|(0-based) \|
jpayne@69	166 +----------+----------+-------------------------------+
jpayne@69	167 \|5 \|end \|genomic end coordinate \|
jpayne@69	168 \| \| \|(0-based) \|
jpayne@69	169 +----------+----------+-------------------------------+
jpayne@69	170 \|6 \|score \|feature score \|
jpayne@69	171 +----------+----------+-------------------------------+
jpayne@69	172 \|7 \|strand \|strand \|
jpayne@69	173 +----------+----------+-------------------------------+
jpayne@69	174 \|8 \|frame \|frame \|
jpayne@69	175 +----------+----------+-------------------------------+
jpayne@69	176 \|9 \|attributes\|the attribute field \|
jpayne@69	177 +----------+----------+-------------------------------+
jpayne@69	178
jpayne@69	179 GTF formatted entries also define the following fields that
jpayne@69	180 are derived from the attributes field:
jpayne@69	181
jpayne@69	182 +--------------------+------------------------------+
jpayne@69	183 \|Name \|Content \|
jpayne@69	184 +--------------------+------------------------------+
jpayne@69	185 \|gene_id \|the gene identifier \|
jpayne@69	186 +--------------------+------------------------------+
jpayne@69	187 \|transcript_id \|the transcript identifier \|
jpayne@69	188 +--------------------+------------------------------+
jpayne@69	189
jpayne@69	190 '''
jpayne@69	191 cdef parse(self, char * buffer, int len):
jpayne@69	192 cdef ctabixproxies.GTFProxy r
jpayne@69	193 r = ctabixproxies.GTFProxy(self.encoding)
jpayne@69	194 r.copy(buffer, len)
jpayne@69	195 return r
jpayne@69	196
jpayne@69	197
jpayne@69	198 cdef class asBed(Parser):
jpayne@69	199 '''converts a :term:`tabix row` into a bed record
jpayne@69	200 with the following fields:
jpayne@69	201
jpayne@69	202 +-----------+-----------+------------------------------------------+
jpayne@69	203 \|Column \|Field \|Contents \|
jpayne@69	204 \| \| \| \|
jpayne@69	205 +-----------+-----------+------------------------------------------+
jpayne@69	206 \|1 \|contig \|contig \|
jpayne@69	207 \| \| \| \|
jpayne@69	208 +-----------+-----------+------------------------------------------+
jpayne@69	209 \|2 \|start \|genomic start coordinate (zero-based) \|
jpayne@69	210 +-----------+-----------+------------------------------------------+
jpayne@69	211 \|3 \|end \|genomic end coordinate plus one \|
jpayne@69	212 \| \| \|(zero-based) \|
jpayne@69	213 +-----------+-----------+------------------------------------------+
jpayne@69	214 \|4 \|name \|name of feature. \|
jpayne@69	215 +-----------+-----------+------------------------------------------+
jpayne@69	216 \|5 \|score \|score of feature \|
jpayne@69	217 +-----------+-----------+------------------------------------------+
jpayne@69	218 \|6 \|strand \|strand of feature \|
jpayne@69	219 +-----------+-----------+------------------------------------------+
jpayne@69	220 \|7 \|thickStart \|thickStart \|
jpayne@69	221 +-----------+-----------+------------------------------------------+
jpayne@69	222 \|8 \|thickEnd \|thickEnd \|
jpayne@69	223 +-----------+-----------+------------------------------------------+
jpayne@69	224 \|9 \|itemRGB \|itemRGB \|
jpayne@69	225 +-----------+-----------+------------------------------------------+
jpayne@69	226 \|10 \|blockCount \|number of bocks \|
jpayne@69	227 +-----------+-----------+------------------------------------------+
jpayne@69	228 \|11 \|blockSizes \|',' separated string of block sizes \|
jpayne@69	229 +-----------+-----------+------------------------------------------+
jpayne@69	230 \|12 \|blockStarts\|',' separated string of block genomic \|
jpayne@69	231 \| \| \|start positions \|
jpayne@69	232 +-----------+-----------+------------------------------------------+
jpayne@69	233
jpayne@69	234 Only the first three fields are required. Additional
jpayne@69	235 fields are optional, but if one is defined, all the preceding
jpayne@69	236 need to be defined as well.
jpayne@69	237
jpayne@69	238 '''
jpayne@69	239 cdef parse(self, char * buffer, int len):
jpayne@69	240 cdef ctabixproxies.BedProxy r
jpayne@69	241 r = ctabixproxies.BedProxy(self.encoding)
jpayne@69	242 r.copy(buffer, len)
jpayne@69	243 return r
jpayne@69	244
jpayne@69	245
jpayne@69	246 cdef class asVCF(Parser):
jpayne@69	247 '''converts a :term:`tabix row` into a VCF record with
jpayne@69	248 the following fields:
jpayne@69	249
jpayne@69	250 +----------+---------+------------------------------------+
jpayne@69	251 \|Column \|Field \|Contents \|
jpayne@69	252 \| \| \| \|
jpayne@69	253 +----------+---------+------------------------------------+
jpayne@69	254 \|1 \|contig \|chromosome \|
jpayne@69	255 +----------+---------+------------------------------------+
jpayne@69	256 \|2 \|pos \|chromosomal position, zero-based \|
jpayne@69	257 +----------+---------+------------------------------------+
jpayne@69	258 \|3 \|id \|id \|
jpayne@69	259 +----------+---------+------------------------------------+
jpayne@69	260 \|4 \|ref \|reference allele \|
jpayne@69	261 +----------+---------+------------------------------------+
jpayne@69	262 \|5 \|alt \|alternate alleles \|
jpayne@69	263 +----------+---------+------------------------------------+
jpayne@69	264 \|6 \|qual \|quality \|
jpayne@69	265 +----------+---------+------------------------------------+
jpayne@69	266 \|7 \|filter \|filter \|
jpayne@69	267 +----------+---------+------------------------------------+
jpayne@69	268 \|8 \|info \|info \|
jpayne@69	269 +----------+---------+------------------------------------+
jpayne@69	270 \|9 \|format \|format specifier. \|
jpayne@69	271 +----------+---------+------------------------------------+
jpayne@69	272
jpayne@69	273 Access to genotypes is via index::
jpayne@69	274
jpayne@69	275 contig = vcf.contig
jpayne@69	276 first_sample_genotype = vcf[0]
jpayne@69	277 second_sample_genotype = vcf[1]
jpayne@69	278
jpayne@69	279 '''
jpayne@69	280 cdef parse(self, char * buffer, int len):
jpayne@69	281 cdef ctabixproxies.VCFProxy r
jpayne@69	282 r = ctabixproxies.VCFProxy(self.encoding)
jpayne@69	283 r.copy(buffer, len)
jpayne@69	284 return r
jpayne@69	285
jpayne@69	286
jpayne@69	287 cdef class TabixFile:
jpayne@69	288 """Random access to bgzf formatted files that
jpayne@69	289 have been indexed by :term:`tabix`.
jpayne@69	290
jpayne@69	291 The file is automatically opened. The index file of file
jpayne@69	292 ``<filename>`` is expected to be called ``<filename>.tbi``
jpayne@69	293 by default (see parameter `index`).
jpayne@69	294
jpayne@69	295 Parameters
jpayne@69	296 ----------
jpayne@69	297
jpayne@69	298 filename : string
jpayne@69	299 Filename of bgzf file to be opened.
jpayne@69	300
jpayne@69	301 index : string
jpayne@69	302 The filename of the index. If not set, the default is to
jpayne@69	303 assume that the index is called ``filename.tbi``
jpayne@69	304
jpayne@69	305 mode : char
jpayne@69	306 The file opening mode. Currently, only ``r`` is permitted.
jpayne@69	307
jpayne@69	308 parser : :class:`pysam.Parser`
jpayne@69	309
jpayne@69	310 sets the default parser for this tabix file. If `parser`
jpayne@69	311 is None, the results are returned as an unparsed string.
jpayne@69	312 Otherwise, `parser` is assumed to be a functor that will return
jpayne@69	313 parsed data (see for example :class:`~pysam.asTuple` and
jpayne@69	314 :class:`~pysam.asGTF`).
jpayne@69	315
jpayne@69	316 encoding : string
jpayne@69	317
jpayne@69	318 The encoding passed to the parser
jpayne@69	319
jpayne@69	320 threads: integer
jpayne@69	321 Number of threads to use for decompressing Tabix files.
jpayne@69	322 (Default=1)
jpayne@69	323
jpayne@69	324
jpayne@69	325 Raises
jpayne@69	326 ------
jpayne@69	327
jpayne@69	328 ValueError
jpayne@69	329 if index file is missing.
jpayne@69	330
jpayne@69	331 IOError
jpayne@69	332 if file could not be opened
jpayne@69	333 """
jpayne@69	334 def __cinit__(self,
jpayne@69	335 filename,
jpayne@69	336 mode='r',
jpayne@69	337 parser=None,
jpayne@69	338 index=None,
jpayne@69	339 encoding="ascii",
jpayne@69	340 threads=1,
jpayne@69	341 *args,
jpayne@69	342 **kwargs ):
jpayne@69	343
jpayne@69	344 self.htsfile = NULL
jpayne@69	345 self.is_remote = False
jpayne@69	346 self.is_stream = False
jpayne@69	347 self.parser = parser
jpayne@69	348 self.threads = threads
jpayne@69	349 self._open(filename, mode, index, args, *kwargs)
jpayne@69	350 self.encoding = encoding
jpayne@69	351
jpayne@69	352 def _open( self,
jpayne@69	353 filename,
jpayne@69	354 mode='r',
jpayne@69	355 index=None,
jpayne@69	356 threads=1,
jpayne@69	357 ):
jpayne@69	358 '''open a :term:`tabix file` for reading.'''
jpayne@69	359
jpayne@69	360 if mode != 'r':
jpayne@69	361 raise ValueError("invalid file opening mode `%s`" % mode)
jpayne@69	362
jpayne@69	363 if self.htsfile != NULL:
jpayne@69	364 self.close()
jpayne@69	365 self.htsfile = NULL
jpayne@69	366 self.threads=threads
jpayne@69	367
jpayne@69	368 filename_index = index or (filename + ".tbi")
jpayne@69	369 # encode all the strings to pass to tabix
jpayne@69	370 self.filename = encode_filename(filename)
jpayne@69	371 self.filename_index = encode_filename(filename_index)
jpayne@69	372
jpayne@69	373 self.is_stream = self.filename == b'-'
jpayne@69	374 self.is_remote = hisremote(self.filename)
jpayne@69	375
jpayne@69	376 if not self.is_remote:
jpayne@69	377 if not os.path.exists(filename):
jpayne@69	378 raise IOError("file `%s` not found" % filename)
jpayne@69	379
jpayne@69	380 if not os.path.exists(filename_index):
jpayne@69	381 raise IOError("index `%s` not found" % filename_index)
jpayne@69	382
jpayne@69	383 # open file
jpayne@69	384 cdef char *cfilename = self.filename
jpayne@69	385 cdef char *cfilename_index = self.filename_index
jpayne@69	386 with nogil:
jpayne@69	387 self.htsfile = hts_open(cfilename, 'r')
jpayne@69	388
jpayne@69	389 if self.htsfile == NULL:
jpayne@69	390 raise IOError("could not open file `%s`" % filename)
jpayne@69	391
jpayne@69	392 #if self.htsfile.format.category != region_list:
jpayne@69	393 # raise ValueError("file does not contain region data")
jpayne@69	394
jpayne@69	395 with nogil:
jpayne@69	396 self.index = tbx_index_load2(cfilename, cfilename_index)
jpayne@69	397
jpayne@69	398 if self.index == NULL:
jpayne@69	399 raise IOError("could not open index for `%s`" % filename)
jpayne@69	400
jpayne@69	401 if not self.is_stream:
jpayne@69	402 self.start_offset = self.tell()
jpayne@69	403
jpayne@69	404 def _dup(self):
jpayne@69	405 '''return a copy of this tabix file.
jpayne@69	406
jpayne@69	407 The file is being re-opened.
jpayne@69	408 '''
jpayne@69	409 return TabixFile(self.filename,
jpayne@69	410 mode="r",
jpayne@69	411 threads=self.threads,
jpayne@69	412 parser=self.parser,
jpayne@69	413 index=self.filename_index,
jpayne@69	414 encoding=self.encoding)
jpayne@69	415
jpayne@69	416 def fetch(self,
jpayne@69	417 reference=None,
jpayne@69	418 start=None,
jpayne@69	419 end=None,
jpayne@69	420 region=None,
jpayne@69	421 parser=None,
jpayne@69	422 multiple_iterators=False):
jpayne@69	423 '''fetch one or more rows in a :term:`region` using 0-based
jpayne@69	424 indexing. The region is specified by :term:`reference`,
jpayne@69	425 start and end. Alternatively, a samtools :term:`region`
jpayne@69	426 string can be supplied.
jpayne@69	427
jpayne@69	428 Without reference or region all entries will be fetched.
jpayne@69	429
jpayne@69	430 If only reference is set, all reads matching on reference
jpayne@69	431 will be fetched.
jpayne@69	432
jpayne@69	433 If parser is None, the default parser will be used for
jpayne@69	434 parsing.
jpayne@69	435
jpayne@69	436 Set multiple_iterators to true if you will be using multiple
jpayne@69	437 iterators on the same file at the same time. The iterator
jpayne@69	438 returned will receive its own copy of a filehandle to the file
jpayne@69	439 effectively re-opening the file. Re-opening a file creates
jpayne@69	440 some overhead, so beware.
jpayne@69	441
jpayne@69	442 '''
jpayne@69	443 if not self.is_open():
jpayne@69	444 raise ValueError("I/O operation on closed file")
jpayne@69	445
jpayne@69	446 # convert coordinates to region string, which is one-based
jpayne@69	447 if reference:
jpayne@69	448 if end is not None:
jpayne@69	449 if end < 0:
jpayne@69	450 raise ValueError("end out of range (%i)" % end)
jpayne@69	451 if start is None:
jpayne@69	452 start = 0
jpayne@69	453
jpayne@69	454 if start < 0:
jpayne@69	455 raise ValueError("start out of range (%i)" % end)
jpayne@69	456 elif start > end:
jpayne@69	457 raise ValueError(
jpayne@69	458 'start (%i) >= end (%i)' % (start, end))
jpayne@69	459 elif start == end:
jpayne@69	460 return EmptyIterator()
jpayne@69	461 else:
jpayne@69	462 region = '%s:%i-%i' % (reference, start + 1, end)
jpayne@69	463 elif start is not None:
jpayne@69	464 if start < 0:
jpayne@69	465 raise ValueError("start out of range (%i)" % end)
jpayne@69	466 region = '%s:%i' % (reference, start + 1)
jpayne@69	467 else:
jpayne@69	468 region = reference
jpayne@69	469
jpayne@69	470 # get iterator
jpayne@69	471 cdef hts_itr_t * itr
jpayne@69	472 cdef char *cstr
jpayne@69	473 cdef TabixFile fileobj
jpayne@69	474
jpayne@69	475 # reopen the same file if necessary
jpayne@69	476 if multiple_iterators:
jpayne@69	477 fileobj = self._dup()
jpayne@69	478 else:
jpayne@69	479 fileobj = self
jpayne@69	480
jpayne@69	481 if region is None:
jpayne@69	482 # without region or reference - iterate from start
jpayne@69	483 with nogil:
jpayne@69	484 itr = tbx_itr_queryi(fileobj.index,
jpayne@69	485 HTS_IDX_START,
jpayne@69	486 0,
jpayne@69	487 0)
jpayne@69	488 else:
jpayne@69	489 s = force_bytes(region, encoding=fileobj.encoding)
jpayne@69	490 cstr = s
jpayne@69	491 with nogil:
jpayne@69	492 itr = tbx_itr_querys(fileobj.index, cstr)
jpayne@69	493
jpayne@69	494 if itr == NULL:
jpayne@69	495 if region is None:
jpayne@69	496 if len(self.contigs) > 0:
jpayne@69	497 # when accessing a tabix file created prior tabix 1.0
jpayne@69	498 # the full-file iterator is empty.
jpayne@69	499 raise ValueError(
jpayne@69	500 "could not create iterator, possible "
jpayne@69	501 "tabix version mismatch")
jpayne@69	502 else:
jpayne@69	503 # possible reason is that the file is empty -
jpayne@69	504 # return an empty iterator
jpayne@69	505 return EmptyIterator()
jpayne@69	506 else:
jpayne@69	507 raise ValueError(
jpayne@69	508 "could not create iterator for region '%s'" %
jpayne@69	509 region)
jpayne@69	510
jpayne@69	511 # use default parser if no parser is specified
jpayne@69	512 if parser is None:
jpayne@69	513 parser = fileobj.parser
jpayne@69	514
jpayne@69	515 cdef TabixIterator a
jpayne@69	516 if parser is None:
jpayne@69	517 a = TabixIterator(encoding=fileobj.encoding)
jpayne@69	518 else:
jpayne@69	519 parser.set_encoding(fileobj.encoding)
jpayne@69	520 a = TabixIteratorParsed(parser)
jpayne@69	521
jpayne@69	522 a.tabixfile = fileobj
jpayne@69	523 a.iterator = itr
jpayne@69	524
jpayne@69	525 return a
jpayne@69	526
jpayne@69	527 ###############################################################
jpayne@69	528 ###############################################################
jpayne@69	529 ###############################################################
jpayne@69	530 ## properties
jpayne@69	531 ###############################################################
jpayne@69	532 property header:
jpayne@69	533 '''the file header.
jpayne@69	534
jpayne@69	535 The file header consists of the lines at the beginning of a
jpayne@69	536 file that are prefixed by the comment character ``#``.
jpayne@69	537
jpayne@69	538 .. note::
jpayne@69	539 The header is returned as an iterator presenting lines
jpayne@69	540 without the newline character.
jpayne@69	541 '''
jpayne@69	542
jpayne@69	543 def __get__(self):
jpayne@69	544
jpayne@69	545 cdef char *cfilename = self.filename
jpayne@69	546 cdef char *cfilename_index = self.filename_index
jpayne@69	547
jpayne@69	548 cdef kstring_t buffer
jpayne@69	549 buffer.l = buffer.m = 0
jpayne@69	550 buffer.s = NULL
jpayne@69	551
jpayne@69	552 cdef htsFile * fp = NULL
jpayne@69	553 cdef int KS_SEP_LINE = 2
jpayne@69	554 cdef tbx_t * tbx = NULL
jpayne@69	555 lines = []
jpayne@69	556 with nogil:
jpayne@69	557 fp = hts_open(cfilename, 'r')
jpayne@69	558
jpayne@69	559 if fp == NULL:
jpayne@69	560 raise OSError("could not open {} for reading header".format(self.filename))
jpayne@69	561
jpayne@69	562 with nogil:
jpayne@69	563 tbx = tbx_index_load2(cfilename, cfilename_index)
jpayne@69	564
jpayne@69	565 if tbx == NULL:
jpayne@69	566 raise OSError("could not load .tbi/.csi index of {}".format(self.filename))
jpayne@69	567
jpayne@69	568 while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
jpayne@69	569 if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
jpayne@69	570 break
jpayne@69	571 lines.append(force_str(buffer.s, self.encoding))
jpayne@69	572
jpayne@69	573 with nogil:
jpayne@69	574 hts_close(fp)
jpayne@69	575 free(buffer.s)
jpayne@69	576
jpayne@69	577 return lines
jpayne@69	578
jpayne@69	579 property contigs:
jpayne@69	580 '''list of chromosome names'''
jpayne@69	581 def __get__(self):
jpayne@69	582 cdef const char ** sequences
jpayne@69	583 cdef int nsequences
jpayne@69	584
jpayne@69	585 with nogil:
jpayne@69	586 sequences = tbx_seqnames(self.index, &nsequences)
jpayne@69	587 cdef int x
jpayne@69	588 result = []
jpayne@69	589 for x from 0 <= x < nsequences:
jpayne@69	590 result.append(force_str(sequences[x]))
jpayne@69	591
jpayne@69	592 # htslib instructions:
jpayne@69	593 # only free container, not the sequences themselves
jpayne@69	594 free(sequences)
jpayne@69	595
jpayne@69	596 return result
jpayne@69	597
jpayne@69	598 def close(self):
jpayne@69	599 '''
jpayne@69	600 closes the :class:`pysam.TabixFile`.'''
jpayne@69	601 if self.htsfile != NULL:
jpayne@69	602 hts_close(self.htsfile)
jpayne@69	603 self.htsfile = NULL
jpayne@69	604 if self.index != NULL:
jpayne@69	605 tbx_destroy(self.index)
jpayne@69	606 self.index = NULL
jpayne@69	607
jpayne@69	608 def __dealloc__( self ):
jpayne@69	609 # remember: dealloc cannot call other python methods
jpayne@69	610 # note: no doc string
jpayne@69	611 # note: __del__ is not called.
jpayne@69	612 if self.htsfile != NULL:
jpayne@69	613 hts_close(self.htsfile)
jpayne@69	614 self.htsfile = NULL
jpayne@69	615 if self.index != NULL:
jpayne@69	616 tbx_destroy(self.index)
jpayne@69	617
jpayne@69	618
jpayne@69	619 cdef class TabixIterator:
jpayne@69	620 """iterates over rows in tabixfile in region
jpayne@69	621 given by tid, start and end.
jpayne@69	622 """
jpayne@69	623
jpayne@69	624 def __init__(self, encoding="ascii"):
jpayne@69	625 self.encoding = encoding
jpayne@69	626
jpayne@69	627 def __iter__(self):
jpayne@69	628 self.buffer.s = NULL
jpayne@69	629 self.buffer.l = 0
jpayne@69	630 self.buffer.m = 0
jpayne@69	631
jpayne@69	632 return self
jpayne@69	633
jpayne@69	634 cdef int __cnext__(self):
jpayne@69	635 '''iterate to next element.
jpayne@69	636
jpayne@69	637 Return -5 if file has been closed when this function
jpayne@69	638 was called.
jpayne@69	639 '''
jpayne@69	640 if self.tabixfile.htsfile == NULL:
jpayne@69	641 return -5
jpayne@69	642
jpayne@69	643 cdef int retval
jpayne@69	644
jpayne@69	645 while 1:
jpayne@69	646 with nogil:
jpayne@69	647 retval = tbx_itr_next(
jpayne@69	648 self.tabixfile.htsfile,
jpayne@69	649 self.tabixfile.index,
jpayne@69	650 self.iterator,
jpayne@69	651 &self.buffer)
jpayne@69	652
jpayne@69	653 if retval < 0:
jpayne@69	654 break
jpayne@69	655
jpayne@69	656 if self.buffer.s[0] != b'#':
jpayne@69	657 break
jpayne@69	658
jpayne@69	659 return retval
jpayne@69	660
jpayne@69	661 def __next__(self):
jpayne@69	662 """python version of next().
jpayne@69	663
jpayne@69	664 pyrex uses this non-standard name instead of next()
jpayne@69	665 """
jpayne@69	666
jpayne@69	667 cdef int retval = self.__cnext__()
jpayne@69	668 if retval == -5:
jpayne@69	669 raise IOError("iteration on closed file")
jpayne@69	670 elif retval < 0:
jpayne@69	671 raise StopIteration
jpayne@69	672
jpayne@69	673 return charptr_to_str(self.buffer.s, self.encoding)
jpayne@69	674
jpayne@69	675 def __dealloc__(self):
jpayne@69	676 if <void*>self.iterator != NULL:
jpayne@69	677 tbx_itr_destroy(self.iterator)
jpayne@69	678 if self.buffer.s != NULL:
jpayne@69	679 free(self.buffer.s)
jpayne@69	680
jpayne@69	681
jpayne@69	682 class EmptyIterator:
jpayne@69	683 '''empty iterator'''
jpayne@69	684
jpayne@69	685 def __iter__(self):
jpayne@69	686 return self
jpayne@69	687
jpayne@69	688 def __next__(self):
jpayne@69	689 raise StopIteration()
jpayne@69	690
jpayne@69	691
jpayne@69	692 cdef class TabixIteratorParsed(TabixIterator):
jpayne@69	693 """iterates over mapped reads in a region.
jpayne@69	694
jpayne@69	695 The parser determines the encoding.
jpayne@69	696
jpayne@69	697 Returns parsed data.
jpayne@69	698 """
jpayne@69	699
jpayne@69	700 def __init__(self, Parser parser):
jpayne@69	701 super().__init__()
jpayne@69	702 self.parser = parser
jpayne@69	703
jpayne@69	704 def __next__(self):
jpayne@69	705 """python version of next().
jpayne@69	706
jpayne@69	707 pyrex uses this non-standard name instead of next()
jpayne@69	708 """
jpayne@69	709
jpayne@69	710 cdef int retval = self.__cnext__()
jpayne@69	711 if retval == -5:
jpayne@69	712 raise IOError("iteration on closed file")
jpayne@69	713 elif retval < 0:
jpayne@69	714 raise StopIteration
jpayne@69	715
jpayne@69	716 return self.parser.parse(self.buffer.s,
jpayne@69	717 self.buffer.l)
jpayne@69	718
jpayne@69	719
jpayne@69	720 cdef class GZIterator:
jpayne@69	721 def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
jpayne@69	722 '''iterate line-by-line through gzip (or bgzip)
jpayne@69	723 compressed file.
jpayne@69	724 '''
jpayne@69	725 if not os.path.exists(filename):
jpayne@69	726 raise IOError("No such file or directory: %s" % filename)
jpayne@69	727
jpayne@69	728 filename = encode_filename(filename)
jpayne@69	729 cdef char *cfilename = filename
jpayne@69	730 with nogil:
jpayne@69	731 self.gzipfile = bgzf_open(cfilename, "r")
jpayne@69	732 self._filename = filename
jpayne@69	733 self.kstream = ks_init(self.gzipfile)
jpayne@69	734 self.encoding = encoding
jpayne@69	735
jpayne@69	736 self.buffer.l = 0
jpayne@69	737 self.buffer.m = 0
jpayne@69	738 self.buffer.s = <char*>malloc(buffer_size)
jpayne@69	739
jpayne@69	740 def __dealloc__(self):
jpayne@69	741 '''close file.'''
jpayne@69	742 if self.gzipfile != NULL:
jpayne@69	743 bgzf_close(self.gzipfile)
jpayne@69	744 self.gzipfile = NULL
jpayne@69	745 if self.buffer.s != NULL:
jpayne@69	746 free(self.buffer.s)
jpayne@69	747 if self.kstream != NULL:
jpayne@69	748 ks_destroy(self.kstream)
jpayne@69	749
jpayne@69	750 def __iter__(self):
jpayne@69	751 return self
jpayne@69	752
jpayne@69	753 cdef int __cnext__(self):
jpayne@69	754 cdef int dret = 0
jpayne@69	755 cdef int retval = 0
jpayne@69	756 while 1:
jpayne@69	757 with nogil:
jpayne@69	758 retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
jpayne@69	759
jpayne@69	760 if retval < 0:
jpayne@69	761 break
jpayne@69	762
jpayne@69	763 return dret
jpayne@69	764 return -1
jpayne@69	765
jpayne@69	766 def __next__(self):
jpayne@69	767 """python version of next().
jpayne@69	768 """
jpayne@69	769 cdef int retval = self.__cnext__()
jpayne@69	770 if retval < 0:
jpayne@69	771 raise StopIteration
jpayne@69	772 return force_str(self.buffer.s, self.encoding)
jpayne@69	773
jpayne@69	774
jpayne@69	775 cdef class GZIteratorHead(GZIterator):
jpayne@69	776 '''iterate line-by-line through gzip (or bgzip)
jpayne@69	777 compressed file returning comments at top of file.
jpayne@69	778 '''
jpayne@69	779
jpayne@69	780 def __next__(self):
jpayne@69	781 """python version of next().
jpayne@69	782 """
jpayne@69	783 cdef int retval = self.__cnext__()
jpayne@69	784 if retval < 0:
jpayne@69	785 raise StopIteration
jpayne@69	786 if self.buffer.s[0] == b'#':
jpayne@69	787 return self.buffer.s
jpayne@69	788 else:
jpayne@69	789 raise StopIteration
jpayne@69	790
jpayne@69	791
jpayne@69	792 cdef class GZIteratorParsed(GZIterator):
jpayne@69	793 '''iterate line-by-line through gzip (or bgzip)
jpayne@69	794 compressed file returning comments at top of file.
jpayne@69	795 '''
jpayne@69	796
jpayne@69	797 def __init__(self, parser):
jpayne@69	798 self.parser = parser
jpayne@69	799
jpayne@69	800 def __next__(self):
jpayne@69	801 """python version of next().
jpayne@69	802 """
jpayne@69	803 cdef int retval = self.__cnext__()
jpayne@69	804 if retval < 0:
jpayne@69	805 raise StopIteration
jpayne@69	806
jpayne@69	807 return self.parser.parse(self.buffer.s,
jpayne@69	808 self.buffer.l)
jpayne@69	809
jpayne@69	810
jpayne@69	811 def tabix_compress(filename_in,
jpayne@69	812 filename_out,
jpayne@69	813 force=False):
jpayne@69	814 '''compress filename_in writing the output to filename_out.
jpayne@69	815
jpayne@69	816 Raise an IOError if filename_out already exists, unless force
jpayne@69	817 is set.
jpayne@69	818 '''
jpayne@69	819
jpayne@69	820 if not force and os.path.exists(filename_out):
jpayne@69	821 raise IOError(
jpayne@69	822 "Filename '%s' already exists, use force to "
jpayne@69	823 "overwrite" % filename_out)
jpayne@69	824
jpayne@69	825 cdef int WINDOW_SIZE
jpayne@69	826 cdef int c, r
jpayne@69	827 cdef void * buffer
jpayne@69	828 cdef BGZF * fp
jpayne@69	829 cdef int fd_src
jpayne@69	830 cdef bint is_empty = True
jpayne@69	831 cdef int O_RDONLY
jpayne@69	832 O_RDONLY = os.O_RDONLY
jpayne@69	833
jpayne@69	834 WINDOW_SIZE = 64 * 1024
jpayne@69	835
jpayne@69	836 fn = encode_filename(filename_out)
jpayne@69	837 cdef char *cfn = fn
jpayne@69	838 with nogil:
jpayne@69	839 fp = bgzf_open(cfn, "w")
jpayne@69	840 if fp == NULL:
jpayne@69	841 raise IOError("could not open '%s' for writing" % filename_out)
jpayne@69	842
jpayne@69	843 fn = encode_filename(filename_in)
jpayne@69	844 fd_src = open(fn, O_RDONLY)
jpayne@69	845 if fd_src == 0:
jpayne@69	846 raise IOError("could not open '%s' for reading" % filename_in)
jpayne@69	847
jpayne@69	848 buffer = malloc(WINDOW_SIZE)
jpayne@69	849 c = 1
jpayne@69	850
jpayne@69	851 while c > 0:
jpayne@69	852 with nogil:
jpayne@69	853 c = read(fd_src, buffer, WINDOW_SIZE)
jpayne@69	854 if c > 0:
jpayne@69	855 is_empty = False
jpayne@69	856 r = bgzf_write(fp, buffer, c)
jpayne@69	857 if r < 0:
jpayne@69	858 free(buffer)
jpayne@69	859 raise IOError("writing failed")
jpayne@69	860
jpayne@69	861 free(buffer)
jpayne@69	862 r = bgzf_close(fp)
jpayne@69	863 if r < 0:
jpayne@69	864 raise IOError("error %i when writing to file %s" % (r, filename_out))
jpayne@69	865
jpayne@69	866 r = close(fd_src)
jpayne@69	867 # an empty file will return with -1, thus ignore this.
jpayne@69	868 if r < 0:
jpayne@69	869 if not (r == -1 and is_empty):
jpayne@69	870 raise IOError("error %i when closing file %s" % (r, filename_in))
jpayne@69	871
jpayne@69	872
jpayne@69	873 def tabix_index(filename,
jpayne@69	874 force=False,
jpayne@69	875 seq_col=None,
jpayne@69	876 start_col=None,
jpayne@69	877 end_col=None,
jpayne@69	878 preset=None,
jpayne@69	879 meta_char="#",
jpayne@69	880 int line_skip=0,
jpayne@69	881 zerobased=False,
jpayne@69	882 int min_shift=-1,
jpayne@69	883 index=None,
jpayne@69	884 keep_original=False,
jpayne@69	885 csi=False,
jpayne@69	886 ):
jpayne@69	887 '''index tab-separated filename using tabix.
jpayne@69	888
jpayne@69	889 An existing index will not be overwritten unless force is set.
jpayne@69	890
jpayne@69	891 The index will be built from coordinates in columns seq_col,
jpayne@69	892 start_col and end_col.
jpayne@69	893
jpayne@69	894 The contents of filename have to be sorted by contig and
jpayne@69	895 position - the method does not check if the file is sorted.
jpayne@69	896
jpayne@69	897 Column indices are 0-based. Note that this is different from the
jpayne@69	898 tabix command line utility where column indices start at 1.
jpayne@69	899
jpayne@69	900 Coordinates in the file are assumed to be 1-based unless
jpayne@69	901 zerobased is set.
jpayne@69	902
jpayne@69	903 If preset is provided, the column coordinates are taken from a
jpayne@69	904 preset. Valid values for preset are "gff", "bed", "sam", "vcf",
jpayne@69	905 psltbl", "pileup".
jpayne@69	906
jpayne@69	907 Lines beginning with meta_char and the first line_skip lines
jpayne@69	908 will be skipped.
jpayne@69	909
jpayne@69	910 If filename is not detected as a gzip file it will be automatically
jpayne@69	911 compressed. The original file will be removed and only the compressed
jpayne@69	912 file will be retained.
jpayne@69	913
jpayne@69	914 By default or when min_shift is 0, creates a TBI index. If min_shift
jpayne@69	915 is greater than zero and/or csi is True, creates a CSI index with a
jpayne@69	916 minimal interval size of 1<<min_shift (1<<14 if only csi is set).
jpayne@69	917
jpayne@69	918 index controls the filename which should be used for creating the index.
jpayne@69	919 If not set, the default is to append ``.tbi`` to filename.
jpayne@69	920
jpayne@69	921 When automatically compressing files, if keep_original is set the
jpayne@69	922 uncompressed file will not be deleted.
jpayne@69	923
jpayne@69	924 returns the filename of the compressed data
jpayne@69	925
jpayne@69	926 '''
jpayne@69	927
jpayne@69	928 if preset is None and \
jpayne@69	929 (seq_col is None or start_col is None or end_col is None):
jpayne@69	930 raise ValueError(
jpayne@69	931 "neither preset nor seq_col,start_col and end_col given")
jpayne@69	932
jpayne@69	933 fn = encode_filename(filename)
jpayne@69	934 cdef char *cfn = fn
jpayne@69	935
jpayne@69	936 cdef htsFile *fp = hts_open(cfn, "r")
jpayne@69	937 if fp == NULL:
jpayne@69	938 raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))
jpayne@69	939
jpayne@69	940 cdef htsFormat fmt = hts_get_format(fp)[0]
jpayne@69	941 hts_close(fp)
jpayne@69	942
jpayne@69	943 if fmt.compression == no_compression:
jpayne@69	944 tabix_compress(filename, filename + ".gz", force=force)
jpayne@69	945 if not keep_original:
jpayne@69	946 os.unlink(filename)
jpayne@69	947 filename += ".gz"
jpayne@69	948 fn = encode_filename(filename)
jpayne@69	949 cfn = fn
jpayne@69	950
jpayne@69	951 # columns (1-based):
jpayne@69	952 # preset-code, contig, start, end, metachar for
jpayne@69	953 # comments, lines to ignore at beginning
jpayne@69	954 # 0 is a missing column
jpayne@69	955 preset2conf = {
jpayne@69	956 'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
jpayne@69	957 'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
jpayne@69	958 'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
jpayne@69	959 'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
jpayne@69	960 'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
jpayne@69	961 }
jpayne@69	962
jpayne@69	963 conf_data = None
jpayne@69	964 if preset == "bcf" or fmt.format == bcf:
jpayne@69	965 csi = True
jpayne@69	966 elif preset:
jpayne@69	967 try:
jpayne@69	968 conf_data = preset2conf[preset]
jpayne@69	969 except KeyError:
jpayne@69	970 raise KeyError(
jpayne@69	971 "unknown preset '%s', valid presets are '%s'" %
jpayne@69	972 (preset, ",".join(preset2conf.keys())))
jpayne@69	973 else:
jpayne@69	974 if end_col is None:
jpayne@69	975 end_col = -1
jpayne@69	976
jpayne@69	977 preset = 0
jpayne@69	978 # tabix internally works with 0-based coordinates and
jpayne@69	979 # open/closed intervals. When using a preset, conversion is
jpayne@69	980 # automatically taken care of. Otherwise, the coordinates are
jpayne@69	981 # assumed to be 1-based closed intervals and -1 is subtracted
jpayne@69	982 # from the start coordinate. To avoid doing this, set the
jpayne@69	983 # TI_FLAG_UCSC=0x10000 flag:
jpayne@69	984 if zerobased:
jpayne@69	985 preset = preset \| TBX_UCSC
jpayne@69	986
jpayne@69	987 conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)
jpayne@69	988
jpayne@69	989 cdef tbx_conf_t conf
jpayne@69	990 if conf_data:
jpayne@69	991 conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
jpayne@69	992
jpayne@69	993 if csi or min_shift > 0:
jpayne@69	994 suffix = ".csi"
jpayne@69	995 if min_shift <= 0: min_shift = 14
jpayne@69	996 else:
jpayne@69	997 suffix = ".tbi"
jpayne@69	998 min_shift = 0
jpayne@69	999
jpayne@69	1000 index = index or filename + suffix
jpayne@69	1001 fn_index = encode_filename(index)
jpayne@69	1002
jpayne@69	1003 if not force and os.path.exists(index):
jpayne@69	1004 raise IOError(
jpayne@69	1005 "filename '%s' already exists, use force to overwrite" % index)
jpayne@69	1006
jpayne@69	1007 cdef char *fnidx = fn_index
jpayne@69	1008 cdef int retval = 0
jpayne@69	1009
jpayne@69	1010 if csi and fmt.format == bcf:
jpayne@69	1011 with nogil:
jpayne@69	1012 retval = bcf_index_build2(cfn, fnidx, min_shift)
jpayne@69	1013 else:
jpayne@69	1014 with nogil:
jpayne@69	1015 retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)
jpayne@69	1016
jpayne@69	1017 if retval != 0:
jpayne@69	1018 raise OSError("building of index for {} failed".format(filename))
jpayne@69	1019
jpayne@69	1020 return filename
jpayne@69	1021
jpayne@69	1022 # #########################################################
jpayne@69	1023 # cdef class tabix_file_iterator_old:
jpayne@69	1024 # '''iterate over ``infile``.
jpayne@69	1025
jpayne@69	1026 # This iterator is not safe. If the :meth:`__next__()` method is called
jpayne@69	1027 # after ``infile`` is closed, the result is undefined (see ``fclose()``).
jpayne@69	1028
jpayne@69	1029 # The iterator might either raise a StopIteration or segfault.
jpayne@69	1030 # '''
jpayne@69	1031
jpayne@69	1032
jpayne@69	1033 # def __cinit__(self,
jpayne@69	1034 # infile,
jpayne@69	1035 # Parser parser,
jpayne@69	1036 # int buffer_size = 65536 ):
jpayne@69	1037
jpayne@69	1038 # cdef int fd = PyObject_AsFileDescriptor( infile )
jpayne@69	1039 # if fd == -1: raise ValueError( "I/O operation on closed file." )
jpayne@69	1040 # self.infile = fdopen( fd, 'r')
jpayne@69	1041
jpayne@69	1042 # if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
jpayne@69	1043
jpayne@69	1044 # self.buffer = <char*>malloc( buffer_size )
jpayne@69	1045 # self.size = buffer_size
jpayne@69	1046 # self.parser = parser
jpayne@69	1047
jpayne@69	1048 # def __iter__(self):
jpayne@69	1049 # return self
jpayne@69	1050
jpayne@69	1051 # cdef __cnext__(self):
jpayne@69	1052
jpayne@69	1053 # cdef char * b
jpayne@69	1054 # cdef size_t nbytes
jpayne@69	1055 # b = self.buffer
jpayne@69	1056
jpayne@69	1057 # while not feof( self.infile ):
jpayne@69	1058 # nbytes = getline( &b, &self.size, self.infile)
jpayne@69	1059
jpayne@69	1060 # # stop at first error or eof
jpayne@69	1061 # if (nbytes == -1): break
jpayne@69	1062 # # skip comments
jpayne@69	1063 # if (b[0] == '#'): continue
jpayne@69	1064
jpayne@69	1065 # # skip empty lines
jpayne@69	1066 # if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
jpayne@69	1067
jpayne@69	1068 # # make sure that entry is complete
jpayne@69	1069 # if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
jpayne@69	1070 # result = b
jpayne@69	1071 # raise ValueError( "incomplete line at %s" % result )
jpayne@69	1072
jpayne@69	1073 # # make sure that this goes fully through C
jpayne@69	1074 # # otherwise buffer is copied to/from a
jpayne@69	1075 # # Python object causing segfaults as
jpayne@69	1076 # # the wrong memory is freed
jpayne@69	1077 # return self.parser.parse( b, nbytes )
jpayne@69	1078
jpayne@69	1079 # raise StopIteration
jpayne@69	1080
jpayne@69	1081 # def __dealloc__(self):
jpayne@69	1082 # free(self.buffer)
jpayne@69	1083
jpayne@69	1084 # def __next__(self):
jpayne@69	1085 # return self.__cnext__()
jpayne@69	1086
jpayne@69	1087 #########################################################
jpayne@69	1088 #########################################################
jpayne@69	1089 #########################################################
jpayne@69	1090 ## Iterators for parsing through unindexed files.
jpayne@69	1091 #########################################################
jpayne@69	1092 # cdef buildGzipError(void *gzfp):
jpayne@69	1093 # cdef int errnum = 0
jpayne@69	1094 # cdef char *s = gzerror(gzfp, &errnum)
jpayne@69	1095 # return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
jpayne@69	1096
jpayne@69	1097
jpayne@69	1098 cdef class tabix_file_iterator:
jpayne@69	1099 '''iterate over a compressed or uncompressed ``infile``.
jpayne@69	1100 '''
jpayne@69	1101
jpayne@69	1102 def __cinit__(self,
jpayne@69	1103 infile,
jpayne@69	1104 Parser parser,
jpayne@69	1105 int buffer_size=65536):
jpayne@69	1106
jpayne@69	1107 if infile.closed:
jpayne@69	1108 raise ValueError("I/O operation on closed file.")
jpayne@69	1109
jpayne@69	1110 self.infile = infile
jpayne@69	1111
jpayne@69	1112 cdef int fd = PyObject_AsFileDescriptor(infile)
jpayne@69	1113 if fd == -1:
jpayne@69	1114 raise ValueError("I/O operation on closed file.")
jpayne@69	1115
jpayne@69	1116 self.duplicated_fd = dup(fd)
jpayne@69	1117
jpayne@69	1118 # From the manual:
jpayne@69	1119 # gzopen can be used to read a file which is not in gzip format;
jpayne@69	1120 # in this case gzread will directly read from the file without decompression.
jpayne@69	1121 # When reading, this will be detected automatically by looking
jpayne@69	1122 # for the magic two-byte gzip header.
jpayne@69	1123 self.fh = bgzf_dopen(self.duplicated_fd, 'r')
jpayne@69	1124
jpayne@69	1125 if self.fh == NULL:
jpayne@69	1126 raise IOError('%s' % strerror(errno))
jpayne@69	1127
jpayne@69	1128 self.kstream = ks_init(self.fh)
jpayne@69	1129
jpayne@69	1130 self.buffer.s = <char*>malloc(buffer_size)
jpayne@69	1131 #if self.buffer == NULL:
jpayne@69	1132 # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
jpayne@69	1133 #self.size = buffer_size
jpayne@69	1134 self.parser = parser
jpayne@69	1135
jpayne@69	1136 def __iter__(self):
jpayne@69	1137 return self
jpayne@69	1138
jpayne@69	1139 cdef __cnext__(self):
jpayne@69	1140
jpayne@69	1141 cdef char * b
jpayne@69	1142 cdef int dret = 0
jpayne@69	1143 cdef int retval = 0
jpayne@69	1144 while 1:
jpayne@69	1145 with nogil:
jpayne@69	1146 retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
jpayne@69	1147
jpayne@69	1148 if retval < 0:
jpayne@69	1149 break
jpayne@69	1150 #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
jpayne@69	1151
jpayne@69	1152 b = self.buffer.s
jpayne@69	1153
jpayne@69	1154 # skip comments
jpayne@69	1155 if (b[0] == b'#'):
jpayne@69	1156 continue
jpayne@69	1157
jpayne@69	1158 # skip empty lines
jpayne@69	1159 if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
jpayne@69	1160 continue
jpayne@69	1161
jpayne@69	1162 # gzgets terminates at \n, no need to test
jpayne@69	1163
jpayne@69	1164 # parser creates a copy
jpayne@69	1165 return self.parser.parse(b, self.buffer.l)
jpayne@69	1166
jpayne@69	1167 raise StopIteration
jpayne@69	1168
jpayne@69	1169 def __dealloc__(self):
jpayne@69	1170 free(self.buffer.s)
jpayne@69	1171 ks_destroy(self.kstream)
jpayne@69	1172 bgzf_close(self.fh)
jpayne@69	1173
jpayne@69	1174 def __next__(self):
jpayne@69	1175 return self.__cnext__()
jpayne@69	1176
jpayne@69	1177
jpayne@69	1178 class tabix_generic_iterator:
jpayne@69	1179 '''iterate over ``infile``.
jpayne@69	1180
jpayne@69	1181 Permits the use of file-like objects for example from the gzip module.
jpayne@69	1182 '''
jpayne@69	1183 def __init__(self, infile, parser):
jpayne@69	1184
jpayne@69	1185 self.infile = infile
jpayne@69	1186 if self.infile.closed:
jpayne@69	1187 raise ValueError("I/O operation on closed file.")
jpayne@69	1188 self.parser = parser
jpayne@69	1189
jpayne@69	1190 def __iter__(self):
jpayne@69	1191 return self
jpayne@69	1192
jpayne@69	1193 # cython version - required for python 3
jpayne@69	1194 def __next__(self):
jpayne@69	1195
jpayne@69	1196 cdef char * b
jpayne@69	1197 cdef char * cpy
jpayne@69	1198 cdef size_t nbytes
jpayne@69	1199
jpayne@69	1200 encoding = self.parser.get_encoding()
jpayne@69	1201
jpayne@69	1202 # note that GzipFile.close() does not close the file
jpayne@69	1203 # reading is still possible.
jpayne@69	1204 if self.infile.closed:
jpayne@69	1205 raise ValueError("I/O operation on closed file.")
jpayne@69	1206
jpayne@69	1207 while 1:
jpayne@69	1208
jpayne@69	1209 line = self.infile.readline()
jpayne@69	1210 if not line:
jpayne@69	1211 break
jpayne@69	1212
jpayne@69	1213 s = force_bytes(line, encoding)
jpayne@69	1214 b = s
jpayne@69	1215 nbytes = len(line)
jpayne@69	1216 assert b[nbytes] == b'\0'
jpayne@69	1217
jpayne@69	1218 # skip comments
jpayne@69	1219 if b[0] == b'#':
jpayne@69	1220 continue
jpayne@69	1221
jpayne@69	1222 # skip empty lines
jpayne@69	1223 if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
jpayne@69	1224 continue
jpayne@69	1225
jpayne@69	1226 # make sure that entry is complete
jpayne@69	1227 if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
jpayne@69	1228 raise ValueError("incomplete line at %s" % line)
jpayne@69	1229
jpayne@69	1230 bytes_cpy = <bytes> b
jpayne@69	1231 cpy = <char *> bytes_cpy
jpayne@69	1232
jpayne@69	1233 return self.parser(cpy, nbytes)
jpayne@69	1234
jpayne@69	1235 raise StopIteration
jpayne@69	1236
jpayne@69	1237
jpayne@69	1238 def tabix_iterator(infile, parser):
jpayne@69	1239 """return an iterator over all entries in a file.
jpayne@69	1240
jpayne@69	1241 Results are returned parsed as specified by the parser. If
jpayne@69	1242 parser is None, the results are returned as an unparsed string.
jpayne@69	1243 Otherwise, parser is assumed to be a functor that will return
jpayne@69	1244 parsed data (see for example :class:`~pysam.asTuple` and
jpayne@69	1245 :class:`~pysam.asGTF`).
jpayne@69	1246
jpayne@69	1247 """
jpayne@69	1248 return tabix_generic_iterator(infile, parser)
jpayne@69	1249
jpayne@69	1250
jpayne@69	1251 cdef class Tabixfile(TabixFile):
jpayne@69	1252 """Tabixfile is deprecated: use TabixFile instead"""
jpayne@69	1253 pass
jpayne@69	1254
jpayne@69	1255
jpayne@69	1256 __all__ = [
jpayne@69	1257 "tabix_index",
jpayne@69	1258 "tabix_compress",
jpayne@69	1259 "TabixFile",
jpayne@69	1260 "Tabixfile",
jpayne@69	1261 "asTuple",
jpayne@69	1262 "asGTF",
jpayne@69	1263 "asGFF3",
jpayne@69	1264 "asVCF",
jpayne@69	1265 "asBed",
jpayne@69	1266 "GZIterator",
jpayne@69	1267 "GZIteratorHead",
jpayne@69	1268 "tabix_iterator",
jpayne@69	1269 "tabix_generic_iterator",
jpayne@69	1270 "tabix_file_iterator",
jpayne@69	1271 ]

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 69:33d812a61356