annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # cython: language_level=3
jpayne@68 2 # cython: embedsignature=True
jpayne@68 3 # cython: profile=True
jpayne@68 4 ###############################################################################
jpayne@68 5 ###############################################################################
jpayne@68 6 # Cython wrapper for access to tabix indexed files in bgzf format
jpayne@68 7 ###############################################################################
jpayne@68 8 # The principal classes and functions defined in this module are:
jpayne@68 9 #
jpayne@68 10 # class TabixFile class wrapping tabix indexed files in bgzf format
jpayne@68 11 #
jpayne@68 12 # class asTuple Parser class for tuples
jpayne@68 13 # class asGTF Parser class for GTF formatted rows
jpayne@68 14 # class asGFF3 Parser class for GFF3 formatted rows
jpayne@68 15 # class asBed Parser class for Bed formatted rows
jpayne@68 16 # class asVCF Parser class for VCF formatted rows
jpayne@68 17 #
jpayne@68 18 # class tabix_generic_iterator Streamed iterator of bgzf formatted files
jpayne@68 19 #
jpayne@68 20 # Additionally this module defines several additional classes that are part
jpayne@68 21 # of the internal API. These are:
jpayne@68 22 #
jpayne@68 23 # class Parser base class for parsers of tab-separated rows
jpayne@68 24 # class tabix_file_iterator
jpayne@68 25 # class TabixIterator iterator class over rows in bgzf file
jpayne@68 26 # class EmptyIterator
jpayne@68 27 #
jpayne@68 28 # For backwards compatibility, the following classes are also defined:
jpayne@68 29 #
jpayne@68 30 # class Tabixfile equivalent to TabixFile
jpayne@68 31 #
jpayne@68 32 ###############################################################################
jpayne@68 33 #
jpayne@68 34 # The MIT License
jpayne@68 35 #
jpayne@68 36 # Copyright (c) 2015 Andreas Heger
jpayne@68 37 #
jpayne@68 38 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@68 39 # copy of this software and associated documentation files (the "Software"),
jpayne@68 40 # to deal in the Software without restriction, including without limitation
jpayne@68 41 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@68 42 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@68 43 # Software is furnished to do so, subject to the following conditions:
jpayne@68 44 #
jpayne@68 45 # The above copyright notice and this permission notice shall be included in
jpayne@68 46 # all copies or substantial portions of the Software.
jpayne@68 47 #
jpayne@68 48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@68 49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@68 50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@68 51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@68 52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@68 53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@68 54 # DEALINGS IN THE SOFTWARE.
jpayne@68 55 #
jpayne@68 56 ###############################################################################
jpayne@68 57 import os
jpayne@68 58 import sys
jpayne@68 59
jpayne@68 60 from libc.stdio cimport printf, fprintf, stderr
jpayne@68 61 from libc.string cimport strerror
jpayne@68 62 from libc.errno cimport errno
jpayne@68 63 from posix.unistd cimport dup
jpayne@68 64
jpayne@68 65 from cpython cimport PyErr_SetString, PyBytes_Check, \
jpayne@68 66 PyUnicode_Check, PyBytes_FromStringAndSize, \
jpayne@68 67 PyObject_AsFileDescriptor
jpayne@68 68
jpayne@68 69 cimport pysam.libctabixproxies as ctabixproxies
jpayne@68 70
jpayne@68 71 from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
jpayne@68 72 BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
jpayne@68 73 tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
jpayne@68 74 tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
jpayne@68 75 tbx_destroy, hisremote, region_list, hts_getline, \
jpayne@68 76 TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
jpayne@68 77 no_compression, bcf, bcf_index_build2
jpayne@68 78
jpayne@68 79 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
jpayne@68 80 from pysam.libcutils cimport encode_filename, from_string_and_size
jpayne@68 81
jpayne@68 82 cdef class Parser:
jpayne@68 83
jpayne@68 84 def __init__(self, encoding="ascii"):
jpayne@68 85 self.encoding = encoding
jpayne@68 86
jpayne@68 87 def set_encoding(self, encoding):
jpayne@68 88 self.encoding = encoding
jpayne@68 89
jpayne@68 90 def get_encoding(self):
jpayne@68 91 return self.encoding
jpayne@68 92
jpayne@68 93 cdef parse(self, char * buffer, int length):
jpayne@68 94 raise NotImplementedError(
jpayne@68 95 'parse method of %s not implemented' % str(self))
jpayne@68 96
jpayne@68 97 def __call__(self, char * buffer, int length):
jpayne@68 98 return self.parse(buffer, length)
jpayne@68 99
jpayne@68 100
jpayne@68 101 cdef class asTuple(Parser):
jpayne@68 102 '''converts a :term:`tabix row` into a python tuple.
jpayne@68 103
jpayne@68 104 A field in a row is accessed by numeric index.
jpayne@68 105 '''
jpayne@68 106 cdef parse(self, char * buffer, int len):
jpayne@68 107 cdef ctabixproxies.TupleProxy r
jpayne@68 108 r = ctabixproxies.TupleProxy(self.encoding)
jpayne@68 109 # need to copy - there were some
jpayne@68 110 # persistence issues with "present"
jpayne@68 111 r.copy(buffer, len)
jpayne@68 112 return r
jpayne@68 113
jpayne@68 114
jpayne@68 115 cdef class asGFF3(Parser):
jpayne@68 116 '''converts a :term:`tabix row` into a GFF record with the following
jpayne@68 117 fields:
jpayne@68 118
jpayne@68 119 +----------+----------+-------------------------------+
jpayne@68 120 |*Column* |*Name* |*Content* |
jpayne@68 121 +----------+----------+-------------------------------+
jpayne@68 122 |1 |contig |the chromosome name |
jpayne@68 123 +----------+----------+-------------------------------+
jpayne@68 124 |2 |feature |The feature type |
jpayne@68 125 +----------+----------+-------------------------------+
jpayne@68 126 |3 |source |The feature source |
jpayne@68 127 +----------+----------+-------------------------------+
jpayne@68 128 |4 |start |genomic start coordinate |
jpayne@68 129 | | |(0-based) |
jpayne@68 130 +----------+----------+-------------------------------+
jpayne@68 131 |5 |end |genomic end coordinate |
jpayne@68 132 | | |(0-based) |
jpayne@68 133 +----------+----------+-------------------------------+
jpayne@68 134 |6 |score |feature score |
jpayne@68 135 +----------+----------+-------------------------------+
jpayne@68 136 |7 |strand |strand |
jpayne@68 137 +----------+----------+-------------------------------+
jpayne@68 138 |8 |frame |frame |
jpayne@68 139 +----------+----------+-------------------------------+
jpayne@68 140 |9 |attributes|the attribute field |
jpayne@68 141 +----------+----------+-------------------------------+
jpayne@68 142
jpayne@68 143 '''
jpayne@68 144 cdef parse(self, char * buffer, int len):
jpayne@68 145 cdef ctabixproxies.GFF3Proxy r
jpayne@68 146 r = ctabixproxies.GFF3Proxy(self.encoding)
jpayne@68 147 r.copy(buffer, len)
jpayne@68 148 return r
jpayne@68 149
jpayne@68 150
jpayne@68 151 cdef class asGTF(Parser):
jpayne@68 152 '''converts a :term:`tabix row` into a GTF record with the following
jpayne@68 153 fields:
jpayne@68 154
jpayne@68 155 +----------+----------+-------------------------------+
jpayne@68 156 |*Column* |*Name* |*Content* |
jpayne@68 157 +----------+----------+-------------------------------+
jpayne@68 158 |1 |contig |the chromosome name |
jpayne@68 159 +----------+----------+-------------------------------+
jpayne@68 160 |2 |feature |The feature type |
jpayne@68 161 +----------+----------+-------------------------------+
jpayne@68 162 |3 |source |The feature source |
jpayne@68 163 +----------+----------+-------------------------------+
jpayne@68 164 |4 |start |genomic start coordinate |
jpayne@68 165 | | |(0-based) |
jpayne@68 166 +----------+----------+-------------------------------+
jpayne@68 167 |5 |end |genomic end coordinate |
jpayne@68 168 | | |(0-based) |
jpayne@68 169 +----------+----------+-------------------------------+
jpayne@68 170 |6 |score |feature score |
jpayne@68 171 +----------+----------+-------------------------------+
jpayne@68 172 |7 |strand |strand |
jpayne@68 173 +----------+----------+-------------------------------+
jpayne@68 174 |8 |frame |frame |
jpayne@68 175 +----------+----------+-------------------------------+
jpayne@68 176 |9 |attributes|the attribute field |
jpayne@68 177 +----------+----------+-------------------------------+
jpayne@68 178
jpayne@68 179 GTF formatted entries also define the following fields that
jpayne@68 180 are derived from the attributes field:
jpayne@68 181
jpayne@68 182 +--------------------+------------------------------+
jpayne@68 183 |*Name* |*Content* |
jpayne@68 184 +--------------------+------------------------------+
jpayne@68 185 |gene_id |the gene identifier |
jpayne@68 186 +--------------------+------------------------------+
jpayne@68 187 |transcript_id |the transcript identifier |
jpayne@68 188 +--------------------+------------------------------+
jpayne@68 189
jpayne@68 190 '''
jpayne@68 191 cdef parse(self, char * buffer, int len):
jpayne@68 192 cdef ctabixproxies.GTFProxy r
jpayne@68 193 r = ctabixproxies.GTFProxy(self.encoding)
jpayne@68 194 r.copy(buffer, len)
jpayne@68 195 return r
jpayne@68 196
jpayne@68 197
jpayne@68 198 cdef class asBed(Parser):
jpayne@68 199 '''converts a :term:`tabix row` into a bed record
jpayne@68 200 with the following fields:
jpayne@68 201
jpayne@68 202 +-----------+-----------+------------------------------------------+
jpayne@68 203 |*Column* |*Field* |*Contents* |
jpayne@68 204 | | | |
jpayne@68 205 +-----------+-----------+------------------------------------------+
jpayne@68 206 |1 |contig |contig |
jpayne@68 207 | | | |
jpayne@68 208 +-----------+-----------+------------------------------------------+
jpayne@68 209 |2 |start |genomic start coordinate (zero-based) |
jpayne@68 210 +-----------+-----------+------------------------------------------+
jpayne@68 211 |3 |end |genomic end coordinate plus one |
jpayne@68 212 | | |(zero-based) |
jpayne@68 213 +-----------+-----------+------------------------------------------+
jpayne@68 214 |4 |name |name of feature. |
jpayne@68 215 +-----------+-----------+------------------------------------------+
jpayne@68 216 |5 |score |score of feature |
jpayne@68 217 +-----------+-----------+------------------------------------------+
jpayne@68 218 |6 |strand |strand of feature |
jpayne@68 219 +-----------+-----------+------------------------------------------+
jpayne@68 220 |7 |thickStart |thickStart |
jpayne@68 221 +-----------+-----------+------------------------------------------+
jpayne@68 222 |8 |thickEnd |thickEnd |
jpayne@68 223 +-----------+-----------+------------------------------------------+
jpayne@68 224 |9 |itemRGB |itemRGB |
jpayne@68 225 +-----------+-----------+------------------------------------------+
jpayne@68 226 |10 |blockCount |number of bocks |
jpayne@68 227 +-----------+-----------+------------------------------------------+
jpayne@68 228 |11 |blockSizes |',' separated string of block sizes |
jpayne@68 229 +-----------+-----------+------------------------------------------+
jpayne@68 230 |12 |blockStarts|',' separated string of block genomic |
jpayne@68 231 | | |start positions |
jpayne@68 232 +-----------+-----------+------------------------------------------+
jpayne@68 233
jpayne@68 234 Only the first three fields are required. Additional
jpayne@68 235 fields are optional, but if one is defined, all the preceding
jpayne@68 236 need to be defined as well.
jpayne@68 237
jpayne@68 238 '''
jpayne@68 239 cdef parse(self, char * buffer, int len):
jpayne@68 240 cdef ctabixproxies.BedProxy r
jpayne@68 241 r = ctabixproxies.BedProxy(self.encoding)
jpayne@68 242 r.copy(buffer, len)
jpayne@68 243 return r
jpayne@68 244
jpayne@68 245
jpayne@68 246 cdef class asVCF(Parser):
jpayne@68 247 '''converts a :term:`tabix row` into a VCF record with
jpayne@68 248 the following fields:
jpayne@68 249
jpayne@68 250 +----------+---------+------------------------------------+
jpayne@68 251 |*Column* |*Field* |*Contents* |
jpayne@68 252 | | | |
jpayne@68 253 +----------+---------+------------------------------------+
jpayne@68 254 |1 |contig |chromosome |
jpayne@68 255 +----------+---------+------------------------------------+
jpayne@68 256 |2 |pos |chromosomal position, zero-based |
jpayne@68 257 +----------+---------+------------------------------------+
jpayne@68 258 |3 |id |id |
jpayne@68 259 +----------+---------+------------------------------------+
jpayne@68 260 |4 |ref |reference allele |
jpayne@68 261 +----------+---------+------------------------------------+
jpayne@68 262 |5 |alt |alternate alleles |
jpayne@68 263 +----------+---------+------------------------------------+
jpayne@68 264 |6 |qual |quality |
jpayne@68 265 +----------+---------+------------------------------------+
jpayne@68 266 |7 |filter |filter |
jpayne@68 267 +----------+---------+------------------------------------+
jpayne@68 268 |8 |info |info |
jpayne@68 269 +----------+---------+------------------------------------+
jpayne@68 270 |9 |format |format specifier. |
jpayne@68 271 +----------+---------+------------------------------------+
jpayne@68 272
jpayne@68 273 Access to genotypes is via index::
jpayne@68 274
jpayne@68 275 contig = vcf.contig
jpayne@68 276 first_sample_genotype = vcf[0]
jpayne@68 277 second_sample_genotype = vcf[1]
jpayne@68 278
jpayne@68 279 '''
jpayne@68 280 cdef parse(self, char * buffer, int len):
jpayne@68 281 cdef ctabixproxies.VCFProxy r
jpayne@68 282 r = ctabixproxies.VCFProxy(self.encoding)
jpayne@68 283 r.copy(buffer, len)
jpayne@68 284 return r
jpayne@68 285
jpayne@68 286
jpayne@68 287 cdef class TabixFile:
jpayne@68 288 """Random access to bgzf formatted files that
jpayne@68 289 have been indexed by :term:`tabix`.
jpayne@68 290
jpayne@68 291 The file is automatically opened. The index file of file
jpayne@68 292 ``<filename>`` is expected to be called ``<filename>.tbi``
jpayne@68 293 by default (see parameter `index`).
jpayne@68 294
jpayne@68 295 Parameters
jpayne@68 296 ----------
jpayne@68 297
jpayne@68 298 filename : string
jpayne@68 299 Filename of bgzf file to be opened.
jpayne@68 300
jpayne@68 301 index : string
jpayne@68 302 The filename of the index. If not set, the default is to
jpayne@68 303 assume that the index is called ``filename.tbi``
jpayne@68 304
jpayne@68 305 mode : char
jpayne@68 306 The file opening mode. Currently, only ``r`` is permitted.
jpayne@68 307
jpayne@68 308 parser : :class:`pysam.Parser`
jpayne@68 309
jpayne@68 310 sets the default parser for this tabix file. If `parser`
jpayne@68 311 is None, the results are returned as an unparsed string.
jpayne@68 312 Otherwise, `parser` is assumed to be a functor that will return
jpayne@68 313 parsed data (see for example :class:`~pysam.asTuple` and
jpayne@68 314 :class:`~pysam.asGTF`).
jpayne@68 315
jpayne@68 316 encoding : string
jpayne@68 317
jpayne@68 318 The encoding passed to the parser
jpayne@68 319
jpayne@68 320 threads: integer
jpayne@68 321 Number of threads to use for decompressing Tabix files.
jpayne@68 322 (Default=1)
jpayne@68 323
jpayne@68 324
jpayne@68 325 Raises
jpayne@68 326 ------
jpayne@68 327
jpayne@68 328 ValueError
jpayne@68 329 if index file is missing.
jpayne@68 330
jpayne@68 331 IOError
jpayne@68 332 if file could not be opened
jpayne@68 333 """
jpayne@68 334 def __cinit__(self,
jpayne@68 335 filename,
jpayne@68 336 mode='r',
jpayne@68 337 parser=None,
jpayne@68 338 index=None,
jpayne@68 339 encoding="ascii",
jpayne@68 340 threads=1,
jpayne@68 341 *args,
jpayne@68 342 **kwargs ):
jpayne@68 343
jpayne@68 344 self.htsfile = NULL
jpayne@68 345 self.is_remote = False
jpayne@68 346 self.is_stream = False
jpayne@68 347 self.parser = parser
jpayne@68 348 self.threads = threads
jpayne@68 349 self._open(filename, mode, index, *args, **kwargs)
jpayne@68 350 self.encoding = encoding
jpayne@68 351
jpayne@68 352 def _open( self,
jpayne@68 353 filename,
jpayne@68 354 mode='r',
jpayne@68 355 index=None,
jpayne@68 356 threads=1,
jpayne@68 357 ):
jpayne@68 358 '''open a :term:`tabix file` for reading.'''
jpayne@68 359
jpayne@68 360 if mode != 'r':
jpayne@68 361 raise ValueError("invalid file opening mode `%s`" % mode)
jpayne@68 362
jpayne@68 363 if self.htsfile != NULL:
jpayne@68 364 self.close()
jpayne@68 365 self.htsfile = NULL
jpayne@68 366 self.threads=threads
jpayne@68 367
jpayne@68 368 filename_index = index or (filename + ".tbi")
jpayne@68 369 # encode all the strings to pass to tabix
jpayne@68 370 self.filename = encode_filename(filename)
jpayne@68 371 self.filename_index = encode_filename(filename_index)
jpayne@68 372
jpayne@68 373 self.is_stream = self.filename == b'-'
jpayne@68 374 self.is_remote = hisremote(self.filename)
jpayne@68 375
jpayne@68 376 if not self.is_remote:
jpayne@68 377 if not os.path.exists(filename):
jpayne@68 378 raise IOError("file `%s` not found" % filename)
jpayne@68 379
jpayne@68 380 if not os.path.exists(filename_index):
jpayne@68 381 raise IOError("index `%s` not found" % filename_index)
jpayne@68 382
jpayne@68 383 # open file
jpayne@68 384 cdef char *cfilename = self.filename
jpayne@68 385 cdef char *cfilename_index = self.filename_index
jpayne@68 386 with nogil:
jpayne@68 387 self.htsfile = hts_open(cfilename, 'r')
jpayne@68 388
jpayne@68 389 if self.htsfile == NULL:
jpayne@68 390 raise IOError("could not open file `%s`" % filename)
jpayne@68 391
jpayne@68 392 #if self.htsfile.format.category != region_list:
jpayne@68 393 # raise ValueError("file does not contain region data")
jpayne@68 394
jpayne@68 395 with nogil:
jpayne@68 396 self.index = tbx_index_load2(cfilename, cfilename_index)
jpayne@68 397
jpayne@68 398 if self.index == NULL:
jpayne@68 399 raise IOError("could not open index for `%s`" % filename)
jpayne@68 400
jpayne@68 401 if not self.is_stream:
jpayne@68 402 self.start_offset = self.tell()
jpayne@68 403
jpayne@68 404 def _dup(self):
jpayne@68 405 '''return a copy of this tabix file.
jpayne@68 406
jpayne@68 407 The file is being re-opened.
jpayne@68 408 '''
jpayne@68 409 return TabixFile(self.filename,
jpayne@68 410 mode="r",
jpayne@68 411 threads=self.threads,
jpayne@68 412 parser=self.parser,
jpayne@68 413 index=self.filename_index,
jpayne@68 414 encoding=self.encoding)
jpayne@68 415
jpayne@68 416 def fetch(self,
jpayne@68 417 reference=None,
jpayne@68 418 start=None,
jpayne@68 419 end=None,
jpayne@68 420 region=None,
jpayne@68 421 parser=None,
jpayne@68 422 multiple_iterators=False):
jpayne@68 423 '''fetch one or more rows in a :term:`region` using 0-based
jpayne@68 424 indexing. The region is specified by :term:`reference`,
jpayne@68 425 *start* and *end*. Alternatively, a samtools :term:`region`
jpayne@68 426 string can be supplied.
jpayne@68 427
jpayne@68 428 Without *reference* or *region* all entries will be fetched.
jpayne@68 429
jpayne@68 430 If only *reference* is set, all reads matching on *reference*
jpayne@68 431 will be fetched.
jpayne@68 432
jpayne@68 433 If *parser* is None, the default parser will be used for
jpayne@68 434 parsing.
jpayne@68 435
jpayne@68 436 Set *multiple_iterators* to true if you will be using multiple
jpayne@68 437 iterators on the same file at the same time. The iterator
jpayne@68 438 returned will receive its own copy of a filehandle to the file
jpayne@68 439 effectively re-opening the file. Re-opening a file creates
jpayne@68 440 some overhead, so beware.
jpayne@68 441
jpayne@68 442 '''
jpayne@68 443 if not self.is_open():
jpayne@68 444 raise ValueError("I/O operation on closed file")
jpayne@68 445
jpayne@68 446 # convert coordinates to region string, which is one-based
jpayne@68 447 if reference:
jpayne@68 448 if end is not None:
jpayne@68 449 if end < 0:
jpayne@68 450 raise ValueError("end out of range (%i)" % end)
jpayne@68 451 if start is None:
jpayne@68 452 start = 0
jpayne@68 453
jpayne@68 454 if start < 0:
jpayne@68 455 raise ValueError("start out of range (%i)" % end)
jpayne@68 456 elif start > end:
jpayne@68 457 raise ValueError(
jpayne@68 458 'start (%i) >= end (%i)' % (start, end))
jpayne@68 459 elif start == end:
jpayne@68 460 return EmptyIterator()
jpayne@68 461 else:
jpayne@68 462 region = '%s:%i-%i' % (reference, start + 1, end)
jpayne@68 463 elif start is not None:
jpayne@68 464 if start < 0:
jpayne@68 465 raise ValueError("start out of range (%i)" % end)
jpayne@68 466 region = '%s:%i' % (reference, start + 1)
jpayne@68 467 else:
jpayne@68 468 region = reference
jpayne@68 469
jpayne@68 470 # get iterator
jpayne@68 471 cdef hts_itr_t * itr
jpayne@68 472 cdef char *cstr
jpayne@68 473 cdef TabixFile fileobj
jpayne@68 474
jpayne@68 475 # reopen the same file if necessary
jpayne@68 476 if multiple_iterators:
jpayne@68 477 fileobj = self._dup()
jpayne@68 478 else:
jpayne@68 479 fileobj = self
jpayne@68 480
jpayne@68 481 if region is None:
jpayne@68 482 # without region or reference - iterate from start
jpayne@68 483 with nogil:
jpayne@68 484 itr = tbx_itr_queryi(fileobj.index,
jpayne@68 485 HTS_IDX_START,
jpayne@68 486 0,
jpayne@68 487 0)
jpayne@68 488 else:
jpayne@68 489 s = force_bytes(region, encoding=fileobj.encoding)
jpayne@68 490 cstr = s
jpayne@68 491 with nogil:
jpayne@68 492 itr = tbx_itr_querys(fileobj.index, cstr)
jpayne@68 493
jpayne@68 494 if itr == NULL:
jpayne@68 495 if region is None:
jpayne@68 496 if len(self.contigs) > 0:
jpayne@68 497 # when accessing a tabix file created prior tabix 1.0
jpayne@68 498 # the full-file iterator is empty.
jpayne@68 499 raise ValueError(
jpayne@68 500 "could not create iterator, possible "
jpayne@68 501 "tabix version mismatch")
jpayne@68 502 else:
jpayne@68 503 # possible reason is that the file is empty -
jpayne@68 504 # return an empty iterator
jpayne@68 505 return EmptyIterator()
jpayne@68 506 else:
jpayne@68 507 raise ValueError(
jpayne@68 508 "could not create iterator for region '%s'" %
jpayne@68 509 region)
jpayne@68 510
jpayne@68 511 # use default parser if no parser is specified
jpayne@68 512 if parser is None:
jpayne@68 513 parser = fileobj.parser
jpayne@68 514
jpayne@68 515 cdef TabixIterator a
jpayne@68 516 if parser is None:
jpayne@68 517 a = TabixIterator(encoding=fileobj.encoding)
jpayne@68 518 else:
jpayne@68 519 parser.set_encoding(fileobj.encoding)
jpayne@68 520 a = TabixIteratorParsed(parser)
jpayne@68 521
jpayne@68 522 a.tabixfile = fileobj
jpayne@68 523 a.iterator = itr
jpayne@68 524
jpayne@68 525 return a
jpayne@68 526
jpayne@68 527 ###############################################################
jpayne@68 528 ###############################################################
jpayne@68 529 ###############################################################
jpayne@68 530 ## properties
jpayne@68 531 ###############################################################
jpayne@68 532 property header:
jpayne@68 533 '''the file header.
jpayne@68 534
jpayne@68 535 The file header consists of the lines at the beginning of a
jpayne@68 536 file that are prefixed by the comment character ``#``.
jpayne@68 537
jpayne@68 538 .. note::
jpayne@68 539 The header is returned as an iterator presenting lines
jpayne@68 540 without the newline character.
jpayne@68 541 '''
jpayne@68 542
jpayne@68 543 def __get__(self):
jpayne@68 544
jpayne@68 545 cdef char *cfilename = self.filename
jpayne@68 546 cdef char *cfilename_index = self.filename_index
jpayne@68 547
jpayne@68 548 cdef kstring_t buffer
jpayne@68 549 buffer.l = buffer.m = 0
jpayne@68 550 buffer.s = NULL
jpayne@68 551
jpayne@68 552 cdef htsFile * fp = NULL
jpayne@68 553 cdef int KS_SEP_LINE = 2
jpayne@68 554 cdef tbx_t * tbx = NULL
jpayne@68 555 lines = []
jpayne@68 556 with nogil:
jpayne@68 557 fp = hts_open(cfilename, 'r')
jpayne@68 558
jpayne@68 559 if fp == NULL:
jpayne@68 560 raise OSError("could not open {} for reading header".format(self.filename))
jpayne@68 561
jpayne@68 562 with nogil:
jpayne@68 563 tbx = tbx_index_load2(cfilename, cfilename_index)
jpayne@68 564
jpayne@68 565 if tbx == NULL:
jpayne@68 566 raise OSError("could not load .tbi/.csi index of {}".format(self.filename))
jpayne@68 567
jpayne@68 568 while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
jpayne@68 569 if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
jpayne@68 570 break
jpayne@68 571 lines.append(force_str(buffer.s, self.encoding))
jpayne@68 572
jpayne@68 573 with nogil:
jpayne@68 574 hts_close(fp)
jpayne@68 575 free(buffer.s)
jpayne@68 576
jpayne@68 577 return lines
jpayne@68 578
jpayne@68 579 property contigs:
jpayne@68 580 '''list of chromosome names'''
jpayne@68 581 def __get__(self):
jpayne@68 582 cdef const char ** sequences
jpayne@68 583 cdef int nsequences
jpayne@68 584
jpayne@68 585 with nogil:
jpayne@68 586 sequences = tbx_seqnames(self.index, &nsequences)
jpayne@68 587 cdef int x
jpayne@68 588 result = []
jpayne@68 589 for x from 0 <= x < nsequences:
jpayne@68 590 result.append(force_str(sequences[x]))
jpayne@68 591
jpayne@68 592 # htslib instructions:
jpayne@68 593 # only free container, not the sequences themselves
jpayne@68 594 free(sequences)
jpayne@68 595
jpayne@68 596 return result
jpayne@68 597
jpayne@68 598 def close(self):
jpayne@68 599 '''
jpayne@68 600 closes the :class:`pysam.TabixFile`.'''
jpayne@68 601 if self.htsfile != NULL:
jpayne@68 602 hts_close(self.htsfile)
jpayne@68 603 self.htsfile = NULL
jpayne@68 604 if self.index != NULL:
jpayne@68 605 tbx_destroy(self.index)
jpayne@68 606 self.index = NULL
jpayne@68 607
jpayne@68 608 def __dealloc__( self ):
jpayne@68 609 # remember: dealloc cannot call other python methods
jpayne@68 610 # note: no doc string
jpayne@68 611 # note: __del__ is not called.
jpayne@68 612 if self.htsfile != NULL:
jpayne@68 613 hts_close(self.htsfile)
jpayne@68 614 self.htsfile = NULL
jpayne@68 615 if self.index != NULL:
jpayne@68 616 tbx_destroy(self.index)
jpayne@68 617
jpayne@68 618
jpayne@68 619 cdef class TabixIterator:
jpayne@68 620 """iterates over rows in *tabixfile* in region
jpayne@68 621 given by *tid*, *start* and *end*.
jpayne@68 622 """
jpayne@68 623
jpayne@68 624 def __init__(self, encoding="ascii"):
jpayne@68 625 self.encoding = encoding
jpayne@68 626
jpayne@68 627 def __iter__(self):
jpayne@68 628 self.buffer.s = NULL
jpayne@68 629 self.buffer.l = 0
jpayne@68 630 self.buffer.m = 0
jpayne@68 631
jpayne@68 632 return self
jpayne@68 633
jpayne@68 634 cdef int __cnext__(self):
jpayne@68 635 '''iterate to next element.
jpayne@68 636
jpayne@68 637 Return -5 if file has been closed when this function
jpayne@68 638 was called.
jpayne@68 639 '''
jpayne@68 640 if self.tabixfile.htsfile == NULL:
jpayne@68 641 return -5
jpayne@68 642
jpayne@68 643 cdef int retval
jpayne@68 644
jpayne@68 645 while 1:
jpayne@68 646 with nogil:
jpayne@68 647 retval = tbx_itr_next(
jpayne@68 648 self.tabixfile.htsfile,
jpayne@68 649 self.tabixfile.index,
jpayne@68 650 self.iterator,
jpayne@68 651 &self.buffer)
jpayne@68 652
jpayne@68 653 if retval < 0:
jpayne@68 654 break
jpayne@68 655
jpayne@68 656 if self.buffer.s[0] != b'#':
jpayne@68 657 break
jpayne@68 658
jpayne@68 659 return retval
jpayne@68 660
jpayne@68 661 def __next__(self):
jpayne@68 662 """python version of next().
jpayne@68 663
jpayne@68 664 pyrex uses this non-standard name instead of next()
jpayne@68 665 """
jpayne@68 666
jpayne@68 667 cdef int retval = self.__cnext__()
jpayne@68 668 if retval == -5:
jpayne@68 669 raise IOError("iteration on closed file")
jpayne@68 670 elif retval < 0:
jpayne@68 671 raise StopIteration
jpayne@68 672
jpayne@68 673 return charptr_to_str(self.buffer.s, self.encoding)
jpayne@68 674
jpayne@68 675 def __dealloc__(self):
jpayne@68 676 if <void*>self.iterator != NULL:
jpayne@68 677 tbx_itr_destroy(self.iterator)
jpayne@68 678 if self.buffer.s != NULL:
jpayne@68 679 free(self.buffer.s)
jpayne@68 680
jpayne@68 681
jpayne@68 682 class EmptyIterator:
jpayne@68 683 '''empty iterator'''
jpayne@68 684
jpayne@68 685 def __iter__(self):
jpayne@68 686 return self
jpayne@68 687
jpayne@68 688 def __next__(self):
jpayne@68 689 raise StopIteration()
jpayne@68 690
jpayne@68 691
jpayne@68 692 cdef class TabixIteratorParsed(TabixIterator):
jpayne@68 693 """iterates over mapped reads in a region.
jpayne@68 694
jpayne@68 695 The *parser* determines the encoding.
jpayne@68 696
jpayne@68 697 Returns parsed data.
jpayne@68 698 """
jpayne@68 699
jpayne@68 700 def __init__(self, Parser parser):
jpayne@68 701 super().__init__()
jpayne@68 702 self.parser = parser
jpayne@68 703
jpayne@68 704 def __next__(self):
jpayne@68 705 """python version of next().
jpayne@68 706
jpayne@68 707 pyrex uses this non-standard name instead of next()
jpayne@68 708 """
jpayne@68 709
jpayne@68 710 cdef int retval = self.__cnext__()
jpayne@68 711 if retval == -5:
jpayne@68 712 raise IOError("iteration on closed file")
jpayne@68 713 elif retval < 0:
jpayne@68 714 raise StopIteration
jpayne@68 715
jpayne@68 716 return self.parser.parse(self.buffer.s,
jpayne@68 717 self.buffer.l)
jpayne@68 718
jpayne@68 719
jpayne@68 720 cdef class GZIterator:
jpayne@68 721 def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
jpayne@68 722 '''iterate line-by-line through gzip (or bgzip)
jpayne@68 723 compressed file.
jpayne@68 724 '''
jpayne@68 725 if not os.path.exists(filename):
jpayne@68 726 raise IOError("No such file or directory: %s" % filename)
jpayne@68 727
jpayne@68 728 filename = encode_filename(filename)
jpayne@68 729 cdef char *cfilename = filename
jpayne@68 730 with nogil:
jpayne@68 731 self.gzipfile = bgzf_open(cfilename, "r")
jpayne@68 732 self._filename = filename
jpayne@68 733 self.kstream = ks_init(self.gzipfile)
jpayne@68 734 self.encoding = encoding
jpayne@68 735
jpayne@68 736 self.buffer.l = 0
jpayne@68 737 self.buffer.m = 0
jpayne@68 738 self.buffer.s = <char*>malloc(buffer_size)
jpayne@68 739
jpayne@68 740 def __dealloc__(self):
jpayne@68 741 '''close file.'''
jpayne@68 742 if self.gzipfile != NULL:
jpayne@68 743 bgzf_close(self.gzipfile)
jpayne@68 744 self.gzipfile = NULL
jpayne@68 745 if self.buffer.s != NULL:
jpayne@68 746 free(self.buffer.s)
jpayne@68 747 if self.kstream != NULL:
jpayne@68 748 ks_destroy(self.kstream)
jpayne@68 749
jpayne@68 750 def __iter__(self):
jpayne@68 751 return self
jpayne@68 752
jpayne@68 753 cdef int __cnext__(self):
jpayne@68 754 cdef int dret = 0
jpayne@68 755 cdef int retval = 0
jpayne@68 756 while 1:
jpayne@68 757 with nogil:
jpayne@68 758 retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
jpayne@68 759
jpayne@68 760 if retval < 0:
jpayne@68 761 break
jpayne@68 762
jpayne@68 763 return dret
jpayne@68 764 return -1
jpayne@68 765
jpayne@68 766 def __next__(self):
jpayne@68 767 """python version of next().
jpayne@68 768 """
jpayne@68 769 cdef int retval = self.__cnext__()
jpayne@68 770 if retval < 0:
jpayne@68 771 raise StopIteration
jpayne@68 772 return force_str(self.buffer.s, self.encoding)
jpayne@68 773
jpayne@68 774
jpayne@68 775 cdef class GZIteratorHead(GZIterator):
jpayne@68 776 '''iterate line-by-line through gzip (or bgzip)
jpayne@68 777 compressed file returning comments at top of file.
jpayne@68 778 '''
jpayne@68 779
jpayne@68 780 def __next__(self):
jpayne@68 781 """python version of next().
jpayne@68 782 """
jpayne@68 783 cdef int retval = self.__cnext__()
jpayne@68 784 if retval < 0:
jpayne@68 785 raise StopIteration
jpayne@68 786 if self.buffer.s[0] == b'#':
jpayne@68 787 return self.buffer.s
jpayne@68 788 else:
jpayne@68 789 raise StopIteration
jpayne@68 790
jpayne@68 791
jpayne@68 792 cdef class GZIteratorParsed(GZIterator):
jpayne@68 793 '''iterate line-by-line through gzip (or bgzip)
jpayne@68 794 compressed file returning comments at top of file.
jpayne@68 795 '''
jpayne@68 796
jpayne@68 797 def __init__(self, parser):
jpayne@68 798 self.parser = parser
jpayne@68 799
jpayne@68 800 def __next__(self):
jpayne@68 801 """python version of next().
jpayne@68 802 """
jpayne@68 803 cdef int retval = self.__cnext__()
jpayne@68 804 if retval < 0:
jpayne@68 805 raise StopIteration
jpayne@68 806
jpayne@68 807 return self.parser.parse(self.buffer.s,
jpayne@68 808 self.buffer.l)
jpayne@68 809
jpayne@68 810
jpayne@68 811 def tabix_compress(filename_in,
jpayne@68 812 filename_out,
jpayne@68 813 force=False):
jpayne@68 814 '''compress *filename_in* writing the output to *filename_out*.
jpayne@68 815
jpayne@68 816 Raise an IOError if *filename_out* already exists, unless *force*
jpayne@68 817 is set.
jpayne@68 818 '''
jpayne@68 819
jpayne@68 820 if not force and os.path.exists(filename_out):
jpayne@68 821 raise IOError(
jpayne@68 822 "Filename '%s' already exists, use *force* to "
jpayne@68 823 "overwrite" % filename_out)
jpayne@68 824
jpayne@68 825 cdef int WINDOW_SIZE
jpayne@68 826 cdef int c, r
jpayne@68 827 cdef void * buffer
jpayne@68 828 cdef BGZF * fp
jpayne@68 829 cdef int fd_src
jpayne@68 830 cdef bint is_empty = True
jpayne@68 831 cdef int O_RDONLY
jpayne@68 832 O_RDONLY = os.O_RDONLY
jpayne@68 833
jpayne@68 834 WINDOW_SIZE = 64 * 1024
jpayne@68 835
jpayne@68 836 fn = encode_filename(filename_out)
jpayne@68 837 cdef char *cfn = fn
jpayne@68 838 with nogil:
jpayne@68 839 fp = bgzf_open(cfn, "w")
jpayne@68 840 if fp == NULL:
jpayne@68 841 raise IOError("could not open '%s' for writing" % filename_out)
jpayne@68 842
jpayne@68 843 fn = encode_filename(filename_in)
jpayne@68 844 fd_src = open(fn, O_RDONLY)
jpayne@68 845 if fd_src == 0:
jpayne@68 846 raise IOError("could not open '%s' for reading" % filename_in)
jpayne@68 847
jpayne@68 848 buffer = malloc(WINDOW_SIZE)
jpayne@68 849 c = 1
jpayne@68 850
jpayne@68 851 while c > 0:
jpayne@68 852 with nogil:
jpayne@68 853 c = read(fd_src, buffer, WINDOW_SIZE)
jpayne@68 854 if c > 0:
jpayne@68 855 is_empty = False
jpayne@68 856 r = bgzf_write(fp, buffer, c)
jpayne@68 857 if r < 0:
jpayne@68 858 free(buffer)
jpayne@68 859 raise IOError("writing failed")
jpayne@68 860
jpayne@68 861 free(buffer)
jpayne@68 862 r = bgzf_close(fp)
jpayne@68 863 if r < 0:
jpayne@68 864 raise IOError("error %i when writing to file %s" % (r, filename_out))
jpayne@68 865
jpayne@68 866 r = close(fd_src)
jpayne@68 867 # an empty file will return with -1, thus ignore this.
jpayne@68 868 if r < 0:
jpayne@68 869 if not (r == -1 and is_empty):
jpayne@68 870 raise IOError("error %i when closing file %s" % (r, filename_in))
jpayne@68 871
jpayne@68 872
jpayne@68 873 def tabix_index(filename,
jpayne@68 874 force=False,
jpayne@68 875 seq_col=None,
jpayne@68 876 start_col=None,
jpayne@68 877 end_col=None,
jpayne@68 878 preset=None,
jpayne@68 879 meta_char="#",
jpayne@68 880 int line_skip=0,
jpayne@68 881 zerobased=False,
jpayne@68 882 int min_shift=-1,
jpayne@68 883 index=None,
jpayne@68 884 keep_original=False,
jpayne@68 885 csi=False,
jpayne@68 886 ):
jpayne@68 887 '''index tab-separated *filename* using tabix.
jpayne@68 888
jpayne@68 889 An existing index will not be overwritten unless *force* is set.
jpayne@68 890
jpayne@68 891 The index will be built from coordinates in columns *seq_col*,
jpayne@68 892 *start_col* and *end_col*.
jpayne@68 893
jpayne@68 894 The contents of *filename* have to be sorted by contig and
jpayne@68 895 position - the method does not check if the file is sorted.
jpayne@68 896
jpayne@68 897 Column indices are 0-based. Note that this is different from the
jpayne@68 898 tabix command line utility where column indices start at 1.
jpayne@68 899
jpayne@68 900 Coordinates in the file are assumed to be 1-based unless
jpayne@68 901 *zerobased* is set.
jpayne@68 902
jpayne@68 903 If *preset* is provided, the column coordinates are taken from a
jpayne@68 904 preset. Valid values for preset are "gff", "bed", "sam", "vcf",
jpayne@68 905 psltbl", "pileup".
jpayne@68 906
jpayne@68 907 Lines beginning with *meta_char* and the first *line_skip* lines
jpayne@68 908 will be skipped.
jpayne@68 909
jpayne@68 910 If *filename* is not detected as a gzip file it will be automatically
jpayne@68 911 compressed. The original file will be removed and only the compressed
jpayne@68 912 file will be retained.
jpayne@68 913
jpayne@68 914 By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
jpayne@68 915 is greater than zero and/or *csi* is True, creates a CSI index with a
jpayne@68 916 minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).
jpayne@68 917
jpayne@68 918 *index* controls the filename which should be used for creating the index.
jpayne@68 919 If not set, the default is to append ``.tbi`` to *filename*.
jpayne@68 920
jpayne@68 921 When automatically compressing files, if *keep_original* is set the
jpayne@68 922 uncompressed file will not be deleted.
jpayne@68 923
jpayne@68 924 returns the filename of the compressed data
jpayne@68 925
jpayne@68 926 '''
jpayne@68 927
jpayne@68 928 if preset is None and \
jpayne@68 929 (seq_col is None or start_col is None or end_col is None):
jpayne@68 930 raise ValueError(
jpayne@68 931 "neither preset nor seq_col,start_col and end_col given")
jpayne@68 932
jpayne@68 933 fn = encode_filename(filename)
jpayne@68 934 cdef char *cfn = fn
jpayne@68 935
jpayne@68 936 cdef htsFile *fp = hts_open(cfn, "r")
jpayne@68 937 if fp == NULL:
jpayne@68 938 raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))
jpayne@68 939
jpayne@68 940 cdef htsFormat fmt = hts_get_format(fp)[0]
jpayne@68 941 hts_close(fp)
jpayne@68 942
jpayne@68 943 if fmt.compression == no_compression:
jpayne@68 944 tabix_compress(filename, filename + ".gz", force=force)
jpayne@68 945 if not keep_original:
jpayne@68 946 os.unlink(filename)
jpayne@68 947 filename += ".gz"
jpayne@68 948 fn = encode_filename(filename)
jpayne@68 949 cfn = fn
jpayne@68 950
jpayne@68 951 # columns (1-based):
jpayne@68 952 # preset-code, contig, start, end, metachar for
jpayne@68 953 # comments, lines to ignore at beginning
jpayne@68 954 # 0 is a missing column
jpayne@68 955 preset2conf = {
jpayne@68 956 'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
jpayne@68 957 'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
jpayne@68 958 'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
jpayne@68 959 'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
jpayne@68 960 'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
jpayne@68 961 }
jpayne@68 962
jpayne@68 963 conf_data = None
jpayne@68 964 if preset == "bcf" or fmt.format == bcf:
jpayne@68 965 csi = True
jpayne@68 966 elif preset:
jpayne@68 967 try:
jpayne@68 968 conf_data = preset2conf[preset]
jpayne@68 969 except KeyError:
jpayne@68 970 raise KeyError(
jpayne@68 971 "unknown preset '%s', valid presets are '%s'" %
jpayne@68 972 (preset, ",".join(preset2conf.keys())))
jpayne@68 973 else:
jpayne@68 974 if end_col is None:
jpayne@68 975 end_col = -1
jpayne@68 976
jpayne@68 977 preset = 0
jpayne@68 978 # tabix internally works with 0-based coordinates and
jpayne@68 979 # open/closed intervals. When using a preset, conversion is
jpayne@68 980 # automatically taken care of. Otherwise, the coordinates are
jpayne@68 981 # assumed to be 1-based closed intervals and -1 is subtracted
jpayne@68 982 # from the start coordinate. To avoid doing this, set the
jpayne@68 983 # TI_FLAG_UCSC=0x10000 flag:
jpayne@68 984 if zerobased:
jpayne@68 985 preset = preset | TBX_UCSC
jpayne@68 986
jpayne@68 987 conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)
jpayne@68 988
jpayne@68 989 cdef tbx_conf_t conf
jpayne@68 990 if conf_data:
jpayne@68 991 conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
jpayne@68 992
jpayne@68 993 if csi or min_shift > 0:
jpayne@68 994 suffix = ".csi"
jpayne@68 995 if min_shift <= 0: min_shift = 14
jpayne@68 996 else:
jpayne@68 997 suffix = ".tbi"
jpayne@68 998 min_shift = 0
jpayne@68 999
jpayne@68 1000 index = index or filename + suffix
jpayne@68 1001 fn_index = encode_filename(index)
jpayne@68 1002
jpayne@68 1003 if not force and os.path.exists(index):
jpayne@68 1004 raise IOError(
jpayne@68 1005 "filename '%s' already exists, use *force* to overwrite" % index)
jpayne@68 1006
jpayne@68 1007 cdef char *fnidx = fn_index
jpayne@68 1008 cdef int retval = 0
jpayne@68 1009
jpayne@68 1010 if csi and fmt.format == bcf:
jpayne@68 1011 with nogil:
jpayne@68 1012 retval = bcf_index_build2(cfn, fnidx, min_shift)
jpayne@68 1013 else:
jpayne@68 1014 with nogil:
jpayne@68 1015 retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)
jpayne@68 1016
jpayne@68 1017 if retval != 0:
jpayne@68 1018 raise OSError("building of index for {} failed".format(filename))
jpayne@68 1019
jpayne@68 1020 return filename
jpayne@68 1021
jpayne@68 1022 # #########################################################
jpayne@68 1023 # cdef class tabix_file_iterator_old:
jpayne@68 1024 # '''iterate over ``infile``.
jpayne@68 1025
jpayne@68 1026 # This iterator is not safe. If the :meth:`__next__()` method is called
jpayne@68 1027 # after ``infile`` is closed, the result is undefined (see ``fclose()``).
jpayne@68 1028
jpayne@68 1029 # The iterator might either raise a StopIteration or segfault.
jpayne@68 1030 # '''
jpayne@68 1031
jpayne@68 1032
jpayne@68 1033 # def __cinit__(self,
jpayne@68 1034 # infile,
jpayne@68 1035 # Parser parser,
jpayne@68 1036 # int buffer_size = 65536 ):
jpayne@68 1037
jpayne@68 1038 # cdef int fd = PyObject_AsFileDescriptor( infile )
jpayne@68 1039 # if fd == -1: raise ValueError( "I/O operation on closed file." )
jpayne@68 1040 # self.infile = fdopen( fd, 'r')
jpayne@68 1041
jpayne@68 1042 # if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
jpayne@68 1043
jpayne@68 1044 # self.buffer = <char*>malloc( buffer_size )
jpayne@68 1045 # self.size = buffer_size
jpayne@68 1046 # self.parser = parser
jpayne@68 1047
jpayne@68 1048 # def __iter__(self):
jpayne@68 1049 # return self
jpayne@68 1050
jpayne@68 1051 # cdef __cnext__(self):
jpayne@68 1052
jpayne@68 1053 # cdef char * b
jpayne@68 1054 # cdef size_t nbytes
jpayne@68 1055 # b = self.buffer
jpayne@68 1056
jpayne@68 1057 # while not feof( self.infile ):
jpayne@68 1058 # nbytes = getline( &b, &self.size, self.infile)
jpayne@68 1059
jpayne@68 1060 # # stop at first error or eof
jpayne@68 1061 # if (nbytes == -1): break
jpayne@68 1062 # # skip comments
jpayne@68 1063 # if (b[0] == '#'): continue
jpayne@68 1064
jpayne@68 1065 # # skip empty lines
jpayne@68 1066 # if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
jpayne@68 1067
jpayne@68 1068 # # make sure that entry is complete
jpayne@68 1069 # if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
jpayne@68 1070 # result = b
jpayne@68 1071 # raise ValueError( "incomplete line at %s" % result )
jpayne@68 1072
jpayne@68 1073 # # make sure that this goes fully through C
jpayne@68 1074 # # otherwise buffer is copied to/from a
jpayne@68 1075 # # Python object causing segfaults as
jpayne@68 1076 # # the wrong memory is freed
jpayne@68 1077 # return self.parser.parse( b, nbytes )
jpayne@68 1078
jpayne@68 1079 # raise StopIteration
jpayne@68 1080
jpayne@68 1081 # def __dealloc__(self):
jpayne@68 1082 # free(self.buffer)
jpayne@68 1083
jpayne@68 1084 # def __next__(self):
jpayne@68 1085 # return self.__cnext__()
jpayne@68 1086
jpayne@68 1087 #########################################################
jpayne@68 1088 #########################################################
jpayne@68 1089 #########################################################
jpayne@68 1090 ## Iterators for parsing through unindexed files.
jpayne@68 1091 #########################################################
jpayne@68 1092 # cdef buildGzipError(void *gzfp):
jpayne@68 1093 # cdef int errnum = 0
jpayne@68 1094 # cdef char *s = gzerror(gzfp, &errnum)
jpayne@68 1095 # return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
jpayne@68 1096
jpayne@68 1097
jpayne@68 1098 cdef class tabix_file_iterator:
jpayne@68 1099 '''iterate over a compressed or uncompressed ``infile``.
jpayne@68 1100 '''
jpayne@68 1101
jpayne@68 1102 def __cinit__(self,
jpayne@68 1103 infile,
jpayne@68 1104 Parser parser,
jpayne@68 1105 int buffer_size=65536):
jpayne@68 1106
jpayne@68 1107 if infile.closed:
jpayne@68 1108 raise ValueError("I/O operation on closed file.")
jpayne@68 1109
jpayne@68 1110 self.infile = infile
jpayne@68 1111
jpayne@68 1112 cdef int fd = PyObject_AsFileDescriptor(infile)
jpayne@68 1113 if fd == -1:
jpayne@68 1114 raise ValueError("I/O operation on closed file.")
jpayne@68 1115
jpayne@68 1116 self.duplicated_fd = dup(fd)
jpayne@68 1117
jpayne@68 1118 # From the manual:
jpayne@68 1119 # gzopen can be used to read a file which is not in gzip format;
jpayne@68 1120 # in this case gzread will directly read from the file without decompression.
jpayne@68 1121 # When reading, this will be detected automatically by looking
jpayne@68 1122 # for the magic two-byte gzip header.
jpayne@68 1123 self.fh = bgzf_dopen(self.duplicated_fd, 'r')
jpayne@68 1124
jpayne@68 1125 if self.fh == NULL:
jpayne@68 1126 raise IOError('%s' % strerror(errno))
jpayne@68 1127
jpayne@68 1128 self.kstream = ks_init(self.fh)
jpayne@68 1129
jpayne@68 1130 self.buffer.s = <char*>malloc(buffer_size)
jpayne@68 1131 #if self.buffer == NULL:
jpayne@68 1132 # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
jpayne@68 1133 #self.size = buffer_size
jpayne@68 1134 self.parser = parser
jpayne@68 1135
jpayne@68 1136 def __iter__(self):
jpayne@68 1137 return self
jpayne@68 1138
jpayne@68 1139 cdef __cnext__(self):
jpayne@68 1140
jpayne@68 1141 cdef char * b
jpayne@68 1142 cdef int dret = 0
jpayne@68 1143 cdef int retval = 0
jpayne@68 1144 while 1:
jpayne@68 1145 with nogil:
jpayne@68 1146 retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
jpayne@68 1147
jpayne@68 1148 if retval < 0:
jpayne@68 1149 break
jpayne@68 1150 #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
jpayne@68 1151
jpayne@68 1152 b = self.buffer.s
jpayne@68 1153
jpayne@68 1154 # skip comments
jpayne@68 1155 if (b[0] == b'#'):
jpayne@68 1156 continue
jpayne@68 1157
jpayne@68 1158 # skip empty lines
jpayne@68 1159 if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
jpayne@68 1160 continue
jpayne@68 1161
jpayne@68 1162 # gzgets terminates at \n, no need to test
jpayne@68 1163
jpayne@68 1164 # parser creates a copy
jpayne@68 1165 return self.parser.parse(b, self.buffer.l)
jpayne@68 1166
jpayne@68 1167 raise StopIteration
jpayne@68 1168
jpayne@68 1169 def __dealloc__(self):
jpayne@68 1170 free(self.buffer.s)
jpayne@68 1171 ks_destroy(self.kstream)
jpayne@68 1172 bgzf_close(self.fh)
jpayne@68 1173
jpayne@68 1174 def __next__(self):
jpayne@68 1175 return self.__cnext__()
jpayne@68 1176
jpayne@68 1177
jpayne@68 1178 class tabix_generic_iterator:
jpayne@68 1179 '''iterate over ``infile``.
jpayne@68 1180
jpayne@68 1181 Permits the use of file-like objects for example from the gzip module.
jpayne@68 1182 '''
jpayne@68 1183 def __init__(self, infile, parser):
jpayne@68 1184
jpayne@68 1185 self.infile = infile
jpayne@68 1186 if self.infile.closed:
jpayne@68 1187 raise ValueError("I/O operation on closed file.")
jpayne@68 1188 self.parser = parser
jpayne@68 1189
jpayne@68 1190 def __iter__(self):
jpayne@68 1191 return self
jpayne@68 1192
jpayne@68 1193 # cython version - required for python 3
jpayne@68 1194 def __next__(self):
jpayne@68 1195
jpayne@68 1196 cdef char * b
jpayne@68 1197 cdef char * cpy
jpayne@68 1198 cdef size_t nbytes
jpayne@68 1199
jpayne@68 1200 encoding = self.parser.get_encoding()
jpayne@68 1201
jpayne@68 1202 # note that GzipFile.close() does not close the file
jpayne@68 1203 # reading is still possible.
jpayne@68 1204 if self.infile.closed:
jpayne@68 1205 raise ValueError("I/O operation on closed file.")
jpayne@68 1206
jpayne@68 1207 while 1:
jpayne@68 1208
jpayne@68 1209 line = self.infile.readline()
jpayne@68 1210 if not line:
jpayne@68 1211 break
jpayne@68 1212
jpayne@68 1213 s = force_bytes(line, encoding)
jpayne@68 1214 b = s
jpayne@68 1215 nbytes = len(line)
jpayne@68 1216 assert b[nbytes] == b'\0'
jpayne@68 1217
jpayne@68 1218 # skip comments
jpayne@68 1219 if b[0] == b'#':
jpayne@68 1220 continue
jpayne@68 1221
jpayne@68 1222 # skip empty lines
jpayne@68 1223 if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
jpayne@68 1224 continue
jpayne@68 1225
jpayne@68 1226 # make sure that entry is complete
jpayne@68 1227 if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
jpayne@68 1228 raise ValueError("incomplete line at %s" % line)
jpayne@68 1229
jpayne@68 1230 bytes_cpy = <bytes> b
jpayne@68 1231 cpy = <char *> bytes_cpy
jpayne@68 1232
jpayne@68 1233 return self.parser(cpy, nbytes)
jpayne@68 1234
jpayne@68 1235 raise StopIteration
jpayne@68 1236
jpayne@68 1237
jpayne@68 1238 def tabix_iterator(infile, parser):
jpayne@68 1239 """return an iterator over all entries in a file.
jpayne@68 1240
jpayne@68 1241 Results are returned parsed as specified by the *parser*. If
jpayne@68 1242 *parser* is None, the results are returned as an unparsed string.
jpayne@68 1243 Otherwise, *parser* is assumed to be a functor that will return
jpayne@68 1244 parsed data (see for example :class:`~pysam.asTuple` and
jpayne@68 1245 :class:`~pysam.asGTF`).
jpayne@68 1246
jpayne@68 1247 """
jpayne@68 1248 return tabix_generic_iterator(infile, parser)
jpayne@68 1249
jpayne@68 1250
jpayne@68 1251 cdef class Tabixfile(TabixFile):
jpayne@68 1252 """Tabixfile is deprecated: use TabixFile instead"""
jpayne@68 1253 pass
jpayne@68 1254
jpayne@68 1255
jpayne@68 1256 __all__ = [
jpayne@68 1257 "tabix_index",
jpayne@68 1258 "tabix_compress",
jpayne@68 1259 "TabixFile",
jpayne@68 1260 "Tabixfile",
jpayne@68 1261 "asTuple",
jpayne@68 1262 "asGTF",
jpayne@68 1263 "asGFF3",
jpayne@68 1264 "asVCF",
jpayne@68 1265 "asBed",
jpayne@68 1266 "GZIterator",
jpayne@68 1267 "GZIteratorHead",
jpayne@68 1268 "tabix_iterator",
jpayne@68 1269 "tabix_generic_iterator",
jpayne@68 1270 "tabix_file_iterator",
jpayne@68 1271 ]