annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 # cython: language_level=3
jpayne@69 2 # cython: embedsignature=True
jpayne@69 3 # cython: profile=True
jpayne@69 4 ###############################################################################
jpayne@69 5 ###############################################################################
jpayne@69 6 # Cython wrapper for access to tabix indexed files in bgzf format
jpayne@69 7 ###############################################################################
jpayne@69 8 # The principal classes and functions defined in this module are:
jpayne@69 9 #
jpayne@69 10 # class TabixFile class wrapping tabix indexed files in bgzf format
jpayne@69 11 #
jpayne@69 12 # class asTuple Parser class for tuples
jpayne@69 13 # class asGTF Parser class for GTF formatted rows
jpayne@69 14 # class asGFF3 Parser class for GFF3 formatted rows
jpayne@69 15 # class asBed Parser class for Bed formatted rows
jpayne@69 16 # class asVCF Parser class for VCF formatted rows
jpayne@69 17 #
jpayne@69 18 # class tabix_generic_iterator Streamed iterator of bgzf formatted files
jpayne@69 19 #
jpayne@69 20 # Additionally this module defines several additional classes that are part
jpayne@69 21 # of the internal API. These are:
jpayne@69 22 #
jpayne@69 23 # class Parser base class for parsers of tab-separated rows
jpayne@69 24 # class tabix_file_iterator
jpayne@69 25 # class TabixIterator iterator class over rows in bgzf file
jpayne@69 26 # class EmptyIterator
jpayne@69 27 #
jpayne@69 28 # For backwards compatibility, the following classes are also defined:
jpayne@69 29 #
jpayne@69 30 # class Tabixfile equivalent to TabixFile
jpayne@69 31 #
jpayne@69 32 ###############################################################################
jpayne@69 33 #
jpayne@69 34 # The MIT License
jpayne@69 35 #
jpayne@69 36 # Copyright (c) 2015 Andreas Heger
jpayne@69 37 #
jpayne@69 38 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@69 39 # copy of this software and associated documentation files (the "Software"),
jpayne@69 40 # to deal in the Software without restriction, including without limitation
jpayne@69 41 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@69 42 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@69 43 # Software is furnished to do so, subject to the following conditions:
jpayne@69 44 #
jpayne@69 45 # The above copyright notice and this permission notice shall be included in
jpayne@69 46 # all copies or substantial portions of the Software.
jpayne@69 47 #
jpayne@69 48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@69 49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@69 50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@69 51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@69 52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@69 53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@69 54 # DEALINGS IN THE SOFTWARE.
jpayne@69 55 #
jpayne@69 56 ###############################################################################
jpayne@69 57 import os
jpayne@69 58 import sys
jpayne@69 59
jpayne@69 60 from libc.stdio cimport printf, fprintf, stderr
jpayne@69 61 from libc.string cimport strerror
jpayne@69 62 from libc.errno cimport errno
jpayne@69 63 from posix.unistd cimport dup
jpayne@69 64
jpayne@69 65 from cpython cimport PyErr_SetString, PyBytes_Check, \
jpayne@69 66 PyUnicode_Check, PyBytes_FromStringAndSize, \
jpayne@69 67 PyObject_AsFileDescriptor
jpayne@69 68
jpayne@69 69 cimport pysam.libctabixproxies as ctabixproxies
jpayne@69 70
jpayne@69 71 from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
jpayne@69 72 BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
jpayne@69 73 tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
jpayne@69 74 tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
jpayne@69 75 tbx_destroy, hisremote, region_list, hts_getline, \
jpayne@69 76 TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
jpayne@69 77 no_compression, bcf, bcf_index_build2
jpayne@69 78
jpayne@69 79 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
jpayne@69 80 from pysam.libcutils cimport encode_filename, from_string_and_size
jpayne@69 81
jpayne@69 82 cdef class Parser:
jpayne@69 83
jpayne@69 84 def __init__(self, encoding="ascii"):
jpayne@69 85 self.encoding = encoding
jpayne@69 86
jpayne@69 87 def set_encoding(self, encoding):
jpayne@69 88 self.encoding = encoding
jpayne@69 89
jpayne@69 90 def get_encoding(self):
jpayne@69 91 return self.encoding
jpayne@69 92
jpayne@69 93 cdef parse(self, char * buffer, int length):
jpayne@69 94 raise NotImplementedError(
jpayne@69 95 'parse method of %s not implemented' % str(self))
jpayne@69 96
jpayne@69 97 def __call__(self, char * buffer, int length):
jpayne@69 98 return self.parse(buffer, length)
jpayne@69 99
jpayne@69 100
jpayne@69 101 cdef class asTuple(Parser):
jpayne@69 102 '''converts a :term:`tabix row` into a python tuple.
jpayne@69 103
jpayne@69 104 A field in a row is accessed by numeric index.
jpayne@69 105 '''
jpayne@69 106 cdef parse(self, char * buffer, int len):
jpayne@69 107 cdef ctabixproxies.TupleProxy r
jpayne@69 108 r = ctabixproxies.TupleProxy(self.encoding)
jpayne@69 109 # need to copy - there were some
jpayne@69 110 # persistence issues with "present"
jpayne@69 111 r.copy(buffer, len)
jpayne@69 112 return r
jpayne@69 113
jpayne@69 114
jpayne@69 115 cdef class asGFF3(Parser):
jpayne@69 116 '''converts a :term:`tabix row` into a GFF record with the following
jpayne@69 117 fields:
jpayne@69 118
jpayne@69 119 +----------+----------+-------------------------------+
jpayne@69 120 |*Column* |*Name* |*Content* |
jpayne@69 121 +----------+----------+-------------------------------+
jpayne@69 122 |1 |contig |the chromosome name |
jpayne@69 123 +----------+----------+-------------------------------+
jpayne@69 124 |2 |feature |The feature type |
jpayne@69 125 +----------+----------+-------------------------------+
jpayne@69 126 |3 |source |The feature source |
jpayne@69 127 +----------+----------+-------------------------------+
jpayne@69 128 |4 |start |genomic start coordinate |
jpayne@69 129 | | |(0-based) |
jpayne@69 130 +----------+----------+-------------------------------+
jpayne@69 131 |5 |end |genomic end coordinate |
jpayne@69 132 | | |(0-based) |
jpayne@69 133 +----------+----------+-------------------------------+
jpayne@69 134 |6 |score |feature score |
jpayne@69 135 +----------+----------+-------------------------------+
jpayne@69 136 |7 |strand |strand |
jpayne@69 137 +----------+----------+-------------------------------+
jpayne@69 138 |8 |frame |frame |
jpayne@69 139 +----------+----------+-------------------------------+
jpayne@69 140 |9 |attributes|the attribute field |
jpayne@69 141 +----------+----------+-------------------------------+
jpayne@69 142
jpayne@69 143 '''
jpayne@69 144 cdef parse(self, char * buffer, int len):
jpayne@69 145 cdef ctabixproxies.GFF3Proxy r
jpayne@69 146 r = ctabixproxies.GFF3Proxy(self.encoding)
jpayne@69 147 r.copy(buffer, len)
jpayne@69 148 return r
jpayne@69 149
jpayne@69 150
jpayne@69 151 cdef class asGTF(Parser):
jpayne@69 152 '''converts a :term:`tabix row` into a GTF record with the following
jpayne@69 153 fields:
jpayne@69 154
jpayne@69 155 +----------+----------+-------------------------------+
jpayne@69 156 |*Column* |*Name* |*Content* |
jpayne@69 157 +----------+----------+-------------------------------+
jpayne@69 158 |1 |contig |the chromosome name |
jpayne@69 159 +----------+----------+-------------------------------+
jpayne@69 160 |2 |feature |The feature type |
jpayne@69 161 +----------+----------+-------------------------------+
jpayne@69 162 |3 |source |The feature source |
jpayne@69 163 +----------+----------+-------------------------------+
jpayne@69 164 |4 |start |genomic start coordinate |
jpayne@69 165 | | |(0-based) |
jpayne@69 166 +----------+----------+-------------------------------+
jpayne@69 167 |5 |end |genomic end coordinate |
jpayne@69 168 | | |(0-based) |
jpayne@69 169 +----------+----------+-------------------------------+
jpayne@69 170 |6 |score |feature score |
jpayne@69 171 +----------+----------+-------------------------------+
jpayne@69 172 |7 |strand |strand |
jpayne@69 173 +----------+----------+-------------------------------+
jpayne@69 174 |8 |frame |frame |
jpayne@69 175 +----------+----------+-------------------------------+
jpayne@69 176 |9 |attributes|the attribute field |
jpayne@69 177 +----------+----------+-------------------------------+
jpayne@69 178
jpayne@69 179 GTF formatted entries also define the following fields that
jpayne@69 180 are derived from the attributes field:
jpayne@69 181
jpayne@69 182 +--------------------+------------------------------+
jpayne@69 183 |*Name* |*Content* |
jpayne@69 184 +--------------------+------------------------------+
jpayne@69 185 |gene_id |the gene identifier |
jpayne@69 186 +--------------------+------------------------------+
jpayne@69 187 |transcript_id |the transcript identifier |
jpayne@69 188 +--------------------+------------------------------+
jpayne@69 189
jpayne@69 190 '''
jpayne@69 191 cdef parse(self, char * buffer, int len):
jpayne@69 192 cdef ctabixproxies.GTFProxy r
jpayne@69 193 r = ctabixproxies.GTFProxy(self.encoding)
jpayne@69 194 r.copy(buffer, len)
jpayne@69 195 return r
jpayne@69 196
jpayne@69 197
jpayne@69 198 cdef class asBed(Parser):
jpayne@69 199 '''converts a :term:`tabix row` into a bed record
jpayne@69 200 with the following fields:
jpayne@69 201
jpayne@69 202 +-----------+-----------+------------------------------------------+
jpayne@69 203 |*Column* |*Field* |*Contents* |
jpayne@69 204 | | | |
jpayne@69 205 +-----------+-----------+------------------------------------------+
jpayne@69 206 |1 |contig |contig |
jpayne@69 207 | | | |
jpayne@69 208 +-----------+-----------+------------------------------------------+
jpayne@69 209 |2 |start |genomic start coordinate (zero-based) |
jpayne@69 210 +-----------+-----------+------------------------------------------+
jpayne@69 211 |3 |end |genomic end coordinate plus one |
jpayne@69 212 | | |(zero-based) |
jpayne@69 213 +-----------+-----------+------------------------------------------+
jpayne@69 214 |4 |name |name of feature. |
jpayne@69 215 +-----------+-----------+------------------------------------------+
jpayne@69 216 |5 |score |score of feature |
jpayne@69 217 +-----------+-----------+------------------------------------------+
jpayne@69 218 |6 |strand |strand of feature |
jpayne@69 219 +-----------+-----------+------------------------------------------+
jpayne@69 220 |7 |thickStart |thickStart |
jpayne@69 221 +-----------+-----------+------------------------------------------+
jpayne@69 222 |8 |thickEnd |thickEnd |
jpayne@69 223 +-----------+-----------+------------------------------------------+
jpayne@69 224 |9 |itemRGB |itemRGB |
jpayne@69 225 +-----------+-----------+------------------------------------------+
jpayne@69 226 |10 |blockCount |number of bocks |
jpayne@69 227 +-----------+-----------+------------------------------------------+
jpayne@69 228 |11 |blockSizes |',' separated string of block sizes |
jpayne@69 229 +-----------+-----------+------------------------------------------+
jpayne@69 230 |12 |blockStarts|',' separated string of block genomic |
jpayne@69 231 | | |start positions |
jpayne@69 232 +-----------+-----------+------------------------------------------+
jpayne@69 233
jpayne@69 234 Only the first three fields are required. Additional
jpayne@69 235 fields are optional, but if one is defined, all the preceding
jpayne@69 236 need to be defined as well.
jpayne@69 237
jpayne@69 238 '''
jpayne@69 239 cdef parse(self, char * buffer, int len):
jpayne@69 240 cdef ctabixproxies.BedProxy r
jpayne@69 241 r = ctabixproxies.BedProxy(self.encoding)
jpayne@69 242 r.copy(buffer, len)
jpayne@69 243 return r
jpayne@69 244
jpayne@69 245
jpayne@69 246 cdef class asVCF(Parser):
jpayne@69 247 '''converts a :term:`tabix row` into a VCF record with
jpayne@69 248 the following fields:
jpayne@69 249
jpayne@69 250 +----------+---------+------------------------------------+
jpayne@69 251 |*Column* |*Field* |*Contents* |
jpayne@69 252 | | | |
jpayne@69 253 +----------+---------+------------------------------------+
jpayne@69 254 |1 |contig |chromosome |
jpayne@69 255 +----------+---------+------------------------------------+
jpayne@69 256 |2 |pos |chromosomal position, zero-based |
jpayne@69 257 +----------+---------+------------------------------------+
jpayne@69 258 |3 |id |id |
jpayne@69 259 +----------+---------+------------------------------------+
jpayne@69 260 |4 |ref |reference allele |
jpayne@69 261 +----------+---------+------------------------------------+
jpayne@69 262 |5 |alt |alternate alleles |
jpayne@69 263 +----------+---------+------------------------------------+
jpayne@69 264 |6 |qual |quality |
jpayne@69 265 +----------+---------+------------------------------------+
jpayne@69 266 |7 |filter |filter |
jpayne@69 267 +----------+---------+------------------------------------+
jpayne@69 268 |8 |info |info |
jpayne@69 269 +----------+---------+------------------------------------+
jpayne@69 270 |9 |format |format specifier. |
jpayne@69 271 +----------+---------+------------------------------------+
jpayne@69 272
jpayne@69 273 Access to genotypes is via index::
jpayne@69 274
jpayne@69 275 contig = vcf.contig
jpayne@69 276 first_sample_genotype = vcf[0]
jpayne@69 277 second_sample_genotype = vcf[1]
jpayne@69 278
jpayne@69 279 '''
jpayne@69 280 cdef parse(self, char * buffer, int len):
jpayne@69 281 cdef ctabixproxies.VCFProxy r
jpayne@69 282 r = ctabixproxies.VCFProxy(self.encoding)
jpayne@69 283 r.copy(buffer, len)
jpayne@69 284 return r
jpayne@69 285
jpayne@69 286
jpayne@69 287 cdef class TabixFile:
jpayne@69 288 """Random access to bgzf formatted files that
jpayne@69 289 have been indexed by :term:`tabix`.
jpayne@69 290
jpayne@69 291 The file is automatically opened. The index file of file
jpayne@69 292 ``<filename>`` is expected to be called ``<filename>.tbi``
jpayne@69 293 by default (see parameter `index`).
jpayne@69 294
jpayne@69 295 Parameters
jpayne@69 296 ----------
jpayne@69 297
jpayne@69 298 filename : string
jpayne@69 299 Filename of bgzf file to be opened.
jpayne@69 300
jpayne@69 301 index : string
jpayne@69 302 The filename of the index. If not set, the default is to
jpayne@69 303 assume that the index is called ``filename.tbi``
jpayne@69 304
jpayne@69 305 mode : char
jpayne@69 306 The file opening mode. Currently, only ``r`` is permitted.
jpayne@69 307
jpayne@69 308 parser : :class:`pysam.Parser`
jpayne@69 309
jpayne@69 310 sets the default parser for this tabix file. If `parser`
jpayne@69 311 is None, the results are returned as an unparsed string.
jpayne@69 312 Otherwise, `parser` is assumed to be a functor that will return
jpayne@69 313 parsed data (see for example :class:`~pysam.asTuple` and
jpayne@69 314 :class:`~pysam.asGTF`).
jpayne@69 315
jpayne@69 316 encoding : string
jpayne@69 317
jpayne@69 318 The encoding passed to the parser
jpayne@69 319
jpayne@69 320 threads: integer
jpayne@69 321 Number of threads to use for decompressing Tabix files.
jpayne@69 322 (Default=1)
jpayne@69 323
jpayne@69 324
jpayne@69 325 Raises
jpayne@69 326 ------
jpayne@69 327
jpayne@69 328 ValueError
jpayne@69 329 if index file is missing.
jpayne@69 330
jpayne@69 331 IOError
jpayne@69 332 if file could not be opened
jpayne@69 333 """
jpayne@69 334 def __cinit__(self,
jpayne@69 335 filename,
jpayne@69 336 mode='r',
jpayne@69 337 parser=None,
jpayne@69 338 index=None,
jpayne@69 339 encoding="ascii",
jpayne@69 340 threads=1,
jpayne@69 341 *args,
jpayne@69 342 **kwargs ):
jpayne@69 343
jpayne@69 344 self.htsfile = NULL
jpayne@69 345 self.is_remote = False
jpayne@69 346 self.is_stream = False
jpayne@69 347 self.parser = parser
jpayne@69 348 self.threads = threads
jpayne@69 349 self._open(filename, mode, index, *args, **kwargs)
jpayne@69 350 self.encoding = encoding
jpayne@69 351
jpayne@69 352 def _open( self,
jpayne@69 353 filename,
jpayne@69 354 mode='r',
jpayne@69 355 index=None,
jpayne@69 356 threads=1,
jpayne@69 357 ):
jpayne@69 358 '''open a :term:`tabix file` for reading.'''
jpayne@69 359
jpayne@69 360 if mode != 'r':
jpayne@69 361 raise ValueError("invalid file opening mode `%s`" % mode)
jpayne@69 362
jpayne@69 363 if self.htsfile != NULL:
jpayne@69 364 self.close()
jpayne@69 365 self.htsfile = NULL
jpayne@69 366 self.threads=threads
jpayne@69 367
jpayne@69 368 filename_index = index or (filename + ".tbi")
jpayne@69 369 # encode all the strings to pass to tabix
jpayne@69 370 self.filename = encode_filename(filename)
jpayne@69 371 self.filename_index = encode_filename(filename_index)
jpayne@69 372
jpayne@69 373 self.is_stream = self.filename == b'-'
jpayne@69 374 self.is_remote = hisremote(self.filename)
jpayne@69 375
jpayne@69 376 if not self.is_remote:
jpayne@69 377 if not os.path.exists(filename):
jpayne@69 378 raise IOError("file `%s` not found" % filename)
jpayne@69 379
jpayne@69 380 if not os.path.exists(filename_index):
jpayne@69 381 raise IOError("index `%s` not found" % filename_index)
jpayne@69 382
jpayne@69 383 # open file
jpayne@69 384 cdef char *cfilename = self.filename
jpayne@69 385 cdef char *cfilename_index = self.filename_index
jpayne@69 386 with nogil:
jpayne@69 387 self.htsfile = hts_open(cfilename, 'r')
jpayne@69 388
jpayne@69 389 if self.htsfile == NULL:
jpayne@69 390 raise IOError("could not open file `%s`" % filename)
jpayne@69 391
jpayne@69 392 #if self.htsfile.format.category != region_list:
jpayne@69 393 # raise ValueError("file does not contain region data")
jpayne@69 394
jpayne@69 395 with nogil:
jpayne@69 396 self.index = tbx_index_load2(cfilename, cfilename_index)
jpayne@69 397
jpayne@69 398 if self.index == NULL:
jpayne@69 399 raise IOError("could not open index for `%s`" % filename)
jpayne@69 400
jpayne@69 401 if not self.is_stream:
jpayne@69 402 self.start_offset = self.tell()
jpayne@69 403
jpayne@69 404 def _dup(self):
jpayne@69 405 '''return a copy of this tabix file.
jpayne@69 406
jpayne@69 407 The file is being re-opened.
jpayne@69 408 '''
jpayne@69 409 return TabixFile(self.filename,
jpayne@69 410 mode="r",
jpayne@69 411 threads=self.threads,
jpayne@69 412 parser=self.parser,
jpayne@69 413 index=self.filename_index,
jpayne@69 414 encoding=self.encoding)
jpayne@69 415
jpayne@69 416 def fetch(self,
jpayne@69 417 reference=None,
jpayne@69 418 start=None,
jpayne@69 419 end=None,
jpayne@69 420 region=None,
jpayne@69 421 parser=None,
jpayne@69 422 multiple_iterators=False):
jpayne@69 423 '''fetch one or more rows in a :term:`region` using 0-based
jpayne@69 424 indexing. The region is specified by :term:`reference`,
jpayne@69 425 *start* and *end*. Alternatively, a samtools :term:`region`
jpayne@69 426 string can be supplied.
jpayne@69 427
jpayne@69 428 Without *reference* or *region* all entries will be fetched.
jpayne@69 429
jpayne@69 430 If only *reference* is set, all reads matching on *reference*
jpayne@69 431 will be fetched.
jpayne@69 432
jpayne@69 433 If *parser* is None, the default parser will be used for
jpayne@69 434 parsing.
jpayne@69 435
jpayne@69 436 Set *multiple_iterators* to true if you will be using multiple
jpayne@69 437 iterators on the same file at the same time. The iterator
jpayne@69 438 returned will receive its own copy of a filehandle to the file
jpayne@69 439 effectively re-opening the file. Re-opening a file creates
jpayne@69 440 some overhead, so beware.
jpayne@69 441
jpayne@69 442 '''
jpayne@69 443 if not self.is_open():
jpayne@69 444 raise ValueError("I/O operation on closed file")
jpayne@69 445
jpayne@69 446 # convert coordinates to region string, which is one-based
jpayne@69 447 if reference:
jpayne@69 448 if end is not None:
jpayne@69 449 if end < 0:
jpayne@69 450 raise ValueError("end out of range (%i)" % end)
jpayne@69 451 if start is None:
jpayne@69 452 start = 0
jpayne@69 453
jpayne@69 454 if start < 0:
jpayne@69 455 raise ValueError("start out of range (%i)" % end)
jpayne@69 456 elif start > end:
jpayne@69 457 raise ValueError(
jpayne@69 458 'start (%i) >= end (%i)' % (start, end))
jpayne@69 459 elif start == end:
jpayne@69 460 return EmptyIterator()
jpayne@69 461 else:
jpayne@69 462 region = '%s:%i-%i' % (reference, start + 1, end)
jpayne@69 463 elif start is not None:
jpayne@69 464 if start < 0:
jpayne@69 465 raise ValueError("start out of range (%i)" % end)
jpayne@69 466 region = '%s:%i' % (reference, start + 1)
jpayne@69 467 else:
jpayne@69 468 region = reference
jpayne@69 469
jpayne@69 470 # get iterator
jpayne@69 471 cdef hts_itr_t * itr
jpayne@69 472 cdef char *cstr
jpayne@69 473 cdef TabixFile fileobj
jpayne@69 474
jpayne@69 475 # reopen the same file if necessary
jpayne@69 476 if multiple_iterators:
jpayne@69 477 fileobj = self._dup()
jpayne@69 478 else:
jpayne@69 479 fileobj = self
jpayne@69 480
jpayne@69 481 if region is None:
jpayne@69 482 # without region or reference - iterate from start
jpayne@69 483 with nogil:
jpayne@69 484 itr = tbx_itr_queryi(fileobj.index,
jpayne@69 485 HTS_IDX_START,
jpayne@69 486 0,
jpayne@69 487 0)
jpayne@69 488 else:
jpayne@69 489 s = force_bytes(region, encoding=fileobj.encoding)
jpayne@69 490 cstr = s
jpayne@69 491 with nogil:
jpayne@69 492 itr = tbx_itr_querys(fileobj.index, cstr)
jpayne@69 493
jpayne@69 494 if itr == NULL:
jpayne@69 495 if region is None:
jpayne@69 496 if len(self.contigs) > 0:
jpayne@69 497 # when accessing a tabix file created prior tabix 1.0
jpayne@69 498 # the full-file iterator is empty.
jpayne@69 499 raise ValueError(
jpayne@69 500 "could not create iterator, possible "
jpayne@69 501 "tabix version mismatch")
jpayne@69 502 else:
jpayne@69 503 # possible reason is that the file is empty -
jpayne@69 504 # return an empty iterator
jpayne@69 505 return EmptyIterator()
jpayne@69 506 else:
jpayne@69 507 raise ValueError(
jpayne@69 508 "could not create iterator for region '%s'" %
jpayne@69 509 region)
jpayne@69 510
jpayne@69 511 # use default parser if no parser is specified
jpayne@69 512 if parser is None:
jpayne@69 513 parser = fileobj.parser
jpayne@69 514
jpayne@69 515 cdef TabixIterator a
jpayne@69 516 if parser is None:
jpayne@69 517 a = TabixIterator(encoding=fileobj.encoding)
jpayne@69 518 else:
jpayne@69 519 parser.set_encoding(fileobj.encoding)
jpayne@69 520 a = TabixIteratorParsed(parser)
jpayne@69 521
jpayne@69 522 a.tabixfile = fileobj
jpayne@69 523 a.iterator = itr
jpayne@69 524
jpayne@69 525 return a
jpayne@69 526
jpayne@69 527 ###############################################################
jpayne@69 528 ###############################################################
jpayne@69 529 ###############################################################
jpayne@69 530 ## properties
jpayne@69 531 ###############################################################
jpayne@69 532 property header:
jpayne@69 533 '''the file header.
jpayne@69 534
jpayne@69 535 The file header consists of the lines at the beginning of a
jpayne@69 536 file that are prefixed by the comment character ``#``.
jpayne@69 537
jpayne@69 538 .. note::
jpayne@69 539 The header is returned as an iterator presenting lines
jpayne@69 540 without the newline character.
jpayne@69 541 '''
jpayne@69 542
jpayne@69 543 def __get__(self):
jpayne@69 544
jpayne@69 545 cdef char *cfilename = self.filename
jpayne@69 546 cdef char *cfilename_index = self.filename_index
jpayne@69 547
jpayne@69 548 cdef kstring_t buffer
jpayne@69 549 buffer.l = buffer.m = 0
jpayne@69 550 buffer.s = NULL
jpayne@69 551
jpayne@69 552 cdef htsFile * fp = NULL
jpayne@69 553 cdef int KS_SEP_LINE = 2
jpayne@69 554 cdef tbx_t * tbx = NULL
jpayne@69 555 lines = []
jpayne@69 556 with nogil:
jpayne@69 557 fp = hts_open(cfilename, 'r')
jpayne@69 558
jpayne@69 559 if fp == NULL:
jpayne@69 560 raise OSError("could not open {} for reading header".format(self.filename))
jpayne@69 561
jpayne@69 562 with nogil:
jpayne@69 563 tbx = tbx_index_load2(cfilename, cfilename_index)
jpayne@69 564
jpayne@69 565 if tbx == NULL:
jpayne@69 566 raise OSError("could not load .tbi/.csi index of {}".format(self.filename))
jpayne@69 567
jpayne@69 568 while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
jpayne@69 569 if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
jpayne@69 570 break
jpayne@69 571 lines.append(force_str(buffer.s, self.encoding))
jpayne@69 572
jpayne@69 573 with nogil:
jpayne@69 574 hts_close(fp)
jpayne@69 575 free(buffer.s)
jpayne@69 576
jpayne@69 577 return lines
jpayne@69 578
jpayne@69 579 property contigs:
jpayne@69 580 '''list of chromosome names'''
jpayne@69 581 def __get__(self):
jpayne@69 582 cdef const char ** sequences
jpayne@69 583 cdef int nsequences
jpayne@69 584
jpayne@69 585 with nogil:
jpayne@69 586 sequences = tbx_seqnames(self.index, &nsequences)
jpayne@69 587 cdef int x
jpayne@69 588 result = []
jpayne@69 589 for x from 0 <= x < nsequences:
jpayne@69 590 result.append(force_str(sequences[x]))
jpayne@69 591
jpayne@69 592 # htslib instructions:
jpayne@69 593 # only free container, not the sequences themselves
jpayne@69 594 free(sequences)
jpayne@69 595
jpayne@69 596 return result
jpayne@69 597
jpayne@69 598 def close(self):
jpayne@69 599 '''
jpayne@69 600 closes the :class:`pysam.TabixFile`.'''
jpayne@69 601 if self.htsfile != NULL:
jpayne@69 602 hts_close(self.htsfile)
jpayne@69 603 self.htsfile = NULL
jpayne@69 604 if self.index != NULL:
jpayne@69 605 tbx_destroy(self.index)
jpayne@69 606 self.index = NULL
jpayne@69 607
jpayne@69 608 def __dealloc__( self ):
jpayne@69 609 # remember: dealloc cannot call other python methods
jpayne@69 610 # note: no doc string
jpayne@69 611 # note: __del__ is not called.
jpayne@69 612 if self.htsfile != NULL:
jpayne@69 613 hts_close(self.htsfile)
jpayne@69 614 self.htsfile = NULL
jpayne@69 615 if self.index != NULL:
jpayne@69 616 tbx_destroy(self.index)
jpayne@69 617
jpayne@69 618
jpayne@69 619 cdef class TabixIterator:
jpayne@69 620 """iterates over rows in *tabixfile* in region
jpayne@69 621 given by *tid*, *start* and *end*.
jpayne@69 622 """
jpayne@69 623
jpayne@69 624 def __init__(self, encoding="ascii"):
jpayne@69 625 self.encoding = encoding
jpayne@69 626
jpayne@69 627 def __iter__(self):
jpayne@69 628 self.buffer.s = NULL
jpayne@69 629 self.buffer.l = 0
jpayne@69 630 self.buffer.m = 0
jpayne@69 631
jpayne@69 632 return self
jpayne@69 633
jpayne@69 634 cdef int __cnext__(self):
jpayne@69 635 '''iterate to next element.
jpayne@69 636
jpayne@69 637 Return -5 if file has been closed when this function
jpayne@69 638 was called.
jpayne@69 639 '''
jpayne@69 640 if self.tabixfile.htsfile == NULL:
jpayne@69 641 return -5
jpayne@69 642
jpayne@69 643 cdef int retval
jpayne@69 644
jpayne@69 645 while 1:
jpayne@69 646 with nogil:
jpayne@69 647 retval = tbx_itr_next(
jpayne@69 648 self.tabixfile.htsfile,
jpayne@69 649 self.tabixfile.index,
jpayne@69 650 self.iterator,
jpayne@69 651 &self.buffer)
jpayne@69 652
jpayne@69 653 if retval < 0:
jpayne@69 654 break
jpayne@69 655
jpayne@69 656 if self.buffer.s[0] != b'#':
jpayne@69 657 break
jpayne@69 658
jpayne@69 659 return retval
jpayne@69 660
jpayne@69 661 def __next__(self):
jpayne@69 662 """python version of next().
jpayne@69 663
jpayne@69 664 pyrex uses this non-standard name instead of next()
jpayne@69 665 """
jpayne@69 666
jpayne@69 667 cdef int retval = self.__cnext__()
jpayne@69 668 if retval == -5:
jpayne@69 669 raise IOError("iteration on closed file")
jpayne@69 670 elif retval < 0:
jpayne@69 671 raise StopIteration
jpayne@69 672
jpayne@69 673 return charptr_to_str(self.buffer.s, self.encoding)
jpayne@69 674
jpayne@69 675 def __dealloc__(self):
jpayne@69 676 if <void*>self.iterator != NULL:
jpayne@69 677 tbx_itr_destroy(self.iterator)
jpayne@69 678 if self.buffer.s != NULL:
jpayne@69 679 free(self.buffer.s)
jpayne@69 680
jpayne@69 681
jpayne@69 682 class EmptyIterator:
jpayne@69 683 '''empty iterator'''
jpayne@69 684
jpayne@69 685 def __iter__(self):
jpayne@69 686 return self
jpayne@69 687
jpayne@69 688 def __next__(self):
jpayne@69 689 raise StopIteration()
jpayne@69 690
jpayne@69 691
jpayne@69 692 cdef class TabixIteratorParsed(TabixIterator):
jpayne@69 693 """iterates over mapped reads in a region.
jpayne@69 694
jpayne@69 695 The *parser* determines the encoding.
jpayne@69 696
jpayne@69 697 Returns parsed data.
jpayne@69 698 """
jpayne@69 699
jpayne@69 700 def __init__(self, Parser parser):
jpayne@69 701 super().__init__()
jpayne@69 702 self.parser = parser
jpayne@69 703
jpayne@69 704 def __next__(self):
jpayne@69 705 """python version of next().
jpayne@69 706
jpayne@69 707 pyrex uses this non-standard name instead of next()
jpayne@69 708 """
jpayne@69 709
jpayne@69 710 cdef int retval = self.__cnext__()
jpayne@69 711 if retval == -5:
jpayne@69 712 raise IOError("iteration on closed file")
jpayne@69 713 elif retval < 0:
jpayne@69 714 raise StopIteration
jpayne@69 715
jpayne@69 716 return self.parser.parse(self.buffer.s,
jpayne@69 717 self.buffer.l)
jpayne@69 718
jpayne@69 719
jpayne@69 720 cdef class GZIterator:
jpayne@69 721 def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
jpayne@69 722 '''iterate line-by-line through gzip (or bgzip)
jpayne@69 723 compressed file.
jpayne@69 724 '''
jpayne@69 725 if not os.path.exists(filename):
jpayne@69 726 raise IOError("No such file or directory: %s" % filename)
jpayne@69 727
jpayne@69 728 filename = encode_filename(filename)
jpayne@69 729 cdef char *cfilename = filename
jpayne@69 730 with nogil:
jpayne@69 731 self.gzipfile = bgzf_open(cfilename, "r")
jpayne@69 732 self._filename = filename
jpayne@69 733 self.kstream = ks_init(self.gzipfile)
jpayne@69 734 self.encoding = encoding
jpayne@69 735
jpayne@69 736 self.buffer.l = 0
jpayne@69 737 self.buffer.m = 0
jpayne@69 738 self.buffer.s = <char*>malloc(buffer_size)
jpayne@69 739
jpayne@69 740 def __dealloc__(self):
jpayne@69 741 '''close file.'''
jpayne@69 742 if self.gzipfile != NULL:
jpayne@69 743 bgzf_close(self.gzipfile)
jpayne@69 744 self.gzipfile = NULL
jpayne@69 745 if self.buffer.s != NULL:
jpayne@69 746 free(self.buffer.s)
jpayne@69 747 if self.kstream != NULL:
jpayne@69 748 ks_destroy(self.kstream)
jpayne@69 749
jpayne@69 750 def __iter__(self):
jpayne@69 751 return self
jpayne@69 752
jpayne@69 753 cdef int __cnext__(self):
jpayne@69 754 cdef int dret = 0
jpayne@69 755 cdef int retval = 0
jpayne@69 756 while 1:
jpayne@69 757 with nogil:
jpayne@69 758 retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
jpayne@69 759
jpayne@69 760 if retval < 0:
jpayne@69 761 break
jpayne@69 762
jpayne@69 763 return dret
jpayne@69 764 return -1
jpayne@69 765
jpayne@69 766 def __next__(self):
jpayne@69 767 """python version of next().
jpayne@69 768 """
jpayne@69 769 cdef int retval = self.__cnext__()
jpayne@69 770 if retval < 0:
jpayne@69 771 raise StopIteration
jpayne@69 772 return force_str(self.buffer.s, self.encoding)
jpayne@69 773
jpayne@69 774
jpayne@69 775 cdef class GZIteratorHead(GZIterator):
jpayne@69 776 '''iterate line-by-line through gzip (or bgzip)
jpayne@69 777 compressed file returning comments at top of file.
jpayne@69 778 '''
jpayne@69 779
jpayne@69 780 def __next__(self):
jpayne@69 781 """python version of next().
jpayne@69 782 """
jpayne@69 783 cdef int retval = self.__cnext__()
jpayne@69 784 if retval < 0:
jpayne@69 785 raise StopIteration
jpayne@69 786 if self.buffer.s[0] == b'#':
jpayne@69 787 return self.buffer.s
jpayne@69 788 else:
jpayne@69 789 raise StopIteration
jpayne@69 790
jpayne@69 791
jpayne@69 792 cdef class GZIteratorParsed(GZIterator):
jpayne@69 793 '''iterate line-by-line through gzip (or bgzip)
jpayne@69 794 compressed file returning comments at top of file.
jpayne@69 795 '''
jpayne@69 796
jpayne@69 797 def __init__(self, parser):
jpayne@69 798 self.parser = parser
jpayne@69 799
jpayne@69 800 def __next__(self):
jpayne@69 801 """python version of next().
jpayne@69 802 """
jpayne@69 803 cdef int retval = self.__cnext__()
jpayne@69 804 if retval < 0:
jpayne@69 805 raise StopIteration
jpayne@69 806
jpayne@69 807 return self.parser.parse(self.buffer.s,
jpayne@69 808 self.buffer.l)
jpayne@69 809
jpayne@69 810
jpayne@69 811 def tabix_compress(filename_in,
jpayne@69 812 filename_out,
jpayne@69 813 force=False):
jpayne@69 814 '''compress *filename_in* writing the output to *filename_out*.
jpayne@69 815
jpayne@69 816 Raise an IOError if *filename_out* already exists, unless *force*
jpayne@69 817 is set.
jpayne@69 818 '''
jpayne@69 819
jpayne@69 820 if not force and os.path.exists(filename_out):
jpayne@69 821 raise IOError(
jpayne@69 822 "Filename '%s' already exists, use *force* to "
jpayne@69 823 "overwrite" % filename_out)
jpayne@69 824
jpayne@69 825 cdef int WINDOW_SIZE
jpayne@69 826 cdef int c, r
jpayne@69 827 cdef void * buffer
jpayne@69 828 cdef BGZF * fp
jpayne@69 829 cdef int fd_src
jpayne@69 830 cdef bint is_empty = True
jpayne@69 831 cdef int O_RDONLY
jpayne@69 832 O_RDONLY = os.O_RDONLY
jpayne@69 833
jpayne@69 834 WINDOW_SIZE = 64 * 1024
jpayne@69 835
jpayne@69 836 fn = encode_filename(filename_out)
jpayne@69 837 cdef char *cfn = fn
jpayne@69 838 with nogil:
jpayne@69 839 fp = bgzf_open(cfn, "w")
jpayne@69 840 if fp == NULL:
jpayne@69 841 raise IOError("could not open '%s' for writing" % filename_out)
jpayne@69 842
jpayne@69 843 fn = encode_filename(filename_in)
jpayne@69 844 fd_src = open(fn, O_RDONLY)
jpayne@69 845 if fd_src == 0:
jpayne@69 846 raise IOError("could not open '%s' for reading" % filename_in)
jpayne@69 847
jpayne@69 848 buffer = malloc(WINDOW_SIZE)
jpayne@69 849 c = 1
jpayne@69 850
jpayne@69 851 while c > 0:
jpayne@69 852 with nogil:
jpayne@69 853 c = read(fd_src, buffer, WINDOW_SIZE)
jpayne@69 854 if c > 0:
jpayne@69 855 is_empty = False
jpayne@69 856 r = bgzf_write(fp, buffer, c)
jpayne@69 857 if r < 0:
jpayne@69 858 free(buffer)
jpayne@69 859 raise IOError("writing failed")
jpayne@69 860
jpayne@69 861 free(buffer)
jpayne@69 862 r = bgzf_close(fp)
jpayne@69 863 if r < 0:
jpayne@69 864 raise IOError("error %i when writing to file %s" % (r, filename_out))
jpayne@69 865
jpayne@69 866 r = close(fd_src)
jpayne@69 867 # an empty file will return with -1, thus ignore this.
jpayne@69 868 if r < 0:
jpayne@69 869 if not (r == -1 and is_empty):
jpayne@69 870 raise IOError("error %i when closing file %s" % (r, filename_in))
jpayne@69 871
jpayne@69 872
jpayne@69 873 def tabix_index(filename,
jpayne@69 874 force=False,
jpayne@69 875 seq_col=None,
jpayne@69 876 start_col=None,
jpayne@69 877 end_col=None,
jpayne@69 878 preset=None,
jpayne@69 879 meta_char="#",
jpayne@69 880 int line_skip=0,
jpayne@69 881 zerobased=False,
jpayne@69 882 int min_shift=-1,
jpayne@69 883 index=None,
jpayne@69 884 keep_original=False,
jpayne@69 885 csi=False,
jpayne@69 886 ):
jpayne@69 887 '''index tab-separated *filename* using tabix.
jpayne@69 888
jpayne@69 889 An existing index will not be overwritten unless *force* is set.
jpayne@69 890
jpayne@69 891 The index will be built from coordinates in columns *seq_col*,
jpayne@69 892 *start_col* and *end_col*.
jpayne@69 893
jpayne@69 894 The contents of *filename* have to be sorted by contig and
jpayne@69 895 position - the method does not check if the file is sorted.
jpayne@69 896
jpayne@69 897 Column indices are 0-based. Note that this is different from the
jpayne@69 898 tabix command line utility where column indices start at 1.
jpayne@69 899
jpayne@69 900 Coordinates in the file are assumed to be 1-based unless
jpayne@69 901 *zerobased* is set.
jpayne@69 902
jpayne@69 903 If *preset* is provided, the column coordinates are taken from a
jpayne@69 904 preset. Valid values for preset are "gff", "bed", "sam", "vcf",
jpayne@69 905 psltbl", "pileup".
jpayne@69 906
jpayne@69 907 Lines beginning with *meta_char* and the first *line_skip* lines
jpayne@69 908 will be skipped.
jpayne@69 909
jpayne@69 910 If *filename* is not detected as a gzip file it will be automatically
jpayne@69 911 compressed. The original file will be removed and only the compressed
jpayne@69 912 file will be retained.
jpayne@69 913
jpayne@69 914 By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
jpayne@69 915 is greater than zero and/or *csi* is True, creates a CSI index with a
jpayne@69 916 minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).
jpayne@69 917
jpayne@69 918 *index* controls the filename which should be used for creating the index.
jpayne@69 919 If not set, the default is to append ``.tbi`` to *filename*.
jpayne@69 920
jpayne@69 921 When automatically compressing files, if *keep_original* is set the
jpayne@69 922 uncompressed file will not be deleted.
jpayne@69 923
jpayne@69 924 returns the filename of the compressed data
jpayne@69 925
jpayne@69 926 '''
jpayne@69 927
jpayne@69 928 if preset is None and \
jpayne@69 929 (seq_col is None or start_col is None or end_col is None):
jpayne@69 930 raise ValueError(
jpayne@69 931 "neither preset nor seq_col,start_col and end_col given")
jpayne@69 932
jpayne@69 933 fn = encode_filename(filename)
jpayne@69 934 cdef char *cfn = fn
jpayne@69 935
jpayne@69 936 cdef htsFile *fp = hts_open(cfn, "r")
jpayne@69 937 if fp == NULL:
jpayne@69 938 raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))
jpayne@69 939
jpayne@69 940 cdef htsFormat fmt = hts_get_format(fp)[0]
jpayne@69 941 hts_close(fp)
jpayne@69 942
jpayne@69 943 if fmt.compression == no_compression:
jpayne@69 944 tabix_compress(filename, filename + ".gz", force=force)
jpayne@69 945 if not keep_original:
jpayne@69 946 os.unlink(filename)
jpayne@69 947 filename += ".gz"
jpayne@69 948 fn = encode_filename(filename)
jpayne@69 949 cfn = fn
jpayne@69 950
jpayne@69 951 # columns (1-based):
jpayne@69 952 # preset-code, contig, start, end, metachar for
jpayne@69 953 # comments, lines to ignore at beginning
jpayne@69 954 # 0 is a missing column
jpayne@69 955 preset2conf = {
jpayne@69 956 'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
jpayne@69 957 'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
jpayne@69 958 'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
jpayne@69 959 'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
jpayne@69 960 'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
jpayne@69 961 }
jpayne@69 962
jpayne@69 963 conf_data = None
jpayne@69 964 if preset == "bcf" or fmt.format == bcf:
jpayne@69 965 csi = True
jpayne@69 966 elif preset:
jpayne@69 967 try:
jpayne@69 968 conf_data = preset2conf[preset]
jpayne@69 969 except KeyError:
jpayne@69 970 raise KeyError(
jpayne@69 971 "unknown preset '%s', valid presets are '%s'" %
jpayne@69 972 (preset, ",".join(preset2conf.keys())))
jpayne@69 973 else:
jpayne@69 974 if end_col is None:
jpayne@69 975 end_col = -1
jpayne@69 976
jpayne@69 977 preset = 0
jpayne@69 978 # tabix internally works with 0-based coordinates and
jpayne@69 979 # open/closed intervals. When using a preset, conversion is
jpayne@69 980 # automatically taken care of. Otherwise, the coordinates are
jpayne@69 981 # assumed to be 1-based closed intervals and -1 is subtracted
jpayne@69 982 # from the start coordinate. To avoid doing this, set the
jpayne@69 983 # TI_FLAG_UCSC=0x10000 flag:
jpayne@69 984 if zerobased:
jpayne@69 985 preset = preset | TBX_UCSC
jpayne@69 986
jpayne@69 987 conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)
jpayne@69 988
jpayne@69 989 cdef tbx_conf_t conf
jpayne@69 990 if conf_data:
jpayne@69 991 conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
jpayne@69 992
jpayne@69 993 if csi or min_shift > 0:
jpayne@69 994 suffix = ".csi"
jpayne@69 995 if min_shift <= 0: min_shift = 14
jpayne@69 996 else:
jpayne@69 997 suffix = ".tbi"
jpayne@69 998 min_shift = 0
jpayne@69 999
jpayne@69 1000 index = index or filename + suffix
jpayne@69 1001 fn_index = encode_filename(index)
jpayne@69 1002
jpayne@69 1003 if not force and os.path.exists(index):
jpayne@69 1004 raise IOError(
jpayne@69 1005 "filename '%s' already exists, use *force* to overwrite" % index)
jpayne@69 1006
jpayne@69 1007 cdef char *fnidx = fn_index
jpayne@69 1008 cdef int retval = 0
jpayne@69 1009
jpayne@69 1010 if csi and fmt.format == bcf:
jpayne@69 1011 with nogil:
jpayne@69 1012 retval = bcf_index_build2(cfn, fnidx, min_shift)
jpayne@69 1013 else:
jpayne@69 1014 with nogil:
jpayne@69 1015 retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)
jpayne@69 1016
jpayne@69 1017 if retval != 0:
jpayne@69 1018 raise OSError("building of index for {} failed".format(filename))
jpayne@69 1019
jpayne@69 1020 return filename
jpayne@69 1021
jpayne@69 1022 # #########################################################
jpayne@69 1023 # cdef class tabix_file_iterator_old:
jpayne@69 1024 # '''iterate over ``infile``.
jpayne@69 1025
jpayne@69 1026 # This iterator is not safe. If the :meth:`__next__()` method is called
jpayne@69 1027 # after ``infile`` is closed, the result is undefined (see ``fclose()``).
jpayne@69 1028
jpayne@69 1029 # The iterator might either raise a StopIteration or segfault.
jpayne@69 1030 # '''
jpayne@69 1031
jpayne@69 1032
jpayne@69 1033 # def __cinit__(self,
jpayne@69 1034 # infile,
jpayne@69 1035 # Parser parser,
jpayne@69 1036 # int buffer_size = 65536 ):
jpayne@69 1037
jpayne@69 1038 # cdef int fd = PyObject_AsFileDescriptor( infile )
jpayne@69 1039 # if fd == -1: raise ValueError( "I/O operation on closed file." )
jpayne@69 1040 # self.infile = fdopen( fd, 'r')
jpayne@69 1041
jpayne@69 1042 # if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
jpayne@69 1043
jpayne@69 1044 # self.buffer = <char*>malloc( buffer_size )
jpayne@69 1045 # self.size = buffer_size
jpayne@69 1046 # self.parser = parser
jpayne@69 1047
jpayne@69 1048 # def __iter__(self):
jpayne@69 1049 # return self
jpayne@69 1050
jpayne@69 1051 # cdef __cnext__(self):
jpayne@69 1052
jpayne@69 1053 # cdef char * b
jpayne@69 1054 # cdef size_t nbytes
jpayne@69 1055 # b = self.buffer
jpayne@69 1056
jpayne@69 1057 # while not feof( self.infile ):
jpayne@69 1058 # nbytes = getline( &b, &self.size, self.infile)
jpayne@69 1059
jpayne@69 1060 # # stop at first error or eof
jpayne@69 1061 # if (nbytes == -1): break
jpayne@69 1062 # # skip comments
jpayne@69 1063 # if (b[0] == '#'): continue
jpayne@69 1064
jpayne@69 1065 # # skip empty lines
jpayne@69 1066 # if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
jpayne@69 1067
jpayne@69 1068 # # make sure that entry is complete
jpayne@69 1069 # if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
jpayne@69 1070 # result = b
jpayne@69 1071 # raise ValueError( "incomplete line at %s" % result )
jpayne@69 1072
jpayne@69 1073 # # make sure that this goes fully through C
jpayne@69 1074 # # otherwise buffer is copied to/from a
jpayne@69 1075 # # Python object causing segfaults as
jpayne@69 1076 # # the wrong memory is freed
jpayne@69 1077 # return self.parser.parse( b, nbytes )
jpayne@69 1078
jpayne@69 1079 # raise StopIteration
jpayne@69 1080
jpayne@69 1081 # def __dealloc__(self):
jpayne@69 1082 # free(self.buffer)
jpayne@69 1083
jpayne@69 1084 # def __next__(self):
jpayne@69 1085 # return self.__cnext__()
jpayne@69 1086
jpayne@69 1087 #########################################################
jpayne@69 1088 #########################################################
jpayne@69 1089 #########################################################
jpayne@69 1090 ## Iterators for parsing through unindexed files.
jpayne@69 1091 #########################################################
jpayne@69 1092 # cdef buildGzipError(void *gzfp):
jpayne@69 1093 # cdef int errnum = 0
jpayne@69 1094 # cdef char *s = gzerror(gzfp, &errnum)
jpayne@69 1095 # return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
jpayne@69 1096
jpayne@69 1097
jpayne@69 1098 cdef class tabix_file_iterator:
jpayne@69 1099 '''iterate over a compressed or uncompressed ``infile``.
jpayne@69 1100 '''
jpayne@69 1101
jpayne@69 1102 def __cinit__(self,
jpayne@69 1103 infile,
jpayne@69 1104 Parser parser,
jpayne@69 1105 int buffer_size=65536):
jpayne@69 1106
jpayne@69 1107 if infile.closed:
jpayne@69 1108 raise ValueError("I/O operation on closed file.")
jpayne@69 1109
jpayne@69 1110 self.infile = infile
jpayne@69 1111
jpayne@69 1112 cdef int fd = PyObject_AsFileDescriptor(infile)
jpayne@69 1113 if fd == -1:
jpayne@69 1114 raise ValueError("I/O operation on closed file.")
jpayne@69 1115
jpayne@69 1116 self.duplicated_fd = dup(fd)
jpayne@69 1117
jpayne@69 1118 # From the manual:
jpayne@69 1119 # gzopen can be used to read a file which is not in gzip format;
jpayne@69 1120 # in this case gzread will directly read from the file without decompression.
jpayne@69 1121 # When reading, this will be detected automatically by looking
jpayne@69 1122 # for the magic two-byte gzip header.
jpayne@69 1123 self.fh = bgzf_dopen(self.duplicated_fd, 'r')
jpayne@69 1124
jpayne@69 1125 if self.fh == NULL:
jpayne@69 1126 raise IOError('%s' % strerror(errno))
jpayne@69 1127
jpayne@69 1128 self.kstream = ks_init(self.fh)
jpayne@69 1129
jpayne@69 1130 self.buffer.s = <char*>malloc(buffer_size)
jpayne@69 1131 #if self.buffer == NULL:
jpayne@69 1132 # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
jpayne@69 1133 #self.size = buffer_size
jpayne@69 1134 self.parser = parser
jpayne@69 1135
jpayne@69 1136 def __iter__(self):
jpayne@69 1137 return self
jpayne@69 1138
jpayne@69 1139 cdef __cnext__(self):
jpayne@69 1140
jpayne@69 1141 cdef char * b
jpayne@69 1142 cdef int dret = 0
jpayne@69 1143 cdef int retval = 0
jpayne@69 1144 while 1:
jpayne@69 1145 with nogil:
jpayne@69 1146 retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
jpayne@69 1147
jpayne@69 1148 if retval < 0:
jpayne@69 1149 break
jpayne@69 1150 #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
jpayne@69 1151
jpayne@69 1152 b = self.buffer.s
jpayne@69 1153
jpayne@69 1154 # skip comments
jpayne@69 1155 if (b[0] == b'#'):
jpayne@69 1156 continue
jpayne@69 1157
jpayne@69 1158 # skip empty lines
jpayne@69 1159 if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
jpayne@69 1160 continue
jpayne@69 1161
jpayne@69 1162 # gzgets terminates at \n, no need to test
jpayne@69 1163
jpayne@69 1164 # parser creates a copy
jpayne@69 1165 return self.parser.parse(b, self.buffer.l)
jpayne@69 1166
jpayne@69 1167 raise StopIteration
jpayne@69 1168
jpayne@69 1169 def __dealloc__(self):
jpayne@69 1170 free(self.buffer.s)
jpayne@69 1171 ks_destroy(self.kstream)
jpayne@69 1172 bgzf_close(self.fh)
jpayne@69 1173
jpayne@69 1174 def __next__(self):
jpayne@69 1175 return self.__cnext__()
jpayne@69 1176
jpayne@69 1177
jpayne@69 1178 class tabix_generic_iterator:
jpayne@69 1179 '''iterate over ``infile``.
jpayne@69 1180
jpayne@69 1181 Permits the use of file-like objects for example from the gzip module.
jpayne@69 1182 '''
jpayne@69 1183 def __init__(self, infile, parser):
jpayne@69 1184
jpayne@69 1185 self.infile = infile
jpayne@69 1186 if self.infile.closed:
jpayne@69 1187 raise ValueError("I/O operation on closed file.")
jpayne@69 1188 self.parser = parser
jpayne@69 1189
jpayne@69 1190 def __iter__(self):
jpayne@69 1191 return self
jpayne@69 1192
jpayne@69 1193 # cython version - required for python 3
jpayne@69 1194 def __next__(self):
jpayne@69 1195
jpayne@69 1196 cdef char * b
jpayne@69 1197 cdef char * cpy
jpayne@69 1198 cdef size_t nbytes
jpayne@69 1199
jpayne@69 1200 encoding = self.parser.get_encoding()
jpayne@69 1201
jpayne@69 1202 # note that GzipFile.close() does not close the file
jpayne@69 1203 # reading is still possible.
jpayne@69 1204 if self.infile.closed:
jpayne@69 1205 raise ValueError("I/O operation on closed file.")
jpayne@69 1206
jpayne@69 1207 while 1:
jpayne@69 1208
jpayne@69 1209 line = self.infile.readline()
jpayne@69 1210 if not line:
jpayne@69 1211 break
jpayne@69 1212
jpayne@69 1213 s = force_bytes(line, encoding)
jpayne@69 1214 b = s
jpayne@69 1215 nbytes = len(line)
jpayne@69 1216 assert b[nbytes] == b'\0'
jpayne@69 1217
jpayne@69 1218 # skip comments
jpayne@69 1219 if b[0] == b'#':
jpayne@69 1220 continue
jpayne@69 1221
jpayne@69 1222 # skip empty lines
jpayne@69 1223 if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
jpayne@69 1224 continue
jpayne@69 1225
jpayne@69 1226 # make sure that entry is complete
jpayne@69 1227 if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
jpayne@69 1228 raise ValueError("incomplete line at %s" % line)
jpayne@69 1229
jpayne@69 1230 bytes_cpy = <bytes> b
jpayne@69 1231 cpy = <char *> bytes_cpy
jpayne@69 1232
jpayne@69 1233 return self.parser(cpy, nbytes)
jpayne@69 1234
jpayne@69 1235 raise StopIteration
jpayne@69 1236
jpayne@69 1237
jpayne@69 1238 def tabix_iterator(infile, parser):
jpayne@69 1239 """return an iterator over all entries in a file.
jpayne@69 1240
jpayne@69 1241 Results are returned parsed as specified by the *parser*. If
jpayne@69 1242 *parser* is None, the results are returned as an unparsed string.
jpayne@69 1243 Otherwise, *parser* is assumed to be a functor that will return
jpayne@69 1244 parsed data (see for example :class:`~pysam.asTuple` and
jpayne@69 1245 :class:`~pysam.asGTF`).
jpayne@69 1246
jpayne@69 1247 """
jpayne@69 1248 return tabix_generic_iterator(infile, parser)
jpayne@69 1249
jpayne@69 1250
jpayne@69 1251 cdef class Tabixfile(TabixFile):
jpayne@69 1252 """Tabixfile is deprecated: use TabixFile instead"""
jpayne@69 1253 pass
jpayne@69 1254
jpayne@69 1255
jpayne@69 1256 __all__ = [
jpayne@69 1257 "tabix_index",
jpayne@69 1258 "tabix_compress",
jpayne@69 1259 "TabixFile",
jpayne@69 1260 "Tabixfile",
jpayne@69 1261 "asTuple",
jpayne@69 1262 "asGTF",
jpayne@69 1263 "asGFF3",
jpayne@69 1264 "asVCF",
jpayne@69 1265 "asBed",
jpayne@69 1266 "GZIterator",
jpayne@69 1267 "GZIteratorHead",
jpayne@69 1268 "tabix_iterator",
jpayne@69 1269 "tabix_generic_iterator",
jpayne@69 1270 "tabix_file_iterator",
jpayne@69 1271 ]