comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 # cython: language_level=3
2 # cython: embedsignature=True
3 # cython: profile=True
4 ###############################################################################
5 ###############################################################################
6 # Cython wrapper for access to tabix indexed files in bgzf format
7 ###############################################################################
8 # The principal classes and functions defined in this module are:
9 #
10 # class TabixFile class wrapping tabix indexed files in bgzf format
11 #
12 # class asTuple Parser class for tuples
13 # class asGTF Parser class for GTF formatted rows
14 # class asGFF3 Parser class for GFF3 formatted rows
15 # class asBed Parser class for Bed formatted rows
16 # class asVCF Parser class for VCF formatted rows
17 #
18 # class tabix_generic_iterator Streamed iterator of bgzf formatted files
19 #
20 # Additionally this module defines several additional classes that are part
21 # of the internal API. These are:
22 #
23 # class Parser base class for parsers of tab-separated rows
24 # class tabix_file_iterator
25 # class TabixIterator iterator class over rows in bgzf file
26 # class EmptyIterator
27 #
28 # For backwards compatibility, the following classes are also defined:
29 #
30 # class Tabixfile equivalent to TabixFile
31 #
32 ###############################################################################
33 #
34 # The MIT License
35 #
36 # Copyright (c) 2015 Andreas Heger
37 #
38 # Permission is hereby granted, free of charge, to any person obtaining a
39 # copy of this software and associated documentation files (the "Software"),
40 # to deal in the Software without restriction, including without limitation
41 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
42 # and/or sell copies of the Software, and to permit persons to whom the
43 # Software is furnished to do so, subject to the following conditions:
44 #
45 # The above copyright notice and this permission notice shall be included in
46 # all copies or substantial portions of the Software.
47 #
48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
54 # DEALINGS IN THE SOFTWARE.
55 #
56 ###############################################################################
57 import os
58 import sys
59
60 from libc.stdio cimport printf, fprintf, stderr
61 from libc.string cimport strerror
62 from libc.errno cimport errno
63 from posix.unistd cimport dup
64
65 from cpython cimport PyErr_SetString, PyBytes_Check, \
66 PyUnicode_Check, PyBytes_FromStringAndSize, \
67 PyObject_AsFileDescriptor
68
69 cimport pysam.libctabixproxies as ctabixproxies
70
71 from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
72 BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
73 tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
74 tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
75 tbx_destroy, hisremote, region_list, hts_getline, \
76 TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
77 no_compression, bcf, bcf_index_build2
78
79 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
80 from pysam.libcutils cimport encode_filename, from_string_and_size
81
cdef class Parser:
    """Base class of all row parsers.

    A parser is a functor: calling it with a raw ``char *`` buffer and
    its length returns a parsed row object. Subclasses implement
    ``parse``; this base class only manages the field *encoding*.
    """

    def __init__(self, encoding="ascii"):
        # encoding used when decoding byte fields into python strings
        self.encoding = encoding

    def set_encoding(self, encoding):
        """set the encoding used when decoding fields."""
        self.encoding = encoding

    def get_encoding(self):
        """return the encoding used when decoding fields."""
        return self.encoding

    cdef parse(self, char * buffer, int length):
        # abstract method - subclasses return a row proxy object
        raise NotImplementedError(
            'parse method of %s not implemented' % str(self))

    def __call__(self, char * buffer, int length):
        # make the parser usable as a functor
        return self.parse(buffer, length)
99
100
cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    A field in a row is accessed by numeric index.
    '''
    cdef parse(self, char * buffer, int len):
        # The iterator reuses the underlying line buffer, so the proxy
        # must take its own copy of the data to remain valid.
        cdef ctabixproxies.TupleProxy proxy = ctabixproxies.TupleProxy(self.encoding)
        proxy.copy(buffer, len)
        return proxy
113
114
cdef class asGFF3(Parser):
    '''converts a :term:`tabix row` into a GFF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |3         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    '''
    cdef parse(self, char * buffer, int len):
        # copy the line so the proxy owns its data (the iterator
        # reuses the underlying buffer)
        cdef ctabixproxies.GFF3Proxy r
        r = ctabixproxies.GFF3Proxy(self.encoding)
        r.copy(buffer, len)
        return r
149
150
cdef class asGTF(Parser):
    '''converts a :term:`tabix row` into a GTF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |3         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    GTF formatted entries also define the following fields that
    are derived from the attributes field:

    +--------------------+------------------------------+
    |*Name*              |*Content*                     |
    +--------------------+------------------------------+
    |gene_id             |the gene identifier           |
    +--------------------+------------------------------+
    |transcript_id       |the transcript identifier     |
    +--------------------+------------------------------+

    '''
    cdef parse(self, char * buffer, int len):
        # copy the line so the proxy owns its data (the iterator
        # reuses the underlying buffer)
        cdef ctabixproxies.GTFProxy r
        r = ctabixproxies.GTFProxy(self.encoding)
        r.copy(buffer, len)
        return r
196
197
cdef class asBed(Parser):
    '''converts a :term:`tabix row` into a bed record
    with the following fields:

    +-----------+-----------+------------------------------------------+
    |*Column*   |*Field*    |*Contents*                                |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |1          |contig     |contig                                    |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |2          |start      |genomic start coordinate (zero-based)     |
    +-----------+-----------+------------------------------------------+
    |3          |end        |genomic end coordinate plus one           |
    |           |           |(zero-based)                              |
    +-----------+-----------+------------------------------------------+
    |4          |name       |name of feature.                          |
    +-----------+-----------+------------------------------------------+
    |5          |score      |score of feature                          |
    +-----------+-----------+------------------------------------------+
    |6          |strand     |strand of feature                         |
    +-----------+-----------+------------------------------------------+
    |7          |thickStart |thickStart                                |
    +-----------+-----------+------------------------------------------+
    |8          |thickEnd   |thickEnd                                  |
    +-----------+-----------+------------------------------------------+
    |9          |itemRGB    |itemRGB                                   |
    +-----------+-----------+------------------------------------------+
    |10         |blockCount |number of bocks                           |
    +-----------+-----------+------------------------------------------+
    |11         |blockSizes |',' separated string of block sizes       |
    +-----------+-----------+------------------------------------------+
    |12         |blockStarts|',' separated string of block genomic     |
    |           |           |start positions                           |
    +-----------+-----------+------------------------------------------+

    Only the first three fields are required. Additional
    fields are optional, but if one is defined, all the preceding
    need to be defined as well.

    '''
    cdef parse(self, char * buffer, int len):
        # The iterator reuses the underlying line buffer, so the proxy
        # must take its own copy of the data to remain valid.
        cdef ctabixproxies.BedProxy proxy = ctabixproxies.BedProxy(self.encoding)
        proxy.copy(buffer, len)
        return proxy
244
245
cdef class asVCF(Parser):
    '''converts a :term:`tabix row` into a VCF record with
    the following fields:

    +----------+---------+------------------------------------+
    |*Column*  |*Field*  |*Contents*                          |
    |          |         |                                    |
    +----------+---------+------------------------------------+
    |1         |contig   |chromosome                          |
    +----------+---------+------------------------------------+
    |2         |pos      |chromosomal position, zero-based    |
    +----------+---------+------------------------------------+
    |3         |id       |id                                  |
    +----------+---------+------------------------------------+
    |4         |ref      |reference allele                    |
    +----------+---------+------------------------------------+
    |5         |alt      |alternate alleles                   |
    +----------+---------+------------------------------------+
    |6         |qual     |quality                             |
    +----------+---------+------------------------------------+
    |7         |filter   |filter                              |
    +----------+---------+------------------------------------+
    |8         |info     |info                                |
    +----------+---------+------------------------------------+
    |9         |format   |format specifier.                   |
    +----------+---------+------------------------------------+

    Access to genotypes is via index::

        contig = vcf.contig
        first_sample_genotype = vcf[0]
        second_sample_genotype = vcf[1]

    '''
    cdef parse(self, char * buffer, int len):
        # The iterator reuses the underlying line buffer, so the proxy
        # must take its own copy of the data to remain valid.
        cdef ctabixproxies.VCFProxy proxy = ctabixproxies.VCFProxy(self.encoding)
        proxy.copy(buffer, len)
        return proxy
285
286
cdef class TabixFile:
    """Random access to bgzf formatted files that
    have been indexed by :term:`tabix`.

    The file is automatically opened. The index file of file
    ``<filename>`` is expected to be called ``<filename>.tbi``
    by default (see parameter `index`).

    Parameters
    ----------

    filename : string
        Filename of bgzf file to be opened.

    index : string
        The filename of the index. If not set, the default is to
        assume that the index is called ``filename.tbi``

    mode : char
        The file opening mode. Currently, only ``r`` is permitted.

    parser : :class:`pysam.Parser`

        sets the default parser for this tabix file. If `parser`
        is None, the results are returned as an unparsed string.
        Otherwise, `parser` is assumed to be a functor that will return
        parsed data (see for example :class:`~pysam.asTuple` and
        :class:`~pysam.asGTF`).

    encoding : string

        The encoding passed to the parser

    threads: integer
        Number of threads to use for decompressing Tabix files.
        (Default=1)


    Raises
    ------

    ValueError
        if index file is missing.

    IOError
        if file could not be opened
    """
    def __cinit__(self,
                  filename,
                  mode='r',
                  parser=None,
                  index=None,
                  encoding="ascii",
                  threads=1,
                  *args,
                  **kwargs ):

        self.htsfile = NULL
        self.is_remote = False
        self.is_stream = False
        self.parser = parser
        self.threads = threads
        # BUG FIX: pass threads through to _open(); previously it was
        # omitted, so _open() reset self.threads back to its default of
        # 1 and the user-supplied value was silently ignored.
        self._open(filename, mode, index, threads, *args, **kwargs)
        self.encoding = encoding

    def _open( self,
               filename,
               mode='r',
               index=None,
               threads=1,
               ):
        '''open a :term:`tabix file` for reading.'''

        if mode != 'r':
            raise ValueError("invalid file opening mode `%s`" % mode)

        # re-opening: release any previously held handles first
        if self.htsfile != NULL:
            self.close()
        self.htsfile = NULL
        self.threads=threads

        filename_index = index or (filename + ".tbi")
        # encode all the strings to pass to tabix
        self.filename = encode_filename(filename)
        self.filename_index = encode_filename(filename_index)

        self.is_stream = self.filename == b'-'
        self.is_remote = hisremote(self.filename)

        # existence checks only make sense for local files
        if not self.is_remote:
            if not os.path.exists(filename):
                raise IOError("file `%s` not found" % filename)

            if not os.path.exists(filename_index):
                raise IOError("index `%s` not found" % filename_index)

        # open file
        cdef char *cfilename = self.filename
        cdef char *cfilename_index = self.filename_index
        with nogil:
            self.htsfile = hts_open(cfilename, 'r')

        if self.htsfile == NULL:
            raise IOError("could not open file `%s`" % filename)

        #if self.htsfile.format.category != region_list:
        #    raise ValueError("file does not contain region data")

        with nogil:
            self.index = tbx_index_load2(cfilename, cfilename_index)

        if self.index == NULL:
            raise IOError("could not open index for `%s`" % filename)

        if not self.is_stream:
            self.start_offset = self.tell()

    def _dup(self):
        '''return a copy of this tabix file.

        The file is being re-opened.
        '''
        return TabixFile(self.filename,
                         mode="r",
                         threads=self.threads,
                         parser=self.parser,
                         index=self.filename_index,
                         encoding=self.encoding)

    def fetch(self,
              reference=None,
              start=None,
              end=None,
              region=None,
              parser=None,
              multiple_iterators=False):
        '''fetch one or more rows in a :term:`region` using 0-based
        indexing. The region is specified by :term:`reference`,
        *start* and *end*. Alternatively, a samtools :term:`region`
        string can be supplied.

        Without *reference* or *region* all entries will be fetched.

        If only *reference* is set, all reads matching on *reference*
        will be fetched.

        If *parser* is None, the default parser will be used for
        parsing.

        Set *multiple_iterators* to true if you will be using multiple
        iterators on the same file at the same time. The iterator
        returned will receive its own copy of a filehandle to the file
        effectively re-opening the file. Re-opening a file creates
        some overhead, so beware.

        '''
        if not self.is_open():
            raise ValueError("I/O operation on closed file")

        # convert coordinates to region string, which is one-based
        if reference:
            if end is not None:
                if end < 0:
                    raise ValueError("end out of range (%i)" % end)
                if start is None:
                    start = 0

                if start < 0:
                    # BUG FIX: message previously formatted ``end``
                    # instead of the offending ``start`` value
                    raise ValueError("start out of range (%i)" % start)
                elif start > end:
                    # BUG FIX: message said ">=" but start == end is
                    # handled separately below
                    raise ValueError(
                        'start (%i) > end (%i)' % (start, end))
                elif start == end:
                    # empty interval - nothing can match
                    return EmptyIterator()
                else:
                    region = '%s:%i-%i' % (reference, start + 1, end)
            elif start is not None:
                if start < 0:
                    # BUG FIX: ``end`` is None in this branch, so the
                    # old "%i" % end raised TypeError instead of the
                    # intended ValueError
                    raise ValueError("start out of range (%i)" % start)
                region = '%s:%i' % (reference, start + 1)
            else:
                region = reference

        # get iterator
        cdef hts_itr_t * itr
        cdef char *cstr
        cdef TabixFile fileobj

        # reopen the same file if necessary
        if multiple_iterators:
            fileobj = self._dup()
        else:
            fileobj = self

        if region is None:
            # without region or reference - iterate from start
            with nogil:
                itr = tbx_itr_queryi(fileobj.index,
                                     HTS_IDX_START,
                                     0,
                                     0)
        else:
            s = force_bytes(region, encoding=fileobj.encoding)
            cstr = s
            with nogil:
                itr = tbx_itr_querys(fileobj.index, cstr)

        if itr == NULL:
            if region is None:
                if len(self.contigs) > 0:
                    # when accessing a tabix file created prior tabix 1.0
                    # the full-file iterator is empty.
                    raise ValueError(
                        "could not create iterator, possible "
                        "tabix version mismatch")
                else:
                    # possible reason is that the file is empty -
                    # return an empty iterator
                    return EmptyIterator()
            else:
                raise ValueError(
                    "could not create iterator for region '%s'" %
                    region)

        # use default parser if no parser is specified
        if parser is None:
            parser = fileobj.parser

        cdef TabixIterator a
        if parser is None:
            a = TabixIterator(encoding=fileobj.encoding)
        else:
            parser.set_encoding(fileobj.encoding)
            a = TabixIteratorParsed(parser)

        a.tabixfile = fileobj
        a.iterator = itr

        return a

    ###############################################################
    ###############################################################
    ###############################################################
    ## properties
    ###############################################################
    property header:
        '''the file header.

        The file header consists of the lines at the beginning of a
        file that are prefixed by the comment character ``#``.

        .. note::
            The header is returned as an iterator presenting lines
            without the newline character.
        '''

        def __get__(self):

            cdef char *cfilename = self.filename
            cdef char *cfilename_index = self.filename_index

            cdef kstring_t buffer
            buffer.l = buffer.m = 0
            buffer.s = NULL

            cdef htsFile * fp = NULL
            cdef int KS_SEP_LINE = 2
            cdef tbx_t * tbx = NULL
            lines = []
            with nogil:
                fp = hts_open(cfilename, 'r')

            if fp == NULL:
                raise OSError("could not open {} for reading header".format(self.filename))

            with nogil:
                tbx = tbx_index_load2(cfilename, cfilename_index)

            if tbx == NULL:
                # RESOURCE FIX: close the file handle before raising -
                # previously fp leaked on this error path
                with nogil:
                    hts_close(fp)
                raise OSError("could not load .tbi/.csi index of {}".format(self.filename))

            # collect the leading lines that start with the
            # index-configured meta character
            while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
                if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
                    break
                lines.append(force_str(buffer.s, self.encoding))

            # RESOURCE FIX: the index loaded above was never released,
            # leaking memory on every header access
            with nogil:
                tbx_destroy(tbx)
                hts_close(fp)
            free(buffer.s)

            return lines

    property contigs:
        '''list of chromosome names'''
        def __get__(self):
            cdef const char ** sequences
            cdef int nsequences

            with nogil:
                sequences = tbx_seqnames(self.index, &nsequences)
            cdef int x
            result = []
            for x from 0 <= x < nsequences:
                result.append(force_str(sequences[x]))

            # htslib instructions:
            # only free container, not the sequences themselves
            free(sequences)

            return result

    def close(self):
        '''
        closes the :class:`pysam.TabixFile`.'''
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)
            self.index = NULL

    def __dealloc__( self ):
        # remember: dealloc cannot call other python methods
        # note: no doc string
        # note: __del__ is not called.
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)
617
618
cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """

    def __init__(self, encoding="ascii"):
        # encoding used to decode each row into a python string
        self.encoding = encoding

    def __iter__(self):
        # reset the line buffer; it is (re)allocated lazily by
        # tbx_itr_next in __cnext__
        self.buffer.s = NULL
        self.buffer.l = 0
        self.buffer.m = 0

        return self

    cdef int __cnext__(self):
        '''iterate to next element.

        Return -5 if file has been closed when this function
        was called.
        '''
        if self.tabixfile.htsfile == NULL:
            return -5

        cdef int retval

        while 1:
            with nogil:
                retval = tbx_itr_next(
                    self.tabixfile.htsfile,
                    self.tabixfile.index,
                    self.iterator,
                    &self.buffer)

            # negative return value: end of iteration or error
            if retval < 0:
                break

            # skip comment lines starting with '#'
            if self.buffer.s[0] != b'#':
                break

        return retval

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """

        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        # decode the raw line into a python string
        return charptr_to_str(self.buffer.s, self.encoding)

    def __dealloc__(self):
        # release the htslib iterator and the line buffer owned here
        if <void*>self.iterator != NULL:
            tbx_itr_destroy(self.iterator)
        if self.buffer.s != NULL:
            free(self.buffer.s)
680
681
class EmptyIterator:
    """An iterator that yields no elements.

    Returned in place of a real row iterator when a query cannot
    possibly match anything (e.g. an empty interval or an empty file).
    """

    def __iter__(self):
        # iterator protocol: an iterator is its own iterable
        return self

    def __next__(self):
        # always exhausted - there is never a next element
        raise StopIteration()
690
691
cdef class TabixIteratorParsed(TabixIterator):
    """iterates over mapped reads in a region.

    The *parser* determines the encoding.

    Returns parsed data.
    """

    def __init__(self, Parser parser):
        # base class sets up the default encoding
        super().__init__()
        self.parser = parser

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """

        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        # hand the raw line to the parser, which returns a row proxy
        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)
718
719
cdef class GZIterator:
    def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
        '''iterate line-by-line through gzip (or bgzip)
        compressed file.
        '''
        if not os.path.exists(filename):
            raise IOError("No such file or directory: %s" % filename)

        filename = encode_filename(filename)
        cdef char *cfilename = filename
        with nogil:
            self.gzipfile = bgzf_open(cfilename, "r")
        self._filename = filename
        # kstream wraps the bgzf handle for buffered line reading
        self.kstream = ks_init(self.gzipfile)
        self.encoding = encoding

        # pre-allocate the line buffer; ks_getuntil grows it as needed
        self.buffer.l = 0
        self.buffer.m = 0
        self.buffer.s = <char*>malloc(buffer_size)

    def __dealloc__(self):
        '''close file.'''
        if self.gzipfile != NULL:
            bgzf_close(self.gzipfile)
            self.gzipfile = NULL
        if self.buffer.s != NULL:
            free(self.buffer.s)
        if self.kstream != NULL:
            ks_destroy(self.kstream)

    def __iter__(self):
        return self

    cdef int __cnext__(self):
        # Read one line into self.buffer. Returns a non-negative value
        # on success and -1 at end of file / on error.
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            if retval < 0:
                break

            return dret
        return -1

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        # decode the raw line into a python string
        return force_str(self.buffer.s, self.encoding)
773
774
cdef class GZIteratorHead(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.
    '''

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        # stop at the first line that is not a '#' comment
        if self.buffer.s[0] == b'#':
            # NOTE(review): returns the raw bytes of the line, unlike
            # GZIterator.__next__ which decodes via force_str - confirm
            # callers expect bytes here
            return self.buffer.s
        else:
            raise StopIteration
790
791
cdef class GZIteratorParsed(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning rows parsed by *parser*.
    '''

    def __init__(self, parser):
        # NOTE(review): GZIterator.__init__ is not invoked here, so
        # this __init__ alone does not open a file or set up
        # self.kstream - confirm how instances are expected to be
        # constructed before relying on this iterator.
        self.parser = parser

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration

        # hand the raw line to the parser, which returns a row proxy
        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)
809
810
def tabix_compress(filename_in,
                   filename_out,
                   force=False):
    '''compress *filename_in* writing the output to *filename_out*.

    Raise an IOError if *filename_out* already exists, unless *force*
    is set.
    '''

    if not force and os.path.exists(filename_out):
        raise IOError(
            "Filename '%s' already exists, use *force* to "
            "overwrite" % filename_out)

    cdef int WINDOW_SIZE
    cdef int c, r
    cdef void * buffer
    cdef BGZF * fp
    cdef int fd_src
    cdef bint is_empty = True
    cdef int O_RDONLY
    O_RDONLY = os.O_RDONLY

    # chunk size for copying data from source to bgzf output
    WINDOW_SIZE = 64 * 1024

    fn = encode_filename(filename_out)
    cdef char *cfn = fn
    with nogil:
        fp = bgzf_open(cfn, "w")
    if fp == NULL:
        raise IOError("could not open '%s' for writing" % filename_out)

    fn = encode_filename(filename_in)
    fd_src = open(fn, O_RDONLY)
    # BUG FIX: the C open() returns -1 on error and 0 is a valid file
    # descriptor, so test for a negative value instead of 0; also close
    # the already-opened output handle on this error path.
    if fd_src < 0:
        bgzf_close(fp)
        raise IOError("could not open '%s' for reading" % filename_in)

    buffer = malloc(WINDOW_SIZE)
    c = 1

    # copy the input in WINDOW_SIZE chunks into the bgzf writer
    while c > 0:
        with nogil:
            c = read(fd_src, buffer, WINDOW_SIZE)
        if c > 0:
            is_empty = False
        r = bgzf_write(fp, buffer, c)
        if r < 0:
            # RESOURCE FIX: release handles before raising; previously
            # fp and fd_src leaked on a failed write
            free(buffer)
            bgzf_close(fp)
            close(fd_src)
            raise IOError("writing failed")

    free(buffer)
    r = bgzf_close(fp)
    if r < 0:
        raise IOError("error %i when writing to file %s" % (r, filename_out))

    r = close(fd_src)
    # an empty file will return with -1, thus ignore this.
    if r < 0:
        if not (r == -1 and is_empty):
            raise IOError("error %i when closing file %s" % (r, filename_in))
871
872
def tabix_index(filename,
                force=False,
                seq_col=None,
                start_col=None,
                end_col=None,
                preset=None,
                meta_char="#",
                int line_skip=0,
                zerobased=False,
                int min_shift=-1,
                index=None,
                keep_original=False,
                csi=False,
                ):
    '''index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless *force* is set.

    The index will be built from coordinates in columns *seq_col*,
    *start_col* and *end_col*.

    The contents of *filename* have to be sorted by contig and
    position - the method does not check if the file is sorted.

    Column indices are 0-based. Note that this is different from the
    tabix command line utility where column indices start at 1.

    Coordinates in the file are assumed to be 1-based unless
    *zerobased* is set.

    If *preset* is provided, the column coordinates are taken from a
    preset. Valid values for preset are "gff", "bed", "sam", "vcf",
    psltbl", "pileup".

    Lines beginning with *meta_char* and the first *line_skip* lines
    will be skipped.

    If *filename* is not detected as a gzip file it will be automatically
    compressed. The original file will be removed and only the compressed
    file will be retained.

    By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
    is greater than zero and/or *csi* is True, creates a CSI index with a
    minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).

    *index* controls the filename which should be used for creating the index.
    If not set, the default is to append ``.tbi`` to *filename*.

    When automatically compressing files, if *keep_original* is set the
    uncompressed file will not be deleted.

    returns the filename of the compressed data

    '''

    if preset is None and \
       (seq_col is None or start_col is None or end_col is None):
        raise ValueError(
            "neither preset nor seq_col,start_col and end_col given")

    fn = encode_filename(filename)
    cdef char *cfn = fn

    # probe the file format/compression so we know whether to compress
    cdef htsFile *fp = hts_open(cfn, "r")
    if fp == NULL:
        raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))

    cdef htsFormat fmt = hts_get_format(fp)[0]
    hts_close(fp)

    if fmt.compression == no_compression:
        # bgzf-compress in place; tabix requires bgzf input
        tabix_compress(filename, filename + ".gz", force=force)
        if not keep_original:
            os.unlink(filename)
        filename += ".gz"
        fn = encode_filename(filename)
        cfn = fn

    # columns (1-based):
    #   preset-code, contig, start, end, metachar for
    #   comments, lines to ignore at beginning
    # 0 is a missing column
    preset2conf = {
        'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
        'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
        'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
        'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
        'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
        }

    conf_data = None
    if preset == "bcf" or fmt.format == bcf:
        # bcf files are indexed with a CSI index via bcf_index_build2
        csi = True
    elif preset:
        try:
            conf_data = preset2conf[preset]
        except KeyError:
            raise KeyError(
                "unknown preset '%s', valid presets are '%s'" %
                (preset, ",".join(preset2conf.keys())))
    else:
        if end_col is None:
            end_col = -1

        preset = 0
        # tabix internally works with 0-based coordinates and
        # open/closed intervals. When using a preset, conversion is
        # automatically taken care of. Otherwise, the coordinates are
        # assumed to be 1-based closed intervals and -1 is subtracted
        # from the start coordinate. To avoid doing this, set the
        # TI_FLAG_UCSC=0x10000 flag:
        if zerobased:
            preset = preset | TBX_UCSC

        conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)

    cdef tbx_conf_t conf
    if conf_data:
        conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data

    # NOTE(review): if preset == "bcf" but the detected format is not
    # bcf, the tbx path below runs with ``conf`` uninitialized - confirm
    # whether that combination can occur in practice.

    if csi or min_shift > 0:
        # CSI index; default minimal interval of 1<<14 if unspecified
        suffix = ".csi"
        if min_shift <= 0: min_shift = 14
    else:
        # classic TBI index
        suffix = ".tbi"
        min_shift = 0

    index = index or filename + suffix
    fn_index = encode_filename(index)

    if not force and os.path.exists(index):
        raise IOError(
            "filename '%s' already exists, use *force* to overwrite" % index)

    cdef char *fnidx = fn_index
    cdef int retval = 0

    if csi and fmt.format == bcf:
        with nogil:
            retval = bcf_index_build2(cfn, fnidx, min_shift)
    else:
        with nogil:
            retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)

    if retval != 0:
        raise OSError("building of index for {} failed".format(filename))

    return filename
1021
1022 # #########################################################
1023 # cdef class tabix_file_iterator_old:
1024 # '''iterate over ``infile``.
1025
1026 # This iterator is not safe. If the :meth:`__next__()` method is called
1027 # after ``infile`` is closed, the result is undefined (see ``fclose()``).
1028
1029 # The iterator might either raise a StopIteration or segfault.
1030 # '''
1031
1032
1033 # def __cinit__(self,
1034 # infile,
1035 # Parser parser,
1036 # int buffer_size = 65536 ):
1037
1038 # cdef int fd = PyObject_AsFileDescriptor( infile )
1039 # if fd == -1: raise ValueError( "I/O operation on closed file." )
1040 # self.infile = fdopen( fd, 'r')
1041
1042 # if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
1043
1044 # self.buffer = <char*>malloc( buffer_size )
1045 # self.size = buffer_size
1046 # self.parser = parser
1047
1048 # def __iter__(self):
1049 # return self
1050
1051 # cdef __cnext__(self):
1052
1053 # cdef char * b
1054 # cdef size_t nbytes
1055 # b = self.buffer
1056
1057 # while not feof( self.infile ):
1058 # nbytes = getline( &b, &self.size, self.infile)
1059
1060 # # stop at first error or eof
1061 # if (nbytes == -1): break
1062 # # skip comments
1063 # if (b[0] == '#'): continue
1064
1065 # # skip empty lines
1066 # if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
1067
1068 # # make sure that entry is complete
1069 # if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
1070 # result = b
1071 # raise ValueError( "incomplete line at %s" % result )
1072
1073 # # make sure that this goes fully through C
1074 # # otherwise buffer is copied to/from a
1075 # # Python object causing segfaults as
1076 # # the wrong memory is freed
1077 # return self.parser.parse( b, nbytes )
1078
1079 # raise StopIteration
1080
1081 # def __dealloc__(self):
1082 # free(self.buffer)
1083
1084 # def __next__(self):
1085 # return self.__cnext__()
1086
1087 #########################################################
1088 #########################################################
1089 #########################################################
1090 ## Iterators for parsing through unindexed files.
1091 #########################################################
1092 # cdef buildGzipError(void *gzfp):
1093 # cdef int errnum = 0
1094 # cdef char *s = gzerror(gzfp, &errnum)
1095 # return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
1096
1097
cdef class tabix_file_iterator:
    '''iterate over a compressed or uncompressed ``infile``.

    Input is read through htslib's BGZF layer (``bgzf_dopen``), which
    transparently handles plain, gzip and bgzf compressed streams.
    Comment lines (first byte ``#``) and empty lines are skipped; every
    other line is handed to *parser*, whose :meth:`parse` result is
    returned by each iteration step.
    '''

    def __cinit__(self,
                  infile,
                  Parser parser,
                  int buffer_size=65536):

        # Reject files already closed at the Python level before
        # touching their file descriptor.
        if infile.closed:
            raise ValueError("I/O operation on closed file.")

        self.infile = infile

        # PyObject_AsFileDescriptor returns -1 when no usable file
        # descriptor can be obtained from the object.
        cdef int fd = PyObject_AsFileDescriptor(infile)
        if fd == -1:
            raise ValueError("I/O operation on closed file.")

        # Duplicate the descriptor so that bgzf_close() in __dealloc__
        # does not close the caller's underlying file object.
        self.duplicated_fd = dup(fd)

        # From the manual:
        # gzopen can be used to read a file which is not in gzip format;
        # in this case gzread will directly read from the file without decompression.
        # When reading, this will be detected automatically by looking
        # for the magic two-byte gzip header.
        self.fh = bgzf_dopen(self.duplicated_fd, 'r')

        if self.fh == NULL:
            raise IOError('%s' % strerror(errno))

        # kstream gives buffered, delimiter-based reads on top of BGZF.
        self.kstream = ks_init(self.fh)

        # NOTE(review): the malloc result is assigned but never checked;
        # the original NULL check is commented out below — presumably
        # ks_getuntil reallocates the kstring as needed. Confirm before
        # relying on buffer_size.
        self.buffer.s = <char*>malloc(buffer_size)
        #if self.buffer == NULL:
        #    raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
        #self.size = buffer_size
        self.parser = parser

    def __iter__(self):
        return self

    cdef __cnext__(self):
        # Return the next parsed record, skipping comments and blanks.

        cdef char * b
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            # Read up to the next newline; release the GIL while htslib
            # performs the (possibly blocking) read.
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            # retval < 0 signals EOF or read error; either way stop.
            if retval < 0:
                break
                #raise IOError('gzip error: %s' % buildGzipError( self.fh ))

            b = self.buffer.s

            # skip comments
            if (b[0] == b'#'):
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # gzgets terminates at \n, no need to test

            # parser creates a copy
            return self.parser.parse(b, self.buffer.l)

        raise StopIteration

    def __dealloc__(self):
        # Free resources in reverse order of acquisition.
        free(self.buffer.s)
        ks_destroy(self.kstream)
        bgzf_close(self.fh)

    def __next__(self):
        return self.__cnext__()
1176
1177
class tabix_generic_iterator:
    '''iterate over ``infile``.

    Permits the use of file-like objects for example from the gzip module.

    Lines are read via ``infile.readline()``; comment lines (leading
    ``#``) and empty lines are skipped, incomplete final lines raise
    :class:`ValueError`, and every other line is passed to *parser*.
    '''
    def __init__(self, infile, parser):

        self.infile = infile
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")
        self.parser = parser

    def __iter__(self):
        return self

    # cython version - required for python 3
    def __next__(self):

        cdef char * b
        cdef char * cpy
        cdef size_t nbytes

        # Encoding used to convert str lines to bytes for the parser.
        encoding = self.parser.get_encoding()

        # note that GzipFile.close() does not close the file
        # reading is still possible.
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")

        while 1:

            line = self.infile.readline()
            if not line:
                break

            s = force_bytes(line, encoding)
            b = s
            # NOTE(review): nbytes is the *character* count of the raw
            # line, while b points at the encoded bytes of s — these can
            # differ for non-ASCII text under multi-byte encodings.
            # Presumably inputs are ASCII here; confirm before reuse.
            nbytes = len(line)
            assert b[nbytes] == b'\0'

            # skip comments
            if b[0] == b'#':
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # make sure that entry is complete
            if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
                raise ValueError("incomplete line at %s" % line)

            # Re-wrap as a Python bytes object so the parser receives a
            # buffer with an independent lifetime.
            bytes_cpy = <bytes> b
            cpy = <char *> bytes_cpy

            return self.parser(cpy, nbytes)

        raise StopIteration
1236
1237
def tabix_iterator(infile, parser):
    """Return an iterator over all entries in a file.

    Each row is handed to *parser*, a functor such as
    :class:`~pysam.asTuple` or :class:`~pysam.asGTF`, and the parsed
    record is returned.  If *parser* is None, rows come back as
    unparsed strings.
    """
    iterator = tabix_generic_iterator(infile, parser)
    return iterator
1249
1250
# Backwards-compatibility alias: old code importing Tabixfile keeps
# working; the class adds no behavior of its own.
cdef class Tabixfile(TabixFile):
    """Tabixfile is deprecated: use TabixFile instead"""
    pass
1254
1255
# Explicit public API of this module.
__all__ = [
    "tabix_index",
    "tabix_compress",
    "TabixFile",
    "Tabixfile",
    "asTuple",
    "asGTF",
    "asGFF3",
    "asVCF",
    "asBed",
    "GZIterator",
    "GZIteratorHead",
    "tabix_iterator",
    "tabix_generic_iterator",
    "tabix_file_iterator",
]