Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabix.pyx @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 # cython: language_level=3 | |
2 # cython: embedsignature=True | |
3 # cython: profile=True | |
4 ############################################################################### | |
5 ############################################################################### | |
6 # Cython wrapper for access to tabix indexed files in bgzf format | |
7 ############################################################################### | |
8 # The principal classes and functions defined in this module are: | |
9 # | |
10 # class TabixFile class wrapping tabix indexed files in bgzf format | |
11 # | |
12 # class asTuple Parser class for tuples | |
13 # class asGTF Parser class for GTF formatted rows | |
14 # class asGFF3 Parser class for GFF3 formatted rows | |
15 # class asBed Parser class for Bed formatted rows | |
16 # class asVCF Parser class for VCF formatted rows | |
17 # | |
18 # class tabix_generic_iterator Streamed iterator of bgzf formatted files | |
19 # | |
20 # Additionally this module defines several additional classes that are part | |
21 # of the internal API. These are: | |
22 # | |
23 # class Parser base class for parsers of tab-separated rows | |
24 # class tabix_file_iterator | |
25 # class TabixIterator iterator class over rows in bgzf file | |
26 # class EmptyIterator | |
27 # | |
28 # For backwards compatibility, the following classes are also defined: | |
29 # | |
30 # class Tabixfile equivalent to TabixFile | |
31 # | |
32 ############################################################################### | |
33 # | |
34 # The MIT License | |
35 # | |
36 # Copyright (c) 2015 Andreas Heger | |
37 # | |
38 # Permission is hereby granted, free of charge, to any person obtaining a | |
39 # copy of this software and associated documentation files (the "Software"), | |
40 # to deal in the Software without restriction, including without limitation | |
41 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
42 # and/or sell copies of the Software, and to permit persons to whom the | |
43 # Software is furnished to do so, subject to the following conditions: | |
44 # | |
45 # The above copyright notice and this permission notice shall be included in | |
46 # all copies or substantial portions of the Software. | |
47 # | |
48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
54 # DEALINGS IN THE SOFTWARE. | |
55 # | |
56 ############################################################################### | |
57 import os | |
58 import sys | |
59 | |
60 from libc.stdio cimport printf, fprintf, stderr | |
61 from libc.string cimport strerror | |
62 from libc.errno cimport errno | |
63 from posix.unistd cimport dup | |
64 | |
65 from cpython cimport PyErr_SetString, PyBytes_Check, \ | |
66 PyUnicode_Check, PyBytes_FromStringAndSize, \ | |
67 PyObject_AsFileDescriptor | |
68 | |
69 cimport pysam.libctabixproxies as ctabixproxies | |
70 | |
71 from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ | |
72 BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \ | |
73 tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \ | |
74 tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \ | |
75 tbx_destroy, hisremote, region_list, hts_getline, \ | |
76 TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \ | |
77 no_compression, bcf, bcf_index_build2 | |
78 | |
79 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str | |
80 from pysam.libcutils cimport encode_filename, from_string_and_size | |
81 | |
cdef class Parser:
    """Base class for parsers converting a raw tab-separated row
    (a ``char *`` buffer) into a row proxy object.

    Parameters
    ----------
    encoding : string
        Encoding used when decoding byte fields into strings.
    """

    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def set_encoding(self, encoding):
        # change the encoding used for subsequently parsed rows
        self.encoding = encoding

    def get_encoding(self):
        # return the currently configured encoding
        return self.encoding

    cdef parse(self, char * buffer, int length):
        # C-level hook: subclasses override this to build a proxy
        # object from `buffer` holding `length` bytes.
        raise NotImplementedError(
            'parse method of %s not implemented' % str(self))

    def __call__(self, char * buffer, int length):
        # Python-level entry point; delegates to the C-level parse().
        return self.parse(buffer, length)
99 | |
100 | |
cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    A field in a row is accessed by numeric index.
    '''
    # parameter renamed `len` -> `length`: matches Parser.parse and
    # avoids shadowing the builtin (no caller impact, cdef args are
    # positional at the C level)
    cdef parse(self, char * buffer, int length):
        cdef ctabixproxies.TupleProxy r
        r = ctabixproxies.TupleProxy(self.encoding)
        # need to copy - there were some
        # persistence issues with "present"
        r.copy(buffer, length)
        return r
113 | |
114 | |
cdef class asGFF3(Parser):
    '''converts a :term:`tabix row` into a GFF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    '''
    # parameter renamed `len` -> `length` for consistency with
    # Parser.parse and to avoid shadowing the builtin
    cdef parse(self, char * buffer, int length):
        cdef ctabixproxies.GFF3Proxy r
        r = ctabixproxies.GFF3Proxy(self.encoding)
        r.copy(buffer, length)
        return r
149 | |
150 | |
cdef class asGTF(Parser):
    '''converts a :term:`tabix row` into a GTF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    GTF formatted entries also define the following fields that
    are derived from the attributes field:

    +--------------------+------------------------------+
    |*Name*              |*Content*                     |
    +--------------------+------------------------------+
    |gene_id             |the gene identifier           |
    +--------------------+------------------------------+
    |transcript_id       |the transcript identifier     |
    +--------------------+------------------------------+

    '''
    # parameter renamed `len` -> `length` for consistency with
    # Parser.parse and to avoid shadowing the builtin
    cdef parse(self, char * buffer, int length):
        cdef ctabixproxies.GTFProxy r
        r = ctabixproxies.GTFProxy(self.encoding)
        r.copy(buffer, length)
        return r
196 | |
197 | |
cdef class asBed(Parser):
    '''converts a :term:`tabix row` into a bed record
    with the following fields:

    +-----------+-----------+------------------------------------------+
    |*Column*   |*Field*    |*Contents*                                |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |1          |contig     |contig                                    |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |2          |start      |genomic start coordinate (zero-based)     |
    +-----------+-----------+------------------------------------------+
    |3          |end        |genomic end coordinate plus one           |
    |           |           |(zero-based)                              |
    +-----------+-----------+------------------------------------------+
    |4          |name       |name of feature.                          |
    +-----------+-----------+------------------------------------------+
    |5          |score      |score of feature                          |
    +-----------+-----------+------------------------------------------+
    |6          |strand     |strand of feature                         |
    +-----------+-----------+------------------------------------------+
    |7          |thickStart |thickStart                                |
    +-----------+-----------+------------------------------------------+
    |8          |thickEnd   |thickEnd                                  |
    +-----------+-----------+------------------------------------------+
    |9          |itemRGB    |itemRGB                                   |
    +-----------+-----------+------------------------------------------+
    |10         |blockCount |number of bocks                           |
    +-----------+-----------+------------------------------------------+
    |11         |blockSizes |',' separated string of block sizes       |
    +-----------+-----------+------------------------------------------+
    |12         |blockStarts|',' separated string of block genomic     |
    |           |           |start positions                           |
    +-----------+-----------+------------------------------------------+

    Only the first three fields are required. Additional
    fields are optional, but if one is defined, all the preceding
    need to be defined as well.

    '''
    # parameter renamed `len` -> `length` for consistency with
    # Parser.parse and to avoid shadowing the builtin
    cdef parse(self, char * buffer, int length):
        cdef ctabixproxies.BedProxy r
        r = ctabixproxies.BedProxy(self.encoding)
        r.copy(buffer, length)
        return r
244 | |
245 | |
cdef class asVCF(Parser):
    '''converts a :term:`tabix row` into a VCF record with
    the following fields:

    +----------+---------+------------------------------------+
    |*Column*  |*Field*  |*Contents*                          |
    |          |         |                                    |
    +----------+---------+------------------------------------+
    |1         |contig   |chromosome                          |
    +----------+---------+------------------------------------+
    |2         |pos      |chromosomal position, zero-based    |
    +----------+---------+------------------------------------+
    |3         |id       |id                                  |
    +----------+---------+------------------------------------+
    |4         |ref      |reference allele                    |
    +----------+---------+------------------------------------+
    |5         |alt      |alternate alleles                   |
    +----------+---------+------------------------------------+
    |6         |qual     |quality                             |
    +----------+---------+------------------------------------+
    |7         |filter   |filter                              |
    +----------+---------+------------------------------------+
    |8         |info     |info                                |
    +----------+---------+------------------------------------+
    |9         |format   |format specifier.                   |
    +----------+---------+------------------------------------+

    Access to genotypes is via index::

        contig = vcf.contig
        first_sample_genotype = vcf[0]
        second_sample_genotype = vcf[1]

    '''
    # parameter renamed `len` -> `length` for consistency with
    # Parser.parse and to avoid shadowing the builtin
    cdef parse(self, char * buffer, int length):
        cdef ctabixproxies.VCFProxy r
        r = ctabixproxies.VCFProxy(self.encoding)
        r.copy(buffer, length)
        return r
285 | |
286 | |
cdef class TabixFile:
    """Random access to bgzf formatted files that
    have been indexed by :term:`tabix`.

    The file is automatically opened. The index file of file
    ``<filename>`` is expected to be called ``<filename>.tbi``
    by default (see parameter `index`).

    Parameters
    ----------

    filename : string
        Filename of bgzf file to be opened.

    index : string
        The filename of the index. If not set, the default is to
        assume that the index is called ``filename.tbi``

    mode : char
        The file opening mode. Currently, only ``r`` is permitted.

    parser : :class:`pysam.Parser`

        sets the default parser for this tabix file. If `parser`
        is None, the results are returned as an unparsed string.
        Otherwise, `parser` is assumed to be a functor that will return
        parsed data (see for example :class:`~pysam.asTuple` and
        :class:`~pysam.asGTF`).

    encoding : string

        The encoding passed to the parser

    threads: integer
        Number of threads to use for decompressing Tabix files.
        (Default=1)


    Raises
    ------

    ValueError
        if index file is missing.

    IOError
        if file could not be opened
    """
    def __cinit__(self,
                  filename,
                  mode='r',
                  parser=None,
                  index=None,
                  encoding="ascii",
                  threads=1,
                  *args,
                  **kwargs ):

        self.htsfile = NULL
        self.is_remote = False
        self.is_stream = False
        self.parser = parser
        self.threads = threads
        # pass `threads` through explicitly: _open() also assigns
        # self.threads, so omitting it here would silently reset a
        # user-supplied thread count back to the default of 1
        self._open(filename, mode, index, threads, *args, **kwargs)
        self.encoding = encoding

    def _open(self,
              filename,
              mode='r',
              index=None,
              threads=1,
              ):
        '''open a :term:`tabix file` for reading.'''

        if mode != 'r':
            raise ValueError("invalid file opening mode `%s`" % mode)

        # re-opening: release any previously held handles first
        if self.htsfile != NULL:
            self.close()
        self.htsfile = NULL
        self.threads = threads

        filename_index = index or (filename + ".tbi")
        # encode all the strings to pass to tabix
        self.filename = encode_filename(filename)
        self.filename_index = encode_filename(filename_index)

        self.is_stream = self.filename == b'-'
        self.is_remote = hisremote(self.filename)

        # existence checks only make sense for local files
        if not self.is_remote:
            if not os.path.exists(filename):
                raise IOError("file `%s` not found" % filename)

            if not os.path.exists(filename_index):
                raise IOError("index `%s` not found" % filename_index)

        # open file
        cdef char *cfilename = self.filename
        cdef char *cfilename_index = self.filename_index
        with nogil:
            self.htsfile = hts_open(cfilename, 'r')

        if self.htsfile == NULL:
            raise IOError("could not open file `%s`" % filename)

        #if self.htsfile.format.category != region_list:
        #    raise ValueError("file does not contain region data")

        with nogil:
            self.index = tbx_index_load2(cfilename, cfilename_index)

        if self.index == NULL:
            raise IOError("could not open index for `%s`" % filename)

        if not self.is_stream:
            self.start_offset = self.tell()

    def _dup(self):
        '''return a copy of this tabix file.

        The file is being re-opened.
        '''
        return TabixFile(self.filename,
                         mode="r",
                         threads=self.threads,
                         parser=self.parser,
                         index=self.filename_index,
                         encoding=self.encoding)

    def fetch(self,
              reference=None,
              start=None,
              end=None,
              region=None,
              parser=None,
              multiple_iterators=False):
        '''fetch one or more rows in a :term:`region` using 0-based
        indexing. The region is specified by :term:`reference`,
        *start* and *end*. Alternatively, a samtools :term:`region`
        string can be supplied.

        Without *reference* or *region* all entries will be fetched.

        If only *reference* is set, all reads matching on *reference*
        will be fetched.

        If *parser* is None, the default parser will be used for
        parsing.

        Set *multiple_iterators* to true if you will be using multiple
        iterators on the same file at the same time. The iterator
        returned will receive its own copy of a filehandle to the file
        effectively re-opening the file. Re-opening a file creates
        some overhead, so beware.

        '''
        if not self.is_open():
            raise ValueError("I/O operation on closed file")

        # convert coordinates to region string, which is one-based
        if reference:
            if end is not None:
                if end < 0:
                    raise ValueError("end out of range (%i)" % end)
                if start is None:
                    start = 0

                if start < 0:
                    # fixed: the original message interpolated `end`
                    # here instead of the offending `start` value
                    raise ValueError("start out of range (%i)" % start)
                elif start > end:
                    raise ValueError(
                        'start (%i) >= end (%i)' % (start, end))
                elif start == end:
                    # empty interval - nothing can match
                    return EmptyIterator()
                else:
                    region = '%s:%i-%i' % (reference, start + 1, end)
            elif start is not None:
                if start < 0:
                    # fixed: same wrong-variable interpolation as above
                    raise ValueError("start out of range (%i)" % start)
                region = '%s:%i' % (reference, start + 1)
            else:
                region = reference

        # get iterator
        cdef hts_itr_t * itr
        cdef char *cstr
        cdef TabixFile fileobj

        # reopen the same file if necessary
        if multiple_iterators:
            fileobj = self._dup()
        else:
            fileobj = self

        if region is None:
            # without region or reference - iterate from start
            with nogil:
                itr = tbx_itr_queryi(fileobj.index,
                                     HTS_IDX_START,
                                     0,
                                     0)
        else:
            s = force_bytes(region, encoding=fileobj.encoding)
            cstr = s
            with nogil:
                itr = tbx_itr_querys(fileobj.index, cstr)

        if itr == NULL:
            if region is None:
                if len(self.contigs) > 0:
                    # when accessing a tabix file created prior tabix 1.0
                    # the full-file iterator is empty.
                    raise ValueError(
                        "could not create iterator, possible "
                        "tabix version mismatch")
                else:
                    # possible reason is that the file is empty -
                    # return an empty iterator
                    return EmptyIterator()
            else:
                raise ValueError(
                    "could not create iterator for region '%s'" %
                    region)

        # use default parser if no parser is specified
        if parser is None:
            parser = fileobj.parser

        cdef TabixIterator a
        if parser is None:
            a = TabixIterator(encoding=fileobj.encoding)
        else:
            parser.set_encoding(fileobj.encoding)
            a = TabixIteratorParsed(parser)

        a.tabixfile = fileobj
        a.iterator = itr

        return a

    ###############################################################
    ###############################################################
    ###############################################################
    ## properties
    ###############################################################
    property header:
        '''the file header.

        The file header consists of the lines at the beginning of a
        file that are prefixed by the comment character ``#``.

        .. note::
            The header is returned as an iterator presenting lines
            without the newline character.
        '''

        def __get__(self):

            cdef char *cfilename = self.filename
            cdef char *cfilename_index = self.filename_index

            cdef kstring_t buffer
            buffer.l = buffer.m = 0
            buffer.s = NULL

            cdef htsFile * fp = NULL
            cdef int KS_SEP_LINE = 2
            cdef tbx_t * tbx = NULL
            lines = []
            # the file is re-opened here so that header extraction does
            # not disturb the position of the main htsfile handle
            with nogil:
                fp = hts_open(cfilename, 'r')

            if fp == NULL:
                raise OSError("could not open {} for reading header".format(self.filename))

            with nogil:
                tbx = tbx_index_load2(cfilename, cfilename_index)

            if tbx == NULL:
                raise OSError("could not load .tbi/.csi index of {}".format(self.filename))

            # collect leading lines starting with the meta character
            # recorded in the index configuration
            while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
                if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
                    break
                lines.append(force_str(buffer.s, self.encoding))

            with nogil:
                hts_close(fp)
                free(buffer.s)

            return lines

    property contigs:
        '''list of chromosome names'''
        def __get__(self):
            cdef const char ** sequences
            cdef int nsequences

            with nogil:
                sequences = tbx_seqnames(self.index, &nsequences)
            cdef int x
            result = []
            for x from 0 <= x < nsequences:
                result.append(force_str(sequences[x]))

            # htslib instructions:
            # only free container, not the sequences themselves
            free(sequences)

            return result

    def close(self):
        '''
        closes the :class:`pysam.TabixFile`.'''
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)
            self.index = NULL

    def __dealloc__(self):
        # remember: dealloc cannot call other python methods
        # note: no doc string
        # note: __del__ is not called.
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)
617 | |
618 | |
cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """

    def __init__(self, encoding="ascii"):
        # encoding used when decoding each fetched row into str
        self.encoding = encoding

    def __iter__(self):
        # reset the line buffer; htslib (re)allocates buffer.s inside
        # tbx_itr_next as needed and __dealloc__ frees it
        self.buffer.s = NULL
        self.buffer.l = 0
        self.buffer.m = 0

        return self

    cdef int __cnext__(self):
        '''iterate to next element.

        Return -5 if file has been closed when this function
        was called.
        '''
        if self.tabixfile.htsfile == NULL:
            return -5

        cdef int retval

        while 1:
            with nogil:
                retval = tbx_itr_next(
                    self.tabixfile.htsfile,
                    self.tabixfile.index,
                    self.iterator,
                    &self.buffer)

            # negative return value signals end of iteration (or error)
            if retval < 0:
                break

            # skip comment/meta lines embedded within the region
            if self.buffer.s[0] != b'#':
                break

        return retval

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """

        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        # decode the raw C buffer into a str using the configured encoding
        return charptr_to_str(self.buffer.s, self.encoding)

    def __dealloc__(self):
        # release the htslib iterator and the line buffer owned by htslib
        if <void*>self.iterator != NULL:
            tbx_itr_destroy(self.iterator)
        if self.buffer.s != NULL:
            free(self.buffer.s)
680 | |
681 | |
class EmptyIterator:
    '''an iterator that yields no values at all.

    Returned in place of a real row iterator when a query region
    cannot contain any entries.
    '''

    def __iter__(self):
        return self

    def __next__(self):
        raise StopIteration
690 | |
691 | |
cdef class TabixIteratorParsed(TabixIterator):
    """iterates over mapped reads in a region.

    The *parser* determines the encoding.

    Returns parsed data.
    """

    def __init__(self, Parser parser):
        # parent __init__ sets the default encoding; the parser
        # supplied here converts each raw row into a proxy object
        super().__init__()
        self.parser = parser

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """

        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        # hand the raw buffer to the parser instead of decoding to str
        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)
718 | |
719 | |
cdef class GZIterator:
    def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
        '''iterate line-by-line through gzip (or bgzip)
        compressed file.
        '''
        if not os.path.exists(filename):
            raise IOError("No such file or directory: %s" % filename)

        filename = encode_filename(filename)
        cdef char *cfilename = filename
        with nogil:
            self.gzipfile = bgzf_open(cfilename, "r")
        # fixed: the open result was previously unchecked; a failed
        # bgzf_open would propagate NULL into ks_init and crash later
        if self.gzipfile == NULL:
            raise IOError("could not open '%s' for reading" % force_str(filename))
        self._filename = filename
        self.kstream = ks_init(self.gzipfile)
        self.encoding = encoding

        self.buffer.l = 0
        self.buffer.m = 0
        self.buffer.s = <char*>malloc(buffer_size)

    def __dealloc__(self):
        '''close file.'''
        if self.gzipfile != NULL:
            bgzf_close(self.gzipfile)
            self.gzipfile = NULL
        if self.buffer.s != NULL:
            free(self.buffer.s)
        if self.kstream != NULL:
            ks_destroy(self.kstream)

    def __iter__(self):
        return self

    cdef int __cnext__(self):
        # read up to the next newline into self.buffer;
        # returns the delimiter code on success, -1 at end of input
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            if retval < 0:
                break

            return dret
        return -1

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        return force_str(self.buffer.s, self.encoding)
773 | |
774 | |
cdef class GZIteratorHead(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.
    '''

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        if self.buffer.s[0] == b'#':
            # note: unlike the parent class, the line is returned as
            # raw bytes (char* coerced by Cython), not decoded to str
            return self.buffer.s
        else:
            # first non-comment line terminates the header
            raise StopIteration
790 | |
791 | |
cdef class GZIteratorParsed(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning rows parsed by *parser*.
    '''

    def __init__(self, parser):
        # NOTE(review): this __init__ does not chain to
        # GZIterator.__init__, so no file is opened and
        # self.kstream/self.buffer are not initialised on this path —
        # presumably callers rely on a different initialisation
        # sequence; confirm before instantiating this class directly.
        self.parser = parser

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration

        # hand the raw buffer to the parser instead of decoding to str
        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)
809 | |
810 | |
def tabix_compress(filename_in,
                   filename_out,
                   force=False):
    '''compress *filename_in* writing the output to *filename_out*.

    Raise an IOError if *filename_out* already exists, unless *force*
    is set.
    '''

    if not force and os.path.exists(filename_out):
        raise IOError(
            "Filename '%s' already exists, use *force* to "
            "overwrite" % filename_out)

    cdef int WINDOW_SIZE
    cdef int c, r
    cdef void * buffer
    cdef BGZF * fp
    cdef int fd_src
    cdef bint is_empty = True
    cdef int O_RDONLY
    O_RDONLY = os.O_RDONLY

    WINDOW_SIZE = 64 * 1024

    fn = encode_filename(filename_out)
    cdef char *cfn = fn
    with nogil:
        fp = bgzf_open(cfn, "w")
    if fp == NULL:
        raise IOError("could not open '%s' for writing" % filename_out)

    fn = encode_filename(filename_in)
    fd_src = open(fn, O_RDONLY)
    # fixed: POSIX open(2) signals failure with -1, not 0 — descriptor
    # 0 is perfectly valid, so the original `fd_src == 0` test both
    # missed real errors and could misfire on a legitimate fd
    if fd_src < 0:
        bgzf_close(fp)
        raise IOError("could not open '%s' for reading" % filename_in)

    buffer = malloc(WINDOW_SIZE)
    c = 1

    # copy the input in WINDOW_SIZE chunks through the bgzf writer
    while c > 0:
        with nogil:
            c = read(fd_src, buffer, WINDOW_SIZE)
        if c > 0:
            is_empty = False
        r = bgzf_write(fp, buffer, c)
        if r < 0:
            free(buffer)
            # fixed: release the output handle and source descriptor
            # before surfacing the error (previously leaked both)
            bgzf_close(fp)
            close(fd_src)
            raise IOError("writing failed")

    free(buffer)
    r = bgzf_close(fp)
    if r < 0:
        raise IOError("error %i when writing to file %s" % (r, filename_out))

    r = close(fd_src)
    # an empty file will return with -1, thus ignore this.
    if r < 0:
        if not (r == -1 and is_empty):
            raise IOError("error %i when closing file %s" % (r, filename_in))
871 | |
872 | |
873 def tabix_index(filename, | |
874 force=False, | |
875 seq_col=None, | |
876 start_col=None, | |
877 end_col=None, | |
878 preset=None, | |
879 meta_char="#", | |
880 int line_skip=0, | |
881 zerobased=False, | |
882 int min_shift=-1, | |
883 index=None, | |
884 keep_original=False, | |
885 csi=False, | |
886 ): | |
887 '''index tab-separated *filename* using tabix. | |
888 | |
889 An existing index will not be overwritten unless *force* is set. | |
890 | |
891 The index will be built from coordinates in columns *seq_col*, | |
892 *start_col* and *end_col*. | |
893 | |
894 The contents of *filename* have to be sorted by contig and | |
895 position - the method does not check if the file is sorted. | |
896 | |
897 Column indices are 0-based. Note that this is different from the | |
898 tabix command line utility where column indices start at 1. | |
899 | |
900 Coordinates in the file are assumed to be 1-based unless | |
901 *zerobased* is set. | |
902 | |
903 If *preset* is provided, the column coordinates are taken from a | |
904 preset. Valid values for preset are "gff", "bed", "sam", "vcf", | |
905 psltbl", "pileup". | |
906 | |
907 Lines beginning with *meta_char* and the first *line_skip* lines | |
908 will be skipped. | |
909 | |
910 If *filename* is not detected as a gzip file it will be automatically | |
911 compressed. The original file will be removed and only the compressed | |
912 file will be retained. | |
913 | |
914 By default or when *min_shift* is 0, creates a TBI index. If *min_shift* | |
915 is greater than zero and/or *csi* is True, creates a CSI index with a | |
916 minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set). | |
917 | |
918 *index* controls the filename which should be used for creating the index. | |
919 If not set, the default is to append ``.tbi`` to *filename*. | |
920 | |
921 When automatically compressing files, if *keep_original* is set the | |
922 uncompressed file will not be deleted. | |
923 | |
924 returns the filename of the compressed data | |
925 | |
926 ''' | |
927 | |
928 if preset is None and \ | |
929 (seq_col is None or start_col is None or end_col is None): | |
930 raise ValueError( | |
931 "neither preset nor seq_col,start_col and end_col given") | |
932 | |
933 fn = encode_filename(filename) | |
934 cdef char *cfn = fn | |
935 | |
936 cdef htsFile *fp = hts_open(cfn, "r") | |
937 if fp == NULL: | |
938 raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno)))) | |
939 | |
940 cdef htsFormat fmt = hts_get_format(fp)[0] | |
941 hts_close(fp) | |
942 | |
943 if fmt.compression == no_compression: | |
944 tabix_compress(filename, filename + ".gz", force=force) | |
945 if not keep_original: | |
946 os.unlink(filename) | |
947 filename += ".gz" | |
948 fn = encode_filename(filename) | |
949 cfn = fn | |
950 | |
951 # columns (1-based): | |
952 # preset-code, contig, start, end, metachar for | |
953 # comments, lines to ignore at beginning | |
954 # 0 is a missing column | |
955 preset2conf = { | |
956 'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0), | |
957 'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0), | |
958 'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0), | |
959 'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0), | |
960 'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0), | |
961 } | |
962 | |
963 conf_data = None | |
964 if preset == "bcf" or fmt.format == bcf: | |
965 csi = True | |
966 elif preset: | |
967 try: | |
968 conf_data = preset2conf[preset] | |
969 except KeyError: | |
970 raise KeyError( | |
971 "unknown preset '%s', valid presets are '%s'" % | |
972 (preset, ",".join(preset2conf.keys()))) | |
973 else: | |
974 if end_col is None: | |
975 end_col = -1 | |
976 | |
977 preset = 0 | |
978 # tabix internally works with 0-based coordinates and | |
979 # open/closed intervals. When using a preset, conversion is | |
980 # automatically taken care of. Otherwise, the coordinates are | |
981 # assumed to be 1-based closed intervals and -1 is subtracted | |
982 # from the start coordinate. To avoid doing this, set the | |
983 # TI_FLAG_UCSC=0x10000 flag: | |
984 if zerobased: | |
985 preset = preset | TBX_UCSC | |
986 | |
987 conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip) | |
988 | |
989 cdef tbx_conf_t conf | |
990 if conf_data: | |
991 conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data | |
992 | |
993 if csi or min_shift > 0: | |
994 suffix = ".csi" | |
995 if min_shift <= 0: min_shift = 14 | |
996 else: | |
997 suffix = ".tbi" | |
998 min_shift = 0 | |
999 | |
1000 index = index or filename + suffix | |
1001 fn_index = encode_filename(index) | |
1002 | |
1003 if not force and os.path.exists(index): | |
1004 raise IOError( | |
1005 "filename '%s' already exists, use *force* to overwrite" % index) | |
1006 | |
1007 cdef char *fnidx = fn_index | |
1008 cdef int retval = 0 | |
1009 | |
1010 if csi and fmt.format == bcf: | |
1011 with nogil: | |
1012 retval = bcf_index_build2(cfn, fnidx, min_shift) | |
1013 else: | |
1014 with nogil: | |
1015 retval = tbx_index_build2(cfn, fnidx, min_shift, &conf) | |
1016 | |
1017 if retval != 0: | |
1018 raise OSError("building of index for {} failed".format(filename)) | |
1019 | |
1020 return filename | |
1021 | |
1022 # ######################################################### | |
1023 # cdef class tabix_file_iterator_old: | |
1024 # '''iterate over ``infile``. | |
1025 | |
1026 # This iterator is not safe. If the :meth:`__next__()` method is called | |
1027 # after ``infile`` is closed, the result is undefined (see ``fclose()``). | |
1028 | |
1029 # The iterator might either raise a StopIteration or segfault. | |
1030 # ''' | |
1031 | |
1032 | |
1033 # def __cinit__(self, | |
1034 # infile, | |
1035 # Parser parser, | |
1036 # int buffer_size = 65536 ): | |
1037 | |
1038 # cdef int fd = PyObject_AsFileDescriptor( infile ) | |
1039 # if fd == -1: raise ValueError( "I/O operation on closed file." ) | |
1040 # self.infile = fdopen( fd, 'r') | |
1041 | |
1042 # if self.infile == NULL: raise ValueError( "I/O operation on closed file." ) | |
1043 | |
1044 # self.buffer = <char*>malloc( buffer_size ) | |
1045 # self.size = buffer_size | |
1046 # self.parser = parser | |
1047 | |
1048 # def __iter__(self): | |
1049 # return self | |
1050 | |
1051 # cdef __cnext__(self): | |
1052 | |
1053 # cdef char * b | |
1054 # cdef size_t nbytes | |
1055 # b = self.buffer | |
1056 | |
1057 # while not feof( self.infile ): | |
1058 # nbytes = getline( &b, &self.size, self.infile) | |
1059 | |
1060 # # stop at first error or eof | |
1061 # if (nbytes == -1): break | |
1062 # # skip comments | |
1063 # if (b[0] == '#'): continue | |
1064 | |
1065 # # skip empty lines | |
1066 # if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue | |
1067 | |
1068 # # make sure that entry is complete | |
1069 # if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': | |
1070 # result = b | |
1071 # raise ValueError( "incomplete line at %s" % result ) | |
1072 | |
1073 # # make sure that this goes fully through C | |
1074 # # otherwise buffer is copied to/from a | |
1075 # # Python object causing segfaults as | |
1076 # # the wrong memory is freed | |
1077 # return self.parser.parse( b, nbytes ) | |
1078 | |
1079 # raise StopIteration | |
1080 | |
1081 # def __dealloc__(self): | |
1082 # free(self.buffer) | |
1083 | |
1084 # def __next__(self): | |
1085 # return self.__cnext__() | |
1086 | |
1087 ######################################################### | |
1088 ######################################################### | |
1089 ######################################################### | |
1090 ## Iterators for parsing through unindexed files. | |
1091 ######################################################### | |
1092 # cdef buildGzipError(void *gzfp): | |
1093 # cdef int errnum = 0 | |
1094 # cdef char *s = gzerror(gzfp, &errnum) | |
1095 # return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s) | |
1096 | |
1097 | |
cdef class tabix_file_iterator:
    '''iterate over a compressed or uncompressed ``infile``.

    Rows are read line-by-line through htslib's BGZF/kstream layer and
    handed to *parser*, which returns one parsed record per line.
    Lines starting with ``#`` and empty lines are skipped.

    Parameters
    ----------
    infile : file-like object
        An open file object exposing a file descriptor. May be plain
        text or bgzf/gzip compressed; the compression is detected
        automatically by the underlying bgzf reader.
    parser : Parser
        Functor converting a raw C string into a parsed record
        (e.g. :class:`asTuple`).
    buffer_size : int
        Initial size in bytes of the line buffer.

    Raises
    ------
    ValueError
        If *infile* is closed or does not provide a file descriptor.
    IOError
        If the duplicated file descriptor cannot be opened by bgzf.
    '''

    def __cinit__(self,
                  infile,
                  Parser parser,
                  int buffer_size=65536):

        if infile.closed:
            raise ValueError("I/O operation on closed file.")

        self.infile = infile

        cdef int fd = PyObject_AsFileDescriptor(infile)
        if fd == -1:
            raise ValueError("I/O operation on closed file.")

        # Duplicate the descriptor so that closing the bgzf handle in
        # __dealloc__ does not close the caller's file object.
        self.duplicated_fd = dup(fd)

        # From the manual:
        # gzopen can be used to read a file which is not in gzip format;
        # in this case gzread will directly read from the file without decompression.
        # When reading, this will be detected automatically by looking
        # for the magic two-byte gzip header.
        self.fh = bgzf_dopen(self.duplicated_fd, 'r')

        if self.fh == NULL:
            raise IOError('%s' % strerror(errno))

        self.kstream = ks_init(self.fh)

        # NOTE(review): malloc result is not checked for NULL (the check
        # below is commented out), and the kstring capacity field is not
        # set here — presumably ks_getuntil manages growth itself; verify
        # against the htslib kstring API.
        self.buffer.s = <char*>malloc(buffer_size)
        #if self.buffer == NULL:
        #    raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
        #self.size = buffer_size
        self.parser = parser

    def __iter__(self):
        return self

    cdef __cnext__(self):
        # Read lines until one survives the comment/empty filters, then
        # return it parsed; raise StopIteration at EOF or read error.

        cdef char * b
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            # Release the GIL while htslib reads up to the next newline.
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            # retval < 0 signals EOF (or a read error, which is treated
            # the same way here rather than raised).
            if retval < 0:
                break
                #raise IOError('gzip error: %s' % buildGzipError( self.fh ))

            b = self.buffer.s

            # skip comments
            if (b[0] == b'#'):
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # gzgets terminates at \n, no need to test

            # parser creates a copy
            return self.parser.parse(b, self.buffer.l)

        raise StopIteration

    def __dealloc__(self):
        # Free in acquisition-reverse order: line buffer, kstream wrapper,
        # then the bgzf handle (which owns the duplicated descriptor).
        free(self.buffer.s)
        ks_destroy(self.kstream)
        bgzf_close(self.fh)

    def __next__(self):
        return self.__cnext__()
1176 | |
1177 | |
class tabix_generic_iterator:
    '''iterate over ``infile``.

    Permits the use of file-like objects for example from the gzip module.

    Unlike :class:`tabix_file_iterator`, this class reads via the Python
    ``readline()`` protocol rather than a raw file descriptor, so any
    object yielding complete text lines works. Comment lines (``#``) and
    empty lines are skipped; each remaining line is passed to *parser*.
    '''
    def __init__(self, infile, parser):

        self.infile = infile
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")
        self.parser = parser

    def __iter__(self):
        return self

    # cython version - required for python 3
    def __next__(self):

        cdef char * b
        cdef char * cpy
        cdef size_t nbytes

        # The parser decides which encoding the raw line bytes use.
        encoding = self.parser.get_encoding()

        # note that GzipFile.close() does not close the file
        # reading is still possible.
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")

        while 1:

            line = self.infile.readline()
            if not line:
                break

            # b aliases the internal buffer of the bytes object s; s must
            # stay alive until the copy below.
            s = force_bytes(line, encoding)
            b = s
            # NOTE(review): nbytes is the length of *line* (possibly a str),
            # not of the encoded bytes — assumes a one-byte-per-character
            # encoding for the data processed here; TODO confirm for
            # multi-byte encodings.
            nbytes = len(line)
            assert b[nbytes] == b'\0'

            # skip comments
            if b[0] == b'#':
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # make sure that entry is complete
            if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
                raise ValueError("incomplete line at %s" % line)

            # Re-wrap the buffer as a bytes object so the parser receives
            # memory owned by Python, not a borrowed pointer.
            bytes_cpy = <bytes> b
            cpy = <char *> bytes_cpy

            return self.parser(cpy, nbytes)

        raise StopIteration
1236 | |
1237 | |
def tabix_iterator(infile, parser):
    """Return an iterator over all entries in a file.

    Each entry is parsed according to *parser*, a functor turning a raw
    row into a parsed record (see for example :class:`~pysam.asTuple`
    and :class:`~pysam.asGTF`). If *parser* is None, rows are returned
    as unparsed strings.
    """
    iterator = tabix_generic_iterator(infile, parser)
    return iterator
1249 | |
1250 | |
# Backwards-compatibility alias kept so that old code importing
# ``Tabixfile`` keeps working; it adds no behavior of its own.
cdef class Tabixfile(TabixFile):
    """Tabixfile is deprecated: use TabixFile instead"""
    pass
1254 | |
1255 | |
# Public API of this module, as exposed via ``from pysam.libctabix import *``.
__all__ = [
    "tabix_index",
    "tabix_compress",
    "TabixFile",
    "Tabixfile",
    "asTuple",
    "asGTF",
    "asGFF3",
    "asVCF",
    "asBed",
    "GZIterator",
    "GZIteratorHead",
    "tabix_iterator",
    "tabix_generic_iterator",
    "tabix_file_iterator",
]