comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libchtslib.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 # cython: language_level=3
2 # cython: embedsignature=True
3 # cython: profile=True
4 # adds doc-strings for sphinx
5
6 ########################################################################
7 ########################################################################
8 ## Cython cimports
9 ########################################################################
10
11 from posix.unistd cimport dup
12 from libc.errno cimport errno
13 from libc.stdint cimport INT32_MAX
14 from cpython cimport PyBytes_FromStringAndSize
15 from pysam.libchtslib cimport *
16 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
17 from pysam.libcutils cimport encode_filename, from_string_and_size, libc_whence_from_io
18
19
20 ########################################################################
21 ########################################################################
22 ## Python imports
23 ########################################################################
24
25 import os
26 import io
27 import re
28 from warnings import warn
29
30
31 ########################################################################
32 ########################################################################
33 ## Constants
34 ########################################################################
35
36 __all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile']
37
38 # maximum genomic coordinace
39 cdef int MAX_POS = (1 << 31) - 1
40
41 cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
42 cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
43 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
44 cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
45
46
47 ########################################################################
48 ########################################################################
49 ## Verbosity functions
50 ########################################################################
51
52
53 cpdef set_verbosity(int verbosity):
54 """Set htslib's hts_verbose global variable to the specified value."""
55 return hts_set_verbosity(verbosity)
56
57 cpdef get_verbosity():
58 """Return the value of htslib's hts_verbose global variable."""
59 return hts_get_verbosity()
60
61
62 ########################################################################
63 ########################################################################
64 ## HFile wrapper class
65 ########################################################################
66
67 cdef class HFile(object):
68 cdef hFILE *fp
69 cdef readonly object name, mode
70
71 def __init__(self, name, mode='r', closefd=True):
72 self._open(name, mode, closefd=True)
73
74 def __dealloc__(self):
75 self.close()
76
77 @property
78 def closed(self):
79 return self.fp == NULL
80
81 cdef _open(self, name, mode, closefd=True):
82 self.name = name
83 self.mode = mode
84
85 mode = force_bytes(mode)
86
87 if isinstance(name, int):
88 if self.fp != NULL:
89 name = dup(name)
90 self.fp = hdopen(name, mode)
91 else:
92 name = encode_filename(name)
93 self.fp = hopen(name, mode)
94
95 if not self.fp:
96 raise IOError(errno, 'failed to open HFile', self.name)
97
98 def close(self):
99 if self.fp == NULL:
100 return
101
102 cdef hFILE *fp = self.fp
103 self.fp = NULL
104
105 if hclose(fp) != 0:
106 raise IOError(errno, 'failed to close HFile', self.name)
107
108 def fileno(self):
109 if self.fp == NULL:
110 raise IOError('operation on closed HFile')
111 if isinstance(self.name, int):
112 return self.name
113 else:
114 raise AttributeError('fileno not available')
115
116 def __enter__(self):
117 return self
118
119 def __exit__(self, type, value, tb):
120 self.close()
121
122 def __iter__(self):
123 return self
124
125 def __next__(self):
126 line = self.readline()
127 if not line:
128 raise StopIteration()
129 return line
130
131 def flush(self):
132 if self.fp == NULL:
133 raise IOError('operation on closed HFile')
134 if hflush(self.fp) != 0:
135 raise IOError(herrno(self.fp), 'failed to flush HFile', self.name)
136
137 def isatty(self):
138 if self.fp == NULL:
139 raise IOError('operation on closed HFile')
140 return False
141
142 def readable(self):
143 return self.fp != NULL and 'r' in self.mode
144
145 def read(self, Py_ssize_t size=-1):
146 if self.fp == NULL:
147 raise IOError('operation on closed HFile')
148
149 if size == 0:
150 return b''
151
152 cdef list parts = []
153 cdef bytes part
154 cdef Py_ssize_t chunk_size, ret, bytes_read = 0
155 cdef char *cpart
156
157 while size == -1 or bytes_read < size:
158 chunk_size = 4096
159 if size != -1:
160 chunk_size = min(chunk_size, size - bytes_read)
161
162 part = PyBytes_FromStringAndSize(NULL, chunk_size)
163 cpart = <char *>part
164 ret = hread(self.fp, <void *>cpart, chunk_size)
165
166 if ret < 0:
167 IOError(herrno(self.fp), 'failed to read HFile', self.name)
168 elif not ret:
169 break
170
171 bytes_read += ret
172
173 if ret < chunk_size:
174 part = cpart[:ret]
175
176 parts.append(part)
177
178 return b''.join(parts)
179
180 def readall(self):
181 return self.read()
182
183 def readinto(self, buf):
184 if self.fp == NULL:
185 raise IOError('operation on closed HFile')
186
187 size = len(buf)
188
189 if size == 0:
190 return size
191
192 mv = memoryview(buf)
193 ret = hread(self.fp, <void *>mv, size)
194
195 if ret < 0:
196 IOError(herrno(self.fp), 'failed to read HFile', self.name)
197
198 return ret
199
200 def readline(self, Py_ssize_t size=-1):
201 if self.fp == NULL:
202 raise IOError('operation on closed HFile')
203
204 if size == 0:
205 return b''
206
207 cdef list parts = []
208 cdef bytes part
209 cdef Py_ssize_t chunk_size, ret, bytes_read = 0
210 cdef char *cpart
211
212 while size == -1 or bytes_read < size:
213 chunk_size = 4096
214 if size != -1:
215 chunk_size = min(chunk_size, size - bytes_read)
216
217 part = PyBytes_FromStringAndSize(NULL, chunk_size)
218 cpart = <char *>part
219
220 # Python bytes objects allocate an extra byte for a null terminator
221 ret = hgetln(cpart, chunk_size+1, self.fp)
222
223 if ret < 0:
224 IOError(herrno(self.fp), 'failed to read HFile', self.name)
225 elif not ret:
226 break
227
228 bytes_read += ret
229
230 if ret < chunk_size:
231 part = cpart[:ret]
232 cpart = <char *>part
233
234 parts.append(part)
235
236 if cpart[ret-1] == b'\n':
237 break
238
239 return b''.join(parts)
240
241 def readlines(self):
242 return list(self)
243
244 def seek(self, Py_ssize_t offset, int whence=io.SEEK_SET):
245 if self.fp == NULL:
246 raise IOError('operation on closed HFile')
247
248 cdef Py_ssize_t off = hseek(self.fp, offset, libc_whence_from_io(whence))
249
250 if off < 0:
251 raise IOError(herrno(self.fp), 'seek failed on HFile', self.name)
252
253 return off
254
255 def tell(self):
256 if self.fp == NULL:
257 raise IOError('operation on closed HFile')
258
259 ret = htell(self.fp)
260
261 if ret < 0:
262 raise IOError(herrno(self.fp), 'tell failed on HFile', self.name)
263
264 return ret
265
266 def seekable(self):
267 return self.fp != NULL
268
269 def truncate(self, size=None):
270 raise NotImplementedError()
271
272 def writable(self):
273 return self.fp != NULL and 'w' in self.mode
274
275 def write(self, bytes b):
276 if self.fp == NULL:
277 raise IOError('operation on closed HFile')
278
279 got = hwrite(self.fp, <void *>b, len(b))
280
281 if got < 0:
282 raise IOError(herrno(self.fp), 'write failed on HFile', self.name)
283
284 return got
285
286 def writelines(self, lines):
287 for line in lines:
288 self.write(line)
289
290
291 ########################################################################
292 ########################################################################
293 ## Helpers for backward compatibility to hide the difference between
294 ## boolean properties and methods
295 ########################################################################
296
297 class CallableValue(object):
298 def __init__(self, value):
299 self.value = value
300 def __call__(self):
301 return self.value
302 def __bool__(self):
303 return self.value
304 def __nonzero__(self):
305 return self.value
306 def __eq__(self, other):
307 return self.value == other
308 def __ne__(self, other):
309 return self.value != other
310
311
312 CTrue = CallableValue(True)
313 CFalse = CallableValue(False)
314
315
316 ########################################################################
317 ########################################################################
318 ## HTSFile wrapper class (base class for AlignmentFile and VariantFile)
319 ########################################################################
320
321 cdef class HTSFile(object):
322 """
323 Base class for HTS file types
324 """
325
326 def __cinit__(self, *args, **kwargs):
327 self.htsfile = NULL
328 self.threads = 1
329 self.duplicate_filehandle = True
330
331 def close(self):
332 if self.htsfile:
333 hts_close(self.htsfile)
334 self.htsfile = NULL
335
336 def __dealloc__(self):
337 if self.htsfile:
338 hts_close(self.htsfile)
339 self.htsfile = NULL
340
341 def flush(self):
342 """Flush any buffered data to the underlying output stream."""
343 if self.htsfile:
344 if hts_flush(self.htsfile) < 0:
345 raise OSError(errno, f'Flushing {type(self).__name__} failed', force_str(self.filename))
346
347 def check_truncation(self, ignore_truncation=False):
348 """Check if file is truncated."""
349 if not self.htsfile:
350 return
351
352 if self.htsfile.format.compression != bgzf:
353 return
354
355 cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile)
356 if not bgzfp:
357 return
358
359 cdef int ret = bgzf_check_EOF(bgzfp)
360 if ret < 0:
361 raise IOError(errno, 'error checking for EOF marker')
362 elif ret == 0:
363 msg = 'no BGZF EOF marker; file may be truncated'.format(self.filename)
364 if ignore_truncation:
365 warn(msg)
366 else:
367 raise IOError(msg)
368
369 def __enter__(self):
370 return self
371
372 def __exit__(self, exc_type, exc_value, traceback):
373 self.close()
374 return False
375
376 @property
377 def category(self):
378 """General file format category. One of UNKNOWN, ALIGNMENTS,
379 VARIANTS, INDEX, REGIONS"""
380 if not self.htsfile:
381 raise ValueError('metadata not available on closed file')
382 return FORMAT_CATEGORIES[self.htsfile.format.category]
383
384 @property
385 def format(self):
386 """File format.
387
388 One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
389 BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
390 """
391 if not self.htsfile:
392 raise ValueError('metadata not available on closed file')
393 return FORMATS[self.htsfile.format.format]
394
395 @property
396 def version(self):
397 """Tuple of file format version numbers (major, minor)"""
398 if not self.htsfile:
399 raise ValueError('metadata not available on closed file')
400 return self.htsfile.format.version.major, self.htsfile.format.version.minor
401
402 @property
403 def compression(self):
404 """File compression.
405
406 One of NONE, GZIP, BGZF, CUSTOM."""
407 if not self.htsfile:
408 raise ValueError('metadata not available on closed file')
409 return COMPRESSION[self.htsfile.format.compression]
410
411 @property
412 def description(self):
413 """Vaguely human readable description of the file format"""
414 if not self.htsfile:
415 raise ValueError('metadata not available on closed file')
416 cdef char *desc = hts_format_description(&self.htsfile.format)
417 try:
418 return charptr_to_str(desc)
419 finally:
420 free(desc)
421
422 @property
423 def is_open(self):
424 """return True if HTSFile is open and in a valid state."""
425 return CTrue if self.htsfile != NULL else CFalse
426
427 @property
428 def is_closed(self):
429 """return True if HTSFile is closed."""
430 return self.htsfile == NULL
431
432 @property
433 def closed(self):
434 """return True if HTSFile is closed."""
435 return self.htsfile == NULL
436
437 @property
438 def is_write(self):
439 """return True if HTSFile is open for writing"""
440 return self.htsfile != NULL and self.htsfile.is_write != 0
441
442 @property
443 def is_read(self):
444 """return True if HTSFile is open for reading"""
445 return self.htsfile != NULL and self.htsfile.is_write == 0
446
447 @property
448 def is_sam(self):
449 """return True if HTSFile is reading or writing a SAM alignment file"""
450 return self.htsfile != NULL and self.htsfile.format.format == sam
451
452 @property
453 def is_bam(self):
454 """return True if HTSFile is reading or writing a BAM alignment file"""
455 return self.htsfile != NULL and self.htsfile.format.format == bam
456
457 @property
458 def is_cram(self):
459 """return True if HTSFile is reading or writing a BAM alignment file"""
460 return self.htsfile != NULL and self.htsfile.format.format == cram
461
462 @property
463 def is_vcf(self):
464 """return True if HTSFile is reading or writing a VCF variant file"""
465 return self.htsfile != NULL and self.htsfile.format.format == vcf
466
467 @property
468 def is_bcf(self):
469 """return True if HTSFile is reading or writing a BCF variant file"""
470 return self.htsfile != NULL and self.htsfile.format.format == bcf
471
472 def reset(self):
473 """reset file position to beginning of file just after the header.
474
475 Returns
476 -------
477
478 The file position after moving the file pointer. : pointer
479
480 """
481 return self.seek(self.start_offset)
482
483 def seek(self, uint64_t offset, int whence=io.SEEK_SET):
484 """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`."""
485 if not self.is_open:
486 raise ValueError('I/O operation on closed file')
487 if self.is_stream:
488 raise IOError('seek not available in streams')
489
490 whence = libc_whence_from_io(whence)
491
492 cdef int64_t ret
493 if self.htsfile.format.compression == bgzf:
494 with nogil:
495 ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence)
496 elif self.htsfile.format.compression == no_compression:
497 ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1
498 else:
499 raise NotImplementedError("seek not implemented in files compressed by method {}".format(
500 self.htsfile.format.compression))
501 return ret
502
503 def tell(self):
504 """return current file position, see :meth:`pysam.HTSFile.seek`."""
505 if not self.is_open:
506 raise ValueError('I/O operation on closed file')
507 if self.is_stream:
508 raise IOError('tell not available in streams')
509
510 cdef int64_t ret
511 if self.htsfile.format.compression == bgzf:
512 with nogil:
513 ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
514 elif self.htsfile.format.compression == no_compression:
515 ret = htell(self.htsfile.fp.hfile)
516 elif self.htsfile.format.format == cram:
517 with nogil:
518 ret = htell(cram_fd_get_fp(self.htsfile.fp.cram))
519 else:
520 raise NotImplementedError("seek not implemented in files compressed by method {}".format(
521 self.htsfile.format.compression))
522
523 return ret
524
525 cdef htsFile *_open_htsfile(self) except? NULL:
526 cdef char *cfilename
527 cdef char *cmode = self.mode
528 cdef int fd, dup_fd, threads
529
530 threads = self.threads - 1
531 if isinstance(self.filename, bytes):
532 cfilename = self.filename
533 with nogil:
534 htsfile = hts_open(cfilename, cmode)
535 if htsfile != NULL:
536 hts_set_threads(htsfile, threads)
537 return htsfile
538 else:
539 if isinstance(self.filename, int):
540 fd = self.filename
541 else:
542 fd = self.filename.fileno()
543
544 if self.duplicate_filehandle:
545 dup_fd = dup(fd)
546 else:
547 dup_fd = fd
548
549 # Replicate mode normalization done in hts_open_format
550 smode = self.mode.replace(b'b', b'').replace(b'c', b'')
551 if b'b' in self.mode:
552 smode += b'b'
553 elif b'c' in self.mode:
554 smode += b'c'
555 cmode = smode
556
557 hfile = hdopen(dup_fd, cmode)
558 if hfile == NULL:
559 raise IOError('Cannot create hfile')
560
561 try:
562 # filename.name can be an int
563 filename = str(self.filename.name)
564 except AttributeError:
565 filename = '<fd:{}>'.format(fd)
566
567 filename = encode_filename(filename)
568 cfilename = filename
569 with nogil:
570 htsfile = hts_hopen(hfile, cfilename, cmode)
571 if htsfile != NULL:
572 hts_set_threads(htsfile, threads)
573 return htsfile
574
575 def add_hts_options(self, format_options=None):
576 """Given a list of key=value format option strings, add them to an open htsFile
577 """
578 cdef int rval
579 cdef hts_opt *opts = NULL
580
581 if format_options:
582 for format_option in format_options:
583 rval = hts_opt_add(&opts, format_option)
584 if rval != 0:
585 if opts != NULL:
586 hts_opt_free(opts)
587 raise RuntimeError('Invalid format option ({}) specified'.format(format_option))
588 if opts != NULL:
589 rval = hts_opt_apply(self.htsfile, opts)
590 if rval != 0:
591 hts_opt_free(opts)
592 raise RuntimeError('An error occurred while applying the requested format options')
593 hts_opt_free(opts)
594
595 def parse_region(self, contig=None, start=None, stop=None,
596 region=None, tid=None,
597 reference=None, end=None):
598 """parse alternative ways to specify a genomic region. A region can
599 either be specified by :term:`contig`, `start` and
600 `stop`. `start` and `stop` denote 0-based, half-open
601 intervals. :term:`reference` and `end` are also accepted for
602 backward compatibility as synonyms for :term:`contig` and
603 `stop`, respectively.
604
605 Alternatively, a samtools :term:`region` string can be
606 supplied.
607
608 If any of the coordinates are missing they will be replaced by
609 the minimum (`start`) or maximum (`stop`) coordinate.
610
611 Note that region strings are 1-based inclusive, while `start`
612 and `stop` denote an interval in 0-based, half-open
613 coordinates (like BED files and Python slices).
614
615 If `contig` or `region` or are ``*``, unmapped reads at the end
616 of a BAM file will be returned. Setting either to ``.`` will
617 iterate from the beginning of the file.
618
619 Returns
620 -------
621
622 tuple :
623 a tuple of `flag`, :term:`tid`, `start` and
624 `stop`. The flag indicates whether no coordinates were
625 supplied and the genomic region is the complete genomic space.
626
627 Raises
628 ------
629
630 ValueError
631 for invalid or out of bounds regions.
632
633 """
634 cdef int rtid
635 cdef int32_t rstart
636 cdef int32_t rstop
637
638 if reference is not None:
639 if contig is not None:
640 raise ValueError('contig and reference should not both be specified')
641 contig = reference
642
643 if end is not None:
644 if stop is not None:
645 raise ValueError('stop and end should not both be specified')
646 stop = end
647
648 if contig is None and tid is None and region is None:
649 return 0, 0, 0, MAX_POS
650
651 rtid = -1
652 rstart = 0
653 rstop = MAX_POS
654 if start is not None:
655 try:
656 rstart = start
657 except OverflowError:
658 raise ValueError('start out of range (%i)' % start)
659
660 if stop is not None:
661 try:
662 rstop = stop
663 except OverflowError:
664 raise ValueError('stop out of range (%i)' % stop)
665
666 if region:
667 region = force_str(region)
668 if ":" in region:
669 contig, coord = region.split(":")
670 parts = coord.split("-")
671 rstart = int(parts[0]) - 1
672 if len(parts) >= 1:
673 rstop = int(parts[1])
674 else:
675 contig = region
676
677 if tid is not None:
678 if not self.is_valid_tid(tid):
679 raise IndexError('invalid tid')
680 rtid = tid
681 else:
682 if contig == "*":
683 rtid = HTS_IDX_NOCOOR
684 elif contig == ".":
685 rtid = HTS_IDX_START
686 else:
687 rtid = self.get_tid(contig)
688 if rtid < 0:
689 raise ValueError('invalid contig `%s`' % contig)
690
691 if rstart > rstop:
692 raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop))
693 if not 0 <= rstart < MAX_POS:
694 raise ValueError('start out of range (%i)' % rstart)
695 if not 0 <= rstop <= MAX_POS:
696 raise ValueError('stop out of range (%i)' % rstop)
697
698 return 1, rtid, rstart, rstop
699
700 def is_valid_tid(self, tid):
701 """
702 return True if the numerical :term:`tid` is valid; False otherwise.
703
704 returns -1 if contig is not known.
705 """
706 raise NotImplementedError()
707
708 def is_valid_reference_name(self, contig):
709 """
710 return True if the contig name :term:`contig` is valid; False otherwise.
711 """
712 return self.get_tid(contig) != -1
713
714 def get_tid(self, contig):
715 """
716 return the numerical :term:`tid` corresponding to
717 :term:`contig`
718
719 returns -1 if contig is not known.
720 """
721 raise NotImplementedError()
722
723 def get_reference_name(self, tid):
724 """
725 return :term:`contig` name corresponding to numerical :term:`tid`
726 """
727 raise NotImplementedError()