Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libchtslib.pyx @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libchtslib.pyx Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,727 @@ +# cython: language_level=3 +# cython: embedsignature=True +# cython: profile=True +# adds doc-strings for sphinx + +######################################################################## +######################################################################## +## Cython cimports +######################################################################## + +from posix.unistd cimport dup +from libc.errno cimport errno +from libc.stdint cimport INT32_MAX +from cpython cimport PyBytes_FromStringAndSize +from pysam.libchtslib cimport * +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len +from pysam.libcutils cimport encode_filename, from_string_and_size, libc_whence_from_io + + +######################################################################## +######################################################################## +## Python imports +######################################################################## + +import os +import io +import re +from warnings import warn + + +######################################################################## +######################################################################## +## Constants +######################################################################## + +__all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile'] + +# maximum genomic coordinace +cdef int MAX_POS = (1 << 31) - 1 + +cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS') +cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI', + 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED') +cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM') + + +######################################################################## +######################################################################## +## Verbosity functions +######################################################################## + + +cpdef set_verbosity(int verbosity): + """Set htslib's hts_verbose global variable to the specified value.""" + return hts_set_verbosity(verbosity) + +cpdef get_verbosity(): + """Return the value of htslib's hts_verbose global variable.""" + return hts_get_verbosity() + + +######################################################################## +######################################################################## +## HFile wrapper class +######################################################################## + +cdef class HFile(object): + cdef hFILE *fp + cdef readonly object name, mode + + def __init__(self, name, mode='r', closefd=True): + self._open(name, mode, closefd=True) + + def __dealloc__(self): + self.close() + + @property + def closed(self): + return self.fp == NULL + + cdef _open(self, name, mode, closefd=True): + self.name = name + self.mode = mode + + mode = force_bytes(mode) + + if isinstance(name, int): + if self.fp != NULL: + name = dup(name) + self.fp = hdopen(name, mode) + else: + name = encode_filename(name) + self.fp = hopen(name, mode) + + if not self.fp: + raise IOError(errno, 'failed to open HFile', self.name) + + def close(self): + if self.fp == NULL: + return + + cdef hFILE *fp = self.fp + self.fp = NULL + + if hclose(fp) != 0: + raise IOError(errno, 'failed to close HFile', self.name) + + def fileno(self): + if self.fp == NULL: + raise IOError('operation on closed HFile') + if isinstance(self.name, int): + return self.name + else: + raise AttributeError('fileno not available') + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.close() + + def __iter__(self): + return self + + def __next__(self): + line = self.readline() + if not line: + raise StopIteration() + return line + + def flush(self): + if self.fp == NULL: + raise IOError('operation on closed HFile') + if hflush(self.fp) != 0: + raise IOError(herrno(self.fp), 'failed to flush HFile', self.name) + + def isatty(self): + if self.fp == NULL: + raise IOError('operation on closed HFile') + return False + + def readable(self): + return self.fp != NULL and 'r' in self.mode + + def read(self, Py_ssize_t size=-1): + if self.fp == NULL: + raise IOError('operation on closed HFile') + + if size == 0: + return b'' + + cdef list parts = [] + cdef bytes part + cdef Py_ssize_t chunk_size, ret, bytes_read = 0 + cdef char *cpart + + while size == -1 or bytes_read < size: + chunk_size = 4096 + if size != -1: + chunk_size = min(chunk_size, size - bytes_read) + + part = PyBytes_FromStringAndSize(NULL, chunk_size) + cpart = <char *>part + ret = hread(self.fp, <void *>cpart, chunk_size) + + if ret < 0: + IOError(herrno(self.fp), 'failed to read HFile', self.name) + elif not ret: + break + + bytes_read += ret + + if ret < chunk_size: + part = cpart[:ret] + + parts.append(part) + + return b''.join(parts) + + def readall(self): + return self.read() + + def readinto(self, buf): + if self.fp == NULL: + raise IOError('operation on closed HFile') + + size = len(buf) + + if size == 0: + return size + + mv = memoryview(buf) + ret = hread(self.fp, <void *>mv, size) + + if ret < 0: + IOError(herrno(self.fp), 'failed to read HFile', self.name) + + return ret + + def readline(self, Py_ssize_t size=-1): + if self.fp == NULL: + raise IOError('operation on closed HFile') + + if size == 0: + return b'' + + cdef list parts = [] + cdef bytes part + cdef Py_ssize_t chunk_size, ret, bytes_read = 0 + cdef char *cpart + + while size == -1 or bytes_read < size: + chunk_size = 4096 + if size != -1: + chunk_size = min(chunk_size, size - bytes_read) + + part = PyBytes_FromStringAndSize(NULL, chunk_size) + cpart = <char *>part + + # Python bytes objects allocate an extra byte for a null terminator + ret = hgetln(cpart, chunk_size+1, self.fp) + + if ret < 0: + IOError(herrno(self.fp), 'failed to read HFile', self.name) + elif not ret: + break + + bytes_read += ret + + if ret < chunk_size: + part = cpart[:ret] + cpart = <char *>part + + parts.append(part) + + if cpart[ret-1] == b'\n': + break + + return b''.join(parts) + + def readlines(self): + return list(self) + + def seek(self, Py_ssize_t offset, int whence=io.SEEK_SET): + if self.fp == NULL: + raise IOError('operation on closed HFile') + + cdef Py_ssize_t off = hseek(self.fp, offset, libc_whence_from_io(whence)) + + if off < 0: + raise IOError(herrno(self.fp), 'seek failed on HFile', self.name) + + return off + + def tell(self): + if self.fp == NULL: + raise IOError('operation on closed HFile') + + ret = htell(self.fp) + + if ret < 0: + raise IOError(herrno(self.fp), 'tell failed on HFile', self.name) + + return ret + + def seekable(self): + return self.fp != NULL + + def truncate(self, size=None): + raise NotImplementedError() + + def writable(self): + return self.fp != NULL and 'w' in self.mode + + def write(self, bytes b): + if self.fp == NULL: + raise IOError('operation on closed HFile') + + got = hwrite(self.fp, <void *>b, len(b)) + + if got < 0: + raise IOError(herrno(self.fp), 'write failed on HFile', self.name) + + return got + + def writelines(self, lines): + for line in lines: + self.write(line) + + +######################################################################## +######################################################################## +## Helpers for backward compatibility to hide the difference between +## boolean properties and methods +######################################################################## + +class CallableValue(object): + def __init__(self, value): + self.value = value + def __call__(self): + return self.value + def __bool__(self): + return self.value + def __nonzero__(self): + return self.value + def __eq__(self, other): + return self.value == other + def __ne__(self, other): + return self.value != other + + +CTrue = CallableValue(True) +CFalse = CallableValue(False) + + +######################################################################## +######################################################################## +## HTSFile wrapper class (base class for AlignmentFile and VariantFile) +######################################################################## + +cdef class HTSFile(object): + """ + Base class for HTS file types + """ + + def __cinit__(self, *args, **kwargs): + self.htsfile = NULL + self.threads = 1 + self.duplicate_filehandle = True + + def close(self): + if self.htsfile: + hts_close(self.htsfile) + self.htsfile = NULL + + def __dealloc__(self): + if self.htsfile: + hts_close(self.htsfile) + self.htsfile = NULL + + def flush(self): + """Flush any buffered data to the underlying output stream.""" + if self.htsfile: + if hts_flush(self.htsfile) < 0: + raise OSError(errno, f'Flushing {type(self).__name__} failed', force_str(self.filename)) + + def check_truncation(self, ignore_truncation=False): + """Check if file is truncated.""" + if not self.htsfile: + return + + if self.htsfile.format.compression != bgzf: + return + + cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile) + if not bgzfp: + return + + cdef int ret = bgzf_check_EOF(bgzfp) + if ret < 0: + raise IOError(errno, 'error checking for EOF marker') + elif ret == 0: + msg = 'no BGZF EOF marker; file may be truncated'.format(self.filename) + if ignore_truncation: + warn(msg) + else: + raise IOError(msg) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + return False + + @property + def category(self): + """General file format category. One of UNKNOWN, ALIGNMENTS, + VARIANTS, INDEX, REGIONS""" + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return FORMAT_CATEGORIES[self.htsfile.format.category] + + @property + def format(self): + """File format. + + One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM, + BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED. + """ + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return FORMATS[self.htsfile.format.format] + + @property + def version(self): + """Tuple of file format version numbers (major, minor)""" + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return self.htsfile.format.version.major, self.htsfile.format.version.minor + + @property + def compression(self): + """File compression. + + One of NONE, GZIP, BGZF, CUSTOM.""" + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return COMPRESSION[self.htsfile.format.compression] + + @property + def description(self): + """Vaguely human readable description of the file format""" + if not self.htsfile: + raise ValueError('metadata not available on closed file') + cdef char *desc = hts_format_description(&self.htsfile.format) + try: + return charptr_to_str(desc) + finally: + free(desc) + + @property + def is_open(self): + """return True if HTSFile is open and in a valid state.""" + return CTrue if self.htsfile != NULL else CFalse + + @property + def is_closed(self): + """return True if HTSFile is closed.""" + return self.htsfile == NULL + + @property + def closed(self): + """return True if HTSFile is closed.""" + return self.htsfile == NULL + + @property + def is_write(self): + """return True if HTSFile is open for writing""" + return self.htsfile != NULL and self.htsfile.is_write != 0 + + @property + def is_read(self): + """return True if HTSFile is open for reading""" + return self.htsfile != NULL and self.htsfile.is_write == 0 + + @property + def is_sam(self): + """return True if HTSFile is reading or writing a SAM alignment file""" + return self.htsfile != NULL and self.htsfile.format.format == sam + + @property + def is_bam(self): + """return True if HTSFile is reading or writing a BAM alignment file""" + return self.htsfile != NULL and self.htsfile.format.format == bam + + @property + def is_cram(self): + """return True if HTSFile is reading or writing a BAM alignment file""" + return self.htsfile != NULL and self.htsfile.format.format == cram + + @property + def is_vcf(self): + """return True if HTSFile is reading or writing a VCF variant file""" + return self.htsfile != NULL and self.htsfile.format.format == vcf + + @property + def is_bcf(self): + """return True if HTSFile is reading or writing a BCF variant file""" + return self.htsfile != NULL and self.htsfile.format.format == bcf + + def reset(self): + """reset file position to beginning of file just after the header. + + Returns + ------- + + The file position after moving the file pointer. : pointer + + """ + return self.seek(self.start_offset) + + def seek(self, uint64_t offset, int whence=io.SEEK_SET): + """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`.""" + if not self.is_open: + raise ValueError('I/O operation on closed file') + if self.is_stream: + raise IOError('seek not available in streams') + + whence = libc_whence_from_io(whence) + + cdef int64_t ret + if self.htsfile.format.compression == bgzf: + with nogil: + ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence) + elif self.htsfile.format.compression == no_compression: + ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1 + else: + raise NotImplementedError("seek not implemented in files compressed by method {}".format( + self.htsfile.format.compression)) + return ret + + def tell(self): + """return current file position, see :meth:`pysam.HTSFile.seek`.""" + if not self.is_open: + raise ValueError('I/O operation on closed file') + if self.is_stream: + raise IOError('tell not available in streams') + + cdef int64_t ret + if self.htsfile.format.compression == bgzf: + with nogil: + ret = bgzf_tell(hts_get_bgzfp(self.htsfile)) + elif self.htsfile.format.compression == no_compression: + ret = htell(self.htsfile.fp.hfile) + elif self.htsfile.format.format == cram: + with nogil: + ret = htell(cram_fd_get_fp(self.htsfile.fp.cram)) + else: + raise NotImplementedError("seek not implemented in files compressed by method {}".format( + self.htsfile.format.compression)) + + return ret + + cdef htsFile *_open_htsfile(self) except? NULL: + cdef char *cfilename + cdef char *cmode = self.mode + cdef int fd, dup_fd, threads + + threads = self.threads - 1 + if isinstance(self.filename, bytes): + cfilename = self.filename + with nogil: + htsfile = hts_open(cfilename, cmode) + if htsfile != NULL: + hts_set_threads(htsfile, threads) + return htsfile + else: + if isinstance(self.filename, int): + fd = self.filename + else: + fd = self.filename.fileno() + + if self.duplicate_filehandle: + dup_fd = dup(fd) + else: + dup_fd = fd + + # Replicate mode normalization done in hts_open_format + smode = self.mode.replace(b'b', b'').replace(b'c', b'') + if b'b' in self.mode: + smode += b'b' + elif b'c' in self.mode: + smode += b'c' + cmode = smode + + hfile = hdopen(dup_fd, cmode) + if hfile == NULL: + raise IOError('Cannot create hfile') + + try: + # filename.name can be an int + filename = str(self.filename.name) + except AttributeError: + filename = '<fd:{}>'.format(fd) + + filename = encode_filename(filename) + cfilename = filename + with nogil: + htsfile = hts_hopen(hfile, cfilename, cmode) + if htsfile != NULL: + hts_set_threads(htsfile, threads) + return htsfile + + def add_hts_options(self, format_options=None): + """Given a list of key=value format option strings, add them to an open htsFile + """ + cdef int rval + cdef hts_opt *opts = NULL + + if format_options: + for format_option in format_options: + rval = hts_opt_add(&opts, format_option) + if rval != 0: + if opts != NULL: + hts_opt_free(opts) + raise RuntimeError('Invalid format option ({}) specified'.format(format_option)) + if opts != NULL: + rval = hts_opt_apply(self.htsfile, opts) + if rval != 0: + hts_opt_free(opts) + raise RuntimeError('An error occurred while applying the requested format options') + hts_opt_free(opts) + + def parse_region(self, contig=None, start=None, stop=None, + region=None, tid=None, + reference=None, end=None): + """parse alternative ways to specify a genomic region. A region can + either be specified by :term:`contig`, `start` and + `stop`. `start` and `stop` denote 0-based, half-open + intervals. :term:`reference` and `end` are also accepted for + backward compatibility as synonyms for :term:`contig` and + `stop`, respectively. + + Alternatively, a samtools :term:`region` string can be + supplied. + + If any of the coordinates are missing they will be replaced by + the minimum (`start`) or maximum (`stop`) coordinate. + + Note that region strings are 1-based inclusive, while `start` + and `stop` denote an interval in 0-based, half-open + coordinates (like BED files and Python slices). + + If `contig` or `region` or are ``*``, unmapped reads at the end + of a BAM file will be returned. Setting either to ``.`` will + iterate from the beginning of the file. + + Returns + ------- + + tuple : + a tuple of `flag`, :term:`tid`, `start` and + `stop`. The flag indicates whether no coordinates were + supplied and the genomic region is the complete genomic space. + + Raises + ------ + + ValueError + for invalid or out of bounds regions. + + """ + cdef int rtid + cdef int32_t rstart + cdef int32_t rstop + + if reference is not None: + if contig is not None: + raise ValueError('contig and reference should not both be specified') + contig = reference + + if end is not None: + if stop is not None: + raise ValueError('stop and end should not both be specified') + stop = end + + if contig is None and tid is None and region is None: + return 0, 0, 0, MAX_POS + + rtid = -1 + rstart = 0 + rstop = MAX_POS + if start is not None: + try: + rstart = start + except OverflowError: + raise ValueError('start out of range (%i)' % start) + + if stop is not None: + try: + rstop = stop + except OverflowError: + raise ValueError('stop out of range (%i)' % stop) + + if region: + region = force_str(region) + if ":" in region: + contig, coord = region.split(":") + parts = coord.split("-") + rstart = int(parts[0]) - 1 + if len(parts) >= 1: + rstop = int(parts[1]) + else: + contig = region + + if tid is not None: + if not self.is_valid_tid(tid): + raise IndexError('invalid tid') + rtid = tid + else: + if contig == "*": + rtid = HTS_IDX_NOCOOR + elif contig == ".": + rtid = HTS_IDX_START + else: + rtid = self.get_tid(contig) + if rtid < 0: + raise ValueError('invalid contig `%s`' % contig) + + if rstart > rstop: + raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop)) + if not 0 <= rstart < MAX_POS: + raise ValueError('start out of range (%i)' % rstart) + if not 0 <= rstop <= MAX_POS: + raise ValueError('stop out of range (%i)' % rstop) + + return 1, rtid, rstart, rstop + + def is_valid_tid(self, tid): + """ + return True if the numerical :term:`tid` is valid; False otherwise. + + returns -1 if contig is not known. + """ + raise NotImplementedError() + + def is_valid_reference_name(self, contig): + """ + return True if the contig name :term:`contig` is valid; False otherwise. + """ + return self.get_tid(contig) != -1 + + def get_tid(self, contig): + """ + return the numerical :term:`tid` corresponding to + :term:`contig` + + returns -1 if contig is not known. + """ + raise NotImplementedError() + + def get_reference_name(self, tid): + """ + return :term:`contig` name corresponding to numerical :term:`tid` + """ + raise NotImplementedError()