jpayne@68: # cython: language_level=3 jpayne@68: import types jpayne@68: import sys jpayne@68: import string jpayne@68: import re jpayne@68: import tempfile jpayne@68: import os jpayne@68: import io jpayne@68: from contextlib import contextmanager jpayne@68: from codecs import register_error jpayne@68: jpayne@68: from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION jpayne@68: from cpython cimport PyBytes_Check, PyUnicode_Check jpayne@68: from cpython cimport array as c_array jpayne@68: from libc.errno cimport errno jpayne@68: from libc.stdlib cimport calloc, free jpayne@68: from libc.string cimport strncpy jpayne@68: from libc.stdint cimport INT32_MAX, int32_t jpayne@68: from libc.stdio cimport fprintf, stderr, fflush jpayne@68: from libc.stdio cimport stdout as c_stdout jpayne@68: from posix.fcntl cimport open as c_open, O_WRONLY, O_CREAT, O_TRUNC jpayne@68: from posix.unistd cimport SEEK_SET, SEEK_CUR, SEEK_END jpayne@68: jpayne@68: from pysam.libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \ jpayne@68: samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn jpayne@68: jpayne@68: from pysam.libcbcftools cimport bcftools_dispatch, bcftools_set_stdout, bcftools_set_stderr, \ jpayne@68: bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn jpayne@68: jpayne@68: ##################################################################### jpayne@68: # hard-coded constants jpayne@68: cdef int MAX_POS = (1 << 31) - 1 jpayne@68: jpayne@68: ################################################################# jpayne@68: # Utility functions for quality string conversions jpayne@68: cpdef c_array.array qualitystring_to_array(input_str, int offset=33): jpayne@68: """convert a qualitystring to an array of quality values.""" jpayne@68: if input_str is None: jpayne@68: return None jpayne@68: qs = force_bytes(input_str) jpayne@68: cdef char i jpayne@68: return c_array.array('B', [i - offset for i in qs]) jpayne@68: jpayne@68: jpayne@68: cpdef array_to_qualitystring(c_array.array qualities, int offset=33): jpayne@68: """convert an array of quality values to a string.""" jpayne@68: if qualities is None: jpayne@68: return None jpayne@68: cdef int x jpayne@68: jpayne@68: cdef c_array.array result jpayne@68: result = c_array.clone(qualities, len(qualities), zero=False) jpayne@68: jpayne@68: for x from 0 <= x < len(qualities): jpayne@68: result[x] = qualities[x] + offset jpayne@68: return force_str(result.tobytes()) jpayne@68: jpayne@68: jpayne@68: cpdef qualities_to_qualitystring(qualities, int offset=33): jpayne@68: """convert a list or array of quality scores to the string jpayne@68: representation used in the SAM format. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: offset : int jpayne@68: offset to be added to the quality scores to arrive at jpayne@68: the characters of the quality string (default=33). jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: string jpayne@68: a quality string jpayne@68: jpayne@68: """ jpayne@68: cdef char x jpayne@68: if qualities is None: jpayne@68: return None jpayne@68: elif isinstance(qualities, c_array.array): jpayne@68: return array_to_qualitystring(qualities, offset=offset) jpayne@68: else: jpayne@68: # tuples and lists jpayne@68: return force_str("".join([chr(x + offset) for x in qualities])) jpayne@68: jpayne@68: jpayne@68: ######################################################################## jpayne@68: ## String encoding configuration facilities jpayne@68: ######################################################################## jpayne@68: jpayne@68: # Codec error handler that just interprets each bad byte as ISO-8859-1. jpayne@68: def latin1_replace(exception): jpayne@68: return (chr(exception.object[exception.start]), exception.end) jpayne@68: jpayne@68: register_error('pysam.latin1replace', latin1_replace) jpayne@68: jpayne@68: jpayne@68: cdef str ERROR_HANDLER = 'strict' jpayne@68: jpayne@68: cpdef get_encoding_error_handler(): jpayne@68: return ERROR_HANDLER jpayne@68: jpayne@68: cpdef set_encoding_error_handler(name): jpayne@68: global ERROR_HANDLER jpayne@68: previous = ERROR_HANDLER jpayne@68: ERROR_HANDLER = name jpayne@68: return previous jpayne@68: jpayne@68: ######################################################################## jpayne@68: ## Python 3 compatibility functions jpayne@68: ######################################################################## jpayne@68: jpayne@68: cdef from_string_and_size(const char* s, size_t length): jpayne@68: return s[:length].decode('utf-8', ERROR_HANDLER) jpayne@68: jpayne@68: # filename encoding (adapted from lxml.etree.pyx) jpayne@68: cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii' jpayne@68: cdef str TEXT_ENCODING = 'utf-8' jpayne@68: jpayne@68: cdef bytes encode_filename(object filename): jpayne@68: """Make sure a filename is 8-bit encoded (or None).""" jpayne@68: if filename is None: jpayne@68: return None jpayne@68: return os.fsencode(filename) jpayne@68: jpayne@68: jpayne@68: cdef bytes force_bytes(object s, encoding=None, errors=None): jpayne@68: """convert string or unicode object to bytes, assuming jpayne@68: utf8 encoding. jpayne@68: """ jpayne@68: if s is None: jpayne@68: return None jpayne@68: elif PyBytes_Check(s): jpayne@68: return s jpayne@68: elif PyUnicode_Check(s): jpayne@68: return s.encode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) jpayne@68: else: jpayne@68: raise TypeError("Argument must be string, bytes or unicode.") jpayne@68: jpayne@68: jpayne@68: cdef charptr_to_str(const char* s, encoding=None, errors=None): jpayne@68: if s == NULL: jpayne@68: return None jpayne@68: return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) jpayne@68: jpayne@68: jpayne@68: cdef charptr_to_str_w_len(const char* s, size_t n, encoding=None, errors=None): jpayne@68: if s == NULL: jpayne@68: return None jpayne@68: return s[:n].decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) jpayne@68: jpayne@68: jpayne@68: cdef bytes charptr_to_bytes(const char* s, encoding=None, errors=None): jpayne@68: if s == NULL: jpayne@68: return None jpayne@68: else: jpayne@68: return s jpayne@68: jpayne@68: jpayne@68: cdef force_str(object s, encoding=None, errors=None): jpayne@68: """Return s converted to str type of current Python jpayne@68: (bytes in Py2, unicode in Py3)""" jpayne@68: if s is None: jpayne@68: return None jpayne@68: if PyBytes_Check(s): jpayne@68: return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) jpayne@68: # assume unicode jpayne@68: return s jpayne@68: jpayne@68: jpayne@68: cdef decode_bytes(bytes s, encoding=None, errors=None): jpayne@68: """Return s converted to current Python's str type, jpayne@68: always decoding even in Python 2""" jpayne@68: if s is None: jpayne@68: return None jpayne@68: else: jpayne@68: return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) jpayne@68: jpayne@68: jpayne@68: cpdef parse_region(contig=None, jpayne@68: start=None, jpayne@68: stop=None, jpayne@68: region=None, jpayne@68: reference=None, jpayne@68: end=None): jpayne@68: """parse alternative ways to specify a genomic region. A region can jpayne@68: either be specified by :term:`reference`, `start` and jpayne@68: `end`. `start` and `end` denote 0-based, half-open intervals. jpayne@68: jpayne@68: :term:`reference` and `end` are also accepted for backward jpayne@68: compatibility as synonyms for :term:`contig` and `stop`, jpayne@68: respectively. jpayne@68: jpayne@68: Alternatively, a samtools :term:`region` string can be supplied. jpayne@68: jpayne@68: If any of the coordinates are missing they will be replaced by the jpayne@68: minimum (`start`) or maximum (`end`) coordinate. jpayne@68: jpayne@68: Note that region strings are 1-based, while `start` and `end` jpayne@68: denote an interval in python coordinates. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: jpayne@68: tuple : a tuple of `reference`, `start` and `end`. jpayne@68: jpayne@68: Raises jpayne@68: ------ jpayne@68: jpayne@68: ValueError jpayne@68: for invalid or out of bounds regions. jpayne@68: jpayne@68: """ jpayne@68: cdef int32_t rstart jpayne@68: cdef int32_t rstop jpayne@68: jpayne@68: jpayne@68: if reference is not None: jpayne@68: if contig is not None: jpayne@68: raise ValueError('contig and reference should not both be specified') jpayne@68: contig = reference jpayne@68: jpayne@68: if contig is not None and region is not None: jpayne@68: raise ValueError('contig/reference and region should not both be specified') jpayne@68: jpayne@68: if end is not None: jpayne@68: if stop is not None: jpayne@68: raise ValueError('stop and end should not both be specified') jpayne@68: stop = end jpayne@68: jpayne@68: if contig is None and region is None: jpayne@68: raise ValueError("neither contig nor region are given") jpayne@68: jpayne@68: rstart = 0 jpayne@68: rstop = MAX_POS jpayne@68: if start is not None: jpayne@68: try: jpayne@68: rstart = start jpayne@68: except OverflowError: jpayne@68: raise ValueError('start out of range (%i)' % start) jpayne@68: jpayne@68: if stop is not None: jpayne@68: try: jpayne@68: rstop = stop jpayne@68: except OverflowError: jpayne@68: raise ValueError('stop out of range (%i)' % stop) jpayne@68: jpayne@68: if region: jpayne@68: if ":" in region: jpayne@68: contig, coord = region.split(":") jpayne@68: parts = coord.split("-") jpayne@68: rstart = int(parts[0]) - 1 jpayne@68: if len(parts) >= 1: jpayne@68: rstop = int(parts[1]) jpayne@68: else: jpayne@68: contig = region jpayne@68: jpayne@68: if rstart > rstop: jpayne@68: raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop)) jpayne@68: if not 0 <= rstart < MAX_POS: jpayne@68: raise ValueError('start out of range (%i)' % rstart) jpayne@68: if not 0 <= rstop <= MAX_POS: jpayne@68: raise ValueError('stop out of range (%i)' % rstop) jpayne@68: jpayne@68: return contig, rstart, rstop jpayne@68: jpayne@68: jpayne@68: cdef int libc_whence_from_io(int whence): jpayne@68: # io.SEEK_SET/_CUR/_END are by definition 0/1/2 but C/POSIX's equivalents jpayne@68: # have unspecified values. So we must translate, but checking for 0/1/2 jpayne@68: # rather than io.SEEK_SET/etc suffices. jpayne@68: if whence == 0: return SEEK_SET jpayne@68: if whence == 1: return SEEK_CUR jpayne@68: if whence == 2: return SEEK_END jpayne@68: return whence # Otherwise likely invalid, but let HTSlib or OS report it jpayne@68: jpayne@68: jpayne@68: def _pysam_dispatch(collection, jpayne@68: method, jpayne@68: args=None, jpayne@68: catch_stdout=True, jpayne@68: is_usage=False, jpayne@68: save_stdout=None): jpayne@68: '''call ``method`` in samtools/bcftools providing arguments in args. jpayne@68: jpayne@68: By default, stdout is redirected to a temporary file using the patched jpayne@68: C sources except for a few commands that have an explicit output option jpayne@68: (typically: -o). In these commands (such as samtools view), this explicit jpayne@68: option is used. If *is_usage* is True, then these explicit output options jpayne@68: will not be used. jpayne@68: jpayne@68: Catching of stdout can be turned off by setting *catch_stdout* to jpayne@68: False. jpayne@68: ''' jpayne@68: jpayne@68: if method == "index" and args: jpayne@68: # We make sure that at least the first specified input file exists, jpayne@68: # and if it doesn't we raise an IOError. jpayne@68: ARGUMENTS = ['-m', '--min-shift', '-o', '--output', '--output-file', '-@', '--threads'] jpayne@68: skip_next = False jpayne@68: for arg in args: jpayne@68: if skip_next: jpayne@68: skip_next = False jpayne@68: continue jpayne@68: if arg.startswith('-'): jpayne@68: # Skip next argument for e.g. '--min-shift' '12' or '-m' '12' but not '-m12' jpayne@68: if arg in ARGUMENTS: jpayne@68: skip_next = True jpayne@68: continue jpayne@68: if not os.path.exists(arg): jpayne@68: raise IOError("No such file or directory: '%s'" % arg) jpayne@68: else: jpayne@68: break jpayne@68: jpayne@68: if args is None: jpayne@68: args = [] jpayne@68: else: jpayne@68: args = list(args) jpayne@68: jpayne@68: # redirect stderr to file jpayne@68: stderr_h, stderr_f = tempfile.mkstemp() jpayne@68: jpayne@68: # redirect stdout to file jpayne@68: if save_stdout: jpayne@68: stdout_f = save_stdout jpayne@68: stdout_h = c_open(force_bytes(stdout_f), jpayne@68: O_WRONLY|O_CREAT|O_TRUNC, 0666) jpayne@68: if stdout_h == -1: jpayne@68: raise OSError(errno, "error while opening file for writing", stdout_f) jpayne@68: jpayne@68: samtools_set_stdout_fn(force_bytes(stdout_f)) jpayne@68: bcftools_set_stdout_fn(force_bytes(stdout_f)) jpayne@68: jpayne@68: elif catch_stdout: jpayne@68: stdout_h, stdout_f = tempfile.mkstemp() jpayne@68: MAP_STDOUT_OPTIONS = { jpayne@68: "samtools": { jpayne@68: "view": "-o {}", jpayne@68: "mpileup": "-o {}", jpayne@68: "depad": "-o {}", jpayne@68: "calmd": "", # uses pysam_stdout_fn jpayne@68: }, jpayne@68: "bcftools": {} jpayne@68: } jpayne@68: jpayne@68: stdout_option = None jpayne@68: if collection == "bcftools": jpayne@68: # in bcftools, most methods accept -o, the exceptions jpayne@68: # are below: jpayne@68: if method not in ("head", "index", "roh", "stats"): jpayne@68: stdout_option = "-o {}" jpayne@68: elif method in MAP_STDOUT_OPTIONS[collection]: jpayne@68: # special case - samtools view -c outputs on stdout jpayne@68: if not(method == "view" and "-c" in args): jpayne@68: stdout_option = MAP_STDOUT_OPTIONS[collection][method] jpayne@68: jpayne@68: if stdout_option is not None and not is_usage: jpayne@68: os.close(stdout_h) jpayne@68: samtools_set_stdout_fn(force_bytes(stdout_f)) jpayne@68: bcftools_set_stdout_fn(force_bytes(stdout_f)) jpayne@68: args.extend(stdout_option.format(stdout_f).split(" ")) jpayne@68: stdout_h = c_open(b"/dev/null", O_WRONLY) jpayne@68: else: jpayne@68: samtools_set_stdout_fn("-") jpayne@68: bcftools_set_stdout_fn("-") jpayne@68: stdout_h = c_open(b"/dev/null", O_WRONLY) jpayne@68: jpayne@68: # setup the function call to samtools/bcftools main jpayne@68: cdef char ** cargs jpayne@68: cdef int i, n, retval, l jpayne@68: n = len(args) jpayne@68: method = force_bytes(method) jpayne@68: collection = force_bytes(collection) jpayne@68: args = [force_bytes(a) for a in args] jpayne@68: jpayne@68: # allocate two more for first (dummy) argument (contains command) jpayne@68: cdef int extra_args = 0 jpayne@68: if method == b"index": jpayne@68: extra_args = 1 jpayne@68: # add extra arguments for commands accepting optional arguments jpayne@68: # such as 'samtools index x.bam [out.index]' jpayne@68: cargs = calloc(n + 2 + extra_args, sizeof(char *)) jpayne@68: cargs[0] = collection jpayne@68: cargs[1] = method jpayne@68: jpayne@68: # create copies of strings - getopt for long options permutes jpayne@68: # arguments jpayne@68: for i from 0 <= i < n: jpayne@68: l = len(args[i]) jpayne@68: cargs[i + 2] = calloc(l + 1, sizeof(char)) jpayne@68: strncpy(cargs[i + 2], args[i], l) jpayne@68: jpayne@68: # call samtools/bcftools jpayne@68: if collection == b"samtools": jpayne@68: samtools_set_stdout(stdout_h) jpayne@68: samtools_set_stderr(stderr_h) jpayne@68: retval = samtools_dispatch(n + 2, cargs) jpayne@68: samtools_close_stdout() jpayne@68: samtools_close_stderr() jpayne@68: elif collection == b"bcftools": jpayne@68: bcftools_set_stdout(stdout_h) jpayne@68: bcftools_set_stderr(stderr_h) jpayne@68: retval = bcftools_dispatch(n + 2, cargs) jpayne@68: bcftools_close_stdout() jpayne@68: bcftools_close_stderr() jpayne@68: else: jpayne@68: # unknown -- just return a Unix shell's "command not found" exit status jpayne@68: retval = 127 jpayne@68: jpayne@68: for i from 0 <= i < n: jpayne@68: free(cargs[i + 2]) jpayne@68: free(cargs) jpayne@68: jpayne@68: # get error messages jpayne@68: def _collect(fn): jpayne@68: out = [] jpayne@68: try: jpayne@68: with open(fn, "r") as inf: jpayne@68: out = inf.read() jpayne@68: except UnicodeDecodeError: jpayne@68: with open(fn, "rb") as inf: jpayne@68: # read binary output jpayne@68: out = inf.read() jpayne@68: finally: jpayne@68: os.remove(fn) jpayne@68: return out jpayne@68: jpayne@68: out_stderr = _collect(stderr_f) jpayne@68: if save_stdout: jpayne@68: out_stdout = None jpayne@68: elif catch_stdout: jpayne@68: out_stdout = _collect(stdout_f) jpayne@68: else: jpayne@68: out_stdout = None jpayne@68: jpayne@68: return retval, out_stderr, out_stdout jpayne@68: jpayne@68: jpayne@68: __all__ = [ jpayne@68: "qualitystring_to_array", jpayne@68: "array_to_qualitystring", jpayne@68: "qualities_to_qualitystring", jpayne@68: "get_encoding_error_handler", jpayne@68: "set_encoding_error_handler", jpayne@68: ]