annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcutils.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # cython: language_level=3
jpayne@68 2 import types
jpayne@68 3 import sys
jpayne@68 4 import string
jpayne@68 5 import re
jpayne@68 6 import tempfile
jpayne@68 7 import os
jpayne@68 8 import io
jpayne@68 9 from contextlib import contextmanager
jpayne@68 10 from codecs import register_error
jpayne@68 11
jpayne@68 12 from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION
jpayne@68 13 from cpython cimport PyBytes_Check, PyUnicode_Check
jpayne@68 14 from cpython cimport array as c_array
jpayne@68 15 from libc.errno cimport errno
jpayne@68 16 from libc.stdlib cimport calloc, free
jpayne@68 17 from libc.string cimport strncpy
jpayne@68 18 from libc.stdint cimport INT32_MAX, int32_t
jpayne@68 19 from libc.stdio cimport fprintf, stderr, fflush
jpayne@68 20 from libc.stdio cimport stdout as c_stdout
jpayne@68 21 from posix.fcntl cimport open as c_open, O_WRONLY, O_CREAT, O_TRUNC
jpayne@68 22 from posix.unistd cimport SEEK_SET, SEEK_CUR, SEEK_END
jpayne@68 23
jpayne@68 24 from pysam.libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \
jpayne@68 25 samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn
jpayne@68 26
jpayne@68 27 from pysam.libcbcftools cimport bcftools_dispatch, bcftools_set_stdout, bcftools_set_stderr, \
jpayne@68 28 bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn
jpayne@68 29
jpayne@68 30 #####################################################################
jpayne@68 31 # hard-coded constants
jpayne@68 32 cdef int MAX_POS = (1 << 31) - 1
jpayne@68 33
jpayne@68 34 #################################################################
jpayne@68 35 # Utility functions for quality string conversions
jpayne@68 36 cpdef c_array.array qualitystring_to_array(input_str, int offset=33):
jpayne@68 37 """convert a qualitystring to an array of quality values."""
jpayne@68 38 if input_str is None:
jpayne@68 39 return None
jpayne@68 40 qs = force_bytes(input_str)
jpayne@68 41 cdef char i
jpayne@68 42 return c_array.array('B', [i - offset for i in qs])
jpayne@68 43
jpayne@68 44
jpayne@68 45 cpdef array_to_qualitystring(c_array.array qualities, int offset=33):
jpayne@68 46 """convert an array of quality values to a string."""
jpayne@68 47 if qualities is None:
jpayne@68 48 return None
jpayne@68 49 cdef int x
jpayne@68 50
jpayne@68 51 cdef c_array.array result
jpayne@68 52 result = c_array.clone(qualities, len(qualities), zero=False)
jpayne@68 53
jpayne@68 54 for x from 0 <= x < len(qualities):
jpayne@68 55 result[x] = qualities[x] + offset
jpayne@68 56 return force_str(result.tobytes())
jpayne@68 57
jpayne@68 58
jpayne@68 59 cpdef qualities_to_qualitystring(qualities, int offset=33):
jpayne@68 60 """convert a list or array of quality scores to the string
jpayne@68 61 representation used in the SAM format.
jpayne@68 62
jpayne@68 63 Parameters
jpayne@68 64 ----------
jpayne@68 65 offset : int
jpayne@68 66 offset to be added to the quality scores to arrive at
jpayne@68 67 the characters of the quality string (default=33).
jpayne@68 68
jpayne@68 69 Returns
jpayne@68 70 -------
jpayne@68 71 string
jpayne@68 72 a quality string
jpayne@68 73
jpayne@68 74 """
jpayne@68 75 cdef char x
jpayne@68 76 if qualities is None:
jpayne@68 77 return None
jpayne@68 78 elif isinstance(qualities, c_array.array):
jpayne@68 79 return array_to_qualitystring(qualities, offset=offset)
jpayne@68 80 else:
jpayne@68 81 # tuples and lists
jpayne@68 82 return force_str("".join([chr(x + offset) for x in qualities]))
jpayne@68 83
jpayne@68 84
jpayne@68 85 ########################################################################
jpayne@68 86 ## String encoding configuration facilities
jpayne@68 87 ########################################################################
jpayne@68 88
jpayne@68 89 # Codec error handler that just interprets each bad byte as ISO-8859-1.
jpayne@68 90 def latin1_replace(exception):
jpayne@68 91 return (chr(exception.object[exception.start]), exception.end)
jpayne@68 92
jpayne@68 93 register_error('pysam.latin1replace', latin1_replace)
jpayne@68 94
jpayne@68 95
jpayne@68 96 cdef str ERROR_HANDLER = 'strict'
jpayne@68 97
jpayne@68 98 cpdef get_encoding_error_handler():
jpayne@68 99 return ERROR_HANDLER
jpayne@68 100
jpayne@68 101 cpdef set_encoding_error_handler(name):
jpayne@68 102 global ERROR_HANDLER
jpayne@68 103 previous = ERROR_HANDLER
jpayne@68 104 ERROR_HANDLER = name
jpayne@68 105 return previous
jpayne@68 106
jpayne@68 107 ########################################################################
jpayne@68 108 ## Python 3 compatibility functions
jpayne@68 109 ########################################################################
jpayne@68 110
jpayne@68 111 cdef from_string_and_size(const char* s, size_t length):
jpayne@68 112 return s[:length].decode('utf-8', ERROR_HANDLER)
jpayne@68 113
jpayne@68 114 # filename encoding (adapted from lxml.etree.pyx)
jpayne@68 115 cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii'
jpayne@68 116 cdef str TEXT_ENCODING = 'utf-8'
jpayne@68 117
jpayne@68 118 cdef bytes encode_filename(object filename):
jpayne@68 119 """Make sure a filename is 8-bit encoded (or None)."""
jpayne@68 120 if filename is None:
jpayne@68 121 return None
jpayne@68 122 return os.fsencode(filename)
jpayne@68 123
jpayne@68 124
jpayne@68 125 cdef bytes force_bytes(object s, encoding=None, errors=None):
jpayne@68 126 """convert string or unicode object to bytes, assuming
jpayne@68 127 utf8 encoding.
jpayne@68 128 """
jpayne@68 129 if s is None:
jpayne@68 130 return None
jpayne@68 131 elif PyBytes_Check(s):
jpayne@68 132 return s
jpayne@68 133 elif PyUnicode_Check(s):
jpayne@68 134 return s.encode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
jpayne@68 135 else:
jpayne@68 136 raise TypeError("Argument must be string, bytes or unicode.")
jpayne@68 137
jpayne@68 138
jpayne@68 139 cdef charptr_to_str(const char* s, encoding=None, errors=None):
jpayne@68 140 if s == NULL:
jpayne@68 141 return None
jpayne@68 142 return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
jpayne@68 143
jpayne@68 144
jpayne@68 145 cdef charptr_to_str_w_len(const char* s, size_t n, encoding=None, errors=None):
jpayne@68 146 if s == NULL:
jpayne@68 147 return None
jpayne@68 148 return s[:n].decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
jpayne@68 149
jpayne@68 150
jpayne@68 151 cdef bytes charptr_to_bytes(const char* s, encoding=None, errors=None):
jpayne@68 152 if s == NULL:
jpayne@68 153 return None
jpayne@68 154 else:
jpayne@68 155 return s
jpayne@68 156
jpayne@68 157
jpayne@68 158 cdef force_str(object s, encoding=None, errors=None):
jpayne@68 159 """Return s converted to str type of current Python
jpayne@68 160 (bytes in Py2, unicode in Py3)"""
jpayne@68 161 if s is None:
jpayne@68 162 return None
jpayne@68 163 if PyBytes_Check(s):
jpayne@68 164 return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
jpayne@68 165 # assume unicode
jpayne@68 166 return s
jpayne@68 167
jpayne@68 168
jpayne@68 169 cdef decode_bytes(bytes s, encoding=None, errors=None):
jpayne@68 170 """Return s converted to current Python's str type,
jpayne@68 171 always decoding even in Python 2"""
jpayne@68 172 if s is None:
jpayne@68 173 return None
jpayne@68 174 else:
jpayne@68 175 return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
jpayne@68 176
jpayne@68 177
jpayne@68 178 cpdef parse_region(contig=None,
jpayne@68 179 start=None,
jpayne@68 180 stop=None,
jpayne@68 181 region=None,
jpayne@68 182 reference=None,
jpayne@68 183 end=None):
jpayne@68 184 """parse alternative ways to specify a genomic region. A region can
jpayne@68 185 either be specified by :term:`reference`, `start` and
jpayne@68 186 `end`. `start` and `end` denote 0-based, half-open intervals.
jpayne@68 187
jpayne@68 188 :term:`reference` and `end` are also accepted for backward
jpayne@68 189 compatibility as synonyms for :term:`contig` and `stop`,
jpayne@68 190 respectively.
jpayne@68 191
jpayne@68 192 Alternatively, a samtools :term:`region` string can be supplied.
jpayne@68 193
jpayne@68 194 If any of the coordinates are missing they will be replaced by the
jpayne@68 195 minimum (`start`) or maximum (`end`) coordinate.
jpayne@68 196
jpayne@68 197 Note that region strings are 1-based, while `start` and `end`
jpayne@68 198 denote an interval in python coordinates.
jpayne@68 199
jpayne@68 200 Returns
jpayne@68 201 -------
jpayne@68 202
jpayne@68 203 tuple : a tuple of `reference`, `start` and `end`.
jpayne@68 204
jpayne@68 205 Raises
jpayne@68 206 ------
jpayne@68 207
jpayne@68 208 ValueError
jpayne@68 209 for invalid or out of bounds regions.
jpayne@68 210
jpayne@68 211 """
jpayne@68 212 cdef int32_t rstart
jpayne@68 213 cdef int32_t rstop
jpayne@68 214
jpayne@68 215
jpayne@68 216 if reference is not None:
jpayne@68 217 if contig is not None:
jpayne@68 218 raise ValueError('contig and reference should not both be specified')
jpayne@68 219 contig = reference
jpayne@68 220
jpayne@68 221 if contig is not None and region is not None:
jpayne@68 222 raise ValueError('contig/reference and region should not both be specified')
jpayne@68 223
jpayne@68 224 if end is not None:
jpayne@68 225 if stop is not None:
jpayne@68 226 raise ValueError('stop and end should not both be specified')
jpayne@68 227 stop = end
jpayne@68 228
jpayne@68 229 if contig is None and region is None:
jpayne@68 230 raise ValueError("neither contig nor region are given")
jpayne@68 231
jpayne@68 232 rstart = 0
jpayne@68 233 rstop = MAX_POS
jpayne@68 234 if start is not None:
jpayne@68 235 try:
jpayne@68 236 rstart = start
jpayne@68 237 except OverflowError:
jpayne@68 238 raise ValueError('start out of range (%i)' % start)
jpayne@68 239
jpayne@68 240 if stop is not None:
jpayne@68 241 try:
jpayne@68 242 rstop = stop
jpayne@68 243 except OverflowError:
jpayne@68 244 raise ValueError('stop out of range (%i)' % stop)
jpayne@68 245
jpayne@68 246 if region:
jpayne@68 247 if ":" in region:
jpayne@68 248 contig, coord = region.split(":")
jpayne@68 249 parts = coord.split("-")
jpayne@68 250 rstart = int(parts[0]) - 1
jpayne@68 251 if len(parts) >= 1:
jpayne@68 252 rstop = int(parts[1])
jpayne@68 253 else:
jpayne@68 254 contig = region
jpayne@68 255
jpayne@68 256 if rstart > rstop:
jpayne@68 257 raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop))
jpayne@68 258 if not 0 <= rstart < MAX_POS:
jpayne@68 259 raise ValueError('start out of range (%i)' % rstart)
jpayne@68 260 if not 0 <= rstop <= MAX_POS:
jpayne@68 261 raise ValueError('stop out of range (%i)' % rstop)
jpayne@68 262
jpayne@68 263 return contig, rstart, rstop
jpayne@68 264
jpayne@68 265
jpayne@68 266 cdef int libc_whence_from_io(int whence):
jpayne@68 267 # io.SEEK_SET/_CUR/_END are by definition 0/1/2 but C/POSIX's equivalents
jpayne@68 268 # have unspecified values. So we must translate, but checking for 0/1/2
jpayne@68 269 # rather than io.SEEK_SET/etc suffices.
jpayne@68 270 if whence == 0: return SEEK_SET
jpayne@68 271 if whence == 1: return SEEK_CUR
jpayne@68 272 if whence == 2: return SEEK_END
jpayne@68 273 return whence # Otherwise likely invalid, but let HTSlib or OS report it
jpayne@68 274
jpayne@68 275
jpayne@68 276 def _pysam_dispatch(collection,
jpayne@68 277 method,
jpayne@68 278 args=None,
jpayne@68 279 catch_stdout=True,
jpayne@68 280 is_usage=False,
jpayne@68 281 save_stdout=None):
jpayne@68 282 '''call ``method`` in samtools/bcftools providing arguments in args.
jpayne@68 283
jpayne@68 284 By default, stdout is redirected to a temporary file using the patched
jpayne@68 285 C sources except for a few commands that have an explicit output option
jpayne@68 286 (typically: -o). In these commands (such as samtools view), this explicit
jpayne@68 287 option is used. If *is_usage* is True, then these explicit output options
jpayne@68 288 will not be used.
jpayne@68 289
jpayne@68 290 Catching of stdout can be turned off by setting *catch_stdout* to
jpayne@68 291 False.
jpayne@68 292 '''
jpayne@68 293
jpayne@68 294 if method == "index" and args:
jpayne@68 295 # We make sure that at least the first specified input file exists,
jpayne@68 296 # and if it doesn't we raise an IOError.
jpayne@68 297 ARGUMENTS = ['-m', '--min-shift', '-o', '--output', '--output-file', '-@', '--threads']
jpayne@68 298 skip_next = False
jpayne@68 299 for arg in args:
jpayne@68 300 if skip_next:
jpayne@68 301 skip_next = False
jpayne@68 302 continue
jpayne@68 303 if arg.startswith('-'):
jpayne@68 304 # Skip next argument for e.g. '--min-shift' '12' or '-m' '12' but not '-m12'
jpayne@68 305 if arg in ARGUMENTS:
jpayne@68 306 skip_next = True
jpayne@68 307 continue
jpayne@68 308 if not os.path.exists(arg):
jpayne@68 309 raise IOError("No such file or directory: '%s'" % arg)
jpayne@68 310 else:
jpayne@68 311 break
jpayne@68 312
jpayne@68 313 if args is None:
jpayne@68 314 args = []
jpayne@68 315 else:
jpayne@68 316 args = list(args)
jpayne@68 317
jpayne@68 318 # redirect stderr to file
jpayne@68 319 stderr_h, stderr_f = tempfile.mkstemp()
jpayne@68 320
jpayne@68 321 # redirect stdout to file
jpayne@68 322 if save_stdout:
jpayne@68 323 stdout_f = save_stdout
jpayne@68 324 stdout_h = c_open(force_bytes(stdout_f),
jpayne@68 325 O_WRONLY|O_CREAT|O_TRUNC, 0666)
jpayne@68 326 if stdout_h == -1:
jpayne@68 327 raise OSError(errno, "error while opening file for writing", stdout_f)
jpayne@68 328
jpayne@68 329 samtools_set_stdout_fn(force_bytes(stdout_f))
jpayne@68 330 bcftools_set_stdout_fn(force_bytes(stdout_f))
jpayne@68 331
jpayne@68 332 elif catch_stdout:
jpayne@68 333 stdout_h, stdout_f = tempfile.mkstemp()
jpayne@68 334 MAP_STDOUT_OPTIONS = {
jpayne@68 335 "samtools": {
jpayne@68 336 "view": "-o {}",
jpayne@68 337 "mpileup": "-o {}",
jpayne@68 338 "depad": "-o {}",
jpayne@68 339 "calmd": "", # uses pysam_stdout_fn
jpayne@68 340 },
jpayne@68 341 "bcftools": {}
jpayne@68 342 }
jpayne@68 343
jpayne@68 344 stdout_option = None
jpayne@68 345 if collection == "bcftools":
jpayne@68 346 # in bcftools, most methods accept -o, the exceptions
jpayne@68 347 # are below:
jpayne@68 348 if method not in ("head", "index", "roh", "stats"):
jpayne@68 349 stdout_option = "-o {}"
jpayne@68 350 elif method in MAP_STDOUT_OPTIONS[collection]:
jpayne@68 351 # special case - samtools view -c outputs on stdout
jpayne@68 352 if not(method == "view" and "-c" in args):
jpayne@68 353 stdout_option = MAP_STDOUT_OPTIONS[collection][method]
jpayne@68 354
jpayne@68 355 if stdout_option is not None and not is_usage:
jpayne@68 356 os.close(stdout_h)
jpayne@68 357 samtools_set_stdout_fn(force_bytes(stdout_f))
jpayne@68 358 bcftools_set_stdout_fn(force_bytes(stdout_f))
jpayne@68 359 args.extend(stdout_option.format(stdout_f).split(" "))
jpayne@68 360 stdout_h = c_open(b"/dev/null", O_WRONLY)
jpayne@68 361 else:
jpayne@68 362 samtools_set_stdout_fn("-")
jpayne@68 363 bcftools_set_stdout_fn("-")
jpayne@68 364 stdout_h = c_open(b"/dev/null", O_WRONLY)
jpayne@68 365
jpayne@68 366 # setup the function call to samtools/bcftools main
jpayne@68 367 cdef char ** cargs
jpayne@68 368 cdef int i, n, retval, l
jpayne@68 369 n = len(args)
jpayne@68 370 method = force_bytes(method)
jpayne@68 371 collection = force_bytes(collection)
jpayne@68 372 args = [force_bytes(a) for a in args]
jpayne@68 373
jpayne@68 374 # allocate two more for first (dummy) argument (contains command)
jpayne@68 375 cdef int extra_args = 0
jpayne@68 376 if method == b"index":
jpayne@68 377 extra_args = 1
jpayne@68 378 # add extra arguments for commands accepting optional arguments
jpayne@68 379 # such as 'samtools index x.bam [out.index]'
jpayne@68 380 cargs = <char**>calloc(n + 2 + extra_args, sizeof(char *))
jpayne@68 381 cargs[0] = collection
jpayne@68 382 cargs[1] = method
jpayne@68 383
jpayne@68 384 # create copies of strings - getopt for long options permutes
jpayne@68 385 # arguments
jpayne@68 386 for i from 0 <= i < n:
jpayne@68 387 l = len(args[i])
jpayne@68 388 cargs[i + 2] = <char *>calloc(l + 1, sizeof(char))
jpayne@68 389 strncpy(cargs[i + 2], args[i], l)
jpayne@68 390
jpayne@68 391 # call samtools/bcftools
jpayne@68 392 if collection == b"samtools":
jpayne@68 393 samtools_set_stdout(stdout_h)
jpayne@68 394 samtools_set_stderr(stderr_h)
jpayne@68 395 retval = samtools_dispatch(n + 2, cargs)
jpayne@68 396 samtools_close_stdout()
jpayne@68 397 samtools_close_stderr()
jpayne@68 398 elif collection == b"bcftools":
jpayne@68 399 bcftools_set_stdout(stdout_h)
jpayne@68 400 bcftools_set_stderr(stderr_h)
jpayne@68 401 retval = bcftools_dispatch(n + 2, cargs)
jpayne@68 402 bcftools_close_stdout()
jpayne@68 403 bcftools_close_stderr()
jpayne@68 404 else:
jpayne@68 405 # unknown -- just return a Unix shell's "command not found" exit status
jpayne@68 406 retval = 127
jpayne@68 407
jpayne@68 408 for i from 0 <= i < n:
jpayne@68 409 free(cargs[i + 2])
jpayne@68 410 free(cargs)
jpayne@68 411
jpayne@68 412 # get error messages
jpayne@68 413 def _collect(fn):
jpayne@68 414 out = []
jpayne@68 415 try:
jpayne@68 416 with open(fn, "r") as inf:
jpayne@68 417 out = inf.read()
jpayne@68 418 except UnicodeDecodeError:
jpayne@68 419 with open(fn, "rb") as inf:
jpayne@68 420 # read binary output
jpayne@68 421 out = inf.read()
jpayne@68 422 finally:
jpayne@68 423 os.remove(fn)
jpayne@68 424 return out
jpayne@68 425
jpayne@68 426 out_stderr = _collect(stderr_f)
jpayne@68 427 if save_stdout:
jpayne@68 428 out_stdout = None
jpayne@68 429 elif catch_stdout:
jpayne@68 430 out_stdout = _collect(stdout_f)
jpayne@68 431 else:
jpayne@68 432 out_stdout = None
jpayne@68 433
jpayne@68 434 return retval, out_stderr, out_stdout
jpayne@68 435
jpayne@68 436
jpayne@68 437 __all__ = [
jpayne@68 438 "qualitystring_to_array",
jpayne@68 439 "array_to_qualitystring",
jpayne@68 440 "qualities_to_qualitystring",
jpayne@68 441 "get_encoding_error_handler",
jpayne@68 442 "set_encoding_error_handler",
jpayne@68 443 ]