annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignedsegment.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # cython: language_level=3
jpayne@68 2 # cython: embedsignature=True
jpayne@68 3 # cython: profile=True
jpayne@68 4 ###############################################################################
jpayne@68 5 ###############################################################################
jpayne@68 6 # Cython wrapper for SAM/BAM/CRAM files based on htslib
jpayne@68 7 ###############################################################################
jpayne@68 8 # The principal classes defined in this module are:
jpayne@68 9 #
jpayne@68 10 # class AlignedSegment an aligned segment (read)
jpayne@68 11 #
jpayne@68 12 # class PileupColumn a collection of segments (PileupRead) aligned to
jpayne@68 13 # a particular genomic position.
jpayne@68 14 #
jpayne@68 15 # class PileupRead an AlignedSegment aligned to a particular genomic
jpayne@68 16 # position. Contains additional attributes with respect
jpayne@68 17 # to this.
jpayne@68 18 #
jpayne@68 19 # Additionally this module defines numerous additional classes that are part
jpayne@68 20 # of the internal API. These are:
jpayne@68 21 #
jpayne@68 22 # Various iterator classes to iterate over alignments in sequential (IteratorRow)
jpayne@68 23 # or in a stacked fashion (IteratorColumn):
jpayne@68 24 #
jpayne@68 25 # class IteratorRow
jpayne@68 26 # class IteratorRowRegion
jpayne@68 27 # class IteratorRowHead
jpayne@68 28 # class IteratorRowAll
jpayne@68 29 # class IteratorRowAllRefs
jpayne@68 30 # class IteratorRowSelection
jpayne@68 31 #
jpayne@68 32 ###############################################################################
jpayne@68 33 #
jpayne@68 34 # The MIT License
jpayne@68 35 #
jpayne@68 36 # Copyright (c) 2015 Andreas Heger
jpayne@68 37 #
jpayne@68 38 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@68 39 # copy of this software and associated documentation files (the "Software"),
jpayne@68 40 # to deal in the Software without restriction, including without limitation
jpayne@68 41 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@68 42 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@68 43 # Software is furnished to do so, subject to the following conditions:
jpayne@68 44 #
jpayne@68 45 # The above copyright notice and this permission notice shall be included in
jpayne@68 46 # all copies or substantial portions of the Software.
jpayne@68 47 #
jpayne@68 48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@68 49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@68 50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@68 51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@68 52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@68 53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@68 54 # DEALINGS IN THE SOFTWARE.
jpayne@68 55 #
jpayne@68 56 ###############################################################################
jpayne@68 57 import re
jpayne@68 58 import array
jpayne@68 59 import json
jpayne@68 60 import string
jpayne@68 61 import ctypes
jpayne@68 62 import struct
jpayne@68 63
jpayne@68 64 cimport cython
jpayne@68 65 from cpython cimport array as c_array
jpayne@68 66 from cpython cimport PyBytes_FromStringAndSize
jpayne@68 67 from libc.string cimport memset, strchr
jpayne@68 68 from cpython cimport array as c_array
jpayne@68 69 from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
jpayne@68 70 INT8_MAX, INT16_MAX, INT32_MAX, \
jpayne@68 71 UINT8_MAX, UINT16_MAX, UINT32_MAX
jpayne@68 72
jpayne@68 73 from pysam.libchtslib cimport HTS_IDX_NOCOOR
jpayne@68 74 from pysam.libcutils cimport force_bytes, force_str, \
jpayne@68 75 charptr_to_str, charptr_to_bytes
jpayne@68 76 from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \
jpayne@68 77 array_to_qualitystring
jpayne@68 78
jpayne@68 79 # Constants for binary tag conversion
jpayne@68 80 cdef char * htslib_types = 'cCsSiIf'
jpayne@68 81 cdef char * parray_types = 'bBhHiIf'
jpayne@68 82
jpayne@68 83 # translation tables
jpayne@68 84
jpayne@68 85 # cigar code to character and vice versa
jpayne@68 86 cdef char* CODE2CIGAR= "MIDNSHP=XB"
jpayne@68 87 cdef int NCIGAR_CODES = 10
jpayne@68 88
jpayne@68 89 CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
jpayne@68 90 CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
jpayne@68 91
jpayne@68 92 # names for keys in dictionary representation of an AlignedSegment
jpayne@68 93 KEY_NAMES = ["name", "flag", "ref_name", "ref_pos", "map_quality", "cigar",
jpayne@68 94 "next_ref_name", "next_ref_pos", "length", "seq", "qual", "tags"]
jpayne@68 95
jpayne@68 96 #####################################################################
jpayne@68 97 # C multiplication with wrapping around
jpayne@68 98 cdef inline uint32_t c_mul(uint32_t a, uint32_t b):
jpayne@68 99 return (a * b) & 0xffffffff
jpayne@68 100
jpayne@68 101
jpayne@68 102 cdef inline uint8_t tolower(uint8_t ch):
jpayne@68 103 if ch >= 65 and ch <= 90:
jpayne@68 104 return ch + 32
jpayne@68 105 else:
jpayne@68 106 return ch
jpayne@68 107
jpayne@68 108
jpayne@68 109 cdef inline uint8_t toupper(uint8_t ch):
jpayne@68 110 if ch >= 97 and ch <= 122:
jpayne@68 111 return ch - 32
jpayne@68 112 else:
jpayne@68 113 return ch
jpayne@68 114
jpayne@68 115
jpayne@68 116 cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b):
jpayne@68 117 if ch == b'=':
jpayne@68 118 if bam_is_rev(b):
jpayne@68 119 return b','
jpayne@68 120 else:
jpayne@68 121 return b'.'
jpayne@68 122 else:
jpayne@68 123 if bam_is_rev(b):
jpayne@68 124 return tolower(ch)
jpayne@68 125 else:
jpayne@68 126 return toupper(ch)
jpayne@68 127
jpayne@68 128
jpayne@68 129 cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold):
jpayne@68 130 cdef uint32_t c
jpayne@68 131 if p.qpos < p.b.core.l_qseq:
jpayne@68 132 c = bam_get_qual(p.b)[p.qpos]
jpayne@68 133 else:
jpayne@68 134 c = 0
jpayne@68 135 if c < threshold:
jpayne@68 136 return True
jpayne@68 137 return False
jpayne@68 138
jpayne@68 139
jpayne@68 140 cdef inline char map_typecode_htslib_to_python(uint8_t s):
jpayne@68 141 """map an htslib typecode to the corresponding python typecode
jpayne@68 142 to be used in the struct or array modules."""
jpayne@68 143
jpayne@68 144 # map type from htslib to python array
jpayne@68 145 cdef char * f = strchr(htslib_types, s)
jpayne@68 146
jpayne@68 147 if f == NULL:
jpayne@68 148 return 0
jpayne@68 149 return parray_types[f - htslib_types]
jpayne@68 150
jpayne@68 151
jpayne@68 152 cdef inline uint8_t map_typecode_python_to_htslib(char s):
jpayne@68 153 """determine value type from type code of array"""
jpayne@68 154 cdef char * f = strchr(parray_types, s)
jpayne@68 155 if f == NULL:
jpayne@68 156 return 0
jpayne@68 157 return htslib_types[f - parray_types]
jpayne@68 158
jpayne@68 159
jpayne@68 160 cdef inline void update_bin(bam1_t * src):
jpayne@68 161 if src.core.flag & BAM_FUNMAP:
jpayne@68 162 # treat alignment as length of 1 for unmapped reads
jpayne@68 163 src.core.bin = hts_reg2bin(
jpayne@68 164 src.core.pos,
jpayne@68 165 src.core.pos + 1,
jpayne@68 166 14,
jpayne@68 167 5)
jpayne@68 168 elif pysam_get_n_cigar(src):
jpayne@68 169 src.core.bin = hts_reg2bin(
jpayne@68 170 src.core.pos,
jpayne@68 171 bam_endpos(src),
jpayne@68 172 14,
jpayne@68 173 5)
jpayne@68 174 else:
jpayne@68 175 src.core.bin = hts_reg2bin(
jpayne@68 176 src.core.pos,
jpayne@68 177 src.core.pos + 1,
jpayne@68 178 14,
jpayne@68 179 5)
jpayne@68 180
jpayne@68 181
jpayne@68 182 # optional tag data manipulation
jpayne@68 183 cdef convert_binary_tag(uint8_t * tag):
jpayne@68 184 """return bytesize, number of values and array of values
jpayne@68 185 in aux_data memory location pointed to by tag."""
jpayne@68 186 cdef uint8_t auxtype
jpayne@68 187 cdef uint8_t byte_size
jpayne@68 188 cdef int32_t nvalues
jpayne@68 189 # get byte size
jpayne@68 190 auxtype = tag[0]
jpayne@68 191 byte_size = aux_type2size(auxtype)
jpayne@68 192 tag += 1
jpayne@68 193 # get number of values in array
jpayne@68 194 nvalues = (<int32_t*>tag)[0]
jpayne@68 195 tag += 4
jpayne@68 196
jpayne@68 197 # define python array
jpayne@68 198 cdef c_array.array c_values = array.array(
jpayne@68 199 chr(map_typecode_htslib_to_python(auxtype)))
jpayne@68 200 c_array.resize(c_values, nvalues)
jpayne@68 201
jpayne@68 202 # copy data
jpayne@68 203 memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size)
jpayne@68 204
jpayne@68 205 # no need to check for endian-ness as bam1_core_t fields
jpayne@68 206 # and aux_data are in host endian-ness. See sam.c and calls
jpayne@68 207 # to swap_data
jpayne@68 208 return byte_size, nvalues, c_values
jpayne@68 209
jpayne@68 210
jpayne@68 211 cdef inline uint8_t get_tag_typecode(value, value_type=None):
jpayne@68 212 """guess type code for a *value*. If *value_type* is None, the type
jpayne@68 213 code will be inferred based on the Python type of *value*
jpayne@68 214
jpayne@68 215 """
jpayne@68 216 # 0 is unknown typecode
jpayne@68 217 cdef char typecode = 0
jpayne@68 218
jpayne@68 219 if value_type is None:
jpayne@68 220 if isinstance(value, int):
jpayne@68 221 if value < 0:
jpayne@68 222 if value >= INT8_MIN:
jpayne@68 223 typecode = b'c'
jpayne@68 224 elif value >= INT16_MIN:
jpayne@68 225 typecode = b's'
jpayne@68 226 elif value >= INT32_MIN:
jpayne@68 227 typecode = b'i'
jpayne@68 228 # unsigned ints
jpayne@68 229 else:
jpayne@68 230 if value <= UINT8_MAX:
jpayne@68 231 typecode = b'C'
jpayne@68 232 elif value <= UINT16_MAX:
jpayne@68 233 typecode = b'S'
jpayne@68 234 elif value <= UINT32_MAX:
jpayne@68 235 typecode = b'I'
jpayne@68 236 elif isinstance(value, float):
jpayne@68 237 typecode = b'f'
jpayne@68 238 elif isinstance(value, str):
jpayne@68 239 typecode = b'Z'
jpayne@68 240 elif isinstance(value, bytes):
jpayne@68 241 typecode = b'Z'
jpayne@68 242 elif isinstance(value, array.array) or \
jpayne@68 243 isinstance(value, list) or \
jpayne@68 244 isinstance(value, tuple):
jpayne@68 245 typecode = b'B'
jpayne@68 246 else:
jpayne@68 247 if value_type in 'aAsSIcCZidfH':
jpayne@68 248 typecode = force_bytes(value_type)[0]
jpayne@68 249
jpayne@68 250 return typecode
jpayne@68 251
jpayne@68 252
jpayne@68 253 cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None):
jpayne@68 254 '''returns the value typecode of a value.
jpayne@68 255
jpayne@68 256 If max is specified, the appropriate type is returned for a range
jpayne@68 257 where value is the minimum.
jpayne@68 258
jpayne@68 259 Note that this method returns types from the extended BAM alphabet
jpayne@68 260 of types that includes tags that are not part of the SAM
jpayne@68 261 specification.
jpayne@68 262 '''
jpayne@68 263
jpayne@68 264
jpayne@68 265 cdef uint8_t typecode
jpayne@68 266
jpayne@68 267 t = type(value)
jpayne@68 268
jpayne@68 269 if t is float:
jpayne@68 270 typecode = b'f'
jpayne@68 271 elif t is int:
jpayne@68 272 if max_value is None:
jpayne@68 273 max_value = value
jpayne@68 274 if min_value is None:
jpayne@68 275 min_value = value
jpayne@68 276 # signed ints
jpayne@68 277 if min_value < 0:
jpayne@68 278 if min_value >= INT8_MIN and max_value <= INT8_MAX:
jpayne@68 279 typecode = b'c'
jpayne@68 280 elif min_value >= INT16_MIN and max_value <= INT16_MAX:
jpayne@68 281 typecode = b's'
jpayne@68 282 elif min_value >= INT32_MIN or max_value <= INT32_MAX:
jpayne@68 283 typecode = b'i'
jpayne@68 284 else:
jpayne@68 285 raise ValueError(
jpayne@68 286 "at least one signed integer out of range of "
jpayne@68 287 "BAM/SAM specification")
jpayne@68 288 # unsigned ints
jpayne@68 289 else:
jpayne@68 290 if max_value <= UINT8_MAX:
jpayne@68 291 typecode = b'C'
jpayne@68 292 elif max_value <= UINT16_MAX:
jpayne@68 293 typecode = b'S'
jpayne@68 294 elif max_value <= UINT32_MAX:
jpayne@68 295 typecode = b'I'
jpayne@68 296 else:
jpayne@68 297 raise ValueError(
jpayne@68 298 "at least one integer out of range of BAM/SAM specification")
jpayne@68 299 else:
jpayne@68 300 # Note: hex strings (H) are not supported yet
jpayne@68 301 if t is not bytes:
jpayne@68 302 value = value.encode('ascii')
jpayne@68 303 if len(value) == 1:
jpayne@68 304 typecode = b'A'
jpayne@68 305 else:
jpayne@68 306 typecode = b'Z'
jpayne@68 307
jpayne@68 308 return typecode
jpayne@68 309
jpayne@68 310
jpayne@68 311 # mapping python array.array and htslib typecodes to struct typecodes
jpayne@68 312 DATATYPE2FORMAT = {
jpayne@68 313 ord('c'): ('b', 1),
jpayne@68 314 ord('C'): ('B', 1),
jpayne@68 315 ord('s'): ('h', 2),
jpayne@68 316 ord('S'): ('H', 2),
jpayne@68 317 ord('i'): ('i', 4),
jpayne@68 318 ord('I'): ('I', 4),
jpayne@68 319 ord('f'): ('f', 4),
jpayne@68 320 ord('d'): ('d', 8),
jpayne@68 321 ord('A'): ('c', 1),
jpayne@68 322 ord('a'): ('c', 1)}
jpayne@68 323
jpayne@68 324
jpayne@68 325 cdef inline pack_tags(tags):
jpayne@68 326 """pack a list of tags. Each tag is a tuple of (tag, tuple).
jpayne@68 327
jpayne@68 328 Values are packed into the most space efficient data structure
jpayne@68 329 possible unless the tag contains a third field with the typecode.
jpayne@68 330
jpayne@68 331 Returns a format string and the associated list of arguments to be
jpayne@68 332 used in a call to struct.pack_into.
jpayne@68 333 """
jpayne@68 334 fmts, args = ["<"], []
jpayne@68 335
jpayne@68 336 # htslib typecode
jpayne@68 337 cdef uint8_t typecode
jpayne@68 338 for tag in tags:
jpayne@68 339
jpayne@68 340 if len(tag) == 2:
jpayne@68 341 pytag, value = tag
jpayne@68 342 valuetype = None
jpayne@68 343 elif len(tag) == 3:
jpayne@68 344 pytag, value, valuetype = tag
jpayne@68 345 else:
jpayne@68 346 raise ValueError("malformatted tag: %s" % str(tag))
jpayne@68 347
jpayne@68 348 if valuetype is None:
jpayne@68 349 typecode = 0
jpayne@68 350 else:
jpayne@68 351 # only first character in valuecode matters
jpayne@68 352 typecode = force_bytes(valuetype)[0]
jpayne@68 353
jpayne@68 354 pytag = force_bytes(pytag)
jpayne@68 355 pytype = type(value)
jpayne@68 356
jpayne@68 357 if pytype is tuple or pytype is list:
jpayne@68 358 # binary tags from tuples or lists
jpayne@68 359 if not typecode:
jpayne@68 360 # automatically determine value type - first value
jpayne@68 361 # determines type. If there is a mix of types, the
jpayne@68 362 # result is undefined.
jpayne@68 363 typecode = get_btag_typecode(min(value),
jpayne@68 364 min_value=min(value),
jpayne@68 365 max_value=max(value))
jpayne@68 366
jpayne@68 367 if typecode not in DATATYPE2FORMAT:
jpayne@68 368 raise ValueError("invalid value type '{}'".format(chr(typecode)))
jpayne@68 369
jpayne@68 370 datafmt = "2sBBI%i%s" % (len(value), DATATYPE2FORMAT[typecode][0])
jpayne@68 371 args.extend([pytag[:2],
jpayne@68 372 ord("B"),
jpayne@68 373 typecode,
jpayne@68 374 len(value)] + list(value))
jpayne@68 375
jpayne@68 376 elif isinstance(value, array.array):
jpayne@68 377 # binary tags from arrays
jpayne@68 378 if typecode == 0:
jpayne@68 379 typecode = map_typecode_python_to_htslib(ord(value.typecode))
jpayne@68 380
jpayne@68 381 if typecode == 0:
jpayne@68 382 raise ValueError("unsupported type code '{}'".format(value.typecode))
jpayne@68 383
jpayne@68 384 if typecode not in DATATYPE2FORMAT:
jpayne@68 385 raise ValueError("invalid value type '{}'".format(chr(typecode)))
jpayne@68 386
jpayne@68 387 # use array.tostring() to retrieve byte representation and
jpayne@68 388 # save as bytes
jpayne@68 389 datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1])
jpayne@68 390 args.extend([pytag[:2],
jpayne@68 391 ord("B"),
jpayne@68 392 typecode,
jpayne@68 393 len(value),
jpayne@68 394 value.tobytes()])
jpayne@68 395
jpayne@68 396 else:
jpayne@68 397 if typecode == 0:
jpayne@68 398 typecode = get_tag_typecode(value)
jpayne@68 399 if typecode == 0:
jpayne@68 400 raise ValueError("could not deduce typecode for value {}".format(value))
jpayne@68 401
jpayne@68 402 if typecode == b'a' or typecode == b'A' or typecode == b'Z' or typecode == b'H':
jpayne@68 403 value = force_bytes(value)
jpayne@68 404
jpayne@68 405 if typecode == b"a":
jpayne@68 406 typecode = b'A'
jpayne@68 407
jpayne@68 408 if typecode == b'Z' or typecode == b'H':
jpayne@68 409 datafmt = "2sB%is" % (len(value)+1)
jpayne@68 410 elif typecode in DATATYPE2FORMAT:
jpayne@68 411 datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0]
jpayne@68 412 else:
jpayne@68 413 raise ValueError("invalid value type '{}'".format(chr(typecode)))
jpayne@68 414
jpayne@68 415 args.extend([pytag[:2],
jpayne@68 416 typecode,
jpayne@68 417 value])
jpayne@68 418
jpayne@68 419 fmts.append(datafmt)
jpayne@68 420
jpayne@68 421 return "".join(fmts), args
jpayne@68 422
jpayne@68 423
jpayne@68 424 cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src):
jpayne@68 425 """return query length computed from CIGAR alignment.
jpayne@68 426
jpayne@68 427 Length ignores hard-clipped bases.
jpayne@68 428
jpayne@68 429 Return 0 if there is no CIGAR alignment.
jpayne@68 430 """
jpayne@68 431
jpayne@68 432 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@68 433
jpayne@68 434 if cigar_p == NULL:
jpayne@68 435 return 0
jpayne@68 436
jpayne@68 437 cdef uint32_t k, qpos
jpayne@68 438 cdef int op
jpayne@68 439 qpos = 0
jpayne@68 440
jpayne@68 441 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 442 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 443
jpayne@68 444 if op == BAM_CMATCH or \
jpayne@68 445 op == BAM_CINS or \
jpayne@68 446 op == BAM_CSOFT_CLIP or \
jpayne@68 447 op == BAM_CEQUAL or \
jpayne@68 448 op == BAM_CDIFF:
jpayne@68 449 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 450
jpayne@68 451 return qpos
jpayne@68 452
jpayne@68 453
jpayne@68 454 cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src):
jpayne@68 455 """return query length computed from CIGAR alignment.
jpayne@68 456
jpayne@68 457 Length includes hard-clipped bases.
jpayne@68 458
jpayne@68 459 Return 0 if there is no CIGAR alignment.
jpayne@68 460 """
jpayne@68 461
jpayne@68 462 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@68 463
jpayne@68 464 if cigar_p == NULL:
jpayne@68 465 return 0
jpayne@68 466
jpayne@68 467 cdef uint32_t k, qpos
jpayne@68 468 cdef int op
jpayne@68 469 qpos = 0
jpayne@68 470
jpayne@68 471 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 472 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 473
jpayne@68 474 if op == BAM_CMATCH or \
jpayne@68 475 op == BAM_CINS or \
jpayne@68 476 op == BAM_CSOFT_CLIP or \
jpayne@68 477 op == BAM_CHARD_CLIP or \
jpayne@68 478 op == BAM_CEQUAL or \
jpayne@68 479 op == BAM_CDIFF:
jpayne@68 480 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 481
jpayne@68 482 return qpos
jpayne@68 483
jpayne@68 484
jpayne@68 485 cdef inline int32_t getQueryStart(bam1_t *src) except -1:
jpayne@68 486 cdef uint32_t * cigar_p
jpayne@68 487 cdef uint32_t start_offset = 0
jpayne@68 488 cdef uint32_t k, op
jpayne@68 489
jpayne@68 490 cigar_p = pysam_bam_get_cigar(src);
jpayne@68 491 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 492 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 493 if op == BAM_CHARD_CLIP:
jpayne@68 494 if start_offset != 0 and start_offset != src.core.l_qseq:
jpayne@68 495 raise ValueError('Invalid clipping in CIGAR string')
jpayne@68 496 elif op == BAM_CSOFT_CLIP:
jpayne@68 497 start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 498 else:
jpayne@68 499 break
jpayne@68 500
jpayne@68 501 return start_offset
jpayne@68 502
jpayne@68 503
jpayne@68 504 cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
jpayne@68 505 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@68 506 cdef uint32_t end_offset = src.core.l_qseq
jpayne@68 507 cdef uint32_t k, op
jpayne@68 508
jpayne@68 509 # if there is no sequence, compute length from cigar string
jpayne@68 510 if end_offset == 0:
jpayne@68 511 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 512 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 513 if op == BAM_CMATCH or \
jpayne@68 514 op == BAM_CINS or \
jpayne@68 515 op == BAM_CEQUAL or \
jpayne@68 516 op == BAM_CDIFF or \
jpayne@68 517 (op == BAM_CSOFT_CLIP and end_offset == 0):
jpayne@68 518 end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 519 else:
jpayne@68 520 # walk backwards in cigar string
jpayne@68 521 for k from pysam_get_n_cigar(src) > k >= 1:
jpayne@68 522 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 523 if op == BAM_CHARD_CLIP:
jpayne@68 524 if end_offset != src.core.l_qseq:
jpayne@68 525 raise ValueError('Invalid clipping in CIGAR string')
jpayne@68 526 elif op == BAM_CSOFT_CLIP:
jpayne@68 527 end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 528 else:
jpayne@68 529 break
jpayne@68 530
jpayne@68 531 return end_offset
jpayne@68 532
jpayne@68 533
jpayne@68 534 cdef inline bytes getSequenceInRange(bam1_t *src,
jpayne@68 535 uint32_t start,
jpayne@68 536 uint32_t end):
jpayne@68 537 """return python string of the sequence in a bam1_t object.
jpayne@68 538 """
jpayne@68 539
jpayne@68 540 cdef uint8_t * p
jpayne@68 541 cdef uint32_t k
jpayne@68 542 cdef char * s
jpayne@68 543
jpayne@68 544 if not src.core.l_qseq:
jpayne@68 545 return None
jpayne@68 546
jpayne@68 547 seq = PyBytes_FromStringAndSize(NULL, end - start)
jpayne@68 548 s = <char*>seq
jpayne@68 549 p = pysam_bam_get_seq(src)
jpayne@68 550
jpayne@68 551 for k from start <= k < end:
jpayne@68 552 # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
jpayne@68 553 # note: do not use string literal as it will be a python string
jpayne@68 554 s[k-start] = seq_nt16_str[p[k//2] >> 4 * (1 - k%2) & 0xf]
jpayne@68 555
jpayne@68 556 return charptr_to_bytes(seq)
jpayne@68 557
jpayne@68 558
jpayne@68 559 cdef inline object getQualitiesInRange(bam1_t *src,
jpayne@68 560 uint32_t start,
jpayne@68 561 uint32_t end):
jpayne@68 562 """return python array of quality values from a bam1_t object"""
jpayne@68 563
jpayne@68 564 cdef uint8_t * p
jpayne@68 565 cdef uint32_t k
jpayne@68 566
jpayne@68 567 p = pysam_bam_get_qual(src)
jpayne@68 568 if p[0] == 0xff:
jpayne@68 569 return None
jpayne@68 570
jpayne@68 571 # 'B': unsigned char
jpayne@68 572 cdef c_array.array result = array.array('B', [0])
jpayne@68 573 c_array.resize(result, end - start)
jpayne@68 574
jpayne@68 575 # copy data
jpayne@68 576 memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
jpayne@68 577
jpayne@68 578 return result
jpayne@68 579
jpayne@68 580
jpayne@68 581 #####################################################################
jpayne@68 582 ## factory methods for instantiating extension classes
jpayne@68 583 cdef class AlignedSegment
jpayne@68 584 cdef AlignedSegment makeAlignedSegment(bam1_t *src,
jpayne@68 585 AlignmentHeader header):
jpayne@68 586 '''return an AlignedSegment object constructed from `src`'''
jpayne@68 587 # note that the following does not call __init__
jpayne@68 588 cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
jpayne@68 589 dest._delegate = bam_dup1(src)
jpayne@68 590 dest.header = header
jpayne@68 591 return dest
jpayne@68 592
jpayne@68 593
jpayne@68 594 cdef class PileupColumn
jpayne@68 595 cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp,
jpayne@68 596 int tid,
jpayne@68 597 int pos,
jpayne@68 598 int n_pu,
jpayne@68 599 uint32_t min_base_quality,
jpayne@68 600 char * reference_sequence,
jpayne@68 601 AlignmentHeader header):
jpayne@68 602 '''return a PileupColumn object constructed from pileup in `plp` and
jpayne@68 603 setting additional attributes.
jpayne@68 604
jpayne@68 605 '''
jpayne@68 606 # note that the following does not call __init__
jpayne@68 607 cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
jpayne@68 608 dest.header = header
jpayne@68 609 dest.plp = plp
jpayne@68 610 dest.tid = tid
jpayne@68 611 dest.pos = pos
jpayne@68 612 dest.n_pu = n_pu
jpayne@68 613 dest.min_base_quality = min_base_quality
jpayne@68 614 dest.reference_sequence = reference_sequence
jpayne@68 615 dest.buf.l = dest.buf.m = 0
jpayne@68 616 dest.buf.s = NULL
jpayne@68 617
jpayne@68 618 return dest
jpayne@68 619
jpayne@68 620
jpayne@68 621 cdef class PileupRead
jpayne@68 622 cdef PileupRead makePileupRead(const bam_pileup1_t *src,
jpayne@68 623 AlignmentHeader header):
jpayne@68 624 '''return a PileupRead object construted from a bam_pileup1_t * object.'''
jpayne@68 625 # note that the following does not call __init__
jpayne@68 626 cdef PileupRead dest = PileupRead.__new__(PileupRead)
jpayne@68 627 dest._alignment = makeAlignedSegment(src.b, header)
jpayne@68 628 dest._qpos = src.qpos
jpayne@68 629 dest._indel = src.indel
jpayne@68 630 dest._level = src.level
jpayne@68 631 dest._is_del = src.is_del
jpayne@68 632 dest._is_head = src.is_head
jpayne@68 633 dest._is_tail = src.is_tail
jpayne@68 634 dest._is_refskip = src.is_refskip
jpayne@68 635 return dest
jpayne@68 636
jpayne@68 637
jpayne@68 638 cdef inline uint32_t get_alignment_length(bam1_t *src):
jpayne@68 639 cdef uint32_t k = 0
jpayne@68 640 cdef uint32_t l = 0
jpayne@68 641 if src == NULL:
jpayne@68 642 return 0
jpayne@68 643 cdef uint32_t * cigar_p = bam_get_cigar(src)
jpayne@68 644 if cigar_p == NULL:
jpayne@68 645 return 0
jpayne@68 646 cdef int op
jpayne@68 647 cdef uint32_t n = pysam_get_n_cigar(src)
jpayne@68 648 for k from 0 <= k < n:
jpayne@68 649 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 650 if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
jpayne@68 651 continue
jpayne@68 652 l += cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 653 return l
jpayne@68 654
jpayne@68 655
jpayne@68 656 cdef inline uint32_t get_md_reference_length(char * md_tag):
jpayne@68 657 cdef int l = 0
jpayne@68 658 cdef int md_idx = 0
jpayne@68 659 cdef int nmatches = 0
jpayne@68 660
jpayne@68 661 while md_tag[md_idx] != 0:
jpayne@68 662 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
jpayne@68 663 nmatches *= 10
jpayne@68 664 nmatches += md_tag[md_idx] - 48
jpayne@68 665 md_idx += 1
jpayne@68 666 continue
jpayne@68 667 else:
jpayne@68 668 l += nmatches
jpayne@68 669 nmatches = 0
jpayne@68 670 if md_tag[md_idx] == b'^':
jpayne@68 671 md_idx += 1
jpayne@68 672 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
jpayne@68 673 md_idx += 1
jpayne@68 674 l += 1
jpayne@68 675 else:
jpayne@68 676 md_idx += 1
jpayne@68 677 l += 1
jpayne@68 678
jpayne@68 679 l += nmatches
jpayne@68 680 return l
jpayne@68 681
jpayne@68 682 # TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ...
jpayne@68 683 cdef inline bytes build_alignment_sequence(bam1_t * src):
jpayne@68 684 """return expanded sequence from MD tag.
jpayne@68 685
jpayne@68 686 The sequence includes substitutions and both insertions in the
jpayne@68 687 reference as well as deletions to the reference sequence. Combine
jpayne@68 688 with the cigar string to reconstitute the query or the reference
jpayne@68 689 sequence.
jpayne@68 690
jpayne@68 691 Positions corresponding to `N` (skipped region from the reference)
jpayne@68 692 or `P` (padding (silent deletion from padded reference)) in the CIGAR
jpayne@68 693 string will not appear in the returned sequence. The MD should
jpayne@68 694 correspondingly not contain these. Thus proper tags are::
jpayne@68 695
jpayne@68 696 Deletion from the reference: cigar=5M1D5M MD=5^C5
jpayne@68 697 Skipped region from reference: cigar=5M1N5M MD=10
jpayne@68 698 Padded region in the reference: cigar=5M1P5M MD=10
jpayne@68 699
jpayne@68 700 Returns
jpayne@68 701 -------
jpayne@68 702
jpayne@68 703 None, if no MD tag is present.
jpayne@68 704
jpayne@68 705 """
jpayne@68 706 if src == NULL:
jpayne@68 707 return None
jpayne@68 708
jpayne@68 709 cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
jpayne@68 710 if md_tag_ptr == NULL:
jpayne@68 711 return None
jpayne@68 712
jpayne@68 713 cdef uint32_t start = getQueryStart(src)
jpayne@68 714 cdef uint32_t end = getQueryEnd(src)
jpayne@68 715 # get read sequence, taking into account soft-clipping
jpayne@68 716 r = getSequenceInRange(src, start, end)
jpayne@68 717 cdef char * read_sequence = r
jpayne@68 718 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@68 719 if cigar_p == NULL:
jpayne@68 720 return None
jpayne@68 721
jpayne@68 722 cdef uint32_t r_idx = 0
jpayne@68 723 cdef int op
jpayne@68 724 cdef uint32_t k, i, l, x
jpayne@68 725 cdef int nmatches = 0
jpayne@68 726 cdef int s_idx = 0
jpayne@68 727
jpayne@68 728 cdef uint32_t max_len = get_alignment_length(src)
jpayne@68 729 if max_len == 0:
jpayne@68 730 raise ValueError("could not determine alignment length")
jpayne@68 731
jpayne@68 732 cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
jpayne@68 733 if s == NULL:
jpayne@68 734 raise ValueError(
jpayne@68 735 "could not allocate sequence of length %i" % max_len)
jpayne@68 736
jpayne@68 737 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 738 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 739 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 740 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@68 741 for i from 0 <= i < l:
jpayne@68 742 s[s_idx] = read_sequence[r_idx]
jpayne@68 743 r_idx += 1
jpayne@68 744 s_idx += 1
jpayne@68 745 elif op == BAM_CDEL:
jpayne@68 746 for i from 0 <= i < l:
jpayne@68 747 s[s_idx] = b'-'
jpayne@68 748 s_idx += 1
jpayne@68 749 elif op == BAM_CREF_SKIP:
jpayne@68 750 pass
jpayne@68 751 elif op == BAM_CINS or op == BAM_CPAD:
jpayne@68 752 for i from 0 <= i < l:
jpayne@68 753 # encode insertions into reference as lowercase
jpayne@68 754 s[s_idx] = read_sequence[r_idx] + 32
jpayne@68 755 r_idx += 1
jpayne@68 756 s_idx += 1
jpayne@68 757 elif op == BAM_CSOFT_CLIP:
jpayne@68 758 pass
jpayne@68 759 elif op == BAM_CHARD_CLIP:
jpayne@68 760 pass # advances neither
jpayne@68 761
jpayne@68 762 cdef char md_buffer[2]
jpayne@68 763 cdef char *md_tag
jpayne@68 764 cdef uint8_t md_typecode = md_tag_ptr[0]
jpayne@68 765 if md_typecode == b'Z':
jpayne@68 766 md_tag = bam_aux2Z(md_tag_ptr)
jpayne@68 767 elif md_typecode == b'A':
jpayne@68 768 # Work around HTSeq bug that writes 1-character strings as MD:A:v
jpayne@68 769 md_buffer[0] = bam_aux2A(md_tag_ptr)
jpayne@68 770 md_buffer[1] = b'\0'
jpayne@68 771 md_tag = md_buffer
jpayne@68 772 else:
jpayne@68 773 raise TypeError('Tagged field MD:{}:<value> does not have expected type MD:Z'.format(chr(md_typecode)))
jpayne@68 774
jpayne@68 775 cdef int md_idx = 0
jpayne@68 776 cdef char c
jpayne@68 777 s_idx = 0
jpayne@68 778
jpayne@68 779 # Check if MD tag is valid by matching CIGAR length to MD tag defined length
jpayne@68 780 # Insertions would be in addition to what is described by MD, so we calculate
jpayne@68 781 # the number of insertions separately.
jpayne@68 782 cdef int insertions = 0
jpayne@68 783
jpayne@68 784 while s[s_idx] != 0:
jpayne@68 785 if s[s_idx] >= b'a':
jpayne@68 786 insertions += 1
jpayne@68 787 s_idx += 1
jpayne@68 788 s_idx = 0
jpayne@68 789
jpayne@68 790 cdef uint32_t md_len = get_md_reference_length(md_tag)
jpayne@68 791 if md_len + insertions > max_len:
jpayne@68 792 free(s)
jpayne@68 793 raise AssertionError(
jpayne@68 794 "Invalid MD tag: MD length {} mismatch with CIGAR length {} and {} insertions".format(
jpayne@68 795 md_len, max_len, insertions))
jpayne@68 796
jpayne@68 797 while md_tag[md_idx] != 0:
jpayne@68 798 # c is numerical
jpayne@68 799 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
jpayne@68 800 nmatches *= 10
jpayne@68 801 nmatches += md_tag[md_idx] - 48
jpayne@68 802 md_idx += 1
jpayne@68 803 continue
jpayne@68 804 else:
jpayne@68 805 # save matches up to this point, skipping insertions
jpayne@68 806 for x from 0 <= x < nmatches:
jpayne@68 807 while s[s_idx] >= b'a':
jpayne@68 808 s_idx += 1
jpayne@68 809 s_idx += 1
jpayne@68 810 while s[s_idx] >= b'a':
jpayne@68 811 s_idx += 1
jpayne@68 812
jpayne@68 813 r_idx += nmatches
jpayne@68 814 nmatches = 0
jpayne@68 815 if md_tag[md_idx] == b'^':
jpayne@68 816 md_idx += 1
jpayne@68 817 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
jpayne@68 818 # assert s[s_idx] == '-'
jpayne@68 819 s[s_idx] = md_tag[md_idx]
jpayne@68 820 s_idx += 1
jpayne@68 821 md_idx += 1
jpayne@68 822 else:
jpayne@68 823 # save mismatch
jpayne@68 824 # enforce lower case
jpayne@68 825 c = md_tag[md_idx]
jpayne@68 826 if c <= 90:
jpayne@68 827 c += 32
jpayne@68 828 s[s_idx] = c
jpayne@68 829 s_idx += 1
jpayne@68 830 r_idx += 1
jpayne@68 831 md_idx += 1
jpayne@68 832
jpayne@68 833 # save matches up to this point, skipping insertions
jpayne@68 834 for x from 0 <= x < nmatches:
jpayne@68 835 while s[s_idx] >= b'a':
jpayne@68 836 s_idx += 1
jpayne@68 837 s_idx += 1
jpayne@68 838 while s[s_idx] >= b'a':
jpayne@68 839 s_idx += 1
jpayne@68 840
jpayne@68 841 seq = PyBytes_FromStringAndSize(s, s_idx)
jpayne@68 842 free(s)
jpayne@68 843
jpayne@68 844 return seq
jpayne@68 845
jpayne@68 846
jpayne@68 847 cdef inline bytes build_reference_sequence(bam1_t * src):
jpayne@68 848 """return the reference sequence in the region that is covered by the
jpayne@68 849 alignment of the read to the reference.
jpayne@68 850
jpayne@68 851 This method requires the MD tag to be set.
jpayne@68 852
jpayne@68 853 """
jpayne@68 854 cdef uint32_t k, i, l
jpayne@68 855 cdef int op
jpayne@68 856 cdef int s_idx = 0
jpayne@68 857 ref_seq = build_alignment_sequence(src)
jpayne@68 858 if ref_seq is None:
jpayne@68 859 raise ValueError("MD tag not present")
jpayne@68 860
jpayne@68 861 cdef char * s = <char*>calloc(len(ref_seq) + 1, sizeof(char))
jpayne@68 862 if s == NULL:
jpayne@68 863 raise ValueError(
jpayne@68 864 "could not allocate sequence of length %i" % len(ref_seq))
jpayne@68 865
jpayne@68 866 cdef char * cref_seq = ref_seq
jpayne@68 867 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@68 868 cdef uint32_t r_idx = 0
jpayne@68 869 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 870 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 871 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 872 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@68 873 for i from 0 <= i < l:
jpayne@68 874 s[s_idx] = cref_seq[r_idx]
jpayne@68 875 r_idx += 1
jpayne@68 876 s_idx += 1
jpayne@68 877 elif op == BAM_CDEL:
jpayne@68 878 for i from 0 <= i < l:
jpayne@68 879 s[s_idx] = cref_seq[r_idx]
jpayne@68 880 r_idx += 1
jpayne@68 881 s_idx += 1
jpayne@68 882 elif op == BAM_CREF_SKIP:
jpayne@68 883 pass
jpayne@68 884 elif op == BAM_CINS or op == BAM_CPAD:
jpayne@68 885 r_idx += l
jpayne@68 886 elif op == BAM_CSOFT_CLIP:
jpayne@68 887 pass
jpayne@68 888 elif op == BAM_CHARD_CLIP:
jpayne@68 889 pass # advances neither
jpayne@68 890
jpayne@68 891 seq = PyBytes_FromStringAndSize(s, s_idx)
jpayne@68 892 free(s)
jpayne@68 893
jpayne@68 894 return seq
jpayne@68 895
jpayne@68 896
jpayne@68 897 cdef inline str safe_reference_name(AlignmentHeader header, int tid):
jpayne@68 898 if tid == -1: return "*"
jpayne@68 899 elif header is not None: return header.get_reference_name(tid)
jpayne@68 900 else: return f"#{tid}"
jpayne@68 901
jpayne@68 902
jpayne@68 903 # Tuple-building helper functions used by AlignedSegment.get_aligned_pairs()
jpayne@68 904
jpayne@68 905 cdef _alignedpairs_positions(qpos, pos, ref_seq, uint32_t r_idx, int op):
jpayne@68 906 return (qpos, pos)
jpayne@68 907
jpayne@68 908
jpayne@68 909 cdef _alignedpairs_with_seq(qpos, pos, ref_seq, uint32_t r_idx, int op):
jpayne@68 910 ref_base = ref_seq[r_idx] if ref_seq is not None else None
jpayne@68 911 return (qpos, pos, ref_base)
jpayne@68 912
jpayne@68 913
jpayne@68 914 cdef _alignedpairs_with_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
jpayne@68 915 return (qpos, pos, CIGAR_OPS(op))
jpayne@68 916
jpayne@68 917
jpayne@68 918 cdef _alignedpairs_with_seq_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
jpayne@68 919 ref_base = ref_seq[r_idx] if ref_seq is not None else None
jpayne@68 920 return (qpos, pos, ref_base, CIGAR_OPS(op))
jpayne@68 921
jpayne@68 922
jpayne@68 923 cdef class AlignedSegment:
jpayne@68 924 '''Class representing an aligned segment.
jpayne@68 925
jpayne@68 926 This class stores a handle to the samtools C-structure representing
jpayne@68 927 an aligned read. Member read access is forwarded to the C-structure
jpayne@68 928 and converted into python objects. This implementation should be fast,
jpayne@68 929 as only the data needed is converted.
jpayne@68 930
jpayne@68 931 For write access, the C-structure is updated in-place. This is
jpayne@68 932 not the most efficient way to build BAM entries, as the variable
jpayne@68 933 length data is concatenated and thus needs to be resized if
jpayne@68 934 a field is updated. Furthermore, the BAM entry might be
jpayne@68 935 in an inconsistent state.
jpayne@68 936
jpayne@68 937 One issue to look out for is that the sequence should always
jpayne@68 938 be set *before* the quality scores. Setting the sequence will
jpayne@68 939 also erase any quality scores that were set previously.
jpayne@68 940
jpayne@68 941 Parameters
jpayne@68 942 ----------
jpayne@68 943
jpayne@68 944 header:
jpayne@68 945 :class:`~pysam.AlignmentHeader` object to map numerical
jpayne@68 946 identifiers to chromosome names. If not given, an empty
jpayne@68 947 header is created.
jpayne@68 948 '''
jpayne@68 949
jpayne@68 950 # Now only called when instances are created from Python
jpayne@68 951 def __init__(self, AlignmentHeader header=None):
jpayne@68 952 # see bam_init1
jpayne@68 953 self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
jpayne@68 954 if self._delegate == NULL:
jpayne@68 955 raise MemoryError("could not allocated memory of {} bytes".format(sizeof(bam1_t)))
jpayne@68 956 # allocate some memory. If size is 0, calloc does not return a
jpayne@68 957 # pointer that can be passed to free() so allocate 40 bytes
jpayne@68 958 # for a new read
jpayne@68 959 self._delegate.m_data = 40
jpayne@68 960 self._delegate.data = <uint8_t *>calloc(
jpayne@68 961 self._delegate.m_data, 1)
jpayne@68 962 if self._delegate.data == NULL:
jpayne@68 963 raise MemoryError("could not allocate memory of {} bytes".format(self._delegate.m_data))
jpayne@68 964 self._delegate.l_data = 0
jpayne@68 965 # set some data to make read approximately legit.
jpayne@68 966 # Note, SAM writing fails with q_name of length 0
jpayne@68 967 self._delegate.core.l_qname = 0
jpayne@68 968 self._delegate.core.tid = -1
jpayne@68 969 self._delegate.core.pos = -1
jpayne@68 970 self._delegate.core.mtid = -1
jpayne@68 971 self._delegate.core.mpos = -1
jpayne@68 972
jpayne@68 973 # caching for selected fields
jpayne@68 974 self.cache_query_qualities = None
jpayne@68 975 self.cache_query_alignment_qualities = None
jpayne@68 976 self.cache_query_sequence = None
jpayne@68 977 self.cache_query_alignment_sequence = None
jpayne@68 978
jpayne@68 979 self.header = header
jpayne@68 980
jpayne@68 981 def __dealloc__(self):
jpayne@68 982 bam_destroy1(self._delegate)
jpayne@68 983
jpayne@68 984 def __str__(self):
jpayne@68 985 """return string representation of alignment.
jpayne@68 986
jpayne@68 987 The representation is an approximate :term:`SAM` format, because
jpayne@68 988 an aligned read might not be associated with a :term:`AlignmentFile`.
jpayne@68 989 Hence when the read does not have an associated :class:`AlignedHeader`,
jpayne@68 990 :term:`tid` is shown instead of the reference name.
jpayne@68 991 Similarly, the tags field is returned in its parsed state.
jpayne@68 992
jpayne@68 993 To get a valid SAM record, use :meth:`to_string`.
jpayne@68 994 """
jpayne@68 995 # sam-parsing is done in sam.c/bam_format1_core which
jpayne@68 996 # requires a valid header.
jpayne@68 997 return "\t".join(map(str, (self.query_name,
jpayne@68 998 self.flag,
jpayne@68 999 safe_reference_name(self.header, self.reference_id),
jpayne@68 1000 self.reference_start + 1,
jpayne@68 1001 self.mapping_quality,
jpayne@68 1002 self.cigarstring,
jpayne@68 1003 safe_reference_name(self.header, self.next_reference_id),
jpayne@68 1004 self.next_reference_start + 1,
jpayne@68 1005 self.template_length,
jpayne@68 1006 self.query_sequence,
jpayne@68 1007 self.query_qualities,
jpayne@68 1008 self.tags)))
jpayne@68 1009
jpayne@68 1010 def __repr__(self):
jpayne@68 1011 ref = self.reference_name if self.header is not None else self.reference_id
jpayne@68 1012 return f'<{type(self).__name__}({self.query_name!r}, flags={self.flag}={self.flag:#x}, ref={ref!r}, zpos={self.reference_start}, mapq={self.mapping_quality}, cigar={self.cigarstring!r}, ...)>'
jpayne@68 1013
jpayne@68 1014 def __copy__(self):
jpayne@68 1015 return makeAlignedSegment(self._delegate, self.header)
jpayne@68 1016
jpayne@68 1017 def __deepcopy__(self, memo):
jpayne@68 1018 return makeAlignedSegment(self._delegate, self.header)
jpayne@68 1019
jpayne@68 1020 def compare(self, AlignedSegment other):
jpayne@68 1021 '''return -1,0,1, if contents in this are binary
jpayne@68 1022 <,=,> to *other*
jpayne@68 1023 '''
jpayne@68 1024
jpayne@68 1025 # avoid segfault when other equals None
jpayne@68 1026 if other is None:
jpayne@68 1027 return -1
jpayne@68 1028
jpayne@68 1029 cdef int retval, x
jpayne@68 1030 cdef bam1_t *t
jpayne@68 1031 cdef bam1_t *o
jpayne@68 1032
jpayne@68 1033 t = self._delegate
jpayne@68 1034 o = other._delegate
jpayne@68 1035
jpayne@68 1036 # uncomment for debugging purposes
jpayne@68 1037 # cdef unsigned char * oo, * tt
jpayne@68 1038 # tt = <unsigned char*>(&t.core)
jpayne@68 1039 # oo = <unsigned char*>(&o.core)
jpayne@68 1040 # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
jpayne@68 1041 # tt = <unsigned char*>(t.data)
jpayne@68 1042 # oo = <unsigned char*>(o.data)
jpayne@68 1043 # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
jpayne@68 1044
jpayne@68 1045 # Fast-path test for object identity
jpayne@68 1046 if t == o:
jpayne@68 1047 return 0
jpayne@68 1048
jpayne@68 1049 cdef uint8_t *a = <uint8_t*>&t.core
jpayne@68 1050 cdef uint8_t *b = <uint8_t*>&o.core
jpayne@68 1051
jpayne@68 1052 retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
jpayne@68 1053 if retval:
jpayne@68 1054 return retval
jpayne@68 1055
jpayne@68 1056 # cmp(t.l_data, o.l_data)
jpayne@68 1057 retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
jpayne@68 1058 if retval:
jpayne@68 1059 return retval
jpayne@68 1060 return memcmp(t.data, o.data, t.l_data)
jpayne@68 1061
jpayne@68 1062 def __richcmp__(self, AlignedSegment other, int op):
jpayne@68 1063 if op == 2: # == operator
jpayne@68 1064 return self.compare(other) == 0
jpayne@68 1065 elif op == 3: # != operator
jpayne@68 1066 return self.compare(other) != 0
jpayne@68 1067 else:
jpayne@68 1068 return NotImplemented
jpayne@68 1069
jpayne@68 1070 def __hash__(self):
jpayne@68 1071 cdef bam1_t * src = self._delegate
jpayne@68 1072 cdef int x
jpayne@68 1073
jpayne@68 1074 # see http://effbot.org/zone/python-hash.htm
jpayne@68 1075 cdef uint8_t * c = <uint8_t *>&src.core
jpayne@68 1076 cdef uint32_t hash_value = c[0]
jpayne@68 1077 for x from 1 <= x < sizeof(bam1_core_t):
jpayne@68 1078 hash_value = c_mul(hash_value, 1000003) ^ c[x]
jpayne@68 1079 c = <uint8_t *>src.data
jpayne@68 1080 for x from 0 <= x < src.l_data:
jpayne@68 1081 hash_value = c_mul(hash_value, 1000003) ^ c[x]
jpayne@68 1082
jpayne@68 1083 return hash_value
jpayne@68 1084
jpayne@68 1085 cpdef to_string(self):
jpayne@68 1086 """returns a string representation of the aligned segment.
jpayne@68 1087
jpayne@68 1088 The output format is valid SAM format if a header is associated
jpayne@68 1089 with the AlignedSegment.
jpayne@68 1090 """
jpayne@68 1091 cdef kstring_t line
jpayne@68 1092 line.l = line.m = 0
jpayne@68 1093 line.s = NULL
jpayne@68 1094
jpayne@68 1095 if self.header:
jpayne@68 1096 if sam_format1(self.header.ptr, self._delegate, &line) < 0:
jpayne@68 1097 if line.m:
jpayne@68 1098 free(line.s)
jpayne@68 1099 raise ValueError('sam_format failed')
jpayne@68 1100 else:
jpayne@68 1101 raise NotImplementedError("todo")
jpayne@68 1102
jpayne@68 1103 ret = force_str(line.s[:line.l])
jpayne@68 1104
jpayne@68 1105 if line.m:
jpayne@68 1106 free(line.s)
jpayne@68 1107
jpayne@68 1108 return ret
jpayne@68 1109
jpayne@68 1110 @classmethod
jpayne@68 1111 def fromstring(cls, sam, AlignmentHeader header):
jpayne@68 1112 """parses a string representation of the aligned segment.
jpayne@68 1113
jpayne@68 1114 The input format should be valid SAM format.
jpayne@68 1115
jpayne@68 1116 Parameters
jpayne@68 1117 ----------
jpayne@68 1118 sam:
jpayne@68 1119 :term:`SAM` formatted string
jpayne@68 1120
jpayne@68 1121 """
jpayne@68 1122 cdef AlignedSegment dest = cls.__new__(cls)
jpayne@68 1123 dest._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
jpayne@68 1124 dest.header = header
jpayne@68 1125
jpayne@68 1126 cdef kstring_t line
jpayne@68 1127 line.l = line.m = len(sam)
jpayne@68 1128 _sam = force_bytes(sam)
jpayne@68 1129 line.s = _sam
jpayne@68 1130
jpayne@68 1131 cdef int ret
jpayne@68 1132 ret = sam_parse1(&line, dest.header.ptr, dest._delegate)
jpayne@68 1133 if ret < 0:
jpayne@68 1134 raise ValueError("parsing SAM record string failed (error code {})".format(ret))
jpayne@68 1135
jpayne@68 1136 return dest
jpayne@68 1137
jpayne@68 1138 cpdef tostring(self, htsfile=None):
jpayne@68 1139 """deprecated, use :meth:`to_string()` instead.
jpayne@68 1140
jpayne@68 1141 Parameters
jpayne@68 1142 ----------
jpayne@68 1143
jpayne@68 1144 htsfile:
jpayne@68 1145 (deprecated) AlignmentFile object to map numerical
jpayne@68 1146 identifiers to chromosome names. This parameter is present
jpayne@68 1147 for backwards compatibility and ignored.
jpayne@68 1148 """
jpayne@68 1149
jpayne@68 1150 return self.to_string()
jpayne@68 1151
jpayne@68 1152 def to_dict(self):
jpayne@68 1153 """returns a json representation of the aligned segment.
jpayne@68 1154
jpayne@68 1155 Field names are abbreviated versions of the class attributes.
jpayne@68 1156 """
jpayne@68 1157 # let htslib do the string conversions, but treat optional field properly as list
jpayne@68 1158 vals = self.to_string().split("\t")
jpayne@68 1159 n = len(KEY_NAMES) - 1
jpayne@68 1160 return dict(list(zip(KEY_NAMES[:-1], vals[:n])) + [(KEY_NAMES[-1], vals[n:])])
jpayne@68 1161
jpayne@68 1162 @classmethod
jpayne@68 1163 def from_dict(cls, sam_dict, AlignmentHeader header):
jpayne@68 1164 """parses a dictionary representation of the aligned segment.
jpayne@68 1165
jpayne@68 1166 Parameters
jpayne@68 1167 ----------
jpayne@68 1168 sam_dict:
jpayne@68 1169 dictionary of alignment values, keys corresponding to output from
jpayne@68 1170 :meth:`todict()`.
jpayne@68 1171
jpayne@68 1172 """
jpayne@68 1173 # let htslib do the parsing
jpayne@68 1174 # the tags field can be missing
jpayne@68 1175 return cls.fromstring(
jpayne@68 1176 "\t".join((sam_dict[x] for x in KEY_NAMES[:-1])) +
jpayne@68 1177 "\t" +
jpayne@68 1178 "\t".join(sam_dict.get(KEY_NAMES[-1], [])), header)
jpayne@68 1179
jpayne@68 1180 ########################################################
jpayne@68 1181 ## Basic attributes in order of appearance in SAM format
jpayne@68 1182 property query_name:
jpayne@68 1183 """the query template name (None if not present)"""
jpayne@68 1184 def __get__(self):
jpayne@68 1185
jpayne@68 1186 cdef bam1_t * src = self._delegate
jpayne@68 1187 if src.core.l_qname == 0:
jpayne@68 1188 return None
jpayne@68 1189
jpayne@68 1190 return charptr_to_str(<char *>pysam_bam_get_qname(src))
jpayne@68 1191
jpayne@68 1192 def __set__(self, qname):
jpayne@68 1193
jpayne@68 1194 if qname is None or len(qname) == 0:
jpayne@68 1195 return
jpayne@68 1196
jpayne@68 1197 if len(qname) > 254:
jpayne@68 1198 raise ValueError("query length out of range {} > 254".format(
jpayne@68 1199 len(qname)))
jpayne@68 1200
jpayne@68 1201 qname = force_bytes(qname)
jpayne@68 1202 cdef bam1_t * src = self._delegate
jpayne@68 1203 # the qname is \0 terminated
jpayne@68 1204 cdef uint8_t l = len(qname) + 1
jpayne@68 1205
jpayne@68 1206 cdef char * p = pysam_bam_get_qname(src)
jpayne@68 1207 cdef uint8_t l_extranul = 0
jpayne@68 1208
jpayne@68 1209 if l % 4 != 0:
jpayne@68 1210 l_extranul = 4 - l % 4
jpayne@68 1211
jpayne@68 1212 cdef bam1_t * retval = pysam_bam_update(src,
jpayne@68 1213 src.core.l_qname,
jpayne@68 1214 l + l_extranul,
jpayne@68 1215 <uint8_t*>p)
jpayne@68 1216 if retval == NULL:
jpayne@68 1217 raise MemoryError("could not allocate memory")
jpayne@68 1218
jpayne@68 1219 src.core.l_extranul = l_extranul
jpayne@68 1220 src.core.l_qname = l + l_extranul
jpayne@68 1221
jpayne@68 1222 # re-acquire pointer to location in memory
jpayne@68 1223 # as it might have moved
jpayne@68 1224 p = pysam_bam_get_qname(src)
jpayne@68 1225
jpayne@68 1226 strncpy(p, qname, l)
jpayne@68 1227 # x might be > 255
jpayne@68 1228 cdef uint16_t x = 0
jpayne@68 1229
jpayne@68 1230 for x from l <= x < l + l_extranul:
jpayne@68 1231 p[x] = b'\0'
jpayne@68 1232
jpayne@68 1233 property flag:
jpayne@68 1234 """properties flag"""
jpayne@68 1235 def __get__(self):
jpayne@68 1236 return self._delegate.core.flag
jpayne@68 1237 def __set__(self, flag):
jpayne@68 1238 self._delegate.core.flag = flag
jpayne@68 1239
jpayne@68 1240 property reference_name:
jpayne@68 1241 """:term:`reference` name"""
jpayne@68 1242 def __get__(self):
jpayne@68 1243 if self._delegate.core.tid == -1:
jpayne@68 1244 return None
jpayne@68 1245 if self.header:
jpayne@68 1246 return self.header.get_reference_name(self._delegate.core.tid)
jpayne@68 1247 else:
jpayne@68 1248 raise ValueError("reference_name unknown if no header associated with record")
jpayne@68 1249 def __set__(self, reference):
jpayne@68 1250 cdef int tid
jpayne@68 1251 if reference is None or reference == "*":
jpayne@68 1252 self._delegate.core.tid = -1
jpayne@68 1253 elif self.header:
jpayne@68 1254 tid = self.header.get_tid(reference)
jpayne@68 1255 if tid < 0:
jpayne@68 1256 raise ValueError("reference {} does not exist in header".format(
jpayne@68 1257 reference))
jpayne@68 1258 self._delegate.core.tid = tid
jpayne@68 1259 else:
jpayne@68 1260 raise ValueError("reference_name can not be set if no header associated with record")
jpayne@68 1261
jpayne@68 1262 property reference_id:
jpayne@68 1263 """:term:`reference` ID
jpayne@68 1264
jpayne@68 1265 .. note::
jpayne@68 1266
jpayne@68 1267 This field contains the index of the reference sequence in
jpayne@68 1268 the sequence dictionary. To obtain the name of the
jpayne@68 1269 reference sequence, use :meth:`get_reference_name()`
jpayne@68 1270
jpayne@68 1271 """
jpayne@68 1272 def __get__(self):
jpayne@68 1273 return self._delegate.core.tid
jpayne@68 1274 def __set__(self, tid):
jpayne@68 1275 if tid != -1 and self.header and not self.header.is_valid_tid(tid):
jpayne@68 1276 raise ValueError("reference id {} does not exist in header".format(
jpayne@68 1277 tid))
jpayne@68 1278 self._delegate.core.tid = tid
jpayne@68 1279
jpayne@68 1280 property reference_start:
jpayne@68 1281 """0-based leftmost coordinate"""
jpayne@68 1282 def __get__(self):
jpayne@68 1283 return self._delegate.core.pos
jpayne@68 1284 def __set__(self, pos):
jpayne@68 1285 ## setting the position requires updating the "bin" attribute
jpayne@68 1286 cdef bam1_t * src
jpayne@68 1287 src = self._delegate
jpayne@68 1288 src.core.pos = pos
jpayne@68 1289 update_bin(src)
jpayne@68 1290
jpayne@68 1291 property mapping_quality:
jpayne@68 1292 """mapping quality"""
jpayne@68 1293 def __get__(self):
jpayne@68 1294 return pysam_get_qual(self._delegate)
jpayne@68 1295 def __set__(self, qual):
jpayne@68 1296 pysam_set_qual(self._delegate, qual)
jpayne@68 1297
jpayne@68 1298 property cigarstring:
jpayne@68 1299 '''the :term:`cigar` alignment as a string.
jpayne@68 1300
jpayne@68 1301 The cigar string is a string of alternating integers
jpayne@68 1302 and characters denoting the length and the type of
jpayne@68 1303 an operation.
jpayne@68 1304
jpayne@68 1305 .. note::
jpayne@68 1306 The order length,operation is specified in the
jpayne@68 1307 SAM format. It is different from the order of
jpayne@68 1308 the :attr:`cigar` property.
jpayne@68 1309
jpayne@68 1310 Returns None if not present.
jpayne@68 1311
jpayne@68 1312 To unset the cigarstring, assign None or the
jpayne@68 1313 empty string.
jpayne@68 1314 '''
jpayne@68 1315 def __get__(self):
jpayne@68 1316 c = self.cigartuples
jpayne@68 1317 if c is None:
jpayne@68 1318 return None
jpayne@68 1319 # reverse order
jpayne@68 1320 else:
jpayne@68 1321 return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
jpayne@68 1322
jpayne@68 1323 def __set__(self, cigar):
jpayne@68 1324 if cigar is None or len(cigar) == 0:
jpayne@68 1325 self.cigartuples = []
jpayne@68 1326 else:
jpayne@68 1327 parts = CIGAR_REGEX.findall(cigar)
jpayne@68 1328 # reverse order
jpayne@68 1329 self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts]
jpayne@68 1330
jpayne@68 1331 # TODO
jpayne@68 1332 # property cigar:
jpayne@68 1333 # """the cigar alignment"""
jpayne@68 1334
jpayne@68 1335 property next_reference_id:
jpayne@68 1336 """the :term:`reference` id of the mate/next read."""
jpayne@68 1337 def __get__(self):
jpayne@68 1338 return self._delegate.core.mtid
jpayne@68 1339 def __set__(self, mtid):
jpayne@68 1340 if mtid != -1 and self.header and not self.header.is_valid_tid(mtid):
jpayne@68 1341 raise ValueError("reference id {} does not exist in header".format(
jpayne@68 1342 mtid))
jpayne@68 1343 self._delegate.core.mtid = mtid
jpayne@68 1344
jpayne@68 1345 property next_reference_name:
jpayne@68 1346 """:term:`reference` name of the mate/next read (None if no
jpayne@68 1347 AlignmentFile is associated)"""
jpayne@68 1348 def __get__(self):
jpayne@68 1349 if self._delegate.core.mtid == -1:
jpayne@68 1350 return None
jpayne@68 1351 if self.header:
jpayne@68 1352 return self.header.get_reference_name(self._delegate.core.mtid)
jpayne@68 1353 else:
jpayne@68 1354 raise ValueError("next_reference_name unknown if no header associated with record")
jpayne@68 1355
jpayne@68 1356 def __set__(self, reference):
jpayne@68 1357 cdef int mtid
jpayne@68 1358 if reference is None or reference == "*":
jpayne@68 1359 self._delegate.core.mtid = -1
jpayne@68 1360 elif reference == "=":
jpayne@68 1361 self._delegate.core.mtid = self._delegate.core.tid
jpayne@68 1362 elif self.header:
jpayne@68 1363 mtid = self.header.get_tid(reference)
jpayne@68 1364 if mtid < 0:
jpayne@68 1365 raise ValueError("reference {} does not exist in header".format(
jpayne@68 1366 reference))
jpayne@68 1367 self._delegate.core.mtid = mtid
jpayne@68 1368 else:
jpayne@68 1369 raise ValueError("next_reference_name can not be set if no header associated with record")
jpayne@68 1370
jpayne@68 1371 property next_reference_start:
jpayne@68 1372 """the position of the mate/next read."""
jpayne@68 1373 def __get__(self):
jpayne@68 1374 return self._delegate.core.mpos
jpayne@68 1375 def __set__(self, mpos):
jpayne@68 1376 self._delegate.core.mpos = mpos
jpayne@68 1377
jpayne@68 1378 property query_length:
jpayne@68 1379 """the length of the query/read.
jpayne@68 1380
jpayne@68 1381 This value corresponds to the length of the sequence supplied
jpayne@68 1382 in the BAM/SAM file. The length of a query is 0 if there is no
jpayne@68 1383 sequence in the BAM/SAM file. In those cases, the read length
jpayne@68 1384 can be inferred from the CIGAR alignment, see
jpayne@68 1385 :meth:`pysam.AlignedSegment.infer_query_length`.
jpayne@68 1386
jpayne@68 1387 The length includes soft-clipped bases and is equal to
jpayne@68 1388 ``len(query_sequence)``.
jpayne@68 1389
jpayne@68 1390 This property is read-only but is updated when a new query
jpayne@68 1391 sequence is assigned to this AlignedSegment.
jpayne@68 1392
jpayne@68 1393 Returns 0 if not available.
jpayne@68 1394
jpayne@68 1395 """
jpayne@68 1396 def __get__(self):
jpayne@68 1397 return self._delegate.core.l_qseq
jpayne@68 1398
jpayne@68 1399 property template_length:
jpayne@68 1400 """the observed query template length"""
jpayne@68 1401 def __get__(self):
jpayne@68 1402 return self._delegate.core.isize
jpayne@68 1403 def __set__(self, isize):
jpayne@68 1404 self._delegate.core.isize = isize
jpayne@68 1405
jpayne@68 1406 property query_sequence:
jpayne@68 1407 """read sequence bases, including :term:`soft clipped` bases
jpayne@68 1408 (None if not present).
jpayne@68 1409
jpayne@68 1410 Assigning to this attribute will invalidate any quality scores.
jpayne@68 1411 Thus, to in-place edit the sequence and quality scores, copies of
jpayne@68 1412 the quality scores need to be taken. Consider trimming for example::
jpayne@68 1413
jpayne@68 1414 q = read.query_qualities
jpayne@68 1415 read.query_sequence = read.query_sequence[5:10]
jpayne@68 1416 read.query_qualities = q[5:10]
jpayne@68 1417
jpayne@68 1418 The sequence is returned as it is stored in the BAM file. (This will
jpayne@68 1419 be the reverse complement of the original read sequence if the mapper
jpayne@68 1420 has aligned the read to the reverse strand.)
jpayne@68 1421 """
jpayne@68 1422 def __get__(self):
jpayne@68 1423 if self.cache_query_sequence:
jpayne@68 1424 return self.cache_query_sequence
jpayne@68 1425
jpayne@68 1426 cdef bam1_t * src
jpayne@68 1427 cdef char * s
jpayne@68 1428 src = self._delegate
jpayne@68 1429
jpayne@68 1430 if src.core.l_qseq == 0:
jpayne@68 1431 return None
jpayne@68 1432
jpayne@68 1433 self.cache_query_sequence = force_str(getSequenceInRange(
jpayne@68 1434 src, 0, src.core.l_qseq))
jpayne@68 1435 return self.cache_query_sequence
jpayne@68 1436
jpayne@68 1437 def __set__(self, seq):
jpayne@68 1438 # samtools manages sequence and quality length memory together
jpayne@68 1439 # if no quality information is present, the first byte says 0xff.
jpayne@68 1440 cdef bam1_t * src
jpayne@68 1441 cdef uint8_t * p
jpayne@68 1442 cdef char * s
jpayne@68 1443 cdef int l, k
jpayne@68 1444 cdef Py_ssize_t nbytes_new, nbytes_old
jpayne@68 1445
jpayne@68 1446 if seq == None:
jpayne@68 1447 l = 0
jpayne@68 1448 else:
jpayne@68 1449 l = len(seq)
jpayne@68 1450 seq = force_bytes(seq)
jpayne@68 1451
jpayne@68 1452 src = self._delegate
jpayne@68 1453
jpayne@68 1454 # as the sequence is stored in half-bytes, the total length (sequence
jpayne@68 1455 # plus quality scores) is (l+1)/2 + l
jpayne@68 1456 nbytes_new = (l + 1) // 2 + l
jpayne@68 1457 nbytes_old = (src.core.l_qseq + 1) // 2 + src.core.l_qseq
jpayne@68 1458
jpayne@68 1459 # acquire pointer to location in memory
jpayne@68 1460 p = pysam_bam_get_seq(src)
jpayne@68 1461 src.core.l_qseq = l
jpayne@68 1462
jpayne@68 1463 # change length of data field
jpayne@68 1464 cdef bam1_t * retval = pysam_bam_update(src,
jpayne@68 1465 nbytes_old,
jpayne@68 1466 nbytes_new,
jpayne@68 1467 p)
jpayne@68 1468
jpayne@68 1469 if retval == NULL:
jpayne@68 1470 raise MemoryError("could not allocate memory")
jpayne@68 1471
jpayne@68 1472 if l > 0:
jpayne@68 1473 # re-acquire pointer to location in memory
jpayne@68 1474 # as it might have moved
jpayne@68 1475 p = pysam_bam_get_seq(src)
jpayne@68 1476 for k from 0 <= k < nbytes_new:
jpayne@68 1477 p[k] = 0
jpayne@68 1478 # convert to C string
jpayne@68 1479 s = seq
jpayne@68 1480 for k from 0 <= k < l:
jpayne@68 1481 p[k // 2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
jpayne@68 1482
jpayne@68 1483 # erase qualities
jpayne@68 1484 p = pysam_bam_get_qual(src)
jpayne@68 1485 memset(p, 0xff, l)
jpayne@68 1486
jpayne@68 1487 self.cache_query_sequence = force_str(seq)
jpayne@68 1488
jpayne@68 1489 # clear cached values for quality values
jpayne@68 1490 self.cache_query_qualities = None
jpayne@68 1491 self.cache_query_alignment_qualities = None
jpayne@68 1492
jpayne@68 1493 property query_qualities:
jpayne@68 1494 """read sequence base qualities, including :term:`soft clipped` bases
jpayne@68 1495 (None if not present).
jpayne@68 1496
jpayne@68 1497 Quality scores are returned as a python array of unsigned
jpayne@68 1498 chars. Note that this is not the ASCII-encoded value typically
jpayne@68 1499 seen in FASTQ or SAM formatted files. Thus, no offset of 33
jpayne@68 1500 needs to be subtracted.
jpayne@68 1501
jpayne@68 1502 Note that to set quality scores the sequence has to be set
jpayne@68 1503 beforehand as this will determine the expected length of the
jpayne@68 1504 quality score array.
jpayne@68 1505
jpayne@68 1506 This method raises a ValueError if the length of the
jpayne@68 1507 quality scores and the sequence are not the same.
jpayne@68 1508
jpayne@68 1509 """
jpayne@68 1510 def __get__(self):
jpayne@68 1511
jpayne@68 1512 if self.cache_query_qualities:
jpayne@68 1513 return self.cache_query_qualities
jpayne@68 1514
jpayne@68 1515 cdef bam1_t * src
jpayne@68 1516 cdef char * q
jpayne@68 1517
jpayne@68 1518 src = self._delegate
jpayne@68 1519
jpayne@68 1520 if src.core.l_qseq == 0:
jpayne@68 1521 return None
jpayne@68 1522
jpayne@68 1523 self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
jpayne@68 1524 return self.cache_query_qualities
jpayne@68 1525
jpayne@68 1526 def __set__(self, qual):
jpayne@68 1527
jpayne@68 1528 # note that memory is already allocated via setting the sequence
jpayne@68 1529 # hence length match of sequence and quality needs is checked.
jpayne@68 1530 cdef bam1_t * src
jpayne@68 1531 cdef uint8_t * p
jpayne@68 1532 cdef int l
jpayne@68 1533
jpayne@68 1534 src = self._delegate
jpayne@68 1535 p = pysam_bam_get_qual(src)
jpayne@68 1536 if qual is None or len(qual) == 0:
jpayne@68 1537 # if absent and there is a sequence: set to 0xff
jpayne@68 1538 memset(p, 0xff, src.core.l_qseq)
jpayne@68 1539 return
jpayne@68 1540
jpayne@68 1541 # check for length match
jpayne@68 1542 l = len(qual)
jpayne@68 1543 if src.core.l_qseq != l:
jpayne@68 1544 raise ValueError(
jpayne@68 1545 "quality and sequence mismatch: %i != %i" %
jpayne@68 1546 (l, src.core.l_qseq))
jpayne@68 1547
jpayne@68 1548 # create a python array object filling it
jpayne@68 1549 # with the quality scores
jpayne@68 1550
jpayne@68 1551 # NB: should avoid this copying if qual is
jpayne@68 1552 # already of the correct type.
jpayne@68 1553 cdef c_array.array result = c_array.array('B', qual)
jpayne@68 1554
jpayne@68 1555 # copy data
jpayne@68 1556 memcpy(p, result.data.as_voidptr, l)
jpayne@68 1557
jpayne@68 1558 # save in cache
jpayne@68 1559 self.cache_query_qualities = qual
jpayne@68 1560
jpayne@68 1561 property bin:
jpayne@68 1562 """properties bin"""
jpayne@68 1563 def __get__(self):
jpayne@68 1564 return self._delegate.core.bin
jpayne@68 1565 def __set__(self, bin):
jpayne@68 1566 self._delegate.core.bin = bin
jpayne@68 1567
jpayne@68 1568
jpayne@68 1569 ##########################################################
jpayne@68 1570 # Derived simple attributes. These are simple attributes of
jpayne@68 1571 # AlignedSegment getting and setting values.
jpayne@68 1572 ##########################################################
jpayne@68 1573 # 1. Flags
jpayne@68 1574 ##########################################################
jpayne@68 1575 property is_paired:
jpayne@68 1576 """true if read is paired in sequencing"""
jpayne@68 1577 def __get__(self):
jpayne@68 1578 return (self.flag & BAM_FPAIRED) != 0
jpayne@68 1579 def __set__(self,val):
jpayne@68 1580 pysam_update_flag(self._delegate, val, BAM_FPAIRED)
jpayne@68 1581
jpayne@68 1582 property is_proper_pair:
jpayne@68 1583 """true if read is mapped in a proper pair"""
jpayne@68 1584 def __get__(self):
jpayne@68 1585 return (self.flag & BAM_FPROPER_PAIR) != 0
jpayne@68 1586 def __set__(self,val):
jpayne@68 1587 pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
jpayne@68 1588
jpayne@68 1589 property is_unmapped:
jpayne@68 1590 """true if read itself is unmapped"""
jpayne@68 1591 def __get__(self):
jpayne@68 1592 return (self.flag & BAM_FUNMAP) != 0
jpayne@68 1593 def __set__(self, val):
jpayne@68 1594 pysam_update_flag(self._delegate, val, BAM_FUNMAP)
jpayne@68 1595 # setting the unmapped flag requires recalculation of
jpayne@68 1596 # bin as alignment length is now implicitly 1
jpayne@68 1597 update_bin(self._delegate)
jpayne@68 1598
jpayne@68 1599 property is_mapped:
jpayne@68 1600 """true if read itself is mapped
jpayne@68 1601 (implemented in terms of :attr:`is_unmapped`)"""
jpayne@68 1602 def __get__(self):
jpayne@68 1603 return (self.flag & BAM_FUNMAP) == 0
jpayne@68 1604 def __set__(self, val):
jpayne@68 1605 pysam_update_flag(self._delegate, not val, BAM_FUNMAP)
jpayne@68 1606 update_bin(self._delegate)
jpayne@68 1607
jpayne@68 1608 property mate_is_unmapped:
jpayne@68 1609 """true if the mate is unmapped"""
jpayne@68 1610 def __get__(self):
jpayne@68 1611 return (self.flag & BAM_FMUNMAP) != 0
jpayne@68 1612 def __set__(self,val):
jpayne@68 1613 pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
jpayne@68 1614
jpayne@68 1615 property mate_is_mapped:
jpayne@68 1616 """true if the mate is mapped
jpayne@68 1617 (implemented in terms of :attr:`mate_is_unmapped`)"""
jpayne@68 1618 def __get__(self):
jpayne@68 1619 return (self.flag & BAM_FMUNMAP) == 0
jpayne@68 1620 def __set__(self,val):
jpayne@68 1621 pysam_update_flag(self._delegate, not val, BAM_FMUNMAP)
jpayne@68 1622
jpayne@68 1623 property is_reverse:
jpayne@68 1624 """true if read is mapped to reverse strand"""
jpayne@68 1625 def __get__(self):
jpayne@68 1626 return (self.flag & BAM_FREVERSE) != 0
jpayne@68 1627 def __set__(self,val):
jpayne@68 1628 pysam_update_flag(self._delegate, val, BAM_FREVERSE)
jpayne@68 1629
jpayne@68 1630 property is_forward:
jpayne@68 1631 """true if read is mapped to forward strand
jpayne@68 1632 (implemented in terms of :attr:`is_reverse`)"""
jpayne@68 1633 def __get__(self):
jpayne@68 1634 return (self.flag & BAM_FREVERSE) == 0
jpayne@68 1635 def __set__(self,val):
jpayne@68 1636 pysam_update_flag(self._delegate, not val, BAM_FREVERSE)
jpayne@68 1637
jpayne@68 1638 property mate_is_reverse:
jpayne@68 1639 """true if the mate is mapped to reverse strand"""
jpayne@68 1640 def __get__(self):
jpayne@68 1641 return (self.flag & BAM_FMREVERSE) != 0
jpayne@68 1642 def __set__(self,val):
jpayne@68 1643 pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
jpayne@68 1644
jpayne@68 1645 property mate_is_forward:
jpayne@68 1646 """true if the mate is mapped to forward strand
jpayne@68 1647 (implemented in terms of :attr:`mate_is_reverse`)"""
jpayne@68 1648 def __get__(self):
jpayne@68 1649 return (self.flag & BAM_FMREVERSE) == 0
jpayne@68 1650 def __set__(self,val):
jpayne@68 1651 pysam_update_flag(self._delegate, not val, BAM_FMREVERSE)
jpayne@68 1652
jpayne@68 1653 property is_read1:
jpayne@68 1654 """true if this is read1"""
jpayne@68 1655 def __get__(self):
jpayne@68 1656 return (self.flag & BAM_FREAD1) != 0
jpayne@68 1657 def __set__(self,val):
jpayne@68 1658 pysam_update_flag(self._delegate, val, BAM_FREAD1)
jpayne@68 1659 property is_read2:
jpayne@68 1660 """true if this is read2"""
jpayne@68 1661 def __get__(self):
jpayne@68 1662 return (self.flag & BAM_FREAD2) != 0
jpayne@68 1663 def __set__(self, val):
jpayne@68 1664 pysam_update_flag(self._delegate, val, BAM_FREAD2)
jpayne@68 1665 property is_secondary:
jpayne@68 1666 """true if not primary alignment"""
jpayne@68 1667 def __get__(self):
jpayne@68 1668 return (self.flag & BAM_FSECONDARY) != 0
jpayne@68 1669 def __set__(self, val):
jpayne@68 1670 pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
jpayne@68 1671 property is_qcfail:
jpayne@68 1672 """true if QC failure"""
jpayne@68 1673 def __get__(self):
jpayne@68 1674 return (self.flag & BAM_FQCFAIL) != 0
jpayne@68 1675 def __set__(self, val):
jpayne@68 1676 pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
jpayne@68 1677 property is_duplicate:
jpayne@68 1678 """true if optical or PCR duplicate"""
jpayne@68 1679 def __get__(self):
jpayne@68 1680 return (self.flag & BAM_FDUP) != 0
jpayne@68 1681 def __set__(self, val):
jpayne@68 1682 pysam_update_flag(self._delegate, val, BAM_FDUP)
jpayne@68 1683 property is_supplementary:
jpayne@68 1684 """true if this is a supplementary alignment"""
jpayne@68 1685 def __get__(self):
jpayne@68 1686 return (self.flag & BAM_FSUPPLEMENTARY) != 0
jpayne@68 1687 def __set__(self, val):
jpayne@68 1688 pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
jpayne@68 1689
jpayne@68 1690 # 2. Coordinates and lengths
jpayne@68 1691 property reference_end:
jpayne@68 1692 '''aligned reference position of the read on the reference genome.
jpayne@68 1693
jpayne@68 1694 reference_end points to one past the last aligned residue.
jpayne@68 1695 Returns None if not available (read is unmapped or no cigar
jpayne@68 1696 alignment present).
jpayne@68 1697
jpayne@68 1698 '''
jpayne@68 1699 def __get__(self):
jpayne@68 1700 cdef bam1_t * src
jpayne@68 1701 src = self._delegate
jpayne@68 1702 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
jpayne@68 1703 return None
jpayne@68 1704 return bam_endpos(src)
jpayne@68 1705
jpayne@68 1706 property reference_length:
jpayne@68 1707 '''aligned length of the read on the reference genome.
jpayne@68 1708
jpayne@68 1709 This is equal to `reference_end - reference_start`.
jpayne@68 1710 Returns None if not available.
jpayne@68 1711 '''
jpayne@68 1712 def __get__(self):
jpayne@68 1713 cdef bam1_t * src
jpayne@68 1714 src = self._delegate
jpayne@68 1715 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
jpayne@68 1716 return None
jpayne@68 1717 return bam_endpos(src) - \
jpayne@68 1718 self._delegate.core.pos
jpayne@68 1719
jpayne@68 1720 property query_alignment_sequence:
jpayne@68 1721 """aligned portion of the read.
jpayne@68 1722
jpayne@68 1723 This is a substring of :attr:`query_sequence` that excludes flanking
jpayne@68 1724 bases that were :term:`soft clipped` (None if not present). It
jpayne@68 1725 is equal to ``query_sequence[query_alignment_start:query_alignment_end]``.
jpayne@68 1726
jpayne@68 1727 SAM/BAM files may include extra flanking bases that are not
jpayne@68 1728 part of the alignment. These bases may be the result of the
jpayne@68 1729 Smith-Waterman or other algorithms, which may not require
jpayne@68 1730 alignments that begin at the first residue or end at the last.
jpayne@68 1731 In addition, extra sequencing adapters, multiplex identifiers,
jpayne@68 1732 and low-quality bases that were not considered for alignment
jpayne@68 1733 may have been retained.
jpayne@68 1734
jpayne@68 1735 """
jpayne@68 1736
jpayne@68 1737 def __get__(self):
jpayne@68 1738 if self.cache_query_alignment_sequence:
jpayne@68 1739 return self.cache_query_alignment_sequence
jpayne@68 1740
jpayne@68 1741 cdef bam1_t * src
jpayne@68 1742 cdef uint32_t start, end
jpayne@68 1743
jpayne@68 1744 src = self._delegate
jpayne@68 1745
jpayne@68 1746 if src.core.l_qseq == 0:
jpayne@68 1747 return None
jpayne@68 1748
jpayne@68 1749 start = getQueryStart(src)
jpayne@68 1750 end = getQueryEnd(src)
jpayne@68 1751
jpayne@68 1752 self.cache_query_alignment_sequence = force_str(
jpayne@68 1753 getSequenceInRange(src, start, end))
jpayne@68 1754 return self.cache_query_alignment_sequence
jpayne@68 1755
jpayne@68 1756 property query_alignment_qualities:
jpayne@68 1757 """aligned query sequence quality values (None if not present). These
jpayne@68 1758 are the quality values that correspond to
jpayne@68 1759 :attr:`query_alignment_sequence`, that is, they exclude qualities of
jpayne@68 1760 :term:`soft clipped` bases. This is equal to
jpayne@68 1761 ``query_qualities[query_alignment_start:query_alignment_end]``.
jpayne@68 1762
jpayne@68 1763 Quality scores are returned as a python array of unsigned
jpayne@68 1764 chars. Note that this is not the ASCII-encoded value typically
jpayne@68 1765 seen in FASTQ or SAM formatted files. Thus, no offset of 33
jpayne@68 1766 needs to be subtracted.
jpayne@68 1767
jpayne@68 1768 This property is read-only.
jpayne@68 1769
jpayne@68 1770 """
jpayne@68 1771 def __get__(self):
jpayne@68 1772
jpayne@68 1773 if self.cache_query_alignment_qualities:
jpayne@68 1774 return self.cache_query_alignment_qualities
jpayne@68 1775
jpayne@68 1776 cdef bam1_t * src
jpayne@68 1777 cdef uint32_t start, end
jpayne@68 1778
jpayne@68 1779 src = self._delegate
jpayne@68 1780
jpayne@68 1781 if src.core.l_qseq == 0:
jpayne@68 1782 return None
jpayne@68 1783
jpayne@68 1784 start = getQueryStart(src)
jpayne@68 1785 end = getQueryEnd(src)
jpayne@68 1786 self.cache_query_alignment_qualities = \
jpayne@68 1787 getQualitiesInRange(src, start, end)
jpayne@68 1788 return self.cache_query_alignment_qualities
jpayne@68 1789
jpayne@68 1790 property query_alignment_start:
jpayne@68 1791 """start index of the aligned query portion of the sequence (0-based,
jpayne@68 1792 inclusive).
jpayne@68 1793
jpayne@68 1794 This the index of the first base in :attr:`query_sequence`
jpayne@68 1795 that is not soft-clipped.
jpayne@68 1796 """
jpayne@68 1797 def __get__(self):
jpayne@68 1798 return getQueryStart(self._delegate)
jpayne@68 1799
jpayne@68 1800 property query_alignment_end:
jpayne@68 1801 """end index of the aligned query portion of the sequence (0-based,
jpayne@68 1802 exclusive)
jpayne@68 1803
jpayne@68 1804 This the index just past the last base in :attr:`query_sequence`
jpayne@68 1805 that is not soft-clipped.
jpayne@68 1806 """
jpayne@68 1807 def __get__(self):
jpayne@68 1808 return getQueryEnd(self._delegate)
jpayne@68 1809
jpayne@68 1810 property modified_bases:
jpayne@68 1811 """Modified bases annotations from Ml/Mm tags. The output is
jpayne@68 1812 Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...]
jpayne@68 1813 with qual being (256*probability), or -1 if unknown.
jpayne@68 1814 Strand==0 for forward and 1 for reverse strand modification
jpayne@68 1815 """
jpayne@68 1816 def __get__(self):
jpayne@68 1817 cdef bam1_t * src
jpayne@68 1818 cdef hts_base_mod_state *m = hts_base_mod_state_alloc()
jpayne@68 1819 cdef hts_base_mod mods[5]
jpayne@68 1820 cdef int pos
jpayne@68 1821
jpayne@68 1822 ret = {}
jpayne@68 1823 src = self._delegate
jpayne@68 1824
jpayne@68 1825 if bam_parse_basemod(src, m) < 0:
jpayne@68 1826 return None
jpayne@68 1827
jpayne@68 1828 n = bam_next_basemod(src, m, mods, 5, &pos)
jpayne@68 1829
jpayne@68 1830 while n>0:
jpayne@68 1831 for i in range(n):
jpayne@68 1832 mod_code = chr(mods[i].modified_base) if mods[i].modified_base>0 else -mods[i].modified_base
jpayne@68 1833 mod_strand = mods[i].strand
jpayne@68 1834 if self.is_reverse:
jpayne@68 1835 mod_strand = 1 - mod_strand
jpayne@68 1836 key = (chr(mods[i].canonical_base),
jpayne@68 1837 mod_strand,
jpayne@68 1838 mod_code )
jpayne@68 1839 ret.setdefault(key,[]).append((pos,mods[i].qual))
jpayne@68 1840
jpayne@68 1841 n = bam_next_basemod(src, m, mods, 5, &pos)
jpayne@68 1842
jpayne@68 1843 if n<0:
jpayne@68 1844 return None
jpayne@68 1845
jpayne@68 1846 hts_base_mod_state_free(m)
jpayne@68 1847 return ret
jpayne@68 1848
jpayne@68 1849 property modified_bases_forward:
jpayne@68 1850 """Modified bases annotations from Ml/Mm tags. The output is
jpayne@68 1851 Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...]
jpayne@68 1852 with qual being (256*probability), or -1 if unknown.
jpayne@68 1853 Strand==0 for forward and 1 for reverse strand modification.
jpayne@68 1854 The positions are with respect to the original sequence from get_forward_sequence()
jpayne@68 1855 """
jpayne@68 1856 def __get__(self):
jpayne@68 1857 pmods = self.modified_bases
jpayne@68 1858 if pmods and self.is_reverse:
jpayne@68 1859 rmod = {}
jpayne@68 1860
jpayne@68 1861 # Try to find the length of the original sequence
jpayne@68 1862 rlen = self.infer_read_length()
jpayne@68 1863 if rlen is None and self.query_sequence is None:
jpayne@68 1864 return rmod
jpayne@68 1865 else:
jpayne@68 1866 rlen = len(self.query_sequence)
jpayne@68 1867
jpayne@68 1868 for k,mods in pmods.items():
jpayne@68 1869 nk = k[0],1 - k[1],k[2]
jpayne@68 1870 for i in range(len(mods)):
jpayne@68 1871
jpayne@68 1872 mods[i] = (rlen - 1 -mods[i][0], mods[i][1])
jpayne@68 1873 rmod[nk] = mods
jpayne@68 1874 return rmod
jpayne@68 1875
jpayne@68 1876 return pmods
jpayne@68 1877
jpayne@68 1878
jpayne@68 1879 property query_alignment_length:
jpayne@68 1880 """length of the aligned query sequence.
jpayne@68 1881
jpayne@68 1882 This is equal to :attr:`query_alignment_end` -
jpayne@68 1883 :attr:`query_alignment_start`
jpayne@68 1884 """
jpayne@68 1885 def __get__(self):
jpayne@68 1886 cdef bam1_t * src
jpayne@68 1887 src = self._delegate
jpayne@68 1888 return getQueryEnd(src) - getQueryStart(src)
jpayne@68 1889
jpayne@68 1890 #####################################################
jpayne@68 1891 # Computed properties
jpayne@68 1892
jpayne@68 1893 def get_reference_positions(self, full_length=False):
jpayne@68 1894 """a list of reference positions that this read aligns to.
jpayne@68 1895
jpayne@68 1896 By default, this method returns the (0-based) positions on the
jpayne@68 1897 reference that are within the read's alignment, leaving gaps
jpayne@68 1898 corresponding to deletions and other reference skips.
jpayne@68 1899
jpayne@68 1900 When *full_length* is True, the returned list is the same length
jpayne@68 1901 as the read and additionally includes None values corresponding
jpayne@68 1902 to insertions or soft-clipping, i.e., to bases of the read that
jpayne@68 1903 are not aligned to a reference position.
jpayne@68 1904 (See also :meth:`get_aligned_pairs` which additionally returns
jpayne@68 1905 the corresponding positions along the read.)
jpayne@68 1906 """
jpayne@68 1907 cdef uint32_t k, i, l, pos
jpayne@68 1908 cdef int op
jpayne@68 1909 cdef uint32_t * cigar_p
jpayne@68 1910 cdef bam1_t * src
jpayne@68 1911 cdef bint _full = full_length
jpayne@68 1912
jpayne@68 1913 src = self._delegate
jpayne@68 1914 if pysam_get_n_cigar(src) == 0:
jpayne@68 1915 return []
jpayne@68 1916
jpayne@68 1917 result = []
jpayne@68 1918 pos = src.core.pos
jpayne@68 1919 cigar_p = pysam_bam_get_cigar(src)
jpayne@68 1920
jpayne@68 1921 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 1922 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 1923 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 1924
jpayne@68 1925 if op == BAM_CSOFT_CLIP or op == BAM_CINS:
jpayne@68 1926 if _full:
jpayne@68 1927 for i from 0 <= i < l:
jpayne@68 1928 result.append(None)
jpayne@68 1929 elif op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@68 1930 for i from pos <= i < pos + l:
jpayne@68 1931 result.append(i)
jpayne@68 1932 pos += l
jpayne@68 1933 elif op == BAM_CDEL or op == BAM_CREF_SKIP:
jpayne@68 1934 pos += l
jpayne@68 1935
jpayne@68 1936 return result
jpayne@68 1937
jpayne@68 1938 def infer_query_length(self, always=False):
jpayne@68 1939 """infer query length from CIGAR alignment.
jpayne@68 1940
jpayne@68 1941 This method deduces the query length from the CIGAR alignment
jpayne@68 1942 but does not include hard-clipped bases.
jpayne@68 1943
jpayne@68 1944 Returns None if CIGAR alignment is not present.
jpayne@68 1945
jpayne@68 1946 If *always* is set to True, `infer_read_length` is used instead.
jpayne@68 1947 This is deprecated and only present for backward compatibility.
jpayne@68 1948 """
jpayne@68 1949 if always is True:
jpayne@68 1950 return self.infer_read_length()
jpayne@68 1951 cdef int32_t l = calculateQueryLengthWithoutHardClipping(self._delegate)
jpayne@68 1952 if l > 0:
jpayne@68 1953 return l
jpayne@68 1954 else:
jpayne@68 1955 return None
jpayne@68 1956
jpayne@68 1957 def infer_read_length(self):
jpayne@68 1958 """infer read length from CIGAR alignment.
jpayne@68 1959
jpayne@68 1960 This method deduces the read length from the CIGAR alignment
jpayne@68 1961 including hard-clipped bases.
jpayne@68 1962
jpayne@68 1963 Returns None if CIGAR alignment is not present.
jpayne@68 1964 """
jpayne@68 1965 cdef int32_t l = calculateQueryLengthWithHardClipping(self._delegate)
jpayne@68 1966 if l > 0:
jpayne@68 1967 return l
jpayne@68 1968 else:
jpayne@68 1969 return None
jpayne@68 1970
jpayne@68 1971 def get_reference_sequence(self):
jpayne@68 1972 """return the reference sequence in the region that is covered by the
jpayne@68 1973 alignment of the read to the reference.
jpayne@68 1974
jpayne@68 1975 This method requires the MD tag to be set.
jpayne@68 1976
jpayne@68 1977 """
jpayne@68 1978 return force_str(build_reference_sequence(self._delegate))
jpayne@68 1979
jpayne@68 1980 def get_forward_sequence(self):
jpayne@68 1981 """return the original read sequence.
jpayne@68 1982
jpayne@68 1983 Reads mapped to the reverse strand are stored reverse complemented in
jpayne@68 1984 the BAM file. This method returns such reads reverse complemented back
jpayne@68 1985 to their original orientation.
jpayne@68 1986
jpayne@68 1987 Returns None if the record has no query sequence.
jpayne@68 1988 """
jpayne@68 1989 if self.query_sequence is None:
jpayne@68 1990 return None
jpayne@68 1991 s = force_str(self.query_sequence)
jpayne@68 1992 if self.is_reverse:
jpayne@68 1993 s = s.translate(str.maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1]
jpayne@68 1994 return s
jpayne@68 1995
jpayne@68 1996 def get_forward_qualities(self):
jpayne@68 1997 """return the original base qualities of the read sequence,
jpayne@68 1998 in the same format as the :attr:`query_qualities` property.
jpayne@68 1999
jpayne@68 2000 Reads mapped to the reverse strand have their base qualities stored
jpayne@68 2001 reversed in the BAM file. This method returns such reads' base qualities
jpayne@68 2002 reversed back to their original orientation.
jpayne@68 2003 """
jpayne@68 2004 if self.is_reverse:
jpayne@68 2005 return self.query_qualities[::-1]
jpayne@68 2006 else:
jpayne@68 2007 return self.query_qualities
jpayne@68 2008
jpayne@68 2009 def get_aligned_pairs(self, matches_only=False, with_seq=False, with_cigar=False):
jpayne@68 2010 """a list of aligned read (query) and reference positions.
jpayne@68 2011
jpayne@68 2012 Each item in the returned list is a tuple consisting of
jpayne@68 2013 the 0-based offset from the start of the read sequence
jpayne@68 2014 followed by the 0-based reference position.
jpayne@68 2015
jpayne@68 2016 For inserts, deletions, skipping either query or reference
jpayne@68 2017 position may be None.
jpayne@68 2018
jpayne@68 2019 For padding in the reference, the reference position will
jpayne@68 2020 always be None.
jpayne@68 2021
jpayne@68 2022 Parameters
jpayne@68 2023 ----------
jpayne@68 2024
jpayne@68 2025 matches_only : bool
jpayne@68 2026 If True, only matched bases are returned --- no None on either
jpayne@68 2027 side.
jpayne@68 2028 with_seq : bool
jpayne@68 2029 If True, return a third element in the tuple containing the
jpayne@68 2030 reference sequence. For CIGAR 'P' (padding in the reference)
jpayne@68 2031 operations, the third tuple element will be None. Substitutions
jpayne@68 2032 are lower-case. This option requires an MD tag to be present.
jpayne@68 2033 with_cigar : bool
jpayne@68 2034 If True, return an extra element in the tuple containing the
jpayne@68 2035 CIGAR operator corresponding to this position tuple.
jpayne@68 2036
jpayne@68 2037 Returns
jpayne@68 2038 -------
jpayne@68 2039
jpayne@68 2040 aligned_pairs : list of tuples
jpayne@68 2041
jpayne@68 2042 """
jpayne@68 2043 cdef uint32_t k, i, pos, qpos, r_idx, l
jpayne@68 2044 cdef int op
jpayne@68 2045 cdef uint32_t * cigar_p
jpayne@68 2046 cdef bam1_t * src = self._delegate
jpayne@68 2047 cdef bint _matches_only = bool(matches_only)
jpayne@68 2048 cdef bint _with_seq = bool(with_seq)
jpayne@68 2049 cdef bint _with_cigar = bool(with_cigar)
jpayne@68 2050 cdef object (*make_tuple)(object, object, object, uint32_t, int)
jpayne@68 2051
jpayne@68 2052 # TODO: this method performs no checking and assumes that
jpayne@68 2053 # read sequence, cigar and MD tag are consistent.
jpayne@68 2054
jpayne@68 2055 if _with_seq:
jpayne@68 2056 # force_str required for py2/py3 compatibility
jpayne@68 2057 ref_seq = force_str(build_reference_sequence(src))
jpayne@68 2058 if ref_seq is None:
jpayne@68 2059 raise ValueError("MD tag not present")
jpayne@68 2060 make_tuple = _alignedpairs_with_seq_cigar if _with_cigar else _alignedpairs_with_seq
jpayne@68 2061 else:
jpayne@68 2062 ref_seq = None
jpayne@68 2063 make_tuple = _alignedpairs_with_cigar if _with_cigar else _alignedpairs_positions
jpayne@68 2064
jpayne@68 2065 r_idx = 0
jpayne@68 2066
jpayne@68 2067 if pysam_get_n_cigar(src) == 0:
jpayne@68 2068 return []
jpayne@68 2069
jpayne@68 2070 result = []
jpayne@68 2071 pos = src.core.pos
jpayne@68 2072 qpos = 0
jpayne@68 2073 cigar_p = pysam_bam_get_cigar(src)
jpayne@68 2074 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 2075 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 2076 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 2077
jpayne@68 2078 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@68 2079 for i from pos <= i < pos + l:
jpayne@68 2080 result.append(make_tuple(qpos, i, ref_seq, r_idx, op))
jpayne@68 2081 r_idx += 1
jpayne@68 2082 qpos += 1
jpayne@68 2083 pos += l
jpayne@68 2084
jpayne@68 2085 elif op == BAM_CINS or op == BAM_CSOFT_CLIP or op == BAM_CPAD:
jpayne@68 2086 if not _matches_only:
jpayne@68 2087 for i from pos <= i < pos + l:
jpayne@68 2088 result.append(make_tuple(qpos, None, None, 0, op))
jpayne@68 2089 qpos += 1
jpayne@68 2090 else:
jpayne@68 2091 qpos += l
jpayne@68 2092
jpayne@68 2093 elif op == BAM_CDEL:
jpayne@68 2094 if not _matches_only:
jpayne@68 2095 for i from pos <= i < pos + l:
jpayne@68 2096 result.append(make_tuple(None, i, ref_seq, r_idx, op))
jpayne@68 2097 r_idx += 1
jpayne@68 2098 else:
jpayne@68 2099 r_idx += l
jpayne@68 2100 pos += l
jpayne@68 2101
jpayne@68 2102 elif op == BAM_CHARD_CLIP:
jpayne@68 2103 pass # advances neither
jpayne@68 2104
jpayne@68 2105 elif op == BAM_CREF_SKIP:
jpayne@68 2106 if not _matches_only:
jpayne@68 2107 for i from pos <= i < pos + l:
jpayne@68 2108 result.append(make_tuple(None, i, None, 0, op))
jpayne@68 2109
jpayne@68 2110 pos += l
jpayne@68 2111
jpayne@68 2112 return result
jpayne@68 2113
jpayne@68 2114 def get_blocks(self):
jpayne@68 2115 """ a list of start and end positions of
jpayne@68 2116 aligned gapless blocks.
jpayne@68 2117
jpayne@68 2118 The start and end positions are in genomic
jpayne@68 2119 coordinates.
jpayne@68 2120
jpayne@68 2121 Blocks are not normalized, i.e. two blocks
jpayne@68 2122 might be directly adjacent. This happens if
jpayne@68 2123 the two blocks are separated by an insertion
jpayne@68 2124 in the read.
jpayne@68 2125 """
jpayne@68 2126
jpayne@68 2127 cdef uint32_t k, pos, l
jpayne@68 2128 cdef int op
jpayne@68 2129 cdef uint32_t * cigar_p
jpayne@68 2130 cdef bam1_t * src
jpayne@68 2131
jpayne@68 2132 src = self._delegate
jpayne@68 2133 if pysam_get_n_cigar(src) == 0:
jpayne@68 2134 return []
jpayne@68 2135
jpayne@68 2136 result = []
jpayne@68 2137 pos = src.core.pos
jpayne@68 2138 cigar_p = pysam_bam_get_cigar(src)
jpayne@68 2139 l = 0
jpayne@68 2140
jpayne@68 2141 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 2142 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 2143 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 2144 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@68 2145 result.append((pos, pos + l))
jpayne@68 2146 pos += l
jpayne@68 2147 elif op == BAM_CDEL or op == BAM_CREF_SKIP:
jpayne@68 2148 pos += l
jpayne@68 2149
jpayne@68 2150 return result
jpayne@68 2151
jpayne@68 2152 def get_overlap(self, uint32_t start, uint32_t end):
jpayne@68 2153 """return number of aligned bases of read overlapping the interval
jpayne@68 2154 *start* and *end* on the reference sequence.
jpayne@68 2155
jpayne@68 2156 Return None if cigar alignment is not available.
jpayne@68 2157 """
jpayne@68 2158 cdef uint32_t k, i, pos, overlap
jpayne@68 2159 cdef int op, o
jpayne@68 2160 cdef uint32_t * cigar_p
jpayne@68 2161 cdef bam1_t * src
jpayne@68 2162
jpayne@68 2163 overlap = 0
jpayne@68 2164
jpayne@68 2165 src = self._delegate
jpayne@68 2166 if pysam_get_n_cigar(src) == 0:
jpayne@68 2167 return None
jpayne@68 2168 pos = src.core.pos
jpayne@68 2169 o = 0
jpayne@68 2170
jpayne@68 2171 cigar_p = pysam_bam_get_cigar(src)
jpayne@68 2172 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 2173 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 2174 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 2175
jpayne@68 2176 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@68 2177 o = min( pos + l, end) - max( pos, start )
jpayne@68 2178 if o > 0: overlap += o
jpayne@68 2179
jpayne@68 2180 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@68 2181 pos += l
jpayne@68 2182
jpayne@68 2183 return overlap
jpayne@68 2184
jpayne@68 2185 def get_cigar_stats(self):
jpayne@68 2186 """summary of operations in cigar string.
jpayne@68 2187
jpayne@68 2188 The output order in the array is "MIDNSHP=X" followed by a
jpayne@68 2189 field for the NM tag. If the NM tag is not present, this
jpayne@68 2190 field will always be 0. (Accessing this field via index -1
jpayne@68 2191 avoids changes if more CIGAR operators are added in future.)
jpayne@68 2192
jpayne@68 2193 +-----+----------------+--------+
jpayne@68 2194 |M |pysam.CMATCH |0 |
jpayne@68 2195 +-----+----------------+--------+
jpayne@68 2196 |I |pysam.CINS |1 |
jpayne@68 2197 +-----+----------------+--------+
jpayne@68 2198 |D |pysam.CDEL |2 |
jpayne@68 2199 +-----+----------------+--------+
jpayne@68 2200 |N |pysam.CREF_SKIP |3 |
jpayne@68 2201 +-----+----------------+--------+
jpayne@68 2202 |S |pysam.CSOFT_CLIP|4 |
jpayne@68 2203 +-----+----------------+--------+
jpayne@68 2204 |H |pysam.CHARD_CLIP|5 |
jpayne@68 2205 +-----+----------------+--------+
jpayne@68 2206 |P |pysam.CPAD |6 |
jpayne@68 2207 +-----+----------------+--------+
jpayne@68 2208 |= |pysam.CEQUAL |7 |
jpayne@68 2209 +-----+----------------+--------+
jpayne@68 2210 |X |pysam.CDIFF |8 |
jpayne@68 2211 +-----+----------------+--------+
jpayne@68 2212 |B |pysam.CBACK |9 |
jpayne@68 2213 +-----+----------------+--------+
jpayne@68 2214 |NM |NM tag |10 or -1|
jpayne@68 2215 +-----+----------------+--------+
jpayne@68 2216
jpayne@68 2217 If no cigar string is present, empty arrays will be returned.
jpayne@68 2218
jpayne@68 2219 Returns:
jpayne@68 2220 arrays :
jpayne@68 2221 two arrays. The first contains the nucleotide counts within
jpayne@68 2222 each cigar operation, the second contains the number of blocks
jpayne@68 2223 for each cigar operation.
jpayne@68 2224
jpayne@68 2225 """
jpayne@68 2226
jpayne@68 2227 cdef int nfields = NCIGAR_CODES + 1
jpayne@68 2228
jpayne@68 2229 cdef c_array.array base_counts = array.array(
jpayne@68 2230 "I",
jpayne@68 2231 [0] * nfields)
jpayne@68 2232 cdef uint32_t [:] base_view = base_counts
jpayne@68 2233 cdef c_array.array block_counts = array.array(
jpayne@68 2234 "I",
jpayne@68 2235 [0] * nfields)
jpayne@68 2236 cdef uint32_t [:] block_view = block_counts
jpayne@68 2237
jpayne@68 2238 cdef bam1_t * src = self._delegate
jpayne@68 2239 cdef int op
jpayne@68 2240 cdef uint32_t l
jpayne@68 2241 cdef int32_t k
jpayne@68 2242 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@68 2243
jpayne@68 2244 if cigar_p == NULL:
jpayne@68 2245 return None
jpayne@68 2246
jpayne@68 2247 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 2248 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 2249 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 2250 base_view[op] += l
jpayne@68 2251 block_view[op] += 1
jpayne@68 2252
jpayne@68 2253 cdef uint8_t * v = bam_aux_get(src, 'NM')
jpayne@68 2254 if v != NULL:
jpayne@68 2255 base_view[nfields - 1] = <int32_t>bam_aux2i(v)
jpayne@68 2256
jpayne@68 2257 return base_counts, block_counts
jpayne@68 2258
jpayne@68 2259 #####################################################
jpayne@68 2260 ## Unsorted as yet
jpayne@68 2261 # TODO: capture in CIGAR object
jpayne@68 2262 property cigartuples:
jpayne@68 2263 """the :term:`cigar` alignment. The alignment
jpayne@68 2264 is returned as a list of tuples of (operation, length).
jpayne@68 2265
jpayne@68 2266 If the alignment is not present, None is returned.
jpayne@68 2267
jpayne@68 2268 The operations are:
jpayne@68 2269
jpayne@68 2270 +-----+----------------+-----+
jpayne@68 2271 |M |pysam.CMATCH |0 |
jpayne@68 2272 +-----+----------------+-----+
jpayne@68 2273 |I |pysam.CINS |1 |
jpayne@68 2274 +-----+----------------+-----+
jpayne@68 2275 |D |pysam.CDEL |2 |
jpayne@68 2276 +-----+----------------+-----+
jpayne@68 2277 |N |pysam.CREF_SKIP |3 |
jpayne@68 2278 +-----+----------------+-----+
jpayne@68 2279 |S |pysam.CSOFT_CLIP|4 |
jpayne@68 2280 +-----+----------------+-----+
jpayne@68 2281 |H |pysam.CHARD_CLIP|5 |
jpayne@68 2282 +-----+----------------+-----+
jpayne@68 2283 |P |pysam.CPAD |6 |
jpayne@68 2284 +-----+----------------+-----+
jpayne@68 2285 |= |pysam.CEQUAL |7 |
jpayne@68 2286 +-----+----------------+-----+
jpayne@68 2287 |X |pysam.CDIFF |8 |
jpayne@68 2288 +-----+----------------+-----+
jpayne@68 2289 |B |pysam.CBACK |9 |
jpayne@68 2290 +-----+----------------+-----+
jpayne@68 2291
jpayne@68 2292 .. note::
jpayne@68 2293 The output is a list of (operation, length) tuples, such as
jpayne@68 2294 ``[(0, 30)]``.
jpayne@68 2295 This is different from the SAM specification and
jpayne@68 2296 the :attr:`cigarstring` property, which uses a
jpayne@68 2297 (length, operation) order, for example: ``30M``.
jpayne@68 2298
jpayne@68 2299 To unset the cigar property, assign an empty list
jpayne@68 2300 or None.
jpayne@68 2301 """
jpayne@68 2302 def __get__(self):
jpayne@68 2303 cdef uint32_t * cigar_p
jpayne@68 2304 cdef bam1_t * src
jpayne@68 2305 cdef uint32_t op, l
jpayne@68 2306 cdef uint32_t k
jpayne@68 2307
jpayne@68 2308 src = self._delegate
jpayne@68 2309 if pysam_get_n_cigar(src) == 0:
jpayne@68 2310 return None
jpayne@68 2311
jpayne@68 2312 cigar = []
jpayne@68 2313
jpayne@68 2314 cigar_p = pysam_bam_get_cigar(src);
jpayne@68 2315 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@68 2316 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@68 2317 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@68 2318 cigar.append((op, l))
jpayne@68 2319 return cigar
jpayne@68 2320
jpayne@68 2321 def __set__(self, values):
jpayne@68 2322 cdef uint32_t * p
jpayne@68 2323 cdef bam1_t * src
jpayne@68 2324 cdef op, l
jpayne@68 2325 cdef int k
jpayne@68 2326
jpayne@68 2327 k = 0
jpayne@68 2328
jpayne@68 2329 src = self._delegate
jpayne@68 2330
jpayne@68 2331 # get location of cigar string
jpayne@68 2332 p = pysam_bam_get_cigar(src)
jpayne@68 2333
jpayne@68 2334 # empty values for cigar string
jpayne@68 2335 if values is None:
jpayne@68 2336 values = []
jpayne@68 2337
jpayne@68 2338 cdef uint32_t ncigar = len(values)
jpayne@68 2339
jpayne@68 2340 cdef bam1_t * retval = pysam_bam_update(src,
jpayne@68 2341 pysam_get_n_cigar(src) * 4,
jpayne@68 2342 ncigar * 4,
jpayne@68 2343 <uint8_t*>p)
jpayne@68 2344
jpayne@68 2345 if retval == NULL:
jpayne@68 2346 raise MemoryError("could not allocate memory")
jpayne@68 2347
jpayne@68 2348 # length is number of cigar operations, not bytes
jpayne@68 2349 pysam_set_n_cigar(src, ncigar)
jpayne@68 2350
jpayne@68 2351 # re-acquire pointer to location in memory
jpayne@68 2352 # as it might have moved
jpayne@68 2353 p = pysam_bam_get_cigar(src)
jpayne@68 2354
jpayne@68 2355 # insert cigar operations
jpayne@68 2356 for op, l in values:
jpayne@68 2357 p[k] = l << BAM_CIGAR_SHIFT | op
jpayne@68 2358 k += 1
jpayne@68 2359
jpayne@68 2360 ## setting the cigar string requires updating the bin
jpayne@68 2361 update_bin(src)
jpayne@68 2362
jpayne@68 2363 cpdef set_tag(self,
jpayne@68 2364 tag,
jpayne@68 2365 value,
jpayne@68 2366 value_type=None,
jpayne@68 2367 replace=True):
jpayne@68 2368 """sets a particular field *tag* to *value* in the optional alignment
jpayne@68 2369 section.
jpayne@68 2370
jpayne@68 2371 *value_type* describes the type of *value* that is to entered
jpayne@68 2372 into the alignment record. It can be set explicitly to one of
jpayne@68 2373 the valid one-letter type codes. If unset, an appropriate type
jpayne@68 2374 will be chosen automatically based on the python type of
jpayne@68 2375 *value*.
jpayne@68 2376
jpayne@68 2377 An existing value of the same *tag* will be overwritten unless
jpayne@68 2378 *replace* is set to False. This is usually not recommended as a
jpayne@68 2379 tag may only appear once in the optional alignment section.
jpayne@68 2380
jpayne@68 2381 If *value* is `None`, the tag will be deleted.
jpayne@68 2382
jpayne@68 2383 This method accepts valid SAM specification value types, which
jpayne@68 2384 are::
jpayne@68 2385
jpayne@68 2386 A: printable char
jpayne@68 2387 i: signed int
jpayne@68 2388 f: float
jpayne@68 2389 Z: printable string
jpayne@68 2390 H: Byte array in hex format
jpayne@68 2391 B: Integer or numeric array
jpayne@68 2392
jpayne@68 2393 Additionally, it will accept the integer BAM types ('cCsSI')
jpayne@68 2394
jpayne@68 2395 For htslib compatibility, 'a' is synonymous with 'A' and the
jpayne@68 2396 method accepts a 'd' type code for a double precision float.
jpayne@68 2397
jpayne@68 2398 When deducing the type code by the python type of *value*, the
jpayne@68 2399 following mapping is applied::
jpayne@68 2400
jpayne@68 2401 i: python int
jpayne@68 2402 f: python float
jpayne@68 2403 Z: python str or bytes
jpayne@68 2404 B: python array.array, list or tuple
jpayne@68 2405
jpayne@68 2406 Note that a single character string will be output as 'Z' and
jpayne@68 2407 not 'A' as the former is the more general type.
jpayne@68 2408 """
jpayne@68 2409
jpayne@68 2410 cdef int value_size
jpayne@68 2411 cdef uint8_t tc
jpayne@68 2412 cdef uint8_t * value_ptr
jpayne@68 2413 cdef uint8_t *existing_ptr
jpayne@68 2414 cdef float float_value
jpayne@68 2415 cdef double double_value
jpayne@68 2416 cdef int32_t int32_t_value
jpayne@68 2417 cdef uint32_t uint32_t_value
jpayne@68 2418 cdef int16_t int16_t_value
jpayne@68 2419 cdef uint16_t uint16_t_value
jpayne@68 2420 cdef int8_t int8_t_value
jpayne@68 2421 cdef uint8_t uint8_t_value
jpayne@68 2422 cdef bam1_t * src = self._delegate
jpayne@68 2423 cdef char * _value_type
jpayne@68 2424 cdef c_array.array array_value
jpayne@68 2425 cdef object buffer
jpayne@68 2426
jpayne@68 2427 if len(tag) != 2:
jpayne@68 2428 raise ValueError('Invalid tag: %s' % tag)
jpayne@68 2429
jpayne@68 2430 tag = force_bytes(tag)
jpayne@68 2431 if replace:
jpayne@68 2432 existing_ptr = bam_aux_get(src, tag)
jpayne@68 2433 if existing_ptr:
jpayne@68 2434 bam_aux_del(src, existing_ptr)
jpayne@68 2435
jpayne@68 2436 # setting value to None deletes a tag
jpayne@68 2437 if value is None:
jpayne@68 2438 return
jpayne@68 2439
jpayne@68 2440 cdef uint8_t typecode = get_tag_typecode(value, value_type)
jpayne@68 2441 if typecode == 0:
jpayne@68 2442 raise ValueError("can't guess type or invalid type code specified: {} {}".format(
jpayne@68 2443 value, value_type))
jpayne@68 2444
jpayne@68 2445 # sam_format1 for typecasting
jpayne@68 2446 if typecode == b'Z':
jpayne@68 2447 value = force_bytes(value)
jpayne@68 2448 value_ptr = <uint8_t*><char*>value
jpayne@68 2449 value_size = len(value)+1
jpayne@68 2450 elif typecode == b'H':
jpayne@68 2451 # Note that hex tags are stored the very same
jpayne@68 2452 # way as Z string.s
jpayne@68 2453 value = force_bytes(value)
jpayne@68 2454 value_ptr = <uint8_t*><char*>value
jpayne@68 2455 value_size = len(value)+1
jpayne@68 2456 elif typecode == b'A' or typecode == b'a':
jpayne@68 2457 value = force_bytes(value)
jpayne@68 2458 value_ptr = <uint8_t*><char*>value
jpayne@68 2459 value_size = sizeof(char)
jpayne@68 2460 typecode = b'A'
jpayne@68 2461 elif typecode == b'i':
jpayne@68 2462 int32_t_value = value
jpayne@68 2463 value_ptr = <uint8_t*>&int32_t_value
jpayne@68 2464 value_size = sizeof(int32_t)
jpayne@68 2465 elif typecode == b'I':
jpayne@68 2466 uint32_t_value = value
jpayne@68 2467 value_ptr = <uint8_t*>&uint32_t_value
jpayne@68 2468 value_size = sizeof(uint32_t)
jpayne@68 2469 elif typecode == b's':
jpayne@68 2470 int16_t_value = value
jpayne@68 2471 value_ptr = <uint8_t*>&int16_t_value
jpayne@68 2472 value_size = sizeof(int16_t)
jpayne@68 2473 elif typecode == b'S':
jpayne@68 2474 uint16_t_value = value
jpayne@68 2475 value_ptr = <uint8_t*>&uint16_t_value
jpayne@68 2476 value_size = sizeof(uint16_t)
jpayne@68 2477 elif typecode == b'c':
jpayne@68 2478 int8_t_value = value
jpayne@68 2479 value_ptr = <uint8_t*>&int8_t_value
jpayne@68 2480 value_size = sizeof(int8_t)
jpayne@68 2481 elif typecode == b'C':
jpayne@68 2482 uint8_t_value = value
jpayne@68 2483 value_ptr = <uint8_t*>&uint8_t_value
jpayne@68 2484 value_size = sizeof(uint8_t)
jpayne@68 2485 elif typecode == b'd':
jpayne@68 2486 double_value = value
jpayne@68 2487 value_ptr = <uint8_t*>&double_value
jpayne@68 2488 value_size = sizeof(double)
jpayne@68 2489 elif typecode == b'f':
jpayne@68 2490 float_value = value
jpayne@68 2491 value_ptr = <uint8_t*>&float_value
jpayne@68 2492 value_size = sizeof(float)
jpayne@68 2493 elif typecode == b'B':
jpayne@68 2494 # the following goes through python, needs to be cleaned up
jpayne@68 2495 # pack array using struct
jpayne@68 2496 fmt, args = pack_tags([(tag, value, value_type)])
jpayne@68 2497
jpayne@68 2498 # remove tag and type code as set by bam_aux_append
jpayne@68 2499 # first four chars of format (<2sB)
jpayne@68 2500 fmt = '<' + fmt[4:]
jpayne@68 2501 # first two values to pack
jpayne@68 2502 args = args[2:]
jpayne@68 2503 value_size = struct.calcsize(fmt)
jpayne@68 2504 # buffer will be freed when object goes out of scope
jpayne@68 2505 buffer = ctypes.create_string_buffer(value_size)
jpayne@68 2506 struct.pack_into(fmt, buffer, 0, *args)
jpayne@68 2507 # bam_aux_append copies data from value_ptr
jpayne@68 2508 bam_aux_append(src,
jpayne@68 2509 tag,
jpayne@68 2510 typecode,
jpayne@68 2511 value_size,
jpayne@68 2512 <uint8_t*>buffer.raw)
jpayne@68 2513 return
jpayne@68 2514 else:
jpayne@68 2515 raise ValueError('unsupported value_type {} in set_option'.format(typecode))
jpayne@68 2516
jpayne@68 2517 bam_aux_append(src,
jpayne@68 2518 tag,
jpayne@68 2519 typecode,
jpayne@68 2520 value_size,
jpayne@68 2521 value_ptr)
jpayne@68 2522
jpayne@68 2523 cpdef has_tag(self, tag):
jpayne@68 2524 """returns true if the optional alignment section
jpayne@68 2525 contains a given *tag*."""
jpayne@68 2526 cdef uint8_t * v
jpayne@68 2527 cdef int nvalues
jpayne@68 2528 btag = force_bytes(tag)
jpayne@68 2529 v = bam_aux_get(self._delegate, btag)
jpayne@68 2530 return v != NULL
jpayne@68 2531
jpayne@68 2532 cpdef get_tag(self, tag, with_value_type=False):
jpayne@68 2533 """
jpayne@68 2534 retrieves data from the optional alignment section
jpayne@68 2535 given a two-letter *tag* denoting the field.
jpayne@68 2536
jpayne@68 2537 The returned value is cast into an appropriate python type.
jpayne@68 2538
jpayne@68 2539 This method is the fastest way to access the optional
jpayne@68 2540 alignment section if only few tags need to be retrieved.
jpayne@68 2541
jpayne@68 2542 Possible value types are "AcCsSiIfZHB" (see BAM format
jpayne@68 2543 specification) as well as additional value type 'd' as
jpayne@68 2544 implemented in htslib.
jpayne@68 2545
jpayne@68 2546 Parameters:
jpayne@68 2547
jpayne@68 2548 tag :
jpayne@68 2549 data tag.
jpayne@68 2550
jpayne@68 2551 with_value_type : Optional[bool]
jpayne@68 2552 if set to True, the return value is a tuple of (tag value, type
jpayne@68 2553 code). (default False)
jpayne@68 2554
jpayne@68 2555 Returns:
jpayne@68 2556
jpayne@68 2557 A python object with the value of the `tag`. The type of the
jpayne@68 2558 object depends on the data type in the data record.
jpayne@68 2559
jpayne@68 2560 Raises:
jpayne@68 2561
jpayne@68 2562 KeyError
jpayne@68 2563 If `tag` is not present, a KeyError is raised.
jpayne@68 2564
jpayne@68 2565 """
jpayne@68 2566 cdef uint8_t * v
jpayne@68 2567 cdef int nvalues
jpayne@68 2568 btag = force_bytes(tag)
jpayne@68 2569 v = bam_aux_get(self._delegate, btag)
jpayne@68 2570 if v == NULL:
jpayne@68 2571 raise KeyError("tag '%s' not present" % tag)
jpayne@68 2572 if chr(v[0]) == "B":
jpayne@68 2573 auxtype = chr(v[0]) + chr(v[1])
jpayne@68 2574 else:
jpayne@68 2575 auxtype = chr(v[0])
jpayne@68 2576
jpayne@68 2577 if auxtype in "iIcCsS":
jpayne@68 2578 value = bam_aux2i(v)
jpayne@68 2579 elif auxtype == 'f' or auxtype == 'F':
jpayne@68 2580 value = bam_aux2f(v)
jpayne@68 2581 elif auxtype == 'd' or auxtype == 'D':
jpayne@68 2582 value = bam_aux2f(v)
jpayne@68 2583 elif auxtype == 'A' or auxtype == 'a':
jpayne@68 2584 # force A to a
jpayne@68 2585 v[0] = b'A'
jpayne@68 2586 # there might a more efficient way
jpayne@68 2587 # to convert a char into a string
jpayne@68 2588 value = '%c' % <char>bam_aux2A(v)
jpayne@68 2589 elif auxtype == 'Z' or auxtype == 'H':
jpayne@68 2590 # Z and H are treated equally as strings in htslib
jpayne@68 2591 value = charptr_to_str(<char*>bam_aux2Z(v))
jpayne@68 2592 elif auxtype[0] == 'B':
jpayne@68 2593 bytesize, nvalues, values = convert_binary_tag(v + 1)
jpayne@68 2594 value = values
jpayne@68 2595 else:
jpayne@68 2596 raise ValueError("unknown auxiliary type '%s'" % auxtype)
jpayne@68 2597
jpayne@68 2598 if with_value_type:
jpayne@68 2599 return (value, auxtype)
jpayne@68 2600 else:
jpayne@68 2601 return value
jpayne@68 2602
jpayne@68 2603 def get_tags(self, with_value_type=False):
jpayne@68 2604 """the fields in the optional alignment section.
jpayne@68 2605
jpayne@68 2606 Returns a list of all fields in the optional
jpayne@68 2607 alignment section. Values are converted to appropriate python
jpayne@68 2608 values. For example: ``[(NM, 2), (RG, "GJP00TM04")]``
jpayne@68 2609
jpayne@68 2610 If *with_value_type* is set, the value type as encode in
jpayne@68 2611 the AlignedSegment record will be returned as well:
jpayne@68 2612
jpayne@68 2613 [(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
jpayne@68 2614
jpayne@68 2615 This method will convert all values in the optional alignment
jpayne@68 2616 section. When getting only one or few tags, please see
jpayne@68 2617 :meth:`get_tag` for a quicker way to achieve this.
jpayne@68 2618
jpayne@68 2619 """
jpayne@68 2620
jpayne@68 2621 cdef char * ctag
jpayne@68 2622 cdef bam1_t * src
jpayne@68 2623 cdef uint8_t * s
jpayne@68 2624 cdef char auxtag[3]
jpayne@68 2625 cdef char auxtype
jpayne@68 2626 cdef uint8_t byte_size
jpayne@68 2627 cdef int32_t nvalues
jpayne@68 2628
jpayne@68 2629 src = self._delegate
jpayne@68 2630 if src.l_data == 0:
jpayne@68 2631 return []
jpayne@68 2632 s = pysam_bam_get_aux(src)
jpayne@68 2633 result = []
jpayne@68 2634 auxtag[2] = 0
jpayne@68 2635 while s < (src.data + src.l_data):
jpayne@68 2636 # get tag
jpayne@68 2637 auxtag[0] = s[0]
jpayne@68 2638 auxtag[1] = s[1]
jpayne@68 2639 s += 2
jpayne@68 2640 auxtype = s[0]
jpayne@68 2641 if auxtype in (b'c', b'C'):
jpayne@68 2642 value = <int>bam_aux2i(s)
jpayne@68 2643 s += 1
jpayne@68 2644 elif auxtype in (b's', b'S'):
jpayne@68 2645 value = <int>bam_aux2i(s)
jpayne@68 2646 s += 2
jpayne@68 2647 elif auxtype in (b'i', b'I'):
jpayne@68 2648 value = <int32_t>bam_aux2i(s)
jpayne@68 2649 s += 4
jpayne@68 2650 elif auxtype == b'f':
jpayne@68 2651 value = <float>bam_aux2f(s)
jpayne@68 2652 s += 4
jpayne@68 2653 elif auxtype == b'd':
jpayne@68 2654 value = <double>bam_aux2f(s)
jpayne@68 2655 s += 8
jpayne@68 2656 elif auxtype in (b'A', b'a'):
jpayne@68 2657 value = "%c" % <char>bam_aux2A(s)
jpayne@68 2658 s += 1
jpayne@68 2659 elif auxtype in (b'Z', b'H'):
jpayne@68 2660 value = charptr_to_str(<char*>bam_aux2Z(s))
jpayne@68 2661 # +1 for NULL terminated string
jpayne@68 2662 s += len(value) + 1
jpayne@68 2663 elif auxtype == b'B':
jpayne@68 2664 s += 1
jpayne@68 2665 byte_size, nvalues, value = convert_binary_tag(s)
jpayne@68 2666 # 5 for 1 char and 1 int
jpayne@68 2667 s += 5 + (nvalues * byte_size) - 1
jpayne@68 2668 else:
jpayne@68 2669 raise KeyError("unknown type '%s'" % auxtype)
jpayne@68 2670
jpayne@68 2671 s += 1
jpayne@68 2672
jpayne@68 2673 if with_value_type:
jpayne@68 2674 result.append((charptr_to_str(auxtag), value, chr(auxtype)))
jpayne@68 2675 else:
jpayne@68 2676 result.append((charptr_to_str(auxtag), value))
jpayne@68 2677
jpayne@68 2678 return result
jpayne@68 2679
jpayne@68 2680 def set_tags(self, tags):
jpayne@68 2681 """sets the fields in the optional alignment section with
jpayne@68 2682 a list of (tag, value) tuples.
jpayne@68 2683
jpayne@68 2684 The value type of the values is determined from the
jpayne@68 2685 python type. Optionally, a type may be given explicitly as
jpayne@68 2686 a third value in the tuple, For example:
jpayne@68 2687
jpayne@68 2688 x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
jpayne@68 2689
jpayne@68 2690 This method will not enforce the rule that the same tag may appear
jpayne@68 2691 only once in the optional alignment section.
jpayne@68 2692 """
jpayne@68 2693
jpayne@68 2694 cdef bam1_t * src
jpayne@68 2695 cdef uint8_t * s
jpayne@68 2696 cdef char * temp
jpayne@68 2697 cdef int new_size = 0
jpayne@68 2698 cdef int old_size
jpayne@68 2699 src = self._delegate
jpayne@68 2700
jpayne@68 2701 # convert and pack the data
jpayne@68 2702 if tags is not None and len(tags) > 0:
jpayne@68 2703 fmt, args = pack_tags(tags)
jpayne@68 2704 new_size = struct.calcsize(fmt)
jpayne@68 2705 buffer = ctypes.create_string_buffer(new_size)
jpayne@68 2706 struct.pack_into(fmt,
jpayne@68 2707 buffer,
jpayne@68 2708 0,
jpayne@68 2709 *args)
jpayne@68 2710
jpayne@68 2711
jpayne@68 2712 # delete the old data and allocate new space.
jpayne@68 2713 # If total_size == 0, the aux field will be
jpayne@68 2714 # empty
jpayne@68 2715 old_size = pysam_bam_get_l_aux(src)
jpayne@68 2716 cdef bam1_t * retval = pysam_bam_update(src,
jpayne@68 2717 old_size,
jpayne@68 2718 new_size,
jpayne@68 2719 pysam_bam_get_aux(src))
jpayne@68 2720 if retval == NULL:
jpayne@68 2721 raise MemoryError("could not allocate memory")
jpayne@68 2722
jpayne@68 2723 # copy data only if there is any
jpayne@68 2724 if new_size > 0:
jpayne@68 2725
jpayne@68 2726 # get location of new data
jpayne@68 2727 s = pysam_bam_get_aux(src)
jpayne@68 2728
jpayne@68 2729 # check if there is direct path from buffer.raw to tmp
jpayne@68 2730 p = buffer.raw
jpayne@68 2731 # create handle to make sure buffer stays alive long
jpayne@68 2732 # enough for memcpy, see issue 129
jpayne@68 2733 temp = p
jpayne@68 2734 memcpy(s, temp, new_size)
jpayne@68 2735
jpayne@68 2736
jpayne@68 2737 ########################################################
jpayne@68 2738 # Compatibility Accessors
jpayne@68 2739 # Functions, properties for compatibility with pysam < 0.8
jpayne@68 2740 #
jpayne@68 2741 # Several options
jpayne@68 2742 # change the factory functions according to API
jpayne@68 2743 # * requires code changes throughout, incl passing
jpayne@68 2744 # handles to factory functions
jpayne@68 2745 # subclass functions and add attributes at runtime
jpayne@68 2746 # e.g.: AlignedSegments.qname = AlignedSegments.query_name
jpayne@68 2747 # * will slow down the default interface
jpayne@68 2748 # explicit declaration of getters/setters
jpayne@68 2749 ########################################################
jpayne@68 2750 property qname:
jpayne@68 2751 """deprecated, use :attr:`query_name` instead."""
jpayne@68 2752 def __get__(self): return self.query_name
jpayne@68 2753 def __set__(self, v): self.query_name = v
jpayne@68 2754 property tid:
jpayne@68 2755 """deprecated, use :attr:`reference_id` instead."""
jpayne@68 2756 def __get__(self): return self.reference_id
jpayne@68 2757 def __set__(self, v): self.reference_id = v
jpayne@68 2758 property pos:
jpayne@68 2759 """deprecated, use :attr:`reference_start` instead."""
jpayne@68 2760 def __get__(self): return self.reference_start
jpayne@68 2761 def __set__(self, v): self.reference_start = v
jpayne@68 2762 property mapq:
jpayne@68 2763 """deprecated, use :attr:`mapping_quality` instead."""
jpayne@68 2764 def __get__(self): return self.mapping_quality
jpayne@68 2765 def __set__(self, v): self.mapping_quality = v
jpayne@68 2766 property rnext:
jpayne@68 2767 """deprecated, use :attr:`next_reference_id` instead."""
jpayne@68 2768 def __get__(self): return self.next_reference_id
jpayne@68 2769 def __set__(self, v): self.next_reference_id = v
jpayne@68 2770 property pnext:
jpayne@68 2771 """deprecated, use :attr:`next_reference_start` instead."""
jpayne@68 2772 def __get__(self):
jpayne@68 2773 return self.next_reference_start
jpayne@68 2774 def __set__(self, v):
jpayne@68 2775 self.next_reference_start = v
jpayne@68 2776 property cigar:
jpayne@68 2777 """deprecated, use :attr:`cigarstring` or :attr:`cigartuples` instead."""
jpayne@68 2778 def __get__(self):
jpayne@68 2779 r = self.cigartuples
jpayne@68 2780 if r is None:
jpayne@68 2781 r = []
jpayne@68 2782 return r
jpayne@68 2783 def __set__(self, v): self.cigartuples = v
jpayne@68 2784 property tlen:
jpayne@68 2785 """deprecated, use :attr:`template_length` instead."""
jpayne@68 2786 def __get__(self):
jpayne@68 2787 return self.template_length
jpayne@68 2788 def __set__(self, v):
jpayne@68 2789 self.template_length = v
jpayne@68 2790 property seq:
jpayne@68 2791 """deprecated, use :attr:`query_sequence` instead."""
jpayne@68 2792 def __get__(self):
jpayne@68 2793 return self.query_sequence
jpayne@68 2794 def __set__(self, v):
jpayne@68 2795 self.query_sequence = v
jpayne@68 2796 property qual:
jpayne@68 2797 """deprecated, use :attr:`query_qualities` instead."""
jpayne@68 2798 def __get__(self):
jpayne@68 2799 return array_to_qualitystring(self.query_qualities)
jpayne@68 2800 def __set__(self, v):
jpayne@68 2801 self.query_qualities = qualitystring_to_array(v)
jpayne@68 2802 property alen:
jpayne@68 2803 """deprecated, use :attr:`reference_length` instead."""
jpayne@68 2804 def __get__(self):
jpayne@68 2805 return self.reference_length
jpayne@68 2806 def __set__(self, v):
jpayne@68 2807 self.reference_length = v
jpayne@68 2808 property aend:
jpayne@68 2809 """deprecated, use :attr:`reference_end` instead."""
jpayne@68 2810 def __get__(self):
jpayne@68 2811 return self.reference_end
jpayne@68 2812 def __set__(self, v):
jpayne@68 2813 self.reference_end = v
jpayne@68 2814 property rlen:
jpayne@68 2815 """deprecated, use :attr:`query_length` instead."""
jpayne@68 2816 def __get__(self):
jpayne@68 2817 return self.query_length
jpayne@68 2818 def __set__(self, v):
jpayne@68 2819 self.query_length = v
jpayne@68 2820 property query:
jpayne@68 2821 """deprecated, use :attr:`query_alignment_sequence`
jpayne@68 2822 instead."""
jpayne@68 2823 def __get__(self):
jpayne@68 2824 return self.query_alignment_sequence
jpayne@68 2825 def __set__(self, v):
jpayne@68 2826 self.query_alignment_sequence = v
jpayne@68 2827 property qqual:
jpayne@68 2828 """deprecated, use :attr:`query_alignment_qualities`
jpayne@68 2829 instead."""
jpayne@68 2830 def __get__(self):
jpayne@68 2831 return array_to_qualitystring(self.query_alignment_qualities)
jpayne@68 2832 def __set__(self, v):
jpayne@68 2833 self.query_alignment_qualities = qualitystring_to_array(v)
jpayne@68 2834 property qstart:
jpayne@68 2835 """deprecated, use :attr:`query_alignment_start` instead."""
jpayne@68 2836 def __get__(self):
jpayne@68 2837 return self.query_alignment_start
jpayne@68 2838 def __set__(self, v):
jpayne@68 2839 self.query_alignment_start = v
jpayne@68 2840 property qend:
jpayne@68 2841 """deprecated, use :attr:`query_alignment_end` instead."""
jpayne@68 2842 def __get__(self):
jpayne@68 2843 return self.query_alignment_end
jpayne@68 2844 def __set__(self, v):
jpayne@68 2845 self.query_alignment_end = v
jpayne@68 2846 property qlen:
jpayne@68 2847 """deprecated, use :attr:`query_alignment_length`
jpayne@68 2848 instead."""
jpayne@68 2849 def __get__(self):
jpayne@68 2850 return self.query_alignment_length
jpayne@68 2851 def __set__(self, v):
jpayne@68 2852 self.query_alignment_length = v
jpayne@68 2853 property mrnm:
jpayne@68 2854 """deprecated, use :attr:`next_reference_id` instead."""
jpayne@68 2855 def __get__(self):
jpayne@68 2856 return self.next_reference_id
jpayne@68 2857 def __set__(self, v):
jpayne@68 2858 self.next_reference_id = v
jpayne@68 2859 property mpos:
jpayne@68 2860 """deprecated, use :attr:`next_reference_start`
jpayne@68 2861 instead."""
jpayne@68 2862 def __get__(self):
jpayne@68 2863 return self.next_reference_start
jpayne@68 2864 def __set__(self, v):
jpayne@68 2865 self.next_reference_start = v
jpayne@68 2866 property rname:
jpayne@68 2867 """deprecated, use :attr:`reference_id` instead."""
jpayne@68 2868 def __get__(self):
jpayne@68 2869 return self.reference_id
jpayne@68 2870 def __set__(self, v):
jpayne@68 2871 self.reference_id = v
jpayne@68 2872 property isize:
jpayne@68 2873 """deprecated, use :attr:`template_length` instead."""
jpayne@68 2874 def __get__(self):
jpayne@68 2875 return self.template_length
jpayne@68 2876 def __set__(self, v):
jpayne@68 2877 self.template_length = v
jpayne@68 2878 property blocks:
jpayne@68 2879 """deprecated, use :meth:`get_blocks()` instead."""
jpayne@68 2880 def __get__(self):
jpayne@68 2881 return self.get_blocks()
jpayne@68 2882 property aligned_pairs:
jpayne@68 2883 """deprecated, use :meth:`get_aligned_pairs()` instead."""
jpayne@68 2884 def __get__(self):
jpayne@68 2885 return self.get_aligned_pairs()
jpayne@68 2886 property inferred_length:
jpayne@68 2887 """deprecated, use :meth:`infer_query_length()` instead."""
jpayne@68 2888 def __get__(self):
jpayne@68 2889 return self.infer_query_length()
jpayne@68 2890 property positions:
jpayne@68 2891 """deprecated, use :meth:`get_reference_positions()` instead."""
jpayne@68 2892 def __get__(self):
jpayne@68 2893 return self.get_reference_positions()
jpayne@68 2894 property tags:
jpayne@68 2895 """deprecated, use :meth:`get_tags()` instead."""
jpayne@68 2896 def __get__(self):
jpayne@68 2897 return self.get_tags()
jpayne@68 2898 def __set__(self, tags):
jpayne@68 2899 self.set_tags(tags)
jpayne@68 2900 def overlap(self):
jpayne@68 2901 """deprecated, use :meth:`get_overlap()` instead."""
jpayne@68 2902 return self.get_overlap()
jpayne@68 2903 def opt(self, tag):
jpayne@68 2904 """deprecated, use :meth:`get_tag()` instead."""
jpayne@68 2905 return self.get_tag(tag)
jpayne@68 2906 def setTag(self, tag, value, value_type=None, replace=True):
jpayne@68 2907 """deprecated, use :meth:`set_tag()` instead."""
jpayne@68 2908 return self.set_tag(tag, value, value_type, replace)
jpayne@68 2909
jpayne@68 2910
jpayne@68 2911 cdef class PileupColumn:
jpayne@68 2912 '''A pileup of reads at a particular reference sequence position
jpayne@68 2913 (:term:`column`). A pileup column contains all the reads that map
jpayne@68 2914 to a certain target base.
jpayne@68 2915
jpayne@68 2916 This class is a proxy for results returned by the samtools pileup
jpayne@68 2917 engine. If the underlying engine iterator advances, the results
jpayne@68 2918 of this column will change.
jpayne@68 2919 '''
jpayne@68 2920 def __init__(self):
jpayne@68 2921 raise TypeError("this class cannot be instantiated from Python")
jpayne@68 2922
jpayne@68 2923 def __str__(self):
jpayne@68 2924 return "\t".join(map(str,
jpayne@68 2925 (self.reference_id,
jpayne@68 2926 self.reference_pos,
jpayne@68 2927 self.nsegments))) +\
jpayne@68 2928 "\n" +\
jpayne@68 2929 "\n".join(map(str, self.pileups))
jpayne@68 2930
jpayne@68 2931 def __dealloc__(self):
jpayne@68 2932 free(self.buf.s)
jpayne@68 2933
jpayne@68 2934 def set_min_base_quality(self, min_base_quality):
jpayne@68 2935 """set the minimum base quality for this pileup column.
jpayne@68 2936 """
jpayne@68 2937 self.min_base_quality = min_base_quality
jpayne@68 2938
jpayne@68 2939 def __len__(self):
jpayne@68 2940 """return number of reads aligned to this column.
jpayne@68 2941
jpayne@68 2942 see :meth:`get_num_aligned`
jpayne@68 2943 """
jpayne@68 2944 return self.get_num_aligned()
jpayne@68 2945
jpayne@68 2946 property reference_id:
jpayne@68 2947 '''the reference sequence number as defined in the header'''
jpayne@68 2948 def __get__(self):
jpayne@68 2949 return self.tid
jpayne@68 2950
jpayne@68 2951 property reference_name:
jpayne@68 2952 """:term:`reference` name (None if no AlignmentFile is associated)"""
jpayne@68 2953 def __get__(self):
jpayne@68 2954 if self.header is not None:
jpayne@68 2955 return self.header.get_reference_name(self.tid)
jpayne@68 2956 return None
jpayne@68 2957
jpayne@68 2958 property nsegments:
jpayne@68 2959 '''number of reads mapping to this column.
jpayne@68 2960
jpayne@68 2961 Note that this number ignores the base quality filter.'''
jpayne@68 2962 def __get__(self):
jpayne@68 2963 return self.n_pu
jpayne@68 2964 def __set__(self, n):
jpayne@68 2965 self.n_pu = n
jpayne@68 2966
jpayne@68 2967 property reference_pos:
jpayne@68 2968 '''the position in the reference sequence (0-based).'''
jpayne@68 2969 def __get__(self):
jpayne@68 2970 return self.pos
jpayne@68 2971
jpayne@68 2972 property pileups:
jpayne@68 2973 '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
jpayne@68 2974 def __get__(self):
jpayne@68 2975 if self.plp == NULL or self.plp[0] == NULL:
jpayne@68 2976 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@68 2977
jpayne@68 2978 cdef int x
jpayne@68 2979 cdef const bam_pileup1_t * p = NULL
jpayne@68 2980 pileups = []
jpayne@68 2981
jpayne@68 2982 # warning: there could be problems if self.n and self.buf are
jpayne@68 2983 # out of sync.
jpayne@68 2984 for x from 0 <= x < self.n_pu:
jpayne@68 2985 p = &(self.plp[0][x])
jpayne@68 2986 if p == NULL:
jpayne@68 2987 raise ValueError(
jpayne@68 2988 "pileup buffer out of sync - most likely use of iterator "
jpayne@68 2989 "outside loop")
jpayne@68 2990 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@68 2991 continue
jpayne@68 2992 pileups.append(makePileupRead(p, self.header))
jpayne@68 2993 return pileups
jpayne@68 2994
jpayne@68 2995 ########################################################
jpayne@68 2996 # Compatibility Accessors
jpayne@68 2997 # Functions, properties for compatibility with pysam < 0.8
jpayne@68 2998 ########################################################
jpayne@68 2999 property pos:
jpayne@68 3000 """deprecated, use :attr:`reference_pos` instead."""
jpayne@68 3001 def __get__(self):
jpayne@68 3002 return self.reference_pos
jpayne@68 3003 def __set__(self, v):
jpayne@68 3004 self.reference_pos = v
jpayne@68 3005
jpayne@68 3006 property tid:
jpayne@68 3007 """deprecated, use :attr:`reference_id` instead."""
jpayne@68 3008 def __get__(self):
jpayne@68 3009 return self.reference_id
jpayne@68 3010 def __set__(self, v):
jpayne@68 3011 self.reference_id = v
jpayne@68 3012
jpayne@68 3013 property n:
jpayne@68 3014 """deprecated, use :attr:`nsegments` instead."""
jpayne@68 3015 def __get__(self):
jpayne@68 3016 return self.nsegments
jpayne@68 3017 def __set__(self, v):
jpayne@68 3018 self.nsegments = v
jpayne@68 3019
jpayne@68 3020 def get_num_aligned(self):
jpayne@68 3021 """return number of aligned bases at pileup column position.
jpayne@68 3022
jpayne@68 3023 This method applies a base quality filter and the number is
jpayne@68 3024 equal to the size of :meth:`get_query_sequences`,
jpayne@68 3025 :meth:`get_mapping_qualities`, etc.
jpayne@68 3026
jpayne@68 3027 """
jpayne@68 3028 cdef uint32_t x = 0
jpayne@68 3029 cdef uint32_t c = 0
jpayne@68 3030 cdef uint32_t cnt = 0
jpayne@68 3031 cdef const bam_pileup1_t * p = NULL
jpayne@68 3032 if self.plp == NULL or self.plp[0] == NULL:
jpayne@68 3033 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@68 3034
jpayne@68 3035 for x from 0 <= x < self.n_pu:
jpayne@68 3036 p = &(self.plp[0][x])
jpayne@68 3037 if p == NULL:
jpayne@68 3038 raise ValueError(
jpayne@68 3039 "pileup buffer out of sync - most likely use of iterator "
jpayne@68 3040 "outside loop")
jpayne@68 3041 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@68 3042 continue
jpayne@68 3043 cnt += 1
jpayne@68 3044 return cnt
jpayne@68 3045
jpayne@68 3046 def get_query_sequences(self, bint mark_matches=False, bint mark_ends=False, bint add_indels=False):
jpayne@68 3047 """query bases/sequences at pileup column position.
jpayne@68 3048
jpayne@68 3049 Optionally, the bases/sequences can be annotated according to the samtools
jpayne@68 3050 mpileup format. This is the format description from the samtools mpileup tool::
jpayne@68 3051
jpayne@68 3052 Information on match, mismatch, indel, strand, mapping
jpayne@68 3053 quality and start and end of a read are all encoded at the
jpayne@68 3054 read base column. At this column, a dot stands for a match
jpayne@68 3055 to the reference base on the forward strand, a comma for a
jpayne@68 3056 match on the reverse strand, a '>' or '<' for a reference
jpayne@68 3057 skip, `ACGTN' for a mismatch on the forward strand and
jpayne@68 3058 `acgtn' for a mismatch on the reverse strand. A pattern
jpayne@68 3059 `\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion
jpayne@68 3060 between this reference position and the next reference
jpayne@68 3061 position. The length of the insertion is given by the
jpayne@68 3062 integer in the pattern, followed by the inserted
jpayne@68 3063 sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
jpayne@68 3064 represents a deletion from the reference. The deleted bases
jpayne@68 3065 will be presented as `*' in the following lines. Also at
jpayne@68 3066 the read base column, a symbol `^' marks the start of a
jpayne@68 3067 read. The ASCII of the character following `^' minus 33
jpayne@68 3068 gives the mapping quality. A symbol `$' marks the end of a
jpayne@68 3069 read segment
jpayne@68 3070
jpayne@68 3071 To reproduce samtools mpileup format, set all of mark_matches,
jpayne@68 3072 mark_ends and add_indels to True.
jpayne@68 3073
jpayne@68 3074 Parameters
jpayne@68 3075 ----------
jpayne@68 3076
jpayne@68 3077 mark_matches: bool
jpayne@68 3078
jpayne@68 3079 If True, output bases matching the reference as "." or ","
jpayne@68 3080 for forward and reverse strand, respectively. This mark
jpayne@68 3081 requires the reference sequence. If no reference is
jpayne@68 3082 present, this option is ignored.
jpayne@68 3083
jpayne@68 3084 mark_ends : bool
jpayne@68 3085
jpayne@68 3086 If True, add markers "^" and "$" for read start and end, respectively.
jpayne@68 3087
jpayne@68 3088 add_indels : bool
jpayne@68 3089
jpayne@68 3090 If True, add bases for bases inserted into or skipped from the
jpayne@68 3091 reference. The latter requires a reference sequence file to have
jpayne@68 3092 been given, e.g. via `pileup(fastafile = ...)`. If no reference
jpayne@68 3093 sequence is available, skipped bases are represented as 'N's.
jpayne@68 3094
jpayne@68 3095 Returns
jpayne@68 3096 -------
jpayne@68 3097
jpayne@68 3098 a list of bases/sequences per read at pileup column position. : list
jpayne@68 3099
jpayne@68 3100 """
jpayne@68 3101 cdef uint32_t x = 0
jpayne@68 3102 cdef uint32_t j = 0
jpayne@68 3103 cdef uint32_t c = 0
jpayne@68 3104 cdef uint8_t cc = 0
jpayne@68 3105 cdef uint8_t rb = 0
jpayne@68 3106 cdef kstring_t * buf = &self.buf
jpayne@68 3107 cdef const bam_pileup1_t * p = NULL
jpayne@68 3108
jpayne@68 3109 if self.plp == NULL or self.plp[0] == NULL:
jpayne@68 3110 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@68 3111
jpayne@68 3112 buf.l = 0
jpayne@68 3113
jpayne@68 3114 # todo: reference sequence to count matches/mismatches
jpayne@68 3115 # todo: convert assertions to exceptions
jpayne@68 3116 for x from 0 <= x < self.n_pu:
jpayne@68 3117 p = &(self.plp[0][x])
jpayne@68 3118 if p == NULL:
jpayne@68 3119 raise ValueError(
jpayne@68 3120 "pileup buffer out of sync - most likely use of iterator "
jpayne@68 3121 "outside loop")
jpayne@68 3122 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@68 3123 continue
jpayne@68 3124 # see samtools pileup_seq
jpayne@68 3125 if mark_ends and p.is_head:
jpayne@68 3126 kputc(b'^', buf)
jpayne@68 3127
jpayne@68 3128 if p.b.core.qual > 93:
jpayne@68 3129 kputc(126, buf)
jpayne@68 3130 else:
jpayne@68 3131 kputc(p.b.core.qual + 33, buf)
jpayne@68 3132 if not p.is_del:
jpayne@68 3133 if p.qpos < p.b.core.l_qseq:
jpayne@68 3134 cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
jpayne@68 3135 else:
jpayne@68 3136 cc = b'N'
jpayne@68 3137
jpayne@68 3138 if mark_matches and self.reference_sequence != NULL:
jpayne@68 3139 rb = self.reference_sequence[self.reference_pos]
jpayne@68 3140 if seq_nt16_table[cc] == seq_nt16_table[rb]:
jpayne@68 3141 cc = b'='
jpayne@68 3142 kputc(strand_mark_char(cc, p.b), buf)
jpayne@68 3143 elif add_indels:
jpayne@68 3144 if p.is_refskip:
jpayne@68 3145 if bam_is_rev(p.b):
jpayne@68 3146 kputc(b'<', buf)
jpayne@68 3147 else:
jpayne@68 3148 kputc(b'>', buf)
jpayne@68 3149 else:
jpayne@68 3150 kputc(b'*', buf)
jpayne@68 3151 if add_indels:
jpayne@68 3152 if p.indel > 0:
jpayne@68 3153 kputc(b'+', buf)
jpayne@68 3154 kputw(p.indel, buf)
jpayne@68 3155 for j from 1 <= j <= p.indel:
jpayne@68 3156 cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
jpayne@68 3157 kputc(strand_mark_char(cc, p.b), buf)
jpayne@68 3158 elif p.indel < 0:
jpayne@68 3159 kputc(b'-', buf)
jpayne@68 3160 kputw(-p.indel, buf)
jpayne@68 3161 for j from 1 <= j <= -p.indel:
jpayne@68 3162 # TODO: out-of-range check here?
jpayne@68 3163 if self.reference_sequence == NULL:
jpayne@68 3164 cc = b'N'
jpayne@68 3165 else:
jpayne@68 3166 cc = self.reference_sequence[self.reference_pos + j]
jpayne@68 3167 kputc(strand_mark_char(cc, p.b), buf)
jpayne@68 3168 if mark_ends and p.is_tail:
jpayne@68 3169 kputc(b'$', buf)
jpayne@68 3170
jpayne@68 3171 kputc(b':', buf)
jpayne@68 3172
jpayne@68 3173 if buf.l == 0:
jpayne@68 3174 # could be zero if all qualities are too low
jpayne@68 3175 return ""
jpayne@68 3176 else:
jpayne@68 3177 # quicker to ensemble all and split than to encode all separately.
jpayne@68 3178 # ignore last ":"
jpayne@68 3179 return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":")
jpayne@68 3180
jpayne@68 3181 def get_query_qualities(self):
jpayne@68 3182 """query base quality scores at pileup column position.
jpayne@68 3183
jpayne@68 3184 Returns
jpayne@68 3185 -------
jpayne@68 3186
jpayne@68 3187 a list of quality scores : list
jpayne@68 3188 """
jpayne@68 3189 cdef uint32_t x = 0
jpayne@68 3190 cdef const bam_pileup1_t * p = NULL
jpayne@68 3191 cdef uint32_t c = 0
jpayne@68 3192 result = []
jpayne@68 3193 for x from 0 <= x < self.n_pu:
jpayne@68 3194 p = &(self.plp[0][x])
jpayne@68 3195 if p == NULL:
jpayne@68 3196 raise ValueError(
jpayne@68 3197 "pileup buffer out of sync - most likely use of iterator "
jpayne@68 3198 "outside loop")
jpayne@68 3199
jpayne@68 3200 if p.qpos < p.b.core.l_qseq:
jpayne@68 3201 c = bam_get_qual(p.b)[p.qpos]
jpayne@68 3202 else:
jpayne@68 3203 c = 0
jpayne@68 3204 if c < self.min_base_quality:
jpayne@68 3205 continue
jpayne@68 3206 result.append(c)
jpayne@68 3207 return result
jpayne@68 3208
jpayne@68 3209 def get_mapping_qualities(self):
jpayne@68 3210 """query mapping quality scores at pileup column position.
jpayne@68 3211
jpayne@68 3212 Returns
jpayne@68 3213 -------
jpayne@68 3214
jpayne@68 3215 a list of quality scores : list
jpayne@68 3216 """
jpayne@68 3217 if self.plp == NULL or self.plp[0] == NULL:
jpayne@68 3218 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@68 3219
jpayne@68 3220 cdef uint32_t x = 0
jpayne@68 3221 cdef const bam_pileup1_t * p = NULL
jpayne@68 3222 result = []
jpayne@68 3223 for x from 0 <= x < self.n_pu:
jpayne@68 3224 p = &(self.plp[0][x])
jpayne@68 3225 if p == NULL:
jpayne@68 3226 raise ValueError(
jpayne@68 3227 "pileup buffer out of sync - most likely use of iterator "
jpayne@68 3228 "outside loop")
jpayne@68 3229
jpayne@68 3230 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@68 3231 continue
jpayne@68 3232 result.append(p.b.core.qual)
jpayne@68 3233 return result
jpayne@68 3234
jpayne@68 3235 def get_query_positions(self):
jpayne@68 3236 """positions in read at pileup column position.
jpayne@68 3237
jpayne@68 3238 Returns
jpayne@68 3239 -------
jpayne@68 3240
jpayne@68 3241 a list of read positions : list
jpayne@68 3242 """
jpayne@68 3243 if self.plp == NULL or self.plp[0] == NULL:
jpayne@68 3244 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@68 3245
jpayne@68 3246 cdef uint32_t x = 0
jpayne@68 3247 cdef const bam_pileup1_t * p = NULL
jpayne@68 3248 result = []
jpayne@68 3249 for x from 0 <= x < self.n_pu:
jpayne@68 3250 p = &(self.plp[0][x])
jpayne@68 3251 if p == NULL:
jpayne@68 3252 raise ValueError(
jpayne@68 3253 "pileup buffer out of sync - most likely use of iterator "
jpayne@68 3254 "outside loop")
jpayne@68 3255
jpayne@68 3256 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@68 3257 continue
jpayne@68 3258 result.append(p.qpos)
jpayne@68 3259 return result
jpayne@68 3260
jpayne@68 3261 def get_query_names(self):
jpayne@68 3262 """query/read names aligned at pileup column position.
jpayne@68 3263
jpayne@68 3264 Returns
jpayne@68 3265 -------
jpayne@68 3266
jpayne@68 3267 a list of query names at pileup column position. : list
jpayne@68 3268 """
jpayne@68 3269 if self.plp == NULL or self.plp[0] == NULL:
jpayne@68 3270 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@68 3271
jpayne@68 3272 cdef uint32_t x = 0
jpayne@68 3273 cdef const bam_pileup1_t * p = NULL
jpayne@68 3274 result = []
jpayne@68 3275 for x from 0 <= x < self.n_pu:
jpayne@68 3276 p = &(self.plp[0][x])
jpayne@68 3277 if p == NULL:
jpayne@68 3278 raise ValueError(
jpayne@68 3279 "pileup buffer out of sync - most likely use of iterator "
jpayne@68 3280 "outside loop")
jpayne@68 3281
jpayne@68 3282 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@68 3283 continue
jpayne@68 3284 result.append(charptr_to_str(pysam_bam_get_qname(p.b)))
jpayne@68 3285 return result
jpayne@68 3286
jpayne@68 3287
jpayne@68 3288 cdef class PileupRead:
jpayne@68 3289 '''Representation of a read aligned to a particular position in the
jpayne@68 3290 reference sequence.
jpayne@68 3291
jpayne@68 3292 '''
jpayne@68 3293
jpayne@68 3294 def __init__(self):
jpayne@68 3295 raise TypeError(
jpayne@68 3296 "this class cannot be instantiated from Python")
jpayne@68 3297
jpayne@68 3298 def __str__(self):
jpayne@68 3299 return "\t".join(
jpayne@68 3300 map(str,
jpayne@68 3301 (self.alignment, self.query_position,
jpayne@68 3302 self.indel, self.level,
jpayne@68 3303 self.is_del, self.is_head,
jpayne@68 3304 self.is_tail, self.is_refskip)))
jpayne@68 3305
jpayne@68 3306 property alignment:
jpayne@68 3307 """a :class:`pysam.AlignedSegment` object of the aligned read"""
jpayne@68 3308 def __get__(self):
jpayne@68 3309 return self._alignment
jpayne@68 3310
jpayne@68 3311 property query_position:
jpayne@68 3312 """position of the read base at the pileup site, 0-based.
jpayne@68 3313 None if :attr:`is_del` or :attr:`is_refskip` is set.
jpayne@68 3314
jpayne@68 3315 """
jpayne@68 3316 def __get__(self):
jpayne@68 3317 if self.is_del or self.is_refskip:
jpayne@68 3318 return None
jpayne@68 3319 else:
jpayne@68 3320 return self._qpos
jpayne@68 3321
jpayne@68 3322 property query_position_or_next:
jpayne@68 3323 """position of the read base at the pileup site, 0-based.
jpayne@68 3324
jpayne@68 3325 If the current position is a deletion, returns the next
jpayne@68 3326 aligned base.
jpayne@68 3327
jpayne@68 3328 """
jpayne@68 3329 def __get__(self):
jpayne@68 3330 return self._qpos
jpayne@68 3331
jpayne@68 3332 property indel:
jpayne@68 3333 """indel length for the position following the current pileup site.
jpayne@68 3334
jpayne@68 3335 This quantity peeks ahead to the next cigar operation in this
jpayne@68 3336 alignment. If the next operation is an insertion, indel will
jpayne@68 3337 be positive. If the next operation is a deletion, it will be
jpayne@68 3338 negation. 0 if the next operation is not an indel.
jpayne@68 3339
jpayne@68 3340 """
jpayne@68 3341 def __get__(self):
jpayne@68 3342 return self._indel
jpayne@68 3343
jpayne@68 3344 property level:
jpayne@68 3345 """the level of the read in the "viewer" mode. Note that this value
jpayne@68 3346 is currently not computed."""
jpayne@68 3347 def __get__(self):
jpayne@68 3348 return self._level
jpayne@68 3349
jpayne@68 3350 property is_del:
jpayne@68 3351 """1 iff the base on the padded read is a deletion"""
jpayne@68 3352 def __get__(self):
jpayne@68 3353 return self._is_del
jpayne@68 3354
jpayne@68 3355 property is_head:
jpayne@68 3356 """1 iff the base on the padded read is the left-most base."""
jpayne@68 3357 def __get__(self):
jpayne@68 3358 return self._is_head
jpayne@68 3359
jpayne@68 3360 property is_tail:
jpayne@68 3361 """1 iff the base on the padded read is the right-most base."""
jpayne@68 3362 def __get__(self):
jpayne@68 3363 return self._is_tail
jpayne@68 3364
jpayne@68 3365 property is_refskip:
jpayne@68 3366 """1 iff the base on the padded read is part of CIGAR N op."""
jpayne@68 3367 def __get__(self):
jpayne@68 3368 return self._is_refskip
jpayne@68 3369
jpayne@68 3370
jpayne@68 3371
jpayne@68 3372 cpdef enum CIGAR_OPS:
jpayne@68 3373 CMATCH = 0
jpayne@68 3374 CINS = 1
jpayne@68 3375 CDEL = 2
jpayne@68 3376 CREF_SKIP = 3
jpayne@68 3377 CSOFT_CLIP = 4
jpayne@68 3378 CHARD_CLIP = 5
jpayne@68 3379 CPAD = 6
jpayne@68 3380 CEQUAL = 7
jpayne@68 3381 CDIFF = 8
jpayne@68 3382 CBACK = 9
jpayne@68 3383
jpayne@68 3384
jpayne@68 3385 cpdef enum SAM_FLAGS:
jpayne@68 3386 # the read is paired in sequencing, no matter whether it is mapped in a pair
jpayne@68 3387 FPAIRED = 1
jpayne@68 3388 # the read is mapped in a proper pair
jpayne@68 3389 FPROPER_PAIR = 2
jpayne@68 3390 # the read itself is unmapped; conflictive with FPROPER_PAIR
jpayne@68 3391 FUNMAP = 4
jpayne@68 3392 # the mate is unmapped
jpayne@68 3393 FMUNMAP = 8
jpayne@68 3394 # the read is mapped to the reverse strand
jpayne@68 3395 FREVERSE = 16
jpayne@68 3396 # the mate is mapped to the reverse strand
jpayne@68 3397 FMREVERSE = 32
jpayne@68 3398 # this is read1
jpayne@68 3399 FREAD1 = 64
jpayne@68 3400 # this is read2
jpayne@68 3401 FREAD2 = 128
jpayne@68 3402 # not primary alignment
jpayne@68 3403 FSECONDARY = 256
jpayne@68 3404 # QC failure
jpayne@68 3405 FQCFAIL = 512
jpayne@68 3406 # optical or PCR duplicate
jpayne@68 3407 FDUP = 1024
jpayne@68 3408 # supplementary alignment
jpayne@68 3409 FSUPPLEMENTARY = 2048
jpayne@68 3410
jpayne@68 3411
jpayne@68 3412 __all__ = [
jpayne@68 3413 "AlignedSegment",
jpayne@68 3414 "PileupColumn",
jpayne@68 3415 "PileupRead",
jpayne@68 3416 "CMATCH",
jpayne@68 3417 "CINS",
jpayne@68 3418 "CDEL",
jpayne@68 3419 "CREF_SKIP",
jpayne@68 3420 "CSOFT_CLIP",
jpayne@68 3421 "CHARD_CLIP",
jpayne@68 3422 "CPAD",
jpayne@68 3423 "CEQUAL",
jpayne@68 3424 "CDIFF",
jpayne@68 3425 "CBACK",
jpayne@68 3426 "FPAIRED",
jpayne@68 3427 "FPROPER_PAIR",
jpayne@68 3428 "FUNMAP",
jpayne@68 3429 "FMUNMAP",
jpayne@68 3430 "FREVERSE",
jpayne@68 3431 "FMREVERSE",
jpayne@68 3432 "FREAD1",
jpayne@68 3433 "FREAD2",
jpayne@68 3434 "FSECONDARY",
jpayne@68 3435 "FQCFAIL",
jpayne@68 3436 "FDUP",
jpayne@68 3437 "FSUPPLEMENTARY",
jpayne@68 3438 "KEY_NAMES"]