annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignedsegment.pyx @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 # cython: language_level=3
jpayne@69 2 # cython: embedsignature=True
jpayne@69 3 # cython: profile=True
jpayne@69 4 ###############################################################################
jpayne@69 5 ###############################################################################
jpayne@69 6 # Cython wrapper for SAM/BAM/CRAM files based on htslib
jpayne@69 7 ###############################################################################
jpayne@69 8 # The principal classes defined in this module are:
jpayne@69 9 #
jpayne@69 10 # class AlignedSegment an aligned segment (read)
jpayne@69 11 #
jpayne@69 12 # class PileupColumn a collection of segments (PileupRead) aligned to
jpayne@69 13 # a particular genomic position.
jpayne@69 14 #
jpayne@69 15 # class PileupRead an AlignedSegment aligned to a particular genomic
jpayne@69 16 # position. Contains additional attributes with respect
jpayne@69 17 # to this.
jpayne@69 18 #
jpayne@69 19 # Additionally this module defines numerous additional classes that are part
jpayne@69 20 # of the internal API. These are:
jpayne@69 21 #
jpayne@69 22 # Various iterator classes to iterate over alignments in sequential (IteratorRow)
jpayne@69 23 # or in a stacked fashion (IteratorColumn):
jpayne@69 24 #
jpayne@69 25 # class IteratorRow
jpayne@69 26 # class IteratorRowRegion
jpayne@69 27 # class IteratorRowHead
jpayne@69 28 # class IteratorRowAll
jpayne@69 29 # class IteratorRowAllRefs
jpayne@69 30 # class IteratorRowSelection
jpayne@69 31 #
jpayne@69 32 ###############################################################################
jpayne@69 33 #
jpayne@69 34 # The MIT License
jpayne@69 35 #
jpayne@69 36 # Copyright (c) 2015 Andreas Heger
jpayne@69 37 #
jpayne@69 38 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@69 39 # copy of this software and associated documentation files (the "Software"),
jpayne@69 40 # to deal in the Software without restriction, including without limitation
jpayne@69 41 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@69 42 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@69 43 # Software is furnished to do so, subject to the following conditions:
jpayne@69 44 #
jpayne@69 45 # The above copyright notice and this permission notice shall be included in
jpayne@69 46 # all copies or substantial portions of the Software.
jpayne@69 47 #
jpayne@69 48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@69 49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@69 50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@69 51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@69 52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@69 53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@69 54 # DEALINGS IN THE SOFTWARE.
jpayne@69 55 #
jpayne@69 56 ###############################################################################
jpayne@69 57 import re
jpayne@69 58 import array
jpayne@69 59 import json
jpayne@69 60 import string
jpayne@69 61 import ctypes
jpayne@69 62 import struct
jpayne@69 63
jpayne@69 64 cimport cython
jpayne@69 65 from cpython cimport array as c_array
jpayne@69 66 from cpython cimport PyBytes_FromStringAndSize
jpayne@69 67 from libc.string cimport memset, strchr
jpayne@69 68 from cpython cimport array as c_array
jpayne@69 69 from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
jpayne@69 70 INT8_MAX, INT16_MAX, INT32_MAX, \
jpayne@69 71 UINT8_MAX, UINT16_MAX, UINT32_MAX
jpayne@69 72
jpayne@69 73 from pysam.libchtslib cimport HTS_IDX_NOCOOR
jpayne@69 74 from pysam.libcutils cimport force_bytes, force_str, \
jpayne@69 75 charptr_to_str, charptr_to_bytes
jpayne@69 76 from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \
jpayne@69 77 array_to_qualitystring
jpayne@69 78
jpayne@69 79 # Constants for binary tag conversion
jpayne@69 80 cdef char * htslib_types = 'cCsSiIf'
jpayne@69 81 cdef char * parray_types = 'bBhHiIf'
jpayne@69 82
jpayne@69 83 # translation tables
jpayne@69 84
jpayne@69 85 # cigar code to character and vice versa
jpayne@69 86 cdef char* CODE2CIGAR= "MIDNSHP=XB"
jpayne@69 87 cdef int NCIGAR_CODES = 10
jpayne@69 88
jpayne@69 89 CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
jpayne@69 90 CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
jpayne@69 91
jpayne@69 92 # names for keys in dictionary representation of an AlignedSegment
jpayne@69 93 KEY_NAMES = ["name", "flag", "ref_name", "ref_pos", "map_quality", "cigar",
jpayne@69 94 "next_ref_name", "next_ref_pos", "length", "seq", "qual", "tags"]
jpayne@69 95
jpayne@69 96 #####################################################################
jpayne@69 97 # C multiplication with wrapping around
jpayne@69 98 cdef inline uint32_t c_mul(uint32_t a, uint32_t b):
jpayne@69 99 return (a * b) & 0xffffffff
jpayne@69 100
jpayne@69 101
jpayne@69 102 cdef inline uint8_t tolower(uint8_t ch):
jpayne@69 103 if ch >= 65 and ch <= 90:
jpayne@69 104 return ch + 32
jpayne@69 105 else:
jpayne@69 106 return ch
jpayne@69 107
jpayne@69 108
jpayne@69 109 cdef inline uint8_t toupper(uint8_t ch):
jpayne@69 110 if ch >= 97 and ch <= 122:
jpayne@69 111 return ch - 32
jpayne@69 112 else:
jpayne@69 113 return ch
jpayne@69 114
jpayne@69 115
jpayne@69 116 cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b):
jpayne@69 117 if ch == b'=':
jpayne@69 118 if bam_is_rev(b):
jpayne@69 119 return b','
jpayne@69 120 else:
jpayne@69 121 return b'.'
jpayne@69 122 else:
jpayne@69 123 if bam_is_rev(b):
jpayne@69 124 return tolower(ch)
jpayne@69 125 else:
jpayne@69 126 return toupper(ch)
jpayne@69 127
jpayne@69 128
jpayne@69 129 cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold):
jpayne@69 130 cdef uint32_t c
jpayne@69 131 if p.qpos < p.b.core.l_qseq:
jpayne@69 132 c = bam_get_qual(p.b)[p.qpos]
jpayne@69 133 else:
jpayne@69 134 c = 0
jpayne@69 135 if c < threshold:
jpayne@69 136 return True
jpayne@69 137 return False
jpayne@69 138
jpayne@69 139
jpayne@69 140 cdef inline char map_typecode_htslib_to_python(uint8_t s):
jpayne@69 141 """map an htslib typecode to the corresponding python typecode
jpayne@69 142 to be used in the struct or array modules."""
jpayne@69 143
jpayne@69 144 # map type from htslib to python array
jpayne@69 145 cdef char * f = strchr(htslib_types, s)
jpayne@69 146
jpayne@69 147 if f == NULL:
jpayne@69 148 return 0
jpayne@69 149 return parray_types[f - htslib_types]
jpayne@69 150
jpayne@69 151
jpayne@69 152 cdef inline uint8_t map_typecode_python_to_htslib(char s):
jpayne@69 153 """determine value type from type code of array"""
jpayne@69 154 cdef char * f = strchr(parray_types, s)
jpayne@69 155 if f == NULL:
jpayne@69 156 return 0
jpayne@69 157 return htslib_types[f - parray_types]
jpayne@69 158
jpayne@69 159
jpayne@69 160 cdef inline void update_bin(bam1_t * src):
jpayne@69 161 if src.core.flag & BAM_FUNMAP:
jpayne@69 162 # treat alignment as length of 1 for unmapped reads
jpayne@69 163 src.core.bin = hts_reg2bin(
jpayne@69 164 src.core.pos,
jpayne@69 165 src.core.pos + 1,
jpayne@69 166 14,
jpayne@69 167 5)
jpayne@69 168 elif pysam_get_n_cigar(src):
jpayne@69 169 src.core.bin = hts_reg2bin(
jpayne@69 170 src.core.pos,
jpayne@69 171 bam_endpos(src),
jpayne@69 172 14,
jpayne@69 173 5)
jpayne@69 174 else:
jpayne@69 175 src.core.bin = hts_reg2bin(
jpayne@69 176 src.core.pos,
jpayne@69 177 src.core.pos + 1,
jpayne@69 178 14,
jpayne@69 179 5)
jpayne@69 180
jpayne@69 181
jpayne@69 182 # optional tag data manipulation
jpayne@69 183 cdef convert_binary_tag(uint8_t * tag):
jpayne@69 184 """return bytesize, number of values and array of values
jpayne@69 185 in aux_data memory location pointed to by tag."""
jpayne@69 186 cdef uint8_t auxtype
jpayne@69 187 cdef uint8_t byte_size
jpayne@69 188 cdef int32_t nvalues
jpayne@69 189 # get byte size
jpayne@69 190 auxtype = tag[0]
jpayne@69 191 byte_size = aux_type2size(auxtype)
jpayne@69 192 tag += 1
jpayne@69 193 # get number of values in array
jpayne@69 194 nvalues = (<int32_t*>tag)[0]
jpayne@69 195 tag += 4
jpayne@69 196
jpayne@69 197 # define python array
jpayne@69 198 cdef c_array.array c_values = array.array(
jpayne@69 199 chr(map_typecode_htslib_to_python(auxtype)))
jpayne@69 200 c_array.resize(c_values, nvalues)
jpayne@69 201
jpayne@69 202 # copy data
jpayne@69 203 memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size)
jpayne@69 204
jpayne@69 205 # no need to check for endian-ness as bam1_core_t fields
jpayne@69 206 # and aux_data are in host endian-ness. See sam.c and calls
jpayne@69 207 # to swap_data
jpayne@69 208 return byte_size, nvalues, c_values
jpayne@69 209
jpayne@69 210
jpayne@69 211 cdef inline uint8_t get_tag_typecode(value, value_type=None):
jpayne@69 212 """guess type code for a *value*. If *value_type* is None, the type
jpayne@69 213 code will be inferred based on the Python type of *value*
jpayne@69 214
jpayne@69 215 """
jpayne@69 216 # 0 is unknown typecode
jpayne@69 217 cdef char typecode = 0
jpayne@69 218
jpayne@69 219 if value_type is None:
jpayne@69 220 if isinstance(value, int):
jpayne@69 221 if value < 0:
jpayne@69 222 if value >= INT8_MIN:
jpayne@69 223 typecode = b'c'
jpayne@69 224 elif value >= INT16_MIN:
jpayne@69 225 typecode = b's'
jpayne@69 226 elif value >= INT32_MIN:
jpayne@69 227 typecode = b'i'
jpayne@69 228 # unsigned ints
jpayne@69 229 else:
jpayne@69 230 if value <= UINT8_MAX:
jpayne@69 231 typecode = b'C'
jpayne@69 232 elif value <= UINT16_MAX:
jpayne@69 233 typecode = b'S'
jpayne@69 234 elif value <= UINT32_MAX:
jpayne@69 235 typecode = b'I'
jpayne@69 236 elif isinstance(value, float):
jpayne@69 237 typecode = b'f'
jpayne@69 238 elif isinstance(value, str):
jpayne@69 239 typecode = b'Z'
jpayne@69 240 elif isinstance(value, bytes):
jpayne@69 241 typecode = b'Z'
jpayne@69 242 elif isinstance(value, array.array) or \
jpayne@69 243 isinstance(value, list) or \
jpayne@69 244 isinstance(value, tuple):
jpayne@69 245 typecode = b'B'
jpayne@69 246 else:
jpayne@69 247 if value_type in 'aAsSIcCZidfH':
jpayne@69 248 typecode = force_bytes(value_type)[0]
jpayne@69 249
jpayne@69 250 return typecode
jpayne@69 251
jpayne@69 252
jpayne@69 253 cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None):
jpayne@69 254 '''returns the value typecode of a value.
jpayne@69 255
jpayne@69 256 If max is specified, the appropriate type is returned for a range
jpayne@69 257 where value is the minimum.
jpayne@69 258
jpayne@69 259 Note that this method returns types from the extended BAM alphabet
jpayne@69 260 of types that includes tags that are not part of the SAM
jpayne@69 261 specification.
jpayne@69 262 '''
jpayne@69 263
jpayne@69 264
jpayne@69 265 cdef uint8_t typecode
jpayne@69 266
jpayne@69 267 t = type(value)
jpayne@69 268
jpayne@69 269 if t is float:
jpayne@69 270 typecode = b'f'
jpayne@69 271 elif t is int:
jpayne@69 272 if max_value is None:
jpayne@69 273 max_value = value
jpayne@69 274 if min_value is None:
jpayne@69 275 min_value = value
jpayne@69 276 # signed ints
jpayne@69 277 if min_value < 0:
jpayne@69 278 if min_value >= INT8_MIN and max_value <= INT8_MAX:
jpayne@69 279 typecode = b'c'
jpayne@69 280 elif min_value >= INT16_MIN and max_value <= INT16_MAX:
jpayne@69 281 typecode = b's'
jpayne@69 282 elif min_value >= INT32_MIN or max_value <= INT32_MAX:
jpayne@69 283 typecode = b'i'
jpayne@69 284 else:
jpayne@69 285 raise ValueError(
jpayne@69 286 "at least one signed integer out of range of "
jpayne@69 287 "BAM/SAM specification")
jpayne@69 288 # unsigned ints
jpayne@69 289 else:
jpayne@69 290 if max_value <= UINT8_MAX:
jpayne@69 291 typecode = b'C'
jpayne@69 292 elif max_value <= UINT16_MAX:
jpayne@69 293 typecode = b'S'
jpayne@69 294 elif max_value <= UINT32_MAX:
jpayne@69 295 typecode = b'I'
jpayne@69 296 else:
jpayne@69 297 raise ValueError(
jpayne@69 298 "at least one integer out of range of BAM/SAM specification")
jpayne@69 299 else:
jpayne@69 300 # Note: hex strings (H) are not supported yet
jpayne@69 301 if t is not bytes:
jpayne@69 302 value = value.encode('ascii')
jpayne@69 303 if len(value) == 1:
jpayne@69 304 typecode = b'A'
jpayne@69 305 else:
jpayne@69 306 typecode = b'Z'
jpayne@69 307
jpayne@69 308 return typecode
jpayne@69 309
jpayne@69 310
jpayne@69 311 # mapping python array.array and htslib typecodes to struct typecodes
jpayne@69 312 DATATYPE2FORMAT = {
jpayne@69 313 ord('c'): ('b', 1),
jpayne@69 314 ord('C'): ('B', 1),
jpayne@69 315 ord('s'): ('h', 2),
jpayne@69 316 ord('S'): ('H', 2),
jpayne@69 317 ord('i'): ('i', 4),
jpayne@69 318 ord('I'): ('I', 4),
jpayne@69 319 ord('f'): ('f', 4),
jpayne@69 320 ord('d'): ('d', 8),
jpayne@69 321 ord('A'): ('c', 1),
jpayne@69 322 ord('a'): ('c', 1)}
jpayne@69 323
jpayne@69 324
jpayne@69 325 cdef inline pack_tags(tags):
jpayne@69 326 """pack a list of tags. Each tag is a tuple of (tag, tuple).
jpayne@69 327
jpayne@69 328 Values are packed into the most space efficient data structure
jpayne@69 329 possible unless the tag contains a third field with the typecode.
jpayne@69 330
jpayne@69 331 Returns a format string and the associated list of arguments to be
jpayne@69 332 used in a call to struct.pack_into.
jpayne@69 333 """
jpayne@69 334 fmts, args = ["<"], []
jpayne@69 335
jpayne@69 336 # htslib typecode
jpayne@69 337 cdef uint8_t typecode
jpayne@69 338 for tag in tags:
jpayne@69 339
jpayne@69 340 if len(tag) == 2:
jpayne@69 341 pytag, value = tag
jpayne@69 342 valuetype = None
jpayne@69 343 elif len(tag) == 3:
jpayne@69 344 pytag, value, valuetype = tag
jpayne@69 345 else:
jpayne@69 346 raise ValueError("malformatted tag: %s" % str(tag))
jpayne@69 347
jpayne@69 348 if valuetype is None:
jpayne@69 349 typecode = 0
jpayne@69 350 else:
jpayne@69 351 # only first character in valuecode matters
jpayne@69 352 typecode = force_bytes(valuetype)[0]
jpayne@69 353
jpayne@69 354 pytag = force_bytes(pytag)
jpayne@69 355 pytype = type(value)
jpayne@69 356
jpayne@69 357 if pytype is tuple or pytype is list:
jpayne@69 358 # binary tags from tuples or lists
jpayne@69 359 if not typecode:
jpayne@69 360 # automatically determine value type - first value
jpayne@69 361 # determines type. If there is a mix of types, the
jpayne@69 362 # result is undefined.
jpayne@69 363 typecode = get_btag_typecode(min(value),
jpayne@69 364 min_value=min(value),
jpayne@69 365 max_value=max(value))
jpayne@69 366
jpayne@69 367 if typecode not in DATATYPE2FORMAT:
jpayne@69 368 raise ValueError("invalid value type '{}'".format(chr(typecode)))
jpayne@69 369
jpayne@69 370 datafmt = "2sBBI%i%s" % (len(value), DATATYPE2FORMAT[typecode][0])
jpayne@69 371 args.extend([pytag[:2],
jpayne@69 372 ord("B"),
jpayne@69 373 typecode,
jpayne@69 374 len(value)] + list(value))
jpayne@69 375
jpayne@69 376 elif isinstance(value, array.array):
jpayne@69 377 # binary tags from arrays
jpayne@69 378 if typecode == 0:
jpayne@69 379 typecode = map_typecode_python_to_htslib(ord(value.typecode))
jpayne@69 380
jpayne@69 381 if typecode == 0:
jpayne@69 382 raise ValueError("unsupported type code '{}'".format(value.typecode))
jpayne@69 383
jpayne@69 384 if typecode not in DATATYPE2FORMAT:
jpayne@69 385 raise ValueError("invalid value type '{}'".format(chr(typecode)))
jpayne@69 386
jpayne@69 387 # use array.tostring() to retrieve byte representation and
jpayne@69 388 # save as bytes
jpayne@69 389 datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1])
jpayne@69 390 args.extend([pytag[:2],
jpayne@69 391 ord("B"),
jpayne@69 392 typecode,
jpayne@69 393 len(value),
jpayne@69 394 value.tobytes()])
jpayne@69 395
jpayne@69 396 else:
jpayne@69 397 if typecode == 0:
jpayne@69 398 typecode = get_tag_typecode(value)
jpayne@69 399 if typecode == 0:
jpayne@69 400 raise ValueError("could not deduce typecode for value {}".format(value))
jpayne@69 401
jpayne@69 402 if typecode == b'a' or typecode == b'A' or typecode == b'Z' or typecode == b'H':
jpayne@69 403 value = force_bytes(value)
jpayne@69 404
jpayne@69 405 if typecode == b"a":
jpayne@69 406 typecode = b'A'
jpayne@69 407
jpayne@69 408 if typecode == b'Z' or typecode == b'H':
jpayne@69 409 datafmt = "2sB%is" % (len(value)+1)
jpayne@69 410 elif typecode in DATATYPE2FORMAT:
jpayne@69 411 datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0]
jpayne@69 412 else:
jpayne@69 413 raise ValueError("invalid value type '{}'".format(chr(typecode)))
jpayne@69 414
jpayne@69 415 args.extend([pytag[:2],
jpayne@69 416 typecode,
jpayne@69 417 value])
jpayne@69 418
jpayne@69 419 fmts.append(datafmt)
jpayne@69 420
jpayne@69 421 return "".join(fmts), args
jpayne@69 422
jpayne@69 423
jpayne@69 424 cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src):
jpayne@69 425 """return query length computed from CIGAR alignment.
jpayne@69 426
jpayne@69 427 Length ignores hard-clipped bases.
jpayne@69 428
jpayne@69 429 Return 0 if there is no CIGAR alignment.
jpayne@69 430 """
jpayne@69 431
jpayne@69 432 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@69 433
jpayne@69 434 if cigar_p == NULL:
jpayne@69 435 return 0
jpayne@69 436
jpayne@69 437 cdef uint32_t k, qpos
jpayne@69 438 cdef int op
jpayne@69 439 qpos = 0
jpayne@69 440
jpayne@69 441 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 442 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 443
jpayne@69 444 if op == BAM_CMATCH or \
jpayne@69 445 op == BAM_CINS or \
jpayne@69 446 op == BAM_CSOFT_CLIP or \
jpayne@69 447 op == BAM_CEQUAL or \
jpayne@69 448 op == BAM_CDIFF:
jpayne@69 449 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 450
jpayne@69 451 return qpos
jpayne@69 452
jpayne@69 453
jpayne@69 454 cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src):
jpayne@69 455 """return query length computed from CIGAR alignment.
jpayne@69 456
jpayne@69 457 Length includes hard-clipped bases.
jpayne@69 458
jpayne@69 459 Return 0 if there is no CIGAR alignment.
jpayne@69 460 """
jpayne@69 461
jpayne@69 462 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@69 463
jpayne@69 464 if cigar_p == NULL:
jpayne@69 465 return 0
jpayne@69 466
jpayne@69 467 cdef uint32_t k, qpos
jpayne@69 468 cdef int op
jpayne@69 469 qpos = 0
jpayne@69 470
jpayne@69 471 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 472 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 473
jpayne@69 474 if op == BAM_CMATCH or \
jpayne@69 475 op == BAM_CINS or \
jpayne@69 476 op == BAM_CSOFT_CLIP or \
jpayne@69 477 op == BAM_CHARD_CLIP or \
jpayne@69 478 op == BAM_CEQUAL or \
jpayne@69 479 op == BAM_CDIFF:
jpayne@69 480 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 481
jpayne@69 482 return qpos
jpayne@69 483
jpayne@69 484
jpayne@69 485 cdef inline int32_t getQueryStart(bam1_t *src) except -1:
jpayne@69 486 cdef uint32_t * cigar_p
jpayne@69 487 cdef uint32_t start_offset = 0
jpayne@69 488 cdef uint32_t k, op
jpayne@69 489
jpayne@69 490 cigar_p = pysam_bam_get_cigar(src);
jpayne@69 491 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 492 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 493 if op == BAM_CHARD_CLIP:
jpayne@69 494 if start_offset != 0 and start_offset != src.core.l_qseq:
jpayne@69 495 raise ValueError('Invalid clipping in CIGAR string')
jpayne@69 496 elif op == BAM_CSOFT_CLIP:
jpayne@69 497 start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 498 else:
jpayne@69 499 break
jpayne@69 500
jpayne@69 501 return start_offset
jpayne@69 502
jpayne@69 503
jpayne@69 504 cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
jpayne@69 505 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@69 506 cdef uint32_t end_offset = src.core.l_qseq
jpayne@69 507 cdef uint32_t k, op
jpayne@69 508
jpayne@69 509 # if there is no sequence, compute length from cigar string
jpayne@69 510 if end_offset == 0:
jpayne@69 511 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 512 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 513 if op == BAM_CMATCH or \
jpayne@69 514 op == BAM_CINS or \
jpayne@69 515 op == BAM_CEQUAL or \
jpayne@69 516 op == BAM_CDIFF or \
jpayne@69 517 (op == BAM_CSOFT_CLIP and end_offset == 0):
jpayne@69 518 end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 519 else:
jpayne@69 520 # walk backwards in cigar string
jpayne@69 521 for k from pysam_get_n_cigar(src) > k >= 1:
jpayne@69 522 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 523 if op == BAM_CHARD_CLIP:
jpayne@69 524 if end_offset != src.core.l_qseq:
jpayne@69 525 raise ValueError('Invalid clipping in CIGAR string')
jpayne@69 526 elif op == BAM_CSOFT_CLIP:
jpayne@69 527 end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 528 else:
jpayne@69 529 break
jpayne@69 530
jpayne@69 531 return end_offset
jpayne@69 532
jpayne@69 533
jpayne@69 534 cdef inline bytes getSequenceInRange(bam1_t *src,
jpayne@69 535 uint32_t start,
jpayne@69 536 uint32_t end):
jpayne@69 537 """return python string of the sequence in a bam1_t object.
jpayne@69 538 """
jpayne@69 539
jpayne@69 540 cdef uint8_t * p
jpayne@69 541 cdef uint32_t k
jpayne@69 542 cdef char * s
jpayne@69 543
jpayne@69 544 if not src.core.l_qseq:
jpayne@69 545 return None
jpayne@69 546
jpayne@69 547 seq = PyBytes_FromStringAndSize(NULL, end - start)
jpayne@69 548 s = <char*>seq
jpayne@69 549 p = pysam_bam_get_seq(src)
jpayne@69 550
jpayne@69 551 for k from start <= k < end:
jpayne@69 552 # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
jpayne@69 553 # note: do not use string literal as it will be a python string
jpayne@69 554 s[k-start] = seq_nt16_str[p[k//2] >> 4 * (1 - k%2) & 0xf]
jpayne@69 555
jpayne@69 556 return charptr_to_bytes(seq)
jpayne@69 557
jpayne@69 558
jpayne@69 559 cdef inline object getQualitiesInRange(bam1_t *src,
jpayne@69 560 uint32_t start,
jpayne@69 561 uint32_t end):
jpayne@69 562 """return python array of quality values from a bam1_t object"""
jpayne@69 563
jpayne@69 564 cdef uint8_t * p
jpayne@69 565 cdef uint32_t k
jpayne@69 566
jpayne@69 567 p = pysam_bam_get_qual(src)
jpayne@69 568 if p[0] == 0xff:
jpayne@69 569 return None
jpayne@69 570
jpayne@69 571 # 'B': unsigned char
jpayne@69 572 cdef c_array.array result = array.array('B', [0])
jpayne@69 573 c_array.resize(result, end - start)
jpayne@69 574
jpayne@69 575 # copy data
jpayne@69 576 memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
jpayne@69 577
jpayne@69 578 return result
jpayne@69 579
jpayne@69 580
jpayne@69 581 #####################################################################
jpayne@69 582 ## factory methods for instantiating extension classes
jpayne@69 583 cdef class AlignedSegment
jpayne@69 584 cdef AlignedSegment makeAlignedSegment(bam1_t *src,
jpayne@69 585 AlignmentHeader header):
jpayne@69 586 '''return an AlignedSegment object constructed from `src`'''
jpayne@69 587 # note that the following does not call __init__
jpayne@69 588 cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
jpayne@69 589 dest._delegate = bam_dup1(src)
jpayne@69 590 dest.header = header
jpayne@69 591 return dest
jpayne@69 592
jpayne@69 593
jpayne@69 594 cdef class PileupColumn
jpayne@69 595 cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp,
jpayne@69 596 int tid,
jpayne@69 597 int pos,
jpayne@69 598 int n_pu,
jpayne@69 599 uint32_t min_base_quality,
jpayne@69 600 char * reference_sequence,
jpayne@69 601 AlignmentHeader header):
jpayne@69 602 '''return a PileupColumn object constructed from pileup in `plp` and
jpayne@69 603 setting additional attributes.
jpayne@69 604
jpayne@69 605 '''
jpayne@69 606 # note that the following does not call __init__
jpayne@69 607 cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
jpayne@69 608 dest.header = header
jpayne@69 609 dest.plp = plp
jpayne@69 610 dest.tid = tid
jpayne@69 611 dest.pos = pos
jpayne@69 612 dest.n_pu = n_pu
jpayne@69 613 dest.min_base_quality = min_base_quality
jpayne@69 614 dest.reference_sequence = reference_sequence
jpayne@69 615 dest.buf.l = dest.buf.m = 0
jpayne@69 616 dest.buf.s = NULL
jpayne@69 617
jpayne@69 618 return dest
jpayne@69 619
jpayne@69 620
jpayne@69 621 cdef class PileupRead
jpayne@69 622 cdef PileupRead makePileupRead(const bam_pileup1_t *src,
jpayne@69 623 AlignmentHeader header):
jpayne@69 624 '''return a PileupRead object construted from a bam_pileup1_t * object.'''
jpayne@69 625 # note that the following does not call __init__
jpayne@69 626 cdef PileupRead dest = PileupRead.__new__(PileupRead)
jpayne@69 627 dest._alignment = makeAlignedSegment(src.b, header)
jpayne@69 628 dest._qpos = src.qpos
jpayne@69 629 dest._indel = src.indel
jpayne@69 630 dest._level = src.level
jpayne@69 631 dest._is_del = src.is_del
jpayne@69 632 dest._is_head = src.is_head
jpayne@69 633 dest._is_tail = src.is_tail
jpayne@69 634 dest._is_refskip = src.is_refskip
jpayne@69 635 return dest
jpayne@69 636
jpayne@69 637
jpayne@69 638 cdef inline uint32_t get_alignment_length(bam1_t *src):
jpayne@69 639 cdef uint32_t k = 0
jpayne@69 640 cdef uint32_t l = 0
jpayne@69 641 if src == NULL:
jpayne@69 642 return 0
jpayne@69 643 cdef uint32_t * cigar_p = bam_get_cigar(src)
jpayne@69 644 if cigar_p == NULL:
jpayne@69 645 return 0
jpayne@69 646 cdef int op
jpayne@69 647 cdef uint32_t n = pysam_get_n_cigar(src)
jpayne@69 648 for k from 0 <= k < n:
jpayne@69 649 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 650 if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
jpayne@69 651 continue
jpayne@69 652 l += cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 653 return l
jpayne@69 654
jpayne@69 655
jpayne@69 656 cdef inline uint32_t get_md_reference_length(char * md_tag):
jpayne@69 657 cdef int l = 0
jpayne@69 658 cdef int md_idx = 0
jpayne@69 659 cdef int nmatches = 0
jpayne@69 660
jpayne@69 661 while md_tag[md_idx] != 0:
jpayne@69 662 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
jpayne@69 663 nmatches *= 10
jpayne@69 664 nmatches += md_tag[md_idx] - 48
jpayne@69 665 md_idx += 1
jpayne@69 666 continue
jpayne@69 667 else:
jpayne@69 668 l += nmatches
jpayne@69 669 nmatches = 0
jpayne@69 670 if md_tag[md_idx] == b'^':
jpayne@69 671 md_idx += 1
jpayne@69 672 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
jpayne@69 673 md_idx += 1
jpayne@69 674 l += 1
jpayne@69 675 else:
jpayne@69 676 md_idx += 1
jpayne@69 677 l += 1
jpayne@69 678
jpayne@69 679 l += nmatches
jpayne@69 680 return l
jpayne@69 681
jpayne@69 682 # TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ...
jpayne@69 683 cdef inline bytes build_alignment_sequence(bam1_t * src):
jpayne@69 684 """return expanded sequence from MD tag.
jpayne@69 685
jpayne@69 686 The sequence includes substitutions and both insertions in the
jpayne@69 687 reference as well as deletions to the reference sequence. Combine
jpayne@69 688 with the cigar string to reconstitute the query or the reference
jpayne@69 689 sequence.
jpayne@69 690
jpayne@69 691 Positions corresponding to `N` (skipped region from the reference)
jpayne@69 692 or `P` (padding (silent deletion from padded reference)) in the CIGAR
jpayne@69 693 string will not appear in the returned sequence. The MD should
jpayne@69 694 correspondingly not contain these. Thus proper tags are::
jpayne@69 695
jpayne@69 696 Deletion from the reference: cigar=5M1D5M MD=5^C5
jpayne@69 697 Skipped region from reference: cigar=5M1N5M MD=10
jpayne@69 698 Padded region in the reference: cigar=5M1P5M MD=10
jpayne@69 699
jpayne@69 700 Returns
jpayne@69 701 -------
jpayne@69 702
jpayne@69 703 None, if no MD tag is present.
jpayne@69 704
jpayne@69 705 """
jpayne@69 706 if src == NULL:
jpayne@69 707 return None
jpayne@69 708
jpayne@69 709 cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
jpayne@69 710 if md_tag_ptr == NULL:
jpayne@69 711 return None
jpayne@69 712
jpayne@69 713 cdef uint32_t start = getQueryStart(src)
jpayne@69 714 cdef uint32_t end = getQueryEnd(src)
jpayne@69 715 # get read sequence, taking into account soft-clipping
jpayne@69 716 r = getSequenceInRange(src, start, end)
jpayne@69 717 cdef char * read_sequence = r
jpayne@69 718 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@69 719 if cigar_p == NULL:
jpayne@69 720 return None
jpayne@69 721
jpayne@69 722 cdef uint32_t r_idx = 0
jpayne@69 723 cdef int op
jpayne@69 724 cdef uint32_t k, i, l, x
jpayne@69 725 cdef int nmatches = 0
jpayne@69 726 cdef int s_idx = 0
jpayne@69 727
jpayne@69 728 cdef uint32_t max_len = get_alignment_length(src)
jpayne@69 729 if max_len == 0:
jpayne@69 730 raise ValueError("could not determine alignment length")
jpayne@69 731
jpayne@69 732 cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
jpayne@69 733 if s == NULL:
jpayne@69 734 raise ValueError(
jpayne@69 735 "could not allocate sequence of length %i" % max_len)
jpayne@69 736
jpayne@69 737 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 738 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 739 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 740 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@69 741 for i from 0 <= i < l:
jpayne@69 742 s[s_idx] = read_sequence[r_idx]
jpayne@69 743 r_idx += 1
jpayne@69 744 s_idx += 1
jpayne@69 745 elif op == BAM_CDEL:
jpayne@69 746 for i from 0 <= i < l:
jpayne@69 747 s[s_idx] = b'-'
jpayne@69 748 s_idx += 1
jpayne@69 749 elif op == BAM_CREF_SKIP:
jpayne@69 750 pass
jpayne@69 751 elif op == BAM_CINS or op == BAM_CPAD:
jpayne@69 752 for i from 0 <= i < l:
jpayne@69 753 # encode insertions into reference as lowercase
jpayne@69 754 s[s_idx] = read_sequence[r_idx] + 32
jpayne@69 755 r_idx += 1
jpayne@69 756 s_idx += 1
jpayne@69 757 elif op == BAM_CSOFT_CLIP:
jpayne@69 758 pass
jpayne@69 759 elif op == BAM_CHARD_CLIP:
jpayne@69 760 pass # advances neither
jpayne@69 761
jpayne@69 762 cdef char md_buffer[2]
jpayne@69 763 cdef char *md_tag
jpayne@69 764 cdef uint8_t md_typecode = md_tag_ptr[0]
jpayne@69 765 if md_typecode == b'Z':
jpayne@69 766 md_tag = bam_aux2Z(md_tag_ptr)
jpayne@69 767 elif md_typecode == b'A':
jpayne@69 768 # Work around HTSeq bug that writes 1-character strings as MD:A:v
jpayne@69 769 md_buffer[0] = bam_aux2A(md_tag_ptr)
jpayne@69 770 md_buffer[1] = b'\0'
jpayne@69 771 md_tag = md_buffer
jpayne@69 772 else:
jpayne@69 773 raise TypeError('Tagged field MD:{}:<value> does not have expected type MD:Z'.format(chr(md_typecode)))
jpayne@69 774
jpayne@69 775 cdef int md_idx = 0
jpayne@69 776 cdef char c
jpayne@69 777 s_idx = 0
jpayne@69 778
jpayne@69 779 # Check if MD tag is valid by matching CIGAR length to MD tag defined length
jpayne@69 780 # Insertions would be in addition to what is described by MD, so we calculate
jpayne@69 781 # the number of insertions separately.
jpayne@69 782 cdef int insertions = 0
jpayne@69 783
jpayne@69 784 while s[s_idx] != 0:
jpayne@69 785 if s[s_idx] >= b'a':
jpayne@69 786 insertions += 1
jpayne@69 787 s_idx += 1
jpayne@69 788 s_idx = 0
jpayne@69 789
jpayne@69 790 cdef uint32_t md_len = get_md_reference_length(md_tag)
jpayne@69 791 if md_len + insertions > max_len:
jpayne@69 792 free(s)
jpayne@69 793 raise AssertionError(
jpayne@69 794 "Invalid MD tag: MD length {} mismatch with CIGAR length {} and {} insertions".format(
jpayne@69 795 md_len, max_len, insertions))
jpayne@69 796
jpayne@69 797 while md_tag[md_idx] != 0:
jpayne@69 798 # c is numerical
jpayne@69 799 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
jpayne@69 800 nmatches *= 10
jpayne@69 801 nmatches += md_tag[md_idx] - 48
jpayne@69 802 md_idx += 1
jpayne@69 803 continue
jpayne@69 804 else:
jpayne@69 805 # save matches up to this point, skipping insertions
jpayne@69 806 for x from 0 <= x < nmatches:
jpayne@69 807 while s[s_idx] >= b'a':
jpayne@69 808 s_idx += 1
jpayne@69 809 s_idx += 1
jpayne@69 810 while s[s_idx] >= b'a':
jpayne@69 811 s_idx += 1
jpayne@69 812
jpayne@69 813 r_idx += nmatches
jpayne@69 814 nmatches = 0
jpayne@69 815 if md_tag[md_idx] == b'^':
jpayne@69 816 md_idx += 1
jpayne@69 817 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
jpayne@69 818 # assert s[s_idx] == '-'
jpayne@69 819 s[s_idx] = md_tag[md_idx]
jpayne@69 820 s_idx += 1
jpayne@69 821 md_idx += 1
jpayne@69 822 else:
jpayne@69 823 # save mismatch
jpayne@69 824 # enforce lower case
jpayne@69 825 c = md_tag[md_idx]
jpayne@69 826 if c <= 90:
jpayne@69 827 c += 32
jpayne@69 828 s[s_idx] = c
jpayne@69 829 s_idx += 1
jpayne@69 830 r_idx += 1
jpayne@69 831 md_idx += 1
jpayne@69 832
jpayne@69 833 # save matches up to this point, skipping insertions
jpayne@69 834 for x from 0 <= x < nmatches:
jpayne@69 835 while s[s_idx] >= b'a':
jpayne@69 836 s_idx += 1
jpayne@69 837 s_idx += 1
jpayne@69 838 while s[s_idx] >= b'a':
jpayne@69 839 s_idx += 1
jpayne@69 840
jpayne@69 841 seq = PyBytes_FromStringAndSize(s, s_idx)
jpayne@69 842 free(s)
jpayne@69 843
jpayne@69 844 return seq
jpayne@69 845
jpayne@69 846
jpayne@69 847 cdef inline bytes build_reference_sequence(bam1_t * src):
jpayne@69 848 """return the reference sequence in the region that is covered by the
jpayne@69 849 alignment of the read to the reference.
jpayne@69 850
jpayne@69 851 This method requires the MD tag to be set.
jpayne@69 852
jpayne@69 853 """
jpayne@69 854 cdef uint32_t k, i, l
jpayne@69 855 cdef int op
jpayne@69 856 cdef int s_idx = 0
jpayne@69 857 ref_seq = build_alignment_sequence(src)
jpayne@69 858 if ref_seq is None:
jpayne@69 859 raise ValueError("MD tag not present")
jpayne@69 860
jpayne@69 861 cdef char * s = <char*>calloc(len(ref_seq) + 1, sizeof(char))
jpayne@69 862 if s == NULL:
jpayne@69 863 raise ValueError(
jpayne@69 864 "could not allocate sequence of length %i" % len(ref_seq))
jpayne@69 865
jpayne@69 866 cdef char * cref_seq = ref_seq
jpayne@69 867 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@69 868 cdef uint32_t r_idx = 0
jpayne@69 869 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 870 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 871 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 872 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@69 873 for i from 0 <= i < l:
jpayne@69 874 s[s_idx] = cref_seq[r_idx]
jpayne@69 875 r_idx += 1
jpayne@69 876 s_idx += 1
jpayne@69 877 elif op == BAM_CDEL:
jpayne@69 878 for i from 0 <= i < l:
jpayne@69 879 s[s_idx] = cref_seq[r_idx]
jpayne@69 880 r_idx += 1
jpayne@69 881 s_idx += 1
jpayne@69 882 elif op == BAM_CREF_SKIP:
jpayne@69 883 pass
jpayne@69 884 elif op == BAM_CINS or op == BAM_CPAD:
jpayne@69 885 r_idx += l
jpayne@69 886 elif op == BAM_CSOFT_CLIP:
jpayne@69 887 pass
jpayne@69 888 elif op == BAM_CHARD_CLIP:
jpayne@69 889 pass # advances neither
jpayne@69 890
jpayne@69 891 seq = PyBytes_FromStringAndSize(s, s_idx)
jpayne@69 892 free(s)
jpayne@69 893
jpayne@69 894 return seq
jpayne@69 895
jpayne@69 896
jpayne@69 897 cdef inline str safe_reference_name(AlignmentHeader header, int tid):
jpayne@69 898 if tid == -1: return "*"
jpayne@69 899 elif header is not None: return header.get_reference_name(tid)
jpayne@69 900 else: return f"#{tid}"
jpayne@69 901
jpayne@69 902
jpayne@69 903 # Tuple-building helper functions used by AlignedSegment.get_aligned_pairs()
jpayne@69 904
jpayne@69 905 cdef _alignedpairs_positions(qpos, pos, ref_seq, uint32_t r_idx, int op):
jpayne@69 906 return (qpos, pos)
jpayne@69 907
jpayne@69 908
jpayne@69 909 cdef _alignedpairs_with_seq(qpos, pos, ref_seq, uint32_t r_idx, int op):
jpayne@69 910 ref_base = ref_seq[r_idx] if ref_seq is not None else None
jpayne@69 911 return (qpos, pos, ref_base)
jpayne@69 912
jpayne@69 913
jpayne@69 914 cdef _alignedpairs_with_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
jpayne@69 915 return (qpos, pos, CIGAR_OPS(op))
jpayne@69 916
jpayne@69 917
jpayne@69 918 cdef _alignedpairs_with_seq_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
jpayne@69 919 ref_base = ref_seq[r_idx] if ref_seq is not None else None
jpayne@69 920 return (qpos, pos, ref_base, CIGAR_OPS(op))
jpayne@69 921
jpayne@69 922
jpayne@69 923 cdef class AlignedSegment:
jpayne@69 924 '''Class representing an aligned segment.
jpayne@69 925
jpayne@69 926 This class stores a handle to the samtools C-structure representing
jpayne@69 927 an aligned read. Member read access is forwarded to the C-structure
jpayne@69 928 and converted into python objects. This implementation should be fast,
jpayne@69 929 as only the data needed is converted.
jpayne@69 930
jpayne@69 931 For write access, the C-structure is updated in-place. This is
jpayne@69 932 not the most efficient way to build BAM entries, as the variable
jpayne@69 933 length data is concatenated and thus needs to be resized if
jpayne@69 934 a field is updated. Furthermore, the BAM entry might be
jpayne@69 935 in an inconsistent state.
jpayne@69 936
jpayne@69 937 One issue to look out for is that the sequence should always
jpayne@69 938 be set *before* the quality scores. Setting the sequence will
jpayne@69 939 also erase any quality scores that were set previously.
jpayne@69 940
jpayne@69 941 Parameters
jpayne@69 942 ----------
jpayne@69 943
jpayne@69 944 header:
jpayne@69 945 :class:`~pysam.AlignmentHeader` object to map numerical
jpayne@69 946 identifiers to chromosome names. If not given, an empty
jpayne@69 947 header is created.
jpayne@69 948 '''
jpayne@69 949
jpayne@69 950 # Now only called when instances are created from Python
jpayne@69 951 def __init__(self, AlignmentHeader header=None):
jpayne@69 952 # see bam_init1
jpayne@69 953 self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
jpayne@69 954 if self._delegate == NULL:
jpayne@69 955 raise MemoryError("could not allocated memory of {} bytes".format(sizeof(bam1_t)))
jpayne@69 956 # allocate some memory. If size is 0, calloc does not return a
jpayne@69 957 # pointer that can be passed to free() so allocate 40 bytes
jpayne@69 958 # for a new read
jpayne@69 959 self._delegate.m_data = 40
jpayne@69 960 self._delegate.data = <uint8_t *>calloc(
jpayne@69 961 self._delegate.m_data, 1)
jpayne@69 962 if self._delegate.data == NULL:
jpayne@69 963 raise MemoryError("could not allocate memory of {} bytes".format(self._delegate.m_data))
jpayne@69 964 self._delegate.l_data = 0
jpayne@69 965 # set some data to make read approximately legit.
jpayne@69 966 # Note, SAM writing fails with q_name of length 0
jpayne@69 967 self._delegate.core.l_qname = 0
jpayne@69 968 self._delegate.core.tid = -1
jpayne@69 969 self._delegate.core.pos = -1
jpayne@69 970 self._delegate.core.mtid = -1
jpayne@69 971 self._delegate.core.mpos = -1
jpayne@69 972
jpayne@69 973 # caching for selected fields
jpayne@69 974 self.cache_query_qualities = None
jpayne@69 975 self.cache_query_alignment_qualities = None
jpayne@69 976 self.cache_query_sequence = None
jpayne@69 977 self.cache_query_alignment_sequence = None
jpayne@69 978
jpayne@69 979 self.header = header
jpayne@69 980
jpayne@69 981 def __dealloc__(self):
jpayne@69 982 bam_destroy1(self._delegate)
jpayne@69 983
jpayne@69 984 def __str__(self):
jpayne@69 985 """return string representation of alignment.
jpayne@69 986
jpayne@69 987 The representation is an approximate :term:`SAM` format, because
jpayne@69 988 an aligned read might not be associated with a :term:`AlignmentFile`.
jpayne@69 989 Hence when the read does not have an associated :class:`AlignedHeader`,
jpayne@69 990 :term:`tid` is shown instead of the reference name.
jpayne@69 991 Similarly, the tags field is returned in its parsed state.
jpayne@69 992
jpayne@69 993 To get a valid SAM record, use :meth:`to_string`.
jpayne@69 994 """
jpayne@69 995 # sam-parsing is done in sam.c/bam_format1_core which
jpayne@69 996 # requires a valid header.
jpayne@69 997 return "\t".join(map(str, (self.query_name,
jpayne@69 998 self.flag,
jpayne@69 999 safe_reference_name(self.header, self.reference_id),
jpayne@69 1000 self.reference_start + 1,
jpayne@69 1001 self.mapping_quality,
jpayne@69 1002 self.cigarstring,
jpayne@69 1003 safe_reference_name(self.header, self.next_reference_id),
jpayne@69 1004 self.next_reference_start + 1,
jpayne@69 1005 self.template_length,
jpayne@69 1006 self.query_sequence,
jpayne@69 1007 self.query_qualities,
jpayne@69 1008 self.tags)))
jpayne@69 1009
jpayne@69 1010 def __repr__(self):
jpayne@69 1011 ref = self.reference_name if self.header is not None else self.reference_id
jpayne@69 1012 return f'<{type(self).__name__}({self.query_name!r}, flags={self.flag}={self.flag:#x}, ref={ref!r}, zpos={self.reference_start}, mapq={self.mapping_quality}, cigar={self.cigarstring!r}, ...)>'
jpayne@69 1013
jpayne@69 1014 def __copy__(self):
jpayne@69 1015 return makeAlignedSegment(self._delegate, self.header)
jpayne@69 1016
jpayne@69 1017 def __deepcopy__(self, memo):
jpayne@69 1018 return makeAlignedSegment(self._delegate, self.header)
jpayne@69 1019
jpayne@69 1020 def compare(self, AlignedSegment other):
jpayne@69 1021 '''return -1,0,1, if contents in this are binary
jpayne@69 1022 <,=,> to *other*
jpayne@69 1023 '''
jpayne@69 1024
jpayne@69 1025 # avoid segfault when other equals None
jpayne@69 1026 if other is None:
jpayne@69 1027 return -1
jpayne@69 1028
jpayne@69 1029 cdef int retval, x
jpayne@69 1030 cdef bam1_t *t
jpayne@69 1031 cdef bam1_t *o
jpayne@69 1032
jpayne@69 1033 t = self._delegate
jpayne@69 1034 o = other._delegate
jpayne@69 1035
jpayne@69 1036 # uncomment for debugging purposes
jpayne@69 1037 # cdef unsigned char * oo, * tt
jpayne@69 1038 # tt = <unsigned char*>(&t.core)
jpayne@69 1039 # oo = <unsigned char*>(&o.core)
jpayne@69 1040 # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
jpayne@69 1041 # tt = <unsigned char*>(t.data)
jpayne@69 1042 # oo = <unsigned char*>(o.data)
jpayne@69 1043 # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
jpayne@69 1044
jpayne@69 1045 # Fast-path test for object identity
jpayne@69 1046 if t == o:
jpayne@69 1047 return 0
jpayne@69 1048
jpayne@69 1049 cdef uint8_t *a = <uint8_t*>&t.core
jpayne@69 1050 cdef uint8_t *b = <uint8_t*>&o.core
jpayne@69 1051
jpayne@69 1052 retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
jpayne@69 1053 if retval:
jpayne@69 1054 return retval
jpayne@69 1055
jpayne@69 1056 # cmp(t.l_data, o.l_data)
jpayne@69 1057 retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
jpayne@69 1058 if retval:
jpayne@69 1059 return retval
jpayne@69 1060 return memcmp(t.data, o.data, t.l_data)
jpayne@69 1061
jpayne@69 1062 def __richcmp__(self, AlignedSegment other, int op):
jpayne@69 1063 if op == 2: # == operator
jpayne@69 1064 return self.compare(other) == 0
jpayne@69 1065 elif op == 3: # != operator
jpayne@69 1066 return self.compare(other) != 0
jpayne@69 1067 else:
jpayne@69 1068 return NotImplemented
jpayne@69 1069
jpayne@69 1070 def __hash__(self):
jpayne@69 1071 cdef bam1_t * src = self._delegate
jpayne@69 1072 cdef int x
jpayne@69 1073
jpayne@69 1074 # see http://effbot.org/zone/python-hash.htm
jpayne@69 1075 cdef uint8_t * c = <uint8_t *>&src.core
jpayne@69 1076 cdef uint32_t hash_value = c[0]
jpayne@69 1077 for x from 1 <= x < sizeof(bam1_core_t):
jpayne@69 1078 hash_value = c_mul(hash_value, 1000003) ^ c[x]
jpayne@69 1079 c = <uint8_t *>src.data
jpayne@69 1080 for x from 0 <= x < src.l_data:
jpayne@69 1081 hash_value = c_mul(hash_value, 1000003) ^ c[x]
jpayne@69 1082
jpayne@69 1083 return hash_value
jpayne@69 1084
jpayne@69 1085 cpdef to_string(self):
jpayne@69 1086 """returns a string representation of the aligned segment.
jpayne@69 1087
jpayne@69 1088 The output format is valid SAM format if a header is associated
jpayne@69 1089 with the AlignedSegment.
jpayne@69 1090 """
jpayne@69 1091 cdef kstring_t line
jpayne@69 1092 line.l = line.m = 0
jpayne@69 1093 line.s = NULL
jpayne@69 1094
jpayne@69 1095 if self.header:
jpayne@69 1096 if sam_format1(self.header.ptr, self._delegate, &line) < 0:
jpayne@69 1097 if line.m:
jpayne@69 1098 free(line.s)
jpayne@69 1099 raise ValueError('sam_format failed')
jpayne@69 1100 else:
jpayne@69 1101 raise NotImplementedError("todo")
jpayne@69 1102
jpayne@69 1103 ret = force_str(line.s[:line.l])
jpayne@69 1104
jpayne@69 1105 if line.m:
jpayne@69 1106 free(line.s)
jpayne@69 1107
jpayne@69 1108 return ret
jpayne@69 1109
jpayne@69 1110 @classmethod
jpayne@69 1111 def fromstring(cls, sam, AlignmentHeader header):
jpayne@69 1112 """parses a string representation of the aligned segment.
jpayne@69 1113
jpayne@69 1114 The input format should be valid SAM format.
jpayne@69 1115
jpayne@69 1116 Parameters
jpayne@69 1117 ----------
jpayne@69 1118 sam:
jpayne@69 1119 :term:`SAM` formatted string
jpayne@69 1120
jpayne@69 1121 """
jpayne@69 1122 cdef AlignedSegment dest = cls.__new__(cls)
jpayne@69 1123 dest._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
jpayne@69 1124 dest.header = header
jpayne@69 1125
jpayne@69 1126 cdef kstring_t line
jpayne@69 1127 line.l = line.m = len(sam)
jpayne@69 1128 _sam = force_bytes(sam)
jpayne@69 1129 line.s = _sam
jpayne@69 1130
jpayne@69 1131 cdef int ret
jpayne@69 1132 ret = sam_parse1(&line, dest.header.ptr, dest._delegate)
jpayne@69 1133 if ret < 0:
jpayne@69 1134 raise ValueError("parsing SAM record string failed (error code {})".format(ret))
jpayne@69 1135
jpayne@69 1136 return dest
jpayne@69 1137
jpayne@69 1138 cpdef tostring(self, htsfile=None):
jpayne@69 1139 """deprecated, use :meth:`to_string()` instead.
jpayne@69 1140
jpayne@69 1141 Parameters
jpayne@69 1142 ----------
jpayne@69 1143
jpayne@69 1144 htsfile:
jpayne@69 1145 (deprecated) AlignmentFile object to map numerical
jpayne@69 1146 identifiers to chromosome names. This parameter is present
jpayne@69 1147 for backwards compatibility and ignored.
jpayne@69 1148 """
jpayne@69 1149
jpayne@69 1150 return self.to_string()
jpayne@69 1151
jpayne@69 1152 def to_dict(self):
jpayne@69 1153 """returns a json representation of the aligned segment.
jpayne@69 1154
jpayne@69 1155 Field names are abbreviated versions of the class attributes.
jpayne@69 1156 """
jpayne@69 1157 # let htslib do the string conversions, but treat optional field properly as list
jpayne@69 1158 vals = self.to_string().split("\t")
jpayne@69 1159 n = len(KEY_NAMES) - 1
jpayne@69 1160 return dict(list(zip(KEY_NAMES[:-1], vals[:n])) + [(KEY_NAMES[-1], vals[n:])])
jpayne@69 1161
jpayne@69 1162 @classmethod
jpayne@69 1163 def from_dict(cls, sam_dict, AlignmentHeader header):
jpayne@69 1164 """parses a dictionary representation of the aligned segment.
jpayne@69 1165
jpayne@69 1166 Parameters
jpayne@69 1167 ----------
jpayne@69 1168 sam_dict:
jpayne@69 1169 dictionary of alignment values, keys corresponding to output from
jpayne@69 1170 :meth:`todict()`.
jpayne@69 1171
jpayne@69 1172 """
jpayne@69 1173 # let htslib do the parsing
jpayne@69 1174 # the tags field can be missing
jpayne@69 1175 return cls.fromstring(
jpayne@69 1176 "\t".join((sam_dict[x] for x in KEY_NAMES[:-1])) +
jpayne@69 1177 "\t" +
jpayne@69 1178 "\t".join(sam_dict.get(KEY_NAMES[-1], [])), header)
jpayne@69 1179
jpayne@69 1180 ########################################################
jpayne@69 1181 ## Basic attributes in order of appearance in SAM format
jpayne@69 1182 property query_name:
jpayne@69 1183 """the query template name (None if not present)"""
jpayne@69 1184 def __get__(self):
jpayne@69 1185
jpayne@69 1186 cdef bam1_t * src = self._delegate
jpayne@69 1187 if src.core.l_qname == 0:
jpayne@69 1188 return None
jpayne@69 1189
jpayne@69 1190 return charptr_to_str(<char *>pysam_bam_get_qname(src))
jpayne@69 1191
jpayne@69 1192 def __set__(self, qname):
jpayne@69 1193
jpayne@69 1194 if qname is None or len(qname) == 0:
jpayne@69 1195 return
jpayne@69 1196
jpayne@69 1197 if len(qname) > 254:
jpayne@69 1198 raise ValueError("query length out of range {} > 254".format(
jpayne@69 1199 len(qname)))
jpayne@69 1200
jpayne@69 1201 qname = force_bytes(qname)
jpayne@69 1202 cdef bam1_t * src = self._delegate
jpayne@69 1203 # the qname is \0 terminated
jpayne@69 1204 cdef uint8_t l = len(qname) + 1
jpayne@69 1205
jpayne@69 1206 cdef char * p = pysam_bam_get_qname(src)
jpayne@69 1207 cdef uint8_t l_extranul = 0
jpayne@69 1208
jpayne@69 1209 if l % 4 != 0:
jpayne@69 1210 l_extranul = 4 - l % 4
jpayne@69 1211
jpayne@69 1212 cdef bam1_t * retval = pysam_bam_update(src,
jpayne@69 1213 src.core.l_qname,
jpayne@69 1214 l + l_extranul,
jpayne@69 1215 <uint8_t*>p)
jpayne@69 1216 if retval == NULL:
jpayne@69 1217 raise MemoryError("could not allocate memory")
jpayne@69 1218
jpayne@69 1219 src.core.l_extranul = l_extranul
jpayne@69 1220 src.core.l_qname = l + l_extranul
jpayne@69 1221
jpayne@69 1222 # re-acquire pointer to location in memory
jpayne@69 1223 # as it might have moved
jpayne@69 1224 p = pysam_bam_get_qname(src)
jpayne@69 1225
jpayne@69 1226 strncpy(p, qname, l)
jpayne@69 1227 # x might be > 255
jpayne@69 1228 cdef uint16_t x = 0
jpayne@69 1229
jpayne@69 1230 for x from l <= x < l + l_extranul:
jpayne@69 1231 p[x] = b'\0'
jpayne@69 1232
jpayne@69 1233 property flag:
jpayne@69 1234 """properties flag"""
jpayne@69 1235 def __get__(self):
jpayne@69 1236 return self._delegate.core.flag
jpayne@69 1237 def __set__(self, flag):
jpayne@69 1238 self._delegate.core.flag = flag
jpayne@69 1239
jpayne@69 1240 property reference_name:
jpayne@69 1241 """:term:`reference` name"""
jpayne@69 1242 def __get__(self):
jpayne@69 1243 if self._delegate.core.tid == -1:
jpayne@69 1244 return None
jpayne@69 1245 if self.header:
jpayne@69 1246 return self.header.get_reference_name(self._delegate.core.tid)
jpayne@69 1247 else:
jpayne@69 1248 raise ValueError("reference_name unknown if no header associated with record")
jpayne@69 1249 def __set__(self, reference):
jpayne@69 1250 cdef int tid
jpayne@69 1251 if reference is None or reference == "*":
jpayne@69 1252 self._delegate.core.tid = -1
jpayne@69 1253 elif self.header:
jpayne@69 1254 tid = self.header.get_tid(reference)
jpayne@69 1255 if tid < 0:
jpayne@69 1256 raise ValueError("reference {} does not exist in header".format(
jpayne@69 1257 reference))
jpayne@69 1258 self._delegate.core.tid = tid
jpayne@69 1259 else:
jpayne@69 1260 raise ValueError("reference_name can not be set if no header associated with record")
jpayne@69 1261
jpayne@69 1262 property reference_id:
jpayne@69 1263 """:term:`reference` ID
jpayne@69 1264
jpayne@69 1265 .. note::
jpayne@69 1266
jpayne@69 1267 This field contains the index of the reference sequence in
jpayne@69 1268 the sequence dictionary. To obtain the name of the
jpayne@69 1269 reference sequence, use :meth:`get_reference_name()`
jpayne@69 1270
jpayne@69 1271 """
jpayne@69 1272 def __get__(self):
jpayne@69 1273 return self._delegate.core.tid
jpayne@69 1274 def __set__(self, tid):
jpayne@69 1275 if tid != -1 and self.header and not self.header.is_valid_tid(tid):
jpayne@69 1276 raise ValueError("reference id {} does not exist in header".format(
jpayne@69 1277 tid))
jpayne@69 1278 self._delegate.core.tid = tid
jpayne@69 1279
jpayne@69 1280 property reference_start:
jpayne@69 1281 """0-based leftmost coordinate"""
jpayne@69 1282 def __get__(self):
jpayne@69 1283 return self._delegate.core.pos
jpayne@69 1284 def __set__(self, pos):
jpayne@69 1285 ## setting the position requires updating the "bin" attribute
jpayne@69 1286 cdef bam1_t * src
jpayne@69 1287 src = self._delegate
jpayne@69 1288 src.core.pos = pos
jpayne@69 1289 update_bin(src)
jpayne@69 1290
jpayne@69 1291 property mapping_quality:
jpayne@69 1292 """mapping quality"""
jpayne@69 1293 def __get__(self):
jpayne@69 1294 return pysam_get_qual(self._delegate)
jpayne@69 1295 def __set__(self, qual):
jpayne@69 1296 pysam_set_qual(self._delegate, qual)
jpayne@69 1297
jpayne@69 1298 property cigarstring:
jpayne@69 1299 '''the :term:`cigar` alignment as a string.
jpayne@69 1300
jpayne@69 1301 The cigar string is a string of alternating integers
jpayne@69 1302 and characters denoting the length and the type of
jpayne@69 1303 an operation.
jpayne@69 1304
jpayne@69 1305 .. note::
jpayne@69 1306 The order length,operation is specified in the
jpayne@69 1307 SAM format. It is different from the order of
jpayne@69 1308 the :attr:`cigar` property.
jpayne@69 1309
jpayne@69 1310 Returns None if not present.
jpayne@69 1311
jpayne@69 1312 To unset the cigarstring, assign None or the
jpayne@69 1313 empty string.
jpayne@69 1314 '''
jpayne@69 1315 def __get__(self):
jpayne@69 1316 c = self.cigartuples
jpayne@69 1317 if c is None:
jpayne@69 1318 return None
jpayne@69 1319 # reverse order
jpayne@69 1320 else:
jpayne@69 1321 return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
jpayne@69 1322
jpayne@69 1323 def __set__(self, cigar):
jpayne@69 1324 if cigar is None or len(cigar) == 0:
jpayne@69 1325 self.cigartuples = []
jpayne@69 1326 else:
jpayne@69 1327 parts = CIGAR_REGEX.findall(cigar)
jpayne@69 1328 # reverse order
jpayne@69 1329 self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts]
jpayne@69 1330
jpayne@69 1331 # TODO
jpayne@69 1332 # property cigar:
jpayne@69 1333 # """the cigar alignment"""
jpayne@69 1334
jpayne@69 1335 property next_reference_id:
jpayne@69 1336 """the :term:`reference` id of the mate/next read."""
jpayne@69 1337 def __get__(self):
jpayne@69 1338 return self._delegate.core.mtid
jpayne@69 1339 def __set__(self, mtid):
jpayne@69 1340 if mtid != -1 and self.header and not self.header.is_valid_tid(mtid):
jpayne@69 1341 raise ValueError("reference id {} does not exist in header".format(
jpayne@69 1342 mtid))
jpayne@69 1343 self._delegate.core.mtid = mtid
jpayne@69 1344
jpayne@69 1345 property next_reference_name:
jpayne@69 1346 """:term:`reference` name of the mate/next read (None if no
jpayne@69 1347 AlignmentFile is associated)"""
jpayne@69 1348 def __get__(self):
jpayne@69 1349 if self._delegate.core.mtid == -1:
jpayne@69 1350 return None
jpayne@69 1351 if self.header:
jpayne@69 1352 return self.header.get_reference_name(self._delegate.core.mtid)
jpayne@69 1353 else:
jpayne@69 1354 raise ValueError("next_reference_name unknown if no header associated with record")
jpayne@69 1355
jpayne@69 1356 def __set__(self, reference):
jpayne@69 1357 cdef int mtid
jpayne@69 1358 if reference is None or reference == "*":
jpayne@69 1359 self._delegate.core.mtid = -1
jpayne@69 1360 elif reference == "=":
jpayne@69 1361 self._delegate.core.mtid = self._delegate.core.tid
jpayne@69 1362 elif self.header:
jpayne@69 1363 mtid = self.header.get_tid(reference)
jpayne@69 1364 if mtid < 0:
jpayne@69 1365 raise ValueError("reference {} does not exist in header".format(
jpayne@69 1366 reference))
jpayne@69 1367 self._delegate.core.mtid = mtid
jpayne@69 1368 else:
jpayne@69 1369 raise ValueError("next_reference_name can not be set if no header associated with record")
jpayne@69 1370
jpayne@69 1371 property next_reference_start:
jpayne@69 1372 """the position of the mate/next read."""
jpayne@69 1373 def __get__(self):
jpayne@69 1374 return self._delegate.core.mpos
jpayne@69 1375 def __set__(self, mpos):
jpayne@69 1376 self._delegate.core.mpos = mpos
jpayne@69 1377
jpayne@69 1378 property query_length:
jpayne@69 1379 """the length of the query/read.
jpayne@69 1380
jpayne@69 1381 This value corresponds to the length of the sequence supplied
jpayne@69 1382 in the BAM/SAM file. The length of a query is 0 if there is no
jpayne@69 1383 sequence in the BAM/SAM file. In those cases, the read length
jpayne@69 1384 can be inferred from the CIGAR alignment, see
jpayne@69 1385 :meth:`pysam.AlignedSegment.infer_query_length`.
jpayne@69 1386
jpayne@69 1387 The length includes soft-clipped bases and is equal to
jpayne@69 1388 ``len(query_sequence)``.
jpayne@69 1389
jpayne@69 1390 This property is read-only but is updated when a new query
jpayne@69 1391 sequence is assigned to this AlignedSegment.
jpayne@69 1392
jpayne@69 1393 Returns 0 if not available.
jpayne@69 1394
jpayne@69 1395 """
jpayne@69 1396 def __get__(self):
jpayne@69 1397 return self._delegate.core.l_qseq
jpayne@69 1398
jpayne@69 1399 property template_length:
jpayne@69 1400 """the observed query template length"""
jpayne@69 1401 def __get__(self):
jpayne@69 1402 return self._delegate.core.isize
jpayne@69 1403 def __set__(self, isize):
jpayne@69 1404 self._delegate.core.isize = isize
jpayne@69 1405
jpayne@69 1406 property query_sequence:
jpayne@69 1407 """read sequence bases, including :term:`soft clipped` bases
jpayne@69 1408 (None if not present).
jpayne@69 1409
jpayne@69 1410 Assigning to this attribute will invalidate any quality scores.
jpayne@69 1411 Thus, to in-place edit the sequence and quality scores, copies of
jpayne@69 1412 the quality scores need to be taken. Consider trimming for example::
jpayne@69 1413
jpayne@69 1414 q = read.query_qualities
jpayne@69 1415 read.query_sequence = read.query_sequence[5:10]
jpayne@69 1416 read.query_qualities = q[5:10]
jpayne@69 1417
jpayne@69 1418 The sequence is returned as it is stored in the BAM file. (This will
jpayne@69 1419 be the reverse complement of the original read sequence if the mapper
jpayne@69 1420 has aligned the read to the reverse strand.)
jpayne@69 1421 """
jpayne@69 1422 def __get__(self):
jpayne@69 1423 if self.cache_query_sequence:
jpayne@69 1424 return self.cache_query_sequence
jpayne@69 1425
jpayne@69 1426 cdef bam1_t * src
jpayne@69 1427 cdef char * s
jpayne@69 1428 src = self._delegate
jpayne@69 1429
jpayne@69 1430 if src.core.l_qseq == 0:
jpayne@69 1431 return None
jpayne@69 1432
jpayne@69 1433 self.cache_query_sequence = force_str(getSequenceInRange(
jpayne@69 1434 src, 0, src.core.l_qseq))
jpayne@69 1435 return self.cache_query_sequence
jpayne@69 1436
jpayne@69 1437 def __set__(self, seq):
jpayne@69 1438 # samtools manages sequence and quality length memory together
jpayne@69 1439 # if no quality information is present, the first byte says 0xff.
jpayne@69 1440 cdef bam1_t * src
jpayne@69 1441 cdef uint8_t * p
jpayne@69 1442 cdef char * s
jpayne@69 1443 cdef int l, k
jpayne@69 1444 cdef Py_ssize_t nbytes_new, nbytes_old
jpayne@69 1445
jpayne@69 1446 if seq == None:
jpayne@69 1447 l = 0
jpayne@69 1448 else:
jpayne@69 1449 l = len(seq)
jpayne@69 1450 seq = force_bytes(seq)
jpayne@69 1451
jpayne@69 1452 src = self._delegate
jpayne@69 1453
jpayne@69 1454 # as the sequence is stored in half-bytes, the total length (sequence
jpayne@69 1455 # plus quality scores) is (l+1)/2 + l
jpayne@69 1456 nbytes_new = (l + 1) // 2 + l
jpayne@69 1457 nbytes_old = (src.core.l_qseq + 1) // 2 + src.core.l_qseq
jpayne@69 1458
jpayne@69 1459 # acquire pointer to location in memory
jpayne@69 1460 p = pysam_bam_get_seq(src)
jpayne@69 1461 src.core.l_qseq = l
jpayne@69 1462
jpayne@69 1463 # change length of data field
jpayne@69 1464 cdef bam1_t * retval = pysam_bam_update(src,
jpayne@69 1465 nbytes_old,
jpayne@69 1466 nbytes_new,
jpayne@69 1467 p)
jpayne@69 1468
jpayne@69 1469 if retval == NULL:
jpayne@69 1470 raise MemoryError("could not allocate memory")
jpayne@69 1471
jpayne@69 1472 if l > 0:
jpayne@69 1473 # re-acquire pointer to location in memory
jpayne@69 1474 # as it might have moved
jpayne@69 1475 p = pysam_bam_get_seq(src)
jpayne@69 1476 for k from 0 <= k < nbytes_new:
jpayne@69 1477 p[k] = 0
jpayne@69 1478 # convert to C string
jpayne@69 1479 s = seq
jpayne@69 1480 for k from 0 <= k < l:
jpayne@69 1481 p[k // 2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
jpayne@69 1482
jpayne@69 1483 # erase qualities
jpayne@69 1484 p = pysam_bam_get_qual(src)
jpayne@69 1485 memset(p, 0xff, l)
jpayne@69 1486
jpayne@69 1487 self.cache_query_sequence = force_str(seq)
jpayne@69 1488
jpayne@69 1489 # clear cached values for quality values
jpayne@69 1490 self.cache_query_qualities = None
jpayne@69 1491 self.cache_query_alignment_qualities = None
jpayne@69 1492
jpayne@69 1493 property query_qualities:
jpayne@69 1494 """read sequence base qualities, including :term:`soft clipped` bases
jpayne@69 1495 (None if not present).
jpayne@69 1496
jpayne@69 1497 Quality scores are returned as a python array of unsigned
jpayne@69 1498 chars. Note that this is not the ASCII-encoded value typically
jpayne@69 1499 seen in FASTQ or SAM formatted files. Thus, no offset of 33
jpayne@69 1500 needs to be subtracted.
jpayne@69 1501
jpayne@69 1502 Note that to set quality scores the sequence has to be set
jpayne@69 1503 beforehand as this will determine the expected length of the
jpayne@69 1504 quality score array.
jpayne@69 1505
jpayne@69 1506 This method raises a ValueError if the length of the
jpayne@69 1507 quality scores and the sequence are not the same.
jpayne@69 1508
jpayne@69 1509 """
jpayne@69 1510 def __get__(self):
jpayne@69 1511
jpayne@69 1512 if self.cache_query_qualities:
jpayne@69 1513 return self.cache_query_qualities
jpayne@69 1514
jpayne@69 1515 cdef bam1_t * src
jpayne@69 1516 cdef char * q
jpayne@69 1517
jpayne@69 1518 src = self._delegate
jpayne@69 1519
jpayne@69 1520 if src.core.l_qseq == 0:
jpayne@69 1521 return None
jpayne@69 1522
jpayne@69 1523 self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
jpayne@69 1524 return self.cache_query_qualities
jpayne@69 1525
jpayne@69 1526 def __set__(self, qual):
jpayne@69 1527
jpayne@69 1528 # note that memory is already allocated via setting the sequence
jpayne@69 1529 # hence length match of sequence and quality needs is checked.
jpayne@69 1530 cdef bam1_t * src
jpayne@69 1531 cdef uint8_t * p
jpayne@69 1532 cdef int l
jpayne@69 1533
jpayne@69 1534 src = self._delegate
jpayne@69 1535 p = pysam_bam_get_qual(src)
jpayne@69 1536 if qual is None or len(qual) == 0:
jpayne@69 1537 # if absent and there is a sequence: set to 0xff
jpayne@69 1538 memset(p, 0xff, src.core.l_qseq)
jpayne@69 1539 return
jpayne@69 1540
jpayne@69 1541 # check for length match
jpayne@69 1542 l = len(qual)
jpayne@69 1543 if src.core.l_qseq != l:
jpayne@69 1544 raise ValueError(
jpayne@69 1545 "quality and sequence mismatch: %i != %i" %
jpayne@69 1546 (l, src.core.l_qseq))
jpayne@69 1547
jpayne@69 1548 # create a python array object filling it
jpayne@69 1549 # with the quality scores
jpayne@69 1550
jpayne@69 1551 # NB: should avoid this copying if qual is
jpayne@69 1552 # already of the correct type.
jpayne@69 1553 cdef c_array.array result = c_array.array('B', qual)
jpayne@69 1554
jpayne@69 1555 # copy data
jpayne@69 1556 memcpy(p, result.data.as_voidptr, l)
jpayne@69 1557
jpayne@69 1558 # save in cache
jpayne@69 1559 self.cache_query_qualities = qual
jpayne@69 1560
jpayne@69 1561 property bin:
jpayne@69 1562 """properties bin"""
jpayne@69 1563 def __get__(self):
jpayne@69 1564 return self._delegate.core.bin
jpayne@69 1565 def __set__(self, bin):
jpayne@69 1566 self._delegate.core.bin = bin
jpayne@69 1567
jpayne@69 1568
jpayne@69 1569 ##########################################################
jpayne@69 1570 # Derived simple attributes. These are simple attributes of
jpayne@69 1571 # AlignedSegment getting and setting values.
jpayne@69 1572 ##########################################################
jpayne@69 1573 # 1. Flags
jpayne@69 1574 ##########################################################
jpayne@69 1575 property is_paired:
jpayne@69 1576 """true if read is paired in sequencing"""
jpayne@69 1577 def __get__(self):
jpayne@69 1578 return (self.flag & BAM_FPAIRED) != 0
jpayne@69 1579 def __set__(self,val):
jpayne@69 1580 pysam_update_flag(self._delegate, val, BAM_FPAIRED)
jpayne@69 1581
jpayne@69 1582 property is_proper_pair:
jpayne@69 1583 """true if read is mapped in a proper pair"""
jpayne@69 1584 def __get__(self):
jpayne@69 1585 return (self.flag & BAM_FPROPER_PAIR) != 0
jpayne@69 1586 def __set__(self,val):
jpayne@69 1587 pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
jpayne@69 1588
jpayne@69 1589 property is_unmapped:
jpayne@69 1590 """true if read itself is unmapped"""
jpayne@69 1591 def __get__(self):
jpayne@69 1592 return (self.flag & BAM_FUNMAP) != 0
jpayne@69 1593 def __set__(self, val):
jpayne@69 1594 pysam_update_flag(self._delegate, val, BAM_FUNMAP)
jpayne@69 1595 # setting the unmapped flag requires recalculation of
jpayne@69 1596 # bin as alignment length is now implicitly 1
jpayne@69 1597 update_bin(self._delegate)
jpayne@69 1598
jpayne@69 1599 property is_mapped:
jpayne@69 1600 """true if read itself is mapped
jpayne@69 1601 (implemented in terms of :attr:`is_unmapped`)"""
jpayne@69 1602 def __get__(self):
jpayne@69 1603 return (self.flag & BAM_FUNMAP) == 0
jpayne@69 1604 def __set__(self, val):
jpayne@69 1605 pysam_update_flag(self._delegate, not val, BAM_FUNMAP)
jpayne@69 1606 update_bin(self._delegate)
jpayne@69 1607
jpayne@69 1608 property mate_is_unmapped:
jpayne@69 1609 """true if the mate is unmapped"""
jpayne@69 1610 def __get__(self):
jpayne@69 1611 return (self.flag & BAM_FMUNMAP) != 0
jpayne@69 1612 def __set__(self,val):
jpayne@69 1613 pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
jpayne@69 1614
jpayne@69 1615 property mate_is_mapped:
jpayne@69 1616 """true if the mate is mapped
jpayne@69 1617 (implemented in terms of :attr:`mate_is_unmapped`)"""
jpayne@69 1618 def __get__(self):
jpayne@69 1619 return (self.flag & BAM_FMUNMAP) == 0
jpayne@69 1620 def __set__(self,val):
jpayne@69 1621 pysam_update_flag(self._delegate, not val, BAM_FMUNMAP)
jpayne@69 1622
jpayne@69 1623 property is_reverse:
jpayne@69 1624 """true if read is mapped to reverse strand"""
jpayne@69 1625 def __get__(self):
jpayne@69 1626 return (self.flag & BAM_FREVERSE) != 0
jpayne@69 1627 def __set__(self,val):
jpayne@69 1628 pysam_update_flag(self._delegate, val, BAM_FREVERSE)
jpayne@69 1629
jpayne@69 1630 property is_forward:
jpayne@69 1631 """true if read is mapped to forward strand
jpayne@69 1632 (implemented in terms of :attr:`is_reverse`)"""
jpayne@69 1633 def __get__(self):
jpayne@69 1634 return (self.flag & BAM_FREVERSE) == 0
jpayne@69 1635 def __set__(self,val):
jpayne@69 1636 pysam_update_flag(self._delegate, not val, BAM_FREVERSE)
jpayne@69 1637
jpayne@69 1638 property mate_is_reverse:
jpayne@69 1639 """true if the mate is mapped to reverse strand"""
jpayne@69 1640 def __get__(self):
jpayne@69 1641 return (self.flag & BAM_FMREVERSE) != 0
jpayne@69 1642 def __set__(self,val):
jpayne@69 1643 pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
jpayne@69 1644
jpayne@69 1645 property mate_is_forward:
jpayne@69 1646 """true if the mate is mapped to forward strand
jpayne@69 1647 (implemented in terms of :attr:`mate_is_reverse`)"""
jpayne@69 1648 def __get__(self):
jpayne@69 1649 return (self.flag & BAM_FMREVERSE) == 0
jpayne@69 1650 def __set__(self,val):
jpayne@69 1651 pysam_update_flag(self._delegate, not val, BAM_FMREVERSE)
jpayne@69 1652
jpayne@69 1653 property is_read1:
jpayne@69 1654 """true if this is read1"""
jpayne@69 1655 def __get__(self):
jpayne@69 1656 return (self.flag & BAM_FREAD1) != 0
jpayne@69 1657 def __set__(self,val):
jpayne@69 1658 pysam_update_flag(self._delegate, val, BAM_FREAD1)
jpayne@69 1659 property is_read2:
jpayne@69 1660 """true if this is read2"""
jpayne@69 1661 def __get__(self):
jpayne@69 1662 return (self.flag & BAM_FREAD2) != 0
jpayne@69 1663 def __set__(self, val):
jpayne@69 1664 pysam_update_flag(self._delegate, val, BAM_FREAD2)
jpayne@69 1665 property is_secondary:
jpayne@69 1666 """true if not primary alignment"""
jpayne@69 1667 def __get__(self):
jpayne@69 1668 return (self.flag & BAM_FSECONDARY) != 0
jpayne@69 1669 def __set__(self, val):
jpayne@69 1670 pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
jpayne@69 1671 property is_qcfail:
jpayne@69 1672 """true if QC failure"""
jpayne@69 1673 def __get__(self):
jpayne@69 1674 return (self.flag & BAM_FQCFAIL) != 0
jpayne@69 1675 def __set__(self, val):
jpayne@69 1676 pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
jpayne@69 1677 property is_duplicate:
jpayne@69 1678 """true if optical or PCR duplicate"""
jpayne@69 1679 def __get__(self):
jpayne@69 1680 return (self.flag & BAM_FDUP) != 0
jpayne@69 1681 def __set__(self, val):
jpayne@69 1682 pysam_update_flag(self._delegate, val, BAM_FDUP)
jpayne@69 1683 property is_supplementary:
jpayne@69 1684 """true if this is a supplementary alignment"""
jpayne@69 1685 def __get__(self):
jpayne@69 1686 return (self.flag & BAM_FSUPPLEMENTARY) != 0
jpayne@69 1687 def __set__(self, val):
jpayne@69 1688 pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
jpayne@69 1689
jpayne@69 1690 # 2. Coordinates and lengths
jpayne@69 1691 property reference_end:
jpayne@69 1692 '''aligned reference position of the read on the reference genome.
jpayne@69 1693
jpayne@69 1694 reference_end points to one past the last aligned residue.
jpayne@69 1695 Returns None if not available (read is unmapped or no cigar
jpayne@69 1696 alignment present).
jpayne@69 1697
jpayne@69 1698 '''
jpayne@69 1699 def __get__(self):
jpayne@69 1700 cdef bam1_t * src
jpayne@69 1701 src = self._delegate
jpayne@69 1702 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
jpayne@69 1703 return None
jpayne@69 1704 return bam_endpos(src)
jpayne@69 1705
jpayne@69 1706 property reference_length:
jpayne@69 1707 '''aligned length of the read on the reference genome.
jpayne@69 1708
jpayne@69 1709 This is equal to `reference_end - reference_start`.
jpayne@69 1710 Returns None if not available.
jpayne@69 1711 '''
jpayne@69 1712 def __get__(self):
jpayne@69 1713 cdef bam1_t * src
jpayne@69 1714 src = self._delegate
jpayne@69 1715 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
jpayne@69 1716 return None
jpayne@69 1717 return bam_endpos(src) - \
jpayne@69 1718 self._delegate.core.pos
jpayne@69 1719
jpayne@69 1720 property query_alignment_sequence:
jpayne@69 1721 """aligned portion of the read.
jpayne@69 1722
jpayne@69 1723 This is a substring of :attr:`query_sequence` that excludes flanking
jpayne@69 1724 bases that were :term:`soft clipped` (None if not present). It
jpayne@69 1725 is equal to ``query_sequence[query_alignment_start:query_alignment_end]``.
jpayne@69 1726
jpayne@69 1727 SAM/BAM files may include extra flanking bases that are not
jpayne@69 1728 part of the alignment. These bases may be the result of the
jpayne@69 1729 Smith-Waterman or other algorithms, which may not require
jpayne@69 1730 alignments that begin at the first residue or end at the last.
jpayne@69 1731 In addition, extra sequencing adapters, multiplex identifiers,
jpayne@69 1732 and low-quality bases that were not considered for alignment
jpayne@69 1733 may have been retained.
jpayne@69 1734
jpayne@69 1735 """
jpayne@69 1736
jpayne@69 1737 def __get__(self):
jpayne@69 1738 if self.cache_query_alignment_sequence:
jpayne@69 1739 return self.cache_query_alignment_sequence
jpayne@69 1740
jpayne@69 1741 cdef bam1_t * src
jpayne@69 1742 cdef uint32_t start, end
jpayne@69 1743
jpayne@69 1744 src = self._delegate
jpayne@69 1745
jpayne@69 1746 if src.core.l_qseq == 0:
jpayne@69 1747 return None
jpayne@69 1748
jpayne@69 1749 start = getQueryStart(src)
jpayne@69 1750 end = getQueryEnd(src)
jpayne@69 1751
jpayne@69 1752 self.cache_query_alignment_sequence = force_str(
jpayne@69 1753 getSequenceInRange(src, start, end))
jpayne@69 1754 return self.cache_query_alignment_sequence
jpayne@69 1755
jpayne@69 1756 property query_alignment_qualities:
jpayne@69 1757 """aligned query sequence quality values (None if not present). These
jpayne@69 1758 are the quality values that correspond to
jpayne@69 1759 :attr:`query_alignment_sequence`, that is, they exclude qualities of
jpayne@69 1760 :term:`soft clipped` bases. This is equal to
jpayne@69 1761 ``query_qualities[query_alignment_start:query_alignment_end]``.
jpayne@69 1762
jpayne@69 1763 Quality scores are returned as a python array of unsigned
jpayne@69 1764 chars. Note that this is not the ASCII-encoded value typically
jpayne@69 1765 seen in FASTQ or SAM formatted files. Thus, no offset of 33
jpayne@69 1766 needs to be subtracted.
jpayne@69 1767
jpayne@69 1768 This property is read-only.
jpayne@69 1769
jpayne@69 1770 """
jpayne@69 1771 def __get__(self):
jpayne@69 1772
jpayne@69 1773 if self.cache_query_alignment_qualities:
jpayne@69 1774 return self.cache_query_alignment_qualities
jpayne@69 1775
jpayne@69 1776 cdef bam1_t * src
jpayne@69 1777 cdef uint32_t start, end
jpayne@69 1778
jpayne@69 1779 src = self._delegate
jpayne@69 1780
jpayne@69 1781 if src.core.l_qseq == 0:
jpayne@69 1782 return None
jpayne@69 1783
jpayne@69 1784 start = getQueryStart(src)
jpayne@69 1785 end = getQueryEnd(src)
jpayne@69 1786 self.cache_query_alignment_qualities = \
jpayne@69 1787 getQualitiesInRange(src, start, end)
jpayne@69 1788 return self.cache_query_alignment_qualities
jpayne@69 1789
jpayne@69 1790 property query_alignment_start:
jpayne@69 1791 """start index of the aligned query portion of the sequence (0-based,
jpayne@69 1792 inclusive).
jpayne@69 1793
jpayne@69 1794 This the index of the first base in :attr:`query_sequence`
jpayne@69 1795 that is not soft-clipped.
jpayne@69 1796 """
jpayne@69 1797 def __get__(self):
jpayne@69 1798 return getQueryStart(self._delegate)
jpayne@69 1799
jpayne@69 1800 property query_alignment_end:
jpayne@69 1801 """end index of the aligned query portion of the sequence (0-based,
jpayne@69 1802 exclusive)
jpayne@69 1803
jpayne@69 1804 This the index just past the last base in :attr:`query_sequence`
jpayne@69 1805 that is not soft-clipped.
jpayne@69 1806 """
jpayne@69 1807 def __get__(self):
jpayne@69 1808 return getQueryEnd(self._delegate)
jpayne@69 1809
jpayne@69 1810 property modified_bases:
jpayne@69 1811 """Modified bases annotations from Ml/Mm tags. The output is
jpayne@69 1812 Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...]
jpayne@69 1813 with qual being (256*probability), or -1 if unknown.
jpayne@69 1814 Strand==0 for forward and 1 for reverse strand modification
jpayne@69 1815 """
jpayne@69 1816 def __get__(self):
jpayne@69 1817 cdef bam1_t * src
jpayne@69 1818 cdef hts_base_mod_state *m = hts_base_mod_state_alloc()
jpayne@69 1819 cdef hts_base_mod mods[5]
jpayne@69 1820 cdef int pos
jpayne@69 1821
jpayne@69 1822 ret = {}
jpayne@69 1823 src = self._delegate
jpayne@69 1824
jpayne@69 1825 if bam_parse_basemod(src, m) < 0:
jpayne@69 1826 return None
jpayne@69 1827
jpayne@69 1828 n = bam_next_basemod(src, m, mods, 5, &pos)
jpayne@69 1829
jpayne@69 1830 while n>0:
jpayne@69 1831 for i in range(n):
jpayne@69 1832 mod_code = chr(mods[i].modified_base) if mods[i].modified_base>0 else -mods[i].modified_base
jpayne@69 1833 mod_strand = mods[i].strand
jpayne@69 1834 if self.is_reverse:
jpayne@69 1835 mod_strand = 1 - mod_strand
jpayne@69 1836 key = (chr(mods[i].canonical_base),
jpayne@69 1837 mod_strand,
jpayne@69 1838 mod_code )
jpayne@69 1839 ret.setdefault(key,[]).append((pos,mods[i].qual))
jpayne@69 1840
jpayne@69 1841 n = bam_next_basemod(src, m, mods, 5, &pos)
jpayne@69 1842
jpayne@69 1843 if n<0:
jpayne@69 1844 return None
jpayne@69 1845
jpayne@69 1846 hts_base_mod_state_free(m)
jpayne@69 1847 return ret
jpayne@69 1848
jpayne@69 1849 property modified_bases_forward:
jpayne@69 1850 """Modified bases annotations from Ml/Mm tags. The output is
jpayne@69 1851 Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...]
jpayne@69 1852 with qual being (256*probability), or -1 if unknown.
jpayne@69 1853 Strand==0 for forward and 1 for reverse strand modification.
jpayne@69 1854 The positions are with respect to the original sequence from get_forward_sequence()
jpayne@69 1855 """
jpayne@69 1856 def __get__(self):
jpayne@69 1857 pmods = self.modified_bases
jpayne@69 1858 if pmods and self.is_reverse:
jpayne@69 1859 rmod = {}
jpayne@69 1860
jpayne@69 1861 # Try to find the length of the original sequence
jpayne@69 1862 rlen = self.infer_read_length()
jpayne@69 1863 if rlen is None and self.query_sequence is None:
jpayne@69 1864 return rmod
jpayne@69 1865 else:
jpayne@69 1866 rlen = len(self.query_sequence)
jpayne@69 1867
jpayne@69 1868 for k,mods in pmods.items():
jpayne@69 1869 nk = k[0],1 - k[1],k[2]
jpayne@69 1870 for i in range(len(mods)):
jpayne@69 1871
jpayne@69 1872 mods[i] = (rlen - 1 -mods[i][0], mods[i][1])
jpayne@69 1873 rmod[nk] = mods
jpayne@69 1874 return rmod
jpayne@69 1875
jpayne@69 1876 return pmods
jpayne@69 1877
jpayne@69 1878
jpayne@69 1879 property query_alignment_length:
jpayne@69 1880 """length of the aligned query sequence.
jpayne@69 1881
jpayne@69 1882 This is equal to :attr:`query_alignment_end` -
jpayne@69 1883 :attr:`query_alignment_start`
jpayne@69 1884 """
jpayne@69 1885 def __get__(self):
jpayne@69 1886 cdef bam1_t * src
jpayne@69 1887 src = self._delegate
jpayne@69 1888 return getQueryEnd(src) - getQueryStart(src)
jpayne@69 1889
jpayne@69 1890 #####################################################
jpayne@69 1891 # Computed properties
jpayne@69 1892
jpayne@69 1893 def get_reference_positions(self, full_length=False):
jpayne@69 1894 """a list of reference positions that this read aligns to.
jpayne@69 1895
jpayne@69 1896 By default, this method returns the (0-based) positions on the
jpayne@69 1897 reference that are within the read's alignment, leaving gaps
jpayne@69 1898 corresponding to deletions and other reference skips.
jpayne@69 1899
jpayne@69 1900 When *full_length* is True, the returned list is the same length
jpayne@69 1901 as the read and additionally includes None values corresponding
jpayne@69 1902 to insertions or soft-clipping, i.e., to bases of the read that
jpayne@69 1903 are not aligned to a reference position.
jpayne@69 1904 (See also :meth:`get_aligned_pairs` which additionally returns
jpayne@69 1905 the corresponding positions along the read.)
jpayne@69 1906 """
jpayne@69 1907 cdef uint32_t k, i, l, pos
jpayne@69 1908 cdef int op
jpayne@69 1909 cdef uint32_t * cigar_p
jpayne@69 1910 cdef bam1_t * src
jpayne@69 1911 cdef bint _full = full_length
jpayne@69 1912
jpayne@69 1913 src = self._delegate
jpayne@69 1914 if pysam_get_n_cigar(src) == 0:
jpayne@69 1915 return []
jpayne@69 1916
jpayne@69 1917 result = []
jpayne@69 1918 pos = src.core.pos
jpayne@69 1919 cigar_p = pysam_bam_get_cigar(src)
jpayne@69 1920
jpayne@69 1921 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 1922 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 1923 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 1924
jpayne@69 1925 if op == BAM_CSOFT_CLIP or op == BAM_CINS:
jpayne@69 1926 if _full:
jpayne@69 1927 for i from 0 <= i < l:
jpayne@69 1928 result.append(None)
jpayne@69 1929 elif op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@69 1930 for i from pos <= i < pos + l:
jpayne@69 1931 result.append(i)
jpayne@69 1932 pos += l
jpayne@69 1933 elif op == BAM_CDEL or op == BAM_CREF_SKIP:
jpayne@69 1934 pos += l
jpayne@69 1935
jpayne@69 1936 return result
jpayne@69 1937
jpayne@69 1938 def infer_query_length(self, always=False):
jpayne@69 1939 """infer query length from CIGAR alignment.
jpayne@69 1940
jpayne@69 1941 This method deduces the query length from the CIGAR alignment
jpayne@69 1942 but does not include hard-clipped bases.
jpayne@69 1943
jpayne@69 1944 Returns None if CIGAR alignment is not present.
jpayne@69 1945
jpayne@69 1946 If *always* is set to True, `infer_read_length` is used instead.
jpayne@69 1947 This is deprecated and only present for backward compatibility.
jpayne@69 1948 """
jpayne@69 1949 if always is True:
jpayne@69 1950 return self.infer_read_length()
jpayne@69 1951 cdef int32_t l = calculateQueryLengthWithoutHardClipping(self._delegate)
jpayne@69 1952 if l > 0:
jpayne@69 1953 return l
jpayne@69 1954 else:
jpayne@69 1955 return None
jpayne@69 1956
jpayne@69 1957 def infer_read_length(self):
jpayne@69 1958 """infer read length from CIGAR alignment.
jpayne@69 1959
jpayne@69 1960 This method deduces the read length from the CIGAR alignment
jpayne@69 1961 including hard-clipped bases.
jpayne@69 1962
jpayne@69 1963 Returns None if CIGAR alignment is not present.
jpayne@69 1964 """
jpayne@69 1965 cdef int32_t l = calculateQueryLengthWithHardClipping(self._delegate)
jpayne@69 1966 if l > 0:
jpayne@69 1967 return l
jpayne@69 1968 else:
jpayne@69 1969 return None
jpayne@69 1970
jpayne@69 1971 def get_reference_sequence(self):
jpayne@69 1972 """return the reference sequence in the region that is covered by the
jpayne@69 1973 alignment of the read to the reference.
jpayne@69 1974
jpayne@69 1975 This method requires the MD tag to be set.
jpayne@69 1976
jpayne@69 1977 """
jpayne@69 1978 return force_str(build_reference_sequence(self._delegate))
jpayne@69 1979
jpayne@69 1980 def get_forward_sequence(self):
jpayne@69 1981 """return the original read sequence.
jpayne@69 1982
jpayne@69 1983 Reads mapped to the reverse strand are stored reverse complemented in
jpayne@69 1984 the BAM file. This method returns such reads reverse complemented back
jpayne@69 1985 to their original orientation.
jpayne@69 1986
jpayne@69 1987 Returns None if the record has no query sequence.
jpayne@69 1988 """
jpayne@69 1989 if self.query_sequence is None:
jpayne@69 1990 return None
jpayne@69 1991 s = force_str(self.query_sequence)
jpayne@69 1992 if self.is_reverse:
jpayne@69 1993 s = s.translate(str.maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1]
jpayne@69 1994 return s
jpayne@69 1995
jpayne@69 1996 def get_forward_qualities(self):
jpayne@69 1997 """return the original base qualities of the read sequence,
jpayne@69 1998 in the same format as the :attr:`query_qualities` property.
jpayne@69 1999
jpayne@69 2000 Reads mapped to the reverse strand have their base qualities stored
jpayne@69 2001 reversed in the BAM file. This method returns such reads' base qualities
jpayne@69 2002 reversed back to their original orientation.
jpayne@69 2003 """
jpayne@69 2004 if self.is_reverse:
jpayne@69 2005 return self.query_qualities[::-1]
jpayne@69 2006 else:
jpayne@69 2007 return self.query_qualities
jpayne@69 2008
jpayne@69 2009 def get_aligned_pairs(self, matches_only=False, with_seq=False, with_cigar=False):
jpayne@69 2010 """a list of aligned read (query) and reference positions.
jpayne@69 2011
jpayne@69 2012 Each item in the returned list is a tuple consisting of
jpayne@69 2013 the 0-based offset from the start of the read sequence
jpayne@69 2014 followed by the 0-based reference position.
jpayne@69 2015
jpayne@69 2016 For inserts, deletions, skipping either query or reference
jpayne@69 2017 position may be None.
jpayne@69 2018
jpayne@69 2019 For padding in the reference, the reference position will
jpayne@69 2020 always be None.
jpayne@69 2021
jpayne@69 2022 Parameters
jpayne@69 2023 ----------
jpayne@69 2024
jpayne@69 2025 matches_only : bool
jpayne@69 2026 If True, only matched bases are returned --- no None on either
jpayne@69 2027 side.
jpayne@69 2028 with_seq : bool
jpayne@69 2029 If True, return a third element in the tuple containing the
jpayne@69 2030 reference sequence. For CIGAR 'P' (padding in the reference)
jpayne@69 2031 operations, the third tuple element will be None. Substitutions
jpayne@69 2032 are lower-case. This option requires an MD tag to be present.
jpayne@69 2033 with_cigar : bool
jpayne@69 2034 If True, return an extra element in the tuple containing the
jpayne@69 2035 CIGAR operator corresponding to this position tuple.
jpayne@69 2036
jpayne@69 2037 Returns
jpayne@69 2038 -------
jpayne@69 2039
jpayne@69 2040 aligned_pairs : list of tuples
jpayne@69 2041
jpayne@69 2042 """
jpayne@69 2043 cdef uint32_t k, i, pos, qpos, r_idx, l
jpayne@69 2044 cdef int op
jpayne@69 2045 cdef uint32_t * cigar_p
jpayne@69 2046 cdef bam1_t * src = self._delegate
jpayne@69 2047 cdef bint _matches_only = bool(matches_only)
jpayne@69 2048 cdef bint _with_seq = bool(with_seq)
jpayne@69 2049 cdef bint _with_cigar = bool(with_cigar)
jpayne@69 2050 cdef object (*make_tuple)(object, object, object, uint32_t, int)
jpayne@69 2051
jpayne@69 2052 # TODO: this method performs no checking and assumes that
jpayne@69 2053 # read sequence, cigar and MD tag are consistent.
jpayne@69 2054
jpayne@69 2055 if _with_seq:
jpayne@69 2056 # force_str required for py2/py3 compatibility
jpayne@69 2057 ref_seq = force_str(build_reference_sequence(src))
jpayne@69 2058 if ref_seq is None:
jpayne@69 2059 raise ValueError("MD tag not present")
jpayne@69 2060 make_tuple = _alignedpairs_with_seq_cigar if _with_cigar else _alignedpairs_with_seq
jpayne@69 2061 else:
jpayne@69 2062 ref_seq = None
jpayne@69 2063 make_tuple = _alignedpairs_with_cigar if _with_cigar else _alignedpairs_positions
jpayne@69 2064
jpayne@69 2065 r_idx = 0
jpayne@69 2066
jpayne@69 2067 if pysam_get_n_cigar(src) == 0:
jpayne@69 2068 return []
jpayne@69 2069
jpayne@69 2070 result = []
jpayne@69 2071 pos = src.core.pos
jpayne@69 2072 qpos = 0
jpayne@69 2073 cigar_p = pysam_bam_get_cigar(src)
jpayne@69 2074 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 2075 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 2076 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 2077
jpayne@69 2078 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@69 2079 for i from pos <= i < pos + l:
jpayne@69 2080 result.append(make_tuple(qpos, i, ref_seq, r_idx, op))
jpayne@69 2081 r_idx += 1
jpayne@69 2082 qpos += 1
jpayne@69 2083 pos += l
jpayne@69 2084
jpayne@69 2085 elif op == BAM_CINS or op == BAM_CSOFT_CLIP or op == BAM_CPAD:
jpayne@69 2086 if not _matches_only:
jpayne@69 2087 for i from pos <= i < pos + l:
jpayne@69 2088 result.append(make_tuple(qpos, None, None, 0, op))
jpayne@69 2089 qpos += 1
jpayne@69 2090 else:
jpayne@69 2091 qpos += l
jpayne@69 2092
jpayne@69 2093 elif op == BAM_CDEL:
jpayne@69 2094 if not _matches_only:
jpayne@69 2095 for i from pos <= i < pos + l:
jpayne@69 2096 result.append(make_tuple(None, i, ref_seq, r_idx, op))
jpayne@69 2097 r_idx += 1
jpayne@69 2098 else:
jpayne@69 2099 r_idx += l
jpayne@69 2100 pos += l
jpayne@69 2101
jpayne@69 2102 elif op == BAM_CHARD_CLIP:
jpayne@69 2103 pass # advances neither
jpayne@69 2104
jpayne@69 2105 elif op == BAM_CREF_SKIP:
jpayne@69 2106 if not _matches_only:
jpayne@69 2107 for i from pos <= i < pos + l:
jpayne@69 2108 result.append(make_tuple(None, i, None, 0, op))
jpayne@69 2109
jpayne@69 2110 pos += l
jpayne@69 2111
jpayne@69 2112 return result
jpayne@69 2113
jpayne@69 2114 def get_blocks(self):
jpayne@69 2115 """ a list of start and end positions of
jpayne@69 2116 aligned gapless blocks.
jpayne@69 2117
jpayne@69 2118 The start and end positions are in genomic
jpayne@69 2119 coordinates.
jpayne@69 2120
jpayne@69 2121 Blocks are not normalized, i.e. two blocks
jpayne@69 2122 might be directly adjacent. This happens if
jpayne@69 2123 the two blocks are separated by an insertion
jpayne@69 2124 in the read.
jpayne@69 2125 """
jpayne@69 2126
jpayne@69 2127 cdef uint32_t k, pos, l
jpayne@69 2128 cdef int op
jpayne@69 2129 cdef uint32_t * cigar_p
jpayne@69 2130 cdef bam1_t * src
jpayne@69 2131
jpayne@69 2132 src = self._delegate
jpayne@69 2133 if pysam_get_n_cigar(src) == 0:
jpayne@69 2134 return []
jpayne@69 2135
jpayne@69 2136 result = []
jpayne@69 2137 pos = src.core.pos
jpayne@69 2138 cigar_p = pysam_bam_get_cigar(src)
jpayne@69 2139 l = 0
jpayne@69 2140
jpayne@69 2141 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 2142 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 2143 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 2144 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@69 2145 result.append((pos, pos + l))
jpayne@69 2146 pos += l
jpayne@69 2147 elif op == BAM_CDEL or op == BAM_CREF_SKIP:
jpayne@69 2148 pos += l
jpayne@69 2149
jpayne@69 2150 return result
jpayne@69 2151
jpayne@69 2152 def get_overlap(self, uint32_t start, uint32_t end):
jpayne@69 2153 """return number of aligned bases of read overlapping the interval
jpayne@69 2154 *start* and *end* on the reference sequence.
jpayne@69 2155
jpayne@69 2156 Return None if cigar alignment is not available.
jpayne@69 2157 """
jpayne@69 2158 cdef uint32_t k, i, pos, overlap
jpayne@69 2159 cdef int op, o
jpayne@69 2160 cdef uint32_t * cigar_p
jpayne@69 2161 cdef bam1_t * src
jpayne@69 2162
jpayne@69 2163 overlap = 0
jpayne@69 2164
jpayne@69 2165 src = self._delegate
jpayne@69 2166 if pysam_get_n_cigar(src) == 0:
jpayne@69 2167 return None
jpayne@69 2168 pos = src.core.pos
jpayne@69 2169 o = 0
jpayne@69 2170
jpayne@69 2171 cigar_p = pysam_bam_get_cigar(src)
jpayne@69 2172 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 2173 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 2174 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 2175
jpayne@69 2176 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@69 2177 o = min( pos + l, end) - max( pos, start )
jpayne@69 2178 if o > 0: overlap += o
jpayne@69 2179
jpayne@69 2180 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP or op == BAM_CEQUAL or op == BAM_CDIFF:
jpayne@69 2181 pos += l
jpayne@69 2182
jpayne@69 2183 return overlap
jpayne@69 2184
jpayne@69 2185 def get_cigar_stats(self):
jpayne@69 2186 """summary of operations in cigar string.
jpayne@69 2187
jpayne@69 2188 The output order in the array is "MIDNSHP=X" followed by a
jpayne@69 2189 field for the NM tag. If the NM tag is not present, this
jpayne@69 2190 field will always be 0. (Accessing this field via index -1
jpayne@69 2191 avoids changes if more CIGAR operators are added in future.)
jpayne@69 2192
jpayne@69 2193 +-----+----------------+--------+
jpayne@69 2194 |M |pysam.CMATCH |0 |
jpayne@69 2195 +-----+----------------+--------+
jpayne@69 2196 |I |pysam.CINS |1 |
jpayne@69 2197 +-----+----------------+--------+
jpayne@69 2198 |D |pysam.CDEL |2 |
jpayne@69 2199 +-----+----------------+--------+
jpayne@69 2200 |N |pysam.CREF_SKIP |3 |
jpayne@69 2201 +-----+----------------+--------+
jpayne@69 2202 |S |pysam.CSOFT_CLIP|4 |
jpayne@69 2203 +-----+----------------+--------+
jpayne@69 2204 |H |pysam.CHARD_CLIP|5 |
jpayne@69 2205 +-----+----------------+--------+
jpayne@69 2206 |P |pysam.CPAD |6 |
jpayne@69 2207 +-----+----------------+--------+
jpayne@69 2208 |= |pysam.CEQUAL |7 |
jpayne@69 2209 +-----+----------------+--------+
jpayne@69 2210 |X |pysam.CDIFF |8 |
jpayne@69 2211 +-----+----------------+--------+
jpayne@69 2212 |B |pysam.CBACK |9 |
jpayne@69 2213 +-----+----------------+--------+
jpayne@69 2214 |NM |NM tag |10 or -1|
jpayne@69 2215 +-----+----------------+--------+
jpayne@69 2216
jpayne@69 2217 If no cigar string is present, empty arrays will be returned.
jpayne@69 2218
jpayne@69 2219 Returns:
jpayne@69 2220 arrays :
jpayne@69 2221 two arrays. The first contains the nucleotide counts within
jpayne@69 2222 each cigar operation, the second contains the number of blocks
jpayne@69 2223 for each cigar operation.
jpayne@69 2224
jpayne@69 2225 """
jpayne@69 2226
jpayne@69 2227 cdef int nfields = NCIGAR_CODES + 1
jpayne@69 2228
jpayne@69 2229 cdef c_array.array base_counts = array.array(
jpayne@69 2230 "I",
jpayne@69 2231 [0] * nfields)
jpayne@69 2232 cdef uint32_t [:] base_view = base_counts
jpayne@69 2233 cdef c_array.array block_counts = array.array(
jpayne@69 2234 "I",
jpayne@69 2235 [0] * nfields)
jpayne@69 2236 cdef uint32_t [:] block_view = block_counts
jpayne@69 2237
jpayne@69 2238 cdef bam1_t * src = self._delegate
jpayne@69 2239 cdef int op
jpayne@69 2240 cdef uint32_t l
jpayne@69 2241 cdef int32_t k
jpayne@69 2242 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
jpayne@69 2243
jpayne@69 2244 if cigar_p == NULL:
jpayne@69 2245 return None
jpayne@69 2246
jpayne@69 2247 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 2248 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 2249 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 2250 base_view[op] += l
jpayne@69 2251 block_view[op] += 1
jpayne@69 2252
jpayne@69 2253 cdef uint8_t * v = bam_aux_get(src, 'NM')
jpayne@69 2254 if v != NULL:
jpayne@69 2255 base_view[nfields - 1] = <int32_t>bam_aux2i(v)
jpayne@69 2256
jpayne@69 2257 return base_counts, block_counts
jpayne@69 2258
jpayne@69 2259 #####################################################
jpayne@69 2260 ## Unsorted as yet
jpayne@69 2261 # TODO: capture in CIGAR object
jpayne@69 2262 property cigartuples:
jpayne@69 2263 """the :term:`cigar` alignment. The alignment
jpayne@69 2264 is returned as a list of tuples of (operation, length).
jpayne@69 2265
jpayne@69 2266 If the alignment is not present, None is returned.
jpayne@69 2267
jpayne@69 2268 The operations are:
jpayne@69 2269
jpayne@69 2270 +-----+----------------+-----+
jpayne@69 2271 |M |pysam.CMATCH |0 |
jpayne@69 2272 +-----+----------------+-----+
jpayne@69 2273 |I |pysam.CINS |1 |
jpayne@69 2274 +-----+----------------+-----+
jpayne@69 2275 |D |pysam.CDEL |2 |
jpayne@69 2276 +-----+----------------+-----+
jpayne@69 2277 |N |pysam.CREF_SKIP |3 |
jpayne@69 2278 +-----+----------------+-----+
jpayne@69 2279 |S |pysam.CSOFT_CLIP|4 |
jpayne@69 2280 +-----+----------------+-----+
jpayne@69 2281 |H |pysam.CHARD_CLIP|5 |
jpayne@69 2282 +-----+----------------+-----+
jpayne@69 2283 |P |pysam.CPAD |6 |
jpayne@69 2284 +-----+----------------+-----+
jpayne@69 2285 |= |pysam.CEQUAL |7 |
jpayne@69 2286 +-----+----------------+-----+
jpayne@69 2287 |X |pysam.CDIFF |8 |
jpayne@69 2288 +-----+----------------+-----+
jpayne@69 2289 |B |pysam.CBACK |9 |
jpayne@69 2290 +-----+----------------+-----+
jpayne@69 2291
jpayne@69 2292 .. note::
jpayne@69 2293 The output is a list of (operation, length) tuples, such as
jpayne@69 2294 ``[(0, 30)]``.
jpayne@69 2295 This is different from the SAM specification and
jpayne@69 2296 the :attr:`cigarstring` property, which uses a
jpayne@69 2297 (length, operation) order, for example: ``30M``.
jpayne@69 2298
jpayne@69 2299 To unset the cigar property, assign an empty list
jpayne@69 2300 or None.
jpayne@69 2301 """
jpayne@69 2302 def __get__(self):
jpayne@69 2303 cdef uint32_t * cigar_p
jpayne@69 2304 cdef bam1_t * src
jpayne@69 2305 cdef uint32_t op, l
jpayne@69 2306 cdef uint32_t k
jpayne@69 2307
jpayne@69 2308 src = self._delegate
jpayne@69 2309 if pysam_get_n_cigar(src) == 0:
jpayne@69 2310 return None
jpayne@69 2311
jpayne@69 2312 cigar = []
jpayne@69 2313
jpayne@69 2314 cigar_p = pysam_bam_get_cigar(src);
jpayne@69 2315 for k from 0 <= k < pysam_get_n_cigar(src):
jpayne@69 2316 op = cigar_p[k] & BAM_CIGAR_MASK
jpayne@69 2317 l = cigar_p[k] >> BAM_CIGAR_SHIFT
jpayne@69 2318 cigar.append((op, l))
jpayne@69 2319 return cigar
jpayne@69 2320
jpayne@69 2321 def __set__(self, values):
jpayne@69 2322 cdef uint32_t * p
jpayne@69 2323 cdef bam1_t * src
jpayne@69 2324 cdef op, l
jpayne@69 2325 cdef int k
jpayne@69 2326
jpayne@69 2327 k = 0
jpayne@69 2328
jpayne@69 2329 src = self._delegate
jpayne@69 2330
jpayne@69 2331 # get location of cigar string
jpayne@69 2332 p = pysam_bam_get_cigar(src)
jpayne@69 2333
jpayne@69 2334 # empty values for cigar string
jpayne@69 2335 if values is None:
jpayne@69 2336 values = []
jpayne@69 2337
jpayne@69 2338 cdef uint32_t ncigar = len(values)
jpayne@69 2339
jpayne@69 2340 cdef bam1_t * retval = pysam_bam_update(src,
jpayne@69 2341 pysam_get_n_cigar(src) * 4,
jpayne@69 2342 ncigar * 4,
jpayne@69 2343 <uint8_t*>p)
jpayne@69 2344
jpayne@69 2345 if retval == NULL:
jpayne@69 2346 raise MemoryError("could not allocate memory")
jpayne@69 2347
jpayne@69 2348 # length is number of cigar operations, not bytes
jpayne@69 2349 pysam_set_n_cigar(src, ncigar)
jpayne@69 2350
jpayne@69 2351 # re-acquire pointer to location in memory
jpayne@69 2352 # as it might have moved
jpayne@69 2353 p = pysam_bam_get_cigar(src)
jpayne@69 2354
jpayne@69 2355 # insert cigar operations
jpayne@69 2356 for op, l in values:
jpayne@69 2357 p[k] = l << BAM_CIGAR_SHIFT | op
jpayne@69 2358 k += 1
jpayne@69 2359
jpayne@69 2360 ## setting the cigar string requires updating the bin
jpayne@69 2361 update_bin(src)
jpayne@69 2362
jpayne@69 2363 cpdef set_tag(self,
jpayne@69 2364 tag,
jpayne@69 2365 value,
jpayne@69 2366 value_type=None,
jpayne@69 2367 replace=True):
jpayne@69 2368 """sets a particular field *tag* to *value* in the optional alignment
jpayne@69 2369 section.
jpayne@69 2370
jpayne@69 2371 *value_type* describes the type of *value* that is to entered
jpayne@69 2372 into the alignment record. It can be set explicitly to one of
jpayne@69 2373 the valid one-letter type codes. If unset, an appropriate type
jpayne@69 2374 will be chosen automatically based on the python type of
jpayne@69 2375 *value*.
jpayne@69 2376
jpayne@69 2377 An existing value of the same *tag* will be overwritten unless
jpayne@69 2378 *replace* is set to False. This is usually not recommended as a
jpayne@69 2379 tag may only appear once in the optional alignment section.
jpayne@69 2380
jpayne@69 2381 If *value* is `None`, the tag will be deleted.
jpayne@69 2382
jpayne@69 2383 This method accepts valid SAM specification value types, which
jpayne@69 2384 are::
jpayne@69 2385
jpayne@69 2386 A: printable char
jpayne@69 2387 i: signed int
jpayne@69 2388 f: float
jpayne@69 2389 Z: printable string
jpayne@69 2390 H: Byte array in hex format
jpayne@69 2391 B: Integer or numeric array
jpayne@69 2392
jpayne@69 2393 Additionally, it will accept the integer BAM types ('cCsSI')
jpayne@69 2394
jpayne@69 2395 For htslib compatibility, 'a' is synonymous with 'A' and the
jpayne@69 2396 method accepts a 'd' type code for a double precision float.
jpayne@69 2397
jpayne@69 2398 When deducing the type code by the python type of *value*, the
jpayne@69 2399 following mapping is applied::
jpayne@69 2400
jpayne@69 2401 i: python int
jpayne@69 2402 f: python float
jpayne@69 2403 Z: python str or bytes
jpayne@69 2404 B: python array.array, list or tuple
jpayne@69 2405
jpayne@69 2406 Note that a single character string will be output as 'Z' and
jpayne@69 2407 not 'A' as the former is the more general type.
jpayne@69 2408 """
jpayne@69 2409
jpayne@69 2410 cdef int value_size
jpayne@69 2411 cdef uint8_t tc
jpayne@69 2412 cdef uint8_t * value_ptr
jpayne@69 2413 cdef uint8_t *existing_ptr
jpayne@69 2414 cdef float float_value
jpayne@69 2415 cdef double double_value
jpayne@69 2416 cdef int32_t int32_t_value
jpayne@69 2417 cdef uint32_t uint32_t_value
jpayne@69 2418 cdef int16_t int16_t_value
jpayne@69 2419 cdef uint16_t uint16_t_value
jpayne@69 2420 cdef int8_t int8_t_value
jpayne@69 2421 cdef uint8_t uint8_t_value
jpayne@69 2422 cdef bam1_t * src = self._delegate
jpayne@69 2423 cdef char * _value_type
jpayne@69 2424 cdef c_array.array array_value
jpayne@69 2425 cdef object buffer
jpayne@69 2426
jpayne@69 2427 if len(tag) != 2:
jpayne@69 2428 raise ValueError('Invalid tag: %s' % tag)
jpayne@69 2429
jpayne@69 2430 tag = force_bytes(tag)
jpayne@69 2431 if replace:
jpayne@69 2432 existing_ptr = bam_aux_get(src, tag)
jpayne@69 2433 if existing_ptr:
jpayne@69 2434 bam_aux_del(src, existing_ptr)
jpayne@69 2435
jpayne@69 2436 # setting value to None deletes a tag
jpayne@69 2437 if value is None:
jpayne@69 2438 return
jpayne@69 2439
jpayne@69 2440 cdef uint8_t typecode = get_tag_typecode(value, value_type)
jpayne@69 2441 if typecode == 0:
jpayne@69 2442 raise ValueError("can't guess type or invalid type code specified: {} {}".format(
jpayne@69 2443 value, value_type))
jpayne@69 2444
jpayne@69 2445 # sam_format1 for typecasting
jpayne@69 2446 if typecode == b'Z':
jpayne@69 2447 value = force_bytes(value)
jpayne@69 2448 value_ptr = <uint8_t*><char*>value
jpayne@69 2449 value_size = len(value)+1
jpayne@69 2450 elif typecode == b'H':
jpayne@69 2451 # Note that hex tags are stored the very same
jpayne@69 2452 # way as Z string.s
jpayne@69 2453 value = force_bytes(value)
jpayne@69 2454 value_ptr = <uint8_t*><char*>value
jpayne@69 2455 value_size = len(value)+1
jpayne@69 2456 elif typecode == b'A' or typecode == b'a':
jpayne@69 2457 value = force_bytes(value)
jpayne@69 2458 value_ptr = <uint8_t*><char*>value
jpayne@69 2459 value_size = sizeof(char)
jpayne@69 2460 typecode = b'A'
jpayne@69 2461 elif typecode == b'i':
jpayne@69 2462 int32_t_value = value
jpayne@69 2463 value_ptr = <uint8_t*>&int32_t_value
jpayne@69 2464 value_size = sizeof(int32_t)
jpayne@69 2465 elif typecode == b'I':
jpayne@69 2466 uint32_t_value = value
jpayne@69 2467 value_ptr = <uint8_t*>&uint32_t_value
jpayne@69 2468 value_size = sizeof(uint32_t)
jpayne@69 2469 elif typecode == b's':
jpayne@69 2470 int16_t_value = value
jpayne@69 2471 value_ptr = <uint8_t*>&int16_t_value
jpayne@69 2472 value_size = sizeof(int16_t)
jpayne@69 2473 elif typecode == b'S':
jpayne@69 2474 uint16_t_value = value
jpayne@69 2475 value_ptr = <uint8_t*>&uint16_t_value
jpayne@69 2476 value_size = sizeof(uint16_t)
jpayne@69 2477 elif typecode == b'c':
jpayne@69 2478 int8_t_value = value
jpayne@69 2479 value_ptr = <uint8_t*>&int8_t_value
jpayne@69 2480 value_size = sizeof(int8_t)
jpayne@69 2481 elif typecode == b'C':
jpayne@69 2482 uint8_t_value = value
jpayne@69 2483 value_ptr = <uint8_t*>&uint8_t_value
jpayne@69 2484 value_size = sizeof(uint8_t)
jpayne@69 2485 elif typecode == b'd':
jpayne@69 2486 double_value = value
jpayne@69 2487 value_ptr = <uint8_t*>&double_value
jpayne@69 2488 value_size = sizeof(double)
jpayne@69 2489 elif typecode == b'f':
jpayne@69 2490 float_value = value
jpayne@69 2491 value_ptr = <uint8_t*>&float_value
jpayne@69 2492 value_size = sizeof(float)
jpayne@69 2493 elif typecode == b'B':
jpayne@69 2494 # the following goes through python, needs to be cleaned up
jpayne@69 2495 # pack array using struct
jpayne@69 2496 fmt, args = pack_tags([(tag, value, value_type)])
jpayne@69 2497
jpayne@69 2498 # remove tag and type code as set by bam_aux_append
jpayne@69 2499 # first four chars of format (<2sB)
jpayne@69 2500 fmt = '<' + fmt[4:]
jpayne@69 2501 # first two values to pack
jpayne@69 2502 args = args[2:]
jpayne@69 2503 value_size = struct.calcsize(fmt)
jpayne@69 2504 # buffer will be freed when object goes out of scope
jpayne@69 2505 buffer = ctypes.create_string_buffer(value_size)
jpayne@69 2506 struct.pack_into(fmt, buffer, 0, *args)
jpayne@69 2507 # bam_aux_append copies data from value_ptr
jpayne@69 2508 bam_aux_append(src,
jpayne@69 2509 tag,
jpayne@69 2510 typecode,
jpayne@69 2511 value_size,
jpayne@69 2512 <uint8_t*>buffer.raw)
jpayne@69 2513 return
jpayne@69 2514 else:
jpayne@69 2515 raise ValueError('unsupported value_type {} in set_option'.format(typecode))
jpayne@69 2516
jpayne@69 2517 bam_aux_append(src,
jpayne@69 2518 tag,
jpayne@69 2519 typecode,
jpayne@69 2520 value_size,
jpayne@69 2521 value_ptr)
jpayne@69 2522
jpayne@69 2523 cpdef has_tag(self, tag):
jpayne@69 2524 """returns true if the optional alignment section
jpayne@69 2525 contains a given *tag*."""
jpayne@69 2526 cdef uint8_t * v
jpayne@69 2527 cdef int nvalues
jpayne@69 2528 btag = force_bytes(tag)
jpayne@69 2529 v = bam_aux_get(self._delegate, btag)
jpayne@69 2530 return v != NULL
jpayne@69 2531
jpayne@69 2532 cpdef get_tag(self, tag, with_value_type=False):
jpayne@69 2533 """
jpayne@69 2534 retrieves data from the optional alignment section
jpayne@69 2535 given a two-letter *tag* denoting the field.
jpayne@69 2536
jpayne@69 2537 The returned value is cast into an appropriate python type.
jpayne@69 2538
jpayne@69 2539 This method is the fastest way to access the optional
jpayne@69 2540 alignment section if only few tags need to be retrieved.
jpayne@69 2541
jpayne@69 2542 Possible value types are "AcCsSiIfZHB" (see BAM format
jpayne@69 2543 specification) as well as additional value type 'd' as
jpayne@69 2544 implemented in htslib.
jpayne@69 2545
jpayne@69 2546 Parameters:
jpayne@69 2547
jpayne@69 2548 tag :
jpayne@69 2549 data tag.
jpayne@69 2550
jpayne@69 2551 with_value_type : Optional[bool]
jpayne@69 2552 if set to True, the return value is a tuple of (tag value, type
jpayne@69 2553 code). (default False)
jpayne@69 2554
jpayne@69 2555 Returns:
jpayne@69 2556
jpayne@69 2557 A python object with the value of the `tag`. The type of the
jpayne@69 2558 object depends on the data type in the data record.
jpayne@69 2559
jpayne@69 2560 Raises:
jpayne@69 2561
jpayne@69 2562 KeyError
jpayne@69 2563 If `tag` is not present, a KeyError is raised.
jpayne@69 2564
jpayne@69 2565 """
jpayne@69 2566 cdef uint8_t * v
jpayne@69 2567 cdef int nvalues
jpayne@69 2568 btag = force_bytes(tag)
jpayne@69 2569 v = bam_aux_get(self._delegate, btag)
jpayne@69 2570 if v == NULL:
jpayne@69 2571 raise KeyError("tag '%s' not present" % tag)
jpayne@69 2572 if chr(v[0]) == "B":
jpayne@69 2573 auxtype = chr(v[0]) + chr(v[1])
jpayne@69 2574 else:
jpayne@69 2575 auxtype = chr(v[0])
jpayne@69 2576
jpayne@69 2577 if auxtype in "iIcCsS":
jpayne@69 2578 value = bam_aux2i(v)
jpayne@69 2579 elif auxtype == 'f' or auxtype == 'F':
jpayne@69 2580 value = bam_aux2f(v)
jpayne@69 2581 elif auxtype == 'd' or auxtype == 'D':
jpayne@69 2582 value = bam_aux2f(v)
jpayne@69 2583 elif auxtype == 'A' or auxtype == 'a':
jpayne@69 2584 # force A to a
jpayne@69 2585 v[0] = b'A'
jpayne@69 2586 # there might a more efficient way
jpayne@69 2587 # to convert a char into a string
jpayne@69 2588 value = '%c' % <char>bam_aux2A(v)
jpayne@69 2589 elif auxtype == 'Z' or auxtype == 'H':
jpayne@69 2590 # Z and H are treated equally as strings in htslib
jpayne@69 2591 value = charptr_to_str(<char*>bam_aux2Z(v))
jpayne@69 2592 elif auxtype[0] == 'B':
jpayne@69 2593 bytesize, nvalues, values = convert_binary_tag(v + 1)
jpayne@69 2594 value = values
jpayne@69 2595 else:
jpayne@69 2596 raise ValueError("unknown auxiliary type '%s'" % auxtype)
jpayne@69 2597
jpayne@69 2598 if with_value_type:
jpayne@69 2599 return (value, auxtype)
jpayne@69 2600 else:
jpayne@69 2601 return value
jpayne@69 2602
jpayne@69 2603 def get_tags(self, with_value_type=False):
jpayne@69 2604 """the fields in the optional alignment section.
jpayne@69 2605
jpayne@69 2606 Returns a list of all fields in the optional
jpayne@69 2607 alignment section. Values are converted to appropriate python
jpayne@69 2608 values. For example: ``[(NM, 2), (RG, "GJP00TM04")]``
jpayne@69 2609
jpayne@69 2610 If *with_value_type* is set, the value type as encode in
jpayne@69 2611 the AlignedSegment record will be returned as well:
jpayne@69 2612
jpayne@69 2613 [(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
jpayne@69 2614
jpayne@69 2615 This method will convert all values in the optional alignment
jpayne@69 2616 section. When getting only one or few tags, please see
jpayne@69 2617 :meth:`get_tag` for a quicker way to achieve this.
jpayne@69 2618
jpayne@69 2619 """
jpayne@69 2620
jpayne@69 2621 cdef char * ctag
jpayne@69 2622 cdef bam1_t * src
jpayne@69 2623 cdef uint8_t * s
jpayne@69 2624 cdef char auxtag[3]
jpayne@69 2625 cdef char auxtype
jpayne@69 2626 cdef uint8_t byte_size
jpayne@69 2627 cdef int32_t nvalues
jpayne@69 2628
jpayne@69 2629 src = self._delegate
jpayne@69 2630 if src.l_data == 0:
jpayne@69 2631 return []
jpayne@69 2632 s = pysam_bam_get_aux(src)
jpayne@69 2633 result = []
jpayne@69 2634 auxtag[2] = 0
jpayne@69 2635 while s < (src.data + src.l_data):
jpayne@69 2636 # get tag
jpayne@69 2637 auxtag[0] = s[0]
jpayne@69 2638 auxtag[1] = s[1]
jpayne@69 2639 s += 2
jpayne@69 2640 auxtype = s[0]
jpayne@69 2641 if auxtype in (b'c', b'C'):
jpayne@69 2642 value = <int>bam_aux2i(s)
jpayne@69 2643 s += 1
jpayne@69 2644 elif auxtype in (b's', b'S'):
jpayne@69 2645 value = <int>bam_aux2i(s)
jpayne@69 2646 s += 2
jpayne@69 2647 elif auxtype in (b'i', b'I'):
jpayne@69 2648 value = <int32_t>bam_aux2i(s)
jpayne@69 2649 s += 4
jpayne@69 2650 elif auxtype == b'f':
jpayne@69 2651 value = <float>bam_aux2f(s)
jpayne@69 2652 s += 4
jpayne@69 2653 elif auxtype == b'd':
jpayne@69 2654 value = <double>bam_aux2f(s)
jpayne@69 2655 s += 8
jpayne@69 2656 elif auxtype in (b'A', b'a'):
jpayne@69 2657 value = "%c" % <char>bam_aux2A(s)
jpayne@69 2658 s += 1
jpayne@69 2659 elif auxtype in (b'Z', b'H'):
jpayne@69 2660 value = charptr_to_str(<char*>bam_aux2Z(s))
jpayne@69 2661 # +1 for NULL terminated string
jpayne@69 2662 s += len(value) + 1
jpayne@69 2663 elif auxtype == b'B':
jpayne@69 2664 s += 1
jpayne@69 2665 byte_size, nvalues, value = convert_binary_tag(s)
jpayne@69 2666 # 5 for 1 char and 1 int
jpayne@69 2667 s += 5 + (nvalues * byte_size) - 1
jpayne@69 2668 else:
jpayne@69 2669 raise KeyError("unknown type '%s'" % auxtype)
jpayne@69 2670
jpayne@69 2671 s += 1
jpayne@69 2672
jpayne@69 2673 if with_value_type:
jpayne@69 2674 result.append((charptr_to_str(auxtag), value, chr(auxtype)))
jpayne@69 2675 else:
jpayne@69 2676 result.append((charptr_to_str(auxtag), value))
jpayne@69 2677
jpayne@69 2678 return result
jpayne@69 2679
jpayne@69 2680 def set_tags(self, tags):
jpayne@69 2681 """sets the fields in the optional alignment section with
jpayne@69 2682 a list of (tag, value) tuples.
jpayne@69 2683
jpayne@69 2684 The value type of the values is determined from the
jpayne@69 2685 python type. Optionally, a type may be given explicitly as
jpayne@69 2686 a third value in the tuple, For example:
jpayne@69 2687
jpayne@69 2688 x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
jpayne@69 2689
jpayne@69 2690 This method will not enforce the rule that the same tag may appear
jpayne@69 2691 only once in the optional alignment section.
jpayne@69 2692 """
jpayne@69 2693
jpayne@69 2694 cdef bam1_t * src
jpayne@69 2695 cdef uint8_t * s
jpayne@69 2696 cdef char * temp
jpayne@69 2697 cdef int new_size = 0
jpayne@69 2698 cdef int old_size
jpayne@69 2699 src = self._delegate
jpayne@69 2700
jpayne@69 2701 # convert and pack the data
jpayne@69 2702 if tags is not None and len(tags) > 0:
jpayne@69 2703 fmt, args = pack_tags(tags)
jpayne@69 2704 new_size = struct.calcsize(fmt)
jpayne@69 2705 buffer = ctypes.create_string_buffer(new_size)
jpayne@69 2706 struct.pack_into(fmt,
jpayne@69 2707 buffer,
jpayne@69 2708 0,
jpayne@69 2709 *args)
jpayne@69 2710
jpayne@69 2711
jpayne@69 2712 # delete the old data and allocate new space.
jpayne@69 2713 # If total_size == 0, the aux field will be
jpayne@69 2714 # empty
jpayne@69 2715 old_size = pysam_bam_get_l_aux(src)
jpayne@69 2716 cdef bam1_t * retval = pysam_bam_update(src,
jpayne@69 2717 old_size,
jpayne@69 2718 new_size,
jpayne@69 2719 pysam_bam_get_aux(src))
jpayne@69 2720 if retval == NULL:
jpayne@69 2721 raise MemoryError("could not allocate memory")
jpayne@69 2722
jpayne@69 2723 # copy data only if there is any
jpayne@69 2724 if new_size > 0:
jpayne@69 2725
jpayne@69 2726 # get location of new data
jpayne@69 2727 s = pysam_bam_get_aux(src)
jpayne@69 2728
jpayne@69 2729 # check if there is direct path from buffer.raw to tmp
jpayne@69 2730 p = buffer.raw
jpayne@69 2731 # create handle to make sure buffer stays alive long
jpayne@69 2732 # enough for memcpy, see issue 129
jpayne@69 2733 temp = p
jpayne@69 2734 memcpy(s, temp, new_size)
jpayne@69 2735
jpayne@69 2736
jpayne@69 2737 ########################################################
jpayne@69 2738 # Compatibility Accessors
jpayne@69 2739 # Functions, properties for compatibility with pysam < 0.8
jpayne@69 2740 #
jpayne@69 2741 # Several options
jpayne@69 2742 # change the factory functions according to API
jpayne@69 2743 # * requires code changes throughout, incl passing
jpayne@69 2744 # handles to factory functions
jpayne@69 2745 # subclass functions and add attributes at runtime
jpayne@69 2746 # e.g.: AlignedSegments.qname = AlignedSegments.query_name
jpayne@69 2747 # * will slow down the default interface
jpayne@69 2748 # explicit declaration of getters/setters
jpayne@69 2749 ########################################################
jpayne@69 2750 property qname:
jpayne@69 2751 """deprecated, use :attr:`query_name` instead."""
jpayne@69 2752 def __get__(self): return self.query_name
jpayne@69 2753 def __set__(self, v): self.query_name = v
jpayne@69 2754 property tid:
jpayne@69 2755 """deprecated, use :attr:`reference_id` instead."""
jpayne@69 2756 def __get__(self): return self.reference_id
jpayne@69 2757 def __set__(self, v): self.reference_id = v
jpayne@69 2758 property pos:
jpayne@69 2759 """deprecated, use :attr:`reference_start` instead."""
jpayne@69 2760 def __get__(self): return self.reference_start
jpayne@69 2761 def __set__(self, v): self.reference_start = v
jpayne@69 2762 property mapq:
jpayne@69 2763 """deprecated, use :attr:`mapping_quality` instead."""
jpayne@69 2764 def __get__(self): return self.mapping_quality
jpayne@69 2765 def __set__(self, v): self.mapping_quality = v
jpayne@69 2766 property rnext:
jpayne@69 2767 """deprecated, use :attr:`next_reference_id` instead."""
jpayne@69 2768 def __get__(self): return self.next_reference_id
jpayne@69 2769 def __set__(self, v): self.next_reference_id = v
jpayne@69 2770 property pnext:
jpayne@69 2771 """deprecated, use :attr:`next_reference_start` instead."""
jpayne@69 2772 def __get__(self):
jpayne@69 2773 return self.next_reference_start
jpayne@69 2774 def __set__(self, v):
jpayne@69 2775 self.next_reference_start = v
jpayne@69 2776 property cigar:
jpayne@69 2777 """deprecated, use :attr:`cigarstring` or :attr:`cigartuples` instead."""
jpayne@69 2778 def __get__(self):
jpayne@69 2779 r = self.cigartuples
jpayne@69 2780 if r is None:
jpayne@69 2781 r = []
jpayne@69 2782 return r
jpayne@69 2783 def __set__(self, v): self.cigartuples = v
jpayne@69 2784 property tlen:
jpayne@69 2785 """deprecated, use :attr:`template_length` instead."""
jpayne@69 2786 def __get__(self):
jpayne@69 2787 return self.template_length
jpayne@69 2788 def __set__(self, v):
jpayne@69 2789 self.template_length = v
jpayne@69 2790 property seq:
jpayne@69 2791 """deprecated, use :attr:`query_sequence` instead."""
jpayne@69 2792 def __get__(self):
jpayne@69 2793 return self.query_sequence
jpayne@69 2794 def __set__(self, v):
jpayne@69 2795 self.query_sequence = v
jpayne@69 2796 property qual:
jpayne@69 2797 """deprecated, use :attr:`query_qualities` instead."""
jpayne@69 2798 def __get__(self):
jpayne@69 2799 return array_to_qualitystring(self.query_qualities)
jpayne@69 2800 def __set__(self, v):
jpayne@69 2801 self.query_qualities = qualitystring_to_array(v)
jpayne@69 2802 property alen:
jpayne@69 2803 """deprecated, use :attr:`reference_length` instead."""
jpayne@69 2804 def __get__(self):
jpayne@69 2805 return self.reference_length
jpayne@69 2806 def __set__(self, v):
jpayne@69 2807 self.reference_length = v
jpayne@69 2808 property aend:
jpayne@69 2809 """deprecated, use :attr:`reference_end` instead."""
jpayne@69 2810 def __get__(self):
jpayne@69 2811 return self.reference_end
jpayne@69 2812 def __set__(self, v):
jpayne@69 2813 self.reference_end = v
jpayne@69 2814 property rlen:
jpayne@69 2815 """deprecated, use :attr:`query_length` instead."""
jpayne@69 2816 def __get__(self):
jpayne@69 2817 return self.query_length
jpayne@69 2818 def __set__(self, v):
jpayne@69 2819 self.query_length = v
jpayne@69 2820 property query:
jpayne@69 2821 """deprecated, use :attr:`query_alignment_sequence`
jpayne@69 2822 instead."""
jpayne@69 2823 def __get__(self):
jpayne@69 2824 return self.query_alignment_sequence
jpayne@69 2825 def __set__(self, v):
jpayne@69 2826 self.query_alignment_sequence = v
jpayne@69 2827 property qqual:
jpayne@69 2828 """deprecated, use :attr:`query_alignment_qualities`
jpayne@69 2829 instead."""
jpayne@69 2830 def __get__(self):
jpayne@69 2831 return array_to_qualitystring(self.query_alignment_qualities)
jpayne@69 2832 def __set__(self, v):
jpayne@69 2833 self.query_alignment_qualities = qualitystring_to_array(v)
jpayne@69 2834 property qstart:
jpayne@69 2835 """deprecated, use :attr:`query_alignment_start` instead."""
jpayne@69 2836 def __get__(self):
jpayne@69 2837 return self.query_alignment_start
jpayne@69 2838 def __set__(self, v):
jpayne@69 2839 self.query_alignment_start = v
jpayne@69 2840 property qend:
jpayne@69 2841 """deprecated, use :attr:`query_alignment_end` instead."""
jpayne@69 2842 def __get__(self):
jpayne@69 2843 return self.query_alignment_end
jpayne@69 2844 def __set__(self, v):
jpayne@69 2845 self.query_alignment_end = v
jpayne@69 2846 property qlen:
jpayne@69 2847 """deprecated, use :attr:`query_alignment_length`
jpayne@69 2848 instead."""
jpayne@69 2849 def __get__(self):
jpayne@69 2850 return self.query_alignment_length
jpayne@69 2851 def __set__(self, v):
jpayne@69 2852 self.query_alignment_length = v
jpayne@69 2853 property mrnm:
jpayne@69 2854 """deprecated, use :attr:`next_reference_id` instead."""
jpayne@69 2855 def __get__(self):
jpayne@69 2856 return self.next_reference_id
jpayne@69 2857 def __set__(self, v):
jpayne@69 2858 self.next_reference_id = v
jpayne@69 2859 property mpos:
jpayne@69 2860 """deprecated, use :attr:`next_reference_start`
jpayne@69 2861 instead."""
jpayne@69 2862 def __get__(self):
jpayne@69 2863 return self.next_reference_start
jpayne@69 2864 def __set__(self, v):
jpayne@69 2865 self.next_reference_start = v
jpayne@69 2866 property rname:
jpayne@69 2867 """deprecated, use :attr:`reference_id` instead."""
jpayne@69 2868 def __get__(self):
jpayne@69 2869 return self.reference_id
jpayne@69 2870 def __set__(self, v):
jpayne@69 2871 self.reference_id = v
jpayne@69 2872 property isize:
jpayne@69 2873 """deprecated, use :attr:`template_length` instead."""
jpayne@69 2874 def __get__(self):
jpayne@69 2875 return self.template_length
jpayne@69 2876 def __set__(self, v):
jpayne@69 2877 self.template_length = v
jpayne@69 2878 property blocks:
jpayne@69 2879 """deprecated, use :meth:`get_blocks()` instead."""
jpayne@69 2880 def __get__(self):
jpayne@69 2881 return self.get_blocks()
jpayne@69 2882 property aligned_pairs:
jpayne@69 2883 """deprecated, use :meth:`get_aligned_pairs()` instead."""
jpayne@69 2884 def __get__(self):
jpayne@69 2885 return self.get_aligned_pairs()
jpayne@69 2886 property inferred_length:
jpayne@69 2887 """deprecated, use :meth:`infer_query_length()` instead."""
jpayne@69 2888 def __get__(self):
jpayne@69 2889 return self.infer_query_length()
jpayne@69 2890 property positions:
jpayne@69 2891 """deprecated, use :meth:`get_reference_positions()` instead."""
jpayne@69 2892 def __get__(self):
jpayne@69 2893 return self.get_reference_positions()
jpayne@69 2894 property tags:
jpayne@69 2895 """deprecated, use :meth:`get_tags()` instead."""
jpayne@69 2896 def __get__(self):
jpayne@69 2897 return self.get_tags()
jpayne@69 2898 def __set__(self, tags):
jpayne@69 2899 self.set_tags(tags)
jpayne@69 2900 def overlap(self):
jpayne@69 2901 """deprecated, use :meth:`get_overlap()` instead."""
jpayne@69 2902 return self.get_overlap()
jpayne@69 2903 def opt(self, tag):
jpayne@69 2904 """deprecated, use :meth:`get_tag()` instead."""
jpayne@69 2905 return self.get_tag(tag)
jpayne@69 2906 def setTag(self, tag, value, value_type=None, replace=True):
jpayne@69 2907 """deprecated, use :meth:`set_tag()` instead."""
jpayne@69 2908 return self.set_tag(tag, value, value_type, replace)
jpayne@69 2909
jpayne@69 2910
jpayne@69 2911 cdef class PileupColumn:
jpayne@69 2912 '''A pileup of reads at a particular reference sequence position
jpayne@69 2913 (:term:`column`). A pileup column contains all the reads that map
jpayne@69 2914 to a certain target base.
jpayne@69 2915
jpayne@69 2916 This class is a proxy for results returned by the samtools pileup
jpayne@69 2917 engine. If the underlying engine iterator advances, the results
jpayne@69 2918 of this column will change.
jpayne@69 2919 '''
jpayne@69 2920 def __init__(self):
jpayne@69 2921 raise TypeError("this class cannot be instantiated from Python")
jpayne@69 2922
jpayne@69 2923 def __str__(self):
jpayne@69 2924 return "\t".join(map(str,
jpayne@69 2925 (self.reference_id,
jpayne@69 2926 self.reference_pos,
jpayne@69 2927 self.nsegments))) +\
jpayne@69 2928 "\n" +\
jpayne@69 2929 "\n".join(map(str, self.pileups))
jpayne@69 2930
jpayne@69 2931 def __dealloc__(self):
jpayne@69 2932 free(self.buf.s)
jpayne@69 2933
jpayne@69 2934 def set_min_base_quality(self, min_base_quality):
jpayne@69 2935 """set the minimum base quality for this pileup column.
jpayne@69 2936 """
jpayne@69 2937 self.min_base_quality = min_base_quality
jpayne@69 2938
jpayne@69 2939 def __len__(self):
jpayne@69 2940 """return number of reads aligned to this column.
jpayne@69 2941
jpayne@69 2942 see :meth:`get_num_aligned`
jpayne@69 2943 """
jpayne@69 2944 return self.get_num_aligned()
jpayne@69 2945
jpayne@69 2946 property reference_id:
jpayne@69 2947 '''the reference sequence number as defined in the header'''
jpayne@69 2948 def __get__(self):
jpayne@69 2949 return self.tid
jpayne@69 2950
jpayne@69 2951 property reference_name:
jpayne@69 2952 """:term:`reference` name (None if no AlignmentFile is associated)"""
jpayne@69 2953 def __get__(self):
jpayne@69 2954 if self.header is not None:
jpayne@69 2955 return self.header.get_reference_name(self.tid)
jpayne@69 2956 return None
jpayne@69 2957
jpayne@69 2958 property nsegments:
jpayne@69 2959 '''number of reads mapping to this column.
jpayne@69 2960
jpayne@69 2961 Note that this number ignores the base quality filter.'''
jpayne@69 2962 def __get__(self):
jpayne@69 2963 return self.n_pu
jpayne@69 2964 def __set__(self, n):
jpayne@69 2965 self.n_pu = n
jpayne@69 2966
jpayne@69 2967 property reference_pos:
jpayne@69 2968 '''the position in the reference sequence (0-based).'''
jpayne@69 2969 def __get__(self):
jpayne@69 2970 return self.pos
jpayne@69 2971
jpayne@69 2972 property pileups:
jpayne@69 2973 '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
jpayne@69 2974 def __get__(self):
jpayne@69 2975 if self.plp == NULL or self.plp[0] == NULL:
jpayne@69 2976 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@69 2977
jpayne@69 2978 cdef int x
jpayne@69 2979 cdef const bam_pileup1_t * p = NULL
jpayne@69 2980 pileups = []
jpayne@69 2981
jpayne@69 2982 # warning: there could be problems if self.n and self.buf are
jpayne@69 2983 # out of sync.
jpayne@69 2984 for x from 0 <= x < self.n_pu:
jpayne@69 2985 p = &(self.plp[0][x])
jpayne@69 2986 if p == NULL:
jpayne@69 2987 raise ValueError(
jpayne@69 2988 "pileup buffer out of sync - most likely use of iterator "
jpayne@69 2989 "outside loop")
jpayne@69 2990 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@69 2991 continue
jpayne@69 2992 pileups.append(makePileupRead(p, self.header))
jpayne@69 2993 return pileups
jpayne@69 2994
jpayne@69 2995 ########################################################
jpayne@69 2996 # Compatibility Accessors
jpayne@69 2997 # Functions, properties for compatibility with pysam < 0.8
jpayne@69 2998 ########################################################
jpayne@69 2999 property pos:
jpayne@69 3000 """deprecated, use :attr:`reference_pos` instead."""
jpayne@69 3001 def __get__(self):
jpayne@69 3002 return self.reference_pos
jpayne@69 3003 def __set__(self, v):
jpayne@69 3004 self.reference_pos = v
jpayne@69 3005
jpayne@69 3006 property tid:
jpayne@69 3007 """deprecated, use :attr:`reference_id` instead."""
jpayne@69 3008 def __get__(self):
jpayne@69 3009 return self.reference_id
jpayne@69 3010 def __set__(self, v):
jpayne@69 3011 self.reference_id = v
jpayne@69 3012
jpayne@69 3013 property n:
jpayne@69 3014 """deprecated, use :attr:`nsegments` instead."""
jpayne@69 3015 def __get__(self):
jpayne@69 3016 return self.nsegments
jpayne@69 3017 def __set__(self, v):
jpayne@69 3018 self.nsegments = v
jpayne@69 3019
jpayne@69 3020 def get_num_aligned(self):
jpayne@69 3021 """return number of aligned bases at pileup column position.
jpayne@69 3022
jpayne@69 3023 This method applies a base quality filter and the number is
jpayne@69 3024 equal to the size of :meth:`get_query_sequences`,
jpayne@69 3025 :meth:`get_mapping_qualities`, etc.
jpayne@69 3026
jpayne@69 3027 """
jpayne@69 3028 cdef uint32_t x = 0
jpayne@69 3029 cdef uint32_t c = 0
jpayne@69 3030 cdef uint32_t cnt = 0
jpayne@69 3031 cdef const bam_pileup1_t * p = NULL
jpayne@69 3032 if self.plp == NULL or self.plp[0] == NULL:
jpayne@69 3033 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@69 3034
jpayne@69 3035 for x from 0 <= x < self.n_pu:
jpayne@69 3036 p = &(self.plp[0][x])
jpayne@69 3037 if p == NULL:
jpayne@69 3038 raise ValueError(
jpayne@69 3039 "pileup buffer out of sync - most likely use of iterator "
jpayne@69 3040 "outside loop")
jpayne@69 3041 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@69 3042 continue
jpayne@69 3043 cnt += 1
jpayne@69 3044 return cnt
jpayne@69 3045
jpayne@69 3046 def get_query_sequences(self, bint mark_matches=False, bint mark_ends=False, bint add_indels=False):
jpayne@69 3047 """query bases/sequences at pileup column position.
jpayne@69 3048
jpayne@69 3049 Optionally, the bases/sequences can be annotated according to the samtools
jpayne@69 3050 mpileup format. This is the format description from the samtools mpileup tool::
jpayne@69 3051
jpayne@69 3052 Information on match, mismatch, indel, strand, mapping
jpayne@69 3053 quality and start and end of a read are all encoded at the
jpayne@69 3054 read base column. At this column, a dot stands for a match
jpayne@69 3055 to the reference base on the forward strand, a comma for a
jpayne@69 3056 match on the reverse strand, a '>' or '<' for a reference
jpayne@69 3057 skip, `ACGTN' for a mismatch on the forward strand and
jpayne@69 3058 `acgtn' for a mismatch on the reverse strand. A pattern
jpayne@69 3059 `\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion
jpayne@69 3060 between this reference position and the next reference
jpayne@69 3061 position. The length of the insertion is given by the
jpayne@69 3062 integer in the pattern, followed by the inserted
jpayne@69 3063 sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
jpayne@69 3064 represents a deletion from the reference. The deleted bases
jpayne@69 3065 will be presented as `*' in the following lines. Also at
jpayne@69 3066 the read base column, a symbol `^' marks the start of a
jpayne@69 3067 read. The ASCII of the character following `^' minus 33
jpayne@69 3068 gives the mapping quality. A symbol `$' marks the end of a
jpayne@69 3069 read segment
jpayne@69 3070
jpayne@69 3071 To reproduce samtools mpileup format, set all of mark_matches,
jpayne@69 3072 mark_ends and add_indels to True.
jpayne@69 3073
jpayne@69 3074 Parameters
jpayne@69 3075 ----------
jpayne@69 3076
jpayne@69 3077 mark_matches: bool
jpayne@69 3078
jpayne@69 3079 If True, output bases matching the reference as "." or ","
jpayne@69 3080 for forward and reverse strand, respectively. This mark
jpayne@69 3081 requires the reference sequence. If no reference is
jpayne@69 3082 present, this option is ignored.
jpayne@69 3083
jpayne@69 3084 mark_ends : bool
jpayne@69 3085
jpayne@69 3086 If True, add markers "^" and "$" for read start and end, respectively.
jpayne@69 3087
jpayne@69 3088 add_indels : bool
jpayne@69 3089
jpayne@69 3090 If True, add bases for bases inserted into or skipped from the
jpayne@69 3091 reference. The latter requires a reference sequence file to have
jpayne@69 3092 been given, e.g. via `pileup(fastafile = ...)`. If no reference
jpayne@69 3093 sequence is available, skipped bases are represented as 'N's.
jpayne@69 3094
jpayne@69 3095 Returns
jpayne@69 3096 -------
jpayne@69 3097
jpayne@69 3098 a list of bases/sequences per read at pileup column position. : list
jpayne@69 3099
jpayne@69 3100 """
jpayne@69 3101 cdef uint32_t x = 0
jpayne@69 3102 cdef uint32_t j = 0
jpayne@69 3103 cdef uint32_t c = 0
jpayne@69 3104 cdef uint8_t cc = 0
jpayne@69 3105 cdef uint8_t rb = 0
jpayne@69 3106 cdef kstring_t * buf = &self.buf
jpayne@69 3107 cdef const bam_pileup1_t * p = NULL
jpayne@69 3108
jpayne@69 3109 if self.plp == NULL or self.plp[0] == NULL:
jpayne@69 3110 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@69 3111
jpayne@69 3112 buf.l = 0
jpayne@69 3113
jpayne@69 3114 # todo: reference sequence to count matches/mismatches
jpayne@69 3115 # todo: convert assertions to exceptions
jpayne@69 3116 for x from 0 <= x < self.n_pu:
jpayne@69 3117 p = &(self.plp[0][x])
jpayne@69 3118 if p == NULL:
jpayne@69 3119 raise ValueError(
jpayne@69 3120 "pileup buffer out of sync - most likely use of iterator "
jpayne@69 3121 "outside loop")
jpayne@69 3122 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@69 3123 continue
jpayne@69 3124 # see samtools pileup_seq
jpayne@69 3125 if mark_ends and p.is_head:
jpayne@69 3126 kputc(b'^', buf)
jpayne@69 3127
jpayne@69 3128 if p.b.core.qual > 93:
jpayne@69 3129 kputc(126, buf)
jpayne@69 3130 else:
jpayne@69 3131 kputc(p.b.core.qual + 33, buf)
jpayne@69 3132 if not p.is_del:
jpayne@69 3133 if p.qpos < p.b.core.l_qseq:
jpayne@69 3134 cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
jpayne@69 3135 else:
jpayne@69 3136 cc = b'N'
jpayne@69 3137
jpayne@69 3138 if mark_matches and self.reference_sequence != NULL:
jpayne@69 3139 rb = self.reference_sequence[self.reference_pos]
jpayne@69 3140 if seq_nt16_table[cc] == seq_nt16_table[rb]:
jpayne@69 3141 cc = b'='
jpayne@69 3142 kputc(strand_mark_char(cc, p.b), buf)
jpayne@69 3143 elif add_indels:
jpayne@69 3144 if p.is_refskip:
jpayne@69 3145 if bam_is_rev(p.b):
jpayne@69 3146 kputc(b'<', buf)
jpayne@69 3147 else:
jpayne@69 3148 kputc(b'>', buf)
jpayne@69 3149 else:
jpayne@69 3150 kputc(b'*', buf)
jpayne@69 3151 if add_indels:
jpayne@69 3152 if p.indel > 0:
jpayne@69 3153 kputc(b'+', buf)
jpayne@69 3154 kputw(p.indel, buf)
jpayne@69 3155 for j from 1 <= j <= p.indel:
jpayne@69 3156 cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
jpayne@69 3157 kputc(strand_mark_char(cc, p.b), buf)
jpayne@69 3158 elif p.indel < 0:
jpayne@69 3159 kputc(b'-', buf)
jpayne@69 3160 kputw(-p.indel, buf)
jpayne@69 3161 for j from 1 <= j <= -p.indel:
jpayne@69 3162 # TODO: out-of-range check here?
jpayne@69 3163 if self.reference_sequence == NULL:
jpayne@69 3164 cc = b'N'
jpayne@69 3165 else:
jpayne@69 3166 cc = self.reference_sequence[self.reference_pos + j]
jpayne@69 3167 kputc(strand_mark_char(cc, p.b), buf)
jpayne@69 3168 if mark_ends and p.is_tail:
jpayne@69 3169 kputc(b'$', buf)
jpayne@69 3170
jpayne@69 3171 kputc(b':', buf)
jpayne@69 3172
jpayne@69 3173 if buf.l == 0:
jpayne@69 3174 # could be zero if all qualities are too low
jpayne@69 3175 return ""
jpayne@69 3176 else:
jpayne@69 3177 # quicker to ensemble all and split than to encode all separately.
jpayne@69 3178 # ignore last ":"
jpayne@69 3179 return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":")
jpayne@69 3180
jpayne@69 3181 def get_query_qualities(self):
jpayne@69 3182 """query base quality scores at pileup column position.
jpayne@69 3183
jpayne@69 3184 Returns
jpayne@69 3185 -------
jpayne@69 3186
jpayne@69 3187 a list of quality scores : list
jpayne@69 3188 """
jpayne@69 3189 cdef uint32_t x = 0
jpayne@69 3190 cdef const bam_pileup1_t * p = NULL
jpayne@69 3191 cdef uint32_t c = 0
jpayne@69 3192 result = []
jpayne@69 3193 for x from 0 <= x < self.n_pu:
jpayne@69 3194 p = &(self.plp[0][x])
jpayne@69 3195 if p == NULL:
jpayne@69 3196 raise ValueError(
jpayne@69 3197 "pileup buffer out of sync - most likely use of iterator "
jpayne@69 3198 "outside loop")
jpayne@69 3199
jpayne@69 3200 if p.qpos < p.b.core.l_qseq:
jpayne@69 3201 c = bam_get_qual(p.b)[p.qpos]
jpayne@69 3202 else:
jpayne@69 3203 c = 0
jpayne@69 3204 if c < self.min_base_quality:
jpayne@69 3205 continue
jpayne@69 3206 result.append(c)
jpayne@69 3207 return result
jpayne@69 3208
jpayne@69 3209 def get_mapping_qualities(self):
jpayne@69 3210 """query mapping quality scores at pileup column position.
jpayne@69 3211
jpayne@69 3212 Returns
jpayne@69 3213 -------
jpayne@69 3214
jpayne@69 3215 a list of quality scores : list
jpayne@69 3216 """
jpayne@69 3217 if self.plp == NULL or self.plp[0] == NULL:
jpayne@69 3218 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@69 3219
jpayne@69 3220 cdef uint32_t x = 0
jpayne@69 3221 cdef const bam_pileup1_t * p = NULL
jpayne@69 3222 result = []
jpayne@69 3223 for x from 0 <= x < self.n_pu:
jpayne@69 3224 p = &(self.plp[0][x])
jpayne@69 3225 if p == NULL:
jpayne@69 3226 raise ValueError(
jpayne@69 3227 "pileup buffer out of sync - most likely use of iterator "
jpayne@69 3228 "outside loop")
jpayne@69 3229
jpayne@69 3230 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@69 3231 continue
jpayne@69 3232 result.append(p.b.core.qual)
jpayne@69 3233 return result
jpayne@69 3234
jpayne@69 3235 def get_query_positions(self):
jpayne@69 3236 """positions in read at pileup column position.
jpayne@69 3237
jpayne@69 3238 Returns
jpayne@69 3239 -------
jpayne@69 3240
jpayne@69 3241 a list of read positions : list
jpayne@69 3242 """
jpayne@69 3243 if self.plp == NULL or self.plp[0] == NULL:
jpayne@69 3244 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@69 3245
jpayne@69 3246 cdef uint32_t x = 0
jpayne@69 3247 cdef const bam_pileup1_t * p = NULL
jpayne@69 3248 result = []
jpayne@69 3249 for x from 0 <= x < self.n_pu:
jpayne@69 3250 p = &(self.plp[0][x])
jpayne@69 3251 if p == NULL:
jpayne@69 3252 raise ValueError(
jpayne@69 3253 "pileup buffer out of sync - most likely use of iterator "
jpayne@69 3254 "outside loop")
jpayne@69 3255
jpayne@69 3256 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@69 3257 continue
jpayne@69 3258 result.append(p.qpos)
jpayne@69 3259 return result
jpayne@69 3260
jpayne@69 3261 def get_query_names(self):
jpayne@69 3262 """query/read names aligned at pileup column position.
jpayne@69 3263
jpayne@69 3264 Returns
jpayne@69 3265 -------
jpayne@69 3266
jpayne@69 3267 a list of query names at pileup column position. : list
jpayne@69 3268 """
jpayne@69 3269 if self.plp == NULL or self.plp[0] == NULL:
jpayne@69 3270 raise ValueError("PileupColumn accessed after iterator finished")
jpayne@69 3271
jpayne@69 3272 cdef uint32_t x = 0
jpayne@69 3273 cdef const bam_pileup1_t * p = NULL
jpayne@69 3274 result = []
jpayne@69 3275 for x from 0 <= x < self.n_pu:
jpayne@69 3276 p = &(self.plp[0][x])
jpayne@69 3277 if p == NULL:
jpayne@69 3278 raise ValueError(
jpayne@69 3279 "pileup buffer out of sync - most likely use of iterator "
jpayne@69 3280 "outside loop")
jpayne@69 3281
jpayne@69 3282 if pileup_base_qual_skip(p, self.min_base_quality):
jpayne@69 3283 continue
jpayne@69 3284 result.append(charptr_to_str(pysam_bam_get_qname(p.b)))
jpayne@69 3285 return result
jpayne@69 3286
jpayne@69 3287
jpayne@69 3288 cdef class PileupRead:
jpayne@69 3289 '''Representation of a read aligned to a particular position in the
jpayne@69 3290 reference sequence.
jpayne@69 3291
jpayne@69 3292 '''
jpayne@69 3293
jpayne@69 3294 def __init__(self):
jpayne@69 3295 raise TypeError(
jpayne@69 3296 "this class cannot be instantiated from Python")
jpayne@69 3297
jpayne@69 3298 def __str__(self):
jpayne@69 3299 return "\t".join(
jpayne@69 3300 map(str,
jpayne@69 3301 (self.alignment, self.query_position,
jpayne@69 3302 self.indel, self.level,
jpayne@69 3303 self.is_del, self.is_head,
jpayne@69 3304 self.is_tail, self.is_refskip)))
jpayne@69 3305
jpayne@69 3306 property alignment:
jpayne@69 3307 """a :class:`pysam.AlignedSegment` object of the aligned read"""
jpayne@69 3308 def __get__(self):
jpayne@69 3309 return self._alignment
jpayne@69 3310
jpayne@69 3311 property query_position:
jpayne@69 3312 """position of the read base at the pileup site, 0-based.
jpayne@69 3313 None if :attr:`is_del` or :attr:`is_refskip` is set.
jpayne@69 3314
jpayne@69 3315 """
jpayne@69 3316 def __get__(self):
jpayne@69 3317 if self.is_del or self.is_refskip:
jpayne@69 3318 return None
jpayne@69 3319 else:
jpayne@69 3320 return self._qpos
jpayne@69 3321
jpayne@69 3322 property query_position_or_next:
jpayne@69 3323 """position of the read base at the pileup site, 0-based.
jpayne@69 3324
jpayne@69 3325 If the current position is a deletion, returns the next
jpayne@69 3326 aligned base.
jpayne@69 3327
jpayne@69 3328 """
jpayne@69 3329 def __get__(self):
jpayne@69 3330 return self._qpos
jpayne@69 3331
jpayne@69 3332 property indel:
jpayne@69 3333 """indel length for the position following the current pileup site.
jpayne@69 3334
jpayne@69 3335 This quantity peeks ahead to the next cigar operation in this
jpayne@69 3336 alignment. If the next operation is an insertion, indel will
jpayne@69 3337 be positive. If the next operation is a deletion, it will be
jpayne@69 3338 negation. 0 if the next operation is not an indel.
jpayne@69 3339
jpayne@69 3340 """
jpayne@69 3341 def __get__(self):
jpayne@69 3342 return self._indel
jpayne@69 3343
jpayne@69 3344 property level:
jpayne@69 3345 """the level of the read in the "viewer" mode. Note that this value
jpayne@69 3346 is currently not computed."""
jpayne@69 3347 def __get__(self):
jpayne@69 3348 return self._level
jpayne@69 3349
jpayne@69 3350 property is_del:
jpayne@69 3351 """1 iff the base on the padded read is a deletion"""
jpayne@69 3352 def __get__(self):
jpayne@69 3353 return self._is_del
jpayne@69 3354
jpayne@69 3355 property is_head:
jpayne@69 3356 """1 iff the base on the padded read is the left-most base."""
jpayne@69 3357 def __get__(self):
jpayne@69 3358 return self._is_head
jpayne@69 3359
jpayne@69 3360 property is_tail:
jpayne@69 3361 """1 iff the base on the padded read is the right-most base."""
jpayne@69 3362 def __get__(self):
jpayne@69 3363 return self._is_tail
jpayne@69 3364
jpayne@69 3365 property is_refskip:
jpayne@69 3366 """1 iff the base on the padded read is part of CIGAR N op."""
jpayne@69 3367 def __get__(self):
jpayne@69 3368 return self._is_refskip
jpayne@69 3369
jpayne@69 3370
jpayne@69 3371
jpayne@69 3372 cpdef enum CIGAR_OPS:
jpayne@69 3373 CMATCH = 0
jpayne@69 3374 CINS = 1
jpayne@69 3375 CDEL = 2
jpayne@69 3376 CREF_SKIP = 3
jpayne@69 3377 CSOFT_CLIP = 4
jpayne@69 3378 CHARD_CLIP = 5
jpayne@69 3379 CPAD = 6
jpayne@69 3380 CEQUAL = 7
jpayne@69 3381 CDIFF = 8
jpayne@69 3382 CBACK = 9
jpayne@69 3383
jpayne@69 3384
jpayne@69 3385 cpdef enum SAM_FLAGS:
jpayne@69 3386 # the read is paired in sequencing, no matter whether it is mapped in a pair
jpayne@69 3387 FPAIRED = 1
jpayne@69 3388 # the read is mapped in a proper pair
jpayne@69 3389 FPROPER_PAIR = 2
jpayne@69 3390 # the read itself is unmapped; conflictive with FPROPER_PAIR
jpayne@69 3391 FUNMAP = 4
jpayne@69 3392 # the mate is unmapped
jpayne@69 3393 FMUNMAP = 8
jpayne@69 3394 # the read is mapped to the reverse strand
jpayne@69 3395 FREVERSE = 16
jpayne@69 3396 # the mate is mapped to the reverse strand
jpayne@69 3397 FMREVERSE = 32
jpayne@69 3398 # this is read1
jpayne@69 3399 FREAD1 = 64
jpayne@69 3400 # this is read2
jpayne@69 3401 FREAD2 = 128
jpayne@69 3402 # not primary alignment
jpayne@69 3403 FSECONDARY = 256
jpayne@69 3404 # QC failure
jpayne@69 3405 FQCFAIL = 512
jpayne@69 3406 # optical or PCR duplicate
jpayne@69 3407 FDUP = 1024
jpayne@69 3408 # supplementary alignment
jpayne@69 3409 FSUPPLEMENTARY = 2048
jpayne@69 3410
jpayne@69 3411
jpayne@69 3412 __all__ = [
jpayne@69 3413 "AlignedSegment",
jpayne@69 3414 "PileupColumn",
jpayne@69 3415 "PileupRead",
jpayne@69 3416 "CMATCH",
jpayne@69 3417 "CINS",
jpayne@69 3418 "CDEL",
jpayne@69 3419 "CREF_SKIP",
jpayne@69 3420 "CSOFT_CLIP",
jpayne@69 3421 "CHARD_CLIP",
jpayne@69 3422 "CPAD",
jpayne@69 3423 "CEQUAL",
jpayne@69 3424 "CDIFF",
jpayne@69 3425 "CBACK",
jpayne@69 3426 "FPAIRED",
jpayne@69 3427 "FPROPER_PAIR",
jpayne@69 3428 "FUNMAP",
jpayne@69 3429 "FMUNMAP",
jpayne@69 3430 "FREVERSE",
jpayne@69 3431 "FMREVERSE",
jpayne@69 3432 "FREAD1",
jpayne@69 3433 "FREAD2",
jpayne@69 3434 "FSECONDARY",
jpayne@69 3435 "FQCFAIL",
jpayne@69 3436 "FDUP",
jpayne@69 3437 "FSUPPLEMENTARY",
jpayne@69 3438 "KEY_NAMES"]