csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignedsegment.pyx comparison

comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignedsegment.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children

comparison

equal deleted inserted replaced

-:0e9998148a16
+:5028fdace37b
+# cython: language_level=3
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+# Cython wrapper for SAM/BAM/CRAM files based on htslib
+###############################################################################
+# The principal classes defined in this module are:
+#
+# class AlignedSegment  an aligned segment (read)
+#
+# class PileupColumn    a collection of segments (PileupRead) aligned to
+#                       a particular genomic position.
+#
+# class PileupRead      an AlignedSegment aligned to a particular genomic
+#                       position. Contains additional attributes with respect
+#                       to this.
+#
+# Additionally this module defines numerous additional classes that are part
+# of the internal API. These are:
+#
+# Various iterator classes to iterate over alignments in sequential (IteratorRow)
+# or in a stacked fashion (IteratorColumn):
+#
+# class IteratorRow
+# class IteratorRowRegion
+# class IteratorRowHead
+# class IteratorRowAll
+# class IteratorRowAllRefs
+# class IteratorRowSelection
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+import re
+import array
+import json
+import string
+import ctypes
+import struct
+cimport cython
+from cpython cimport array as c_array
+from cpython cimport PyBytes_FromStringAndSize
+from libc.string cimport memset, strchr
+from cpython cimport array as c_array
+from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
+INT8_MAX, INT16_MAX, INT32_MAX, \
+UINT8_MAX, UINT16_MAX, UINT32_MAX
+from pysam.libchtslib cimport HTS_IDX_NOCOOR
+from pysam.libcutils cimport force_bytes, force_str, \
+charptr_to_str, charptr_to_bytes
+from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \
+array_to_qualitystring
+# Constants for binary tag conversion
+cdef char * htslib_types = 'cCsSiIf'
+cdef char * parray_types = 'bBhHiIf'
+# translation tables
+# cigar code to character and vice versa
+cdef char* CODE2CIGAR= "MIDNSHP=XB"
+cdef int NCIGAR_CODES = 10
+CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
+CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
+# names for keys in dictionary representation of an AlignedSegment
+KEY_NAMES = ["name", "flag", "ref_name", "ref_pos", "map_quality", "cigar",
+"next_ref_name", "next_ref_pos", "length", "seq", "qual", "tags"]
+#####################################################################
+# C multiplication with wrapping around
+cdef inline uint32_t c_mul(uint32_t a, uint32_t b):
+return (a * b) & 0xffffffff
+cdef inline uint8_t tolower(uint8_t ch):
+if ch >= 65 and ch <= 90:
+return ch + 32
+else:
+return ch
+cdef inline uint8_t toupper(uint8_t ch):
+if ch >= 97 and ch <= 122:
+return ch - 32
+else:
+return ch
+cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b):
+if ch == b'=':
+if bam_is_rev(b):
+return b','
+else:
+return b'.'
+else:
+if bam_is_rev(b):
+return tolower(ch)
+else:
+return toupper(ch)
+cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold):
+cdef uint32_t c
+if p.qpos < p.b.core.l_qseq:
+c = bam_get_qual(p.b)[p.qpos]
+else:
+c = 0
+if c < threshold:
+return True
+return False
+cdef inline char map_typecode_htslib_to_python(uint8_t s):
+"""map an htslib typecode to the corresponding python typecode
+to be used in the struct or array modules."""
+# map type from htslib to python array
+cdef char * f = strchr(htslib_types, s)
+if f == NULL:
+return 0
+return parray_types[f - htslib_types]
+cdef inline uint8_t map_typecode_python_to_htslib(char s):
+"""determine value type from type code of array"""
+cdef char * f = strchr(parray_types, s)
+if f == NULL:
+return 0
+return htslib_types[f - parray_types]
+cdef inline void update_bin(bam1_t * src):
+if src.core.flag & BAM_FUNMAP:
+# treat alignment as length of 1 for unmapped reads
+src.core.bin = hts_reg2bin(
+src.core.pos,
+src.core.pos + 1,
+14,
+5)
+elif pysam_get_n_cigar(src):
+src.core.bin = hts_reg2bin(
+src.core.pos,
+bam_endpos(src),
+14,
+5)
+else:
+src.core.bin = hts_reg2bin(
+src.core.pos,
+src.core.pos + 1,
+14,
+5)
+# optional tag data manipulation
+cdef convert_binary_tag(uint8_t * tag):
+"""return bytesize, number of values and array of values
+in aux_data memory location pointed to by tag."""
+cdef uint8_t auxtype
+cdef uint8_t byte_size
+cdef int32_t nvalues
+# get byte size
+auxtype = tag[0]
+byte_size = aux_type2size(auxtype)
+tag += 1
+# get number of values in array
+nvalues = (<int32_t*>tag)[0]
+tag += 4
+# define python array
+cdef c_array.array c_values = array.array(
+chr(map_typecode_htslib_to_python(auxtype)))
+c_array.resize(c_values, nvalues)
+# copy data
+memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size)
+# no need to check for endian-ness as bam1_core_t fields
+# and aux_data are in host endian-ness. See sam.c and calls
+# to swap_data
+return byte_size, nvalues, c_values
+cdef inline uint8_t get_tag_typecode(value, value_type=None):
+"""guess type code for a *value*. If *value_type* is None, the type
+code will be inferred based on the Python type of *value*
+"""
+# 0 is unknown typecode
+cdef char typecode = 0
+if value_type is None:
+if isinstance(value, int):
+if value < 0:
+if value >= INT8_MIN:
+typecode = b'c'
+elif value >= INT16_MIN:
+typecode = b's'
+elif value >= INT32_MIN:
+typecode = b'i'
+# unsigned ints
+else:
+if value <= UINT8_MAX:
+typecode = b'C'
+elif value <= UINT16_MAX:
+typecode = b'S'
+elif value <= UINT32_MAX:
+typecode = b'I'
+elif isinstance(value, float):
+typecode = b'f'
+elif isinstance(value, str):
+typecode = b'Z'
+elif isinstance(value, bytes):
+typecode = b'Z'
+elif isinstance(value, array.array) or \
+isinstance(value, list) or \
+isinstance(value, tuple):
+typecode = b'B'
+else:
+if value_type in 'aAsSIcCZidfH':
+typecode = force_bytes(value_type)[0]
+return typecode
+cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None):
+'''returns the value typecode of a value.
+If max is specified, the appropriate type is returned for a range
+where value is the minimum.
+Note that this method returns types from the extended BAM alphabet
+of types that includes tags that are not part of the SAM
+specification.
+'''
+cdef uint8_t typecode
+t = type(value)
+if t is float:
+typecode = b'f'
+elif t is int:
+if max_value is None:
+max_value = value
+if min_value is None:
+min_value = value
+# signed ints
+if min_value < 0:
+if min_value >= INT8_MIN and max_value <= INT8_MAX:
+typecode = b'c'
+elif min_value >= INT16_MIN and max_value <= INT16_MAX:
+typecode = b's'
+elif min_value >= INT32_MIN or max_value <= INT32_MAX:
+typecode = b'i'
+else:
+raise ValueError(
+"at least one signed integer out of range of "
+"BAM/SAM specification")
+# unsigned ints
+else:
+if max_value <= UINT8_MAX:
+typecode = b'C'
+elif max_value <= UINT16_MAX:
+typecode = b'S'
+elif max_value <= UINT32_MAX:
+typecode = b'I'
+else:
+raise ValueError(
+"at least one integer out of range of BAM/SAM specification")
+else:
+# Note: hex strings (H) are not supported yet
+if t is not bytes:
+value = value.encode('ascii')
+if len(value) == 1:
+typecode = b'A'
+else:
+typecode = b'Z'
+return typecode
+# mapping python array.array and htslib typecodes to struct typecodes
+DATATYPE2FORMAT = {
+ord('c'): ('b', 1),
+ord('C'): ('B', 1),
+ord('s'): ('h', 2),
+ord('S'): ('H', 2),
+ord('i'): ('i', 4),
+ord('I'): ('I', 4),
+ord('f'): ('f', 4),
+ord('d'): ('d', 8),
+ord('A'): ('c', 1),
+ord('a'): ('c', 1)}
+cdef inline pack_tags(tags):
+"""pack a list of tags. Each tag is a tuple of (tag, tuple).
+Values are packed into the most space efficient data structure
+possible unless the tag contains a third field with the typecode.
+Returns a format string and the associated list of arguments to be
+used in a call to struct.pack_into.
+"""
+fmts, args = ["<"], []
+# htslib typecode
+cdef uint8_t typecode
+for tag in tags:
+if len(tag) == 2:
+pytag, value = tag
+valuetype = None
+elif len(tag) == 3:
+pytag, value, valuetype = tag
+else:
+raise ValueError("malformatted tag: %s" % str(tag))
+if valuetype is None:
+typecode = 0
+else:
+# only first character in valuecode matters
+typecode = force_bytes(valuetype)[0]
+pytag = force_bytes(pytag)
+pytype = type(value)
+if pytype is tuple or pytype is list:
+# binary tags from tuples or lists
+if not typecode:
+# automatically determine value type - first value
+# determines type. If there is a mix of types, the
+# result is undefined.
+typecode = get_btag_typecode(min(value),
+min_value=min(value),
+max_value=max(value))
+if typecode not in DATATYPE2FORMAT:
+raise ValueError("invalid value type '{}'".format(chr(typecode)))
+datafmt = "2sBBI%i%s" % (len(value), DATATYPE2FORMAT[typecode][0])
+args.extend([pytag[:2],
+ord("B"),
+typecode,
+len(value)] + list(value))
+elif isinstance(value, array.array):
+# binary tags from arrays
+if typecode == 0:
+typecode = map_typecode_python_to_htslib(ord(value.typecode))
+if typecode == 0:
+raise ValueError("unsupported type code '{}'".format(value.typecode))
+if typecode not in DATATYPE2FORMAT:
+raise ValueError("invalid value type '{}'".format(chr(typecode)))
+# use array.tostring() to retrieve byte representation and
+# save as bytes
+datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1])
+args.extend([pytag[:2],
+ord("B"),
+typecode,
+len(value),
+value.tobytes()])
+else:
+if typecode == 0:
+typecode = get_tag_typecode(value)
+if typecode == 0:
+raise ValueError("could not deduce typecode for value {}".format(value))
+if typecode == b'a' or typecode == b'A' or typecode == b'Z' or typecode == b'H':
+value = force_bytes(value)
+if typecode == b"a":
+typecode = b'A'
+if typecode == b'Z' or typecode == b'H':
+datafmt = "2sB%is" % (len(value)+1)
+elif typecode in DATATYPE2FORMAT:
+datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0]
+else:
+raise ValueError("invalid value type '{}'".format(chr(typecode)))
+args.extend([pytag[:2],
+typecode,
+value])
+fmts.append(datafmt)
+return "".join(fmts), args
+cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src):
+"""return query length computed from CIGAR alignment.
+Length ignores hard-clipped bases.
+Return 0 if there is no CIGAR alignment.
+"""
+cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+if cigar_p == NULL:
+return 0
+cdef uint32_t k, qpos
+cdef int op
+qpos = 0
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+if op == BAM_CMATCH or \
+op == BAM_CINS or \
+op == BAM_CSOFT_CLIP or \
+op == BAM_CEQUAL or \
+op == BAM_CDIFF:
+qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
+return qpos
+cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src):
+"""return query length computed from CIGAR alignment.
+Length includes hard-clipped bases.
+Return 0 if there is no CIGAR alignment.
+"""
+cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+if cigar_p == NULL:
+return 0
+cdef uint32_t k, qpos
+cdef int op
+qpos = 0
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+if op == BAM_CMATCH or \
+op == BAM_CINS or \
+op == BAM_CSOFT_CLIP or \
+op == BAM_CHARD_CLIP or \
+op == BAM_CEQUAL or \
+op == BAM_CDIFF:
+qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
+return qpos
+cdef inline int32_t getQueryStart(bam1_t *src) except -1:
+cdef uint32_t * cigar_p
+cdef uint32_t start_offset = 0
+cdef uint32_t k, op
+cigar_p = pysam_bam_get_cigar(src);
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+if op == BAM_CHARD_CLIP:
+if start_offset != 0 and start_offset != src.core.l_qseq:
+raise ValueError('Invalid clipping in CIGAR string')
+elif op == BAM_CSOFT_CLIP:
+start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+else:
+break
+return start_offset
+cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
+cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+cdef uint32_t end_offset = src.core.l_qseq
+cdef uint32_t k, op
+# if there is no sequence, compute length from cigar string
+if end_offset == 0:
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+if op == BAM_CMATCH or \
+op == BAM_CINS or \
+op == BAM_CEQUAL or \
+op == BAM_CDIFF or \
+(op == BAM_CSOFT_CLIP and end_offset == 0):
+end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+else:
+# walk backwards in cigar string
+for k from pysam_get_n_cigar(src) > k >= 1:
+op = cigar_p[k] & BAM_CIGAR_MASK
+if op == BAM_CHARD_CLIP:
+if end_offset != src.core.l_qseq:
+raise ValueError('Invalid clipping in CIGAR string')
+elif op == BAM_CSOFT_CLIP:
+end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
+else:
+break
+return end_offset
+cdef inline bytes getSequenceInRange(bam1_t *src,
+uint32_t start,
+uint32_t end):
+"""return python string of the sequence in a bam1_t object.
+"""
+cdef uint8_t * p
+cdef uint32_t k
+cdef char * s
+if not src.core.l_qseq:
+return None
+seq = PyBytes_FromStringAndSize(NULL, end - start)
+s   = <char*>seq
+p   = pysam_bam_get_seq(src)
+for k from start <= k < end:
+# equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
+# note: do not use string literal as it will be a python string
+s[k-start] = seq_nt16_str[p[k//2] >> 4 * (1 - k%2) & 0xf]
+return charptr_to_bytes(seq)
+cdef inline object getQualitiesInRange(bam1_t *src,
+uint32_t start,
+uint32_t end):
+"""return python array of quality values from a bam1_t object"""
+cdef uint8_t * p
+cdef uint32_t k
+p = pysam_bam_get_qual(src)
+if p[0] == 0xff:
+return None
+# 'B': unsigned char
+cdef c_array.array result = array.array('B', [0])
+c_array.resize(result, end - start)
+# copy data
+memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
+return result
+#####################################################################
+## factory methods for instantiating extension classes
+cdef class AlignedSegment
+cdef AlignedSegment makeAlignedSegment(bam1_t *src,
+AlignmentHeader header):
+'''return an AlignedSegment object constructed from `src`'''
+# note that the following does not call __init__
+cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
+dest._delegate = bam_dup1(src)
+dest.header = header
+return dest
+cdef class PileupColumn
+cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp,
+int tid,
+int pos,
+int n_pu,
+uint32_t min_base_quality,
+char * reference_sequence,
+AlignmentHeader header):
+'''return a PileupColumn object constructed from pileup in `plp` and
+setting additional attributes.
+'''
+# note that the following does not call __init__
+cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
+dest.header = header
+dest.plp = plp
+dest.tid = tid
+dest.pos = pos
+dest.n_pu = n_pu
+dest.min_base_quality = min_base_quality
+dest.reference_sequence = reference_sequence
+dest.buf.l = dest.buf.m = 0
+dest.buf.s = NULL
+return dest
+cdef class PileupRead
+cdef PileupRead makePileupRead(const bam_pileup1_t *src,
+AlignmentHeader header):
+'''return a PileupRead object construted from a bam_pileup1_t * object.'''
+# note that the following does not call __init__
+cdef PileupRead dest = PileupRead.__new__(PileupRead)
+dest._alignment = makeAlignedSegment(src.b, header)
+dest._qpos = src.qpos
+dest._indel = src.indel
+dest._level = src.level
+dest._is_del = src.is_del
+dest._is_head = src.is_head
+dest._is_tail = src.is_tail
+dest._is_refskip = src.is_refskip
+return dest
+cdef inline uint32_t get_alignment_length(bam1_t *src):
+cdef uint32_t k = 0
+cdef uint32_t l = 0
+if src == NULL:
+return 0
+cdef uint32_t * cigar_p = bam_get_cigar(src)
+if cigar_p == NULL:
+return 0
+cdef int op
+cdef uint32_t n = pysam_get_n_cigar(src)
+for k from 0 <= k < n:
+op = cigar_p[k] & BAM_CIGAR_MASK
+if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
+continue
+l += cigar_p[k] >> BAM_CIGAR_SHIFT
+return l
+cdef inline uint32_t get_md_reference_length(char * md_tag):
+cdef int l = 0
+cdef int md_idx = 0
+cdef int nmatches = 0
+while md_tag[md_idx] != 0:
+if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
+nmatches *= 10
+nmatches += md_tag[md_idx] - 48
+md_idx += 1
+continue
+else:
+l += nmatches
+nmatches = 0
+if md_tag[md_idx] == b'^':
+md_idx += 1
+while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
+md_idx += 1
+l += 1
+else:
+md_idx += 1
+l += 1
+l += nmatches
+return l
+# TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ...
+cdef inline bytes build_alignment_sequence(bam1_t * src):
+"""return expanded sequence from MD tag.
+The sequence includes substitutions and both insertions in the
+reference as well as deletions to the reference sequence. Combine
+with the cigar string to reconstitute the query or the reference
+sequence.
+Positions corresponding to `N` (skipped region from the reference)
+or `P` (padding (silent deletion from padded reference)) in the CIGAR
+string will not appear in the returned sequence. The MD should
+correspondingly not contain these. Thus proper tags are::
+Deletion from the reference:    cigar=5M1D5M    MD=5^C5
+Skipped region from reference:  cigar=5M1N5M    MD=10
+Padded region in the reference: cigar=5M1P5M    MD=10
+Returns
+-------
+None, if no MD tag is present.
+"""
+if src == NULL:
+return None
+cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
+if md_tag_ptr == NULL:
+return None
+cdef uint32_t start = getQueryStart(src)
+cdef uint32_t end = getQueryEnd(src)
+# get read sequence, taking into account soft-clipping
+r = getSequenceInRange(src, start, end)
+cdef char * read_sequence = r
+cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+if cigar_p == NULL:
+return None
+cdef uint32_t r_idx = 0
+cdef int op
+cdef uint32_t k, i, l, x
+cdef int nmatches = 0
+cdef int s_idx = 0
+cdef uint32_t max_len = get_alignment_length(src)
+if max_len == 0:
+raise ValueError("could not determine alignment length")
+cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
+if s == NULL:
+raise ValueError(
+"could not allocate sequence of length %i" % max_len)
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+l = cigar_p[k] >> BAM_CIGAR_SHIFT
+if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+for i from 0 <= i < l:
+s[s_idx] = read_sequence[r_idx]
+r_idx += 1
+s_idx += 1
+elif op == BAM_CDEL:
+for i from 0 <= i < l:
+s[s_idx] = b'-'
+s_idx += 1
+elif op == BAM_CREF_SKIP:
+pass
+elif op == BAM_CINS or op == BAM_CPAD:
+for i from 0 <= i < l:
+# encode insertions into reference as lowercase
+s[s_idx] = read_sequence[r_idx] + 32
+r_idx += 1
+s_idx += 1
+elif op == BAM_CSOFT_CLIP:
+pass
+elif op == BAM_CHARD_CLIP:
+pass # advances neither
+cdef char md_buffer[2]
+cdef char *md_tag
+cdef uint8_t md_typecode = md_tag_ptr[0]
+if md_typecode == b'Z':
+md_tag = bam_aux2Z(md_tag_ptr)
+elif md_typecode == b'A':
+# Work around HTSeq bug that writes 1-character strings as MD:A:v
+md_buffer[0] = bam_aux2A(md_tag_ptr)
+md_buffer[1] = b'\0'
+md_tag = md_buffer
+else:
+raise TypeError('Tagged field MD:{}:<value> does not have expected type MD:Z'.format(chr(md_typecode)))
+cdef int md_idx = 0
+cdef char c
+s_idx = 0
+# Check if MD tag is valid by matching CIGAR length to MD tag defined length
+# Insertions would be in addition to what is described by MD, so we calculate
+# the number of insertions separately.
+cdef int insertions = 0
+while s[s_idx] != 0:
+if s[s_idx] >= b'a':
+insertions += 1
+s_idx += 1
+s_idx = 0
+cdef uint32_t md_len = get_md_reference_length(md_tag)
+if md_len + insertions > max_len:
+free(s)
+raise AssertionError(
+"Invalid MD tag: MD length {} mismatch with CIGAR length {} and {} insertions".format(
+md_len, max_len, insertions))
+while md_tag[md_idx] != 0:
+# c is numerical
+if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
+nmatches *= 10
+nmatches += md_tag[md_idx] - 48
+md_idx += 1
+continue
+else:
+# save matches up to this point, skipping insertions
+for x from 0 <= x < nmatches:
+while s[s_idx] >= b'a':
+s_idx += 1
+s_idx += 1
+while s[s_idx] >= b'a':
+s_idx += 1
+r_idx += nmatches
+nmatches = 0
+if md_tag[md_idx] == b'^':
+md_idx += 1
+while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
+# assert s[s_idx] == '-'
+s[s_idx] = md_tag[md_idx]
+s_idx += 1
+md_idx += 1
+else:
+# save mismatch
+# enforce lower case
+c = md_tag[md_idx]
+if c <= 90:
+c += 32
+s[s_idx] = c
+s_idx += 1
+r_idx += 1
+md_idx += 1
+# save matches up to this point, skipping insertions
+for x from 0 <= x < nmatches:
+while s[s_idx] >= b'a':
+s_idx += 1
+s_idx += 1
+while s[s_idx] >= b'a':
+s_idx += 1
+seq = PyBytes_FromStringAndSize(s, s_idx)
+free(s)
+return seq
+cdef inline bytes build_reference_sequence(bam1_t * src):
+"""return the reference sequence in the region that is covered by the
+alignment of the read to the reference.
+This method requires the MD tag to be set.
+"""
+cdef uint32_t k, i, l
+cdef int op
+cdef int s_idx = 0
+ref_seq = build_alignment_sequence(src)
+if ref_seq is None:
+raise ValueError("MD tag not present")
+cdef char * s = <char*>calloc(len(ref_seq) + 1, sizeof(char))
+if s == NULL:
+raise ValueError(
+"could not allocate sequence of length %i" % len(ref_seq))
+cdef char * cref_seq = ref_seq
+cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+cdef uint32_t r_idx = 0
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+l = cigar_p[k] >> BAM_CIGAR_SHIFT
+if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+for i from 0 <= i < l:
+s[s_idx] = cref_seq[r_idx]
+r_idx += 1
+s_idx += 1
+elif op == BAM_CDEL:
+for i from 0 <= i < l:
+s[s_idx] = cref_seq[r_idx]
+r_idx += 1
+s_idx += 1
+elif op == BAM_CREF_SKIP:
+pass
+elif op == BAM_CINS or op == BAM_CPAD:
+r_idx += l
+elif op == BAM_CSOFT_CLIP:
+pass
+elif op == BAM_CHARD_CLIP:
+pass # advances neither
+seq = PyBytes_FromStringAndSize(s, s_idx)
+free(s)
+return seq
+cdef inline str safe_reference_name(AlignmentHeader header, int tid):
+if tid == -1: return "*"
+elif header is not None: return header.get_reference_name(tid)
+else: return f"#{tid}"
+# Tuple-building helper functions used by AlignedSegment.get_aligned_pairs()
+cdef _alignedpairs_positions(qpos, pos, ref_seq, uint32_t r_idx, int op):
+return (qpos, pos)
+cdef _alignedpairs_with_seq(qpos, pos, ref_seq, uint32_t r_idx, int op):
+ref_base = ref_seq[r_idx] if ref_seq is not None else None
+return (qpos, pos, ref_base)
+cdef _alignedpairs_with_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
+return (qpos, pos, CIGAR_OPS(op))
+cdef _alignedpairs_with_seq_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
+ref_base = ref_seq[r_idx] if ref_seq is not None else None
+return (qpos, pos, ref_base, CIGAR_OPS(op))
+cdef class AlignedSegment:
+'''Class representing an aligned segment.
+This class stores a handle to the samtools C-structure representing
+an aligned read. Member read access is forwarded to the C-structure
+and converted into python objects. This implementation should be fast,
+as only the data needed is converted.
+For write access, the C-structure is updated in-place. This is
+not the most efficient way to build BAM entries, as the variable
+length data is concatenated and thus needs to be resized if
+a field is updated. Furthermore, the BAM entry might be
+in an inconsistent state.
+One issue to look out for is that the sequence should always
+be set *before* the quality scores. Setting the sequence will
+also erase any quality scores that were set previously.
+Parameters
+----------
+header:
+:class:`~pysam.AlignmentHeader` object to map numerical
+identifiers to chromosome names. If not given, an empty
+header is created.
+'''
+# Now only called when instances are created from Python
+def __init__(self, AlignmentHeader header=None):
+# see bam_init1
+self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
+if self._delegate == NULL:
+raise MemoryError("could not allocated memory of {} bytes".format(sizeof(bam1_t)))
+# allocate some memory. If size is 0, calloc does not return a
+# pointer that can be passed to free() so allocate 40 bytes
+# for a new read
+self._delegate.m_data = 40
+self._delegate.data = <uint8_t *>calloc(
+self._delegate.m_data, 1)
+if self._delegate.data == NULL:
+raise MemoryError("could not allocate memory of {} bytes".format(self._delegate.m_data))
+self._delegate.l_data = 0
+# set some data to make read approximately legit.
+# Note, SAM writing fails with q_name of length 0
+self._delegate.core.l_qname = 0
+self._delegate.core.tid = -1
+self._delegate.core.pos = -1
+self._delegate.core.mtid = -1
+self._delegate.core.mpos = -1
+# caching for selected fields
+self.cache_query_qualities = None
+self.cache_query_alignment_qualities = None
+self.cache_query_sequence = None
+self.cache_query_alignment_sequence = None
+self.header = header
+def __dealloc__(self):
+bam_destroy1(self._delegate)
+def __str__(self):
+"""return string representation of alignment.
+The representation is an approximate :term:`SAM` format, because
+an aligned read might not be associated with a :term:`AlignmentFile`.
+Hence when the read does not have an associated :class:`AlignedHeader`,
+:term:`tid` is shown instead of the reference name.
+Similarly, the tags field is returned in its parsed state.
+To get a valid SAM record, use :meth:`to_string`.
+"""
+# sam-parsing is done in sam.c/bam_format1_core which
+# requires a valid header.
+return "\t".join(map(str, (self.query_name,
+self.flag,
+safe_reference_name(self.header, self.reference_id),
+self.reference_start + 1,
+self.mapping_quality,
+self.cigarstring,
+safe_reference_name(self.header, self.next_reference_id),
+self.next_reference_start + 1,
+self.template_length,
+self.query_sequence,
+self.query_qualities,
+self.tags)))
+def __repr__(self):
+ref = self.reference_name if self.header is not None else self.reference_id
+return f'<{type(self).__name__}({self.query_name!r}, flags={self.flag}={self.flag:#x}, ref={ref!r}, zpos={self.reference_start}, mapq={self.mapping_quality}, cigar={self.cigarstring!r}, ...)>'
+def __copy__(self):
+return makeAlignedSegment(self._delegate, self.header)
+def __deepcopy__(self, memo):
+return makeAlignedSegment(self._delegate, self.header)
+def compare(self, AlignedSegment other):
+'''return -1,0,1, if contents in this are binary
+<,=,> to *other*
+'''
+# avoid segfault when other equals None
+if other is None:
+return -1
+cdef int retval, x
+cdef bam1_t *t
+cdef bam1_t *o
+t = self._delegate
+o = other._delegate
+# uncomment for debugging purposes
+# cdef unsigned char * oo, * tt
+# tt = <unsigned char*>(&t.core)
+# oo = <unsigned char*>(&o.core)
+# for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
+# tt = <unsigned char*>(t.data)
+# oo = <unsigned char*>(o.data)
+# for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
+# Fast-path test for object identity
+if t == o:
+return 0
+cdef uint8_t *a = <uint8_t*>&t.core
+cdef uint8_t *b = <uint8_t*>&o.core
+retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
+if retval:
+return retval
+# cmp(t.l_data, o.l_data)
+retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
+if retval:
+return retval
+return memcmp(t.data, o.data, t.l_data)
+def __richcmp__(self, AlignedSegment other, int op):
+if op == 2:  # == operator
+return self.compare(other) == 0
+elif op == 3:  # != operator
+return self.compare(other) != 0
+else:
+return NotImplemented
+def __hash__(self):
+cdef bam1_t * src = self._delegate
+cdef int x
+# see http://effbot.org/zone/python-hash.htm
+cdef uint8_t * c = <uint8_t *>&src.core
+cdef uint32_t hash_value = c[0]
+for x from 1 <= x < sizeof(bam1_core_t):
+hash_value = c_mul(hash_value, 1000003) ^ c[x]
+c = <uint8_t *>src.data
+for x from 0 <= x < src.l_data:
+hash_value = c_mul(hash_value, 1000003) ^ c[x]
+return hash_value
+cpdef to_string(self):
+"""returns a string representation of the aligned segment.
+The output format is valid SAM format if a header is associated
+with the AlignedSegment.
+"""
+cdef kstring_t line
+line.l = line.m = 0
+line.s = NULL
+if self.header:
+if sam_format1(self.header.ptr, self._delegate, &line) < 0:
+if line.m:
+free(line.s)
+raise ValueError('sam_format failed')
+else:
+raise NotImplementedError("todo")
+ret = force_str(line.s[:line.l])
+if line.m:
+free(line.s)
+return ret
+@classmethod
+def fromstring(cls, sam, AlignmentHeader header):
+"""parses a string representation of the aligned segment.
+The input format should be valid SAM format.
+Parameters
+----------
+sam:
+:term:`SAM` formatted string
+"""
+cdef AlignedSegment dest = cls.__new__(cls)
+dest._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
+dest.header = header
+cdef kstring_t line
+line.l = line.m = len(sam)
+_sam = force_bytes(sam)
+line.s = _sam
+cdef int ret
+ret = sam_parse1(&line, dest.header.ptr, dest._delegate)
+if ret < 0:
+raise ValueError("parsing SAM record string failed (error code {})".format(ret))
+return dest
+cpdef tostring(self, htsfile=None):
+"""deprecated, use :meth:`to_string()` instead.
+Parameters
+----------
+htsfile:
+(deprecated) AlignmentFile object to map numerical
+identifiers to chromosome names. This parameter is present
+for backwards compatibility and ignored.
+"""
+return self.to_string()
+def to_dict(self):
+"""returns a json representation of the aligned segment.
+Field names are abbreviated versions of the class attributes.
+"""
+# let htslib do the string conversions, but treat optional field properly as list
+vals = self.to_string().split("\t")
+n = len(KEY_NAMES) - 1
+return dict(list(zip(KEY_NAMES[:-1], vals[:n])) + [(KEY_NAMES[-1], vals[n:])])
+@classmethod
+def from_dict(cls, sam_dict, AlignmentHeader header):
+"""parses a dictionary representation of the aligned segment.
+Parameters
+----------
+sam_dict:
+dictionary of alignment values, keys corresponding to output from
+:meth:`todict()`.
+"""
+# let htslib do the parsing
+# the tags field can be missing
+return cls.fromstring(
+"\t".join((sam_dict[x] for x in KEY_NAMES[:-1])) +
+"\t" +
+"\t".join(sam_dict.get(KEY_NAMES[-1], [])), header)
+########################################################
+## Basic attributes in order of appearance in SAM format
+property query_name:
+"""the query template name (None if not present)"""
+def __get__(self):
+cdef bam1_t * src = self._delegate
+if src.core.l_qname == 0:
+return None
+return charptr_to_str(<char *>pysam_bam_get_qname(src))
+def __set__(self, qname):
+if qname is None or len(qname) == 0:
+return
+if len(qname) > 254:
+raise ValueError("query length out of range {} > 254".format(
+len(qname)))
+qname = force_bytes(qname)
+cdef bam1_t * src = self._delegate
+# the qname is \0 terminated
+cdef uint8_t l = len(qname) + 1
+cdef char * p = pysam_bam_get_qname(src)
+cdef uint8_t l_extranul = 0
+if l % 4 != 0:
+l_extranul = 4 - l % 4
+cdef bam1_t * retval = pysam_bam_update(src,
+src.core.l_qname,
+l + l_extranul,
+<uint8_t*>p)
+if retval == NULL:
+raise MemoryError("could not allocate memory")
+src.core.l_extranul = l_extranul
+src.core.l_qname = l + l_extranul
+# re-acquire pointer to location in memory
+# as it might have moved
+p = pysam_bam_get_qname(src)
+strncpy(p, qname, l)
+# x might be > 255
+cdef uint16_t x = 0
+for x from l <= x < l + l_extranul:
+p[x] = b'\0'
+property flag:
+"""properties flag"""
+def __get__(self):
+return self._delegate.core.flag
+def __set__(self, flag):
+self._delegate.core.flag = flag
+property reference_name:
+""":term:`reference` name"""
+def __get__(self):
+if self._delegate.core.tid == -1:
+return None
+if self.header:
+return self.header.get_reference_name(self._delegate.core.tid)
+else:
+raise ValueError("reference_name unknown if no header associated with record")
+def __set__(self, reference):
+cdef int tid
+if reference is None or reference == "*":
+self._delegate.core.tid = -1
+elif self.header:
+tid = self.header.get_tid(reference)
+if tid < 0:
+raise ValueError("reference {} does not exist in header".format(
+reference))
+self._delegate.core.tid = tid
+else:
+raise ValueError("reference_name can not be set if no header associated with record")
+property reference_id:
+""":term:`reference` ID
+.. note::
+This field contains the index of the reference sequence in
+the sequence dictionary. To obtain the name of the
+reference sequence, use :meth:`get_reference_name()`
+"""
+def __get__(self):
+return self._delegate.core.tid
+def __set__(self, tid):
+if tid != -1 and self.header and not self.header.is_valid_tid(tid):
+raise ValueError("reference id {} does not exist in header".format(
+tid))
+self._delegate.core.tid = tid
+property reference_start:
+"""0-based leftmost coordinate"""
+def __get__(self):
+return self._delegate.core.pos
+def __set__(self, pos):
+## setting the position requires updating the "bin" attribute
+cdef bam1_t * src
+src = self._delegate
+src.core.pos = pos
+update_bin(src)
+property mapping_quality:
+"""mapping quality"""
+def __get__(self):
+return pysam_get_qual(self._delegate)
+def __set__(self, qual):
+pysam_set_qual(self._delegate, qual)
+property cigarstring:
+'''the :term:`cigar` alignment as a string.
+The cigar string is a string of alternating integers
+and characters denoting the length and the type of
+an operation.
+.. note::
+The order length,operation is specified in the
+SAM format. It is different from the order of
+the :attr:`cigar` property.
+Returns None if not present.
+To unset the cigarstring, assign None or the
+empty string.
+'''
+def __get__(self):
+c = self.cigartuples
+if c is None:
+return None
+# reverse order
+else:
+return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
+def __set__(self, cigar):
+if cigar is None or len(cigar) == 0:
+self.cigartuples = []
+else:
+parts = CIGAR_REGEX.findall(cigar)
+# reverse order
+self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts]
+# TODO
+# property cigar:
+#     """the cigar alignment"""
+property next_reference_id:
+"""the :term:`reference` id of the mate/next read."""
+def __get__(self):
+return self._delegate.core.mtid
+def __set__(self, mtid):
+if mtid != -1 and self.header and not self.header.is_valid_tid(mtid):
+raise ValueError("reference id {} does not exist in header".format(
+mtid))
+self._delegate.core.mtid = mtid
+property next_reference_name:
+""":term:`reference` name of the mate/next read (None if no
+AlignmentFile is associated)"""
+def __get__(self):
+if self._delegate.core.mtid == -1:
+return None
+if self.header:
+return self.header.get_reference_name(self._delegate.core.mtid)
+else:
+raise ValueError("next_reference_name unknown if no header associated with record")
+def __set__(self, reference):
+cdef int mtid
+if reference is None or reference == "*":
+self._delegate.core.mtid = -1
+elif reference == "=":
+self._delegate.core.mtid = self._delegate.core.tid
+elif self.header:
+mtid = self.header.get_tid(reference)
+if mtid < 0:
+raise ValueError("reference {} does not exist in header".format(
+reference))
+self._delegate.core.mtid = mtid
+else:
+raise ValueError("next_reference_name can not be set if no header associated with record")
+property next_reference_start:
+"""the position of the mate/next read."""
+def __get__(self):
+return self._delegate.core.mpos
+def __set__(self, mpos):
+self._delegate.core.mpos = mpos
+property query_length:
+"""the length of the query/read.
+This value corresponds to the length of the sequence supplied
+in the BAM/SAM file. The length of a query is 0 if there is no
+sequence in the BAM/SAM file. In those cases, the read length
+can be inferred from the CIGAR alignment, see
+:meth:`pysam.AlignedSegment.infer_query_length`.
+The length includes soft-clipped bases and is equal to
+``len(query_sequence)``.
+This property is read-only but is updated when a new query
+sequence is assigned to this AlignedSegment.
+Returns 0 if not available.
+"""
+def __get__(self):
+return self._delegate.core.l_qseq
+property template_length:
+"""the observed query template length"""
+def __get__(self):
+return self._delegate.core.isize
+def __set__(self, isize):
+self._delegate.core.isize = isize
+property query_sequence:
+"""read sequence bases, including :term:`soft clipped` bases
+(None if not present).
+Assigning to this attribute will invalidate any quality scores.
+Thus, to in-place edit the sequence and quality scores, copies of
+the quality scores need to be taken. Consider trimming for example::
+q = read.query_qualities
+read.query_sequence = read.query_sequence[5:10]
+read.query_qualities = q[5:10]
+The sequence is returned as it is stored in the BAM file. (This will
+be the reverse complement of the original read sequence if the mapper
+has aligned the read to the reverse strand.)
+"""
+def __get__(self):
+if self.cache_query_sequence:
+return self.cache_query_sequence
+cdef bam1_t * src
+cdef char * s
+src = self._delegate
+if src.core.l_qseq == 0:
+return None
+self.cache_query_sequence = force_str(getSequenceInRange(
+src, 0, src.core.l_qseq))
+return self.cache_query_sequence
+def __set__(self, seq):
+# samtools manages sequence and quality length memory together
+# if no quality information is present, the first byte says 0xff.
+cdef bam1_t * src
+cdef uint8_t * p
+cdef char * s
+cdef int l, k
+cdef Py_ssize_t nbytes_new, nbytes_old
+if seq == None:
+l = 0
+else:
+l = len(seq)
+seq = force_bytes(seq)
+src = self._delegate
+# as the sequence is stored in half-bytes, the total length (sequence
+# plus quality scores) is (l+1)/2 + l
+nbytes_new = (l + 1) // 2 + l
+nbytes_old = (src.core.l_qseq + 1) // 2 + src.core.l_qseq
+# acquire pointer to location in memory
+p = pysam_bam_get_seq(src)
+src.core.l_qseq = l
+# change length of data field
+cdef bam1_t * retval = pysam_bam_update(src,
+nbytes_old,
+nbytes_new,
+p)
+if retval == NULL:
+raise MemoryError("could not allocate memory")
+if l > 0:
+# re-acquire pointer to location in memory
+# as it might have moved
+p = pysam_bam_get_seq(src)
+for k from 0 <= k < nbytes_new:
+p[k] = 0
+# convert to C string
+s = seq
+for k from 0 <= k < l:
+p[k // 2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
+# erase qualities
+p = pysam_bam_get_qual(src)
+memset(p, 0xff, l)
+self.cache_query_sequence = force_str(seq)
+# clear cached values for quality values
+self.cache_query_qualities = None
+self.cache_query_alignment_qualities = None
+property query_qualities:
+"""read sequence base qualities, including :term:`soft clipped` bases
+(None if not present).
+Quality scores are returned as a python array of unsigned
+chars. Note that this is not the ASCII-encoded value typically
+seen in FASTQ or SAM formatted files. Thus, no offset of 33
+needs to be subtracted.
+Note that to set quality scores the sequence has to be set
+beforehand as this will determine the expected length of the
+quality score array.
+This method raises a ValueError if the length of the
+quality scores and the sequence are not the same.
+"""
+def __get__(self):
+if self.cache_query_qualities:
+return self.cache_query_qualities
+cdef bam1_t * src
+cdef char * q
+src = self._delegate
+if src.core.l_qseq == 0:
+return None
+self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
+return self.cache_query_qualities
+def __set__(self, qual):
+# note that memory is already allocated via setting the sequence
+# hence length match of sequence and quality needs is checked.
+cdef bam1_t * src
+cdef uint8_t * p
+cdef int l
+src = self._delegate
+p = pysam_bam_get_qual(src)
+if qual is None or len(qual) == 0:
+# if absent and there is a sequence: set to 0xff
+memset(p, 0xff, src.core.l_qseq)
+return
+# check for length match
+l = len(qual)
+if src.core.l_qseq != l:
+raise ValueError(
+"quality and sequence mismatch: %i != %i" %
+(l, src.core.l_qseq))
+# create a python array object filling it
+# with the quality scores
+# NB: should avoid this copying if qual is
+# already of the correct type.
+cdef c_array.array result = c_array.array('B', qual)
+# copy data
+memcpy(p, result.data.as_voidptr, l)
+# save in cache
+self.cache_query_qualities = qual
+property bin:
+"""properties bin"""
+def __get__(self):
+return self._delegate.core.bin
+def __set__(self, bin):
+self._delegate.core.bin = bin
+##########################################################
+# Derived simple attributes. These are simple attributes of
+# AlignedSegment getting and setting values.
+##########################################################
+# 1. Flags
+##########################################################
+property is_paired:
+"""true if read is paired in sequencing"""
+def __get__(self):
+return (self.flag & BAM_FPAIRED) != 0
+def __set__(self,val):
+pysam_update_flag(self._delegate, val, BAM_FPAIRED)
+property is_proper_pair:
+"""true if read is mapped in a proper pair"""
+def __get__(self):
+return (self.flag & BAM_FPROPER_PAIR) != 0
+def __set__(self,val):
+pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
+property is_unmapped:
+"""true if read itself is unmapped"""
+def __get__(self):
+return (self.flag & BAM_FUNMAP) != 0
+def __set__(self, val):
+pysam_update_flag(self._delegate, val, BAM_FUNMAP)
+# setting the unmapped flag requires recalculation of
+# bin as alignment length is now implicitly 1
+update_bin(self._delegate)
+property is_mapped:
+"""true if read itself is mapped
+(implemented in terms of :attr:`is_unmapped`)"""
+def __get__(self):
+return (self.flag & BAM_FUNMAP) == 0
+def __set__(self, val):
+pysam_update_flag(self._delegate, not val, BAM_FUNMAP)
+update_bin(self._delegate)
+property mate_is_unmapped:
+"""true if the mate is unmapped"""
+def __get__(self):
+return (self.flag & BAM_FMUNMAP) != 0
+def __set__(self,val):
+pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
+property mate_is_mapped:
+"""true if the mate is mapped
+(implemented in terms of :attr:`mate_is_unmapped`)"""
+def __get__(self):
+return (self.flag & BAM_FMUNMAP) == 0
+def __set__(self,val):
+pysam_update_flag(self._delegate, not val, BAM_FMUNMAP)
+property is_reverse:
+"""true if read is mapped to reverse strand"""
+def __get__(self):
+return (self.flag & BAM_FREVERSE) != 0
+def __set__(self,val):
+pysam_update_flag(self._delegate, val, BAM_FREVERSE)
+property is_forward:
+"""true if read is mapped to forward strand
+(implemented in terms of :attr:`is_reverse`)"""
+def __get__(self):
+return (self.flag & BAM_FREVERSE) == 0
+def __set__(self,val):
+pysam_update_flag(self._delegate, not val, BAM_FREVERSE)
+property mate_is_reverse:
+"""true if the mate is mapped to reverse strand"""
+def __get__(self):
+return (self.flag & BAM_FMREVERSE) != 0
+def __set__(self,val):
+pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
+property mate_is_forward:
+"""true if the mate is mapped to forward strand
+(implemented in terms of :attr:`mate_is_reverse`)"""
+def __get__(self):
+return (self.flag & BAM_FMREVERSE) == 0
+def __set__(self,val):
+pysam_update_flag(self._delegate, not val, BAM_FMREVERSE)
+property is_read1:
+"""true if this is read1"""
+def __get__(self):
+return (self.flag & BAM_FREAD1) != 0
+def __set__(self,val):
+pysam_update_flag(self._delegate, val, BAM_FREAD1)
+property is_read2:
+"""true if this is read2"""
+def __get__(self):
+return (self.flag & BAM_FREAD2) != 0
+def __set__(self, val):
+pysam_update_flag(self._delegate, val, BAM_FREAD2)
+property is_secondary:
+"""true if not primary alignment"""
+def __get__(self):
+return (self.flag & BAM_FSECONDARY) != 0
+def __set__(self, val):
+pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
+property is_qcfail:
+"""true if QC failure"""
+def __get__(self):
+return (self.flag & BAM_FQCFAIL) != 0
+def __set__(self, val):
+pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
+property is_duplicate:
+"""true if optical or PCR duplicate"""
+def __get__(self):
+return (self.flag & BAM_FDUP) != 0
+def __set__(self, val):
+pysam_update_flag(self._delegate, val, BAM_FDUP)
+property is_supplementary:
+"""true if this is a supplementary alignment"""
+def __get__(self):
+return (self.flag & BAM_FSUPPLEMENTARY) != 0
+def __set__(self, val):
+pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
+# 2. Coordinates and lengths
+property reference_end:
+'''aligned reference position of the read on the reference genome.
+reference_end points to one past the last aligned residue.
+Returns None if not available (read is unmapped or no cigar
+alignment present).
+'''
+def __get__(self):
+cdef bam1_t * src
+src = self._delegate
+if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
+return None
+return bam_endpos(src)
+property reference_length:
+'''aligned length of the read on the reference genome.
+This is equal to `reference_end - reference_start`.
+Returns None if not available.
+'''
+def __get__(self):
+cdef bam1_t * src
+src = self._delegate
+if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
+return None
+return bam_endpos(src) - \
+self._delegate.core.pos
+property query_alignment_sequence:
+"""aligned portion of the read.
+This is a substring of :attr:`query_sequence` that excludes flanking
+bases that were :term:`soft clipped` (None if not present). It
+is equal to ``query_sequence[query_alignment_start:query_alignment_end]``.
+SAM/BAM files may include extra flanking bases that are not
+part of the alignment.  These bases may be the result of the
+Smith-Waterman or other algorithms, which may not require
+alignments that begin at the first residue or end at the last.
+In addition, extra sequencing adapters, multiplex identifiers,
+and low-quality bases that were not considered for alignment
+may have been retained.
+"""
+def __get__(self):
+if self.cache_query_alignment_sequence:
+return self.cache_query_alignment_sequence
+cdef bam1_t * src
+cdef uint32_t start, end
+src = self._delegate
+if src.core.l_qseq == 0:
+return None
+start = getQueryStart(src)
+end   = getQueryEnd(src)
+self.cache_query_alignment_sequence = force_str(
+getSequenceInRange(src, start, end))
+return self.cache_query_alignment_sequence
+property query_alignment_qualities:
+"""aligned query sequence quality values (None if not present). These
+are the quality values that correspond to
+:attr:`query_alignment_sequence`, that is, they exclude qualities of
+:term:`soft clipped` bases. This is equal to
+``query_qualities[query_alignment_start:query_alignment_end]``.
+Quality scores are returned as a python array of unsigned
+chars. Note that this is not the ASCII-encoded value typically
+seen in FASTQ or SAM formatted files. Thus, no offset of 33
+needs to be subtracted.
+This property is read-only.
+"""
+def __get__(self):
+if self.cache_query_alignment_qualities:
+return self.cache_query_alignment_qualities
+cdef bam1_t * src
+cdef uint32_t start, end
+src = self._delegate
+if src.core.l_qseq == 0:
+return None
+start = getQueryStart(src)
+end   = getQueryEnd(src)
+self.cache_query_alignment_qualities = \
+getQualitiesInRange(src, start, end)
+return self.cache_query_alignment_qualities
+property query_alignment_start:
+"""start index of the aligned query portion of the sequence (0-based,
+inclusive).
+This the index of the first base in :attr:`query_sequence`
+that is not soft-clipped.
+"""
+def __get__(self):
+return getQueryStart(self._delegate)
+property query_alignment_end:
+"""end index of the aligned query portion of the sequence (0-based,
+exclusive)
+This the index just past the last base in :attr:`query_sequence`
+that is not soft-clipped.
+"""
+def __get__(self):
+return getQueryEnd(self._delegate)
+property modified_bases:
+"""Modified bases annotations from Ml/Mm tags. The output is
+Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...]
+with qual being (256*probability), or -1 if unknown.
+Strand==0 for forward and 1 for reverse strand modification
+"""
+def __get__(self):
+cdef bam1_t * src
+cdef hts_base_mod_state *m = hts_base_mod_state_alloc()
+cdef hts_base_mod mods[5]
+cdef int pos
+ret = {}
+src = self._delegate
+if bam_parse_basemod(src, m) < 0:
+return None
+n = bam_next_basemod(src, m, mods, 5, &pos)
+while n>0:
+for i in range(n):
+mod_code = chr(mods[i].modified_base) if mods[i].modified_base>0 else -mods[i].modified_base
+mod_strand = mods[i].strand
+if self.is_reverse:
+mod_strand = 1 - mod_strand
+key = (chr(mods[i].canonical_base),
+mod_strand,
+mod_code )
+ret.setdefault(key,[]).append((pos,mods[i].qual))
+n = bam_next_basemod(src, m, mods, 5, &pos)
+if n<0:
+return None
+hts_base_mod_state_free(m)
+return ret
+property modified_bases_forward:
+"""Modified bases annotations from Ml/Mm tags. The output is
+Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...]
+with qual being (256*probability), or -1 if unknown.
+Strand==0 for forward and 1 for reverse strand modification.
+The positions are with respect to the original sequence from get_forward_sequence()
+"""
+def __get__(self):
+pmods = self.modified_bases
+if pmods and self.is_reverse:
+rmod = {}
+# Try to find the length of the original sequence
+rlen = self.infer_read_length()
+if rlen is None and self.query_sequence is None:
+return rmod
+else:
+rlen = len(self.query_sequence)
+for k,mods in pmods.items():
+nk = k[0],1 - k[1],k[2]
+for i in range(len(mods)):
+mods[i] = (rlen - 1 -mods[i][0], mods[i][1])
+rmod[nk] = mods
+return rmod
+return pmods
+property query_alignment_length:
+"""length of the aligned query sequence.
+This is equal to :attr:`query_alignment_end` -
+:attr:`query_alignment_start`
+"""
+def __get__(self):
+cdef bam1_t * src
+src = self._delegate
+return getQueryEnd(src) - getQueryStart(src)
+#####################################################
+# Computed properties
+def get_reference_positions(self, full_length=False):
+"""a list of reference positions that this read aligns to.
+By default, this method returns the (0-based) positions on the
+reference that are within the read's alignment, leaving gaps
+corresponding to deletions and other reference skips.
+When *full_length* is True, the returned list is the same length
+as the read and additionally includes None values corresponding
+to insertions or soft-clipping, i.e., to bases of the read that
+are not aligned to a reference position.
+(See also :meth:`get_aligned_pairs` which additionally returns
+the corresponding positions along the read.)
+"""
+cdef uint32_t k, i, l, pos
+cdef int op
+cdef uint32_t * cigar_p
+cdef bam1_t * src
+cdef bint _full = full_length
+src = self._delegate
+if pysam_get_n_cigar(src) == 0:
+return []
+result = []
+pos = src.core.pos
+cigar_p = pysam_bam_get_cigar(src)
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+l = cigar_p[k] >> BAM_CIGAR_SHIFT
+if op == BAM_CSOFT_CLIP or op == BAM_CINS:
+if _full:
+for i from 0 <= i < l:
+result.append(None)
+elif op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+for i from pos <= i < pos + l:
+result.append(i)
+pos += l
+elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+pos += l
+return result
+def infer_query_length(self, always=False):
+"""infer query length from CIGAR alignment.
+This method deduces the query length from the CIGAR alignment
+but does not include hard-clipped bases.
+Returns None if CIGAR alignment is not present.
+If *always* is set to True, `infer_read_length` is used instead.
+This is deprecated and only present for backward compatibility.
+"""
+if always is True:
+return self.infer_read_length()
+cdef int32_t l = calculateQueryLengthWithoutHardClipping(self._delegate)
+if l > 0:
+return l
+else:
+return None
+def infer_read_length(self):
+"""infer read length from CIGAR alignment.
+This method deduces the read length from the CIGAR alignment
+including hard-clipped bases.
+Returns None if CIGAR alignment is not present.
+"""
+cdef int32_t l = calculateQueryLengthWithHardClipping(self._delegate)
+if l > 0:
+return l
+else:
+return None
+def get_reference_sequence(self):
+"""return the reference sequence in the region that is covered by the
+alignment of the read to the reference.
+This method requires the MD tag to be set.
+"""
+return force_str(build_reference_sequence(self._delegate))
+def get_forward_sequence(self):
+"""return the original read sequence.
+Reads mapped to the reverse strand are stored reverse complemented in
+the BAM file. This method returns such reads reverse complemented back
+to their original orientation.
+Returns None if the record has no query sequence.
+"""
+if self.query_sequence is None:
+return None
+s = force_str(self.query_sequence)
+if self.is_reverse:
+s = s.translate(str.maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1]
+return s
+def get_forward_qualities(self):
+"""return the original base qualities of the read sequence,
+in the same format as the :attr:`query_qualities` property.
+Reads mapped to the reverse strand have their base qualities stored
+reversed in the BAM file. This method returns such reads' base qualities
+reversed back to their original orientation.
+"""
+if self.is_reverse:
+return self.query_qualities[::-1]
+else:
+return self.query_qualities
+def get_aligned_pairs(self, matches_only=False, with_seq=False, with_cigar=False):
+"""a list of aligned read (query) and reference positions.
+Each item in the returned list is a tuple consisting of
+the 0-based offset from the start of the read sequence
+followed by the 0-based reference position.
+For inserts, deletions, skipping either query or reference
+position may be None.
+For padding in the reference, the reference position will
+always be None.
+Parameters
+----------
+matches_only : bool
+If True, only matched bases are returned --- no None on either
+side.
+with_seq : bool
+If True, return a third element in the tuple containing the
+reference sequence. For CIGAR 'P' (padding in the reference)
+operations, the third tuple element will be None. Substitutions
+are lower-case. This option requires an MD tag to be present.
+with_cigar : bool
+If True, return an extra element in the tuple containing the
+CIGAR operator corresponding to this position tuple.
+Returns
+-------
+aligned_pairs : list of tuples
+"""
+cdef uint32_t k, i, pos, qpos, r_idx, l
+cdef int op
+cdef uint32_t * cigar_p
+cdef bam1_t * src = self._delegate
+cdef bint _matches_only = bool(matches_only)
+cdef bint _with_seq = bool(with_seq)
+cdef bint _with_cigar = bool(with_cigar)
+cdef object (*make_tuple)(object, object, object, uint32_t, int)
+# TODO: this method performs no checking and assumes that
+# read sequence, cigar and MD tag are consistent.
+if _with_seq:
+# force_str required for py2/py3 compatibility
+ref_seq = force_str(build_reference_sequence(src))
+if ref_seq is None:
+raise ValueError("MD tag not present")
+make_tuple = _alignedpairs_with_seq_cigar if _with_cigar else _alignedpairs_with_seq
+else:
+ref_seq = None
+make_tuple = _alignedpairs_with_cigar if _with_cigar else _alignedpairs_positions
+r_idx = 0
+if pysam_get_n_cigar(src) == 0:
+return []
+result = []
+pos = src.core.pos
+qpos = 0
+cigar_p = pysam_bam_get_cigar(src)
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+l = cigar_p[k] >> BAM_CIGAR_SHIFT
+if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+for i from pos <= i < pos + l:
+result.append(make_tuple(qpos, i, ref_seq, r_idx, op))
+r_idx += 1
+qpos += 1
+pos += l
+elif op == BAM_CINS or op == BAM_CSOFT_CLIP or op == BAM_CPAD:
+if not _matches_only:
+for i from pos <= i < pos + l:
+result.append(make_tuple(qpos, None, None, 0, op))
+qpos += 1
+else:
+qpos += l
+elif op == BAM_CDEL:
+if not _matches_only:
+for i from pos <= i < pos + l:
+result.append(make_tuple(None, i, ref_seq, r_idx, op))
+r_idx += 1
+else:
+r_idx += l
+pos += l
+elif op == BAM_CHARD_CLIP:
+pass # advances neither
+elif op == BAM_CREF_SKIP:
+if not _matches_only:
+for i from pos <= i < pos + l:
+result.append(make_tuple(None, i, None, 0, op))
+pos += l
+return result
+def get_blocks(self):
+""" a list of start and end positions of
+aligned gapless blocks.
+The start and end positions are in genomic
+coordinates.
+Blocks are not normalized, i.e. two blocks
+might be directly adjacent. This happens if
+the two blocks are separated by an insertion
+in the read.
+"""
+cdef uint32_t k, pos, l
+cdef int op
+cdef uint32_t * cigar_p
+cdef bam1_t * src
+src = self._delegate
+if pysam_get_n_cigar(src) == 0:
+return []
+result = []
+pos = src.core.pos
+cigar_p = pysam_bam_get_cigar(src)
+l = 0
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+l = cigar_p[k] >> BAM_CIGAR_SHIFT
+if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+result.append((pos, pos + l))
+pos += l
+elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+pos += l
+return result
+def get_overlap(self, uint32_t start, uint32_t end):
+"""return number of aligned bases of read overlapping the interval
+*start* and *end* on the reference sequence.
+Return None if cigar alignment is not available.
+"""
+cdef uint32_t k, i, pos, overlap
+cdef int op, o
+cdef uint32_t * cigar_p
+cdef bam1_t * src
+overlap = 0
+src = self._delegate
+if pysam_get_n_cigar(src) == 0:
+return None
+pos = src.core.pos
+o = 0
+cigar_p = pysam_bam_get_cigar(src)
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+l = cigar_p[k] >> BAM_CIGAR_SHIFT
+if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+o = min( pos + l, end) - max( pos, start )
+if o > 0: overlap += o
+if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP or op == BAM_CEQUAL or op == BAM_CDIFF:
+pos += l
+return overlap
+def get_cigar_stats(self):
+"""summary of operations in cigar string.
+The output order in the array is "MIDNSHP=X" followed by a
+field for the NM tag. If the NM tag is not present, this
+field will always be 0. (Accessing this field via index -1
+avoids changes if more CIGAR operators are added in future.)
++-----+----------------+--------+
+|M    |pysam.CMATCH    |0       |
++-----+----------------+--------+
+|I    |pysam.CINS      |1       |
++-----+----------------+--------+
+|D    |pysam.CDEL      |2       |
++-----+----------------+--------+
+|N    |pysam.CREF_SKIP |3       |
++-----+----------------+--------+
+|S    |pysam.CSOFT_CLIP|4       |
++-----+----------------+--------+
+|H    |pysam.CHARD_CLIP|5       |
++-----+----------------+--------+
+|P    |pysam.CPAD      |6       |
++-----+----------------+--------+
+|=    |pysam.CEQUAL    |7       |
++-----+----------------+--------+
+|X    |pysam.CDIFF     |8       |
++-----+----------------+--------+
+|B    |pysam.CBACK     |9       |
++-----+----------------+--------+
+|NM   |NM tag          |10 or -1|
++-----+----------------+--------+
+If no cigar string is present, empty arrays will be returned.
+Returns:
+arrays :
+two arrays. The first contains the nucleotide counts within
+each cigar operation, the second contains the number of blocks
+for each cigar operation.
+"""
+cdef int nfields = NCIGAR_CODES + 1
+cdef c_array.array base_counts = array.array(
+"I",
+[0] * nfields)
+cdef uint32_t [:] base_view = base_counts
+cdef c_array.array block_counts = array.array(
+"I",
+[0] * nfields)
+cdef uint32_t [:] block_view = block_counts
+cdef bam1_t * src = self._delegate
+cdef int op
+cdef uint32_t l
+cdef int32_t k
+cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+if cigar_p == NULL:
+return None
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+l = cigar_p[k] >> BAM_CIGAR_SHIFT
+base_view[op] += l
+block_view[op] += 1
+cdef uint8_t * v = bam_aux_get(src, 'NM')
+if v != NULL:
+base_view[nfields - 1] = <int32_t>bam_aux2i(v)
+return base_counts, block_counts
+#####################################################
+## Unsorted as yet
+# TODO: capture in CIGAR object
+property cigartuples:
+"""the :term:`cigar` alignment. The alignment
+is returned as a list of tuples of (operation, length).
+If the alignment is not present, None is returned.
+The operations are:
++-----+----------------+-----+
+|M    |pysam.CMATCH    |0    |
++-----+----------------+-----+
+|I    |pysam.CINS      |1    |
++-----+----------------+-----+
+|D    |pysam.CDEL      |2    |
++-----+----------------+-----+
+|N    |pysam.CREF_SKIP |3    |
++-----+----------------+-----+
+|S    |pysam.CSOFT_CLIP|4    |
++-----+----------------+-----+
+|H    |pysam.CHARD_CLIP|5    |
++-----+----------------+-----+
+|P    |pysam.CPAD      |6    |
++-----+----------------+-----+
+|=    |pysam.CEQUAL    |7    |
++-----+----------------+-----+
+|X    |pysam.CDIFF     |8    |
++-----+----------------+-----+
+|B    |pysam.CBACK     |9    |
++-----+----------------+-----+
+.. note::
+The output is a list of (operation, length) tuples, such as
+``[(0, 30)]``.
+This is different from the SAM specification and
+the :attr:`cigarstring` property, which uses a
+(length, operation) order, for example: ``30M``.
+To unset the cigar property, assign an empty list
+or None.
+"""
+def __get__(self):
+cdef uint32_t * cigar_p
+cdef bam1_t * src
+cdef uint32_t op, l
+cdef uint32_t k
+src = self._delegate
+if pysam_get_n_cigar(src) == 0:
+return None
+cigar = []
+cigar_p = pysam_bam_get_cigar(src);
+for k from 0 <= k < pysam_get_n_cigar(src):
+op = cigar_p[k] & BAM_CIGAR_MASK
+l = cigar_p[k] >> BAM_CIGAR_SHIFT
+cigar.append((op, l))
+return cigar
+def __set__(self, values):
+cdef uint32_t * p
+cdef bam1_t * src
+cdef op, l
+cdef int k
+k = 0
+src = self._delegate
+# get location of cigar string
+p = pysam_bam_get_cigar(src)
+# empty values for cigar string
+if values is None:
+values = []
+cdef uint32_t ncigar = len(values)
+cdef bam1_t * retval = pysam_bam_update(src,
+pysam_get_n_cigar(src) * 4,
+ncigar * 4,
+<uint8_t*>p)
+if retval == NULL:
+raise MemoryError("could not allocate memory")
+# length is number of cigar operations, not bytes
+pysam_set_n_cigar(src, ncigar)
+# re-acquire pointer to location in memory
+# as it might have moved
+p = pysam_bam_get_cigar(src)
+# insert cigar operations
+for op, l in values:
+p[k] = l << BAM_CIGAR_SHIFT | op
+k += 1
+## setting the cigar string requires updating the bin
+update_bin(src)
+cpdef set_tag(self,
+tag,
+value,
+value_type=None,
+replace=True):
+"""sets a particular field *tag* to *value* in the optional alignment
+section.
+*value_type* describes the type of *value* that is to entered
+into the alignment record. It can be set explicitly to one of
+the valid one-letter type codes. If unset, an appropriate type
+will be chosen automatically based on the python type of
+*value*.
+An existing value of the same *tag* will be overwritten unless
+*replace* is set to False. This is usually not recommended as a
+tag may only appear once in the optional alignment section.
+If *value* is `None`, the tag will be deleted.
+This method accepts valid SAM specification value types, which
+are::
+A: printable char
+i: signed int
+f: float
+Z: printable string
+H: Byte array in hex format
+B: Integer or numeric array
+Additionally, it will accept the integer BAM types ('cCsSI')
+For htslib compatibility, 'a' is synonymous with 'A' and the
+method accepts a 'd' type code for a double precision float.
+When deducing the type code by the python type of *value*, the
+following mapping is applied::
+i: python int
+f: python float
+Z: python str or bytes
+B: python array.array, list or tuple
+Note that a single character string will be output as 'Z' and
+not 'A' as the former is the more general type.
+"""
+cdef int value_size
+cdef uint8_t tc
+cdef uint8_t * value_ptr
+cdef uint8_t *existing_ptr
+cdef float float_value
+cdef double double_value
+cdef int32_t int32_t_value
+cdef uint32_t uint32_t_value
+cdef int16_t int16_t_value
+cdef uint16_t uint16_t_value
+cdef int8_t int8_t_value
+cdef uint8_t uint8_t_value
+cdef bam1_t * src = self._delegate
+cdef char * _value_type
+cdef c_array.array array_value
+cdef object buffer
+if len(tag) != 2:
+raise ValueError('Invalid tag: %s' % tag)
+tag = force_bytes(tag)
+if replace:
+existing_ptr = bam_aux_get(src, tag)
+if existing_ptr:
+bam_aux_del(src, existing_ptr)
+# setting value to None deletes a tag
+if value is None:
+return
+cdef uint8_t typecode = get_tag_typecode(value, value_type)
+if typecode == 0:
+raise ValueError("can't guess type or invalid type code specified: {} {}".format(
+value, value_type))
+# sam_format1 for typecasting
+if typecode == b'Z':
+value = force_bytes(value)
+value_ptr = <uint8_t*><char*>value
+value_size = len(value)+1
+elif typecode == b'H':
+# Note that hex tags are stored the very same
+# way as Z string.s
+value = force_bytes(value)
+value_ptr = <uint8_t*><char*>value
+value_size = len(value)+1
+elif typecode == b'A' or typecode == b'a':
+value = force_bytes(value)
+value_ptr = <uint8_t*><char*>value
+value_size = sizeof(char)
+typecode = b'A'
+elif typecode == b'i':
+int32_t_value = value
+value_ptr = <uint8_t*>&int32_t_value
+value_size = sizeof(int32_t)
+elif typecode == b'I':
+uint32_t_value = value
+value_ptr = <uint8_t*>&uint32_t_value
+value_size = sizeof(uint32_t)
+elif typecode == b's':
+int16_t_value = value
+value_ptr = <uint8_t*>&int16_t_value
+value_size = sizeof(int16_t)
+elif typecode == b'S':
+uint16_t_value = value
+value_ptr = <uint8_t*>&uint16_t_value
+value_size = sizeof(uint16_t)
+elif typecode == b'c':
+int8_t_value = value
+value_ptr = <uint8_t*>&int8_t_value
+value_size = sizeof(int8_t)
+elif typecode == b'C':
+uint8_t_value = value
+value_ptr = <uint8_t*>&uint8_t_value
+value_size = sizeof(uint8_t)
+elif typecode == b'd':
+double_value = value
+value_ptr = <uint8_t*>&double_value
+value_size = sizeof(double)
+elif typecode == b'f':
+float_value  = value
+value_ptr = <uint8_t*>&float_value
+value_size = sizeof(float)
+elif typecode == b'B':
+# the following goes through python, needs to be cleaned up
+# pack array using struct
+fmt, args = pack_tags([(tag, value, value_type)])
+# remove tag and type code as set by bam_aux_append
+# first four chars of format (<2sB)
+fmt = '<' + fmt[4:]
+# first two values to pack
+args = args[2:]
+value_size = struct.calcsize(fmt)
+# buffer will be freed when object goes out of scope
+buffer = ctypes.create_string_buffer(value_size)
+struct.pack_into(fmt, buffer, 0, *args)
+# bam_aux_append copies data from value_ptr
+bam_aux_append(src,
+tag,
+typecode,
+value_size,
+<uint8_t*>buffer.raw)
+return
+else:
+raise ValueError('unsupported value_type {} in set_option'.format(typecode))
+bam_aux_append(src,
+tag,
+typecode,
+value_size,
+value_ptr)
+cpdef has_tag(self, tag):
+"""returns true if the optional alignment section
+contains a given *tag*."""
+cdef uint8_t * v
+cdef int nvalues
+btag = force_bytes(tag)
+v = bam_aux_get(self._delegate, btag)
+return v != NULL
+cpdef get_tag(self, tag, with_value_type=False):
+"""
+retrieves data from the optional alignment section
+given a two-letter *tag* denoting the field.
+The returned value is cast into an appropriate python type.
+This method is the fastest way to access the optional
+alignment section if only few tags need to be retrieved.
+Possible value types are "AcCsSiIfZHB" (see BAM format
+specification) as well as additional value type 'd' as
+implemented in htslib.
+Parameters:
+tag :
+data tag.
+with_value_type : Optional[bool]
+if set to True, the return value is a tuple of (tag value, type
+code). (default False)
+Returns:
+A python object with the value of the `tag`. The type of the
+object depends on the data type in the data record.
+Raises:
+KeyError
+If `tag` is not present, a KeyError is raised.
+"""
+cdef uint8_t * v
+cdef int nvalues
+btag = force_bytes(tag)
+v = bam_aux_get(self._delegate, btag)
+if v == NULL:
+raise KeyError("tag '%s' not present" % tag)
+if chr(v[0]) == "B":
+auxtype = chr(v[0]) + chr(v[1])
+else:
+auxtype = chr(v[0])
+if auxtype in "iIcCsS":
+value = bam_aux2i(v)
+elif auxtype == 'f' or auxtype == 'F':
+value = bam_aux2f(v)
+elif auxtype == 'd' or auxtype == 'D':
+value = bam_aux2f(v)
+elif auxtype == 'A' or auxtype == 'a':
+# force A to a
+v[0] = b'A'
+# there might a more efficient way
+# to convert a char into a string
+value = '%c' % <char>bam_aux2A(v)
+elif auxtype == 'Z' or auxtype == 'H':
+# Z and H are treated equally as strings in htslib
+value = charptr_to_str(<char*>bam_aux2Z(v))
+elif auxtype[0] == 'B':
+bytesize, nvalues, values = convert_binary_tag(v + 1)
+value = values
+else:
+raise ValueError("unknown auxiliary type '%s'" % auxtype)
+if with_value_type:
+return (value, auxtype)
+else:
+return value
+def get_tags(self, with_value_type=False):
+"""the fields in the optional alignment section.
+Returns a list of all fields in the optional
+alignment section. Values are converted to appropriate python
+values. For example: ``[(NM, 2), (RG, "GJP00TM04")]``
+If *with_value_type* is set, the value type as encode in
+the AlignedSegment record will be returned as well:
+[(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
+This method will convert all values in the optional alignment
+section. When getting only one or few tags, please see
+:meth:`get_tag` for a quicker way to achieve this.
+"""
+cdef char * ctag
+cdef bam1_t * src
+cdef uint8_t * s
+cdef char auxtag[3]
+cdef char auxtype
+cdef uint8_t byte_size
+cdef int32_t nvalues
+src = self._delegate
+if src.l_data == 0:
+return []
+s = pysam_bam_get_aux(src)
+result = []
+auxtag[2] = 0
+while s < (src.data + src.l_data):
+# get tag
+auxtag[0] = s[0]
+auxtag[1] = s[1]
+s += 2
+auxtype = s[0]
+if auxtype in (b'c', b'C'):
+value = <int>bam_aux2i(s)
+s += 1
+elif auxtype in (b's', b'S'):
+value = <int>bam_aux2i(s)
+s += 2
+elif auxtype in (b'i', b'I'):
+value = <int32_t>bam_aux2i(s)
+s += 4
+elif auxtype == b'f':
+value = <float>bam_aux2f(s)
+s += 4
+elif auxtype == b'd':
+value = <double>bam_aux2f(s)
+s += 8
+elif auxtype in (b'A', b'a'):
+value = "%c" % <char>bam_aux2A(s)
+s += 1
+elif auxtype in (b'Z', b'H'):
+value = charptr_to_str(<char*>bam_aux2Z(s))
+# +1 for NULL terminated string
+s += len(value) + 1
+elif auxtype == b'B':
+s += 1
+byte_size, nvalues, value = convert_binary_tag(s)
+# 5 for 1 char and 1 int
+s += 5 + (nvalues * byte_size) - 1
+else:
+raise KeyError("unknown type '%s'" % auxtype)
+s += 1
+if with_value_type:
+result.append((charptr_to_str(auxtag), value, chr(auxtype)))
+else:
+result.append((charptr_to_str(auxtag), value))
+return result
+def set_tags(self, tags):
+"""sets the fields in the optional alignment section with
+a list of (tag, value) tuples.
+The value type of the values is determined from the
+python type. Optionally, a type may be given explicitly as
+a third value in the tuple, For example:
+x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
+This method will not enforce the rule that the same tag may appear
+only once in the optional alignment section.
+"""
+cdef bam1_t * src
+cdef uint8_t * s
+cdef char * temp
+cdef int new_size = 0
+cdef int old_size
+src = self._delegate
+# convert and pack the data
+if tags is not None and len(tags) > 0:
+fmt, args = pack_tags(tags)
+new_size = struct.calcsize(fmt)
+buffer = ctypes.create_string_buffer(new_size)
+struct.pack_into(fmt,
+buffer,
+0,
+*args)
+# delete the old data and allocate new space.
+# If total_size == 0, the aux field will be
+# empty
+old_size = pysam_bam_get_l_aux(src)
+cdef bam1_t * retval = pysam_bam_update(src,
+old_size,
+new_size,
+pysam_bam_get_aux(src))
+if retval == NULL:
+raise MemoryError("could not allocate memory")
+# copy data only if there is any
+if new_size > 0:
+# get location of new data
+s = pysam_bam_get_aux(src)
+# check if there is direct path from buffer.raw to tmp
+p = buffer.raw
+# create handle to make sure buffer stays alive long
+# enough for memcpy, see issue 129
+temp = p
+memcpy(s, temp, new_size)
+########################################################
+# Compatibility Accessors
+# Functions, properties for compatibility with pysam < 0.8
+#
+# Several options
+#     change the factory functions according to API
+#         * requires code changes throughout, incl passing
+#           handles to factory functions
+#     subclass functions and add attributes at runtime
+#         e.g.: AlignedSegments.qname = AlignedSegments.query_name
+#         * will slow down the default interface
+#     explicit declaration of getters/setters
+########################################################
+property qname:
+"""deprecated, use :attr:`query_name` instead."""
+def __get__(self): return self.query_name
+def __set__(self, v): self.query_name = v
+property tid:
+"""deprecated, use :attr:`reference_id` instead."""
+def __get__(self): return self.reference_id
+def __set__(self, v): self.reference_id = v
+property pos:
+"""deprecated, use :attr:`reference_start` instead."""
+def __get__(self): return self.reference_start
+def __set__(self, v): self.reference_start = v
+property mapq:
+"""deprecated, use :attr:`mapping_quality` instead."""
+def __get__(self): return self.mapping_quality
+def __set__(self, v): self.mapping_quality = v
+property rnext:
+"""deprecated, use :attr:`next_reference_id` instead."""
+def __get__(self): return self.next_reference_id
+def __set__(self, v): self.next_reference_id = v
+property pnext:
+"""deprecated, use :attr:`next_reference_start` instead."""
+def __get__(self):
+return self.next_reference_start
+def __set__(self, v):
+self.next_reference_start = v
+property cigar:
+"""deprecated, use :attr:`cigarstring` or :attr:`cigartuples` instead."""
+def __get__(self):
+r = self.cigartuples
+if r is None:
+r = []
+return r
+def __set__(self, v): self.cigartuples = v
+property tlen:
+"""deprecated, use :attr:`template_length` instead."""
+def __get__(self):
+return self.template_length
+def __set__(self, v):
+self.template_length = v
+property seq:
+"""deprecated, use :attr:`query_sequence` instead."""
+def __get__(self):
+return self.query_sequence
+def __set__(self, v):
+self.query_sequence = v
+property qual:
+"""deprecated, use :attr:`query_qualities` instead."""
+def __get__(self):
+return array_to_qualitystring(self.query_qualities)
+def __set__(self, v):
+self.query_qualities = qualitystring_to_array(v)
+property alen:
+"""deprecated, use :attr:`reference_length` instead."""
+def __get__(self):
+return self.reference_length
+def __set__(self, v):
+self.reference_length = v
+property aend:
+"""deprecated, use :attr:`reference_end` instead."""
+def __get__(self):
+return self.reference_end
+def __set__(self, v):
+self.reference_end = v
+property rlen:
+"""deprecated, use :attr:`query_length` instead."""
+def __get__(self):
+return self.query_length
+def __set__(self, v):
+self.query_length = v
+property query:
+"""deprecated, use :attr:`query_alignment_sequence`
+instead."""
+def __get__(self):
+return self.query_alignment_sequence
+def __set__(self, v):
+self.query_alignment_sequence = v
+property qqual:
+"""deprecated, use :attr:`query_alignment_qualities`
+instead."""
+def __get__(self):
+return array_to_qualitystring(self.query_alignment_qualities)
+def __set__(self, v):
+self.query_alignment_qualities = qualitystring_to_array(v)
+property qstart:
+"""deprecated, use :attr:`query_alignment_start` instead."""
+def __get__(self):
+return self.query_alignment_start
+def __set__(self, v):
+self.query_alignment_start = v
+property qend:
+"""deprecated, use :attr:`query_alignment_end` instead."""
+def __get__(self):
+return self.query_alignment_end
+def __set__(self, v):
+self.query_alignment_end = v
+property qlen:
+"""deprecated, use :attr:`query_alignment_length`
+instead."""
+def __get__(self):
+return self.query_alignment_length
+def __set__(self, v):
+self.query_alignment_length = v
+property mrnm:
+"""deprecated, use :attr:`next_reference_id` instead."""
+def __get__(self):
+return self.next_reference_id
+def __set__(self, v):
+self.next_reference_id = v
+property mpos:
+"""deprecated, use :attr:`next_reference_start`
+instead."""
+def __get__(self):
+return self.next_reference_start
+def __set__(self, v):
+self.next_reference_start = v
+property rname:
+"""deprecated, use :attr:`reference_id` instead."""
+def __get__(self):
+return self.reference_id
+def __set__(self, v):
+self.reference_id = v
+property isize:
+"""deprecated, use :attr:`template_length` instead."""
+def __get__(self):
+return self.template_length
+def __set__(self, v):
+self.template_length = v
+property blocks:
+"""deprecated, use :meth:`get_blocks()` instead."""
+def __get__(self):
+return self.get_blocks()
+property aligned_pairs:
+"""deprecated, use :meth:`get_aligned_pairs()` instead."""
+def __get__(self):
+return self.get_aligned_pairs()
+property inferred_length:
+"""deprecated, use :meth:`infer_query_length()` instead."""
+def __get__(self):
+return self.infer_query_length()
+property positions:
+"""deprecated, use :meth:`get_reference_positions()` instead."""
+def __get__(self):
+return self.get_reference_positions()
+property tags:
+"""deprecated, use :meth:`get_tags()` instead."""
+def __get__(self):
+return self.get_tags()
+def __set__(self, tags):
+self.set_tags(tags)
+def overlap(self):
+"""deprecated, use :meth:`get_overlap()` instead."""
+return self.get_overlap()
+def opt(self, tag):
+"""deprecated, use :meth:`get_tag()` instead."""
+return self.get_tag(tag)
+def setTag(self, tag, value, value_type=None, replace=True):
+"""deprecated, use :meth:`set_tag()` instead."""
+return self.set_tag(tag, value, value_type, replace)
+cdef class PileupColumn:
+'''A pileup of reads at a particular reference sequence position
+(:term:`column`). A pileup column contains all the reads that map
+to a certain target base.
+This class is a proxy for results returned by the samtools pileup
+engine.  If the underlying engine iterator advances, the results
+of this column will change.
+'''
+def __init__(self):
+raise TypeError("this class cannot be instantiated from Python")
+def __str__(self):
+return "\t".join(map(str,
+(self.reference_id,
+self.reference_pos,
+self.nsegments))) +\
+"\n" +\
+"\n".join(map(str, self.pileups))
+def __dealloc__(self):
+free(self.buf.s)
+def set_min_base_quality(self, min_base_quality):
+"""set the minimum base quality for this pileup column.
+"""
+self.min_base_quality = min_base_quality
+def __len__(self):
+"""return number of reads aligned to this column.
+see :meth:`get_num_aligned`
+"""
+return self.get_num_aligned()
+property reference_id:
+'''the reference sequence number as defined in the header'''
+def __get__(self):
+return self.tid
+property reference_name:
+""":term:`reference` name (None if no AlignmentFile is associated)"""
+def __get__(self):
+if self.header is not None:
+return self.header.get_reference_name(self.tid)
+return None
+property nsegments:
+'''number of reads mapping to this column.
+Note that this number ignores the base quality filter.'''
+def __get__(self):
+return self.n_pu
+def __set__(self, n):
+self.n_pu = n
+property reference_pos:
+'''the position in the reference sequence (0-based).'''
+def __get__(self):
+return self.pos
+property pileups:
+'''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
+def __get__(self):
+if self.plp == NULL or self.plp[0] == NULL:
+raise ValueError("PileupColumn accessed after iterator finished")
+cdef int x
+cdef const bam_pileup1_t * p = NULL
+pileups = []
+# warning: there could be problems if self.n and self.buf are
+# out of sync.
+for x from 0 <= x < self.n_pu:
+p = &(self.plp[0][x])
+if p == NULL:
+raise ValueError(
+"pileup buffer out of sync - most likely use of iterator "
+"outside loop")
+if pileup_base_qual_skip(p, self.min_base_quality):
+continue
+pileups.append(makePileupRead(p, self.header))
+return pileups
+########################################################
+# Compatibility Accessors
+# Functions, properties for compatibility with pysam < 0.8
+########################################################
+property pos:
+"""deprecated, use :attr:`reference_pos` instead."""
+def __get__(self):
+return self.reference_pos
+def __set__(self, v):
+self.reference_pos = v
+property tid:
+"""deprecated, use :attr:`reference_id` instead."""
+def __get__(self):
+return self.reference_id
+def __set__(self, v):
+self.reference_id = v
+property n:
+"""deprecated, use :attr:`nsegments` instead."""
+def __get__(self):
+return self.nsegments
+def __set__(self, v):
+self.nsegments = v
+def get_num_aligned(self):
+"""return number of aligned bases at pileup column position.
+This method applies a base quality filter and the number is
+equal to the size of :meth:`get_query_sequences`,
+:meth:`get_mapping_qualities`, etc.
+"""
+cdef uint32_t x = 0
+cdef uint32_t c = 0
+cdef uint32_t cnt = 0
+cdef const bam_pileup1_t * p = NULL
+if self.plp == NULL or self.plp[0] == NULL:
+raise ValueError("PileupColumn accessed after iterator finished")
+for x from 0 <= x < self.n_pu:
+p = &(self.plp[0][x])
+if p == NULL:
+raise ValueError(
+"pileup buffer out of sync - most likely use of iterator "
+"outside loop")
+if pileup_base_qual_skip(p, self.min_base_quality):
+continue
+cnt += 1
+return cnt
+def get_query_sequences(self, bint mark_matches=False, bint mark_ends=False, bint add_indels=False):
+"""query bases/sequences at pileup column position.
+Optionally, the bases/sequences can be annotated according to the samtools
+mpileup format. This is the format description from the samtools mpileup tool::
+Information on match, mismatch, indel, strand, mapping
+quality and start and end of a read are all encoded at the
+read base column. At this column, a dot stands for a match
+to the reference base on the forward strand, a comma for a
+match on the reverse strand, a '>' or '<' for a reference
+skip, `ACGTN' for a mismatch on the forward strand and
+`acgtn' for a mismatch on the reverse strand. A pattern
+`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion
+between this reference position and the next reference
+position. The length of the insertion is given by the
+integer in the pattern, followed by the inserted
+sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
+represents a deletion from the reference. The deleted bases
+will be presented as `*' in the following lines. Also at
+the read base column, a symbol `^' marks the start of a
+read. The ASCII of the character following `^' minus 33
+gives the mapping quality. A symbol `$' marks the end of a
+read segment
+To reproduce samtools mpileup format, set all of mark_matches,
+mark_ends and add_indels to True.
+Parameters
+----------
+mark_matches: bool
+If True, output bases matching the reference as "." or ","
+for forward and reverse strand, respectively. This mark
+requires the reference sequence. If no reference is
+present, this option is ignored.
+mark_ends : bool
+If True, add markers "^" and "$" for read start and end, respectively.
+add_indels : bool
+If True, add bases for bases inserted into or skipped from the
+reference. The latter requires a reference sequence file to have
+been given, e.g. via `pileup(fastafile = ...)`. If no reference
+sequence is available, skipped bases are represented as 'N's.
+Returns
+-------
+a list of bases/sequences per read at pileup column position. : list
+"""
+cdef uint32_t x = 0
+cdef uint32_t j = 0
+cdef uint32_t c = 0
+cdef uint8_t cc = 0
+cdef uint8_t rb = 0
+cdef kstring_t * buf = &self.buf
+cdef const bam_pileup1_t * p = NULL
+if self.plp == NULL or self.plp[0] == NULL:
+raise ValueError("PileupColumn accessed after iterator finished")
+buf.l = 0
+# todo: reference sequence to count matches/mismatches
+# todo: convert assertions to exceptions
+for x from 0 <= x < self.n_pu:
+p = &(self.plp[0][x])
+if p == NULL:
+raise ValueError(
+"pileup buffer out of sync - most likely use of iterator "
+"outside loop")
+if pileup_base_qual_skip(p, self.min_base_quality):
+continue
+# see samtools pileup_seq
+if mark_ends and p.is_head:
+kputc(b'^', buf)
+if p.b.core.qual > 93:
+kputc(126, buf)
+else:
+kputc(p.b.core.qual + 33, buf)
+if not p.is_del:
+if p.qpos < p.b.core.l_qseq:
+cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
+else:
+cc = b'N'
+if mark_matches and self.reference_sequence != NULL:
+rb = self.reference_sequence[self.reference_pos]
+if seq_nt16_table[cc] == seq_nt16_table[rb]:
+cc = b'='
+kputc(strand_mark_char(cc, p.b), buf)
+elif add_indels:
+if p.is_refskip:
+if bam_is_rev(p.b):
+kputc(b'<', buf)
+else:
+kputc(b'>', buf)
+else:
+kputc(b'*', buf)
+if add_indels:
+if p.indel > 0:
+kputc(b'+', buf)
+kputw(p.indel, buf)
+for j from 1 <= j <= p.indel:
+cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
+kputc(strand_mark_char(cc, p.b), buf)
+elif p.indel < 0:
+kputc(b'-', buf)
+kputw(-p.indel, buf)
+for j from 1 <= j <= -p.indel:
+# TODO: out-of-range check here?
+if self.reference_sequence == NULL:
+cc = b'N'
+else:
+cc = self.reference_sequence[self.reference_pos + j]
+kputc(strand_mark_char(cc, p.b), buf)
+if mark_ends and p.is_tail:
+kputc(b'$', buf)
+kputc(b':', buf)
+if buf.l == 0:
+# could be zero if all qualities are too low
+return ""
+else:
+# quicker to ensemble all and split than to encode all separately.
+# ignore last ":"
+return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":")
+def get_query_qualities(self):
+"""query base quality scores at pileup column position.
+Returns
+-------
+a list of quality scores : list
+"""
+cdef uint32_t x = 0
+cdef const bam_pileup1_t * p = NULL
+cdef uint32_t c = 0
+result = []
+for x from 0 <= x < self.n_pu:
+p = &(self.plp[0][x])
+if p == NULL:
+raise ValueError(
+"pileup buffer out of sync - most likely use of iterator "
+"outside loop")
+if p.qpos < p.b.core.l_qseq:
+c = bam_get_qual(p.b)[p.qpos]
+else:
+c = 0
+if c < self.min_base_quality:
+continue
+result.append(c)
+return result
+def get_mapping_qualities(self):
+"""query mapping quality scores at pileup column position.
+Returns
+-------
+a list of quality scores : list
+"""
+if self.plp == NULL or self.plp[0] == NULL:
+raise ValueError("PileupColumn accessed after iterator finished")
+cdef uint32_t x = 0
+cdef const bam_pileup1_t * p = NULL
+result = []
+for x from 0 <= x < self.n_pu:
+p = &(self.plp[0][x])
+if p == NULL:
+raise ValueError(
+"pileup buffer out of sync - most likely use of iterator "
+"outside loop")
+if pileup_base_qual_skip(p, self.min_base_quality):
+continue
+result.append(p.b.core.qual)
+return result
+def get_query_positions(self):
+"""positions in read at pileup column position.
+Returns
+-------
+a list of read positions : list
+"""
+if self.plp == NULL or self.plp[0] == NULL:
+raise ValueError("PileupColumn accessed after iterator finished")
+cdef uint32_t x = 0
+cdef const bam_pileup1_t * p = NULL
+result = []
+for x from 0 <= x < self.n_pu:
+p = &(self.plp[0][x])
+if p == NULL:
+raise ValueError(
+"pileup buffer out of sync - most likely use of iterator "
+"outside loop")
+if pileup_base_qual_skip(p, self.min_base_quality):
+continue
+result.append(p.qpos)
+return result
+def get_query_names(self):
+"""query/read names aligned at pileup column position.
+Returns
+-------
+a list of query names at pileup column position. : list
+"""
+if self.plp == NULL or self.plp[0] == NULL:
+raise ValueError("PileupColumn accessed after iterator finished")
+cdef uint32_t x = 0
+cdef const bam_pileup1_t * p = NULL
+result = []
+for x from 0 <= x < self.n_pu:
+p = &(self.plp[0][x])
+if p == NULL:
+raise ValueError(
+"pileup buffer out of sync - most likely use of iterator "
+"outside loop")
+if pileup_base_qual_skip(p, self.min_base_quality):
+continue
+result.append(charptr_to_str(pysam_bam_get_qname(p.b)))
+return result
+cdef class PileupRead:
+'''Representation of a read aligned to a particular position in the
+reference sequence.
+'''
+def __init__(self):
+raise TypeError(
+"this class cannot be instantiated from Python")
+def __str__(self):
+return "\t".join(
+map(str,
+(self.alignment, self.query_position,
+self.indel, self.level,
+self.is_del, self.is_head,
+self.is_tail, self.is_refskip)))
+property alignment:
+"""a :class:`pysam.AlignedSegment` object of the aligned read"""
+def __get__(self):
+return self._alignment
+property query_position:
+"""position of the read base at the pileup site, 0-based.
+None if :attr:`is_del` or :attr:`is_refskip` is set.
+"""
+def __get__(self):
+if self.is_del or self.is_refskip:
+return None
+else:
+return self._qpos
+property query_position_or_next:
+"""position of the read base at the pileup site, 0-based.
+If the current position is a deletion, returns the next
+aligned base.
+"""
+def __get__(self):
+return self._qpos
+property indel:
+"""indel length for the position following the current pileup site.
+This quantity peeks ahead to the next cigar operation in this
+alignment. If the next operation is an insertion, indel will
+be positive. If the next operation is a deletion, it will be
+negation. 0 if the next operation is not an indel.
+"""
+def __get__(self):
+return self._indel
+property level:
+"""the level of the read in the "viewer" mode. Note that this value
+is currently not computed."""
+def __get__(self):
+return self._level
+property is_del:
+"""1 iff the base on the padded read is a deletion"""
+def __get__(self):
+return self._is_del
+property is_head:
+"""1 iff the base on the padded read is the left-most base."""
+def __get__(self):
+return self._is_head
+property is_tail:
+"""1 iff the base on the padded read is the right-most base."""
+def __get__(self):
+return self._is_tail
+property is_refskip:
+"""1 iff the base on the padded read is part of CIGAR N op."""
+def __get__(self):
+return self._is_refskip
+cpdef enum CIGAR_OPS:
+CMATCH = 0
+CINS = 1
+CDEL = 2
+CREF_SKIP = 3
+CSOFT_CLIP = 4
+CHARD_CLIP = 5
+CPAD = 6
+CEQUAL = 7
+CDIFF = 8
+CBACK = 9
+cpdef enum SAM_FLAGS:
+# the read is paired in sequencing, no matter whether it is mapped in a pair
+FPAIRED = 1
+# the read is mapped in a proper pair
+FPROPER_PAIR = 2
+# the read itself is unmapped; conflictive with FPROPER_PAIR
+FUNMAP = 4
+# the mate is unmapped
+FMUNMAP = 8
+# the read is mapped to the reverse strand
+FREVERSE = 16
+# the mate is mapped to the reverse strand
+FMREVERSE = 32
+# this is read1
+FREAD1 = 64
+# this is read2
+FREAD2 = 128
+# not primary alignment
+FSECONDARY = 256
+# QC failure
+FQCFAIL = 512
+# optical or PCR duplicate
+FDUP = 1024
+# supplementary alignment
+FSUPPLEMENTARY = 2048
+__all__ = [
+"AlignedSegment",
+"PileupColumn",
+"PileupRead",
+"CMATCH",
+"CINS",
+"CDEL",
+"CREF_SKIP",
+"CSOFT_CLIP",
+"CHARD_CLIP",
+"CPAD",
+"CEQUAL",
+"CDIFF",
+"CBACK",
+"FPAIRED",
+"FPROPER_PAIR",
+"FUNMAP",
+"FMUNMAP",
+"FREVERSE",
+"FMREVERSE",
+"FREAD1",
+"FREAD2",
+"FSECONDARY",
+"FQCFAIL",
+"FDUP",
+"FSUPPLEMENTARY",
+"KEY_NAMES"]

Mercurial > repos > rliterman > csp2

comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignedsegment.pyx @ 68:5028fdace37b