comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignedsegment.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 # cython: language_level=3
2 # cython: embedsignature=True
3 # cython: profile=True
4 ###############################################################################
5 ###############################################################################
6 # Cython wrapper for SAM/BAM/CRAM files based on htslib
7 ###############################################################################
8 # The principal classes defined in this module are:
9 #
10 # class AlignedSegment an aligned segment (read)
11 #
12 # class PileupColumn a collection of segments (PileupRead) aligned to
13 # a particular genomic position.
14 #
15 # class PileupRead an AlignedSegment aligned to a particular genomic
16 # position. Contains additional attributes with respect
17 # to this.
18 #
19 # Additionally this module defines numerous additional classes that are part
20 # of the internal API. These are:
21 #
22 # Various iterator classes to iterate over alignments in sequential (IteratorRow)
23 # or in a stacked fashion (IteratorColumn):
24 #
25 # class IteratorRow
26 # class IteratorRowRegion
27 # class IteratorRowHead
28 # class IteratorRowAll
29 # class IteratorRowAllRefs
30 # class IteratorRowSelection
31 #
32 ###############################################################################
33 #
34 # The MIT License
35 #
36 # Copyright (c) 2015 Andreas Heger
37 #
38 # Permission is hereby granted, free of charge, to any person obtaining a
39 # copy of this software and associated documentation files (the "Software"),
40 # to deal in the Software without restriction, including without limitation
41 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
42 # and/or sell copies of the Software, and to permit persons to whom the
43 # Software is furnished to do so, subject to the following conditions:
44 #
45 # The above copyright notice and this permission notice shall be included in
46 # all copies or substantial portions of the Software.
47 #
48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
54 # DEALINGS IN THE SOFTWARE.
55 #
56 ###############################################################################
57 import re
58 import array
59 import json
60 import string
61 import ctypes
62 import struct
63
64 cimport cython
65 from cpython cimport array as c_array
66 from cpython cimport PyBytes_FromStringAndSize
67 from libc.string cimport memset, strchr
68 from cpython cimport array as c_array
69 from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
70 INT8_MAX, INT16_MAX, INT32_MAX, \
71 UINT8_MAX, UINT16_MAX, UINT32_MAX
72
73 from pysam.libchtslib cimport HTS_IDX_NOCOOR
74 from pysam.libcutils cimport force_bytes, force_str, \
75 charptr_to_str, charptr_to_bytes
76 from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \
77 array_to_qualitystring
78
79 # Constants for binary tag conversion
80 cdef char * htslib_types = 'cCsSiIf'
81 cdef char * parray_types = 'bBhHiIf'
82
83 # translation tables
84
85 # cigar code to character and vice versa
86 cdef char* CODE2CIGAR= "MIDNSHP=XB"
87 cdef int NCIGAR_CODES = 10
88
89 CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
90 CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
91
92 # names for keys in dictionary representation of an AlignedSegment
93 KEY_NAMES = ["name", "flag", "ref_name", "ref_pos", "map_quality", "cigar",
94 "next_ref_name", "next_ref_pos", "length", "seq", "qual", "tags"]
95
96 #####################################################################
97 # C multiplication with wrapping around
98 cdef inline uint32_t c_mul(uint32_t a, uint32_t b):
99 return (a * b) & 0xffffffff
100
101
102 cdef inline uint8_t tolower(uint8_t ch):
103 if ch >= 65 and ch <= 90:
104 return ch + 32
105 else:
106 return ch
107
108
109 cdef inline uint8_t toupper(uint8_t ch):
110 if ch >= 97 and ch <= 122:
111 return ch - 32
112 else:
113 return ch
114
115
116 cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b):
117 if ch == b'=':
118 if bam_is_rev(b):
119 return b','
120 else:
121 return b'.'
122 else:
123 if bam_is_rev(b):
124 return tolower(ch)
125 else:
126 return toupper(ch)
127
128
129 cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold):
130 cdef uint32_t c
131 if p.qpos < p.b.core.l_qseq:
132 c = bam_get_qual(p.b)[p.qpos]
133 else:
134 c = 0
135 if c < threshold:
136 return True
137 return False
138
139
140 cdef inline char map_typecode_htslib_to_python(uint8_t s):
141 """map an htslib typecode to the corresponding python typecode
142 to be used in the struct or array modules."""
143
144 # map type from htslib to python array
145 cdef char * f = strchr(htslib_types, s)
146
147 if f == NULL:
148 return 0
149 return parray_types[f - htslib_types]
150
151
152 cdef inline uint8_t map_typecode_python_to_htslib(char s):
153 """determine value type from type code of array"""
154 cdef char * f = strchr(parray_types, s)
155 if f == NULL:
156 return 0
157 return htslib_types[f - parray_types]
158
159
160 cdef inline void update_bin(bam1_t * src):
161 if src.core.flag & BAM_FUNMAP:
162 # treat alignment as length of 1 for unmapped reads
163 src.core.bin = hts_reg2bin(
164 src.core.pos,
165 src.core.pos + 1,
166 14,
167 5)
168 elif pysam_get_n_cigar(src):
169 src.core.bin = hts_reg2bin(
170 src.core.pos,
171 bam_endpos(src),
172 14,
173 5)
174 else:
175 src.core.bin = hts_reg2bin(
176 src.core.pos,
177 src.core.pos + 1,
178 14,
179 5)
180
181
182 # optional tag data manipulation
183 cdef convert_binary_tag(uint8_t * tag):
184 """return bytesize, number of values and array of values
185 in aux_data memory location pointed to by tag."""
186 cdef uint8_t auxtype
187 cdef uint8_t byte_size
188 cdef int32_t nvalues
189 # get byte size
190 auxtype = tag[0]
191 byte_size = aux_type2size(auxtype)
192 tag += 1
193 # get number of values in array
194 nvalues = (<int32_t*>tag)[0]
195 tag += 4
196
197 # define python array
198 cdef c_array.array c_values = array.array(
199 chr(map_typecode_htslib_to_python(auxtype)))
200 c_array.resize(c_values, nvalues)
201
202 # copy data
203 memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size)
204
205 # no need to check for endian-ness as bam1_core_t fields
206 # and aux_data are in host endian-ness. See sam.c and calls
207 # to swap_data
208 return byte_size, nvalues, c_values
209
210
211 cdef inline uint8_t get_tag_typecode(value, value_type=None):
212 """guess type code for a *value*. If *value_type* is None, the type
213 code will be inferred based on the Python type of *value*
214
215 """
216 # 0 is unknown typecode
217 cdef char typecode = 0
218
219 if value_type is None:
220 if isinstance(value, int):
221 if value < 0:
222 if value >= INT8_MIN:
223 typecode = b'c'
224 elif value >= INT16_MIN:
225 typecode = b's'
226 elif value >= INT32_MIN:
227 typecode = b'i'
228 # unsigned ints
229 else:
230 if value <= UINT8_MAX:
231 typecode = b'C'
232 elif value <= UINT16_MAX:
233 typecode = b'S'
234 elif value <= UINT32_MAX:
235 typecode = b'I'
236 elif isinstance(value, float):
237 typecode = b'f'
238 elif isinstance(value, str):
239 typecode = b'Z'
240 elif isinstance(value, bytes):
241 typecode = b'Z'
242 elif isinstance(value, array.array) or \
243 isinstance(value, list) or \
244 isinstance(value, tuple):
245 typecode = b'B'
246 else:
247 if value_type in 'aAsSIcCZidfH':
248 typecode = force_bytes(value_type)[0]
249
250 return typecode
251
252
253 cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None):
254 '''returns the value typecode of a value.
255
256 If max is specified, the appropriate type is returned for a range
257 where value is the minimum.
258
259 Note that this method returns types from the extended BAM alphabet
260 of types that includes tags that are not part of the SAM
261 specification.
262 '''
263
264
265 cdef uint8_t typecode
266
267 t = type(value)
268
269 if t is float:
270 typecode = b'f'
271 elif t is int:
272 if max_value is None:
273 max_value = value
274 if min_value is None:
275 min_value = value
276 # signed ints
277 if min_value < 0:
278 if min_value >= INT8_MIN and max_value <= INT8_MAX:
279 typecode = b'c'
280 elif min_value >= INT16_MIN and max_value <= INT16_MAX:
281 typecode = b's'
282 elif min_value >= INT32_MIN or max_value <= INT32_MAX:
283 typecode = b'i'
284 else:
285 raise ValueError(
286 "at least one signed integer out of range of "
287 "BAM/SAM specification")
288 # unsigned ints
289 else:
290 if max_value <= UINT8_MAX:
291 typecode = b'C'
292 elif max_value <= UINT16_MAX:
293 typecode = b'S'
294 elif max_value <= UINT32_MAX:
295 typecode = b'I'
296 else:
297 raise ValueError(
298 "at least one integer out of range of BAM/SAM specification")
299 else:
300 # Note: hex strings (H) are not supported yet
301 if t is not bytes:
302 value = value.encode('ascii')
303 if len(value) == 1:
304 typecode = b'A'
305 else:
306 typecode = b'Z'
307
308 return typecode
309
310
311 # mapping python array.array and htslib typecodes to struct typecodes
312 DATATYPE2FORMAT = {
313 ord('c'): ('b', 1),
314 ord('C'): ('B', 1),
315 ord('s'): ('h', 2),
316 ord('S'): ('H', 2),
317 ord('i'): ('i', 4),
318 ord('I'): ('I', 4),
319 ord('f'): ('f', 4),
320 ord('d'): ('d', 8),
321 ord('A'): ('c', 1),
322 ord('a'): ('c', 1)}
323
324
325 cdef inline pack_tags(tags):
326 """pack a list of tags. Each tag is a tuple of (tag, tuple).
327
328 Values are packed into the most space efficient data structure
329 possible unless the tag contains a third field with the typecode.
330
331 Returns a format string and the associated list of arguments to be
332 used in a call to struct.pack_into.
333 """
334 fmts, args = ["<"], []
335
336 # htslib typecode
337 cdef uint8_t typecode
338 for tag in tags:
339
340 if len(tag) == 2:
341 pytag, value = tag
342 valuetype = None
343 elif len(tag) == 3:
344 pytag, value, valuetype = tag
345 else:
346 raise ValueError("malformatted tag: %s" % str(tag))
347
348 if valuetype is None:
349 typecode = 0
350 else:
351 # only first character in valuecode matters
352 typecode = force_bytes(valuetype)[0]
353
354 pytag = force_bytes(pytag)
355 pytype = type(value)
356
357 if pytype is tuple or pytype is list:
358 # binary tags from tuples or lists
359 if not typecode:
360 # automatically determine value type - first value
361 # determines type. If there is a mix of types, the
362 # result is undefined.
363 typecode = get_btag_typecode(min(value),
364 min_value=min(value),
365 max_value=max(value))
366
367 if typecode not in DATATYPE2FORMAT:
368 raise ValueError("invalid value type '{}'".format(chr(typecode)))
369
370 datafmt = "2sBBI%i%s" % (len(value), DATATYPE2FORMAT[typecode][0])
371 args.extend([pytag[:2],
372 ord("B"),
373 typecode,
374 len(value)] + list(value))
375
376 elif isinstance(value, array.array):
377 # binary tags from arrays
378 if typecode == 0:
379 typecode = map_typecode_python_to_htslib(ord(value.typecode))
380
381 if typecode == 0:
382 raise ValueError("unsupported type code '{}'".format(value.typecode))
383
384 if typecode not in DATATYPE2FORMAT:
385 raise ValueError("invalid value type '{}'".format(chr(typecode)))
386
387 # use array.tostring() to retrieve byte representation and
388 # save as bytes
389 datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1])
390 args.extend([pytag[:2],
391 ord("B"),
392 typecode,
393 len(value),
394 value.tobytes()])
395
396 else:
397 if typecode == 0:
398 typecode = get_tag_typecode(value)
399 if typecode == 0:
400 raise ValueError("could not deduce typecode for value {}".format(value))
401
402 if typecode == b'a' or typecode == b'A' or typecode == b'Z' or typecode == b'H':
403 value = force_bytes(value)
404
405 if typecode == b"a":
406 typecode = b'A'
407
408 if typecode == b'Z' or typecode == b'H':
409 datafmt = "2sB%is" % (len(value)+1)
410 elif typecode in DATATYPE2FORMAT:
411 datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0]
412 else:
413 raise ValueError("invalid value type '{}'".format(chr(typecode)))
414
415 args.extend([pytag[:2],
416 typecode,
417 value])
418
419 fmts.append(datafmt)
420
421 return "".join(fmts), args
422
423
424 cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src):
425 """return query length computed from CIGAR alignment.
426
427 Length ignores hard-clipped bases.
428
429 Return 0 if there is no CIGAR alignment.
430 """
431
432 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
433
434 if cigar_p == NULL:
435 return 0
436
437 cdef uint32_t k, qpos
438 cdef int op
439 qpos = 0
440
441 for k from 0 <= k < pysam_get_n_cigar(src):
442 op = cigar_p[k] & BAM_CIGAR_MASK
443
444 if op == BAM_CMATCH or \
445 op == BAM_CINS or \
446 op == BAM_CSOFT_CLIP or \
447 op == BAM_CEQUAL or \
448 op == BAM_CDIFF:
449 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
450
451 return qpos
452
453
454 cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src):
455 """return query length computed from CIGAR alignment.
456
457 Length includes hard-clipped bases.
458
459 Return 0 if there is no CIGAR alignment.
460 """
461
462 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
463
464 if cigar_p == NULL:
465 return 0
466
467 cdef uint32_t k, qpos
468 cdef int op
469 qpos = 0
470
471 for k from 0 <= k < pysam_get_n_cigar(src):
472 op = cigar_p[k] & BAM_CIGAR_MASK
473
474 if op == BAM_CMATCH or \
475 op == BAM_CINS or \
476 op == BAM_CSOFT_CLIP or \
477 op == BAM_CHARD_CLIP or \
478 op == BAM_CEQUAL or \
479 op == BAM_CDIFF:
480 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
481
482 return qpos
483
484
485 cdef inline int32_t getQueryStart(bam1_t *src) except -1:
486 cdef uint32_t * cigar_p
487 cdef uint32_t start_offset = 0
488 cdef uint32_t k, op
489
490 cigar_p = pysam_bam_get_cigar(src);
491 for k from 0 <= k < pysam_get_n_cigar(src):
492 op = cigar_p[k] & BAM_CIGAR_MASK
493 if op == BAM_CHARD_CLIP:
494 if start_offset != 0 and start_offset != src.core.l_qseq:
495 raise ValueError('Invalid clipping in CIGAR string')
496 elif op == BAM_CSOFT_CLIP:
497 start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
498 else:
499 break
500
501 return start_offset
502
503
504 cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
505 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
506 cdef uint32_t end_offset = src.core.l_qseq
507 cdef uint32_t k, op
508
509 # if there is no sequence, compute length from cigar string
510 if end_offset == 0:
511 for k from 0 <= k < pysam_get_n_cigar(src):
512 op = cigar_p[k] & BAM_CIGAR_MASK
513 if op == BAM_CMATCH or \
514 op == BAM_CINS or \
515 op == BAM_CEQUAL or \
516 op == BAM_CDIFF or \
517 (op == BAM_CSOFT_CLIP and end_offset == 0):
518 end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
519 else:
520 # walk backwards in cigar string
521 for k from pysam_get_n_cigar(src) > k >= 1:
522 op = cigar_p[k] & BAM_CIGAR_MASK
523 if op == BAM_CHARD_CLIP:
524 if end_offset != src.core.l_qseq:
525 raise ValueError('Invalid clipping in CIGAR string')
526 elif op == BAM_CSOFT_CLIP:
527 end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
528 else:
529 break
530
531 return end_offset
532
533
534 cdef inline bytes getSequenceInRange(bam1_t *src,
535 uint32_t start,
536 uint32_t end):
537 """return python string of the sequence in a bam1_t object.
538 """
539
540 cdef uint8_t * p
541 cdef uint32_t k
542 cdef char * s
543
544 if not src.core.l_qseq:
545 return None
546
547 seq = PyBytes_FromStringAndSize(NULL, end - start)
548 s = <char*>seq
549 p = pysam_bam_get_seq(src)
550
551 for k from start <= k < end:
552 # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
553 # note: do not use string literal as it will be a python string
554 s[k-start] = seq_nt16_str[p[k//2] >> 4 * (1 - k%2) & 0xf]
555
556 return charptr_to_bytes(seq)
557
558
559 cdef inline object getQualitiesInRange(bam1_t *src,
560 uint32_t start,
561 uint32_t end):
562 """return python array of quality values from a bam1_t object"""
563
564 cdef uint8_t * p
565 cdef uint32_t k
566
567 p = pysam_bam_get_qual(src)
568 if p[0] == 0xff:
569 return None
570
571 # 'B': unsigned char
572 cdef c_array.array result = array.array('B', [0])
573 c_array.resize(result, end - start)
574
575 # copy data
576 memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
577
578 return result
579
580
581 #####################################################################
582 ## factory methods for instantiating extension classes
583 cdef class AlignedSegment
584 cdef AlignedSegment makeAlignedSegment(bam1_t *src,
585 AlignmentHeader header):
586 '''return an AlignedSegment object constructed from `src`'''
587 # note that the following does not call __init__
588 cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
589 dest._delegate = bam_dup1(src)
590 dest.header = header
591 return dest
592
593
594 cdef class PileupColumn
595 cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp,
596 int tid,
597 int pos,
598 int n_pu,
599 uint32_t min_base_quality,
600 char * reference_sequence,
601 AlignmentHeader header):
602 '''return a PileupColumn object constructed from pileup in `plp` and
603 setting additional attributes.
604
605 '''
606 # note that the following does not call __init__
607 cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
608 dest.header = header
609 dest.plp = plp
610 dest.tid = tid
611 dest.pos = pos
612 dest.n_pu = n_pu
613 dest.min_base_quality = min_base_quality
614 dest.reference_sequence = reference_sequence
615 dest.buf.l = dest.buf.m = 0
616 dest.buf.s = NULL
617
618 return dest
619
620
621 cdef class PileupRead
622 cdef PileupRead makePileupRead(const bam_pileup1_t *src,
623 AlignmentHeader header):
624 '''return a PileupRead object construted from a bam_pileup1_t * object.'''
625 # note that the following does not call __init__
626 cdef PileupRead dest = PileupRead.__new__(PileupRead)
627 dest._alignment = makeAlignedSegment(src.b, header)
628 dest._qpos = src.qpos
629 dest._indel = src.indel
630 dest._level = src.level
631 dest._is_del = src.is_del
632 dest._is_head = src.is_head
633 dest._is_tail = src.is_tail
634 dest._is_refskip = src.is_refskip
635 return dest
636
637
638 cdef inline uint32_t get_alignment_length(bam1_t *src):
639 cdef uint32_t k = 0
640 cdef uint32_t l = 0
641 if src == NULL:
642 return 0
643 cdef uint32_t * cigar_p = bam_get_cigar(src)
644 if cigar_p == NULL:
645 return 0
646 cdef int op
647 cdef uint32_t n = pysam_get_n_cigar(src)
648 for k from 0 <= k < n:
649 op = cigar_p[k] & BAM_CIGAR_MASK
650 if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
651 continue
652 l += cigar_p[k] >> BAM_CIGAR_SHIFT
653 return l
654
655
656 cdef inline uint32_t get_md_reference_length(char * md_tag):
657 cdef int l = 0
658 cdef int md_idx = 0
659 cdef int nmatches = 0
660
661 while md_tag[md_idx] != 0:
662 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
663 nmatches *= 10
664 nmatches += md_tag[md_idx] - 48
665 md_idx += 1
666 continue
667 else:
668 l += nmatches
669 nmatches = 0
670 if md_tag[md_idx] == b'^':
671 md_idx += 1
672 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
673 md_idx += 1
674 l += 1
675 else:
676 md_idx += 1
677 l += 1
678
679 l += nmatches
680 return l
681
682 # TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ...
683 cdef inline bytes build_alignment_sequence(bam1_t * src):
684 """return expanded sequence from MD tag.
685
686 The sequence includes substitutions and both insertions in the
687 reference as well as deletions to the reference sequence. Combine
688 with the cigar string to reconstitute the query or the reference
689 sequence.
690
691 Positions corresponding to `N` (skipped region from the reference)
692 or `P` (padding (silent deletion from padded reference)) in the CIGAR
693 string will not appear in the returned sequence. The MD should
694 correspondingly not contain these. Thus proper tags are::
695
696 Deletion from the reference: cigar=5M1D5M MD=5^C5
697 Skipped region from reference: cigar=5M1N5M MD=10
698 Padded region in the reference: cigar=5M1P5M MD=10
699
700 Returns
701 -------
702
703 None, if no MD tag is present.
704
705 """
706 if src == NULL:
707 return None
708
709 cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
710 if md_tag_ptr == NULL:
711 return None
712
713 cdef uint32_t start = getQueryStart(src)
714 cdef uint32_t end = getQueryEnd(src)
715 # get read sequence, taking into account soft-clipping
716 r = getSequenceInRange(src, start, end)
717 cdef char * read_sequence = r
718 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
719 if cigar_p == NULL:
720 return None
721
722 cdef uint32_t r_idx = 0
723 cdef int op
724 cdef uint32_t k, i, l, x
725 cdef int nmatches = 0
726 cdef int s_idx = 0
727
728 cdef uint32_t max_len = get_alignment_length(src)
729 if max_len == 0:
730 raise ValueError("could not determine alignment length")
731
732 cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
733 if s == NULL:
734 raise ValueError(
735 "could not allocate sequence of length %i" % max_len)
736
737 for k from 0 <= k < pysam_get_n_cigar(src):
738 op = cigar_p[k] & BAM_CIGAR_MASK
739 l = cigar_p[k] >> BAM_CIGAR_SHIFT
740 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
741 for i from 0 <= i < l:
742 s[s_idx] = read_sequence[r_idx]
743 r_idx += 1
744 s_idx += 1
745 elif op == BAM_CDEL:
746 for i from 0 <= i < l:
747 s[s_idx] = b'-'
748 s_idx += 1
749 elif op == BAM_CREF_SKIP:
750 pass
751 elif op == BAM_CINS or op == BAM_CPAD:
752 for i from 0 <= i < l:
753 # encode insertions into reference as lowercase
754 s[s_idx] = read_sequence[r_idx] + 32
755 r_idx += 1
756 s_idx += 1
757 elif op == BAM_CSOFT_CLIP:
758 pass
759 elif op == BAM_CHARD_CLIP:
760 pass # advances neither
761
762 cdef char md_buffer[2]
763 cdef char *md_tag
764 cdef uint8_t md_typecode = md_tag_ptr[0]
765 if md_typecode == b'Z':
766 md_tag = bam_aux2Z(md_tag_ptr)
767 elif md_typecode == b'A':
768 # Work around HTSeq bug that writes 1-character strings as MD:A:v
769 md_buffer[0] = bam_aux2A(md_tag_ptr)
770 md_buffer[1] = b'\0'
771 md_tag = md_buffer
772 else:
773 raise TypeError('Tagged field MD:{}:<value> does not have expected type MD:Z'.format(chr(md_typecode)))
774
775 cdef int md_idx = 0
776 cdef char c
777 s_idx = 0
778
779 # Check if MD tag is valid by matching CIGAR length to MD tag defined length
780 # Insertions would be in addition to what is described by MD, so we calculate
781 # the number of insertions separately.
782 cdef int insertions = 0
783
784 while s[s_idx] != 0:
785 if s[s_idx] >= b'a':
786 insertions += 1
787 s_idx += 1
788 s_idx = 0
789
790 cdef uint32_t md_len = get_md_reference_length(md_tag)
791 if md_len + insertions > max_len:
792 free(s)
793 raise AssertionError(
794 "Invalid MD tag: MD length {} mismatch with CIGAR length {} and {} insertions".format(
795 md_len, max_len, insertions))
796
797 while md_tag[md_idx] != 0:
798 # c is numerical
799 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
800 nmatches *= 10
801 nmatches += md_tag[md_idx] - 48
802 md_idx += 1
803 continue
804 else:
805 # save matches up to this point, skipping insertions
806 for x from 0 <= x < nmatches:
807 while s[s_idx] >= b'a':
808 s_idx += 1
809 s_idx += 1
810 while s[s_idx] >= b'a':
811 s_idx += 1
812
813 r_idx += nmatches
814 nmatches = 0
815 if md_tag[md_idx] == b'^':
816 md_idx += 1
817 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
818 # assert s[s_idx] == '-'
819 s[s_idx] = md_tag[md_idx]
820 s_idx += 1
821 md_idx += 1
822 else:
823 # save mismatch
824 # enforce lower case
825 c = md_tag[md_idx]
826 if c <= 90:
827 c += 32
828 s[s_idx] = c
829 s_idx += 1
830 r_idx += 1
831 md_idx += 1
832
833 # save matches up to this point, skipping insertions
834 for x from 0 <= x < nmatches:
835 while s[s_idx] >= b'a':
836 s_idx += 1
837 s_idx += 1
838 while s[s_idx] >= b'a':
839 s_idx += 1
840
841 seq = PyBytes_FromStringAndSize(s, s_idx)
842 free(s)
843
844 return seq
845
846
847 cdef inline bytes build_reference_sequence(bam1_t * src):
848 """return the reference sequence in the region that is covered by the
849 alignment of the read to the reference.
850
851 This method requires the MD tag to be set.
852
853 """
854 cdef uint32_t k, i, l
855 cdef int op
856 cdef int s_idx = 0
857 ref_seq = build_alignment_sequence(src)
858 if ref_seq is None:
859 raise ValueError("MD tag not present")
860
861 cdef char * s = <char*>calloc(len(ref_seq) + 1, sizeof(char))
862 if s == NULL:
863 raise ValueError(
864 "could not allocate sequence of length %i" % len(ref_seq))
865
866 cdef char * cref_seq = ref_seq
867 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
868 cdef uint32_t r_idx = 0
869 for k from 0 <= k < pysam_get_n_cigar(src):
870 op = cigar_p[k] & BAM_CIGAR_MASK
871 l = cigar_p[k] >> BAM_CIGAR_SHIFT
872 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
873 for i from 0 <= i < l:
874 s[s_idx] = cref_seq[r_idx]
875 r_idx += 1
876 s_idx += 1
877 elif op == BAM_CDEL:
878 for i from 0 <= i < l:
879 s[s_idx] = cref_seq[r_idx]
880 r_idx += 1
881 s_idx += 1
882 elif op == BAM_CREF_SKIP:
883 pass
884 elif op == BAM_CINS or op == BAM_CPAD:
885 r_idx += l
886 elif op == BAM_CSOFT_CLIP:
887 pass
888 elif op == BAM_CHARD_CLIP:
889 pass # advances neither
890
891 seq = PyBytes_FromStringAndSize(s, s_idx)
892 free(s)
893
894 return seq
895
896
897 cdef inline str safe_reference_name(AlignmentHeader header, int tid):
898 if tid == -1: return "*"
899 elif header is not None: return header.get_reference_name(tid)
900 else: return f"#{tid}"
901
902
903 # Tuple-building helper functions used by AlignedSegment.get_aligned_pairs()
904
905 cdef _alignedpairs_positions(qpos, pos, ref_seq, uint32_t r_idx, int op):
906 return (qpos, pos)
907
908
909 cdef _alignedpairs_with_seq(qpos, pos, ref_seq, uint32_t r_idx, int op):
910 ref_base = ref_seq[r_idx] if ref_seq is not None else None
911 return (qpos, pos, ref_base)
912
913
914 cdef _alignedpairs_with_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
915 return (qpos, pos, CIGAR_OPS(op))
916
917
918 cdef _alignedpairs_with_seq_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
919 ref_base = ref_seq[r_idx] if ref_seq is not None else None
920 return (qpos, pos, ref_base, CIGAR_OPS(op))
921
922
923 cdef class AlignedSegment:
924 '''Class representing an aligned segment.
925
926 This class stores a handle to the samtools C-structure representing
927 an aligned read. Member read access is forwarded to the C-structure
928 and converted into python objects. This implementation should be fast,
929 as only the data needed is converted.
930
931 For write access, the C-structure is updated in-place. This is
932 not the most efficient way to build BAM entries, as the variable
933 length data is concatenated and thus needs to be resized if
934 a field is updated. Furthermore, the BAM entry might be
935 in an inconsistent state.
936
937 One issue to look out for is that the sequence should always
938 be set *before* the quality scores. Setting the sequence will
939 also erase any quality scores that were set previously.
940
941 Parameters
942 ----------
943
944 header:
945 :class:`~pysam.AlignmentHeader` object to map numerical
946 identifiers to chromosome names. If not given, an empty
947 header is created.
948 '''
949
950 # Now only called when instances are created from Python
951 def __init__(self, AlignmentHeader header=None):
952 # see bam_init1
953 self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
954 if self._delegate == NULL:
955 raise MemoryError("could not allocated memory of {} bytes".format(sizeof(bam1_t)))
956 # allocate some memory. If size is 0, calloc does not return a
957 # pointer that can be passed to free() so allocate 40 bytes
958 # for a new read
959 self._delegate.m_data = 40
960 self._delegate.data = <uint8_t *>calloc(
961 self._delegate.m_data, 1)
962 if self._delegate.data == NULL:
963 raise MemoryError("could not allocate memory of {} bytes".format(self._delegate.m_data))
964 self._delegate.l_data = 0
965 # set some data to make read approximately legit.
966 # Note, SAM writing fails with q_name of length 0
967 self._delegate.core.l_qname = 0
968 self._delegate.core.tid = -1
969 self._delegate.core.pos = -1
970 self._delegate.core.mtid = -1
971 self._delegate.core.mpos = -1
972
973 # caching for selected fields
974 self.cache_query_qualities = None
975 self.cache_query_alignment_qualities = None
976 self.cache_query_sequence = None
977 self.cache_query_alignment_sequence = None
978
979 self.header = header
980
981 def __dealloc__(self):
982 bam_destroy1(self._delegate)
983
984 def __str__(self):
985 """return string representation of alignment.
986
987 The representation is an approximate :term:`SAM` format, because
988 an aligned read might not be associated with a :term:`AlignmentFile`.
989 Hence when the read does not have an associated :class:`AlignedHeader`,
990 :term:`tid` is shown instead of the reference name.
991 Similarly, the tags field is returned in its parsed state.
992
993 To get a valid SAM record, use :meth:`to_string`.
994 """
995 # sam-parsing is done in sam.c/bam_format1_core which
996 # requires a valid header.
997 return "\t".join(map(str, (self.query_name,
998 self.flag,
999 safe_reference_name(self.header, self.reference_id),
1000 self.reference_start + 1,
1001 self.mapping_quality,
1002 self.cigarstring,
1003 safe_reference_name(self.header, self.next_reference_id),
1004 self.next_reference_start + 1,
1005 self.template_length,
1006 self.query_sequence,
1007 self.query_qualities,
1008 self.tags)))
1009
1010 def __repr__(self):
1011 ref = self.reference_name if self.header is not None else self.reference_id
1012 return f'<{type(self).__name__}({self.query_name!r}, flags={self.flag}={self.flag:#x}, ref={ref!r}, zpos={self.reference_start}, mapq={self.mapping_quality}, cigar={self.cigarstring!r}, ...)>'
1013
1014 def __copy__(self):
1015 return makeAlignedSegment(self._delegate, self.header)
1016
1017 def __deepcopy__(self, memo):
1018 return makeAlignedSegment(self._delegate, self.header)
1019
1020 def compare(self, AlignedSegment other):
1021 '''return -1,0,1, if contents in this are binary
1022 <,=,> to *other*
1023 '''
1024
1025 # avoid segfault when other equals None
1026 if other is None:
1027 return -1
1028
1029 cdef int retval, x
1030 cdef bam1_t *t
1031 cdef bam1_t *o
1032
1033 t = self._delegate
1034 o = other._delegate
1035
1036 # uncomment for debugging purposes
1037 # cdef unsigned char * oo, * tt
1038 # tt = <unsigned char*>(&t.core)
1039 # oo = <unsigned char*>(&o.core)
1040 # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
1041 # tt = <unsigned char*>(t.data)
1042 # oo = <unsigned char*>(o.data)
1043 # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
1044
1045 # Fast-path test for object identity
1046 if t == o:
1047 return 0
1048
1049 cdef uint8_t *a = <uint8_t*>&t.core
1050 cdef uint8_t *b = <uint8_t*>&o.core
1051
1052 retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
1053 if retval:
1054 return retval
1055
1056 # cmp(t.l_data, o.l_data)
1057 retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
1058 if retval:
1059 return retval
1060 return memcmp(t.data, o.data, t.l_data)
1061
1062 def __richcmp__(self, AlignedSegment other, int op):
1063 if op == 2: # == operator
1064 return self.compare(other) == 0
1065 elif op == 3: # != operator
1066 return self.compare(other) != 0
1067 else:
1068 return NotImplemented
1069
1070 def __hash__(self):
1071 cdef bam1_t * src = self._delegate
1072 cdef int x
1073
1074 # see http://effbot.org/zone/python-hash.htm
1075 cdef uint8_t * c = <uint8_t *>&src.core
1076 cdef uint32_t hash_value = c[0]
1077 for x from 1 <= x < sizeof(bam1_core_t):
1078 hash_value = c_mul(hash_value, 1000003) ^ c[x]
1079 c = <uint8_t *>src.data
1080 for x from 0 <= x < src.l_data:
1081 hash_value = c_mul(hash_value, 1000003) ^ c[x]
1082
1083 return hash_value
1084
1085 cpdef to_string(self):
1086 """returns a string representation of the aligned segment.
1087
1088 The output format is valid SAM format if a header is associated
1089 with the AlignedSegment.
1090 """
1091 cdef kstring_t line
1092 line.l = line.m = 0
1093 line.s = NULL
1094
1095 if self.header:
1096 if sam_format1(self.header.ptr, self._delegate, &line) < 0:
1097 if line.m:
1098 free(line.s)
1099 raise ValueError('sam_format failed')
1100 else:
1101 raise NotImplementedError("todo")
1102
1103 ret = force_str(line.s[:line.l])
1104
1105 if line.m:
1106 free(line.s)
1107
1108 return ret
1109
1110 @classmethod
1111 def fromstring(cls, sam, AlignmentHeader header):
1112 """parses a string representation of the aligned segment.
1113
1114 The input format should be valid SAM format.
1115
1116 Parameters
1117 ----------
1118 sam:
1119 :term:`SAM` formatted string
1120
1121 """
1122 cdef AlignedSegment dest = cls.__new__(cls)
1123 dest._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
1124 dest.header = header
1125
1126 cdef kstring_t line
1127 line.l = line.m = len(sam)
1128 _sam = force_bytes(sam)
1129 line.s = _sam
1130
1131 cdef int ret
1132 ret = sam_parse1(&line, dest.header.ptr, dest._delegate)
1133 if ret < 0:
1134 raise ValueError("parsing SAM record string failed (error code {})".format(ret))
1135
1136 return dest
1137
1138 cpdef tostring(self, htsfile=None):
1139 """deprecated, use :meth:`to_string()` instead.
1140
1141 Parameters
1142 ----------
1143
1144 htsfile:
1145 (deprecated) AlignmentFile object to map numerical
1146 identifiers to chromosome names. This parameter is present
1147 for backwards compatibility and ignored.
1148 """
1149
1150 return self.to_string()
1151
1152 def to_dict(self):
1153 """returns a json representation of the aligned segment.
1154
1155 Field names are abbreviated versions of the class attributes.
1156 """
1157 # let htslib do the string conversions, but treat optional field properly as list
1158 vals = self.to_string().split("\t")
1159 n = len(KEY_NAMES) - 1
1160 return dict(list(zip(KEY_NAMES[:-1], vals[:n])) + [(KEY_NAMES[-1], vals[n:])])
1161
1162 @classmethod
1163 def from_dict(cls, sam_dict, AlignmentHeader header):
1164 """parses a dictionary representation of the aligned segment.
1165
1166 Parameters
1167 ----------
1168 sam_dict:
1169 dictionary of alignment values, keys corresponding to output from
1170 :meth:`todict()`.
1171
1172 """
1173 # let htslib do the parsing
1174 # the tags field can be missing
1175 return cls.fromstring(
1176 "\t".join((sam_dict[x] for x in KEY_NAMES[:-1])) +
1177 "\t" +
1178 "\t".join(sam_dict.get(KEY_NAMES[-1], [])), header)
1179
1180 ########################################################
1181 ## Basic attributes in order of appearance in SAM format
1182 property query_name:
1183 """the query template name (None if not present)"""
1184 def __get__(self):
1185
1186 cdef bam1_t * src = self._delegate
1187 if src.core.l_qname == 0:
1188 return None
1189
1190 return charptr_to_str(<char *>pysam_bam_get_qname(src))
1191
1192 def __set__(self, qname):
1193
1194 if qname is None or len(qname) == 0:
1195 return
1196
1197 if len(qname) > 254:
1198 raise ValueError("query length out of range {} > 254".format(
1199 len(qname)))
1200
1201 qname = force_bytes(qname)
1202 cdef bam1_t * src = self._delegate
1203 # the qname is \0 terminated
1204 cdef uint8_t l = len(qname) + 1
1205
1206 cdef char * p = pysam_bam_get_qname(src)
1207 cdef uint8_t l_extranul = 0
1208
1209 if l % 4 != 0:
1210 l_extranul = 4 - l % 4
1211
1212 cdef bam1_t * retval = pysam_bam_update(src,
1213 src.core.l_qname,
1214 l + l_extranul,
1215 <uint8_t*>p)
1216 if retval == NULL:
1217 raise MemoryError("could not allocate memory")
1218
1219 src.core.l_extranul = l_extranul
1220 src.core.l_qname = l + l_extranul
1221
1222 # re-acquire pointer to location in memory
1223 # as it might have moved
1224 p = pysam_bam_get_qname(src)
1225
1226 strncpy(p, qname, l)
1227 # x might be > 255
1228 cdef uint16_t x = 0
1229
1230 for x from l <= x < l + l_extranul:
1231 p[x] = b'\0'
1232
1233 property flag:
1234 """properties flag"""
1235 def __get__(self):
1236 return self._delegate.core.flag
1237 def __set__(self, flag):
1238 self._delegate.core.flag = flag
1239
1240 property reference_name:
1241 """:term:`reference` name"""
1242 def __get__(self):
1243 if self._delegate.core.tid == -1:
1244 return None
1245 if self.header:
1246 return self.header.get_reference_name(self._delegate.core.tid)
1247 else:
1248 raise ValueError("reference_name unknown if no header associated with record")
1249 def __set__(self, reference):
1250 cdef int tid
1251 if reference is None or reference == "*":
1252 self._delegate.core.tid = -1
1253 elif self.header:
1254 tid = self.header.get_tid(reference)
1255 if tid < 0:
1256 raise ValueError("reference {} does not exist in header".format(
1257 reference))
1258 self._delegate.core.tid = tid
1259 else:
1260 raise ValueError("reference_name can not be set if no header associated with record")
1261
1262 property reference_id:
1263 """:term:`reference` ID
1264
1265 .. note::
1266
1267 This field contains the index of the reference sequence in
1268 the sequence dictionary. To obtain the name of the
1269 reference sequence, use :meth:`get_reference_name()`
1270
1271 """
1272 def __get__(self):
1273 return self._delegate.core.tid
1274 def __set__(self, tid):
1275 if tid != -1 and self.header and not self.header.is_valid_tid(tid):
1276 raise ValueError("reference id {} does not exist in header".format(
1277 tid))
1278 self._delegate.core.tid = tid
1279
1280 property reference_start:
1281 """0-based leftmost coordinate"""
1282 def __get__(self):
1283 return self._delegate.core.pos
1284 def __set__(self, pos):
1285 ## setting the position requires updating the "bin" attribute
1286 cdef bam1_t * src
1287 src = self._delegate
1288 src.core.pos = pos
1289 update_bin(src)
1290
1291 property mapping_quality:
1292 """mapping quality"""
1293 def __get__(self):
1294 return pysam_get_qual(self._delegate)
1295 def __set__(self, qual):
1296 pysam_set_qual(self._delegate, qual)
1297
1298 property cigarstring:
1299 '''the :term:`cigar` alignment as a string.
1300
1301 The cigar string is a string of alternating integers
1302 and characters denoting the length and the type of
1303 an operation.
1304
1305 .. note::
1306 The order length,operation is specified in the
1307 SAM format. It is different from the order of
1308 the :attr:`cigar` property.
1309
1310 Returns None if not present.
1311
1312 To unset the cigarstring, assign None or the
1313 empty string.
1314 '''
1315 def __get__(self):
1316 c = self.cigartuples
1317 if c is None:
1318 return None
1319 # reverse order
1320 else:
1321 return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
1322
1323 def __set__(self, cigar):
1324 if cigar is None or len(cigar) == 0:
1325 self.cigartuples = []
1326 else:
1327 parts = CIGAR_REGEX.findall(cigar)
1328 # reverse order
1329 self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts]
1330
1331 # TODO
1332 # property cigar:
1333 # """the cigar alignment"""
1334
1335 property next_reference_id:
1336 """the :term:`reference` id of the mate/next read."""
1337 def __get__(self):
1338 return self._delegate.core.mtid
1339 def __set__(self, mtid):
1340 if mtid != -1 and self.header and not self.header.is_valid_tid(mtid):
1341 raise ValueError("reference id {} does not exist in header".format(
1342 mtid))
1343 self._delegate.core.mtid = mtid
1344
1345 property next_reference_name:
1346 """:term:`reference` name of the mate/next read (None if no
1347 AlignmentFile is associated)"""
1348 def __get__(self):
1349 if self._delegate.core.mtid == -1:
1350 return None
1351 if self.header:
1352 return self.header.get_reference_name(self._delegate.core.mtid)
1353 else:
1354 raise ValueError("next_reference_name unknown if no header associated with record")
1355
1356 def __set__(self, reference):
1357 cdef int mtid
1358 if reference is None or reference == "*":
1359 self._delegate.core.mtid = -1
1360 elif reference == "=":
1361 self._delegate.core.mtid = self._delegate.core.tid
1362 elif self.header:
1363 mtid = self.header.get_tid(reference)
1364 if mtid < 0:
1365 raise ValueError("reference {} does not exist in header".format(
1366 reference))
1367 self._delegate.core.mtid = mtid
1368 else:
1369 raise ValueError("next_reference_name can not be set if no header associated with record")
1370
1371 property next_reference_start:
1372 """the position of the mate/next read."""
1373 def __get__(self):
1374 return self._delegate.core.mpos
1375 def __set__(self, mpos):
1376 self._delegate.core.mpos = mpos
1377
1378 property query_length:
1379 """the length of the query/read.
1380
1381 This value corresponds to the length of the sequence supplied
1382 in the BAM/SAM file. The length of a query is 0 if there is no
1383 sequence in the BAM/SAM file. In those cases, the read length
1384 can be inferred from the CIGAR alignment, see
1385 :meth:`pysam.AlignedSegment.infer_query_length`.
1386
1387 The length includes soft-clipped bases and is equal to
1388 ``len(query_sequence)``.
1389
1390 This property is read-only but is updated when a new query
1391 sequence is assigned to this AlignedSegment.
1392
1393 Returns 0 if not available.
1394
1395 """
1396 def __get__(self):
1397 return self._delegate.core.l_qseq
1398
1399 property template_length:
1400 """the observed query template length"""
1401 def __get__(self):
1402 return self._delegate.core.isize
1403 def __set__(self, isize):
1404 self._delegate.core.isize = isize
1405
1406 property query_sequence:
1407 """read sequence bases, including :term:`soft clipped` bases
1408 (None if not present).
1409
1410 Assigning to this attribute will invalidate any quality scores.
1411 Thus, to in-place edit the sequence and quality scores, copies of
1412 the quality scores need to be taken. Consider trimming for example::
1413
1414 q = read.query_qualities
1415 read.query_sequence = read.query_sequence[5:10]
1416 read.query_qualities = q[5:10]
1417
1418 The sequence is returned as it is stored in the BAM file. (This will
1419 be the reverse complement of the original read sequence if the mapper
1420 has aligned the read to the reverse strand.)
1421 """
1422 def __get__(self):
1423 if self.cache_query_sequence:
1424 return self.cache_query_sequence
1425
1426 cdef bam1_t * src
1427 cdef char * s
1428 src = self._delegate
1429
1430 if src.core.l_qseq == 0:
1431 return None
1432
1433 self.cache_query_sequence = force_str(getSequenceInRange(
1434 src, 0, src.core.l_qseq))
1435 return self.cache_query_sequence
1436
1437 def __set__(self, seq):
1438 # samtools manages sequence and quality length memory together
1439 # if no quality information is present, the first byte says 0xff.
1440 cdef bam1_t * src
1441 cdef uint8_t * p
1442 cdef char * s
1443 cdef int l, k
1444 cdef Py_ssize_t nbytes_new, nbytes_old
1445
1446 if seq == None:
1447 l = 0
1448 else:
1449 l = len(seq)
1450 seq = force_bytes(seq)
1451
1452 src = self._delegate
1453
1454 # as the sequence is stored in half-bytes, the total length (sequence
1455 # plus quality scores) is (l+1)/2 + l
1456 nbytes_new = (l + 1) // 2 + l
1457 nbytes_old = (src.core.l_qseq + 1) // 2 + src.core.l_qseq
1458
1459 # acquire pointer to location in memory
1460 p = pysam_bam_get_seq(src)
1461 src.core.l_qseq = l
1462
1463 # change length of data field
1464 cdef bam1_t * retval = pysam_bam_update(src,
1465 nbytes_old,
1466 nbytes_new,
1467 p)
1468
1469 if retval == NULL:
1470 raise MemoryError("could not allocate memory")
1471
1472 if l > 0:
1473 # re-acquire pointer to location in memory
1474 # as it might have moved
1475 p = pysam_bam_get_seq(src)
1476 for k from 0 <= k < nbytes_new:
1477 p[k] = 0
1478 # convert to C string
1479 s = seq
1480 for k from 0 <= k < l:
1481 p[k // 2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
1482
1483 # erase qualities
1484 p = pysam_bam_get_qual(src)
1485 memset(p, 0xff, l)
1486
1487 self.cache_query_sequence = force_str(seq)
1488
1489 # clear cached values for quality values
1490 self.cache_query_qualities = None
1491 self.cache_query_alignment_qualities = None
1492
1493 property query_qualities:
1494 """read sequence base qualities, including :term:`soft clipped` bases
1495 (None if not present).
1496
1497 Quality scores are returned as a python array of unsigned
1498 chars. Note that this is not the ASCII-encoded value typically
1499 seen in FASTQ or SAM formatted files. Thus, no offset of 33
1500 needs to be subtracted.
1501
1502 Note that to set quality scores the sequence has to be set
1503 beforehand as this will determine the expected length of the
1504 quality score array.
1505
1506 This method raises a ValueError if the length of the
1507 quality scores and the sequence are not the same.
1508
1509 """
1510 def __get__(self):
1511
1512 if self.cache_query_qualities:
1513 return self.cache_query_qualities
1514
1515 cdef bam1_t * src
1516 cdef char * q
1517
1518 src = self._delegate
1519
1520 if src.core.l_qseq == 0:
1521 return None
1522
1523 self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
1524 return self.cache_query_qualities
1525
1526 def __set__(self, qual):
1527
1528 # note that memory is already allocated via setting the sequence
1529 # hence length match of sequence and quality needs is checked.
1530 cdef bam1_t * src
1531 cdef uint8_t * p
1532 cdef int l
1533
1534 src = self._delegate
1535 p = pysam_bam_get_qual(src)
1536 if qual is None or len(qual) == 0:
1537 # if absent and there is a sequence: set to 0xff
1538 memset(p, 0xff, src.core.l_qseq)
1539 return
1540
1541 # check for length match
1542 l = len(qual)
1543 if src.core.l_qseq != l:
1544 raise ValueError(
1545 "quality and sequence mismatch: %i != %i" %
1546 (l, src.core.l_qseq))
1547
1548 # create a python array object filling it
1549 # with the quality scores
1550
1551 # NB: should avoid this copying if qual is
1552 # already of the correct type.
1553 cdef c_array.array result = c_array.array('B', qual)
1554
1555 # copy data
1556 memcpy(p, result.data.as_voidptr, l)
1557
1558 # save in cache
1559 self.cache_query_qualities = qual
1560
1561 property bin:
1562 """properties bin"""
1563 def __get__(self):
1564 return self._delegate.core.bin
1565 def __set__(self, bin):
1566 self._delegate.core.bin = bin
1567
1568
1569 ##########################################################
1570 # Derived simple attributes. These are simple attributes of
1571 # AlignedSegment getting and setting values.
1572 ##########################################################
1573 # 1. Flags
1574 ##########################################################
1575 property is_paired:
1576 """true if read is paired in sequencing"""
1577 def __get__(self):
1578 return (self.flag & BAM_FPAIRED) != 0
1579 def __set__(self,val):
1580 pysam_update_flag(self._delegate, val, BAM_FPAIRED)
1581
1582 property is_proper_pair:
1583 """true if read is mapped in a proper pair"""
1584 def __get__(self):
1585 return (self.flag & BAM_FPROPER_PAIR) != 0
1586 def __set__(self,val):
1587 pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
1588
1589 property is_unmapped:
1590 """true if read itself is unmapped"""
1591 def __get__(self):
1592 return (self.flag & BAM_FUNMAP) != 0
1593 def __set__(self, val):
1594 pysam_update_flag(self._delegate, val, BAM_FUNMAP)
1595 # setting the unmapped flag requires recalculation of
1596 # bin as alignment length is now implicitly 1
1597 update_bin(self._delegate)
1598
1599 property is_mapped:
1600 """true if read itself is mapped
1601 (implemented in terms of :attr:`is_unmapped`)"""
1602 def __get__(self):
1603 return (self.flag & BAM_FUNMAP) == 0
1604 def __set__(self, val):
1605 pysam_update_flag(self._delegate, not val, BAM_FUNMAP)
1606 update_bin(self._delegate)
1607
1608 property mate_is_unmapped:
1609 """true if the mate is unmapped"""
1610 def __get__(self):
1611 return (self.flag & BAM_FMUNMAP) != 0
1612 def __set__(self,val):
1613 pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
1614
1615 property mate_is_mapped:
1616 """true if the mate is mapped
1617 (implemented in terms of :attr:`mate_is_unmapped`)"""
1618 def __get__(self):
1619 return (self.flag & BAM_FMUNMAP) == 0
1620 def __set__(self,val):
1621 pysam_update_flag(self._delegate, not val, BAM_FMUNMAP)
1622
1623 property is_reverse:
1624 """true if read is mapped to reverse strand"""
1625 def __get__(self):
1626 return (self.flag & BAM_FREVERSE) != 0
1627 def __set__(self,val):
1628 pysam_update_flag(self._delegate, val, BAM_FREVERSE)
1629
1630 property is_forward:
1631 """true if read is mapped to forward strand
1632 (implemented in terms of :attr:`is_reverse`)"""
1633 def __get__(self):
1634 return (self.flag & BAM_FREVERSE) == 0
1635 def __set__(self,val):
1636 pysam_update_flag(self._delegate, not val, BAM_FREVERSE)
1637
1638 property mate_is_reverse:
1639 """true if the mate is mapped to reverse strand"""
1640 def __get__(self):
1641 return (self.flag & BAM_FMREVERSE) != 0
1642 def __set__(self,val):
1643 pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
1644
1645 property mate_is_forward:
1646 """true if the mate is mapped to forward strand
1647 (implemented in terms of :attr:`mate_is_reverse`)"""
1648 def __get__(self):
1649 return (self.flag & BAM_FMREVERSE) == 0
1650 def __set__(self,val):
1651 pysam_update_flag(self._delegate, not val, BAM_FMREVERSE)
1652
1653 property is_read1:
1654 """true if this is read1"""
1655 def __get__(self):
1656 return (self.flag & BAM_FREAD1) != 0
1657 def __set__(self,val):
1658 pysam_update_flag(self._delegate, val, BAM_FREAD1)
1659 property is_read2:
1660 """true if this is read2"""
1661 def __get__(self):
1662 return (self.flag & BAM_FREAD2) != 0
1663 def __set__(self, val):
1664 pysam_update_flag(self._delegate, val, BAM_FREAD2)
1665 property is_secondary:
1666 """true if not primary alignment"""
1667 def __get__(self):
1668 return (self.flag & BAM_FSECONDARY) != 0
1669 def __set__(self, val):
1670 pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
1671 property is_qcfail:
1672 """true if QC failure"""
1673 def __get__(self):
1674 return (self.flag & BAM_FQCFAIL) != 0
1675 def __set__(self, val):
1676 pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
1677 property is_duplicate:
1678 """true if optical or PCR duplicate"""
1679 def __get__(self):
1680 return (self.flag & BAM_FDUP) != 0
1681 def __set__(self, val):
1682 pysam_update_flag(self._delegate, val, BAM_FDUP)
1683 property is_supplementary:
1684 """true if this is a supplementary alignment"""
1685 def __get__(self):
1686 return (self.flag & BAM_FSUPPLEMENTARY) != 0
1687 def __set__(self, val):
1688 pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
1689
1690 # 2. Coordinates and lengths
1691 property reference_end:
1692 '''aligned reference position of the read on the reference genome.
1693
1694 reference_end points to one past the last aligned residue.
1695 Returns None if not available (read is unmapped or no cigar
1696 alignment present).
1697
1698 '''
1699 def __get__(self):
1700 cdef bam1_t * src
1701 src = self._delegate
1702 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
1703 return None
1704 return bam_endpos(src)
1705
1706 property reference_length:
1707 '''aligned length of the read on the reference genome.
1708
1709 This is equal to `reference_end - reference_start`.
1710 Returns None if not available.
1711 '''
1712 def __get__(self):
1713 cdef bam1_t * src
1714 src = self._delegate
1715 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
1716 return None
1717 return bam_endpos(src) - \
1718 self._delegate.core.pos
1719
1720 property query_alignment_sequence:
1721 """aligned portion of the read.
1722
1723 This is a substring of :attr:`query_sequence` that excludes flanking
1724 bases that were :term:`soft clipped` (None if not present). It
1725 is equal to ``query_sequence[query_alignment_start:query_alignment_end]``.
1726
1727 SAM/BAM files may include extra flanking bases that are not
1728 part of the alignment. These bases may be the result of the
1729 Smith-Waterman or other algorithms, which may not require
1730 alignments that begin at the first residue or end at the last.
1731 In addition, extra sequencing adapters, multiplex identifiers,
1732 and low-quality bases that were not considered for alignment
1733 may have been retained.
1734
1735 """
1736
1737 def __get__(self):
1738 if self.cache_query_alignment_sequence:
1739 return self.cache_query_alignment_sequence
1740
1741 cdef bam1_t * src
1742 cdef uint32_t start, end
1743
1744 src = self._delegate
1745
1746 if src.core.l_qseq == 0:
1747 return None
1748
1749 start = getQueryStart(src)
1750 end = getQueryEnd(src)
1751
1752 self.cache_query_alignment_sequence = force_str(
1753 getSequenceInRange(src, start, end))
1754 return self.cache_query_alignment_sequence
1755
1756 property query_alignment_qualities:
1757 """aligned query sequence quality values (None if not present). These
1758 are the quality values that correspond to
1759 :attr:`query_alignment_sequence`, that is, they exclude qualities of
1760 :term:`soft clipped` bases. This is equal to
1761 ``query_qualities[query_alignment_start:query_alignment_end]``.
1762
1763 Quality scores are returned as a python array of unsigned
1764 chars. Note that this is not the ASCII-encoded value typically
1765 seen in FASTQ or SAM formatted files. Thus, no offset of 33
1766 needs to be subtracted.
1767
1768 This property is read-only.
1769
1770 """
1771 def __get__(self):
1772
1773 if self.cache_query_alignment_qualities:
1774 return self.cache_query_alignment_qualities
1775
1776 cdef bam1_t * src
1777 cdef uint32_t start, end
1778
1779 src = self._delegate
1780
1781 if src.core.l_qseq == 0:
1782 return None
1783
1784 start = getQueryStart(src)
1785 end = getQueryEnd(src)
1786 self.cache_query_alignment_qualities = \
1787 getQualitiesInRange(src, start, end)
1788 return self.cache_query_alignment_qualities
1789
1790 property query_alignment_start:
1791 """start index of the aligned query portion of the sequence (0-based,
1792 inclusive).
1793
1794 This the index of the first base in :attr:`query_sequence`
1795 that is not soft-clipped.
1796 """
1797 def __get__(self):
1798 return getQueryStart(self._delegate)
1799
1800 property query_alignment_end:
1801 """end index of the aligned query portion of the sequence (0-based,
1802 exclusive)
1803
1804 This the index just past the last base in :attr:`query_sequence`
1805 that is not soft-clipped.
1806 """
1807 def __get__(self):
1808 return getQueryEnd(self._delegate)
1809
1810 property modified_bases:
1811 """Modified bases annotations from Ml/Mm tags. The output is
1812 Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...]
1813 with qual being (256*probability), or -1 if unknown.
1814 Strand==0 for forward and 1 for reverse strand modification
1815 """
1816 def __get__(self):
1817 cdef bam1_t * src
1818 cdef hts_base_mod_state *m = hts_base_mod_state_alloc()
1819 cdef hts_base_mod mods[5]
1820 cdef int pos
1821
1822 ret = {}
1823 src = self._delegate
1824
1825 if bam_parse_basemod(src, m) < 0:
1826 return None
1827
1828 n = bam_next_basemod(src, m, mods, 5, &pos)
1829
1830 while n>0:
1831 for i in range(n):
1832 mod_code = chr(mods[i].modified_base) if mods[i].modified_base>0 else -mods[i].modified_base
1833 mod_strand = mods[i].strand
1834 if self.is_reverse:
1835 mod_strand = 1 - mod_strand
1836 key = (chr(mods[i].canonical_base),
1837 mod_strand,
1838 mod_code )
1839 ret.setdefault(key,[]).append((pos,mods[i].qual))
1840
1841 n = bam_next_basemod(src, m, mods, 5, &pos)
1842
1843 if n<0:
1844 return None
1845
1846 hts_base_mod_state_free(m)
1847 return ret
1848
1849 property modified_bases_forward:
1850 """Modified bases annotations from Ml/Mm tags. The output is
1851 Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...]
1852 with qual being (256*probability), or -1 if unknown.
1853 Strand==0 for forward and 1 for reverse strand modification.
1854 The positions are with respect to the original sequence from get_forward_sequence()
1855 """
1856 def __get__(self):
1857 pmods = self.modified_bases
1858 if pmods and self.is_reverse:
1859 rmod = {}
1860
1861 # Try to find the length of the original sequence
1862 rlen = self.infer_read_length()
1863 if rlen is None and self.query_sequence is None:
1864 return rmod
1865 else:
1866 rlen = len(self.query_sequence)
1867
1868 for k,mods in pmods.items():
1869 nk = k[0],1 - k[1],k[2]
1870 for i in range(len(mods)):
1871
1872 mods[i] = (rlen - 1 -mods[i][0], mods[i][1])
1873 rmod[nk] = mods
1874 return rmod
1875
1876 return pmods
1877
1878
1879 property query_alignment_length:
1880 """length of the aligned query sequence.
1881
1882 This is equal to :attr:`query_alignment_end` -
1883 :attr:`query_alignment_start`
1884 """
1885 def __get__(self):
1886 cdef bam1_t * src
1887 src = self._delegate
1888 return getQueryEnd(src) - getQueryStart(src)
1889
1890 #####################################################
1891 # Computed properties
1892
1893 def get_reference_positions(self, full_length=False):
1894 """a list of reference positions that this read aligns to.
1895
1896 By default, this method returns the (0-based) positions on the
1897 reference that are within the read's alignment, leaving gaps
1898 corresponding to deletions and other reference skips.
1899
1900 When *full_length* is True, the returned list is the same length
1901 as the read and additionally includes None values corresponding
1902 to insertions or soft-clipping, i.e., to bases of the read that
1903 are not aligned to a reference position.
1904 (See also :meth:`get_aligned_pairs` which additionally returns
1905 the corresponding positions along the read.)
1906 """
1907 cdef uint32_t k, i, l, pos
1908 cdef int op
1909 cdef uint32_t * cigar_p
1910 cdef bam1_t * src
1911 cdef bint _full = full_length
1912
1913 src = self._delegate
1914 if pysam_get_n_cigar(src) == 0:
1915 return []
1916
1917 result = []
1918 pos = src.core.pos
1919 cigar_p = pysam_bam_get_cigar(src)
1920
1921 for k from 0 <= k < pysam_get_n_cigar(src):
1922 op = cigar_p[k] & BAM_CIGAR_MASK
1923 l = cigar_p[k] >> BAM_CIGAR_SHIFT
1924
1925 if op == BAM_CSOFT_CLIP or op == BAM_CINS:
1926 if _full:
1927 for i from 0 <= i < l:
1928 result.append(None)
1929 elif op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
1930 for i from pos <= i < pos + l:
1931 result.append(i)
1932 pos += l
1933 elif op == BAM_CDEL or op == BAM_CREF_SKIP:
1934 pos += l
1935
1936 return result
1937
1938 def infer_query_length(self, always=False):
1939 """infer query length from CIGAR alignment.
1940
1941 This method deduces the query length from the CIGAR alignment
1942 but does not include hard-clipped bases.
1943
1944 Returns None if CIGAR alignment is not present.
1945
1946 If *always* is set to True, `infer_read_length` is used instead.
1947 This is deprecated and only present for backward compatibility.
1948 """
1949 if always is True:
1950 return self.infer_read_length()
1951 cdef int32_t l = calculateQueryLengthWithoutHardClipping(self._delegate)
1952 if l > 0:
1953 return l
1954 else:
1955 return None
1956
1957 def infer_read_length(self):
1958 """infer read length from CIGAR alignment.
1959
1960 This method deduces the read length from the CIGAR alignment
1961 including hard-clipped bases.
1962
1963 Returns None if CIGAR alignment is not present.
1964 """
1965 cdef int32_t l = calculateQueryLengthWithHardClipping(self._delegate)
1966 if l > 0:
1967 return l
1968 else:
1969 return None
1970
1971 def get_reference_sequence(self):
1972 """return the reference sequence in the region that is covered by the
1973 alignment of the read to the reference.
1974
1975 This method requires the MD tag to be set.
1976
1977 """
1978 return force_str(build_reference_sequence(self._delegate))
1979
1980 def get_forward_sequence(self):
1981 """return the original read sequence.
1982
1983 Reads mapped to the reverse strand are stored reverse complemented in
1984 the BAM file. This method returns such reads reverse complemented back
1985 to their original orientation.
1986
1987 Returns None if the record has no query sequence.
1988 """
1989 if self.query_sequence is None:
1990 return None
1991 s = force_str(self.query_sequence)
1992 if self.is_reverse:
1993 s = s.translate(str.maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1]
1994 return s
1995
1996 def get_forward_qualities(self):
1997 """return the original base qualities of the read sequence,
1998 in the same format as the :attr:`query_qualities` property.
1999
2000 Reads mapped to the reverse strand have their base qualities stored
2001 reversed in the BAM file. This method returns such reads' base qualities
2002 reversed back to their original orientation.
2003 """
2004 if self.is_reverse:
2005 return self.query_qualities[::-1]
2006 else:
2007 return self.query_qualities
2008
2009 def get_aligned_pairs(self, matches_only=False, with_seq=False, with_cigar=False):
2010 """a list of aligned read (query) and reference positions.
2011
2012 Each item in the returned list is a tuple consisting of
2013 the 0-based offset from the start of the read sequence
2014 followed by the 0-based reference position.
2015
2016 For inserts, deletions, skipping either query or reference
2017 position may be None.
2018
2019 For padding in the reference, the reference position will
2020 always be None.
2021
2022 Parameters
2023 ----------
2024
2025 matches_only : bool
2026 If True, only matched bases are returned --- no None on either
2027 side.
2028 with_seq : bool
2029 If True, return a third element in the tuple containing the
2030 reference sequence. For CIGAR 'P' (padding in the reference)
2031 operations, the third tuple element will be None. Substitutions
2032 are lower-case. This option requires an MD tag to be present.
2033 with_cigar : bool
2034 If True, return an extra element in the tuple containing the
2035 CIGAR operator corresponding to this position tuple.
2036
2037 Returns
2038 -------
2039
2040 aligned_pairs : list of tuples
2041
2042 """
2043 cdef uint32_t k, i, pos, qpos, r_idx, l
2044 cdef int op
2045 cdef uint32_t * cigar_p
2046 cdef bam1_t * src = self._delegate
2047 cdef bint _matches_only = bool(matches_only)
2048 cdef bint _with_seq = bool(with_seq)
2049 cdef bint _with_cigar = bool(with_cigar)
2050 cdef object (*make_tuple)(object, object, object, uint32_t, int)
2051
2052 # TODO: this method performs no checking and assumes that
2053 # read sequence, cigar and MD tag are consistent.
2054
2055 if _with_seq:
2056 # force_str required for py2/py3 compatibility
2057 ref_seq = force_str(build_reference_sequence(src))
2058 if ref_seq is None:
2059 raise ValueError("MD tag not present")
2060 make_tuple = _alignedpairs_with_seq_cigar if _with_cigar else _alignedpairs_with_seq
2061 else:
2062 ref_seq = None
2063 make_tuple = _alignedpairs_with_cigar if _with_cigar else _alignedpairs_positions
2064
2065 r_idx = 0
2066
2067 if pysam_get_n_cigar(src) == 0:
2068 return []
2069
2070 result = []
2071 pos = src.core.pos
2072 qpos = 0
2073 cigar_p = pysam_bam_get_cigar(src)
2074 for k from 0 <= k < pysam_get_n_cigar(src):
2075 op = cigar_p[k] & BAM_CIGAR_MASK
2076 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2077
2078 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
2079 for i from pos <= i < pos + l:
2080 result.append(make_tuple(qpos, i, ref_seq, r_idx, op))
2081 r_idx += 1
2082 qpos += 1
2083 pos += l
2084
2085 elif op == BAM_CINS or op == BAM_CSOFT_CLIP or op == BAM_CPAD:
2086 if not _matches_only:
2087 for i from pos <= i < pos + l:
2088 result.append(make_tuple(qpos, None, None, 0, op))
2089 qpos += 1
2090 else:
2091 qpos += l
2092
2093 elif op == BAM_CDEL:
2094 if not _matches_only:
2095 for i from pos <= i < pos + l:
2096 result.append(make_tuple(None, i, ref_seq, r_idx, op))
2097 r_idx += 1
2098 else:
2099 r_idx += l
2100 pos += l
2101
2102 elif op == BAM_CHARD_CLIP:
2103 pass # advances neither
2104
2105 elif op == BAM_CREF_SKIP:
2106 if not _matches_only:
2107 for i from pos <= i < pos + l:
2108 result.append(make_tuple(None, i, None, 0, op))
2109
2110 pos += l
2111
2112 return result
2113
2114 def get_blocks(self):
2115 """ a list of start and end positions of
2116 aligned gapless blocks.
2117
2118 The start and end positions are in genomic
2119 coordinates.
2120
2121 Blocks are not normalized, i.e. two blocks
2122 might be directly adjacent. This happens if
2123 the two blocks are separated by an insertion
2124 in the read.
2125 """
2126
2127 cdef uint32_t k, pos, l
2128 cdef int op
2129 cdef uint32_t * cigar_p
2130 cdef bam1_t * src
2131
2132 src = self._delegate
2133 if pysam_get_n_cigar(src) == 0:
2134 return []
2135
2136 result = []
2137 pos = src.core.pos
2138 cigar_p = pysam_bam_get_cigar(src)
2139 l = 0
2140
2141 for k from 0 <= k < pysam_get_n_cigar(src):
2142 op = cigar_p[k] & BAM_CIGAR_MASK
2143 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2144 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
2145 result.append((pos, pos + l))
2146 pos += l
2147 elif op == BAM_CDEL or op == BAM_CREF_SKIP:
2148 pos += l
2149
2150 return result
2151
2152 def get_overlap(self, uint32_t start, uint32_t end):
2153 """return number of aligned bases of read overlapping the interval
2154 *start* and *end* on the reference sequence.
2155
2156 Return None if cigar alignment is not available.
2157 """
2158 cdef uint32_t k, i, pos, overlap
2159 cdef int op, o
2160 cdef uint32_t * cigar_p
2161 cdef bam1_t * src
2162
2163 overlap = 0
2164
2165 src = self._delegate
2166 if pysam_get_n_cigar(src) == 0:
2167 return None
2168 pos = src.core.pos
2169 o = 0
2170
2171 cigar_p = pysam_bam_get_cigar(src)
2172 for k from 0 <= k < pysam_get_n_cigar(src):
2173 op = cigar_p[k] & BAM_CIGAR_MASK
2174 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2175
2176 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
2177 o = min( pos + l, end) - max( pos, start )
2178 if o > 0: overlap += o
2179
2180 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP or op == BAM_CEQUAL or op == BAM_CDIFF:
2181 pos += l
2182
2183 return overlap
2184
2185 def get_cigar_stats(self):
2186 """summary of operations in cigar string.
2187
2188 The output order in the array is "MIDNSHP=X" followed by a
2189 field for the NM tag. If the NM tag is not present, this
2190 field will always be 0. (Accessing this field via index -1
2191 avoids changes if more CIGAR operators are added in future.)
2192
2193 +-----+----------------+--------+
2194 |M |pysam.CMATCH |0 |
2195 +-----+----------------+--------+
2196 |I |pysam.CINS |1 |
2197 +-----+----------------+--------+
2198 |D |pysam.CDEL |2 |
2199 +-----+----------------+--------+
2200 |N |pysam.CREF_SKIP |3 |
2201 +-----+----------------+--------+
2202 |S |pysam.CSOFT_CLIP|4 |
2203 +-----+----------------+--------+
2204 |H |pysam.CHARD_CLIP|5 |
2205 +-----+----------------+--------+
2206 |P |pysam.CPAD |6 |
2207 +-----+----------------+--------+
2208 |= |pysam.CEQUAL |7 |
2209 +-----+----------------+--------+
2210 |X |pysam.CDIFF |8 |
2211 +-----+----------------+--------+
2212 |B |pysam.CBACK |9 |
2213 +-----+----------------+--------+
2214 |NM |NM tag |10 or -1|
2215 +-----+----------------+--------+
2216
2217 If no cigar string is present, empty arrays will be returned.
2218
2219 Returns:
2220 arrays :
2221 two arrays. The first contains the nucleotide counts within
2222 each cigar operation, the second contains the number of blocks
2223 for each cigar operation.
2224
2225 """
2226
2227 cdef int nfields = NCIGAR_CODES + 1
2228
2229 cdef c_array.array base_counts = array.array(
2230 "I",
2231 [0] * nfields)
2232 cdef uint32_t [:] base_view = base_counts
2233 cdef c_array.array block_counts = array.array(
2234 "I",
2235 [0] * nfields)
2236 cdef uint32_t [:] block_view = block_counts
2237
2238 cdef bam1_t * src = self._delegate
2239 cdef int op
2240 cdef uint32_t l
2241 cdef int32_t k
2242 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
2243
2244 if cigar_p == NULL:
2245 return None
2246
2247 for k from 0 <= k < pysam_get_n_cigar(src):
2248 op = cigar_p[k] & BAM_CIGAR_MASK
2249 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2250 base_view[op] += l
2251 block_view[op] += 1
2252
2253 cdef uint8_t * v = bam_aux_get(src, 'NM')
2254 if v != NULL:
2255 base_view[nfields - 1] = <int32_t>bam_aux2i(v)
2256
2257 return base_counts, block_counts
2258
2259 #####################################################
2260 ## Unsorted as yet
2261 # TODO: capture in CIGAR object
2262 property cigartuples:
2263 """the :term:`cigar` alignment. The alignment
2264 is returned as a list of tuples of (operation, length).
2265
2266 If the alignment is not present, None is returned.
2267
2268 The operations are:
2269
2270 +-----+----------------+-----+
2271 |M |pysam.CMATCH |0 |
2272 +-----+----------------+-----+
2273 |I |pysam.CINS |1 |
2274 +-----+----------------+-----+
2275 |D |pysam.CDEL |2 |
2276 +-----+----------------+-----+
2277 |N |pysam.CREF_SKIP |3 |
2278 +-----+----------------+-----+
2279 |S |pysam.CSOFT_CLIP|4 |
2280 +-----+----------------+-----+
2281 |H |pysam.CHARD_CLIP|5 |
2282 +-----+----------------+-----+
2283 |P |pysam.CPAD |6 |
2284 +-----+----------------+-----+
2285 |= |pysam.CEQUAL |7 |
2286 +-----+----------------+-----+
2287 |X |pysam.CDIFF |8 |
2288 +-----+----------------+-----+
2289 |B |pysam.CBACK |9 |
2290 +-----+----------------+-----+
2291
2292 .. note::
2293 The output is a list of (operation, length) tuples, such as
2294 ``[(0, 30)]``.
2295 This is different from the SAM specification and
2296 the :attr:`cigarstring` property, which uses a
2297 (length, operation) order, for example: ``30M``.
2298
2299 To unset the cigar property, assign an empty list
2300 or None.
2301 """
2302 def __get__(self):
2303 cdef uint32_t * cigar_p
2304 cdef bam1_t * src
2305 cdef uint32_t op, l
2306 cdef uint32_t k
2307
2308 src = self._delegate
2309 if pysam_get_n_cigar(src) == 0:
2310 return None
2311
2312 cigar = []
2313
2314 cigar_p = pysam_bam_get_cigar(src);
2315 for k from 0 <= k < pysam_get_n_cigar(src):
2316 op = cigar_p[k] & BAM_CIGAR_MASK
2317 l = cigar_p[k] >> BAM_CIGAR_SHIFT
2318 cigar.append((op, l))
2319 return cigar
2320
2321 def __set__(self, values):
2322 cdef uint32_t * p
2323 cdef bam1_t * src
2324 cdef op, l
2325 cdef int k
2326
2327 k = 0
2328
2329 src = self._delegate
2330
2331 # get location of cigar string
2332 p = pysam_bam_get_cigar(src)
2333
2334 # empty values for cigar string
2335 if values is None:
2336 values = []
2337
2338 cdef uint32_t ncigar = len(values)
2339
2340 cdef bam1_t * retval = pysam_bam_update(src,
2341 pysam_get_n_cigar(src) * 4,
2342 ncigar * 4,
2343 <uint8_t*>p)
2344
2345 if retval == NULL:
2346 raise MemoryError("could not allocate memory")
2347
2348 # length is number of cigar operations, not bytes
2349 pysam_set_n_cigar(src, ncigar)
2350
2351 # re-acquire pointer to location in memory
2352 # as it might have moved
2353 p = pysam_bam_get_cigar(src)
2354
2355 # insert cigar operations
2356 for op, l in values:
2357 p[k] = l << BAM_CIGAR_SHIFT | op
2358 k += 1
2359
2360 ## setting the cigar string requires updating the bin
2361 update_bin(src)
2362
2363 cpdef set_tag(self,
2364 tag,
2365 value,
2366 value_type=None,
2367 replace=True):
2368 """sets a particular field *tag* to *value* in the optional alignment
2369 section.
2370
2371 *value_type* describes the type of *value* that is to entered
2372 into the alignment record. It can be set explicitly to one of
2373 the valid one-letter type codes. If unset, an appropriate type
2374 will be chosen automatically based on the python type of
2375 *value*.
2376
2377 An existing value of the same *tag* will be overwritten unless
2378 *replace* is set to False. This is usually not recommended as a
2379 tag may only appear once in the optional alignment section.
2380
2381 If *value* is `None`, the tag will be deleted.
2382
2383 This method accepts valid SAM specification value types, which
2384 are::
2385
2386 A: printable char
2387 i: signed int
2388 f: float
2389 Z: printable string
2390 H: Byte array in hex format
2391 B: Integer or numeric array
2392
2393 Additionally, it will accept the integer BAM types ('cCsSI')
2394
2395 For htslib compatibility, 'a' is synonymous with 'A' and the
2396 method accepts a 'd' type code for a double precision float.
2397
2398 When deducing the type code by the python type of *value*, the
2399 following mapping is applied::
2400
2401 i: python int
2402 f: python float
2403 Z: python str or bytes
2404 B: python array.array, list or tuple
2405
2406 Note that a single character string will be output as 'Z' and
2407 not 'A' as the former is the more general type.
2408 """
2409
2410 cdef int value_size
2411 cdef uint8_t tc
2412 cdef uint8_t * value_ptr
2413 cdef uint8_t *existing_ptr
2414 cdef float float_value
2415 cdef double double_value
2416 cdef int32_t int32_t_value
2417 cdef uint32_t uint32_t_value
2418 cdef int16_t int16_t_value
2419 cdef uint16_t uint16_t_value
2420 cdef int8_t int8_t_value
2421 cdef uint8_t uint8_t_value
2422 cdef bam1_t * src = self._delegate
2423 cdef char * _value_type
2424 cdef c_array.array array_value
2425 cdef object buffer
2426
2427 if len(tag) != 2:
2428 raise ValueError('Invalid tag: %s' % tag)
2429
2430 tag = force_bytes(tag)
2431 if replace:
2432 existing_ptr = bam_aux_get(src, tag)
2433 if existing_ptr:
2434 bam_aux_del(src, existing_ptr)
2435
2436 # setting value to None deletes a tag
2437 if value is None:
2438 return
2439
2440 cdef uint8_t typecode = get_tag_typecode(value, value_type)
2441 if typecode == 0:
2442 raise ValueError("can't guess type or invalid type code specified: {} {}".format(
2443 value, value_type))
2444
2445 # sam_format1 for typecasting
2446 if typecode == b'Z':
2447 value = force_bytes(value)
2448 value_ptr = <uint8_t*><char*>value
2449 value_size = len(value)+1
2450 elif typecode == b'H':
2451 # Note that hex tags are stored the very same
2452 # way as Z string.s
2453 value = force_bytes(value)
2454 value_ptr = <uint8_t*><char*>value
2455 value_size = len(value)+1
2456 elif typecode == b'A' or typecode == b'a':
2457 value = force_bytes(value)
2458 value_ptr = <uint8_t*><char*>value
2459 value_size = sizeof(char)
2460 typecode = b'A'
2461 elif typecode == b'i':
2462 int32_t_value = value
2463 value_ptr = <uint8_t*>&int32_t_value
2464 value_size = sizeof(int32_t)
2465 elif typecode == b'I':
2466 uint32_t_value = value
2467 value_ptr = <uint8_t*>&uint32_t_value
2468 value_size = sizeof(uint32_t)
2469 elif typecode == b's':
2470 int16_t_value = value
2471 value_ptr = <uint8_t*>&int16_t_value
2472 value_size = sizeof(int16_t)
2473 elif typecode == b'S':
2474 uint16_t_value = value
2475 value_ptr = <uint8_t*>&uint16_t_value
2476 value_size = sizeof(uint16_t)
2477 elif typecode == b'c':
2478 int8_t_value = value
2479 value_ptr = <uint8_t*>&int8_t_value
2480 value_size = sizeof(int8_t)
2481 elif typecode == b'C':
2482 uint8_t_value = value
2483 value_ptr = <uint8_t*>&uint8_t_value
2484 value_size = sizeof(uint8_t)
2485 elif typecode == b'd':
2486 double_value = value
2487 value_ptr = <uint8_t*>&double_value
2488 value_size = sizeof(double)
2489 elif typecode == b'f':
2490 float_value = value
2491 value_ptr = <uint8_t*>&float_value
2492 value_size = sizeof(float)
2493 elif typecode == b'B':
2494 # the following goes through python, needs to be cleaned up
2495 # pack array using struct
2496 fmt, args = pack_tags([(tag, value, value_type)])
2497
2498 # remove tag and type code as set by bam_aux_append
2499 # first four chars of format (<2sB)
2500 fmt = '<' + fmt[4:]
2501 # first two values to pack
2502 args = args[2:]
2503 value_size = struct.calcsize(fmt)
2504 # buffer will be freed when object goes out of scope
2505 buffer = ctypes.create_string_buffer(value_size)
2506 struct.pack_into(fmt, buffer, 0, *args)
2507 # bam_aux_append copies data from value_ptr
2508 bam_aux_append(src,
2509 tag,
2510 typecode,
2511 value_size,
2512 <uint8_t*>buffer.raw)
2513 return
2514 else:
2515 raise ValueError('unsupported value_type {} in set_option'.format(typecode))
2516
2517 bam_aux_append(src,
2518 tag,
2519 typecode,
2520 value_size,
2521 value_ptr)
2522
2523 cpdef has_tag(self, tag):
2524 """returns true if the optional alignment section
2525 contains a given *tag*."""
2526 cdef uint8_t * v
2527 cdef int nvalues
2528 btag = force_bytes(tag)
2529 v = bam_aux_get(self._delegate, btag)
2530 return v != NULL
2531
2532 cpdef get_tag(self, tag, with_value_type=False):
2533 """
2534 retrieves data from the optional alignment section
2535 given a two-letter *tag* denoting the field.
2536
2537 The returned value is cast into an appropriate python type.
2538
2539 This method is the fastest way to access the optional
2540 alignment section if only few tags need to be retrieved.
2541
2542 Possible value types are "AcCsSiIfZHB" (see BAM format
2543 specification) as well as additional value type 'd' as
2544 implemented in htslib.
2545
2546 Parameters:
2547
2548 tag :
2549 data tag.
2550
2551 with_value_type : Optional[bool]
2552 if set to True, the return value is a tuple of (tag value, type
2553 code). (default False)
2554
2555 Returns:
2556
2557 A python object with the value of the `tag`. The type of the
2558 object depends on the data type in the data record.
2559
2560 Raises:
2561
2562 KeyError
2563 If `tag` is not present, a KeyError is raised.
2564
2565 """
2566 cdef uint8_t * v
2567 cdef int nvalues
2568 btag = force_bytes(tag)
2569 v = bam_aux_get(self._delegate, btag)
2570 if v == NULL:
2571 raise KeyError("tag '%s' not present" % tag)
2572 if chr(v[0]) == "B":
2573 auxtype = chr(v[0]) + chr(v[1])
2574 else:
2575 auxtype = chr(v[0])
2576
2577 if auxtype in "iIcCsS":
2578 value = bam_aux2i(v)
2579 elif auxtype == 'f' or auxtype == 'F':
2580 value = bam_aux2f(v)
2581 elif auxtype == 'd' or auxtype == 'D':
2582 value = bam_aux2f(v)
2583 elif auxtype == 'A' or auxtype == 'a':
2584 # force A to a
2585 v[0] = b'A'
2586 # there might a more efficient way
2587 # to convert a char into a string
2588 value = '%c' % <char>bam_aux2A(v)
2589 elif auxtype == 'Z' or auxtype == 'H':
2590 # Z and H are treated equally as strings in htslib
2591 value = charptr_to_str(<char*>bam_aux2Z(v))
2592 elif auxtype[0] == 'B':
2593 bytesize, nvalues, values = convert_binary_tag(v + 1)
2594 value = values
2595 else:
2596 raise ValueError("unknown auxiliary type '%s'" % auxtype)
2597
2598 if with_value_type:
2599 return (value, auxtype)
2600 else:
2601 return value
2602
2603 def get_tags(self, with_value_type=False):
2604 """the fields in the optional alignment section.
2605
2606 Returns a list of all fields in the optional
2607 alignment section. Values are converted to appropriate python
2608 values. For example: ``[(NM, 2), (RG, "GJP00TM04")]``
2609
2610 If *with_value_type* is set, the value type as encode in
2611 the AlignedSegment record will be returned as well:
2612
2613 [(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
2614
2615 This method will convert all values in the optional alignment
2616 section. When getting only one or few tags, please see
2617 :meth:`get_tag` for a quicker way to achieve this.
2618
2619 """
2620
2621 cdef char * ctag
2622 cdef bam1_t * src
2623 cdef uint8_t * s
2624 cdef char auxtag[3]
2625 cdef char auxtype
2626 cdef uint8_t byte_size
2627 cdef int32_t nvalues
2628
2629 src = self._delegate
2630 if src.l_data == 0:
2631 return []
2632 s = pysam_bam_get_aux(src)
2633 result = []
2634 auxtag[2] = 0
2635 while s < (src.data + src.l_data):
2636 # get tag
2637 auxtag[0] = s[0]
2638 auxtag[1] = s[1]
2639 s += 2
2640 auxtype = s[0]
2641 if auxtype in (b'c', b'C'):
2642 value = <int>bam_aux2i(s)
2643 s += 1
2644 elif auxtype in (b's', b'S'):
2645 value = <int>bam_aux2i(s)
2646 s += 2
2647 elif auxtype in (b'i', b'I'):
2648 value = <int32_t>bam_aux2i(s)
2649 s += 4
2650 elif auxtype == b'f':
2651 value = <float>bam_aux2f(s)
2652 s += 4
2653 elif auxtype == b'd':
2654 value = <double>bam_aux2f(s)
2655 s += 8
2656 elif auxtype in (b'A', b'a'):
2657 value = "%c" % <char>bam_aux2A(s)
2658 s += 1
2659 elif auxtype in (b'Z', b'H'):
2660 value = charptr_to_str(<char*>bam_aux2Z(s))
2661 # +1 for NULL terminated string
2662 s += len(value) + 1
2663 elif auxtype == b'B':
2664 s += 1
2665 byte_size, nvalues, value = convert_binary_tag(s)
2666 # 5 for 1 char and 1 int
2667 s += 5 + (nvalues * byte_size) - 1
2668 else:
2669 raise KeyError("unknown type '%s'" % auxtype)
2670
2671 s += 1
2672
2673 if with_value_type:
2674 result.append((charptr_to_str(auxtag), value, chr(auxtype)))
2675 else:
2676 result.append((charptr_to_str(auxtag), value))
2677
2678 return result
2679
2680 def set_tags(self, tags):
2681 """sets the fields in the optional alignment section with
2682 a list of (tag, value) tuples.
2683
2684 The value type of the values is determined from the
2685 python type. Optionally, a type may be given explicitly as
2686 a third value in the tuple, For example:
2687
2688 x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
2689
2690 This method will not enforce the rule that the same tag may appear
2691 only once in the optional alignment section.
2692 """
2693
2694 cdef bam1_t * src
2695 cdef uint8_t * s
2696 cdef char * temp
2697 cdef int new_size = 0
2698 cdef int old_size
2699 src = self._delegate
2700
2701 # convert and pack the data
2702 if tags is not None and len(tags) > 0:
2703 fmt, args = pack_tags(tags)
2704 new_size = struct.calcsize(fmt)
2705 buffer = ctypes.create_string_buffer(new_size)
2706 struct.pack_into(fmt,
2707 buffer,
2708 0,
2709 *args)
2710
2711
2712 # delete the old data and allocate new space.
2713 # If total_size == 0, the aux field will be
2714 # empty
2715 old_size = pysam_bam_get_l_aux(src)
2716 cdef bam1_t * retval = pysam_bam_update(src,
2717 old_size,
2718 new_size,
2719 pysam_bam_get_aux(src))
2720 if retval == NULL:
2721 raise MemoryError("could not allocate memory")
2722
2723 # copy data only if there is any
2724 if new_size > 0:
2725
2726 # get location of new data
2727 s = pysam_bam_get_aux(src)
2728
2729 # check if there is direct path from buffer.raw to tmp
2730 p = buffer.raw
2731 # create handle to make sure buffer stays alive long
2732 # enough for memcpy, see issue 129
2733 temp = p
2734 memcpy(s, temp, new_size)
2735
2736
2737 ########################################################
2738 # Compatibility Accessors
2739 # Functions, properties for compatibility with pysam < 0.8
2740 #
2741 # Several options
2742 # change the factory functions according to API
2743 # * requires code changes throughout, incl passing
2744 # handles to factory functions
2745 # subclass functions and add attributes at runtime
2746 # e.g.: AlignedSegments.qname = AlignedSegments.query_name
2747 # * will slow down the default interface
2748 # explicit declaration of getters/setters
2749 ########################################################
2750 property qname:
2751 """deprecated, use :attr:`query_name` instead."""
2752 def __get__(self): return self.query_name
2753 def __set__(self, v): self.query_name = v
2754 property tid:
2755 """deprecated, use :attr:`reference_id` instead."""
2756 def __get__(self): return self.reference_id
2757 def __set__(self, v): self.reference_id = v
2758 property pos:
2759 """deprecated, use :attr:`reference_start` instead."""
2760 def __get__(self): return self.reference_start
2761 def __set__(self, v): self.reference_start = v
2762 property mapq:
2763 """deprecated, use :attr:`mapping_quality` instead."""
2764 def __get__(self): return self.mapping_quality
2765 def __set__(self, v): self.mapping_quality = v
2766 property rnext:
2767 """deprecated, use :attr:`next_reference_id` instead."""
2768 def __get__(self): return self.next_reference_id
2769 def __set__(self, v): self.next_reference_id = v
2770 property pnext:
2771 """deprecated, use :attr:`next_reference_start` instead."""
2772 def __get__(self):
2773 return self.next_reference_start
2774 def __set__(self, v):
2775 self.next_reference_start = v
2776 property cigar:
2777 """deprecated, use :attr:`cigarstring` or :attr:`cigartuples` instead."""
2778 def __get__(self):
2779 r = self.cigartuples
2780 if r is None:
2781 r = []
2782 return r
2783 def __set__(self, v): self.cigartuples = v
2784 property tlen:
2785 """deprecated, use :attr:`template_length` instead."""
2786 def __get__(self):
2787 return self.template_length
2788 def __set__(self, v):
2789 self.template_length = v
2790 property seq:
2791 """deprecated, use :attr:`query_sequence` instead."""
2792 def __get__(self):
2793 return self.query_sequence
2794 def __set__(self, v):
2795 self.query_sequence = v
2796 property qual:
2797 """deprecated, use :attr:`query_qualities` instead."""
2798 def __get__(self):
2799 return array_to_qualitystring(self.query_qualities)
2800 def __set__(self, v):
2801 self.query_qualities = qualitystring_to_array(v)
2802 property alen:
2803 """deprecated, use :attr:`reference_length` instead."""
2804 def __get__(self):
2805 return self.reference_length
2806 def __set__(self, v):
2807 self.reference_length = v
2808 property aend:
2809 """deprecated, use :attr:`reference_end` instead."""
2810 def __get__(self):
2811 return self.reference_end
2812 def __set__(self, v):
2813 self.reference_end = v
2814 property rlen:
2815 """deprecated, use :attr:`query_length` instead."""
2816 def __get__(self):
2817 return self.query_length
2818 def __set__(self, v):
2819 self.query_length = v
2820 property query:
2821 """deprecated, use :attr:`query_alignment_sequence`
2822 instead."""
2823 def __get__(self):
2824 return self.query_alignment_sequence
2825 def __set__(self, v):
2826 self.query_alignment_sequence = v
2827 property qqual:
2828 """deprecated, use :attr:`query_alignment_qualities`
2829 instead."""
2830 def __get__(self):
2831 return array_to_qualitystring(self.query_alignment_qualities)
2832 def __set__(self, v):
2833 self.query_alignment_qualities = qualitystring_to_array(v)
2834 property qstart:
2835 """deprecated, use :attr:`query_alignment_start` instead."""
2836 def __get__(self):
2837 return self.query_alignment_start
2838 def __set__(self, v):
2839 self.query_alignment_start = v
2840 property qend:
2841 """deprecated, use :attr:`query_alignment_end` instead."""
2842 def __get__(self):
2843 return self.query_alignment_end
2844 def __set__(self, v):
2845 self.query_alignment_end = v
2846 property qlen:
2847 """deprecated, use :attr:`query_alignment_length`
2848 instead."""
2849 def __get__(self):
2850 return self.query_alignment_length
2851 def __set__(self, v):
2852 self.query_alignment_length = v
2853 property mrnm:
2854 """deprecated, use :attr:`next_reference_id` instead."""
2855 def __get__(self):
2856 return self.next_reference_id
2857 def __set__(self, v):
2858 self.next_reference_id = v
2859 property mpos:
2860 """deprecated, use :attr:`next_reference_start`
2861 instead."""
2862 def __get__(self):
2863 return self.next_reference_start
2864 def __set__(self, v):
2865 self.next_reference_start = v
2866 property rname:
2867 """deprecated, use :attr:`reference_id` instead."""
2868 def __get__(self):
2869 return self.reference_id
2870 def __set__(self, v):
2871 self.reference_id = v
2872 property isize:
2873 """deprecated, use :attr:`template_length` instead."""
2874 def __get__(self):
2875 return self.template_length
2876 def __set__(self, v):
2877 self.template_length = v
2878 property blocks:
2879 """deprecated, use :meth:`get_blocks()` instead."""
2880 def __get__(self):
2881 return self.get_blocks()
2882 property aligned_pairs:
2883 """deprecated, use :meth:`get_aligned_pairs()` instead."""
2884 def __get__(self):
2885 return self.get_aligned_pairs()
2886 property inferred_length:
2887 """deprecated, use :meth:`infer_query_length()` instead."""
2888 def __get__(self):
2889 return self.infer_query_length()
2890 property positions:
2891 """deprecated, use :meth:`get_reference_positions()` instead."""
2892 def __get__(self):
2893 return self.get_reference_positions()
2894 property tags:
2895 """deprecated, use :meth:`get_tags()` instead."""
2896 def __get__(self):
2897 return self.get_tags()
2898 def __set__(self, tags):
2899 self.set_tags(tags)
2900 def overlap(self):
2901 """deprecated, use :meth:`get_overlap()` instead."""
2902 return self.get_overlap()
2903 def opt(self, tag):
2904 """deprecated, use :meth:`get_tag()` instead."""
2905 return self.get_tag(tag)
2906 def setTag(self, tag, value, value_type=None, replace=True):
2907 """deprecated, use :meth:`set_tag()` instead."""
2908 return self.set_tag(tag, value, value_type, replace)
2909
2910
2911 cdef class PileupColumn:
2912 '''A pileup of reads at a particular reference sequence position
2913 (:term:`column`). A pileup column contains all the reads that map
2914 to a certain target base.
2915
2916 This class is a proxy for results returned by the samtools pileup
2917 engine. If the underlying engine iterator advances, the results
2918 of this column will change.
2919 '''
2920 def __init__(self):
2921 raise TypeError("this class cannot be instantiated from Python")
2922
2923 def __str__(self):
2924 return "\t".join(map(str,
2925 (self.reference_id,
2926 self.reference_pos,
2927 self.nsegments))) +\
2928 "\n" +\
2929 "\n".join(map(str, self.pileups))
2930
2931 def __dealloc__(self):
2932 free(self.buf.s)
2933
2934 def set_min_base_quality(self, min_base_quality):
2935 """set the minimum base quality for this pileup column.
2936 """
2937 self.min_base_quality = min_base_quality
2938
2939 def __len__(self):
2940 """return number of reads aligned to this column.
2941
2942 see :meth:`get_num_aligned`
2943 """
2944 return self.get_num_aligned()
2945
2946 property reference_id:
2947 '''the reference sequence number as defined in the header'''
2948 def __get__(self):
2949 return self.tid
2950
2951 property reference_name:
2952 """:term:`reference` name (None if no AlignmentFile is associated)"""
2953 def __get__(self):
2954 if self.header is not None:
2955 return self.header.get_reference_name(self.tid)
2956 return None
2957
2958 property nsegments:
2959 '''number of reads mapping to this column.
2960
2961 Note that this number ignores the base quality filter.'''
2962 def __get__(self):
2963 return self.n_pu
2964 def __set__(self, n):
2965 self.n_pu = n
2966
2967 property reference_pos:
2968 '''the position in the reference sequence (0-based).'''
2969 def __get__(self):
2970 return self.pos
2971
2972 property pileups:
2973 '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
2974 def __get__(self):
2975 if self.plp == NULL or self.plp[0] == NULL:
2976 raise ValueError("PileupColumn accessed after iterator finished")
2977
2978 cdef int x
2979 cdef const bam_pileup1_t * p = NULL
2980 pileups = []
2981
2982 # warning: there could be problems if self.n and self.buf are
2983 # out of sync.
2984 for x from 0 <= x < self.n_pu:
2985 p = &(self.plp[0][x])
2986 if p == NULL:
2987 raise ValueError(
2988 "pileup buffer out of sync - most likely use of iterator "
2989 "outside loop")
2990 if pileup_base_qual_skip(p, self.min_base_quality):
2991 continue
2992 pileups.append(makePileupRead(p, self.header))
2993 return pileups
2994
2995 ########################################################
2996 # Compatibility Accessors
2997 # Functions, properties for compatibility with pysam < 0.8
2998 ########################################################
2999 property pos:
3000 """deprecated, use :attr:`reference_pos` instead."""
3001 def __get__(self):
3002 return self.reference_pos
3003 def __set__(self, v):
3004 self.reference_pos = v
3005
3006 property tid:
3007 """deprecated, use :attr:`reference_id` instead."""
3008 def __get__(self):
3009 return self.reference_id
3010 def __set__(self, v):
3011 self.reference_id = v
3012
3013 property n:
3014 """deprecated, use :attr:`nsegments` instead."""
3015 def __get__(self):
3016 return self.nsegments
3017 def __set__(self, v):
3018 self.nsegments = v
3019
3020 def get_num_aligned(self):
3021 """return number of aligned bases at pileup column position.
3022
3023 This method applies a base quality filter and the number is
3024 equal to the size of :meth:`get_query_sequences`,
3025 :meth:`get_mapping_qualities`, etc.
3026
3027 """
3028 cdef uint32_t x = 0
3029 cdef uint32_t c = 0
3030 cdef uint32_t cnt = 0
3031 cdef const bam_pileup1_t * p = NULL
3032 if self.plp == NULL or self.plp[0] == NULL:
3033 raise ValueError("PileupColumn accessed after iterator finished")
3034
3035 for x from 0 <= x < self.n_pu:
3036 p = &(self.plp[0][x])
3037 if p == NULL:
3038 raise ValueError(
3039 "pileup buffer out of sync - most likely use of iterator "
3040 "outside loop")
3041 if pileup_base_qual_skip(p, self.min_base_quality):
3042 continue
3043 cnt += 1
3044 return cnt
3045
3046 def get_query_sequences(self, bint mark_matches=False, bint mark_ends=False, bint add_indels=False):
3047 """query bases/sequences at pileup column position.
3048
3049 Optionally, the bases/sequences can be annotated according to the samtools
3050 mpileup format. This is the format description from the samtools mpileup tool::
3051
3052 Information on match, mismatch, indel, strand, mapping
3053 quality and start and end of a read are all encoded at the
3054 read base column. At this column, a dot stands for a match
3055 to the reference base on the forward strand, a comma for a
3056 match on the reverse strand, a '>' or '<' for a reference
3057 skip, `ACGTN' for a mismatch on the forward strand and
3058 `acgtn' for a mismatch on the reverse strand. A pattern
3059 `\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion
3060 between this reference position and the next reference
3061 position. The length of the insertion is given by the
3062 integer in the pattern, followed by the inserted
3063 sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
3064 represents a deletion from the reference. The deleted bases
3065 will be presented as `*' in the following lines. Also at
3066 the read base column, a symbol `^' marks the start of a
3067 read. The ASCII of the character following `^' minus 33
3068 gives the mapping quality. A symbol `$' marks the end of a
3069 read segment
3070
3071 To reproduce samtools mpileup format, set all of mark_matches,
3072 mark_ends and add_indels to True.
3073
3074 Parameters
3075 ----------
3076
3077 mark_matches: bool
3078
3079 If True, output bases matching the reference as "." or ","
3080 for forward and reverse strand, respectively. This mark
3081 requires the reference sequence. If no reference is
3082 present, this option is ignored.
3083
3084 mark_ends : bool
3085
3086 If True, add markers "^" and "$" for read start and end, respectively.
3087
3088 add_indels : bool
3089
3090 If True, add bases for bases inserted into or skipped from the
3091 reference. The latter requires a reference sequence file to have
3092 been given, e.g. via `pileup(fastafile = ...)`. If no reference
3093 sequence is available, skipped bases are represented as 'N's.
3094
3095 Returns
3096 -------
3097
3098 a list of bases/sequences per read at pileup column position. : list
3099
3100 """
3101 cdef uint32_t x = 0
3102 cdef uint32_t j = 0
3103 cdef uint32_t c = 0
3104 cdef uint8_t cc = 0
3105 cdef uint8_t rb = 0
3106 cdef kstring_t * buf = &self.buf
3107 cdef const bam_pileup1_t * p = NULL
3108
3109 if self.plp == NULL or self.plp[0] == NULL:
3110 raise ValueError("PileupColumn accessed after iterator finished")
3111
3112 buf.l = 0
3113
3114 # todo: reference sequence to count matches/mismatches
3115 # todo: convert assertions to exceptions
3116 for x from 0 <= x < self.n_pu:
3117 p = &(self.plp[0][x])
3118 if p == NULL:
3119 raise ValueError(
3120 "pileup buffer out of sync - most likely use of iterator "
3121 "outside loop")
3122 if pileup_base_qual_skip(p, self.min_base_quality):
3123 continue
3124 # see samtools pileup_seq
3125 if mark_ends and p.is_head:
3126 kputc(b'^', buf)
3127
3128 if p.b.core.qual > 93:
3129 kputc(126, buf)
3130 else:
3131 kputc(p.b.core.qual + 33, buf)
3132 if not p.is_del:
3133 if p.qpos < p.b.core.l_qseq:
3134 cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
3135 else:
3136 cc = b'N'
3137
3138 if mark_matches and self.reference_sequence != NULL:
3139 rb = self.reference_sequence[self.reference_pos]
3140 if seq_nt16_table[cc] == seq_nt16_table[rb]:
3141 cc = b'='
3142 kputc(strand_mark_char(cc, p.b), buf)
3143 elif add_indels:
3144 if p.is_refskip:
3145 if bam_is_rev(p.b):
3146 kputc(b'<', buf)
3147 else:
3148 kputc(b'>', buf)
3149 else:
3150 kputc(b'*', buf)
3151 if add_indels:
3152 if p.indel > 0:
3153 kputc(b'+', buf)
3154 kputw(p.indel, buf)
3155 for j from 1 <= j <= p.indel:
3156 cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
3157 kputc(strand_mark_char(cc, p.b), buf)
3158 elif p.indel < 0:
3159 kputc(b'-', buf)
3160 kputw(-p.indel, buf)
3161 for j from 1 <= j <= -p.indel:
3162 # TODO: out-of-range check here?
3163 if self.reference_sequence == NULL:
3164 cc = b'N'
3165 else:
3166 cc = self.reference_sequence[self.reference_pos + j]
3167 kputc(strand_mark_char(cc, p.b), buf)
3168 if mark_ends and p.is_tail:
3169 kputc(b'$', buf)
3170
3171 kputc(b':', buf)
3172
3173 if buf.l == 0:
3174 # could be zero if all qualities are too low
3175 return ""
3176 else:
3177 # quicker to ensemble all and split than to encode all separately.
3178 # ignore last ":"
3179 return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":")
3180
3181 def get_query_qualities(self):
3182 """query base quality scores at pileup column position.
3183
3184 Returns
3185 -------
3186
3187 a list of quality scores : list
3188 """
3189 cdef uint32_t x = 0
3190 cdef const bam_pileup1_t * p = NULL
3191 cdef uint32_t c = 0
3192 result = []
3193 for x from 0 <= x < self.n_pu:
3194 p = &(self.plp[0][x])
3195 if p == NULL:
3196 raise ValueError(
3197 "pileup buffer out of sync - most likely use of iterator "
3198 "outside loop")
3199
3200 if p.qpos < p.b.core.l_qseq:
3201 c = bam_get_qual(p.b)[p.qpos]
3202 else:
3203 c = 0
3204 if c < self.min_base_quality:
3205 continue
3206 result.append(c)
3207 return result
3208
3209 def get_mapping_qualities(self):
3210 """query mapping quality scores at pileup column position.
3211
3212 Returns
3213 -------
3214
3215 a list of quality scores : list
3216 """
3217 if self.plp == NULL or self.plp[0] == NULL:
3218 raise ValueError("PileupColumn accessed after iterator finished")
3219
3220 cdef uint32_t x = 0
3221 cdef const bam_pileup1_t * p = NULL
3222 result = []
3223 for x from 0 <= x < self.n_pu:
3224 p = &(self.plp[0][x])
3225 if p == NULL:
3226 raise ValueError(
3227 "pileup buffer out of sync - most likely use of iterator "
3228 "outside loop")
3229
3230 if pileup_base_qual_skip(p, self.min_base_quality):
3231 continue
3232 result.append(p.b.core.qual)
3233 return result
3234
3235 def get_query_positions(self):
3236 """positions in read at pileup column position.
3237
3238 Returns
3239 -------
3240
3241 a list of read positions : list
3242 """
3243 if self.plp == NULL or self.plp[0] == NULL:
3244 raise ValueError("PileupColumn accessed after iterator finished")
3245
3246 cdef uint32_t x = 0
3247 cdef const bam_pileup1_t * p = NULL
3248 result = []
3249 for x from 0 <= x < self.n_pu:
3250 p = &(self.plp[0][x])
3251 if p == NULL:
3252 raise ValueError(
3253 "pileup buffer out of sync - most likely use of iterator "
3254 "outside loop")
3255
3256 if pileup_base_qual_skip(p, self.min_base_quality):
3257 continue
3258 result.append(p.qpos)
3259 return result
3260
3261 def get_query_names(self):
3262 """query/read names aligned at pileup column position.
3263
3264 Returns
3265 -------
3266
3267 a list of query names at pileup column position. : list
3268 """
3269 if self.plp == NULL or self.plp[0] == NULL:
3270 raise ValueError("PileupColumn accessed after iterator finished")
3271
3272 cdef uint32_t x = 0
3273 cdef const bam_pileup1_t * p = NULL
3274 result = []
3275 for x from 0 <= x < self.n_pu:
3276 p = &(self.plp[0][x])
3277 if p == NULL:
3278 raise ValueError(
3279 "pileup buffer out of sync - most likely use of iterator "
3280 "outside loop")
3281
3282 if pileup_base_qual_skip(p, self.min_base_quality):
3283 continue
3284 result.append(charptr_to_str(pysam_bam_get_qname(p.b)))
3285 return result
3286
3287
3288 cdef class PileupRead:
3289 '''Representation of a read aligned to a particular position in the
3290 reference sequence.
3291
3292 '''
3293
3294 def __init__(self):
3295 raise TypeError(
3296 "this class cannot be instantiated from Python")
3297
3298 def __str__(self):
3299 return "\t".join(
3300 map(str,
3301 (self.alignment, self.query_position,
3302 self.indel, self.level,
3303 self.is_del, self.is_head,
3304 self.is_tail, self.is_refskip)))
3305
3306 property alignment:
3307 """a :class:`pysam.AlignedSegment` object of the aligned read"""
3308 def __get__(self):
3309 return self._alignment
3310
3311 property query_position:
3312 """position of the read base at the pileup site, 0-based.
3313 None if :attr:`is_del` or :attr:`is_refskip` is set.
3314
3315 """
3316 def __get__(self):
3317 if self.is_del or self.is_refskip:
3318 return None
3319 else:
3320 return self._qpos
3321
3322 property query_position_or_next:
3323 """position of the read base at the pileup site, 0-based.
3324
3325 If the current position is a deletion, returns the next
3326 aligned base.
3327
3328 """
3329 def __get__(self):
3330 return self._qpos
3331
3332 property indel:
3333 """indel length for the position following the current pileup site.
3334
3335 This quantity peeks ahead to the next cigar operation in this
3336 alignment. If the next operation is an insertion, indel will
3337 be positive. If the next operation is a deletion, it will be
3338 negation. 0 if the next operation is not an indel.
3339
3340 """
3341 def __get__(self):
3342 return self._indel
3343
3344 property level:
3345 """the level of the read in the "viewer" mode. Note that this value
3346 is currently not computed."""
3347 def __get__(self):
3348 return self._level
3349
3350 property is_del:
3351 """1 iff the base on the padded read is a deletion"""
3352 def __get__(self):
3353 return self._is_del
3354
3355 property is_head:
3356 """1 iff the base on the padded read is the left-most base."""
3357 def __get__(self):
3358 return self._is_head
3359
3360 property is_tail:
3361 """1 iff the base on the padded read is the right-most base."""
3362 def __get__(self):
3363 return self._is_tail
3364
3365 property is_refskip:
3366 """1 iff the base on the padded read is part of CIGAR N op."""
3367 def __get__(self):
3368 return self._is_refskip
3369
3370
3371
3372 cpdef enum CIGAR_OPS:
3373 CMATCH = 0
3374 CINS = 1
3375 CDEL = 2
3376 CREF_SKIP = 3
3377 CSOFT_CLIP = 4
3378 CHARD_CLIP = 5
3379 CPAD = 6
3380 CEQUAL = 7
3381 CDIFF = 8
3382 CBACK = 9
3383
3384
3385 cpdef enum SAM_FLAGS:
3386 # the read is paired in sequencing, no matter whether it is mapped in a pair
3387 FPAIRED = 1
3388 # the read is mapped in a proper pair
3389 FPROPER_PAIR = 2
3390 # the read itself is unmapped; conflictive with FPROPER_PAIR
3391 FUNMAP = 4
3392 # the mate is unmapped
3393 FMUNMAP = 8
3394 # the read is mapped to the reverse strand
3395 FREVERSE = 16
3396 # the mate is mapped to the reverse strand
3397 FMREVERSE = 32
3398 # this is read1
3399 FREAD1 = 64
3400 # this is read2
3401 FREAD2 = 128
3402 # not primary alignment
3403 FSECONDARY = 256
3404 # QC failure
3405 FQCFAIL = 512
3406 # optical or PCR duplicate
3407 FDUP = 1024
3408 # supplementary alignment
3409 FSUPPLEMENTARY = 2048
3410
3411
3412 __all__ = [
3413 "AlignedSegment",
3414 "PileupColumn",
3415 "PileupRead",
3416 "CMATCH",
3417 "CINS",
3418 "CDEL",
3419 "CREF_SKIP",
3420 "CSOFT_CLIP",
3421 "CHARD_CLIP",
3422 "CPAD",
3423 "CEQUAL",
3424 "CDIFF",
3425 "CBACK",
3426 "FPAIRED",
3427 "FPROPER_PAIR",
3428 "FUNMAP",
3429 "FMUNMAP",
3430 "FREVERSE",
3431 "FMREVERSE",
3432 "FREAD1",
3433 "FREAD2",
3434 "FSECONDARY",
3435 "FQCFAIL",
3436 "FDUP",
3437 "FSUPPLEMENTARY",
3438 "KEY_NAMES"]