Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignedsegment.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 # cython: language_level=3 | |
2 # cython: embedsignature=True | |
3 # cython: profile=True | |
4 ############################################################################### | |
5 ############################################################################### | |
6 # Cython wrapper for SAM/BAM/CRAM files based on htslib | |
7 ############################################################################### | |
8 # The principal classes defined in this module are: | |
9 # | |
10 # class AlignedSegment an aligned segment (read) | |
11 # | |
12 # class PileupColumn a collection of segments (PileupRead) aligned to | |
13 # a particular genomic position. | |
14 # | |
15 # class PileupRead an AlignedSegment aligned to a particular genomic | |
16 # position. Contains additional attributes with respect | |
17 # to this. | |
18 # | |
19 # Additionally this module defines numerous additional classes that are part | |
20 # of the internal API. These are: | |
21 # | |
22 # Various iterator classes to iterate over alignments in sequential (IteratorRow) | |
23 # or in a stacked fashion (IteratorColumn): | |
24 # | |
25 # class IteratorRow | |
26 # class IteratorRowRegion | |
27 # class IteratorRowHead | |
28 # class IteratorRowAll | |
29 # class IteratorRowAllRefs | |
30 # class IteratorRowSelection | |
31 # | |
32 ############################################################################### | |
33 # | |
34 # The MIT License | |
35 # | |
36 # Copyright (c) 2015 Andreas Heger | |
37 # | |
38 # Permission is hereby granted, free of charge, to any person obtaining a | |
39 # copy of this software and associated documentation files (the "Software"), | |
40 # to deal in the Software without restriction, including without limitation | |
41 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
42 # and/or sell copies of the Software, and to permit persons to whom the | |
43 # Software is furnished to do so, subject to the following conditions: | |
44 # | |
45 # The above copyright notice and this permission notice shall be included in | |
46 # all copies or substantial portions of the Software. | |
47 # | |
48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
54 # DEALINGS IN THE SOFTWARE. | |
55 # | |
56 ############################################################################### | |
57 import re | |
58 import array | |
59 import json | |
60 import string | |
61 import ctypes | |
62 import struct | |
63 | |
64 cimport cython | |
65 from cpython cimport array as c_array | |
66 from cpython cimport PyBytes_FromStringAndSize | |
67 from libc.string cimport memset, strchr | |
68 from cpython cimport array as c_array | |
69 from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \ | |
70 INT8_MAX, INT16_MAX, INT32_MAX, \ | |
71 UINT8_MAX, UINT16_MAX, UINT32_MAX | |
72 | |
73 from pysam.libchtslib cimport HTS_IDX_NOCOOR | |
74 from pysam.libcutils cimport force_bytes, force_str, \ | |
75 charptr_to_str, charptr_to_bytes | |
76 from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \ | |
77 array_to_qualitystring | |
78 | |
79 # Constants for binary tag conversion | |
80 cdef char * htslib_types = 'cCsSiIf' | |
81 cdef char * parray_types = 'bBhHiIf' | |
82 | |
83 # translation tables | |
84 | |
85 # cigar code to character and vice versa | |
86 cdef char* CODE2CIGAR= "MIDNSHP=XB" | |
87 cdef int NCIGAR_CODES = 10 | |
88 | |
89 CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR)) | |
90 CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])") | |
91 | |
92 # names for keys in dictionary representation of an AlignedSegment | |
93 KEY_NAMES = ["name", "flag", "ref_name", "ref_pos", "map_quality", "cigar", | |
94 "next_ref_name", "next_ref_pos", "length", "seq", "qual", "tags"] | |
95 | |
96 ##################################################################### | |
97 # C multiplication with wrapping around | |
98 cdef inline uint32_t c_mul(uint32_t a, uint32_t b): | |
99 return (a * b) & 0xffffffff | |
100 | |
101 | |
102 cdef inline uint8_t tolower(uint8_t ch): | |
103 if ch >= 65 and ch <= 90: | |
104 return ch + 32 | |
105 else: | |
106 return ch | |
107 | |
108 | |
109 cdef inline uint8_t toupper(uint8_t ch): | |
110 if ch >= 97 and ch <= 122: | |
111 return ch - 32 | |
112 else: | |
113 return ch | |
114 | |
115 | |
116 cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b): | |
117 if ch == b'=': | |
118 if bam_is_rev(b): | |
119 return b',' | |
120 else: | |
121 return b'.' | |
122 else: | |
123 if bam_is_rev(b): | |
124 return tolower(ch) | |
125 else: | |
126 return toupper(ch) | |
127 | |
128 | |
129 cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold): | |
130 cdef uint32_t c | |
131 if p.qpos < p.b.core.l_qseq: | |
132 c = bam_get_qual(p.b)[p.qpos] | |
133 else: | |
134 c = 0 | |
135 if c < threshold: | |
136 return True | |
137 return False | |
138 | |
139 | |
140 cdef inline char map_typecode_htslib_to_python(uint8_t s): | |
141 """map an htslib typecode to the corresponding python typecode | |
142 to be used in the struct or array modules.""" | |
143 | |
144 # map type from htslib to python array | |
145 cdef char * f = strchr(htslib_types, s) | |
146 | |
147 if f == NULL: | |
148 return 0 | |
149 return parray_types[f - htslib_types] | |
150 | |
151 | |
152 cdef inline uint8_t map_typecode_python_to_htslib(char s): | |
153 """determine value type from type code of array""" | |
154 cdef char * f = strchr(parray_types, s) | |
155 if f == NULL: | |
156 return 0 | |
157 return htslib_types[f - parray_types] | |
158 | |
159 | |
160 cdef inline void update_bin(bam1_t * src): | |
161 if src.core.flag & BAM_FUNMAP: | |
162 # treat alignment as length of 1 for unmapped reads | |
163 src.core.bin = hts_reg2bin( | |
164 src.core.pos, | |
165 src.core.pos + 1, | |
166 14, | |
167 5) | |
168 elif pysam_get_n_cigar(src): | |
169 src.core.bin = hts_reg2bin( | |
170 src.core.pos, | |
171 bam_endpos(src), | |
172 14, | |
173 5) | |
174 else: | |
175 src.core.bin = hts_reg2bin( | |
176 src.core.pos, | |
177 src.core.pos + 1, | |
178 14, | |
179 5) | |
180 | |
181 | |
182 # optional tag data manipulation | |
183 cdef convert_binary_tag(uint8_t * tag): | |
184 """return bytesize, number of values and array of values | |
185 in aux_data memory location pointed to by tag.""" | |
186 cdef uint8_t auxtype | |
187 cdef uint8_t byte_size | |
188 cdef int32_t nvalues | |
189 # get byte size | |
190 auxtype = tag[0] | |
191 byte_size = aux_type2size(auxtype) | |
192 tag += 1 | |
193 # get number of values in array | |
194 nvalues = (<int32_t*>tag)[0] | |
195 tag += 4 | |
196 | |
197 # define python array | |
198 cdef c_array.array c_values = array.array( | |
199 chr(map_typecode_htslib_to_python(auxtype))) | |
200 c_array.resize(c_values, nvalues) | |
201 | |
202 # copy data | |
203 memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size) | |
204 | |
205 # no need to check for endian-ness as bam1_core_t fields | |
206 # and aux_data are in host endian-ness. See sam.c and calls | |
207 # to swap_data | |
208 return byte_size, nvalues, c_values | |
209 | |
210 | |
211 cdef inline uint8_t get_tag_typecode(value, value_type=None): | |
212 """guess type code for a *value*. If *value_type* is None, the type | |
213 code will be inferred based on the Python type of *value* | |
214 | |
215 """ | |
216 # 0 is unknown typecode | |
217 cdef char typecode = 0 | |
218 | |
219 if value_type is None: | |
220 if isinstance(value, int): | |
221 if value < 0: | |
222 if value >= INT8_MIN: | |
223 typecode = b'c' | |
224 elif value >= INT16_MIN: | |
225 typecode = b's' | |
226 elif value >= INT32_MIN: | |
227 typecode = b'i' | |
228 # unsigned ints | |
229 else: | |
230 if value <= UINT8_MAX: | |
231 typecode = b'C' | |
232 elif value <= UINT16_MAX: | |
233 typecode = b'S' | |
234 elif value <= UINT32_MAX: | |
235 typecode = b'I' | |
236 elif isinstance(value, float): | |
237 typecode = b'f' | |
238 elif isinstance(value, str): | |
239 typecode = b'Z' | |
240 elif isinstance(value, bytes): | |
241 typecode = b'Z' | |
242 elif isinstance(value, array.array) or \ | |
243 isinstance(value, list) or \ | |
244 isinstance(value, tuple): | |
245 typecode = b'B' | |
246 else: | |
247 if value_type in 'aAsSIcCZidfH': | |
248 typecode = force_bytes(value_type)[0] | |
249 | |
250 return typecode | |
251 | |
252 | |
253 cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): | |
254 '''returns the value typecode of a value. | |
255 | |
256 If max is specified, the appropriate type is returned for a range | |
257 where value is the minimum. | |
258 | |
259 Note that this method returns types from the extended BAM alphabet | |
260 of types that includes tags that are not part of the SAM | |
261 specification. | |
262 ''' | |
263 | |
264 | |
265 cdef uint8_t typecode | |
266 | |
267 t = type(value) | |
268 | |
269 if t is float: | |
270 typecode = b'f' | |
271 elif t is int: | |
272 if max_value is None: | |
273 max_value = value | |
274 if min_value is None: | |
275 min_value = value | |
276 # signed ints | |
277 if min_value < 0: | |
278 if min_value >= INT8_MIN and max_value <= INT8_MAX: | |
279 typecode = b'c' | |
280 elif min_value >= INT16_MIN and max_value <= INT16_MAX: | |
281 typecode = b's' | |
282 elif min_value >= INT32_MIN or max_value <= INT32_MAX: | |
283 typecode = b'i' | |
284 else: | |
285 raise ValueError( | |
286 "at least one signed integer out of range of " | |
287 "BAM/SAM specification") | |
288 # unsigned ints | |
289 else: | |
290 if max_value <= UINT8_MAX: | |
291 typecode = b'C' | |
292 elif max_value <= UINT16_MAX: | |
293 typecode = b'S' | |
294 elif max_value <= UINT32_MAX: | |
295 typecode = b'I' | |
296 else: | |
297 raise ValueError( | |
298 "at least one integer out of range of BAM/SAM specification") | |
299 else: | |
300 # Note: hex strings (H) are not supported yet | |
301 if t is not bytes: | |
302 value = value.encode('ascii') | |
303 if len(value) == 1: | |
304 typecode = b'A' | |
305 else: | |
306 typecode = b'Z' | |
307 | |
308 return typecode | |
309 | |
310 | |
311 # mapping python array.array and htslib typecodes to struct typecodes | |
312 DATATYPE2FORMAT = { | |
313 ord('c'): ('b', 1), | |
314 ord('C'): ('B', 1), | |
315 ord('s'): ('h', 2), | |
316 ord('S'): ('H', 2), | |
317 ord('i'): ('i', 4), | |
318 ord('I'): ('I', 4), | |
319 ord('f'): ('f', 4), | |
320 ord('d'): ('d', 8), | |
321 ord('A'): ('c', 1), | |
322 ord('a'): ('c', 1)} | |
323 | |
324 | |
325 cdef inline pack_tags(tags): | |
326 """pack a list of tags. Each tag is a tuple of (tag, tuple). | |
327 | |
328 Values are packed into the most space efficient data structure | |
329 possible unless the tag contains a third field with the typecode. | |
330 | |
331 Returns a format string and the associated list of arguments to be | |
332 used in a call to struct.pack_into. | |
333 """ | |
334 fmts, args = ["<"], [] | |
335 | |
336 # htslib typecode | |
337 cdef uint8_t typecode | |
338 for tag in tags: | |
339 | |
340 if len(tag) == 2: | |
341 pytag, value = tag | |
342 valuetype = None | |
343 elif len(tag) == 3: | |
344 pytag, value, valuetype = tag | |
345 else: | |
346 raise ValueError("malformatted tag: %s" % str(tag)) | |
347 | |
348 if valuetype is None: | |
349 typecode = 0 | |
350 else: | |
351 # only first character in valuecode matters | |
352 typecode = force_bytes(valuetype)[0] | |
353 | |
354 pytag = force_bytes(pytag) | |
355 pytype = type(value) | |
356 | |
357 if pytype is tuple or pytype is list: | |
358 # binary tags from tuples or lists | |
359 if not typecode: | |
360 # automatically determine value type - first value | |
361 # determines type. If there is a mix of types, the | |
362 # result is undefined. | |
363 typecode = get_btag_typecode(min(value), | |
364 min_value=min(value), | |
365 max_value=max(value)) | |
366 | |
367 if typecode not in DATATYPE2FORMAT: | |
368 raise ValueError("invalid value type '{}'".format(chr(typecode))) | |
369 | |
370 datafmt = "2sBBI%i%s" % (len(value), DATATYPE2FORMAT[typecode][0]) | |
371 args.extend([pytag[:2], | |
372 ord("B"), | |
373 typecode, | |
374 len(value)] + list(value)) | |
375 | |
376 elif isinstance(value, array.array): | |
377 # binary tags from arrays | |
378 if typecode == 0: | |
379 typecode = map_typecode_python_to_htslib(ord(value.typecode)) | |
380 | |
381 if typecode == 0: | |
382 raise ValueError("unsupported type code '{}'".format(value.typecode)) | |
383 | |
384 if typecode not in DATATYPE2FORMAT: | |
385 raise ValueError("invalid value type '{}'".format(chr(typecode))) | |
386 | |
387 # use array.tostring() to retrieve byte representation and | |
388 # save as bytes | |
389 datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1]) | |
390 args.extend([pytag[:2], | |
391 ord("B"), | |
392 typecode, | |
393 len(value), | |
394 value.tobytes()]) | |
395 | |
396 else: | |
397 if typecode == 0: | |
398 typecode = get_tag_typecode(value) | |
399 if typecode == 0: | |
400 raise ValueError("could not deduce typecode for value {}".format(value)) | |
401 | |
402 if typecode == b'a' or typecode == b'A' or typecode == b'Z' or typecode == b'H': | |
403 value = force_bytes(value) | |
404 | |
405 if typecode == b"a": | |
406 typecode = b'A' | |
407 | |
408 if typecode == b'Z' or typecode == b'H': | |
409 datafmt = "2sB%is" % (len(value)+1) | |
410 elif typecode in DATATYPE2FORMAT: | |
411 datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0] | |
412 else: | |
413 raise ValueError("invalid value type '{}'".format(chr(typecode))) | |
414 | |
415 args.extend([pytag[:2], | |
416 typecode, | |
417 value]) | |
418 | |
419 fmts.append(datafmt) | |
420 | |
421 return "".join(fmts), args | |
422 | |
423 | |
424 cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src): | |
425 """return query length computed from CIGAR alignment. | |
426 | |
427 Length ignores hard-clipped bases. | |
428 | |
429 Return 0 if there is no CIGAR alignment. | |
430 """ | |
431 | |
432 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) | |
433 | |
434 if cigar_p == NULL: | |
435 return 0 | |
436 | |
437 cdef uint32_t k, qpos | |
438 cdef int op | |
439 qpos = 0 | |
440 | |
441 for k from 0 <= k < pysam_get_n_cigar(src): | |
442 op = cigar_p[k] & BAM_CIGAR_MASK | |
443 | |
444 if op == BAM_CMATCH or \ | |
445 op == BAM_CINS or \ | |
446 op == BAM_CSOFT_CLIP or \ | |
447 op == BAM_CEQUAL or \ | |
448 op == BAM_CDIFF: | |
449 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT | |
450 | |
451 return qpos | |
452 | |
453 | |
454 cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src): | |
455 """return query length computed from CIGAR alignment. | |
456 | |
457 Length includes hard-clipped bases. | |
458 | |
459 Return 0 if there is no CIGAR alignment. | |
460 """ | |
461 | |
462 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) | |
463 | |
464 if cigar_p == NULL: | |
465 return 0 | |
466 | |
467 cdef uint32_t k, qpos | |
468 cdef int op | |
469 qpos = 0 | |
470 | |
471 for k from 0 <= k < pysam_get_n_cigar(src): | |
472 op = cigar_p[k] & BAM_CIGAR_MASK | |
473 | |
474 if op == BAM_CMATCH or \ | |
475 op == BAM_CINS or \ | |
476 op == BAM_CSOFT_CLIP or \ | |
477 op == BAM_CHARD_CLIP or \ | |
478 op == BAM_CEQUAL or \ | |
479 op == BAM_CDIFF: | |
480 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT | |
481 | |
482 return qpos | |
483 | |
484 | |
485 cdef inline int32_t getQueryStart(bam1_t *src) except -1: | |
486 cdef uint32_t * cigar_p | |
487 cdef uint32_t start_offset = 0 | |
488 cdef uint32_t k, op | |
489 | |
490 cigar_p = pysam_bam_get_cigar(src); | |
491 for k from 0 <= k < pysam_get_n_cigar(src): | |
492 op = cigar_p[k] & BAM_CIGAR_MASK | |
493 if op == BAM_CHARD_CLIP: | |
494 if start_offset != 0 and start_offset != src.core.l_qseq: | |
495 raise ValueError('Invalid clipping in CIGAR string') | |
496 elif op == BAM_CSOFT_CLIP: | |
497 start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT | |
498 else: | |
499 break | |
500 | |
501 return start_offset | |
502 | |
503 | |
504 cdef inline int32_t getQueryEnd(bam1_t *src) except -1: | |
505 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) | |
506 cdef uint32_t end_offset = src.core.l_qseq | |
507 cdef uint32_t k, op | |
508 | |
509 # if there is no sequence, compute length from cigar string | |
510 if end_offset == 0: | |
511 for k from 0 <= k < pysam_get_n_cigar(src): | |
512 op = cigar_p[k] & BAM_CIGAR_MASK | |
513 if op == BAM_CMATCH or \ | |
514 op == BAM_CINS or \ | |
515 op == BAM_CEQUAL or \ | |
516 op == BAM_CDIFF or \ | |
517 (op == BAM_CSOFT_CLIP and end_offset == 0): | |
518 end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT | |
519 else: | |
520 # walk backwards in cigar string | |
521 for k from pysam_get_n_cigar(src) > k >= 1: | |
522 op = cigar_p[k] & BAM_CIGAR_MASK | |
523 if op == BAM_CHARD_CLIP: | |
524 if end_offset != src.core.l_qseq: | |
525 raise ValueError('Invalid clipping in CIGAR string') | |
526 elif op == BAM_CSOFT_CLIP: | |
527 end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT | |
528 else: | |
529 break | |
530 | |
531 return end_offset | |
532 | |
533 | |
534 cdef inline bytes getSequenceInRange(bam1_t *src, | |
535 uint32_t start, | |
536 uint32_t end): | |
537 """return python string of the sequence in a bam1_t object. | |
538 """ | |
539 | |
540 cdef uint8_t * p | |
541 cdef uint32_t k | |
542 cdef char * s | |
543 | |
544 if not src.core.l_qseq: | |
545 return None | |
546 | |
547 seq = PyBytes_FromStringAndSize(NULL, end - start) | |
548 s = <char*>seq | |
549 p = pysam_bam_get_seq(src) | |
550 | |
551 for k from start <= k < end: | |
552 # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c) | |
553 # note: do not use string literal as it will be a python string | |
554 s[k-start] = seq_nt16_str[p[k//2] >> 4 * (1 - k%2) & 0xf] | |
555 | |
556 return charptr_to_bytes(seq) | |
557 | |
558 | |
559 cdef inline object getQualitiesInRange(bam1_t *src, | |
560 uint32_t start, | |
561 uint32_t end): | |
562 """return python array of quality values from a bam1_t object""" | |
563 | |
564 cdef uint8_t * p | |
565 cdef uint32_t k | |
566 | |
567 p = pysam_bam_get_qual(src) | |
568 if p[0] == 0xff: | |
569 return None | |
570 | |
571 # 'B': unsigned char | |
572 cdef c_array.array result = array.array('B', [0]) | |
573 c_array.resize(result, end - start) | |
574 | |
575 # copy data | |
576 memcpy(result.data.as_voidptr, <void*>&p[start], end - start) | |
577 | |
578 return result | |
579 | |
580 | |
581 ##################################################################### | |
582 ## factory methods for instantiating extension classes | |
583 cdef class AlignedSegment | |
584 cdef AlignedSegment makeAlignedSegment(bam1_t *src, | |
585 AlignmentHeader header): | |
586 '''return an AlignedSegment object constructed from `src`''' | |
587 # note that the following does not call __init__ | |
588 cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment) | |
589 dest._delegate = bam_dup1(src) | |
590 dest.header = header | |
591 return dest | |
592 | |
593 | |
594 cdef class PileupColumn | |
595 cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp, | |
596 int tid, | |
597 int pos, | |
598 int n_pu, | |
599 uint32_t min_base_quality, | |
600 char * reference_sequence, | |
601 AlignmentHeader header): | |
602 '''return a PileupColumn object constructed from pileup in `plp` and | |
603 setting additional attributes. | |
604 | |
605 ''' | |
606 # note that the following does not call __init__ | |
607 cdef PileupColumn dest = PileupColumn.__new__(PileupColumn) | |
608 dest.header = header | |
609 dest.plp = plp | |
610 dest.tid = tid | |
611 dest.pos = pos | |
612 dest.n_pu = n_pu | |
613 dest.min_base_quality = min_base_quality | |
614 dest.reference_sequence = reference_sequence | |
615 dest.buf.l = dest.buf.m = 0 | |
616 dest.buf.s = NULL | |
617 | |
618 return dest | |
619 | |
620 | |
621 cdef class PileupRead | |
622 cdef PileupRead makePileupRead(const bam_pileup1_t *src, | |
623 AlignmentHeader header): | |
624 '''return a PileupRead object construted from a bam_pileup1_t * object.''' | |
625 # note that the following does not call __init__ | |
626 cdef PileupRead dest = PileupRead.__new__(PileupRead) | |
627 dest._alignment = makeAlignedSegment(src.b, header) | |
628 dest._qpos = src.qpos | |
629 dest._indel = src.indel | |
630 dest._level = src.level | |
631 dest._is_del = src.is_del | |
632 dest._is_head = src.is_head | |
633 dest._is_tail = src.is_tail | |
634 dest._is_refskip = src.is_refskip | |
635 return dest | |
636 | |
637 | |
638 cdef inline uint32_t get_alignment_length(bam1_t *src): | |
639 cdef uint32_t k = 0 | |
640 cdef uint32_t l = 0 | |
641 if src == NULL: | |
642 return 0 | |
643 cdef uint32_t * cigar_p = bam_get_cigar(src) | |
644 if cigar_p == NULL: | |
645 return 0 | |
646 cdef int op | |
647 cdef uint32_t n = pysam_get_n_cigar(src) | |
648 for k from 0 <= k < n: | |
649 op = cigar_p[k] & BAM_CIGAR_MASK | |
650 if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP: | |
651 continue | |
652 l += cigar_p[k] >> BAM_CIGAR_SHIFT | |
653 return l | |
654 | |
655 | |
656 cdef inline uint32_t get_md_reference_length(char * md_tag): | |
657 cdef int l = 0 | |
658 cdef int md_idx = 0 | |
659 cdef int nmatches = 0 | |
660 | |
661 while md_tag[md_idx] != 0: | |
662 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57: | |
663 nmatches *= 10 | |
664 nmatches += md_tag[md_idx] - 48 | |
665 md_idx += 1 | |
666 continue | |
667 else: | |
668 l += nmatches | |
669 nmatches = 0 | |
670 if md_tag[md_idx] == b'^': | |
671 md_idx += 1 | |
672 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90: | |
673 md_idx += 1 | |
674 l += 1 | |
675 else: | |
676 md_idx += 1 | |
677 l += 1 | |
678 | |
679 l += nmatches | |
680 return l | |
681 | |
682 # TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ... | |
683 cdef inline bytes build_alignment_sequence(bam1_t * src): | |
684 """return expanded sequence from MD tag. | |
685 | |
686 The sequence includes substitutions and both insertions in the | |
687 reference as well as deletions to the reference sequence. Combine | |
688 with the cigar string to reconstitute the query or the reference | |
689 sequence. | |
690 | |
691 Positions corresponding to `N` (skipped region from the reference) | |
692 or `P` (padding (silent deletion from padded reference)) in the CIGAR | |
693 string will not appear in the returned sequence. The MD should | |
694 correspondingly not contain these. Thus proper tags are:: | |
695 | |
696 Deletion from the reference: cigar=5M1D5M MD=5^C5 | |
697 Skipped region from reference: cigar=5M1N5M MD=10 | |
698 Padded region in the reference: cigar=5M1P5M MD=10 | |
699 | |
700 Returns | |
701 ------- | |
702 | |
703 None, if no MD tag is present. | |
704 | |
705 """ | |
706 if src == NULL: | |
707 return None | |
708 | |
709 cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD") | |
710 if md_tag_ptr == NULL: | |
711 return None | |
712 | |
713 cdef uint32_t start = getQueryStart(src) | |
714 cdef uint32_t end = getQueryEnd(src) | |
715 # get read sequence, taking into account soft-clipping | |
716 r = getSequenceInRange(src, start, end) | |
717 cdef char * read_sequence = r | |
718 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) | |
719 if cigar_p == NULL: | |
720 return None | |
721 | |
722 cdef uint32_t r_idx = 0 | |
723 cdef int op | |
724 cdef uint32_t k, i, l, x | |
725 cdef int nmatches = 0 | |
726 cdef int s_idx = 0 | |
727 | |
728 cdef uint32_t max_len = get_alignment_length(src) | |
729 if max_len == 0: | |
730 raise ValueError("could not determine alignment length") | |
731 | |
732 cdef char * s = <char*>calloc(max_len + 1, sizeof(char)) | |
733 if s == NULL: | |
734 raise ValueError( | |
735 "could not allocate sequence of length %i" % max_len) | |
736 | |
737 for k from 0 <= k < pysam_get_n_cigar(src): | |
738 op = cigar_p[k] & BAM_CIGAR_MASK | |
739 l = cigar_p[k] >> BAM_CIGAR_SHIFT | |
740 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: | |
741 for i from 0 <= i < l: | |
742 s[s_idx] = read_sequence[r_idx] | |
743 r_idx += 1 | |
744 s_idx += 1 | |
745 elif op == BAM_CDEL: | |
746 for i from 0 <= i < l: | |
747 s[s_idx] = b'-' | |
748 s_idx += 1 | |
749 elif op == BAM_CREF_SKIP: | |
750 pass | |
751 elif op == BAM_CINS or op == BAM_CPAD: | |
752 for i from 0 <= i < l: | |
753 # encode insertions into reference as lowercase | |
754 s[s_idx] = read_sequence[r_idx] + 32 | |
755 r_idx += 1 | |
756 s_idx += 1 | |
757 elif op == BAM_CSOFT_CLIP: | |
758 pass | |
759 elif op == BAM_CHARD_CLIP: | |
760 pass # advances neither | |
761 | |
762 cdef char md_buffer[2] | |
763 cdef char *md_tag | |
764 cdef uint8_t md_typecode = md_tag_ptr[0] | |
765 if md_typecode == b'Z': | |
766 md_tag = bam_aux2Z(md_tag_ptr) | |
767 elif md_typecode == b'A': | |
768 # Work around HTSeq bug that writes 1-character strings as MD:A:v | |
769 md_buffer[0] = bam_aux2A(md_tag_ptr) | |
770 md_buffer[1] = b'\0' | |
771 md_tag = md_buffer | |
772 else: | |
773 raise TypeError('Tagged field MD:{}:<value> does not have expected type MD:Z'.format(chr(md_typecode))) | |
774 | |
775 cdef int md_idx = 0 | |
776 cdef char c | |
777 s_idx = 0 | |
778 | |
779 # Check if MD tag is valid by matching CIGAR length to MD tag defined length | |
780 # Insertions would be in addition to what is described by MD, so we calculate | |
781 # the number of insertions separately. | |
782 cdef int insertions = 0 | |
783 | |
784 while s[s_idx] != 0: | |
785 if s[s_idx] >= b'a': | |
786 insertions += 1 | |
787 s_idx += 1 | |
788 s_idx = 0 | |
789 | |
790 cdef uint32_t md_len = get_md_reference_length(md_tag) | |
791 if md_len + insertions > max_len: | |
792 free(s) | |
793 raise AssertionError( | |
794 "Invalid MD tag: MD length {} mismatch with CIGAR length {} and {} insertions".format( | |
795 md_len, max_len, insertions)) | |
796 | |
797 while md_tag[md_idx] != 0: | |
798 # c is numerical | |
799 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57: | |
800 nmatches *= 10 | |
801 nmatches += md_tag[md_idx] - 48 | |
802 md_idx += 1 | |
803 continue | |
804 else: | |
805 # save matches up to this point, skipping insertions | |
806 for x from 0 <= x < nmatches: | |
807 while s[s_idx] >= b'a': | |
808 s_idx += 1 | |
809 s_idx += 1 | |
810 while s[s_idx] >= b'a': | |
811 s_idx += 1 | |
812 | |
813 r_idx += nmatches | |
814 nmatches = 0 | |
815 if md_tag[md_idx] == b'^': | |
816 md_idx += 1 | |
817 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90: | |
818 # assert s[s_idx] == '-' | |
819 s[s_idx] = md_tag[md_idx] | |
820 s_idx += 1 | |
821 md_idx += 1 | |
822 else: | |
823 # save mismatch | |
824 # enforce lower case | |
825 c = md_tag[md_idx] | |
826 if c <= 90: | |
827 c += 32 | |
828 s[s_idx] = c | |
829 s_idx += 1 | |
830 r_idx += 1 | |
831 md_idx += 1 | |
832 | |
833 # save matches up to this point, skipping insertions | |
834 for x from 0 <= x < nmatches: | |
835 while s[s_idx] >= b'a': | |
836 s_idx += 1 | |
837 s_idx += 1 | |
838 while s[s_idx] >= b'a': | |
839 s_idx += 1 | |
840 | |
841 seq = PyBytes_FromStringAndSize(s, s_idx) | |
842 free(s) | |
843 | |
844 return seq | |
845 | |
846 | |
847 cdef inline bytes build_reference_sequence(bam1_t * src): | |
848 """return the reference sequence in the region that is covered by the | |
849 alignment of the read to the reference. | |
850 | |
851 This method requires the MD tag to be set. | |
852 | |
853 """ | |
854 cdef uint32_t k, i, l | |
855 cdef int op | |
856 cdef int s_idx = 0 | |
857 ref_seq = build_alignment_sequence(src) | |
858 if ref_seq is None: | |
859 raise ValueError("MD tag not present") | |
860 | |
861 cdef char * s = <char*>calloc(len(ref_seq) + 1, sizeof(char)) | |
862 if s == NULL: | |
863 raise ValueError( | |
864 "could not allocate sequence of length %i" % len(ref_seq)) | |
865 | |
866 cdef char * cref_seq = ref_seq | |
867 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) | |
868 cdef uint32_t r_idx = 0 | |
869 for k from 0 <= k < pysam_get_n_cigar(src): | |
870 op = cigar_p[k] & BAM_CIGAR_MASK | |
871 l = cigar_p[k] >> BAM_CIGAR_SHIFT | |
872 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: | |
873 for i from 0 <= i < l: | |
874 s[s_idx] = cref_seq[r_idx] | |
875 r_idx += 1 | |
876 s_idx += 1 | |
877 elif op == BAM_CDEL: | |
878 for i from 0 <= i < l: | |
879 s[s_idx] = cref_seq[r_idx] | |
880 r_idx += 1 | |
881 s_idx += 1 | |
882 elif op == BAM_CREF_SKIP: | |
883 pass | |
884 elif op == BAM_CINS or op == BAM_CPAD: | |
885 r_idx += l | |
886 elif op == BAM_CSOFT_CLIP: | |
887 pass | |
888 elif op == BAM_CHARD_CLIP: | |
889 pass # advances neither | |
890 | |
891 seq = PyBytes_FromStringAndSize(s, s_idx) | |
892 free(s) | |
893 | |
894 return seq | |
895 | |
896 | |
897 cdef inline str safe_reference_name(AlignmentHeader header, int tid): | |
898 if tid == -1: return "*" | |
899 elif header is not None: return header.get_reference_name(tid) | |
900 else: return f"#{tid}" | |
901 | |
902 | |
903 # Tuple-building helper functions used by AlignedSegment.get_aligned_pairs() | |
904 | |
905 cdef _alignedpairs_positions(qpos, pos, ref_seq, uint32_t r_idx, int op): | |
906 return (qpos, pos) | |
907 | |
908 | |
909 cdef _alignedpairs_with_seq(qpos, pos, ref_seq, uint32_t r_idx, int op): | |
910 ref_base = ref_seq[r_idx] if ref_seq is not None else None | |
911 return (qpos, pos, ref_base) | |
912 | |
913 | |
914 cdef _alignedpairs_with_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op): | |
915 return (qpos, pos, CIGAR_OPS(op)) | |
916 | |
917 | |
918 cdef _alignedpairs_with_seq_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op): | |
919 ref_base = ref_seq[r_idx] if ref_seq is not None else None | |
920 return (qpos, pos, ref_base, CIGAR_OPS(op)) | |
921 | |
922 | |
923 cdef class AlignedSegment: | |
924 '''Class representing an aligned segment. | |
925 | |
926 This class stores a handle to the samtools C-structure representing | |
927 an aligned read. Member read access is forwarded to the C-structure | |
928 and converted into python objects. This implementation should be fast, | |
929 as only the data needed is converted. | |
930 | |
931 For write access, the C-structure is updated in-place. This is | |
932 not the most efficient way to build BAM entries, as the variable | |
933 length data is concatenated and thus needs to be resized if | |
934 a field is updated. Furthermore, the BAM entry might be | |
935 in an inconsistent state. | |
936 | |
937 One issue to look out for is that the sequence should always | |
938 be set *before* the quality scores. Setting the sequence will | |
939 also erase any quality scores that were set previously. | |
940 | |
941 Parameters | |
942 ---------- | |
943 | |
944 header: | |
945 :class:`~pysam.AlignmentHeader` object to map numerical | |
946 identifiers to chromosome names. If not given, an empty | |
947 header is created. | |
948 ''' | |
949 | |
950 # Now only called when instances are created from Python | |
951 def __init__(self, AlignmentHeader header=None): | |
952 # see bam_init1 | |
953 self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t)) | |
954 if self._delegate == NULL: | |
955 raise MemoryError("could not allocated memory of {} bytes".format(sizeof(bam1_t))) | |
956 # allocate some memory. If size is 0, calloc does not return a | |
957 # pointer that can be passed to free() so allocate 40 bytes | |
958 # for a new read | |
959 self._delegate.m_data = 40 | |
960 self._delegate.data = <uint8_t *>calloc( | |
961 self._delegate.m_data, 1) | |
962 if self._delegate.data == NULL: | |
963 raise MemoryError("could not allocate memory of {} bytes".format(self._delegate.m_data)) | |
964 self._delegate.l_data = 0 | |
965 # set some data to make read approximately legit. | |
966 # Note, SAM writing fails with q_name of length 0 | |
967 self._delegate.core.l_qname = 0 | |
968 self._delegate.core.tid = -1 | |
969 self._delegate.core.pos = -1 | |
970 self._delegate.core.mtid = -1 | |
971 self._delegate.core.mpos = -1 | |
972 | |
973 # caching for selected fields | |
974 self.cache_query_qualities = None | |
975 self.cache_query_alignment_qualities = None | |
976 self.cache_query_sequence = None | |
977 self.cache_query_alignment_sequence = None | |
978 | |
979 self.header = header | |
980 | |
981 def __dealloc__(self): | |
982 bam_destroy1(self._delegate) | |
983 | |
984 def __str__(self): | |
985 """return string representation of alignment. | |
986 | |
987 The representation is an approximate :term:`SAM` format, because | |
988 an aligned read might not be associated with a :term:`AlignmentFile`. | |
989 Hence when the read does not have an associated :class:`AlignedHeader`, | |
990 :term:`tid` is shown instead of the reference name. | |
991 Similarly, the tags field is returned in its parsed state. | |
992 | |
993 To get a valid SAM record, use :meth:`to_string`. | |
994 """ | |
995 # sam-parsing is done in sam.c/bam_format1_core which | |
996 # requires a valid header. | |
997 return "\t".join(map(str, (self.query_name, | |
998 self.flag, | |
999 safe_reference_name(self.header, self.reference_id), | |
1000 self.reference_start + 1, | |
1001 self.mapping_quality, | |
1002 self.cigarstring, | |
1003 safe_reference_name(self.header, self.next_reference_id), | |
1004 self.next_reference_start + 1, | |
1005 self.template_length, | |
1006 self.query_sequence, | |
1007 self.query_qualities, | |
1008 self.tags))) | |
1009 | |
1010 def __repr__(self): | |
1011 ref = self.reference_name if self.header is not None else self.reference_id | |
1012 return f'<{type(self).__name__}({self.query_name!r}, flags={self.flag}={self.flag:#x}, ref={ref!r}, zpos={self.reference_start}, mapq={self.mapping_quality}, cigar={self.cigarstring!r}, ...)>' | |
1013 | |
1014 def __copy__(self): | |
1015 return makeAlignedSegment(self._delegate, self.header) | |
1016 | |
1017 def __deepcopy__(self, memo): | |
1018 return makeAlignedSegment(self._delegate, self.header) | |
1019 | |
1020 def compare(self, AlignedSegment other): | |
1021 '''return -1,0,1, if contents in this are binary | |
1022 <,=,> to *other* | |
1023 ''' | |
1024 | |
1025 # avoid segfault when other equals None | |
1026 if other is None: | |
1027 return -1 | |
1028 | |
1029 cdef int retval, x | |
1030 cdef bam1_t *t | |
1031 cdef bam1_t *o | |
1032 | |
1033 t = self._delegate | |
1034 o = other._delegate | |
1035 | |
1036 # uncomment for debugging purposes | |
1037 # cdef unsigned char * oo, * tt | |
1038 # tt = <unsigned char*>(&t.core) | |
1039 # oo = <unsigned char*>(&o.core) | |
1040 # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x] | |
1041 # tt = <unsigned char*>(t.data) | |
1042 # oo = <unsigned char*>(o.data) | |
1043 # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x]) | |
1044 | |
1045 # Fast-path test for object identity | |
1046 if t == o: | |
1047 return 0 | |
1048 | |
1049 cdef uint8_t *a = <uint8_t*>&t.core | |
1050 cdef uint8_t *b = <uint8_t*>&o.core | |
1051 | |
1052 retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t)) | |
1053 if retval: | |
1054 return retval | |
1055 | |
1056 # cmp(t.l_data, o.l_data) | |
1057 retval = (t.l_data > o.l_data) - (t.l_data < o.l_data) | |
1058 if retval: | |
1059 return retval | |
1060 return memcmp(t.data, o.data, t.l_data) | |
1061 | |
1062 def __richcmp__(self, AlignedSegment other, int op): | |
1063 if op == 2: # == operator | |
1064 return self.compare(other) == 0 | |
1065 elif op == 3: # != operator | |
1066 return self.compare(other) != 0 | |
1067 else: | |
1068 return NotImplemented | |
1069 | |
1070 def __hash__(self): | |
1071 cdef bam1_t * src = self._delegate | |
1072 cdef int x | |
1073 | |
1074 # see http://effbot.org/zone/python-hash.htm | |
1075 cdef uint8_t * c = <uint8_t *>&src.core | |
1076 cdef uint32_t hash_value = c[0] | |
1077 for x from 1 <= x < sizeof(bam1_core_t): | |
1078 hash_value = c_mul(hash_value, 1000003) ^ c[x] | |
1079 c = <uint8_t *>src.data | |
1080 for x from 0 <= x < src.l_data: | |
1081 hash_value = c_mul(hash_value, 1000003) ^ c[x] | |
1082 | |
1083 return hash_value | |
1084 | |
1085 cpdef to_string(self): | |
1086 """returns a string representation of the aligned segment. | |
1087 | |
1088 The output format is valid SAM format if a header is associated | |
1089 with the AlignedSegment. | |
1090 """ | |
1091 cdef kstring_t line | |
1092 line.l = line.m = 0 | |
1093 line.s = NULL | |
1094 | |
1095 if self.header: | |
1096 if sam_format1(self.header.ptr, self._delegate, &line) < 0: | |
1097 if line.m: | |
1098 free(line.s) | |
1099 raise ValueError('sam_format failed') | |
1100 else: | |
1101 raise NotImplementedError("todo") | |
1102 | |
1103 ret = force_str(line.s[:line.l]) | |
1104 | |
1105 if line.m: | |
1106 free(line.s) | |
1107 | |
1108 return ret | |
1109 | |
1110 @classmethod | |
1111 def fromstring(cls, sam, AlignmentHeader header): | |
1112 """parses a string representation of the aligned segment. | |
1113 | |
1114 The input format should be valid SAM format. | |
1115 | |
1116 Parameters | |
1117 ---------- | |
1118 sam: | |
1119 :term:`SAM` formatted string | |
1120 | |
1121 """ | |
1122 cdef AlignedSegment dest = cls.__new__(cls) | |
1123 dest._delegate = <bam1_t*>calloc(1, sizeof(bam1_t)) | |
1124 dest.header = header | |
1125 | |
1126 cdef kstring_t line | |
1127 line.l = line.m = len(sam) | |
1128 _sam = force_bytes(sam) | |
1129 line.s = _sam | |
1130 | |
1131 cdef int ret | |
1132 ret = sam_parse1(&line, dest.header.ptr, dest._delegate) | |
1133 if ret < 0: | |
1134 raise ValueError("parsing SAM record string failed (error code {})".format(ret)) | |
1135 | |
1136 return dest | |
1137 | |
1138 cpdef tostring(self, htsfile=None): | |
1139 """deprecated, use :meth:`to_string()` instead. | |
1140 | |
1141 Parameters | |
1142 ---------- | |
1143 | |
1144 htsfile: | |
1145 (deprecated) AlignmentFile object to map numerical | |
1146 identifiers to chromosome names. This parameter is present | |
1147 for backwards compatibility and ignored. | |
1148 """ | |
1149 | |
1150 return self.to_string() | |
1151 | |
1152 def to_dict(self): | |
1153 """returns a json representation of the aligned segment. | |
1154 | |
1155 Field names are abbreviated versions of the class attributes. | |
1156 """ | |
1157 # let htslib do the string conversions, but treat optional field properly as list | |
1158 vals = self.to_string().split("\t") | |
1159 n = len(KEY_NAMES) - 1 | |
1160 return dict(list(zip(KEY_NAMES[:-1], vals[:n])) + [(KEY_NAMES[-1], vals[n:])]) | |
1161 | |
1162 @classmethod | |
1163 def from_dict(cls, sam_dict, AlignmentHeader header): | |
1164 """parses a dictionary representation of the aligned segment. | |
1165 | |
1166 Parameters | |
1167 ---------- | |
1168 sam_dict: | |
1169 dictionary of alignment values, keys corresponding to output from | |
1170 :meth:`todict()`. | |
1171 | |
1172 """ | |
1173 # let htslib do the parsing | |
1174 # the tags field can be missing | |
1175 return cls.fromstring( | |
1176 "\t".join((sam_dict[x] for x in KEY_NAMES[:-1])) + | |
1177 "\t" + | |
1178 "\t".join(sam_dict.get(KEY_NAMES[-1], [])), header) | |
1179 | |
1180 ######################################################## | |
1181 ## Basic attributes in order of appearance in SAM format | |
1182 property query_name: | |
1183 """the query template name (None if not present)""" | |
1184 def __get__(self): | |
1185 | |
1186 cdef bam1_t * src = self._delegate | |
1187 if src.core.l_qname == 0: | |
1188 return None | |
1189 | |
1190 return charptr_to_str(<char *>pysam_bam_get_qname(src)) | |
1191 | |
1192 def __set__(self, qname): | |
1193 | |
1194 if qname is None or len(qname) == 0: | |
1195 return | |
1196 | |
1197 if len(qname) > 254: | |
1198 raise ValueError("query length out of range {} > 254".format( | |
1199 len(qname))) | |
1200 | |
1201 qname = force_bytes(qname) | |
1202 cdef bam1_t * src = self._delegate | |
1203 # the qname is \0 terminated | |
1204 cdef uint8_t l = len(qname) + 1 | |
1205 | |
1206 cdef char * p = pysam_bam_get_qname(src) | |
1207 cdef uint8_t l_extranul = 0 | |
1208 | |
1209 if l % 4 != 0: | |
1210 l_extranul = 4 - l % 4 | |
1211 | |
1212 cdef bam1_t * retval = pysam_bam_update(src, | |
1213 src.core.l_qname, | |
1214 l + l_extranul, | |
1215 <uint8_t*>p) | |
1216 if retval == NULL: | |
1217 raise MemoryError("could not allocate memory") | |
1218 | |
1219 src.core.l_extranul = l_extranul | |
1220 src.core.l_qname = l + l_extranul | |
1221 | |
1222 # re-acquire pointer to location in memory | |
1223 # as it might have moved | |
1224 p = pysam_bam_get_qname(src) | |
1225 | |
1226 strncpy(p, qname, l) | |
1227 # x might be > 255 | |
1228 cdef uint16_t x = 0 | |
1229 | |
1230 for x from l <= x < l + l_extranul: | |
1231 p[x] = b'\0' | |
1232 | |
1233 property flag: | |
1234 """properties flag""" | |
1235 def __get__(self): | |
1236 return self._delegate.core.flag | |
1237 def __set__(self, flag): | |
1238 self._delegate.core.flag = flag | |
1239 | |
1240 property reference_name: | |
1241 """:term:`reference` name""" | |
1242 def __get__(self): | |
1243 if self._delegate.core.tid == -1: | |
1244 return None | |
1245 if self.header: | |
1246 return self.header.get_reference_name(self._delegate.core.tid) | |
1247 else: | |
1248 raise ValueError("reference_name unknown if no header associated with record") | |
1249 def __set__(self, reference): | |
1250 cdef int tid | |
1251 if reference is None or reference == "*": | |
1252 self._delegate.core.tid = -1 | |
1253 elif self.header: | |
1254 tid = self.header.get_tid(reference) | |
1255 if tid < 0: | |
1256 raise ValueError("reference {} does not exist in header".format( | |
1257 reference)) | |
1258 self._delegate.core.tid = tid | |
1259 else: | |
1260 raise ValueError("reference_name can not be set if no header associated with record") | |
1261 | |
1262 property reference_id: | |
1263 """:term:`reference` ID | |
1264 | |
1265 .. note:: | |
1266 | |
1267 This field contains the index of the reference sequence in | |
1268 the sequence dictionary. To obtain the name of the | |
1269 reference sequence, use :meth:`get_reference_name()` | |
1270 | |
1271 """ | |
1272 def __get__(self): | |
1273 return self._delegate.core.tid | |
1274 def __set__(self, tid): | |
1275 if tid != -1 and self.header and not self.header.is_valid_tid(tid): | |
1276 raise ValueError("reference id {} does not exist in header".format( | |
1277 tid)) | |
1278 self._delegate.core.tid = tid | |
1279 | |
1280 property reference_start: | |
1281 """0-based leftmost coordinate""" | |
1282 def __get__(self): | |
1283 return self._delegate.core.pos | |
1284 def __set__(self, pos): | |
1285 ## setting the position requires updating the "bin" attribute | |
1286 cdef bam1_t * src | |
1287 src = self._delegate | |
1288 src.core.pos = pos | |
1289 update_bin(src) | |
1290 | |
1291 property mapping_quality: | |
1292 """mapping quality""" | |
1293 def __get__(self): | |
1294 return pysam_get_qual(self._delegate) | |
1295 def __set__(self, qual): | |
1296 pysam_set_qual(self._delegate, qual) | |
1297 | |
1298 property cigarstring: | |
1299 '''the :term:`cigar` alignment as a string. | |
1300 | |
1301 The cigar string is a string of alternating integers | |
1302 and characters denoting the length and the type of | |
1303 an operation. | |
1304 | |
1305 .. note:: | |
1306 The order length,operation is specified in the | |
1307 SAM format. It is different from the order of | |
1308 the :attr:`cigar` property. | |
1309 | |
1310 Returns None if not present. | |
1311 | |
1312 To unset the cigarstring, assign None or the | |
1313 empty string. | |
1314 ''' | |
1315 def __get__(self): | |
1316 c = self.cigartuples | |
1317 if c is None: | |
1318 return None | |
1319 # reverse order | |
1320 else: | |
1321 return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c]) | |
1322 | |
1323 def __set__(self, cigar): | |
1324 if cigar is None or len(cigar) == 0: | |
1325 self.cigartuples = [] | |
1326 else: | |
1327 parts = CIGAR_REGEX.findall(cigar) | |
1328 # reverse order | |
1329 self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts] | |
1330 | |
1331 # TODO | |
1332 # property cigar: | |
1333 # """the cigar alignment""" | |
1334 | |
1335 property next_reference_id: | |
1336 """the :term:`reference` id of the mate/next read.""" | |
1337 def __get__(self): | |
1338 return self._delegate.core.mtid | |
1339 def __set__(self, mtid): | |
1340 if mtid != -1 and self.header and not self.header.is_valid_tid(mtid): | |
1341 raise ValueError("reference id {} does not exist in header".format( | |
1342 mtid)) | |
1343 self._delegate.core.mtid = mtid | |
1344 | |
1345 property next_reference_name: | |
1346 """:term:`reference` name of the mate/next read (None if no | |
1347 AlignmentFile is associated)""" | |
1348 def __get__(self): | |
1349 if self._delegate.core.mtid == -1: | |
1350 return None | |
1351 if self.header: | |
1352 return self.header.get_reference_name(self._delegate.core.mtid) | |
1353 else: | |
1354 raise ValueError("next_reference_name unknown if no header associated with record") | |
1355 | |
1356 def __set__(self, reference): | |
1357 cdef int mtid | |
1358 if reference is None or reference == "*": | |
1359 self._delegate.core.mtid = -1 | |
1360 elif reference == "=": | |
1361 self._delegate.core.mtid = self._delegate.core.tid | |
1362 elif self.header: | |
1363 mtid = self.header.get_tid(reference) | |
1364 if mtid < 0: | |
1365 raise ValueError("reference {} does not exist in header".format( | |
1366 reference)) | |
1367 self._delegate.core.mtid = mtid | |
1368 else: | |
1369 raise ValueError("next_reference_name can not be set if no header associated with record") | |
1370 | |
1371 property next_reference_start: | |
1372 """the position of the mate/next read.""" | |
1373 def __get__(self): | |
1374 return self._delegate.core.mpos | |
1375 def __set__(self, mpos): | |
1376 self._delegate.core.mpos = mpos | |
1377 | |
1378 property query_length: | |
1379 """the length of the query/read. | |
1380 | |
1381 This value corresponds to the length of the sequence supplied | |
1382 in the BAM/SAM file. The length of a query is 0 if there is no | |
1383 sequence in the BAM/SAM file. In those cases, the read length | |
1384 can be inferred from the CIGAR alignment, see | |
1385 :meth:`pysam.AlignedSegment.infer_query_length`. | |
1386 | |
1387 The length includes soft-clipped bases and is equal to | |
1388 ``len(query_sequence)``. | |
1389 | |
1390 This property is read-only but is updated when a new query | |
1391 sequence is assigned to this AlignedSegment. | |
1392 | |
1393 Returns 0 if not available. | |
1394 | |
1395 """ | |
1396 def __get__(self): | |
1397 return self._delegate.core.l_qseq | |
1398 | |
1399 property template_length: | |
1400 """the observed query template length""" | |
1401 def __get__(self): | |
1402 return self._delegate.core.isize | |
1403 def __set__(self, isize): | |
1404 self._delegate.core.isize = isize | |
1405 | |
1406 property query_sequence: | |
1407 """read sequence bases, including :term:`soft clipped` bases | |
1408 (None if not present). | |
1409 | |
1410 Assigning to this attribute will invalidate any quality scores. | |
1411 Thus, to in-place edit the sequence and quality scores, copies of | |
1412 the quality scores need to be taken. Consider trimming for example:: | |
1413 | |
1414 q = read.query_qualities | |
1415 read.query_sequence = read.query_sequence[5:10] | |
1416 read.query_qualities = q[5:10] | |
1417 | |
1418 The sequence is returned as it is stored in the BAM file. (This will | |
1419 be the reverse complement of the original read sequence if the mapper | |
1420 has aligned the read to the reverse strand.) | |
1421 """ | |
1422 def __get__(self): | |
1423 if self.cache_query_sequence: | |
1424 return self.cache_query_sequence | |
1425 | |
1426 cdef bam1_t * src | |
1427 cdef char * s | |
1428 src = self._delegate | |
1429 | |
1430 if src.core.l_qseq == 0: | |
1431 return None | |
1432 | |
1433 self.cache_query_sequence = force_str(getSequenceInRange( | |
1434 src, 0, src.core.l_qseq)) | |
1435 return self.cache_query_sequence | |
1436 | |
1437 def __set__(self, seq): | |
1438 # samtools manages sequence and quality length memory together | |
1439 # if no quality information is present, the first byte says 0xff. | |
1440 cdef bam1_t * src | |
1441 cdef uint8_t * p | |
1442 cdef char * s | |
1443 cdef int l, k | |
1444 cdef Py_ssize_t nbytes_new, nbytes_old | |
1445 | |
1446 if seq == None: | |
1447 l = 0 | |
1448 else: | |
1449 l = len(seq) | |
1450 seq = force_bytes(seq) | |
1451 | |
1452 src = self._delegate | |
1453 | |
1454 # as the sequence is stored in half-bytes, the total length (sequence | |
1455 # plus quality scores) is (l+1)/2 + l | |
1456 nbytes_new = (l + 1) // 2 + l | |
1457 nbytes_old = (src.core.l_qseq + 1) // 2 + src.core.l_qseq | |
1458 | |
1459 # acquire pointer to location in memory | |
1460 p = pysam_bam_get_seq(src) | |
1461 src.core.l_qseq = l | |
1462 | |
1463 # change length of data field | |
1464 cdef bam1_t * retval = pysam_bam_update(src, | |
1465 nbytes_old, | |
1466 nbytes_new, | |
1467 p) | |
1468 | |
1469 if retval == NULL: | |
1470 raise MemoryError("could not allocate memory") | |
1471 | |
1472 if l > 0: | |
1473 # re-acquire pointer to location in memory | |
1474 # as it might have moved | |
1475 p = pysam_bam_get_seq(src) | |
1476 for k from 0 <= k < nbytes_new: | |
1477 p[k] = 0 | |
1478 # convert to C string | |
1479 s = seq | |
1480 for k from 0 <= k < l: | |
1481 p[k // 2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2) | |
1482 | |
1483 # erase qualities | |
1484 p = pysam_bam_get_qual(src) | |
1485 memset(p, 0xff, l) | |
1486 | |
1487 self.cache_query_sequence = force_str(seq) | |
1488 | |
1489 # clear cached values for quality values | |
1490 self.cache_query_qualities = None | |
1491 self.cache_query_alignment_qualities = None | |
1492 | |
1493 property query_qualities: | |
1494 """read sequence base qualities, including :term:`soft clipped` bases | |
1495 (None if not present). | |
1496 | |
1497 Quality scores are returned as a python array of unsigned | |
1498 chars. Note that this is not the ASCII-encoded value typically | |
1499 seen in FASTQ or SAM formatted files. Thus, no offset of 33 | |
1500 needs to be subtracted. | |
1501 | |
1502 Note that to set quality scores the sequence has to be set | |
1503 beforehand as this will determine the expected length of the | |
1504 quality score array. | |
1505 | |
1506 This method raises a ValueError if the length of the | |
1507 quality scores and the sequence are not the same. | |
1508 | |
1509 """ | |
1510 def __get__(self): | |
1511 | |
1512 if self.cache_query_qualities: | |
1513 return self.cache_query_qualities | |
1514 | |
1515 cdef bam1_t * src | |
1516 cdef char * q | |
1517 | |
1518 src = self._delegate | |
1519 | |
1520 if src.core.l_qseq == 0: | |
1521 return None | |
1522 | |
1523 self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq) | |
1524 return self.cache_query_qualities | |
1525 | |
1526 def __set__(self, qual): | |
1527 | |
1528 # note that memory is already allocated via setting the sequence | |
1529 # hence length match of sequence and quality needs is checked. | |
1530 cdef bam1_t * src | |
1531 cdef uint8_t * p | |
1532 cdef int l | |
1533 | |
1534 src = self._delegate | |
1535 p = pysam_bam_get_qual(src) | |
1536 if qual is None or len(qual) == 0: | |
1537 # if absent and there is a sequence: set to 0xff | |
1538 memset(p, 0xff, src.core.l_qseq) | |
1539 return | |
1540 | |
1541 # check for length match | |
1542 l = len(qual) | |
1543 if src.core.l_qseq != l: | |
1544 raise ValueError( | |
1545 "quality and sequence mismatch: %i != %i" % | |
1546 (l, src.core.l_qseq)) | |
1547 | |
1548 # create a python array object filling it | |
1549 # with the quality scores | |
1550 | |
1551 # NB: should avoid this copying if qual is | |
1552 # already of the correct type. | |
1553 cdef c_array.array result = c_array.array('B', qual) | |
1554 | |
1555 # copy data | |
1556 memcpy(p, result.data.as_voidptr, l) | |
1557 | |
1558 # save in cache | |
1559 self.cache_query_qualities = qual | |
1560 | |
1561 property bin: | |
1562 """properties bin""" | |
1563 def __get__(self): | |
1564 return self._delegate.core.bin | |
1565 def __set__(self, bin): | |
1566 self._delegate.core.bin = bin | |
1567 | |
1568 | |
1569 ########################################################## | |
1570 # Derived simple attributes. These are simple attributes of | |
1571 # AlignedSegment getting and setting values. | |
1572 ########################################################## | |
1573 # 1. Flags | |
1574 ########################################################## | |
1575 property is_paired: | |
1576 """true if read is paired in sequencing""" | |
1577 def __get__(self): | |
1578 return (self.flag & BAM_FPAIRED) != 0 | |
1579 def __set__(self,val): | |
1580 pysam_update_flag(self._delegate, val, BAM_FPAIRED) | |
1581 | |
1582 property is_proper_pair: | |
1583 """true if read is mapped in a proper pair""" | |
1584 def __get__(self): | |
1585 return (self.flag & BAM_FPROPER_PAIR) != 0 | |
1586 def __set__(self,val): | |
1587 pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR) | |
1588 | |
1589 property is_unmapped: | |
1590 """true if read itself is unmapped""" | |
1591 def __get__(self): | |
1592 return (self.flag & BAM_FUNMAP) != 0 | |
1593 def __set__(self, val): | |
1594 pysam_update_flag(self._delegate, val, BAM_FUNMAP) | |
1595 # setting the unmapped flag requires recalculation of | |
1596 # bin as alignment length is now implicitly 1 | |
1597 update_bin(self._delegate) | |
1598 | |
1599 property is_mapped: | |
1600 """true if read itself is mapped | |
1601 (implemented in terms of :attr:`is_unmapped`)""" | |
1602 def __get__(self): | |
1603 return (self.flag & BAM_FUNMAP) == 0 | |
1604 def __set__(self, val): | |
1605 pysam_update_flag(self._delegate, not val, BAM_FUNMAP) | |
1606 update_bin(self._delegate) | |
1607 | |
1608 property mate_is_unmapped: | |
1609 """true if the mate is unmapped""" | |
1610 def __get__(self): | |
1611 return (self.flag & BAM_FMUNMAP) != 0 | |
1612 def __set__(self,val): | |
1613 pysam_update_flag(self._delegate, val, BAM_FMUNMAP) | |
1614 | |
1615 property mate_is_mapped: | |
1616 """true if the mate is mapped | |
1617 (implemented in terms of :attr:`mate_is_unmapped`)""" | |
1618 def __get__(self): | |
1619 return (self.flag & BAM_FMUNMAP) == 0 | |
1620 def __set__(self,val): | |
1621 pysam_update_flag(self._delegate, not val, BAM_FMUNMAP) | |
1622 | |
1623 property is_reverse: | |
1624 """true if read is mapped to reverse strand""" | |
1625 def __get__(self): | |
1626 return (self.flag & BAM_FREVERSE) != 0 | |
1627 def __set__(self,val): | |
1628 pysam_update_flag(self._delegate, val, BAM_FREVERSE) | |
1629 | |
1630 property is_forward: | |
1631 """true if read is mapped to forward strand | |
1632 (implemented in terms of :attr:`is_reverse`)""" | |
1633 def __get__(self): | |
1634 return (self.flag & BAM_FREVERSE) == 0 | |
1635 def __set__(self,val): | |
1636 pysam_update_flag(self._delegate, not val, BAM_FREVERSE) | |
1637 | |
1638 property mate_is_reverse: | |
1639 """true if the mate is mapped to reverse strand""" | |
1640 def __get__(self): | |
1641 return (self.flag & BAM_FMREVERSE) != 0 | |
1642 def __set__(self,val): | |
1643 pysam_update_flag(self._delegate, val, BAM_FMREVERSE) | |
1644 | |
1645 property mate_is_forward: | |
1646 """true if the mate is mapped to forward strand | |
1647 (implemented in terms of :attr:`mate_is_reverse`)""" | |
1648 def __get__(self): | |
1649 return (self.flag & BAM_FMREVERSE) == 0 | |
1650 def __set__(self,val): | |
1651 pysam_update_flag(self._delegate, not val, BAM_FMREVERSE) | |
1652 | |
1653 property is_read1: | |
1654 """true if this is read1""" | |
1655 def __get__(self): | |
1656 return (self.flag & BAM_FREAD1) != 0 | |
1657 def __set__(self,val): | |
1658 pysam_update_flag(self._delegate, val, BAM_FREAD1) | |
1659 property is_read2: | |
1660 """true if this is read2""" | |
1661 def __get__(self): | |
1662 return (self.flag & BAM_FREAD2) != 0 | |
1663 def __set__(self, val): | |
1664 pysam_update_flag(self._delegate, val, BAM_FREAD2) | |
1665 property is_secondary: | |
1666 """true if not primary alignment""" | |
1667 def __get__(self): | |
1668 return (self.flag & BAM_FSECONDARY) != 0 | |
1669 def __set__(self, val): | |
1670 pysam_update_flag(self._delegate, val, BAM_FSECONDARY) | |
1671 property is_qcfail: | |
1672 """true if QC failure""" | |
1673 def __get__(self): | |
1674 return (self.flag & BAM_FQCFAIL) != 0 | |
1675 def __set__(self, val): | |
1676 pysam_update_flag(self._delegate, val, BAM_FQCFAIL) | |
1677 property is_duplicate: | |
1678 """true if optical or PCR duplicate""" | |
1679 def __get__(self): | |
1680 return (self.flag & BAM_FDUP) != 0 | |
1681 def __set__(self, val): | |
1682 pysam_update_flag(self._delegate, val, BAM_FDUP) | |
1683 property is_supplementary: | |
1684 """true if this is a supplementary alignment""" | |
1685 def __get__(self): | |
1686 return (self.flag & BAM_FSUPPLEMENTARY) != 0 | |
1687 def __set__(self, val): | |
1688 pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY) | |
1689 | |
1690 # 2. Coordinates and lengths | |
1691 property reference_end: | |
1692 '''aligned reference position of the read on the reference genome. | |
1693 | |
1694 reference_end points to one past the last aligned residue. | |
1695 Returns None if not available (read is unmapped or no cigar | |
1696 alignment present). | |
1697 | |
1698 ''' | |
1699 def __get__(self): | |
1700 cdef bam1_t * src | |
1701 src = self._delegate | |
1702 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0: | |
1703 return None | |
1704 return bam_endpos(src) | |
1705 | |
1706 property reference_length: | |
1707 '''aligned length of the read on the reference genome. | |
1708 | |
1709 This is equal to `reference_end - reference_start`. | |
1710 Returns None if not available. | |
1711 ''' | |
1712 def __get__(self): | |
1713 cdef bam1_t * src | |
1714 src = self._delegate | |
1715 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0: | |
1716 return None | |
1717 return bam_endpos(src) - \ | |
1718 self._delegate.core.pos | |
1719 | |
1720 property query_alignment_sequence: | |
1721 """aligned portion of the read. | |
1722 | |
1723 This is a substring of :attr:`query_sequence` that excludes flanking | |
1724 bases that were :term:`soft clipped` (None if not present). It | |
1725 is equal to ``query_sequence[query_alignment_start:query_alignment_end]``. | |
1726 | |
1727 SAM/BAM files may include extra flanking bases that are not | |
1728 part of the alignment. These bases may be the result of the | |
1729 Smith-Waterman or other algorithms, which may not require | |
1730 alignments that begin at the first residue or end at the last. | |
1731 In addition, extra sequencing adapters, multiplex identifiers, | |
1732 and low-quality bases that were not considered for alignment | |
1733 may have been retained. | |
1734 | |
1735 """ | |
1736 | |
1737 def __get__(self): | |
1738 if self.cache_query_alignment_sequence: | |
1739 return self.cache_query_alignment_sequence | |
1740 | |
1741 cdef bam1_t * src | |
1742 cdef uint32_t start, end | |
1743 | |
1744 src = self._delegate | |
1745 | |
1746 if src.core.l_qseq == 0: | |
1747 return None | |
1748 | |
1749 start = getQueryStart(src) | |
1750 end = getQueryEnd(src) | |
1751 | |
1752 self.cache_query_alignment_sequence = force_str( | |
1753 getSequenceInRange(src, start, end)) | |
1754 return self.cache_query_alignment_sequence | |
1755 | |
1756 property query_alignment_qualities: | |
1757 """aligned query sequence quality values (None if not present). These | |
1758 are the quality values that correspond to | |
1759 :attr:`query_alignment_sequence`, that is, they exclude qualities of | |
1760 :term:`soft clipped` bases. This is equal to | |
1761 ``query_qualities[query_alignment_start:query_alignment_end]``. | |
1762 | |
1763 Quality scores are returned as a python array of unsigned | |
1764 chars. Note that this is not the ASCII-encoded value typically | |
1765 seen in FASTQ or SAM formatted files. Thus, no offset of 33 | |
1766 needs to be subtracted. | |
1767 | |
1768 This property is read-only. | |
1769 | |
1770 """ | |
1771 def __get__(self): | |
1772 | |
1773 if self.cache_query_alignment_qualities: | |
1774 return self.cache_query_alignment_qualities | |
1775 | |
1776 cdef bam1_t * src | |
1777 cdef uint32_t start, end | |
1778 | |
1779 src = self._delegate | |
1780 | |
1781 if src.core.l_qseq == 0: | |
1782 return None | |
1783 | |
1784 start = getQueryStart(src) | |
1785 end = getQueryEnd(src) | |
1786 self.cache_query_alignment_qualities = \ | |
1787 getQualitiesInRange(src, start, end) | |
1788 return self.cache_query_alignment_qualities | |
1789 | |
1790 property query_alignment_start: | |
1791 """start index of the aligned query portion of the sequence (0-based, | |
1792 inclusive). | |
1793 | |
1794 This the index of the first base in :attr:`query_sequence` | |
1795 that is not soft-clipped. | |
1796 """ | |
1797 def __get__(self): | |
1798 return getQueryStart(self._delegate) | |
1799 | |
1800 property query_alignment_end: | |
1801 """end index of the aligned query portion of the sequence (0-based, | |
1802 exclusive) | |
1803 | |
1804 This the index just past the last base in :attr:`query_sequence` | |
1805 that is not soft-clipped. | |
1806 """ | |
1807 def __get__(self): | |
1808 return getQueryEnd(self._delegate) | |
1809 | |
1810 property modified_bases: | |
1811 """Modified bases annotations from Ml/Mm tags. The output is | |
1812 Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...] | |
1813 with qual being (256*probability), or -1 if unknown. | |
1814 Strand==0 for forward and 1 for reverse strand modification | |
1815 """ | |
1816 def __get__(self): | |
1817 cdef bam1_t * src | |
1818 cdef hts_base_mod_state *m = hts_base_mod_state_alloc() | |
1819 cdef hts_base_mod mods[5] | |
1820 cdef int pos | |
1821 | |
1822 ret = {} | |
1823 src = self._delegate | |
1824 | |
1825 if bam_parse_basemod(src, m) < 0: | |
1826 return None | |
1827 | |
1828 n = bam_next_basemod(src, m, mods, 5, &pos) | |
1829 | |
1830 while n>0: | |
1831 for i in range(n): | |
1832 mod_code = chr(mods[i].modified_base) if mods[i].modified_base>0 else -mods[i].modified_base | |
1833 mod_strand = mods[i].strand | |
1834 if self.is_reverse: | |
1835 mod_strand = 1 - mod_strand | |
1836 key = (chr(mods[i].canonical_base), | |
1837 mod_strand, | |
1838 mod_code ) | |
1839 ret.setdefault(key,[]).append((pos,mods[i].qual)) | |
1840 | |
1841 n = bam_next_basemod(src, m, mods, 5, &pos) | |
1842 | |
1843 if n<0: | |
1844 return None | |
1845 | |
1846 hts_base_mod_state_free(m) | |
1847 return ret | |
1848 | |
1849 property modified_bases_forward: | |
1850 """Modified bases annotations from Ml/Mm tags. The output is | |
1851 Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...] | |
1852 with qual being (256*probability), or -1 if unknown. | |
1853 Strand==0 for forward and 1 for reverse strand modification. | |
1854 The positions are with respect to the original sequence from get_forward_sequence() | |
1855 """ | |
1856 def __get__(self): | |
1857 pmods = self.modified_bases | |
1858 if pmods and self.is_reverse: | |
1859 rmod = {} | |
1860 | |
1861 # Try to find the length of the original sequence | |
1862 rlen = self.infer_read_length() | |
1863 if rlen is None and self.query_sequence is None: | |
1864 return rmod | |
1865 else: | |
1866 rlen = len(self.query_sequence) | |
1867 | |
1868 for k,mods in pmods.items(): | |
1869 nk = k[0],1 - k[1],k[2] | |
1870 for i in range(len(mods)): | |
1871 | |
1872 mods[i] = (rlen - 1 -mods[i][0], mods[i][1]) | |
1873 rmod[nk] = mods | |
1874 return rmod | |
1875 | |
1876 return pmods | |
1877 | |
1878 | |
1879 property query_alignment_length: | |
1880 """length of the aligned query sequence. | |
1881 | |
1882 This is equal to :attr:`query_alignment_end` - | |
1883 :attr:`query_alignment_start` | |
1884 """ | |
1885 def __get__(self): | |
1886 cdef bam1_t * src | |
1887 src = self._delegate | |
1888 return getQueryEnd(src) - getQueryStart(src) | |
1889 | |
1890 ##################################################### | |
1891 # Computed properties | |
1892 | |
1893 def get_reference_positions(self, full_length=False): | |
1894 """a list of reference positions that this read aligns to. | |
1895 | |
1896 By default, this method returns the (0-based) positions on the | |
1897 reference that are within the read's alignment, leaving gaps | |
1898 corresponding to deletions and other reference skips. | |
1899 | |
1900 When *full_length* is True, the returned list is the same length | |
1901 as the read and additionally includes None values corresponding | |
1902 to insertions or soft-clipping, i.e., to bases of the read that | |
1903 are not aligned to a reference position. | |
1904 (See also :meth:`get_aligned_pairs` which additionally returns | |
1905 the corresponding positions along the read.) | |
1906 """ | |
1907 cdef uint32_t k, i, l, pos | |
1908 cdef int op | |
1909 cdef uint32_t * cigar_p | |
1910 cdef bam1_t * src | |
1911 cdef bint _full = full_length | |
1912 | |
1913 src = self._delegate | |
1914 if pysam_get_n_cigar(src) == 0: | |
1915 return [] | |
1916 | |
1917 result = [] | |
1918 pos = src.core.pos | |
1919 cigar_p = pysam_bam_get_cigar(src) | |
1920 | |
1921 for k from 0 <= k < pysam_get_n_cigar(src): | |
1922 op = cigar_p[k] & BAM_CIGAR_MASK | |
1923 l = cigar_p[k] >> BAM_CIGAR_SHIFT | |
1924 | |
1925 if op == BAM_CSOFT_CLIP or op == BAM_CINS: | |
1926 if _full: | |
1927 for i from 0 <= i < l: | |
1928 result.append(None) | |
1929 elif op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: | |
1930 for i from pos <= i < pos + l: | |
1931 result.append(i) | |
1932 pos += l | |
1933 elif op == BAM_CDEL or op == BAM_CREF_SKIP: | |
1934 pos += l | |
1935 | |
1936 return result | |
1937 | |
1938 def infer_query_length(self, always=False): | |
1939 """infer query length from CIGAR alignment. | |
1940 | |
1941 This method deduces the query length from the CIGAR alignment | |
1942 but does not include hard-clipped bases. | |
1943 | |
1944 Returns None if CIGAR alignment is not present. | |
1945 | |
1946 If *always* is set to True, `infer_read_length` is used instead. | |
1947 This is deprecated and only present for backward compatibility. | |
1948 """ | |
1949 if always is True: | |
1950 return self.infer_read_length() | |
1951 cdef int32_t l = calculateQueryLengthWithoutHardClipping(self._delegate) | |
1952 if l > 0: | |
1953 return l | |
1954 else: | |
1955 return None | |
1956 | |
1957 def infer_read_length(self): | |
1958 """infer read length from CIGAR alignment. | |
1959 | |
1960 This method deduces the read length from the CIGAR alignment | |
1961 including hard-clipped bases. | |
1962 | |
1963 Returns None if CIGAR alignment is not present. | |
1964 """ | |
1965 cdef int32_t l = calculateQueryLengthWithHardClipping(self._delegate) | |
1966 if l > 0: | |
1967 return l | |
1968 else: | |
1969 return None | |
1970 | |
1971 def get_reference_sequence(self): | |
1972 """return the reference sequence in the region that is covered by the | |
1973 alignment of the read to the reference. | |
1974 | |
1975 This method requires the MD tag to be set. | |
1976 | |
1977 """ | |
1978 return force_str(build_reference_sequence(self._delegate)) | |
1979 | |
1980 def get_forward_sequence(self): | |
1981 """return the original read sequence. | |
1982 | |
1983 Reads mapped to the reverse strand are stored reverse complemented in | |
1984 the BAM file. This method returns such reads reverse complemented back | |
1985 to their original orientation. | |
1986 | |
1987 Returns None if the record has no query sequence. | |
1988 """ | |
1989 if self.query_sequence is None: | |
1990 return None | |
1991 s = force_str(self.query_sequence) | |
1992 if self.is_reverse: | |
1993 s = s.translate(str.maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1] | |
1994 return s | |
1995 | |
1996 def get_forward_qualities(self): | |
1997 """return the original base qualities of the read sequence, | |
1998 in the same format as the :attr:`query_qualities` property. | |
1999 | |
2000 Reads mapped to the reverse strand have their base qualities stored | |
2001 reversed in the BAM file. This method returns such reads' base qualities | |
2002 reversed back to their original orientation. | |
2003 """ | |
2004 if self.is_reverse: | |
2005 return self.query_qualities[::-1] | |
2006 else: | |
2007 return self.query_qualities | |
2008 | |
2009 def get_aligned_pairs(self, matches_only=False, with_seq=False, with_cigar=False): | |
2010 """a list of aligned read (query) and reference positions. | |
2011 | |
2012 Each item in the returned list is a tuple consisting of | |
2013 the 0-based offset from the start of the read sequence | |
2014 followed by the 0-based reference position. | |
2015 | |
2016 For inserts, deletions, skipping either query or reference | |
2017 position may be None. | |
2018 | |
2019 For padding in the reference, the reference position will | |
2020 always be None. | |
2021 | |
2022 Parameters | |
2023 ---------- | |
2024 | |
2025 matches_only : bool | |
2026 If True, only matched bases are returned --- no None on either | |
2027 side. | |
2028 with_seq : bool | |
2029 If True, return a third element in the tuple containing the | |
2030 reference sequence. For CIGAR 'P' (padding in the reference) | |
2031 operations, the third tuple element will be None. Substitutions | |
2032 are lower-case. This option requires an MD tag to be present. | |
2033 with_cigar : bool | |
2034 If True, return an extra element in the tuple containing the | |
2035 CIGAR operator corresponding to this position tuple. | |
2036 | |
2037 Returns | |
2038 ------- | |
2039 | |
2040 aligned_pairs : list of tuples | |
2041 | |
2042 """ | |
2043 cdef uint32_t k, i, pos, qpos, r_idx, l | |
2044 cdef int op | |
2045 cdef uint32_t * cigar_p | |
2046 cdef bam1_t * src = self._delegate | |
2047 cdef bint _matches_only = bool(matches_only) | |
2048 cdef bint _with_seq = bool(with_seq) | |
2049 cdef bint _with_cigar = bool(with_cigar) | |
2050 cdef object (*make_tuple)(object, object, object, uint32_t, int) | |
2051 | |
2052 # TODO: this method performs no checking and assumes that | |
2053 # read sequence, cigar and MD tag are consistent. | |
2054 | |
2055 if _with_seq: | |
2056 # force_str required for py2/py3 compatibility | |
2057 ref_seq = force_str(build_reference_sequence(src)) | |
2058 if ref_seq is None: | |
2059 raise ValueError("MD tag not present") | |
2060 make_tuple = _alignedpairs_with_seq_cigar if _with_cigar else _alignedpairs_with_seq | |
2061 else: | |
2062 ref_seq = None | |
2063 make_tuple = _alignedpairs_with_cigar if _with_cigar else _alignedpairs_positions | |
2064 | |
2065 r_idx = 0 | |
2066 | |
2067 if pysam_get_n_cigar(src) == 0: | |
2068 return [] | |
2069 | |
2070 result = [] | |
2071 pos = src.core.pos | |
2072 qpos = 0 | |
2073 cigar_p = pysam_bam_get_cigar(src) | |
2074 for k from 0 <= k < pysam_get_n_cigar(src): | |
2075 op = cigar_p[k] & BAM_CIGAR_MASK | |
2076 l = cigar_p[k] >> BAM_CIGAR_SHIFT | |
2077 | |
2078 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: | |
2079 for i from pos <= i < pos + l: | |
2080 result.append(make_tuple(qpos, i, ref_seq, r_idx, op)) | |
2081 r_idx += 1 | |
2082 qpos += 1 | |
2083 pos += l | |
2084 | |
2085 elif op == BAM_CINS or op == BAM_CSOFT_CLIP or op == BAM_CPAD: | |
2086 if not _matches_only: | |
2087 for i from pos <= i < pos + l: | |
2088 result.append(make_tuple(qpos, None, None, 0, op)) | |
2089 qpos += 1 | |
2090 else: | |
2091 qpos += l | |
2092 | |
2093 elif op == BAM_CDEL: | |
2094 if not _matches_only: | |
2095 for i from pos <= i < pos + l: | |
2096 result.append(make_tuple(None, i, ref_seq, r_idx, op)) | |
2097 r_idx += 1 | |
2098 else: | |
2099 r_idx += l | |
2100 pos += l | |
2101 | |
2102 elif op == BAM_CHARD_CLIP: | |
2103 pass # advances neither | |
2104 | |
2105 elif op == BAM_CREF_SKIP: | |
2106 if not _matches_only: | |
2107 for i from pos <= i < pos + l: | |
2108 result.append(make_tuple(None, i, None, 0, op)) | |
2109 | |
2110 pos += l | |
2111 | |
2112 return result | |
2113 | |
2114 def get_blocks(self): | |
2115 """ a list of start and end positions of | |
2116 aligned gapless blocks. | |
2117 | |
2118 The start and end positions are in genomic | |
2119 coordinates. | |
2120 | |
2121 Blocks are not normalized, i.e. two blocks | |
2122 might be directly adjacent. This happens if | |
2123 the two blocks are separated by an insertion | |
2124 in the read. | |
2125 """ | |
2126 | |
2127 cdef uint32_t k, pos, l | |
2128 cdef int op | |
2129 cdef uint32_t * cigar_p | |
2130 cdef bam1_t * src | |
2131 | |
2132 src = self._delegate | |
2133 if pysam_get_n_cigar(src) == 0: | |
2134 return [] | |
2135 | |
2136 result = [] | |
2137 pos = src.core.pos | |
2138 cigar_p = pysam_bam_get_cigar(src) | |
2139 l = 0 | |
2140 | |
2141 for k from 0 <= k < pysam_get_n_cigar(src): | |
2142 op = cigar_p[k] & BAM_CIGAR_MASK | |
2143 l = cigar_p[k] >> BAM_CIGAR_SHIFT | |
2144 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: | |
2145 result.append((pos, pos + l)) | |
2146 pos += l | |
2147 elif op == BAM_CDEL or op == BAM_CREF_SKIP: | |
2148 pos += l | |
2149 | |
2150 return result | |
2151 | |
2152 def get_overlap(self, uint32_t start, uint32_t end): | |
2153 """return number of aligned bases of read overlapping the interval | |
2154 *start* and *end* on the reference sequence. | |
2155 | |
2156 Return None if cigar alignment is not available. | |
2157 """ | |
2158 cdef uint32_t k, i, pos, overlap | |
2159 cdef int op, o | |
2160 cdef uint32_t * cigar_p | |
2161 cdef bam1_t * src | |
2162 | |
2163 overlap = 0 | |
2164 | |
2165 src = self._delegate | |
2166 if pysam_get_n_cigar(src) == 0: | |
2167 return None | |
2168 pos = src.core.pos | |
2169 o = 0 | |
2170 | |
2171 cigar_p = pysam_bam_get_cigar(src) | |
2172 for k from 0 <= k < pysam_get_n_cigar(src): | |
2173 op = cigar_p[k] & BAM_CIGAR_MASK | |
2174 l = cigar_p[k] >> BAM_CIGAR_SHIFT | |
2175 | |
2176 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: | |
2177 o = min( pos + l, end) - max( pos, start ) | |
2178 if o > 0: overlap += o | |
2179 | |
2180 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP or op == BAM_CEQUAL or op == BAM_CDIFF: | |
2181 pos += l | |
2182 | |
2183 return overlap | |
2184 | |
2185 def get_cigar_stats(self): | |
2186 """summary of operations in cigar string. | |
2187 | |
2188 The output order in the array is "MIDNSHP=X" followed by a | |
2189 field for the NM tag. If the NM tag is not present, this | |
2190 field will always be 0. (Accessing this field via index -1 | |
2191 avoids changes if more CIGAR operators are added in future.) | |
2192 | |
2193 +-----+----------------+--------+ | |
2194 |M |pysam.CMATCH |0 | | |
2195 +-----+----------------+--------+ | |
2196 |I |pysam.CINS |1 | | |
2197 +-----+----------------+--------+ | |
2198 |D |pysam.CDEL |2 | | |
2199 +-----+----------------+--------+ | |
2200 |N |pysam.CREF_SKIP |3 | | |
2201 +-----+----------------+--------+ | |
2202 |S |pysam.CSOFT_CLIP|4 | | |
2203 +-----+----------------+--------+ | |
2204 |H |pysam.CHARD_CLIP|5 | | |
2205 +-----+----------------+--------+ | |
2206 |P |pysam.CPAD |6 | | |
2207 +-----+----------------+--------+ | |
2208 |= |pysam.CEQUAL |7 | | |
2209 +-----+----------------+--------+ | |
2210 |X |pysam.CDIFF |8 | | |
2211 +-----+----------------+--------+ | |
2212 |B |pysam.CBACK |9 | | |
2213 +-----+----------------+--------+ | |
2214 |NM |NM tag |10 or -1| | |
2215 +-----+----------------+--------+ | |
2216 | |
2217 If no cigar string is present, empty arrays will be returned. | |
2218 | |
2219 Returns: | |
2220 arrays : | |
2221 two arrays. The first contains the nucleotide counts within | |
2222 each cigar operation, the second contains the number of blocks | |
2223 for each cigar operation. | |
2224 | |
2225 """ | |
2226 | |
2227 cdef int nfields = NCIGAR_CODES + 1 | |
2228 | |
2229 cdef c_array.array base_counts = array.array( | |
2230 "I", | |
2231 [0] * nfields) | |
2232 cdef uint32_t [:] base_view = base_counts | |
2233 cdef c_array.array block_counts = array.array( | |
2234 "I", | |
2235 [0] * nfields) | |
2236 cdef uint32_t [:] block_view = block_counts | |
2237 | |
2238 cdef bam1_t * src = self._delegate | |
2239 cdef int op | |
2240 cdef uint32_t l | |
2241 cdef int32_t k | |
2242 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) | |
2243 | |
2244 if cigar_p == NULL: | |
2245 return None | |
2246 | |
2247 for k from 0 <= k < pysam_get_n_cigar(src): | |
2248 op = cigar_p[k] & BAM_CIGAR_MASK | |
2249 l = cigar_p[k] >> BAM_CIGAR_SHIFT | |
2250 base_view[op] += l | |
2251 block_view[op] += 1 | |
2252 | |
2253 cdef uint8_t * v = bam_aux_get(src, 'NM') | |
2254 if v != NULL: | |
2255 base_view[nfields - 1] = <int32_t>bam_aux2i(v) | |
2256 | |
2257 return base_counts, block_counts | |
2258 | |
2259 ##################################################### | |
2260 ## Unsorted as yet | |
2261 # TODO: capture in CIGAR object | |
2262 property cigartuples: | |
2263 """the :term:`cigar` alignment. The alignment | |
2264 is returned as a list of tuples of (operation, length). | |
2265 | |
2266 If the alignment is not present, None is returned. | |
2267 | |
2268 The operations are: | |
2269 | |
2270 +-----+----------------+-----+ | |
2271 |M |pysam.CMATCH |0 | | |
2272 +-----+----------------+-----+ | |
2273 |I |pysam.CINS |1 | | |
2274 +-----+----------------+-----+ | |
2275 |D |pysam.CDEL |2 | | |
2276 +-----+----------------+-----+ | |
2277 |N |pysam.CREF_SKIP |3 | | |
2278 +-----+----------------+-----+ | |
2279 |S |pysam.CSOFT_CLIP|4 | | |
2280 +-----+----------------+-----+ | |
2281 |H |pysam.CHARD_CLIP|5 | | |
2282 +-----+----------------+-----+ | |
2283 |P |pysam.CPAD |6 | | |
2284 +-----+----------------+-----+ | |
2285 |= |pysam.CEQUAL |7 | | |
2286 +-----+----------------+-----+ | |
2287 |X |pysam.CDIFF |8 | | |
2288 +-----+----------------+-----+ | |
2289 |B |pysam.CBACK |9 | | |
2290 +-----+----------------+-----+ | |
2291 | |
2292 .. note:: | |
2293 The output is a list of (operation, length) tuples, such as | |
2294 ``[(0, 30)]``. | |
2295 This is different from the SAM specification and | |
2296 the :attr:`cigarstring` property, which uses a | |
2297 (length, operation) order, for example: ``30M``. | |
2298 | |
2299 To unset the cigar property, assign an empty list | |
2300 or None. | |
2301 """ | |
2302 def __get__(self): | |
2303 cdef uint32_t * cigar_p | |
2304 cdef bam1_t * src | |
2305 cdef uint32_t op, l | |
2306 cdef uint32_t k | |
2307 | |
2308 src = self._delegate | |
2309 if pysam_get_n_cigar(src) == 0: | |
2310 return None | |
2311 | |
2312 cigar = [] | |
2313 | |
2314 cigar_p = pysam_bam_get_cigar(src); | |
2315 for k from 0 <= k < pysam_get_n_cigar(src): | |
2316 op = cigar_p[k] & BAM_CIGAR_MASK | |
2317 l = cigar_p[k] >> BAM_CIGAR_SHIFT | |
2318 cigar.append((op, l)) | |
2319 return cigar | |
2320 | |
2321 def __set__(self, values): | |
2322 cdef uint32_t * p | |
2323 cdef bam1_t * src | |
2324 cdef op, l | |
2325 cdef int k | |
2326 | |
2327 k = 0 | |
2328 | |
2329 src = self._delegate | |
2330 | |
2331 # get location of cigar string | |
2332 p = pysam_bam_get_cigar(src) | |
2333 | |
2334 # empty values for cigar string | |
2335 if values is None: | |
2336 values = [] | |
2337 | |
2338 cdef uint32_t ncigar = len(values) | |
2339 | |
2340 cdef bam1_t * retval = pysam_bam_update(src, | |
2341 pysam_get_n_cigar(src) * 4, | |
2342 ncigar * 4, | |
2343 <uint8_t*>p) | |
2344 | |
2345 if retval == NULL: | |
2346 raise MemoryError("could not allocate memory") | |
2347 | |
2348 # length is number of cigar operations, not bytes | |
2349 pysam_set_n_cigar(src, ncigar) | |
2350 | |
2351 # re-acquire pointer to location in memory | |
2352 # as it might have moved | |
2353 p = pysam_bam_get_cigar(src) | |
2354 | |
2355 # insert cigar operations | |
2356 for op, l in values: | |
2357 p[k] = l << BAM_CIGAR_SHIFT | op | |
2358 k += 1 | |
2359 | |
2360 ## setting the cigar string requires updating the bin | |
2361 update_bin(src) | |
2362 | |
2363 cpdef set_tag(self, | |
2364 tag, | |
2365 value, | |
2366 value_type=None, | |
2367 replace=True): | |
2368 """sets a particular field *tag* to *value* in the optional alignment | |
2369 section. | |
2370 | |
2371 *value_type* describes the type of *value* that is to entered | |
2372 into the alignment record. It can be set explicitly to one of | |
2373 the valid one-letter type codes. If unset, an appropriate type | |
2374 will be chosen automatically based on the python type of | |
2375 *value*. | |
2376 | |
2377 An existing value of the same *tag* will be overwritten unless | |
2378 *replace* is set to False. This is usually not recommended as a | |
2379 tag may only appear once in the optional alignment section. | |
2380 | |
2381 If *value* is `None`, the tag will be deleted. | |
2382 | |
2383 This method accepts valid SAM specification value types, which | |
2384 are:: | |
2385 | |
2386 A: printable char | |
2387 i: signed int | |
2388 f: float | |
2389 Z: printable string | |
2390 H: Byte array in hex format | |
2391 B: Integer or numeric array | |
2392 | |
2393 Additionally, it will accept the integer BAM types ('cCsSI') | |
2394 | |
2395 For htslib compatibility, 'a' is synonymous with 'A' and the | |
2396 method accepts a 'd' type code for a double precision float. | |
2397 | |
2398 When deducing the type code by the python type of *value*, the | |
2399 following mapping is applied:: | |
2400 | |
2401 i: python int | |
2402 f: python float | |
2403 Z: python str or bytes | |
2404 B: python array.array, list or tuple | |
2405 | |
2406 Note that a single character string will be output as 'Z' and | |
2407 not 'A' as the former is the more general type. | |
2408 """ | |
2409 | |
2410 cdef int value_size | |
2411 cdef uint8_t tc | |
2412 cdef uint8_t * value_ptr | |
2413 cdef uint8_t *existing_ptr | |
2414 cdef float float_value | |
2415 cdef double double_value | |
2416 cdef int32_t int32_t_value | |
2417 cdef uint32_t uint32_t_value | |
2418 cdef int16_t int16_t_value | |
2419 cdef uint16_t uint16_t_value | |
2420 cdef int8_t int8_t_value | |
2421 cdef uint8_t uint8_t_value | |
2422 cdef bam1_t * src = self._delegate | |
2423 cdef char * _value_type | |
2424 cdef c_array.array array_value | |
2425 cdef object buffer | |
2426 | |
2427 if len(tag) != 2: | |
2428 raise ValueError('Invalid tag: %s' % tag) | |
2429 | |
2430 tag = force_bytes(tag) | |
2431 if replace: | |
2432 existing_ptr = bam_aux_get(src, tag) | |
2433 if existing_ptr: | |
2434 bam_aux_del(src, existing_ptr) | |
2435 | |
2436 # setting value to None deletes a tag | |
2437 if value is None: | |
2438 return | |
2439 | |
2440 cdef uint8_t typecode = get_tag_typecode(value, value_type) | |
2441 if typecode == 0: | |
2442 raise ValueError("can't guess type or invalid type code specified: {} {}".format( | |
2443 value, value_type)) | |
2444 | |
2445 # sam_format1 for typecasting | |
2446 if typecode == b'Z': | |
2447 value = force_bytes(value) | |
2448 value_ptr = <uint8_t*><char*>value | |
2449 value_size = len(value)+1 | |
2450 elif typecode == b'H': | |
2451 # Note that hex tags are stored the very same | |
2452 # way as Z string.s | |
2453 value = force_bytes(value) | |
2454 value_ptr = <uint8_t*><char*>value | |
2455 value_size = len(value)+1 | |
2456 elif typecode == b'A' or typecode == b'a': | |
2457 value = force_bytes(value) | |
2458 value_ptr = <uint8_t*><char*>value | |
2459 value_size = sizeof(char) | |
2460 typecode = b'A' | |
2461 elif typecode == b'i': | |
2462 int32_t_value = value | |
2463 value_ptr = <uint8_t*>&int32_t_value | |
2464 value_size = sizeof(int32_t) | |
2465 elif typecode == b'I': | |
2466 uint32_t_value = value | |
2467 value_ptr = <uint8_t*>&uint32_t_value | |
2468 value_size = sizeof(uint32_t) | |
2469 elif typecode == b's': | |
2470 int16_t_value = value | |
2471 value_ptr = <uint8_t*>&int16_t_value | |
2472 value_size = sizeof(int16_t) | |
2473 elif typecode == b'S': | |
2474 uint16_t_value = value | |
2475 value_ptr = <uint8_t*>&uint16_t_value | |
2476 value_size = sizeof(uint16_t) | |
2477 elif typecode == b'c': | |
2478 int8_t_value = value | |
2479 value_ptr = <uint8_t*>&int8_t_value | |
2480 value_size = sizeof(int8_t) | |
2481 elif typecode == b'C': | |
2482 uint8_t_value = value | |
2483 value_ptr = <uint8_t*>&uint8_t_value | |
2484 value_size = sizeof(uint8_t) | |
2485 elif typecode == b'd': | |
2486 double_value = value | |
2487 value_ptr = <uint8_t*>&double_value | |
2488 value_size = sizeof(double) | |
2489 elif typecode == b'f': | |
2490 float_value = value | |
2491 value_ptr = <uint8_t*>&float_value | |
2492 value_size = sizeof(float) | |
2493 elif typecode == b'B': | |
2494 # the following goes through python, needs to be cleaned up | |
2495 # pack array using struct | |
2496 fmt, args = pack_tags([(tag, value, value_type)]) | |
2497 | |
2498 # remove tag and type code as set by bam_aux_append | |
2499 # first four chars of format (<2sB) | |
2500 fmt = '<' + fmt[4:] | |
2501 # first two values to pack | |
2502 args = args[2:] | |
2503 value_size = struct.calcsize(fmt) | |
2504 # buffer will be freed when object goes out of scope | |
2505 buffer = ctypes.create_string_buffer(value_size) | |
2506 struct.pack_into(fmt, buffer, 0, *args) | |
2507 # bam_aux_append copies data from value_ptr | |
2508 bam_aux_append(src, | |
2509 tag, | |
2510 typecode, | |
2511 value_size, | |
2512 <uint8_t*>buffer.raw) | |
2513 return | |
2514 else: | |
2515 raise ValueError('unsupported value_type {} in set_option'.format(typecode)) | |
2516 | |
2517 bam_aux_append(src, | |
2518 tag, | |
2519 typecode, | |
2520 value_size, | |
2521 value_ptr) | |
2522 | |
2523 cpdef has_tag(self, tag): | |
2524 """returns true if the optional alignment section | |
2525 contains a given *tag*.""" | |
2526 cdef uint8_t * v | |
2527 cdef int nvalues | |
2528 btag = force_bytes(tag) | |
2529 v = bam_aux_get(self._delegate, btag) | |
2530 return v != NULL | |
2531 | |
2532 cpdef get_tag(self, tag, with_value_type=False): | |
2533 """ | |
2534 retrieves data from the optional alignment section | |
2535 given a two-letter *tag* denoting the field. | |
2536 | |
2537 The returned value is cast into an appropriate python type. | |
2538 | |
2539 This method is the fastest way to access the optional | |
2540 alignment section if only few tags need to be retrieved. | |
2541 | |
2542 Possible value types are "AcCsSiIfZHB" (see BAM format | |
2543 specification) as well as additional value type 'd' as | |
2544 implemented in htslib. | |
2545 | |
2546 Parameters: | |
2547 | |
2548 tag : | |
2549 data tag. | |
2550 | |
2551 with_value_type : Optional[bool] | |
2552 if set to True, the return value is a tuple of (tag value, type | |
2553 code). (default False) | |
2554 | |
2555 Returns: | |
2556 | |
2557 A python object with the value of the `tag`. The type of the | |
2558 object depends on the data type in the data record. | |
2559 | |
2560 Raises: | |
2561 | |
2562 KeyError | |
2563 If `tag` is not present, a KeyError is raised. | |
2564 | |
2565 """ | |
2566 cdef uint8_t * v | |
2567 cdef int nvalues | |
2568 btag = force_bytes(tag) | |
2569 v = bam_aux_get(self._delegate, btag) | |
2570 if v == NULL: | |
2571 raise KeyError("tag '%s' not present" % tag) | |
2572 if chr(v[0]) == "B": | |
2573 auxtype = chr(v[0]) + chr(v[1]) | |
2574 else: | |
2575 auxtype = chr(v[0]) | |
2576 | |
2577 if auxtype in "iIcCsS": | |
2578 value = bam_aux2i(v) | |
2579 elif auxtype == 'f' or auxtype == 'F': | |
2580 value = bam_aux2f(v) | |
2581 elif auxtype == 'd' or auxtype == 'D': | |
2582 value = bam_aux2f(v) | |
2583 elif auxtype == 'A' or auxtype == 'a': | |
2584 # force A to a | |
2585 v[0] = b'A' | |
2586 # there might a more efficient way | |
2587 # to convert a char into a string | |
2588 value = '%c' % <char>bam_aux2A(v) | |
2589 elif auxtype == 'Z' or auxtype == 'H': | |
2590 # Z and H are treated equally as strings in htslib | |
2591 value = charptr_to_str(<char*>bam_aux2Z(v)) | |
2592 elif auxtype[0] == 'B': | |
2593 bytesize, nvalues, values = convert_binary_tag(v + 1) | |
2594 value = values | |
2595 else: | |
2596 raise ValueError("unknown auxiliary type '%s'" % auxtype) | |
2597 | |
2598 if with_value_type: | |
2599 return (value, auxtype) | |
2600 else: | |
2601 return value | |
2602 | |
2603 def get_tags(self, with_value_type=False): | |
2604 """the fields in the optional alignment section. | |
2605 | |
2606 Returns a list of all fields in the optional | |
2607 alignment section. Values are converted to appropriate python | |
2608 values. For example: ``[(NM, 2), (RG, "GJP00TM04")]`` | |
2609 | |
2610 If *with_value_type* is set, the value type as encode in | |
2611 the AlignedSegment record will be returned as well: | |
2612 | |
2613 [(NM, 2, "i"), (RG, "GJP00TM04", "Z")] | |
2614 | |
2615 This method will convert all values in the optional alignment | |
2616 section. When getting only one or few tags, please see | |
2617 :meth:`get_tag` for a quicker way to achieve this. | |
2618 | |
2619 """ | |
2620 | |
2621 cdef char * ctag | |
2622 cdef bam1_t * src | |
2623 cdef uint8_t * s | |
2624 cdef char auxtag[3] | |
2625 cdef char auxtype | |
2626 cdef uint8_t byte_size | |
2627 cdef int32_t nvalues | |
2628 | |
2629 src = self._delegate | |
2630 if src.l_data == 0: | |
2631 return [] | |
2632 s = pysam_bam_get_aux(src) | |
2633 result = [] | |
2634 auxtag[2] = 0 | |
2635 while s < (src.data + src.l_data): | |
2636 # get tag | |
2637 auxtag[0] = s[0] | |
2638 auxtag[1] = s[1] | |
2639 s += 2 | |
2640 auxtype = s[0] | |
2641 if auxtype in (b'c', b'C'): | |
2642 value = <int>bam_aux2i(s) | |
2643 s += 1 | |
2644 elif auxtype in (b's', b'S'): | |
2645 value = <int>bam_aux2i(s) | |
2646 s += 2 | |
2647 elif auxtype in (b'i', b'I'): | |
2648 value = <int32_t>bam_aux2i(s) | |
2649 s += 4 | |
2650 elif auxtype == b'f': | |
2651 value = <float>bam_aux2f(s) | |
2652 s += 4 | |
2653 elif auxtype == b'd': | |
2654 value = <double>bam_aux2f(s) | |
2655 s += 8 | |
2656 elif auxtype in (b'A', b'a'): | |
2657 value = "%c" % <char>bam_aux2A(s) | |
2658 s += 1 | |
2659 elif auxtype in (b'Z', b'H'): | |
2660 value = charptr_to_str(<char*>bam_aux2Z(s)) | |
2661 # +1 for NULL terminated string | |
2662 s += len(value) + 1 | |
2663 elif auxtype == b'B': | |
2664 s += 1 | |
2665 byte_size, nvalues, value = convert_binary_tag(s) | |
2666 # 5 for 1 char and 1 int | |
2667 s += 5 + (nvalues * byte_size) - 1 | |
2668 else: | |
2669 raise KeyError("unknown type '%s'" % auxtype) | |
2670 | |
2671 s += 1 | |
2672 | |
2673 if with_value_type: | |
2674 result.append((charptr_to_str(auxtag), value, chr(auxtype))) | |
2675 else: | |
2676 result.append((charptr_to_str(auxtag), value)) | |
2677 | |
2678 return result | |
2679 | |
2680 def set_tags(self, tags): | |
2681 """sets the fields in the optional alignment section with | |
2682 a list of (tag, value) tuples. | |
2683 | |
2684 The value type of the values is determined from the | |
2685 python type. Optionally, a type may be given explicitly as | |
2686 a third value in the tuple, For example: | |
2687 | |
2688 x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")] | |
2689 | |
2690 This method will not enforce the rule that the same tag may appear | |
2691 only once in the optional alignment section. | |
2692 """ | |
2693 | |
2694 cdef bam1_t * src | |
2695 cdef uint8_t * s | |
2696 cdef char * temp | |
2697 cdef int new_size = 0 | |
2698 cdef int old_size | |
2699 src = self._delegate | |
2700 | |
2701 # convert and pack the data | |
2702 if tags is not None and len(tags) > 0: | |
2703 fmt, args = pack_tags(tags) | |
2704 new_size = struct.calcsize(fmt) | |
2705 buffer = ctypes.create_string_buffer(new_size) | |
2706 struct.pack_into(fmt, | |
2707 buffer, | |
2708 0, | |
2709 *args) | |
2710 | |
2711 | |
2712 # delete the old data and allocate new space. | |
2713 # If total_size == 0, the aux field will be | |
2714 # empty | |
2715 old_size = pysam_bam_get_l_aux(src) | |
2716 cdef bam1_t * retval = pysam_bam_update(src, | |
2717 old_size, | |
2718 new_size, | |
2719 pysam_bam_get_aux(src)) | |
2720 if retval == NULL: | |
2721 raise MemoryError("could not allocate memory") | |
2722 | |
2723 # copy data only if there is any | |
2724 if new_size > 0: | |
2725 | |
2726 # get location of new data | |
2727 s = pysam_bam_get_aux(src) | |
2728 | |
2729 # check if there is direct path from buffer.raw to tmp | |
2730 p = buffer.raw | |
2731 # create handle to make sure buffer stays alive long | |
2732 # enough for memcpy, see issue 129 | |
2733 temp = p | |
2734 memcpy(s, temp, new_size) | |
2735 | |
2736 | |
2737 ######################################################## | |
2738 # Compatibility Accessors | |
2739 # Functions, properties for compatibility with pysam < 0.8 | |
2740 # | |
2741 # Several options | |
2742 # change the factory functions according to API | |
2743 # * requires code changes throughout, incl passing | |
2744 # handles to factory functions | |
2745 # subclass functions and add attributes at runtime | |
2746 # e.g.: AlignedSegments.qname = AlignedSegments.query_name | |
2747 # * will slow down the default interface | |
2748 # explicit declaration of getters/setters | |
2749 ######################################################## | |
2750 property qname: | |
2751 """deprecated, use :attr:`query_name` instead.""" | |
2752 def __get__(self): return self.query_name | |
2753 def __set__(self, v): self.query_name = v | |
2754 property tid: | |
2755 """deprecated, use :attr:`reference_id` instead.""" | |
2756 def __get__(self): return self.reference_id | |
2757 def __set__(self, v): self.reference_id = v | |
2758 property pos: | |
2759 """deprecated, use :attr:`reference_start` instead.""" | |
2760 def __get__(self): return self.reference_start | |
2761 def __set__(self, v): self.reference_start = v | |
2762 property mapq: | |
2763 """deprecated, use :attr:`mapping_quality` instead.""" | |
2764 def __get__(self): return self.mapping_quality | |
2765 def __set__(self, v): self.mapping_quality = v | |
2766 property rnext: | |
2767 """deprecated, use :attr:`next_reference_id` instead.""" | |
2768 def __get__(self): return self.next_reference_id | |
2769 def __set__(self, v): self.next_reference_id = v | |
2770 property pnext: | |
2771 """deprecated, use :attr:`next_reference_start` instead.""" | |
2772 def __get__(self): | |
2773 return self.next_reference_start | |
2774 def __set__(self, v): | |
2775 self.next_reference_start = v | |
2776 property cigar: | |
2777 """deprecated, use :attr:`cigarstring` or :attr:`cigartuples` instead.""" | |
2778 def __get__(self): | |
2779 r = self.cigartuples | |
2780 if r is None: | |
2781 r = [] | |
2782 return r | |
2783 def __set__(self, v): self.cigartuples = v | |
2784 property tlen: | |
2785 """deprecated, use :attr:`template_length` instead.""" | |
2786 def __get__(self): | |
2787 return self.template_length | |
2788 def __set__(self, v): | |
2789 self.template_length = v | |
2790 property seq: | |
2791 """deprecated, use :attr:`query_sequence` instead.""" | |
2792 def __get__(self): | |
2793 return self.query_sequence | |
2794 def __set__(self, v): | |
2795 self.query_sequence = v | |
2796 property qual: | |
2797 """deprecated, use :attr:`query_qualities` instead.""" | |
2798 def __get__(self): | |
2799 return array_to_qualitystring(self.query_qualities) | |
2800 def __set__(self, v): | |
2801 self.query_qualities = qualitystring_to_array(v) | |
2802 property alen: | |
2803 """deprecated, use :attr:`reference_length` instead.""" | |
2804 def __get__(self): | |
2805 return self.reference_length | |
2806 def __set__(self, v): | |
2807 self.reference_length = v | |
2808 property aend: | |
2809 """deprecated, use :attr:`reference_end` instead.""" | |
2810 def __get__(self): | |
2811 return self.reference_end | |
2812 def __set__(self, v): | |
2813 self.reference_end = v | |
2814 property rlen: | |
2815 """deprecated, use :attr:`query_length` instead.""" | |
2816 def __get__(self): | |
2817 return self.query_length | |
2818 def __set__(self, v): | |
2819 self.query_length = v | |
2820 property query: | |
2821 """deprecated, use :attr:`query_alignment_sequence` | |
2822 instead.""" | |
2823 def __get__(self): | |
2824 return self.query_alignment_sequence | |
2825 def __set__(self, v): | |
2826 self.query_alignment_sequence = v | |
2827 property qqual: | |
2828 """deprecated, use :attr:`query_alignment_qualities` | |
2829 instead.""" | |
2830 def __get__(self): | |
2831 return array_to_qualitystring(self.query_alignment_qualities) | |
2832 def __set__(self, v): | |
2833 self.query_alignment_qualities = qualitystring_to_array(v) | |
2834 property qstart: | |
2835 """deprecated, use :attr:`query_alignment_start` instead.""" | |
2836 def __get__(self): | |
2837 return self.query_alignment_start | |
2838 def __set__(self, v): | |
2839 self.query_alignment_start = v | |
2840 property qend: | |
2841 """deprecated, use :attr:`query_alignment_end` instead.""" | |
2842 def __get__(self): | |
2843 return self.query_alignment_end | |
2844 def __set__(self, v): | |
2845 self.query_alignment_end = v | |
2846 property qlen: | |
2847 """deprecated, use :attr:`query_alignment_length` | |
2848 instead.""" | |
2849 def __get__(self): | |
2850 return self.query_alignment_length | |
2851 def __set__(self, v): | |
2852 self.query_alignment_length = v | |
2853 property mrnm: | |
2854 """deprecated, use :attr:`next_reference_id` instead.""" | |
2855 def __get__(self): | |
2856 return self.next_reference_id | |
2857 def __set__(self, v): | |
2858 self.next_reference_id = v | |
2859 property mpos: | |
2860 """deprecated, use :attr:`next_reference_start` | |
2861 instead.""" | |
2862 def __get__(self): | |
2863 return self.next_reference_start | |
2864 def __set__(self, v): | |
2865 self.next_reference_start = v | |
2866 property rname: | |
2867 """deprecated, use :attr:`reference_id` instead.""" | |
2868 def __get__(self): | |
2869 return self.reference_id | |
2870 def __set__(self, v): | |
2871 self.reference_id = v | |
2872 property isize: | |
2873 """deprecated, use :attr:`template_length` instead.""" | |
2874 def __get__(self): | |
2875 return self.template_length | |
2876 def __set__(self, v): | |
2877 self.template_length = v | |
2878 property blocks: | |
2879 """deprecated, use :meth:`get_blocks()` instead.""" | |
2880 def __get__(self): | |
2881 return self.get_blocks() | |
2882 property aligned_pairs: | |
2883 """deprecated, use :meth:`get_aligned_pairs()` instead.""" | |
2884 def __get__(self): | |
2885 return self.get_aligned_pairs() | |
2886 property inferred_length: | |
2887 """deprecated, use :meth:`infer_query_length()` instead.""" | |
2888 def __get__(self): | |
2889 return self.infer_query_length() | |
2890 property positions: | |
2891 """deprecated, use :meth:`get_reference_positions()` instead.""" | |
2892 def __get__(self): | |
2893 return self.get_reference_positions() | |
2894 property tags: | |
2895 """deprecated, use :meth:`get_tags()` instead.""" | |
2896 def __get__(self): | |
2897 return self.get_tags() | |
2898 def __set__(self, tags): | |
2899 self.set_tags(tags) | |
2900 def overlap(self): | |
2901 """deprecated, use :meth:`get_overlap()` instead.""" | |
2902 return self.get_overlap() | |
2903 def opt(self, tag): | |
2904 """deprecated, use :meth:`get_tag()` instead.""" | |
2905 return self.get_tag(tag) | |
2906 def setTag(self, tag, value, value_type=None, replace=True): | |
2907 """deprecated, use :meth:`set_tag()` instead.""" | |
2908 return self.set_tag(tag, value, value_type, replace) | |
2909 | |
2910 | |
2911 cdef class PileupColumn: | |
2912 '''A pileup of reads at a particular reference sequence position | |
2913 (:term:`column`). A pileup column contains all the reads that map | |
2914 to a certain target base. | |
2915 | |
2916 This class is a proxy for results returned by the samtools pileup | |
2917 engine. If the underlying engine iterator advances, the results | |
2918 of this column will change. | |
2919 ''' | |
2920 def __init__(self): | |
2921 raise TypeError("this class cannot be instantiated from Python") | |
2922 | |
2923 def __str__(self): | |
2924 return "\t".join(map(str, | |
2925 (self.reference_id, | |
2926 self.reference_pos, | |
2927 self.nsegments))) +\ | |
2928 "\n" +\ | |
2929 "\n".join(map(str, self.pileups)) | |
2930 | |
2931 def __dealloc__(self): | |
2932 free(self.buf.s) | |
2933 | |
2934 def set_min_base_quality(self, min_base_quality): | |
2935 """set the minimum base quality for this pileup column. | |
2936 """ | |
2937 self.min_base_quality = min_base_quality | |
2938 | |
2939 def __len__(self): | |
2940 """return number of reads aligned to this column. | |
2941 | |
2942 see :meth:`get_num_aligned` | |
2943 """ | |
2944 return self.get_num_aligned() | |
2945 | |
2946 property reference_id: | |
2947 '''the reference sequence number as defined in the header''' | |
2948 def __get__(self): | |
2949 return self.tid | |
2950 | |
2951 property reference_name: | |
2952 """:term:`reference` name (None if no AlignmentFile is associated)""" | |
2953 def __get__(self): | |
2954 if self.header is not None: | |
2955 return self.header.get_reference_name(self.tid) | |
2956 return None | |
2957 | |
2958 property nsegments: | |
2959 '''number of reads mapping to this column. | |
2960 | |
2961 Note that this number ignores the base quality filter.''' | |
2962 def __get__(self): | |
2963 return self.n_pu | |
2964 def __set__(self, n): | |
2965 self.n_pu = n | |
2966 | |
2967 property reference_pos: | |
2968 '''the position in the reference sequence (0-based).''' | |
2969 def __get__(self): | |
2970 return self.pos | |
2971 | |
2972 property pileups: | |
2973 '''list of reads (:class:`pysam.PileupRead`) aligned to this column''' | |
2974 def __get__(self): | |
2975 if self.plp == NULL or self.plp[0] == NULL: | |
2976 raise ValueError("PileupColumn accessed after iterator finished") | |
2977 | |
2978 cdef int x | |
2979 cdef const bam_pileup1_t * p = NULL | |
2980 pileups = [] | |
2981 | |
2982 # warning: there could be problems if self.n and self.buf are | |
2983 # out of sync. | |
2984 for x from 0 <= x < self.n_pu: | |
2985 p = &(self.plp[0][x]) | |
2986 if p == NULL: | |
2987 raise ValueError( | |
2988 "pileup buffer out of sync - most likely use of iterator " | |
2989 "outside loop") | |
2990 if pileup_base_qual_skip(p, self.min_base_quality): | |
2991 continue | |
2992 pileups.append(makePileupRead(p, self.header)) | |
2993 return pileups | |
2994 | |
2995 ######################################################## | |
2996 # Compatibility Accessors | |
2997 # Functions, properties for compatibility with pysam < 0.8 | |
2998 ######################################################## | |
2999 property pos: | |
3000 """deprecated, use :attr:`reference_pos` instead.""" | |
3001 def __get__(self): | |
3002 return self.reference_pos | |
3003 def __set__(self, v): | |
3004 self.reference_pos = v | |
3005 | |
3006 property tid: | |
3007 """deprecated, use :attr:`reference_id` instead.""" | |
3008 def __get__(self): | |
3009 return self.reference_id | |
3010 def __set__(self, v): | |
3011 self.reference_id = v | |
3012 | |
3013 property n: | |
3014 """deprecated, use :attr:`nsegments` instead.""" | |
3015 def __get__(self): | |
3016 return self.nsegments | |
3017 def __set__(self, v): | |
3018 self.nsegments = v | |
3019 | |
3020 def get_num_aligned(self): | |
3021 """return number of aligned bases at pileup column position. | |
3022 | |
3023 This method applies a base quality filter and the number is | |
3024 equal to the size of :meth:`get_query_sequences`, | |
3025 :meth:`get_mapping_qualities`, etc. | |
3026 | |
3027 """ | |
3028 cdef uint32_t x = 0 | |
3029 cdef uint32_t c = 0 | |
3030 cdef uint32_t cnt = 0 | |
3031 cdef const bam_pileup1_t * p = NULL | |
3032 if self.plp == NULL or self.plp[0] == NULL: | |
3033 raise ValueError("PileupColumn accessed after iterator finished") | |
3034 | |
3035 for x from 0 <= x < self.n_pu: | |
3036 p = &(self.plp[0][x]) | |
3037 if p == NULL: | |
3038 raise ValueError( | |
3039 "pileup buffer out of sync - most likely use of iterator " | |
3040 "outside loop") | |
3041 if pileup_base_qual_skip(p, self.min_base_quality): | |
3042 continue | |
3043 cnt += 1 | |
3044 return cnt | |
3045 | |
3046 def get_query_sequences(self, bint mark_matches=False, bint mark_ends=False, bint add_indels=False): | |
3047 """query bases/sequences at pileup column position. | |
3048 | |
3049 Optionally, the bases/sequences can be annotated according to the samtools | |
3050 mpileup format. This is the format description from the samtools mpileup tool:: | |
3051 | |
3052 Information on match, mismatch, indel, strand, mapping | |
3053 quality and start and end of a read are all encoded at the | |
3054 read base column. At this column, a dot stands for a match | |
3055 to the reference base on the forward strand, a comma for a | |
3056 match on the reverse strand, a '>' or '<' for a reference | |
3057 skip, `ACGTN' for a mismatch on the forward strand and | |
3058 `acgtn' for a mismatch on the reverse strand. A pattern | |
3059 `\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion | |
3060 between this reference position and the next reference | |
3061 position. The length of the insertion is given by the | |
3062 integer in the pattern, followed by the inserted | |
3063 sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' | |
3064 represents a deletion from the reference. The deleted bases | |
3065 will be presented as `*' in the following lines. Also at | |
3066 the read base column, a symbol `^' marks the start of a | |
3067 read. The ASCII of the character following `^' minus 33 | |
3068 gives the mapping quality. A symbol `$' marks the end of a | |
3069 read segment | |
3070 | |
3071 To reproduce samtools mpileup format, set all of mark_matches, | |
3072 mark_ends and add_indels to True. | |
3073 | |
3074 Parameters | |
3075 ---------- | |
3076 | |
3077 mark_matches: bool | |
3078 | |
3079 If True, output bases matching the reference as "." or "," | |
3080 for forward and reverse strand, respectively. This mark | |
3081 requires the reference sequence. If no reference is | |
3082 present, this option is ignored. | |
3083 | |
3084 mark_ends : bool | |
3085 | |
3086 If True, add markers "^" and "$" for read start and end, respectively. | |
3087 | |
3088 add_indels : bool | |
3089 | |
3090 If True, add bases for bases inserted into or skipped from the | |
3091 reference. The latter requires a reference sequence file to have | |
3092 been given, e.g. via `pileup(fastafile = ...)`. If no reference | |
3093 sequence is available, skipped bases are represented as 'N's. | |
3094 | |
3095 Returns | |
3096 ------- | |
3097 | |
3098 a list of bases/sequences per read at pileup column position. : list | |
3099 | |
3100 """ | |
3101 cdef uint32_t x = 0 | |
3102 cdef uint32_t j = 0 | |
3103 cdef uint32_t c = 0 | |
3104 cdef uint8_t cc = 0 | |
3105 cdef uint8_t rb = 0 | |
3106 cdef kstring_t * buf = &self.buf | |
3107 cdef const bam_pileup1_t * p = NULL | |
3108 | |
3109 if self.plp == NULL or self.plp[0] == NULL: | |
3110 raise ValueError("PileupColumn accessed after iterator finished") | |
3111 | |
3112 buf.l = 0 | |
3113 | |
3114 # todo: reference sequence to count matches/mismatches | |
3115 # todo: convert assertions to exceptions | |
3116 for x from 0 <= x < self.n_pu: | |
3117 p = &(self.plp[0][x]) | |
3118 if p == NULL: | |
3119 raise ValueError( | |
3120 "pileup buffer out of sync - most likely use of iterator " | |
3121 "outside loop") | |
3122 if pileup_base_qual_skip(p, self.min_base_quality): | |
3123 continue | |
3124 # see samtools pileup_seq | |
3125 if mark_ends and p.is_head: | |
3126 kputc(b'^', buf) | |
3127 | |
3128 if p.b.core.qual > 93: | |
3129 kputc(126, buf) | |
3130 else: | |
3131 kputc(p.b.core.qual + 33, buf) | |
3132 if not p.is_del: | |
3133 if p.qpos < p.b.core.l_qseq: | |
3134 cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)] | |
3135 else: | |
3136 cc = b'N' | |
3137 | |
3138 if mark_matches and self.reference_sequence != NULL: | |
3139 rb = self.reference_sequence[self.reference_pos] | |
3140 if seq_nt16_table[cc] == seq_nt16_table[rb]: | |
3141 cc = b'=' | |
3142 kputc(strand_mark_char(cc, p.b), buf) | |
3143 elif add_indels: | |
3144 if p.is_refskip: | |
3145 if bam_is_rev(p.b): | |
3146 kputc(b'<', buf) | |
3147 else: | |
3148 kputc(b'>', buf) | |
3149 else: | |
3150 kputc(b'*', buf) | |
3151 if add_indels: | |
3152 if p.indel > 0: | |
3153 kputc(b'+', buf) | |
3154 kputw(p.indel, buf) | |
3155 for j from 1 <= j <= p.indel: | |
3156 cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)] | |
3157 kputc(strand_mark_char(cc, p.b), buf) | |
3158 elif p.indel < 0: | |
3159 kputc(b'-', buf) | |
3160 kputw(-p.indel, buf) | |
3161 for j from 1 <= j <= -p.indel: | |
3162 # TODO: out-of-range check here? | |
3163 if self.reference_sequence == NULL: | |
3164 cc = b'N' | |
3165 else: | |
3166 cc = self.reference_sequence[self.reference_pos + j] | |
3167 kputc(strand_mark_char(cc, p.b), buf) | |
3168 if mark_ends and p.is_tail: | |
3169 kputc(b'$', buf) | |
3170 | |
3171 kputc(b':', buf) | |
3172 | |
3173 if buf.l == 0: | |
3174 # could be zero if all qualities are too low | |
3175 return "" | |
3176 else: | |
3177 # quicker to ensemble all and split than to encode all separately. | |
3178 # ignore last ":" | |
3179 return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":") | |
3180 | |
3181 def get_query_qualities(self): | |
3182 """query base quality scores at pileup column position. | |
3183 | |
3184 Returns | |
3185 ------- | |
3186 | |
3187 a list of quality scores : list | |
3188 """ | |
3189 cdef uint32_t x = 0 | |
3190 cdef const bam_pileup1_t * p = NULL | |
3191 cdef uint32_t c = 0 | |
3192 result = [] | |
3193 for x from 0 <= x < self.n_pu: | |
3194 p = &(self.plp[0][x]) | |
3195 if p == NULL: | |
3196 raise ValueError( | |
3197 "pileup buffer out of sync - most likely use of iterator " | |
3198 "outside loop") | |
3199 | |
3200 if p.qpos < p.b.core.l_qseq: | |
3201 c = bam_get_qual(p.b)[p.qpos] | |
3202 else: | |
3203 c = 0 | |
3204 if c < self.min_base_quality: | |
3205 continue | |
3206 result.append(c) | |
3207 return result | |
3208 | |
3209 def get_mapping_qualities(self): | |
3210 """query mapping quality scores at pileup column position. | |
3211 | |
3212 Returns | |
3213 ------- | |
3214 | |
3215 a list of quality scores : list | |
3216 """ | |
3217 if self.plp == NULL or self.plp[0] == NULL: | |
3218 raise ValueError("PileupColumn accessed after iterator finished") | |
3219 | |
3220 cdef uint32_t x = 0 | |
3221 cdef const bam_pileup1_t * p = NULL | |
3222 result = [] | |
3223 for x from 0 <= x < self.n_pu: | |
3224 p = &(self.plp[0][x]) | |
3225 if p == NULL: | |
3226 raise ValueError( | |
3227 "pileup buffer out of sync - most likely use of iterator " | |
3228 "outside loop") | |
3229 | |
3230 if pileup_base_qual_skip(p, self.min_base_quality): | |
3231 continue | |
3232 result.append(p.b.core.qual) | |
3233 return result | |
3234 | |
3235 def get_query_positions(self): | |
3236 """positions in read at pileup column position. | |
3237 | |
3238 Returns | |
3239 ------- | |
3240 | |
3241 a list of read positions : list | |
3242 """ | |
3243 if self.plp == NULL or self.plp[0] == NULL: | |
3244 raise ValueError("PileupColumn accessed after iterator finished") | |
3245 | |
3246 cdef uint32_t x = 0 | |
3247 cdef const bam_pileup1_t * p = NULL | |
3248 result = [] | |
3249 for x from 0 <= x < self.n_pu: | |
3250 p = &(self.plp[0][x]) | |
3251 if p == NULL: | |
3252 raise ValueError( | |
3253 "pileup buffer out of sync - most likely use of iterator " | |
3254 "outside loop") | |
3255 | |
3256 if pileup_base_qual_skip(p, self.min_base_quality): | |
3257 continue | |
3258 result.append(p.qpos) | |
3259 return result | |
3260 | |
3261 def get_query_names(self): | |
3262 """query/read names aligned at pileup column position. | |
3263 | |
3264 Returns | |
3265 ------- | |
3266 | |
3267 a list of query names at pileup column position. : list | |
3268 """ | |
3269 if self.plp == NULL or self.plp[0] == NULL: | |
3270 raise ValueError("PileupColumn accessed after iterator finished") | |
3271 | |
3272 cdef uint32_t x = 0 | |
3273 cdef const bam_pileup1_t * p = NULL | |
3274 result = [] | |
3275 for x from 0 <= x < self.n_pu: | |
3276 p = &(self.plp[0][x]) | |
3277 if p == NULL: | |
3278 raise ValueError( | |
3279 "pileup buffer out of sync - most likely use of iterator " | |
3280 "outside loop") | |
3281 | |
3282 if pileup_base_qual_skip(p, self.min_base_quality): | |
3283 continue | |
3284 result.append(charptr_to_str(pysam_bam_get_qname(p.b))) | |
3285 return result | |
3286 | |
3287 | |
3288 cdef class PileupRead: | |
3289 '''Representation of a read aligned to a particular position in the | |
3290 reference sequence. | |
3291 | |
3292 ''' | |
3293 | |
3294 def __init__(self): | |
3295 raise TypeError( | |
3296 "this class cannot be instantiated from Python") | |
3297 | |
3298 def __str__(self): | |
3299 return "\t".join( | |
3300 map(str, | |
3301 (self.alignment, self.query_position, | |
3302 self.indel, self.level, | |
3303 self.is_del, self.is_head, | |
3304 self.is_tail, self.is_refskip))) | |
3305 | |
3306 property alignment: | |
3307 """a :class:`pysam.AlignedSegment` object of the aligned read""" | |
3308 def __get__(self): | |
3309 return self._alignment | |
3310 | |
3311 property query_position: | |
3312 """position of the read base at the pileup site, 0-based. | |
3313 None if :attr:`is_del` or :attr:`is_refskip` is set. | |
3314 | |
3315 """ | |
3316 def __get__(self): | |
3317 if self.is_del or self.is_refskip: | |
3318 return None | |
3319 else: | |
3320 return self._qpos | |
3321 | |
3322 property query_position_or_next: | |
3323 """position of the read base at the pileup site, 0-based. | |
3324 | |
3325 If the current position is a deletion, returns the next | |
3326 aligned base. | |
3327 | |
3328 """ | |
3329 def __get__(self): | |
3330 return self._qpos | |
3331 | |
3332 property indel: | |
3333 """indel length for the position following the current pileup site. | |
3334 | |
3335 This quantity peeks ahead to the next cigar operation in this | |
3336 alignment. If the next operation is an insertion, indel will | |
3337 be positive. If the next operation is a deletion, it will be | |
3338 negation. 0 if the next operation is not an indel. | |
3339 | |
3340 """ | |
3341 def __get__(self): | |
3342 return self._indel | |
3343 | |
3344 property level: | |
3345 """the level of the read in the "viewer" mode. Note that this value | |
3346 is currently not computed.""" | |
3347 def __get__(self): | |
3348 return self._level | |
3349 | |
3350 property is_del: | |
3351 """1 iff the base on the padded read is a deletion""" | |
3352 def __get__(self): | |
3353 return self._is_del | |
3354 | |
3355 property is_head: | |
3356 """1 iff the base on the padded read is the left-most base.""" | |
3357 def __get__(self): | |
3358 return self._is_head | |
3359 | |
3360 property is_tail: | |
3361 """1 iff the base on the padded read is the right-most base.""" | |
3362 def __get__(self): | |
3363 return self._is_tail | |
3364 | |
3365 property is_refskip: | |
3366 """1 iff the base on the padded read is part of CIGAR N op.""" | |
3367 def __get__(self): | |
3368 return self._is_refskip | |
3369 | |
3370 | |
3371 | |
3372 cpdef enum CIGAR_OPS: | |
3373 CMATCH = 0 | |
3374 CINS = 1 | |
3375 CDEL = 2 | |
3376 CREF_SKIP = 3 | |
3377 CSOFT_CLIP = 4 | |
3378 CHARD_CLIP = 5 | |
3379 CPAD = 6 | |
3380 CEQUAL = 7 | |
3381 CDIFF = 8 | |
3382 CBACK = 9 | |
3383 | |
3384 | |
3385 cpdef enum SAM_FLAGS: | |
3386 # the read is paired in sequencing, no matter whether it is mapped in a pair | |
3387 FPAIRED = 1 | |
3388 # the read is mapped in a proper pair | |
3389 FPROPER_PAIR = 2 | |
3390 # the read itself is unmapped; conflictive with FPROPER_PAIR | |
3391 FUNMAP = 4 | |
3392 # the mate is unmapped | |
3393 FMUNMAP = 8 | |
3394 # the read is mapped to the reverse strand | |
3395 FREVERSE = 16 | |
3396 # the mate is mapped to the reverse strand | |
3397 FMREVERSE = 32 | |
3398 # this is read1 | |
3399 FREAD1 = 64 | |
3400 # this is read2 | |
3401 FREAD2 = 128 | |
3402 # not primary alignment | |
3403 FSECONDARY = 256 | |
3404 # QC failure | |
3405 FQCFAIL = 512 | |
3406 # optical or PCR duplicate | |
3407 FDUP = 1024 | |
3408 # supplementary alignment | |
3409 FSUPPLEMENTARY = 2048 | |
3410 | |
3411 | |
3412 __all__ = [ | |
3413 "AlignedSegment", | |
3414 "PileupColumn", | |
3415 "PileupRead", | |
3416 "CMATCH", | |
3417 "CINS", | |
3418 "CDEL", | |
3419 "CREF_SKIP", | |
3420 "CSOFT_CLIP", | |
3421 "CHARD_CLIP", | |
3422 "CPAD", | |
3423 "CEQUAL", | |
3424 "CDIFF", | |
3425 "CBACK", | |
3426 "FPAIRED", | |
3427 "FPROPER_PAIR", | |
3428 "FUNMAP", | |
3429 "FMUNMAP", | |
3430 "FREVERSE", | |
3431 "FMREVERSE", | |
3432 "FREAD1", | |
3433 "FREAD2", | |
3434 "FSECONDARY", | |
3435 "FQCFAIL", | |
3436 "FDUP", | |
3437 "FSUPPLEMENTARY", | |
3438 "KEY_NAMES"] |