jpayne@69
|
1 # cython: language_level=3
|
jpayne@69
|
2 # cython: embedsignature=True
|
jpayne@69
|
3 # cython: profile=True
|
jpayne@69
|
4 ###############################################################################
|
jpayne@69
|
5 ###############################################################################
|
jpayne@69
|
6 # Cython wrapper for SAM/BAM/CRAM files based on htslib
|
jpayne@69
|
7 ###############################################################################
|
jpayne@69
|
8 # The principal classes defined in this module are:
|
jpayne@69
|
9 #
|
jpayne@69
|
10 # class AlignedSegment an aligned segment (read)
|
jpayne@69
|
11 #
|
jpayne@69
|
12 # class PileupColumn a collection of segments (PileupRead) aligned to
|
jpayne@69
|
13 # a particular genomic position.
|
jpayne@69
|
14 #
|
jpayne@69
|
15 # class PileupRead an AlignedSegment aligned to a particular genomic
|
jpayne@69
|
16 # position. Contains additional attributes with respect
|
jpayne@69
|
17 # to this.
|
jpayne@69
|
18 #
|
jpayne@69
|
19 # Additionally this module defines numerous additional classes that are part
|
jpayne@69
|
20 # of the internal API. These are:
|
jpayne@69
|
21 #
|
jpayne@69
|
22 # Various iterator classes to iterate over alignments in sequential (IteratorRow)
|
jpayne@69
|
23 # or in a stacked fashion (IteratorColumn):
|
jpayne@69
|
24 #
|
jpayne@69
|
25 # class IteratorRow
|
jpayne@69
|
26 # class IteratorRowRegion
|
jpayne@69
|
27 # class IteratorRowHead
|
jpayne@69
|
28 # class IteratorRowAll
|
jpayne@69
|
29 # class IteratorRowAllRefs
|
jpayne@69
|
30 # class IteratorRowSelection
|
jpayne@69
|
31 #
|
jpayne@69
|
32 ###############################################################################
|
jpayne@69
|
33 #
|
jpayne@69
|
34 # The MIT License
|
jpayne@69
|
35 #
|
jpayne@69
|
36 # Copyright (c) 2015 Andreas Heger
|
jpayne@69
|
37 #
|
jpayne@69
|
38 # Permission is hereby granted, free of charge, to any person obtaining a
|
jpayne@69
|
39 # copy of this software and associated documentation files (the "Software"),
|
jpayne@69
|
40 # to deal in the Software without restriction, including without limitation
|
jpayne@69
|
41 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
jpayne@69
|
42 # and/or sell copies of the Software, and to permit persons to whom the
|
jpayne@69
|
43 # Software is furnished to do so, subject to the following conditions:
|
jpayne@69
|
44 #
|
jpayne@69
|
45 # The above copyright notice and this permission notice shall be included in
|
jpayne@69
|
46 # all copies or substantial portions of the Software.
|
jpayne@69
|
47 #
|
jpayne@69
|
48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
jpayne@69
|
49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
jpayne@69
|
50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
jpayne@69
|
51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
jpayne@69
|
52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
jpayne@69
|
53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
jpayne@69
|
54 # DEALINGS IN THE SOFTWARE.
|
jpayne@69
|
55 #
|
jpayne@69
|
56 ###############################################################################
|
jpayne@69
|
57 import re
|
jpayne@69
|
58 import array
|
jpayne@69
|
59 import json
|
jpayne@69
|
60 import string
|
jpayne@69
|
61 import ctypes
|
jpayne@69
|
62 import struct
|
jpayne@69
|
63
|
jpayne@69
|
64 cimport cython
|
jpayne@69
|
65 from cpython cimport array as c_array
|
jpayne@69
|
66 from cpython cimport PyBytes_FromStringAndSize
|
jpayne@69
|
67 from libc.string cimport memset, strchr
|
jpayne@69
|
68 from cpython cimport array as c_array
|
jpayne@69
|
69 from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
|
jpayne@69
|
70 INT8_MAX, INT16_MAX, INT32_MAX, \
|
jpayne@69
|
71 UINT8_MAX, UINT16_MAX, UINT32_MAX
|
jpayne@69
|
72
|
jpayne@69
|
73 from pysam.libchtslib cimport HTS_IDX_NOCOOR
|
jpayne@69
|
74 from pysam.libcutils cimport force_bytes, force_str, \
|
jpayne@69
|
75 charptr_to_str, charptr_to_bytes
|
jpayne@69
|
76 from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \
|
jpayne@69
|
77 array_to_qualitystring
|
jpayne@69
|
78
|
jpayne@69
|
79 # Constants for binary tag conversion
|
jpayne@69
|
80 cdef char * htslib_types = 'cCsSiIf'
|
jpayne@69
|
81 cdef char * parray_types = 'bBhHiIf'
|
jpayne@69
|
82
|
jpayne@69
|
83 # translation tables
|
jpayne@69
|
84
|
jpayne@69
|
85 # cigar code to character and vice versa
|
jpayne@69
|
86 cdef char* CODE2CIGAR= "MIDNSHP=XB"
|
jpayne@69
|
87 cdef int NCIGAR_CODES = 10
|
jpayne@69
|
88
|
jpayne@69
|
89 CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
|
jpayne@69
|
90 CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
|
jpayne@69
|
91
|
jpayne@69
|
92 # names for keys in dictionary representation of an AlignedSegment
|
jpayne@69
|
93 KEY_NAMES = ["name", "flag", "ref_name", "ref_pos", "map_quality", "cigar",
|
jpayne@69
|
94 "next_ref_name", "next_ref_pos", "length", "seq", "qual", "tags"]
|
jpayne@69
|
95
|
jpayne@69
|
96 #####################################################################
|
jpayne@69
|
97 # C multiplication with wrapping around
|
jpayne@69
|
98 cdef inline uint32_t c_mul(uint32_t a, uint32_t b):
|
jpayne@69
|
99 return (a * b) & 0xffffffff
|
jpayne@69
|
100
|
jpayne@69
|
101
|
jpayne@69
|
102 cdef inline uint8_t tolower(uint8_t ch):
|
jpayne@69
|
103 if ch >= 65 and ch <= 90:
|
jpayne@69
|
104 return ch + 32
|
jpayne@69
|
105 else:
|
jpayne@69
|
106 return ch
|
jpayne@69
|
107
|
jpayne@69
|
108
|
jpayne@69
|
109 cdef inline uint8_t toupper(uint8_t ch):
|
jpayne@69
|
110 if ch >= 97 and ch <= 122:
|
jpayne@69
|
111 return ch - 32
|
jpayne@69
|
112 else:
|
jpayne@69
|
113 return ch
|
jpayne@69
|
114
|
jpayne@69
|
115
|
jpayne@69
|
116 cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b):
|
jpayne@69
|
117 if ch == b'=':
|
jpayne@69
|
118 if bam_is_rev(b):
|
jpayne@69
|
119 return b','
|
jpayne@69
|
120 else:
|
jpayne@69
|
121 return b'.'
|
jpayne@69
|
122 else:
|
jpayne@69
|
123 if bam_is_rev(b):
|
jpayne@69
|
124 return tolower(ch)
|
jpayne@69
|
125 else:
|
jpayne@69
|
126 return toupper(ch)
|
jpayne@69
|
127
|
jpayne@69
|
128
|
jpayne@69
|
129 cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold):
|
jpayne@69
|
130 cdef uint32_t c
|
jpayne@69
|
131 if p.qpos < p.b.core.l_qseq:
|
jpayne@69
|
132 c = bam_get_qual(p.b)[p.qpos]
|
jpayne@69
|
133 else:
|
jpayne@69
|
134 c = 0
|
jpayne@69
|
135 if c < threshold:
|
jpayne@69
|
136 return True
|
jpayne@69
|
137 return False
|
jpayne@69
|
138
|
jpayne@69
|
139
|
jpayne@69
|
140 cdef inline char map_typecode_htslib_to_python(uint8_t s):
|
jpayne@69
|
141 """map an htslib typecode to the corresponding python typecode
|
jpayne@69
|
142 to be used in the struct or array modules."""
|
jpayne@69
|
143
|
jpayne@69
|
144 # map type from htslib to python array
|
jpayne@69
|
145 cdef char * f = strchr(htslib_types, s)
|
jpayne@69
|
146
|
jpayne@69
|
147 if f == NULL:
|
jpayne@69
|
148 return 0
|
jpayne@69
|
149 return parray_types[f - htslib_types]
|
jpayne@69
|
150
|
jpayne@69
|
151
|
jpayne@69
|
152 cdef inline uint8_t map_typecode_python_to_htslib(char s):
|
jpayne@69
|
153 """determine value type from type code of array"""
|
jpayne@69
|
154 cdef char * f = strchr(parray_types, s)
|
jpayne@69
|
155 if f == NULL:
|
jpayne@69
|
156 return 0
|
jpayne@69
|
157 return htslib_types[f - parray_types]
|
jpayne@69
|
158
|
jpayne@69
|
159
|
jpayne@69
|
160 cdef inline void update_bin(bam1_t * src):
|
jpayne@69
|
161 if src.core.flag & BAM_FUNMAP:
|
jpayne@69
|
162 # treat alignment as length of 1 for unmapped reads
|
jpayne@69
|
163 src.core.bin = hts_reg2bin(
|
jpayne@69
|
164 src.core.pos,
|
jpayne@69
|
165 src.core.pos + 1,
|
jpayne@69
|
166 14,
|
jpayne@69
|
167 5)
|
jpayne@69
|
168 elif pysam_get_n_cigar(src):
|
jpayne@69
|
169 src.core.bin = hts_reg2bin(
|
jpayne@69
|
170 src.core.pos,
|
jpayne@69
|
171 bam_endpos(src),
|
jpayne@69
|
172 14,
|
jpayne@69
|
173 5)
|
jpayne@69
|
174 else:
|
jpayne@69
|
175 src.core.bin = hts_reg2bin(
|
jpayne@69
|
176 src.core.pos,
|
jpayne@69
|
177 src.core.pos + 1,
|
jpayne@69
|
178 14,
|
jpayne@69
|
179 5)
|
jpayne@69
|
180
|
jpayne@69
|
181
|
jpayne@69
|
182 # optional tag data manipulation
|
jpayne@69
|
183 cdef convert_binary_tag(uint8_t * tag):
|
jpayne@69
|
184 """return bytesize, number of values and array of values
|
jpayne@69
|
185 in aux_data memory location pointed to by tag."""
|
jpayne@69
|
186 cdef uint8_t auxtype
|
jpayne@69
|
187 cdef uint8_t byte_size
|
jpayne@69
|
188 cdef int32_t nvalues
|
jpayne@69
|
189 # get byte size
|
jpayne@69
|
190 auxtype = tag[0]
|
jpayne@69
|
191 byte_size = aux_type2size(auxtype)
|
jpayne@69
|
192 tag += 1
|
jpayne@69
|
193 # get number of values in array
|
jpayne@69
|
194 nvalues = (<int32_t*>tag)[0]
|
jpayne@69
|
195 tag += 4
|
jpayne@69
|
196
|
jpayne@69
|
197 # define python array
|
jpayne@69
|
198 cdef c_array.array c_values = array.array(
|
jpayne@69
|
199 chr(map_typecode_htslib_to_python(auxtype)))
|
jpayne@69
|
200 c_array.resize(c_values, nvalues)
|
jpayne@69
|
201
|
jpayne@69
|
202 # copy data
|
jpayne@69
|
203 memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size)
|
jpayne@69
|
204
|
jpayne@69
|
205 # no need to check for endian-ness as bam1_core_t fields
|
jpayne@69
|
206 # and aux_data are in host endian-ness. See sam.c and calls
|
jpayne@69
|
207 # to swap_data
|
jpayne@69
|
208 return byte_size, nvalues, c_values
|
jpayne@69
|
209
|
jpayne@69
|
210
|
jpayne@69
|
211 cdef inline uint8_t get_tag_typecode(value, value_type=None):
|
jpayne@69
|
212 """guess type code for a *value*. If *value_type* is None, the type
|
jpayne@69
|
213 code will be inferred based on the Python type of *value*
|
jpayne@69
|
214
|
jpayne@69
|
215 """
|
jpayne@69
|
216 # 0 is unknown typecode
|
jpayne@69
|
217 cdef char typecode = 0
|
jpayne@69
|
218
|
jpayne@69
|
219 if value_type is None:
|
jpayne@69
|
220 if isinstance(value, int):
|
jpayne@69
|
221 if value < 0:
|
jpayne@69
|
222 if value >= INT8_MIN:
|
jpayne@69
|
223 typecode = b'c'
|
jpayne@69
|
224 elif value >= INT16_MIN:
|
jpayne@69
|
225 typecode = b's'
|
jpayne@69
|
226 elif value >= INT32_MIN:
|
jpayne@69
|
227 typecode = b'i'
|
jpayne@69
|
228 # unsigned ints
|
jpayne@69
|
229 else:
|
jpayne@69
|
230 if value <= UINT8_MAX:
|
jpayne@69
|
231 typecode = b'C'
|
jpayne@69
|
232 elif value <= UINT16_MAX:
|
jpayne@69
|
233 typecode = b'S'
|
jpayne@69
|
234 elif value <= UINT32_MAX:
|
jpayne@69
|
235 typecode = b'I'
|
jpayne@69
|
236 elif isinstance(value, float):
|
jpayne@69
|
237 typecode = b'f'
|
jpayne@69
|
238 elif isinstance(value, str):
|
jpayne@69
|
239 typecode = b'Z'
|
jpayne@69
|
240 elif isinstance(value, bytes):
|
jpayne@69
|
241 typecode = b'Z'
|
jpayne@69
|
242 elif isinstance(value, array.array) or \
|
jpayne@69
|
243 isinstance(value, list) or \
|
jpayne@69
|
244 isinstance(value, tuple):
|
jpayne@69
|
245 typecode = b'B'
|
jpayne@69
|
246 else:
|
jpayne@69
|
247 if value_type in 'aAsSIcCZidfH':
|
jpayne@69
|
248 typecode = force_bytes(value_type)[0]
|
jpayne@69
|
249
|
jpayne@69
|
250 return typecode
|
jpayne@69
|
251
|
jpayne@69
|
252
|
jpayne@69
|
253 cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None):
|
jpayne@69
|
254 '''returns the value typecode of a value.
|
jpayne@69
|
255
|
jpayne@69
|
256 If max is specified, the appropriate type is returned for a range
|
jpayne@69
|
257 where value is the minimum.
|
jpayne@69
|
258
|
jpayne@69
|
259 Note that this method returns types from the extended BAM alphabet
|
jpayne@69
|
260 of types that includes tags that are not part of the SAM
|
jpayne@69
|
261 specification.
|
jpayne@69
|
262 '''
|
jpayne@69
|
263
|
jpayne@69
|
264
|
jpayne@69
|
265 cdef uint8_t typecode
|
jpayne@69
|
266
|
jpayne@69
|
267 t = type(value)
|
jpayne@69
|
268
|
jpayne@69
|
269 if t is float:
|
jpayne@69
|
270 typecode = b'f'
|
jpayne@69
|
271 elif t is int:
|
jpayne@69
|
272 if max_value is None:
|
jpayne@69
|
273 max_value = value
|
jpayne@69
|
274 if min_value is None:
|
jpayne@69
|
275 min_value = value
|
jpayne@69
|
276 # signed ints
|
jpayne@69
|
277 if min_value < 0:
|
jpayne@69
|
278 if min_value >= INT8_MIN and max_value <= INT8_MAX:
|
jpayne@69
|
279 typecode = b'c'
|
jpayne@69
|
280 elif min_value >= INT16_MIN and max_value <= INT16_MAX:
|
jpayne@69
|
281 typecode = b's'
|
jpayne@69
|
282 elif min_value >= INT32_MIN or max_value <= INT32_MAX:
|
jpayne@69
|
283 typecode = b'i'
|
jpayne@69
|
284 else:
|
jpayne@69
|
285 raise ValueError(
|
jpayne@69
|
286 "at least one signed integer out of range of "
|
jpayne@69
|
287 "BAM/SAM specification")
|
jpayne@69
|
288 # unsigned ints
|
jpayne@69
|
289 else:
|
jpayne@69
|
290 if max_value <= UINT8_MAX:
|
jpayne@69
|
291 typecode = b'C'
|
jpayne@69
|
292 elif max_value <= UINT16_MAX:
|
jpayne@69
|
293 typecode = b'S'
|
jpayne@69
|
294 elif max_value <= UINT32_MAX:
|
jpayne@69
|
295 typecode = b'I'
|
jpayne@69
|
296 else:
|
jpayne@69
|
297 raise ValueError(
|
jpayne@69
|
298 "at least one integer out of range of BAM/SAM specification")
|
jpayne@69
|
299 else:
|
jpayne@69
|
300 # Note: hex strings (H) are not supported yet
|
jpayne@69
|
301 if t is not bytes:
|
jpayne@69
|
302 value = value.encode('ascii')
|
jpayne@69
|
303 if len(value) == 1:
|
jpayne@69
|
304 typecode = b'A'
|
jpayne@69
|
305 else:
|
jpayne@69
|
306 typecode = b'Z'
|
jpayne@69
|
307
|
jpayne@69
|
308 return typecode
|
jpayne@69
|
309
|
jpayne@69
|
310
|
jpayne@69
|
311 # mapping python array.array and htslib typecodes to struct typecodes
|
jpayne@69
|
312 DATATYPE2FORMAT = {
|
jpayne@69
|
313 ord('c'): ('b', 1),
|
jpayne@69
|
314 ord('C'): ('B', 1),
|
jpayne@69
|
315 ord('s'): ('h', 2),
|
jpayne@69
|
316 ord('S'): ('H', 2),
|
jpayne@69
|
317 ord('i'): ('i', 4),
|
jpayne@69
|
318 ord('I'): ('I', 4),
|
jpayne@69
|
319 ord('f'): ('f', 4),
|
jpayne@69
|
320 ord('d'): ('d', 8),
|
jpayne@69
|
321 ord('A'): ('c', 1),
|
jpayne@69
|
322 ord('a'): ('c', 1)}
|
jpayne@69
|
323
|
jpayne@69
|
324
|
jpayne@69
|
325 cdef inline pack_tags(tags):
|
jpayne@69
|
326 """pack a list of tags. Each tag is a tuple of (tag, tuple).
|
jpayne@69
|
327
|
jpayne@69
|
328 Values are packed into the most space efficient data structure
|
jpayne@69
|
329 possible unless the tag contains a third field with the typecode.
|
jpayne@69
|
330
|
jpayne@69
|
331 Returns a format string and the associated list of arguments to be
|
jpayne@69
|
332 used in a call to struct.pack_into.
|
jpayne@69
|
333 """
|
jpayne@69
|
334 fmts, args = ["<"], []
|
jpayne@69
|
335
|
jpayne@69
|
336 # htslib typecode
|
jpayne@69
|
337 cdef uint8_t typecode
|
jpayne@69
|
338 for tag in tags:
|
jpayne@69
|
339
|
jpayne@69
|
340 if len(tag) == 2:
|
jpayne@69
|
341 pytag, value = tag
|
jpayne@69
|
342 valuetype = None
|
jpayne@69
|
343 elif len(tag) == 3:
|
jpayne@69
|
344 pytag, value, valuetype = tag
|
jpayne@69
|
345 else:
|
jpayne@69
|
346 raise ValueError("malformatted tag: %s" % str(tag))
|
jpayne@69
|
347
|
jpayne@69
|
348 if valuetype is None:
|
jpayne@69
|
349 typecode = 0
|
jpayne@69
|
350 else:
|
jpayne@69
|
351 # only first character in valuecode matters
|
jpayne@69
|
352 typecode = force_bytes(valuetype)[0]
|
jpayne@69
|
353
|
jpayne@69
|
354 pytag = force_bytes(pytag)
|
jpayne@69
|
355 pytype = type(value)
|
jpayne@69
|
356
|
jpayne@69
|
357 if pytype is tuple or pytype is list:
|
jpayne@69
|
358 # binary tags from tuples or lists
|
jpayne@69
|
359 if not typecode:
|
jpayne@69
|
360 # automatically determine value type - first value
|
jpayne@69
|
361 # determines type. If there is a mix of types, the
|
jpayne@69
|
362 # result is undefined.
|
jpayne@69
|
363 typecode = get_btag_typecode(min(value),
|
jpayne@69
|
364 min_value=min(value),
|
jpayne@69
|
365 max_value=max(value))
|
jpayne@69
|
366
|
jpayne@69
|
367 if typecode not in DATATYPE2FORMAT:
|
jpayne@69
|
368 raise ValueError("invalid value type '{}'".format(chr(typecode)))
|
jpayne@69
|
369
|
jpayne@69
|
370 datafmt = "2sBBI%i%s" % (len(value), DATATYPE2FORMAT[typecode][0])
|
jpayne@69
|
371 args.extend([pytag[:2],
|
jpayne@69
|
372 ord("B"),
|
jpayne@69
|
373 typecode,
|
jpayne@69
|
374 len(value)] + list(value))
|
jpayne@69
|
375
|
jpayne@69
|
376 elif isinstance(value, array.array):
|
jpayne@69
|
377 # binary tags from arrays
|
jpayne@69
|
378 if typecode == 0:
|
jpayne@69
|
379 typecode = map_typecode_python_to_htslib(ord(value.typecode))
|
jpayne@69
|
380
|
jpayne@69
|
381 if typecode == 0:
|
jpayne@69
|
382 raise ValueError("unsupported type code '{}'".format(value.typecode))
|
jpayne@69
|
383
|
jpayne@69
|
384 if typecode not in DATATYPE2FORMAT:
|
jpayne@69
|
385 raise ValueError("invalid value type '{}'".format(chr(typecode)))
|
jpayne@69
|
386
|
jpayne@69
|
387 # use array.tostring() to retrieve byte representation and
|
jpayne@69
|
388 # save as bytes
|
jpayne@69
|
389 datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1])
|
jpayne@69
|
390 args.extend([pytag[:2],
|
jpayne@69
|
391 ord("B"),
|
jpayne@69
|
392 typecode,
|
jpayne@69
|
393 len(value),
|
jpayne@69
|
394 value.tobytes()])
|
jpayne@69
|
395
|
jpayne@69
|
396 else:
|
jpayne@69
|
397 if typecode == 0:
|
jpayne@69
|
398 typecode = get_tag_typecode(value)
|
jpayne@69
|
399 if typecode == 0:
|
jpayne@69
|
400 raise ValueError("could not deduce typecode for value {}".format(value))
|
jpayne@69
|
401
|
jpayne@69
|
402 if typecode == b'a' or typecode == b'A' or typecode == b'Z' or typecode == b'H':
|
jpayne@69
|
403 value = force_bytes(value)
|
jpayne@69
|
404
|
jpayne@69
|
405 if typecode == b"a":
|
jpayne@69
|
406 typecode = b'A'
|
jpayne@69
|
407
|
jpayne@69
|
408 if typecode == b'Z' or typecode == b'H':
|
jpayne@69
|
409 datafmt = "2sB%is" % (len(value)+1)
|
jpayne@69
|
410 elif typecode in DATATYPE2FORMAT:
|
jpayne@69
|
411 datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0]
|
jpayne@69
|
412 else:
|
jpayne@69
|
413 raise ValueError("invalid value type '{}'".format(chr(typecode)))
|
jpayne@69
|
414
|
jpayne@69
|
415 args.extend([pytag[:2],
|
jpayne@69
|
416 typecode,
|
jpayne@69
|
417 value])
|
jpayne@69
|
418
|
jpayne@69
|
419 fmts.append(datafmt)
|
jpayne@69
|
420
|
jpayne@69
|
421 return "".join(fmts), args
|
jpayne@69
|
422
|
jpayne@69
|
423
|
jpayne@69
|
424 cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src):
|
jpayne@69
|
425 """return query length computed from CIGAR alignment.
|
jpayne@69
|
426
|
jpayne@69
|
427 Length ignores hard-clipped bases.
|
jpayne@69
|
428
|
jpayne@69
|
429 Return 0 if there is no CIGAR alignment.
|
jpayne@69
|
430 """
|
jpayne@69
|
431
|
jpayne@69
|
432 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
|
jpayne@69
|
433
|
jpayne@69
|
434 if cigar_p == NULL:
|
jpayne@69
|
435 return 0
|
jpayne@69
|
436
|
jpayne@69
|
437 cdef uint32_t k, qpos
|
jpayne@69
|
438 cdef int op
|
jpayne@69
|
439 qpos = 0
|
jpayne@69
|
440
|
jpayne@69
|
441 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
442 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
443
|
jpayne@69
|
444 if op == BAM_CMATCH or \
|
jpayne@69
|
445 op == BAM_CINS or \
|
jpayne@69
|
446 op == BAM_CSOFT_CLIP or \
|
jpayne@69
|
447 op == BAM_CEQUAL or \
|
jpayne@69
|
448 op == BAM_CDIFF:
|
jpayne@69
|
449 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
450
|
jpayne@69
|
451 return qpos
|
jpayne@69
|
452
|
jpayne@69
|
453
|
jpayne@69
|
454 cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src):
|
jpayne@69
|
455 """return query length computed from CIGAR alignment.
|
jpayne@69
|
456
|
jpayne@69
|
457 Length includes hard-clipped bases.
|
jpayne@69
|
458
|
jpayne@69
|
459 Return 0 if there is no CIGAR alignment.
|
jpayne@69
|
460 """
|
jpayne@69
|
461
|
jpayne@69
|
462 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
|
jpayne@69
|
463
|
jpayne@69
|
464 if cigar_p == NULL:
|
jpayne@69
|
465 return 0
|
jpayne@69
|
466
|
jpayne@69
|
467 cdef uint32_t k, qpos
|
jpayne@69
|
468 cdef int op
|
jpayne@69
|
469 qpos = 0
|
jpayne@69
|
470
|
jpayne@69
|
471 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
472 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
473
|
jpayne@69
|
474 if op == BAM_CMATCH or \
|
jpayne@69
|
475 op == BAM_CINS or \
|
jpayne@69
|
476 op == BAM_CSOFT_CLIP or \
|
jpayne@69
|
477 op == BAM_CHARD_CLIP or \
|
jpayne@69
|
478 op == BAM_CEQUAL or \
|
jpayne@69
|
479 op == BAM_CDIFF:
|
jpayne@69
|
480 qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
481
|
jpayne@69
|
482 return qpos
|
jpayne@69
|
483
|
jpayne@69
|
484
|
jpayne@69
|
485 cdef inline int32_t getQueryStart(bam1_t *src) except -1:
|
jpayne@69
|
486 cdef uint32_t * cigar_p
|
jpayne@69
|
487 cdef uint32_t start_offset = 0
|
jpayne@69
|
488 cdef uint32_t k, op
|
jpayne@69
|
489
|
jpayne@69
|
490 cigar_p = pysam_bam_get_cigar(src);
|
jpayne@69
|
491 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
492 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
493 if op == BAM_CHARD_CLIP:
|
jpayne@69
|
494 if start_offset != 0 and start_offset != src.core.l_qseq:
|
jpayne@69
|
495 raise ValueError('Invalid clipping in CIGAR string')
|
jpayne@69
|
496 elif op == BAM_CSOFT_CLIP:
|
jpayne@69
|
497 start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
498 else:
|
jpayne@69
|
499 break
|
jpayne@69
|
500
|
jpayne@69
|
501 return start_offset
|
jpayne@69
|
502
|
jpayne@69
|
503
|
jpayne@69
|
504 cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
|
jpayne@69
|
505 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
|
jpayne@69
|
506 cdef uint32_t end_offset = src.core.l_qseq
|
jpayne@69
|
507 cdef uint32_t k, op
|
jpayne@69
|
508
|
jpayne@69
|
509 # if there is no sequence, compute length from cigar string
|
jpayne@69
|
510 if end_offset == 0:
|
jpayne@69
|
511 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
512 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
513 if op == BAM_CMATCH or \
|
jpayne@69
|
514 op == BAM_CINS or \
|
jpayne@69
|
515 op == BAM_CEQUAL or \
|
jpayne@69
|
516 op == BAM_CDIFF or \
|
jpayne@69
|
517 (op == BAM_CSOFT_CLIP and end_offset == 0):
|
jpayne@69
|
518 end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
519 else:
|
jpayne@69
|
520 # walk backwards in cigar string
|
jpayne@69
|
521 for k from pysam_get_n_cigar(src) > k >= 1:
|
jpayne@69
|
522 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
523 if op == BAM_CHARD_CLIP:
|
jpayne@69
|
524 if end_offset != src.core.l_qseq:
|
jpayne@69
|
525 raise ValueError('Invalid clipping in CIGAR string')
|
jpayne@69
|
526 elif op == BAM_CSOFT_CLIP:
|
jpayne@69
|
527 end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
528 else:
|
jpayne@69
|
529 break
|
jpayne@69
|
530
|
jpayne@69
|
531 return end_offset
|
jpayne@69
|
532
|
jpayne@69
|
533
|
jpayne@69
|
534 cdef inline bytes getSequenceInRange(bam1_t *src,
|
jpayne@69
|
535 uint32_t start,
|
jpayne@69
|
536 uint32_t end):
|
jpayne@69
|
537 """return python string of the sequence in a bam1_t object.
|
jpayne@69
|
538 """
|
jpayne@69
|
539
|
jpayne@69
|
540 cdef uint8_t * p
|
jpayne@69
|
541 cdef uint32_t k
|
jpayne@69
|
542 cdef char * s
|
jpayne@69
|
543
|
jpayne@69
|
544 if not src.core.l_qseq:
|
jpayne@69
|
545 return None
|
jpayne@69
|
546
|
jpayne@69
|
547 seq = PyBytes_FromStringAndSize(NULL, end - start)
|
jpayne@69
|
548 s = <char*>seq
|
jpayne@69
|
549 p = pysam_bam_get_seq(src)
|
jpayne@69
|
550
|
jpayne@69
|
551 for k from start <= k < end:
|
jpayne@69
|
552 # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
|
jpayne@69
|
553 # note: do not use string literal as it will be a python string
|
jpayne@69
|
554 s[k-start] = seq_nt16_str[p[k//2] >> 4 * (1 - k%2) & 0xf]
|
jpayne@69
|
555
|
jpayne@69
|
556 return charptr_to_bytes(seq)
|
jpayne@69
|
557
|
jpayne@69
|
558
|
jpayne@69
|
559 cdef inline object getQualitiesInRange(bam1_t *src,
|
jpayne@69
|
560 uint32_t start,
|
jpayne@69
|
561 uint32_t end):
|
jpayne@69
|
562 """return python array of quality values from a bam1_t object"""
|
jpayne@69
|
563
|
jpayne@69
|
564 cdef uint8_t * p
|
jpayne@69
|
565 cdef uint32_t k
|
jpayne@69
|
566
|
jpayne@69
|
567 p = pysam_bam_get_qual(src)
|
jpayne@69
|
568 if p[0] == 0xff:
|
jpayne@69
|
569 return None
|
jpayne@69
|
570
|
jpayne@69
|
571 # 'B': unsigned char
|
jpayne@69
|
572 cdef c_array.array result = array.array('B', [0])
|
jpayne@69
|
573 c_array.resize(result, end - start)
|
jpayne@69
|
574
|
jpayne@69
|
575 # copy data
|
jpayne@69
|
576 memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
|
jpayne@69
|
577
|
jpayne@69
|
578 return result
|
jpayne@69
|
579
|
jpayne@69
|
580
|
jpayne@69
|
581 #####################################################################
|
jpayne@69
|
582 ## factory methods for instantiating extension classes
|
jpayne@69
|
583 cdef class AlignedSegment
|
jpayne@69
|
584 cdef AlignedSegment makeAlignedSegment(bam1_t *src,
|
jpayne@69
|
585 AlignmentHeader header):
|
jpayne@69
|
586 '''return an AlignedSegment object constructed from `src`'''
|
jpayne@69
|
587 # note that the following does not call __init__
|
jpayne@69
|
588 cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
|
jpayne@69
|
589 dest._delegate = bam_dup1(src)
|
jpayne@69
|
590 dest.header = header
|
jpayne@69
|
591 return dest
|
jpayne@69
|
592
|
jpayne@69
|
593
|
jpayne@69
|
594 cdef class PileupColumn
|
jpayne@69
|
595 cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp,
|
jpayne@69
|
596 int tid,
|
jpayne@69
|
597 int pos,
|
jpayne@69
|
598 int n_pu,
|
jpayne@69
|
599 uint32_t min_base_quality,
|
jpayne@69
|
600 char * reference_sequence,
|
jpayne@69
|
601 AlignmentHeader header):
|
jpayne@69
|
602 '''return a PileupColumn object constructed from pileup in `plp` and
|
jpayne@69
|
603 setting additional attributes.
|
jpayne@69
|
604
|
jpayne@69
|
605 '''
|
jpayne@69
|
606 # note that the following does not call __init__
|
jpayne@69
|
607 cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
|
jpayne@69
|
608 dest.header = header
|
jpayne@69
|
609 dest.plp = plp
|
jpayne@69
|
610 dest.tid = tid
|
jpayne@69
|
611 dest.pos = pos
|
jpayne@69
|
612 dest.n_pu = n_pu
|
jpayne@69
|
613 dest.min_base_quality = min_base_quality
|
jpayne@69
|
614 dest.reference_sequence = reference_sequence
|
jpayne@69
|
615 dest.buf.l = dest.buf.m = 0
|
jpayne@69
|
616 dest.buf.s = NULL
|
jpayne@69
|
617
|
jpayne@69
|
618 return dest
|
jpayne@69
|
619
|
jpayne@69
|
620
|
jpayne@69
|
621 cdef class PileupRead
|
jpayne@69
|
622 cdef PileupRead makePileupRead(const bam_pileup1_t *src,
|
jpayne@69
|
623 AlignmentHeader header):
|
jpayne@69
|
624 '''return a PileupRead object construted from a bam_pileup1_t * object.'''
|
jpayne@69
|
625 # note that the following does not call __init__
|
jpayne@69
|
626 cdef PileupRead dest = PileupRead.__new__(PileupRead)
|
jpayne@69
|
627 dest._alignment = makeAlignedSegment(src.b, header)
|
jpayne@69
|
628 dest._qpos = src.qpos
|
jpayne@69
|
629 dest._indel = src.indel
|
jpayne@69
|
630 dest._level = src.level
|
jpayne@69
|
631 dest._is_del = src.is_del
|
jpayne@69
|
632 dest._is_head = src.is_head
|
jpayne@69
|
633 dest._is_tail = src.is_tail
|
jpayne@69
|
634 dest._is_refskip = src.is_refskip
|
jpayne@69
|
635 return dest
|
jpayne@69
|
636
|
jpayne@69
|
637
|
jpayne@69
|
638 cdef inline uint32_t get_alignment_length(bam1_t *src):
|
jpayne@69
|
639 cdef uint32_t k = 0
|
jpayne@69
|
640 cdef uint32_t l = 0
|
jpayne@69
|
641 if src == NULL:
|
jpayne@69
|
642 return 0
|
jpayne@69
|
643 cdef uint32_t * cigar_p = bam_get_cigar(src)
|
jpayne@69
|
644 if cigar_p == NULL:
|
jpayne@69
|
645 return 0
|
jpayne@69
|
646 cdef int op
|
jpayne@69
|
647 cdef uint32_t n = pysam_get_n_cigar(src)
|
jpayne@69
|
648 for k from 0 <= k < n:
|
jpayne@69
|
649 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
650 if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
|
jpayne@69
|
651 continue
|
jpayne@69
|
652 l += cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
653 return l
|
jpayne@69
|
654
|
jpayne@69
|
655
|
jpayne@69
|
656 cdef inline uint32_t get_md_reference_length(char * md_tag):
|
jpayne@69
|
657 cdef int l = 0
|
jpayne@69
|
658 cdef int md_idx = 0
|
jpayne@69
|
659 cdef int nmatches = 0
|
jpayne@69
|
660
|
jpayne@69
|
661 while md_tag[md_idx] != 0:
|
jpayne@69
|
662 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
|
jpayne@69
|
663 nmatches *= 10
|
jpayne@69
|
664 nmatches += md_tag[md_idx] - 48
|
jpayne@69
|
665 md_idx += 1
|
jpayne@69
|
666 continue
|
jpayne@69
|
667 else:
|
jpayne@69
|
668 l += nmatches
|
jpayne@69
|
669 nmatches = 0
|
jpayne@69
|
670 if md_tag[md_idx] == b'^':
|
jpayne@69
|
671 md_idx += 1
|
jpayne@69
|
672 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
|
jpayne@69
|
673 md_idx += 1
|
jpayne@69
|
674 l += 1
|
jpayne@69
|
675 else:
|
jpayne@69
|
676 md_idx += 1
|
jpayne@69
|
677 l += 1
|
jpayne@69
|
678
|
jpayne@69
|
679 l += nmatches
|
jpayne@69
|
680 return l
|
jpayne@69
|
681
|
jpayne@69
|
682 # TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ...
|
jpayne@69
|
683 cdef inline bytes build_alignment_sequence(bam1_t * src):
|
jpayne@69
|
684 """return expanded sequence from MD tag.
|
jpayne@69
|
685
|
jpayne@69
|
686 The sequence includes substitutions and both insertions in the
|
jpayne@69
|
687 reference as well as deletions to the reference sequence. Combine
|
jpayne@69
|
688 with the cigar string to reconstitute the query or the reference
|
jpayne@69
|
689 sequence.
|
jpayne@69
|
690
|
jpayne@69
|
691 Positions corresponding to `N` (skipped region from the reference)
|
jpayne@69
|
692 or `P` (padding (silent deletion from padded reference)) in the CIGAR
|
jpayne@69
|
693 string will not appear in the returned sequence. The MD should
|
jpayne@69
|
694 correspondingly not contain these. Thus proper tags are::
|
jpayne@69
|
695
|
jpayne@69
|
696 Deletion from the reference: cigar=5M1D5M MD=5^C5
|
jpayne@69
|
697 Skipped region from reference: cigar=5M1N5M MD=10
|
jpayne@69
|
698 Padded region in the reference: cigar=5M1P5M MD=10
|
jpayne@69
|
699
|
jpayne@69
|
700 Returns
|
jpayne@69
|
701 -------
|
jpayne@69
|
702
|
jpayne@69
|
703 None, if no MD tag is present.
|
jpayne@69
|
704
|
jpayne@69
|
705 """
|
jpayne@69
|
706 if src == NULL:
|
jpayne@69
|
707 return None
|
jpayne@69
|
708
|
jpayne@69
|
709 cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
|
jpayne@69
|
710 if md_tag_ptr == NULL:
|
jpayne@69
|
711 return None
|
jpayne@69
|
712
|
jpayne@69
|
713 cdef uint32_t start = getQueryStart(src)
|
jpayne@69
|
714 cdef uint32_t end = getQueryEnd(src)
|
jpayne@69
|
715 # get read sequence, taking into account soft-clipping
|
jpayne@69
|
716 r = getSequenceInRange(src, start, end)
|
jpayne@69
|
717 cdef char * read_sequence = r
|
jpayne@69
|
718 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
|
jpayne@69
|
719 if cigar_p == NULL:
|
jpayne@69
|
720 return None
|
jpayne@69
|
721
|
jpayne@69
|
722 cdef uint32_t r_idx = 0
|
jpayne@69
|
723 cdef int op
|
jpayne@69
|
724 cdef uint32_t k, i, l, x
|
jpayne@69
|
725 cdef int nmatches = 0
|
jpayne@69
|
726 cdef int s_idx = 0
|
jpayne@69
|
727
|
jpayne@69
|
728 cdef uint32_t max_len = get_alignment_length(src)
|
jpayne@69
|
729 if max_len == 0:
|
jpayne@69
|
730 raise ValueError("could not determine alignment length")
|
jpayne@69
|
731
|
jpayne@69
|
732 cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
|
jpayne@69
|
733 if s == NULL:
|
jpayne@69
|
734 raise ValueError(
|
jpayne@69
|
735 "could not allocate sequence of length %i" % max_len)
|
jpayne@69
|
736
|
jpayne@69
|
737 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
738 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
739 l = cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
740 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
|
jpayne@69
|
741 for i from 0 <= i < l:
|
jpayne@69
|
742 s[s_idx] = read_sequence[r_idx]
|
jpayne@69
|
743 r_idx += 1
|
jpayne@69
|
744 s_idx += 1
|
jpayne@69
|
745 elif op == BAM_CDEL:
|
jpayne@69
|
746 for i from 0 <= i < l:
|
jpayne@69
|
747 s[s_idx] = b'-'
|
jpayne@69
|
748 s_idx += 1
|
jpayne@69
|
749 elif op == BAM_CREF_SKIP:
|
jpayne@69
|
750 pass
|
jpayne@69
|
751 elif op == BAM_CINS or op == BAM_CPAD:
|
jpayne@69
|
752 for i from 0 <= i < l:
|
jpayne@69
|
753 # encode insertions into reference as lowercase
|
jpayne@69
|
754 s[s_idx] = read_sequence[r_idx] + 32
|
jpayne@69
|
755 r_idx += 1
|
jpayne@69
|
756 s_idx += 1
|
jpayne@69
|
757 elif op == BAM_CSOFT_CLIP:
|
jpayne@69
|
758 pass
|
jpayne@69
|
759 elif op == BAM_CHARD_CLIP:
|
jpayne@69
|
760 pass # advances neither
|
jpayne@69
|
761
|
jpayne@69
|
762 cdef char md_buffer[2]
|
jpayne@69
|
763 cdef char *md_tag
|
jpayne@69
|
764 cdef uint8_t md_typecode = md_tag_ptr[0]
|
jpayne@69
|
765 if md_typecode == b'Z':
|
jpayne@69
|
766 md_tag = bam_aux2Z(md_tag_ptr)
|
jpayne@69
|
767 elif md_typecode == b'A':
|
jpayne@69
|
768 # Work around HTSeq bug that writes 1-character strings as MD:A:v
|
jpayne@69
|
769 md_buffer[0] = bam_aux2A(md_tag_ptr)
|
jpayne@69
|
770 md_buffer[1] = b'\0'
|
jpayne@69
|
771 md_tag = md_buffer
|
jpayne@69
|
772 else:
|
jpayne@69
|
773 raise TypeError('Tagged field MD:{}:<value> does not have expected type MD:Z'.format(chr(md_typecode)))
|
jpayne@69
|
774
|
jpayne@69
|
775 cdef int md_idx = 0
|
jpayne@69
|
776 cdef char c
|
jpayne@69
|
777 s_idx = 0
|
jpayne@69
|
778
|
jpayne@69
|
779 # Check if MD tag is valid by matching CIGAR length to MD tag defined length
|
jpayne@69
|
780 # Insertions would be in addition to what is described by MD, so we calculate
|
jpayne@69
|
781 # the number of insertions separately.
|
jpayne@69
|
782 cdef int insertions = 0
|
jpayne@69
|
783
|
jpayne@69
|
784 while s[s_idx] != 0:
|
jpayne@69
|
785 if s[s_idx] >= b'a':
|
jpayne@69
|
786 insertions += 1
|
jpayne@69
|
787 s_idx += 1
|
jpayne@69
|
788 s_idx = 0
|
jpayne@69
|
789
|
jpayne@69
|
790 cdef uint32_t md_len = get_md_reference_length(md_tag)
|
jpayne@69
|
791 if md_len + insertions > max_len:
|
jpayne@69
|
792 free(s)
|
jpayne@69
|
793 raise AssertionError(
|
jpayne@69
|
794 "Invalid MD tag: MD length {} mismatch with CIGAR length {} and {} insertions".format(
|
jpayne@69
|
795 md_len, max_len, insertions))
|
jpayne@69
|
796
|
jpayne@69
|
797 while md_tag[md_idx] != 0:
|
jpayne@69
|
798 # c is numerical
|
jpayne@69
|
799 if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
|
jpayne@69
|
800 nmatches *= 10
|
jpayne@69
|
801 nmatches += md_tag[md_idx] - 48
|
jpayne@69
|
802 md_idx += 1
|
jpayne@69
|
803 continue
|
jpayne@69
|
804 else:
|
jpayne@69
|
805 # save matches up to this point, skipping insertions
|
jpayne@69
|
806 for x from 0 <= x < nmatches:
|
jpayne@69
|
807 while s[s_idx] >= b'a':
|
jpayne@69
|
808 s_idx += 1
|
jpayne@69
|
809 s_idx += 1
|
jpayne@69
|
810 while s[s_idx] >= b'a':
|
jpayne@69
|
811 s_idx += 1
|
jpayne@69
|
812
|
jpayne@69
|
813 r_idx += nmatches
|
jpayne@69
|
814 nmatches = 0
|
jpayne@69
|
815 if md_tag[md_idx] == b'^':
|
jpayne@69
|
816 md_idx += 1
|
jpayne@69
|
817 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
|
jpayne@69
|
818 # assert s[s_idx] == '-'
|
jpayne@69
|
819 s[s_idx] = md_tag[md_idx]
|
jpayne@69
|
820 s_idx += 1
|
jpayne@69
|
821 md_idx += 1
|
jpayne@69
|
822 else:
|
jpayne@69
|
823 # save mismatch
|
jpayne@69
|
824 # enforce lower case
|
jpayne@69
|
825 c = md_tag[md_idx]
|
jpayne@69
|
826 if c <= 90:
|
jpayne@69
|
827 c += 32
|
jpayne@69
|
828 s[s_idx] = c
|
jpayne@69
|
829 s_idx += 1
|
jpayne@69
|
830 r_idx += 1
|
jpayne@69
|
831 md_idx += 1
|
jpayne@69
|
832
|
jpayne@69
|
833 # save matches up to this point, skipping insertions
|
jpayne@69
|
834 for x from 0 <= x < nmatches:
|
jpayne@69
|
835 while s[s_idx] >= b'a':
|
jpayne@69
|
836 s_idx += 1
|
jpayne@69
|
837 s_idx += 1
|
jpayne@69
|
838 while s[s_idx] >= b'a':
|
jpayne@69
|
839 s_idx += 1
|
jpayne@69
|
840
|
jpayne@69
|
841 seq = PyBytes_FromStringAndSize(s, s_idx)
|
jpayne@69
|
842 free(s)
|
jpayne@69
|
843
|
jpayne@69
|
844 return seq
|
jpayne@69
|
845
|
jpayne@69
|
846
|
jpayne@69
|
847 cdef inline bytes build_reference_sequence(bam1_t * src):
|
jpayne@69
|
848 """return the reference sequence in the region that is covered by the
|
jpayne@69
|
849 alignment of the read to the reference.
|
jpayne@69
|
850
|
jpayne@69
|
851 This method requires the MD tag to be set.
|
jpayne@69
|
852
|
jpayne@69
|
853 """
|
jpayne@69
|
854 cdef uint32_t k, i, l
|
jpayne@69
|
855 cdef int op
|
jpayne@69
|
856 cdef int s_idx = 0
|
jpayne@69
|
857 ref_seq = build_alignment_sequence(src)
|
jpayne@69
|
858 if ref_seq is None:
|
jpayne@69
|
859 raise ValueError("MD tag not present")
|
jpayne@69
|
860
|
jpayne@69
|
861 cdef char * s = <char*>calloc(len(ref_seq) + 1, sizeof(char))
|
jpayne@69
|
862 if s == NULL:
|
jpayne@69
|
863 raise ValueError(
|
jpayne@69
|
864 "could not allocate sequence of length %i" % len(ref_seq))
|
jpayne@69
|
865
|
jpayne@69
|
866 cdef char * cref_seq = ref_seq
|
jpayne@69
|
867 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
|
jpayne@69
|
868 cdef uint32_t r_idx = 0
|
jpayne@69
|
869 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
870 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
871 l = cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
872 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
|
jpayne@69
|
873 for i from 0 <= i < l:
|
jpayne@69
|
874 s[s_idx] = cref_seq[r_idx]
|
jpayne@69
|
875 r_idx += 1
|
jpayne@69
|
876 s_idx += 1
|
jpayne@69
|
877 elif op == BAM_CDEL:
|
jpayne@69
|
878 for i from 0 <= i < l:
|
jpayne@69
|
879 s[s_idx] = cref_seq[r_idx]
|
jpayne@69
|
880 r_idx += 1
|
jpayne@69
|
881 s_idx += 1
|
jpayne@69
|
882 elif op == BAM_CREF_SKIP:
|
jpayne@69
|
883 pass
|
jpayne@69
|
884 elif op == BAM_CINS or op == BAM_CPAD:
|
jpayne@69
|
885 r_idx += l
|
jpayne@69
|
886 elif op == BAM_CSOFT_CLIP:
|
jpayne@69
|
887 pass
|
jpayne@69
|
888 elif op == BAM_CHARD_CLIP:
|
jpayne@69
|
889 pass # advances neither
|
jpayne@69
|
890
|
jpayne@69
|
891 seq = PyBytes_FromStringAndSize(s, s_idx)
|
jpayne@69
|
892 free(s)
|
jpayne@69
|
893
|
jpayne@69
|
894 return seq
|
jpayne@69
|
895
|
jpayne@69
|
896
|
jpayne@69
|
897 cdef inline str safe_reference_name(AlignmentHeader header, int tid):
|
jpayne@69
|
898 if tid == -1: return "*"
|
jpayne@69
|
899 elif header is not None: return header.get_reference_name(tid)
|
jpayne@69
|
900 else: return f"#{tid}"
|
jpayne@69
|
901
|
jpayne@69
|
902
|
jpayne@69
|
903 # Tuple-building helper functions used by AlignedSegment.get_aligned_pairs()
|
jpayne@69
|
904
|
jpayne@69
|
905 cdef _alignedpairs_positions(qpos, pos, ref_seq, uint32_t r_idx, int op):
|
jpayne@69
|
906 return (qpos, pos)
|
jpayne@69
|
907
|
jpayne@69
|
908
|
jpayne@69
|
909 cdef _alignedpairs_with_seq(qpos, pos, ref_seq, uint32_t r_idx, int op):
|
jpayne@69
|
910 ref_base = ref_seq[r_idx] if ref_seq is not None else None
|
jpayne@69
|
911 return (qpos, pos, ref_base)
|
jpayne@69
|
912
|
jpayne@69
|
913
|
jpayne@69
|
914 cdef _alignedpairs_with_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
|
jpayne@69
|
915 return (qpos, pos, CIGAR_OPS(op))
|
jpayne@69
|
916
|
jpayne@69
|
917
|
jpayne@69
|
918 cdef _alignedpairs_with_seq_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
|
jpayne@69
|
919 ref_base = ref_seq[r_idx] if ref_seq is not None else None
|
jpayne@69
|
920 return (qpos, pos, ref_base, CIGAR_OPS(op))
|
jpayne@69
|
921
|
jpayne@69
|
922
|
jpayne@69
|
923 cdef class AlignedSegment:
|
jpayne@69
|
924 '''Class representing an aligned segment.
|
jpayne@69
|
925
|
jpayne@69
|
926 This class stores a handle to the samtools C-structure representing
|
jpayne@69
|
927 an aligned read. Member read access is forwarded to the C-structure
|
jpayne@69
|
928 and converted into python objects. This implementation should be fast,
|
jpayne@69
|
929 as only the data needed is converted.
|
jpayne@69
|
930
|
jpayne@69
|
931 For write access, the C-structure is updated in-place. This is
|
jpayne@69
|
932 not the most efficient way to build BAM entries, as the variable
|
jpayne@69
|
933 length data is concatenated and thus needs to be resized if
|
jpayne@69
|
934 a field is updated. Furthermore, the BAM entry might be
|
jpayne@69
|
935 in an inconsistent state.
|
jpayne@69
|
936
|
jpayne@69
|
937 One issue to look out for is that the sequence should always
|
jpayne@69
|
938 be set *before* the quality scores. Setting the sequence will
|
jpayne@69
|
939 also erase any quality scores that were set previously.
|
jpayne@69
|
940
|
jpayne@69
|
941 Parameters
|
jpayne@69
|
942 ----------
|
jpayne@69
|
943
|
jpayne@69
|
944 header:
|
jpayne@69
|
945 :class:`~pysam.AlignmentHeader` object to map numerical
|
jpayne@69
|
946 identifiers to chromosome names. If not given, an empty
|
jpayne@69
|
947 header is created.
|
jpayne@69
|
948 '''
|
jpayne@69
|
949
|
jpayne@69
|
950 # Now only called when instances are created from Python
|
jpayne@69
|
951 def __init__(self, AlignmentHeader header=None):
|
jpayne@69
|
952 # see bam_init1
|
jpayne@69
|
953 self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
|
jpayne@69
|
954 if self._delegate == NULL:
|
jpayne@69
|
955 raise MemoryError("could not allocated memory of {} bytes".format(sizeof(bam1_t)))
|
jpayne@69
|
956 # allocate some memory. If size is 0, calloc does not return a
|
jpayne@69
|
957 # pointer that can be passed to free() so allocate 40 bytes
|
jpayne@69
|
958 # for a new read
|
jpayne@69
|
959 self._delegate.m_data = 40
|
jpayne@69
|
960 self._delegate.data = <uint8_t *>calloc(
|
jpayne@69
|
961 self._delegate.m_data, 1)
|
jpayne@69
|
962 if self._delegate.data == NULL:
|
jpayne@69
|
963 raise MemoryError("could not allocate memory of {} bytes".format(self._delegate.m_data))
|
jpayne@69
|
964 self._delegate.l_data = 0
|
jpayne@69
|
965 # set some data to make read approximately legit.
|
jpayne@69
|
966 # Note, SAM writing fails with q_name of length 0
|
jpayne@69
|
967 self._delegate.core.l_qname = 0
|
jpayne@69
|
968 self._delegate.core.tid = -1
|
jpayne@69
|
969 self._delegate.core.pos = -1
|
jpayne@69
|
970 self._delegate.core.mtid = -1
|
jpayne@69
|
971 self._delegate.core.mpos = -1
|
jpayne@69
|
972
|
jpayne@69
|
973 # caching for selected fields
|
jpayne@69
|
974 self.cache_query_qualities = None
|
jpayne@69
|
975 self.cache_query_alignment_qualities = None
|
jpayne@69
|
976 self.cache_query_sequence = None
|
jpayne@69
|
977 self.cache_query_alignment_sequence = None
|
jpayne@69
|
978
|
jpayne@69
|
979 self.header = header
|
jpayne@69
|
980
|
jpayne@69
|
981 def __dealloc__(self):
|
jpayne@69
|
982 bam_destroy1(self._delegate)
|
jpayne@69
|
983
|
jpayne@69
|
984 def __str__(self):
|
jpayne@69
|
985 """return string representation of alignment.
|
jpayne@69
|
986
|
jpayne@69
|
987 The representation is an approximate :term:`SAM` format, because
|
jpayne@69
|
988 an aligned read might not be associated with a :term:`AlignmentFile`.
|
jpayne@69
|
989 Hence when the read does not have an associated :class:`AlignedHeader`,
|
jpayne@69
|
990 :term:`tid` is shown instead of the reference name.
|
jpayne@69
|
991 Similarly, the tags field is returned in its parsed state.
|
jpayne@69
|
992
|
jpayne@69
|
993 To get a valid SAM record, use :meth:`to_string`.
|
jpayne@69
|
994 """
|
jpayne@69
|
995 # sam-parsing is done in sam.c/bam_format1_core which
|
jpayne@69
|
996 # requires a valid header.
|
jpayne@69
|
997 return "\t".join(map(str, (self.query_name,
|
jpayne@69
|
998 self.flag,
|
jpayne@69
|
999 safe_reference_name(self.header, self.reference_id),
|
jpayne@69
|
1000 self.reference_start + 1,
|
jpayne@69
|
1001 self.mapping_quality,
|
jpayne@69
|
1002 self.cigarstring,
|
jpayne@69
|
1003 safe_reference_name(self.header, self.next_reference_id),
|
jpayne@69
|
1004 self.next_reference_start + 1,
|
jpayne@69
|
1005 self.template_length,
|
jpayne@69
|
1006 self.query_sequence,
|
jpayne@69
|
1007 self.query_qualities,
|
jpayne@69
|
1008 self.tags)))
|
jpayne@69
|
1009
|
jpayne@69
|
1010 def __repr__(self):
|
jpayne@69
|
1011 ref = self.reference_name if self.header is not None else self.reference_id
|
jpayne@69
|
1012 return f'<{type(self).__name__}({self.query_name!r}, flags={self.flag}={self.flag:#x}, ref={ref!r}, zpos={self.reference_start}, mapq={self.mapping_quality}, cigar={self.cigarstring!r}, ...)>'
|
jpayne@69
|
1013
|
jpayne@69
|
1014 def __copy__(self):
|
jpayne@69
|
1015 return makeAlignedSegment(self._delegate, self.header)
|
jpayne@69
|
1016
|
jpayne@69
|
1017 def __deepcopy__(self, memo):
|
jpayne@69
|
1018 return makeAlignedSegment(self._delegate, self.header)
|
jpayne@69
|
1019
|
jpayne@69
|
1020 def compare(self, AlignedSegment other):
|
jpayne@69
|
1021 '''return -1,0,1, if contents in this are binary
|
jpayne@69
|
1022 <,=,> to *other*
|
jpayne@69
|
1023 '''
|
jpayne@69
|
1024
|
jpayne@69
|
1025 # avoid segfault when other equals None
|
jpayne@69
|
1026 if other is None:
|
jpayne@69
|
1027 return -1
|
jpayne@69
|
1028
|
jpayne@69
|
1029 cdef int retval, x
|
jpayne@69
|
1030 cdef bam1_t *t
|
jpayne@69
|
1031 cdef bam1_t *o
|
jpayne@69
|
1032
|
jpayne@69
|
1033 t = self._delegate
|
jpayne@69
|
1034 o = other._delegate
|
jpayne@69
|
1035
|
jpayne@69
|
1036 # uncomment for debugging purposes
|
jpayne@69
|
1037 # cdef unsigned char * oo, * tt
|
jpayne@69
|
1038 # tt = <unsigned char*>(&t.core)
|
jpayne@69
|
1039 # oo = <unsigned char*>(&o.core)
|
jpayne@69
|
1040 # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
|
jpayne@69
|
1041 # tt = <unsigned char*>(t.data)
|
jpayne@69
|
1042 # oo = <unsigned char*>(o.data)
|
jpayne@69
|
1043 # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
|
jpayne@69
|
1044
|
jpayne@69
|
1045 # Fast-path test for object identity
|
jpayne@69
|
1046 if t == o:
|
jpayne@69
|
1047 return 0
|
jpayne@69
|
1048
|
jpayne@69
|
1049 cdef uint8_t *a = <uint8_t*>&t.core
|
jpayne@69
|
1050 cdef uint8_t *b = <uint8_t*>&o.core
|
jpayne@69
|
1051
|
jpayne@69
|
1052 retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
|
jpayne@69
|
1053 if retval:
|
jpayne@69
|
1054 return retval
|
jpayne@69
|
1055
|
jpayne@69
|
1056 # cmp(t.l_data, o.l_data)
|
jpayne@69
|
1057 retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
|
jpayne@69
|
1058 if retval:
|
jpayne@69
|
1059 return retval
|
jpayne@69
|
1060 return memcmp(t.data, o.data, t.l_data)
|
jpayne@69
|
1061
|
jpayne@69
|
1062 def __richcmp__(self, AlignedSegment other, int op):
|
jpayne@69
|
1063 if op == 2: # == operator
|
jpayne@69
|
1064 return self.compare(other) == 0
|
jpayne@69
|
1065 elif op == 3: # != operator
|
jpayne@69
|
1066 return self.compare(other) != 0
|
jpayne@69
|
1067 else:
|
jpayne@69
|
1068 return NotImplemented
|
jpayne@69
|
1069
|
jpayne@69
|
1070 def __hash__(self):
|
jpayne@69
|
1071 cdef bam1_t * src = self._delegate
|
jpayne@69
|
1072 cdef int x
|
jpayne@69
|
1073
|
jpayne@69
|
1074 # see http://effbot.org/zone/python-hash.htm
|
jpayne@69
|
1075 cdef uint8_t * c = <uint8_t *>&src.core
|
jpayne@69
|
1076 cdef uint32_t hash_value = c[0]
|
jpayne@69
|
1077 for x from 1 <= x < sizeof(bam1_core_t):
|
jpayne@69
|
1078 hash_value = c_mul(hash_value, 1000003) ^ c[x]
|
jpayne@69
|
1079 c = <uint8_t *>src.data
|
jpayne@69
|
1080 for x from 0 <= x < src.l_data:
|
jpayne@69
|
1081 hash_value = c_mul(hash_value, 1000003) ^ c[x]
|
jpayne@69
|
1082
|
jpayne@69
|
1083 return hash_value
|
jpayne@69
|
1084
|
jpayne@69
|
1085 cpdef to_string(self):
|
jpayne@69
|
1086 """returns a string representation of the aligned segment.
|
jpayne@69
|
1087
|
jpayne@69
|
1088 The output format is valid SAM format if a header is associated
|
jpayne@69
|
1089 with the AlignedSegment.
|
jpayne@69
|
1090 """
|
jpayne@69
|
1091 cdef kstring_t line
|
jpayne@69
|
1092 line.l = line.m = 0
|
jpayne@69
|
1093 line.s = NULL
|
jpayne@69
|
1094
|
jpayne@69
|
1095 if self.header:
|
jpayne@69
|
1096 if sam_format1(self.header.ptr, self._delegate, &line) < 0:
|
jpayne@69
|
1097 if line.m:
|
jpayne@69
|
1098 free(line.s)
|
jpayne@69
|
1099 raise ValueError('sam_format failed')
|
jpayne@69
|
1100 else:
|
jpayne@69
|
1101 raise NotImplementedError("todo")
|
jpayne@69
|
1102
|
jpayne@69
|
1103 ret = force_str(line.s[:line.l])
|
jpayne@69
|
1104
|
jpayne@69
|
1105 if line.m:
|
jpayne@69
|
1106 free(line.s)
|
jpayne@69
|
1107
|
jpayne@69
|
1108 return ret
|
jpayne@69
|
1109
|
jpayne@69
|
1110 @classmethod
|
jpayne@69
|
1111 def fromstring(cls, sam, AlignmentHeader header):
|
jpayne@69
|
1112 """parses a string representation of the aligned segment.
|
jpayne@69
|
1113
|
jpayne@69
|
1114 The input format should be valid SAM format.
|
jpayne@69
|
1115
|
jpayne@69
|
1116 Parameters
|
jpayne@69
|
1117 ----------
|
jpayne@69
|
1118 sam:
|
jpayne@69
|
1119 :term:`SAM` formatted string
|
jpayne@69
|
1120
|
jpayne@69
|
1121 """
|
jpayne@69
|
1122 cdef AlignedSegment dest = cls.__new__(cls)
|
jpayne@69
|
1123 dest._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
|
jpayne@69
|
1124 dest.header = header
|
jpayne@69
|
1125
|
jpayne@69
|
1126 cdef kstring_t line
|
jpayne@69
|
1127 line.l = line.m = len(sam)
|
jpayne@69
|
1128 _sam = force_bytes(sam)
|
jpayne@69
|
1129 line.s = _sam
|
jpayne@69
|
1130
|
jpayne@69
|
1131 cdef int ret
|
jpayne@69
|
1132 ret = sam_parse1(&line, dest.header.ptr, dest._delegate)
|
jpayne@69
|
1133 if ret < 0:
|
jpayne@69
|
1134 raise ValueError("parsing SAM record string failed (error code {})".format(ret))
|
jpayne@69
|
1135
|
jpayne@69
|
1136 return dest
|
jpayne@69
|
1137
|
jpayne@69
|
1138 cpdef tostring(self, htsfile=None):
|
jpayne@69
|
1139 """deprecated, use :meth:`to_string()` instead.
|
jpayne@69
|
1140
|
jpayne@69
|
1141 Parameters
|
jpayne@69
|
1142 ----------
|
jpayne@69
|
1143
|
jpayne@69
|
1144 htsfile:
|
jpayne@69
|
1145 (deprecated) AlignmentFile object to map numerical
|
jpayne@69
|
1146 identifiers to chromosome names. This parameter is present
|
jpayne@69
|
1147 for backwards compatibility and ignored.
|
jpayne@69
|
1148 """
|
jpayne@69
|
1149
|
jpayne@69
|
1150 return self.to_string()
|
jpayne@69
|
1151
|
jpayne@69
|
1152 def to_dict(self):
|
jpayne@69
|
1153 """returns a json representation of the aligned segment.
|
jpayne@69
|
1154
|
jpayne@69
|
1155 Field names are abbreviated versions of the class attributes.
|
jpayne@69
|
1156 """
|
jpayne@69
|
1157 # let htslib do the string conversions, but treat optional field properly as list
|
jpayne@69
|
1158 vals = self.to_string().split("\t")
|
jpayne@69
|
1159 n = len(KEY_NAMES) - 1
|
jpayne@69
|
1160 return dict(list(zip(KEY_NAMES[:-1], vals[:n])) + [(KEY_NAMES[-1], vals[n:])])
|
jpayne@69
|
1161
|
jpayne@69
|
1162 @classmethod
|
jpayne@69
|
1163 def from_dict(cls, sam_dict, AlignmentHeader header):
|
jpayne@69
|
1164 """parses a dictionary representation of the aligned segment.
|
jpayne@69
|
1165
|
jpayne@69
|
1166 Parameters
|
jpayne@69
|
1167 ----------
|
jpayne@69
|
1168 sam_dict:
|
jpayne@69
|
1169 dictionary of alignment values, keys corresponding to output from
|
jpayne@69
|
1170 :meth:`todict()`.
|
jpayne@69
|
1171
|
jpayne@69
|
1172 """
|
jpayne@69
|
1173 # let htslib do the parsing
|
jpayne@69
|
1174 # the tags field can be missing
|
jpayne@69
|
1175 return cls.fromstring(
|
jpayne@69
|
1176 "\t".join((sam_dict[x] for x in KEY_NAMES[:-1])) +
|
jpayne@69
|
1177 "\t" +
|
jpayne@69
|
1178 "\t".join(sam_dict.get(KEY_NAMES[-1], [])), header)
|
jpayne@69
|
1179
|
jpayne@69
|
1180 ########################################################
|
jpayne@69
|
1181 ## Basic attributes in order of appearance in SAM format
|
jpayne@69
|
1182 property query_name:
|
jpayne@69
|
1183 """the query template name (None if not present)"""
|
jpayne@69
|
1184 def __get__(self):
|
jpayne@69
|
1185
|
jpayne@69
|
1186 cdef bam1_t * src = self._delegate
|
jpayne@69
|
1187 if src.core.l_qname == 0:
|
jpayne@69
|
1188 return None
|
jpayne@69
|
1189
|
jpayne@69
|
1190 return charptr_to_str(<char *>pysam_bam_get_qname(src))
|
jpayne@69
|
1191
|
jpayne@69
|
1192 def __set__(self, qname):
|
jpayne@69
|
1193
|
jpayne@69
|
1194 if qname is None or len(qname) == 0:
|
jpayne@69
|
1195 return
|
jpayne@69
|
1196
|
jpayne@69
|
1197 if len(qname) > 254:
|
jpayne@69
|
1198 raise ValueError("query length out of range {} > 254".format(
|
jpayne@69
|
1199 len(qname)))
|
jpayne@69
|
1200
|
jpayne@69
|
1201 qname = force_bytes(qname)
|
jpayne@69
|
1202 cdef bam1_t * src = self._delegate
|
jpayne@69
|
1203 # the qname is \0 terminated
|
jpayne@69
|
1204 cdef uint8_t l = len(qname) + 1
|
jpayne@69
|
1205
|
jpayne@69
|
1206 cdef char * p = pysam_bam_get_qname(src)
|
jpayne@69
|
1207 cdef uint8_t l_extranul = 0
|
jpayne@69
|
1208
|
jpayne@69
|
1209 if l % 4 != 0:
|
jpayne@69
|
1210 l_extranul = 4 - l % 4
|
jpayne@69
|
1211
|
jpayne@69
|
1212 cdef bam1_t * retval = pysam_bam_update(src,
|
jpayne@69
|
1213 src.core.l_qname,
|
jpayne@69
|
1214 l + l_extranul,
|
jpayne@69
|
1215 <uint8_t*>p)
|
jpayne@69
|
1216 if retval == NULL:
|
jpayne@69
|
1217 raise MemoryError("could not allocate memory")
|
jpayne@69
|
1218
|
jpayne@69
|
1219 src.core.l_extranul = l_extranul
|
jpayne@69
|
1220 src.core.l_qname = l + l_extranul
|
jpayne@69
|
1221
|
jpayne@69
|
1222 # re-acquire pointer to location in memory
|
jpayne@69
|
1223 # as it might have moved
|
jpayne@69
|
1224 p = pysam_bam_get_qname(src)
|
jpayne@69
|
1225
|
jpayne@69
|
1226 strncpy(p, qname, l)
|
jpayne@69
|
1227 # x might be > 255
|
jpayne@69
|
1228 cdef uint16_t x = 0
|
jpayne@69
|
1229
|
jpayne@69
|
1230 for x from l <= x < l + l_extranul:
|
jpayne@69
|
1231 p[x] = b'\0'
|
jpayne@69
|
1232
|
jpayne@69
|
1233 property flag:
|
jpayne@69
|
1234 """properties flag"""
|
jpayne@69
|
1235 def __get__(self):
|
jpayne@69
|
1236 return self._delegate.core.flag
|
jpayne@69
|
1237 def __set__(self, flag):
|
jpayne@69
|
1238 self._delegate.core.flag = flag
|
jpayne@69
|
1239
|
jpayne@69
|
1240 property reference_name:
|
jpayne@69
|
1241 """:term:`reference` name"""
|
jpayne@69
|
1242 def __get__(self):
|
jpayne@69
|
1243 if self._delegate.core.tid == -1:
|
jpayne@69
|
1244 return None
|
jpayne@69
|
1245 if self.header:
|
jpayne@69
|
1246 return self.header.get_reference_name(self._delegate.core.tid)
|
jpayne@69
|
1247 else:
|
jpayne@69
|
1248 raise ValueError("reference_name unknown if no header associated with record")
|
jpayne@69
|
1249 def __set__(self, reference):
|
jpayne@69
|
1250 cdef int tid
|
jpayne@69
|
1251 if reference is None or reference == "*":
|
jpayne@69
|
1252 self._delegate.core.tid = -1
|
jpayne@69
|
1253 elif self.header:
|
jpayne@69
|
1254 tid = self.header.get_tid(reference)
|
jpayne@69
|
1255 if tid < 0:
|
jpayne@69
|
1256 raise ValueError("reference {} does not exist in header".format(
|
jpayne@69
|
1257 reference))
|
jpayne@69
|
1258 self._delegate.core.tid = tid
|
jpayne@69
|
1259 else:
|
jpayne@69
|
1260 raise ValueError("reference_name can not be set if no header associated with record")
|
jpayne@69
|
1261
|
jpayne@69
|
1262 property reference_id:
|
jpayne@69
|
1263 """:term:`reference` ID
|
jpayne@69
|
1264
|
jpayne@69
|
1265 .. note::
|
jpayne@69
|
1266
|
jpayne@69
|
1267 This field contains the index of the reference sequence in
|
jpayne@69
|
1268 the sequence dictionary. To obtain the name of the
|
jpayne@69
|
1269 reference sequence, use :meth:`get_reference_name()`
|
jpayne@69
|
1270
|
jpayne@69
|
1271 """
|
jpayne@69
|
1272 def __get__(self):
|
jpayne@69
|
1273 return self._delegate.core.tid
|
jpayne@69
|
1274 def __set__(self, tid):
|
jpayne@69
|
1275 if tid != -1 and self.header and not self.header.is_valid_tid(tid):
|
jpayne@69
|
1276 raise ValueError("reference id {} does not exist in header".format(
|
jpayne@69
|
1277 tid))
|
jpayne@69
|
1278 self._delegate.core.tid = tid
|
jpayne@69
|
1279
|
jpayne@69
|
1280 property reference_start:
|
jpayne@69
|
1281 """0-based leftmost coordinate"""
|
jpayne@69
|
1282 def __get__(self):
|
jpayne@69
|
1283 return self._delegate.core.pos
|
jpayne@69
|
1284 def __set__(self, pos):
|
jpayne@69
|
1285 ## setting the position requires updating the "bin" attribute
|
jpayne@69
|
1286 cdef bam1_t * src
|
jpayne@69
|
1287 src = self._delegate
|
jpayne@69
|
1288 src.core.pos = pos
|
jpayne@69
|
1289 update_bin(src)
|
jpayne@69
|
1290
|
jpayne@69
|
1291 property mapping_quality:
|
jpayne@69
|
1292 """mapping quality"""
|
jpayne@69
|
1293 def __get__(self):
|
jpayne@69
|
1294 return pysam_get_qual(self._delegate)
|
jpayne@69
|
1295 def __set__(self, qual):
|
jpayne@69
|
1296 pysam_set_qual(self._delegate, qual)
|
jpayne@69
|
1297
|
jpayne@69
|
1298 property cigarstring:
|
jpayne@69
|
1299 '''the :term:`cigar` alignment as a string.
|
jpayne@69
|
1300
|
jpayne@69
|
1301 The cigar string is a string of alternating integers
|
jpayne@69
|
1302 and characters denoting the length and the type of
|
jpayne@69
|
1303 an operation.
|
jpayne@69
|
1304
|
jpayne@69
|
1305 .. note::
|
jpayne@69
|
1306 The order length,operation is specified in the
|
jpayne@69
|
1307 SAM format. It is different from the order of
|
jpayne@69
|
1308 the :attr:`cigar` property.
|
jpayne@69
|
1309
|
jpayne@69
|
1310 Returns None if not present.
|
jpayne@69
|
1311
|
jpayne@69
|
1312 To unset the cigarstring, assign None or the
|
jpayne@69
|
1313 empty string.
|
jpayne@69
|
1314 '''
|
jpayne@69
|
1315 def __get__(self):
|
jpayne@69
|
1316 c = self.cigartuples
|
jpayne@69
|
1317 if c is None:
|
jpayne@69
|
1318 return None
|
jpayne@69
|
1319 # reverse order
|
jpayne@69
|
1320 else:
|
jpayne@69
|
1321 return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
|
jpayne@69
|
1322
|
jpayne@69
|
1323 def __set__(self, cigar):
|
jpayne@69
|
1324 if cigar is None or len(cigar) == 0:
|
jpayne@69
|
1325 self.cigartuples = []
|
jpayne@69
|
1326 else:
|
jpayne@69
|
1327 parts = CIGAR_REGEX.findall(cigar)
|
jpayne@69
|
1328 # reverse order
|
jpayne@69
|
1329 self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts]
|
jpayne@69
|
1330
|
jpayne@69
|
1331 # TODO
|
jpayne@69
|
1332 # property cigar:
|
jpayne@69
|
1333 # """the cigar alignment"""
|
jpayne@69
|
1334
|
jpayne@69
|
1335 property next_reference_id:
|
jpayne@69
|
1336 """the :term:`reference` id of the mate/next read."""
|
jpayne@69
|
1337 def __get__(self):
|
jpayne@69
|
1338 return self._delegate.core.mtid
|
jpayne@69
|
1339 def __set__(self, mtid):
|
jpayne@69
|
1340 if mtid != -1 and self.header and not self.header.is_valid_tid(mtid):
|
jpayne@69
|
1341 raise ValueError("reference id {} does not exist in header".format(
|
jpayne@69
|
1342 mtid))
|
jpayne@69
|
1343 self._delegate.core.mtid = mtid
|
jpayne@69
|
1344
|
jpayne@69
|
1345 property next_reference_name:
|
jpayne@69
|
1346 """:term:`reference` name of the mate/next read (None if no
|
jpayne@69
|
1347 AlignmentFile is associated)"""
|
jpayne@69
|
1348 def __get__(self):
|
jpayne@69
|
1349 if self._delegate.core.mtid == -1:
|
jpayne@69
|
1350 return None
|
jpayne@69
|
1351 if self.header:
|
jpayne@69
|
1352 return self.header.get_reference_name(self._delegate.core.mtid)
|
jpayne@69
|
1353 else:
|
jpayne@69
|
1354 raise ValueError("next_reference_name unknown if no header associated with record")
|
jpayne@69
|
1355
|
jpayne@69
|
1356 def __set__(self, reference):
|
jpayne@69
|
1357 cdef int mtid
|
jpayne@69
|
1358 if reference is None or reference == "*":
|
jpayne@69
|
1359 self._delegate.core.mtid = -1
|
jpayne@69
|
1360 elif reference == "=":
|
jpayne@69
|
1361 self._delegate.core.mtid = self._delegate.core.tid
|
jpayne@69
|
1362 elif self.header:
|
jpayne@69
|
1363 mtid = self.header.get_tid(reference)
|
jpayne@69
|
1364 if mtid < 0:
|
jpayne@69
|
1365 raise ValueError("reference {} does not exist in header".format(
|
jpayne@69
|
1366 reference))
|
jpayne@69
|
1367 self._delegate.core.mtid = mtid
|
jpayne@69
|
1368 else:
|
jpayne@69
|
1369 raise ValueError("next_reference_name can not be set if no header associated with record")
|
jpayne@69
|
1370
|
jpayne@69
|
1371 property next_reference_start:
|
jpayne@69
|
1372 """the position of the mate/next read."""
|
jpayne@69
|
1373 def __get__(self):
|
jpayne@69
|
1374 return self._delegate.core.mpos
|
jpayne@69
|
1375 def __set__(self, mpos):
|
jpayne@69
|
1376 self._delegate.core.mpos = mpos
|
jpayne@69
|
1377
|
jpayne@69
|
1378 property query_length:
|
jpayne@69
|
1379 """the length of the query/read.
|
jpayne@69
|
1380
|
jpayne@69
|
1381 This value corresponds to the length of the sequence supplied
|
jpayne@69
|
1382 in the BAM/SAM file. The length of a query is 0 if there is no
|
jpayne@69
|
1383 sequence in the BAM/SAM file. In those cases, the read length
|
jpayne@69
|
1384 can be inferred from the CIGAR alignment, see
|
jpayne@69
|
1385 :meth:`pysam.AlignedSegment.infer_query_length`.
|
jpayne@69
|
1386
|
jpayne@69
|
1387 The length includes soft-clipped bases and is equal to
|
jpayne@69
|
1388 ``len(query_sequence)``.
|
jpayne@69
|
1389
|
jpayne@69
|
1390 This property is read-only but is updated when a new query
|
jpayne@69
|
1391 sequence is assigned to this AlignedSegment.
|
jpayne@69
|
1392
|
jpayne@69
|
1393 Returns 0 if not available.
|
jpayne@69
|
1394
|
jpayne@69
|
1395 """
|
jpayne@69
|
1396 def __get__(self):
|
jpayne@69
|
1397 return self._delegate.core.l_qseq
|
jpayne@69
|
1398
|
jpayne@69
|
1399 property template_length:
|
jpayne@69
|
1400 """the observed query template length"""
|
jpayne@69
|
1401 def __get__(self):
|
jpayne@69
|
1402 return self._delegate.core.isize
|
jpayne@69
|
1403 def __set__(self, isize):
|
jpayne@69
|
1404 self._delegate.core.isize = isize
|
jpayne@69
|
1405
|
jpayne@69
|
1406 property query_sequence:
|
jpayne@69
|
1407 """read sequence bases, including :term:`soft clipped` bases
|
jpayne@69
|
1408 (None if not present).
|
jpayne@69
|
1409
|
jpayne@69
|
1410 Assigning to this attribute will invalidate any quality scores.
|
jpayne@69
|
1411 Thus, to in-place edit the sequence and quality scores, copies of
|
jpayne@69
|
1412 the quality scores need to be taken. Consider trimming for example::
|
jpayne@69
|
1413
|
jpayne@69
|
1414 q = read.query_qualities
|
jpayne@69
|
1415 read.query_sequence = read.query_sequence[5:10]
|
jpayne@69
|
1416 read.query_qualities = q[5:10]
|
jpayne@69
|
1417
|
jpayne@69
|
1418 The sequence is returned as it is stored in the BAM file. (This will
|
jpayne@69
|
1419 be the reverse complement of the original read sequence if the mapper
|
jpayne@69
|
1420 has aligned the read to the reverse strand.)
|
jpayne@69
|
1421 """
|
jpayne@69
|
1422 def __get__(self):
|
jpayne@69
|
1423 if self.cache_query_sequence:
|
jpayne@69
|
1424 return self.cache_query_sequence
|
jpayne@69
|
1425
|
jpayne@69
|
1426 cdef bam1_t * src
|
jpayne@69
|
1427 cdef char * s
|
jpayne@69
|
1428 src = self._delegate
|
jpayne@69
|
1429
|
jpayne@69
|
1430 if src.core.l_qseq == 0:
|
jpayne@69
|
1431 return None
|
jpayne@69
|
1432
|
jpayne@69
|
1433 self.cache_query_sequence = force_str(getSequenceInRange(
|
jpayne@69
|
1434 src, 0, src.core.l_qseq))
|
jpayne@69
|
1435 return self.cache_query_sequence
|
jpayne@69
|
1436
|
jpayne@69
|
1437 def __set__(self, seq):
|
jpayne@69
|
1438 # samtools manages sequence and quality length memory together
|
jpayne@69
|
1439 # if no quality information is present, the first byte says 0xff.
|
jpayne@69
|
1440 cdef bam1_t * src
|
jpayne@69
|
1441 cdef uint8_t * p
|
jpayne@69
|
1442 cdef char * s
|
jpayne@69
|
1443 cdef int l, k
|
jpayne@69
|
1444 cdef Py_ssize_t nbytes_new, nbytes_old
|
jpayne@69
|
1445
|
jpayne@69
|
1446 if seq == None:
|
jpayne@69
|
1447 l = 0
|
jpayne@69
|
1448 else:
|
jpayne@69
|
1449 l = len(seq)
|
jpayne@69
|
1450 seq = force_bytes(seq)
|
jpayne@69
|
1451
|
jpayne@69
|
1452 src = self._delegate
|
jpayne@69
|
1453
|
jpayne@69
|
1454 # as the sequence is stored in half-bytes, the total length (sequence
|
jpayne@69
|
1455 # plus quality scores) is (l+1)/2 + l
|
jpayne@69
|
1456 nbytes_new = (l + 1) // 2 + l
|
jpayne@69
|
1457 nbytes_old = (src.core.l_qseq + 1) // 2 + src.core.l_qseq
|
jpayne@69
|
1458
|
jpayne@69
|
1459 # acquire pointer to location in memory
|
jpayne@69
|
1460 p = pysam_bam_get_seq(src)
|
jpayne@69
|
1461 src.core.l_qseq = l
|
jpayne@69
|
1462
|
jpayne@69
|
1463 # change length of data field
|
jpayne@69
|
1464 cdef bam1_t * retval = pysam_bam_update(src,
|
jpayne@69
|
1465 nbytes_old,
|
jpayne@69
|
1466 nbytes_new,
|
jpayne@69
|
1467 p)
|
jpayne@69
|
1468
|
jpayne@69
|
1469 if retval == NULL:
|
jpayne@69
|
1470 raise MemoryError("could not allocate memory")
|
jpayne@69
|
1471
|
jpayne@69
|
1472 if l > 0:
|
jpayne@69
|
1473 # re-acquire pointer to location in memory
|
jpayne@69
|
1474 # as it might have moved
|
jpayne@69
|
1475 p = pysam_bam_get_seq(src)
|
jpayne@69
|
1476 for k from 0 <= k < nbytes_new:
|
jpayne@69
|
1477 p[k] = 0
|
jpayne@69
|
1478 # convert to C string
|
jpayne@69
|
1479 s = seq
|
jpayne@69
|
1480 for k from 0 <= k < l:
|
jpayne@69
|
1481 p[k // 2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
|
jpayne@69
|
1482
|
jpayne@69
|
1483 # erase qualities
|
jpayne@69
|
1484 p = pysam_bam_get_qual(src)
|
jpayne@69
|
1485 memset(p, 0xff, l)
|
jpayne@69
|
1486
|
jpayne@69
|
1487 self.cache_query_sequence = force_str(seq)
|
jpayne@69
|
1488
|
jpayne@69
|
1489 # clear cached values for quality values
|
jpayne@69
|
1490 self.cache_query_qualities = None
|
jpayne@69
|
1491 self.cache_query_alignment_qualities = None
|
jpayne@69
|
1492
|
jpayne@69
|
1493 property query_qualities:
|
jpayne@69
|
1494 """read sequence base qualities, including :term:`soft clipped` bases
|
jpayne@69
|
1495 (None if not present).
|
jpayne@69
|
1496
|
jpayne@69
|
1497 Quality scores are returned as a python array of unsigned
|
jpayne@69
|
1498 chars. Note that this is not the ASCII-encoded value typically
|
jpayne@69
|
1499 seen in FASTQ or SAM formatted files. Thus, no offset of 33
|
jpayne@69
|
1500 needs to be subtracted.
|
jpayne@69
|
1501
|
jpayne@69
|
1502 Note that to set quality scores the sequence has to be set
|
jpayne@69
|
1503 beforehand as this will determine the expected length of the
|
jpayne@69
|
1504 quality score array.
|
jpayne@69
|
1505
|
jpayne@69
|
1506 This method raises a ValueError if the length of the
|
jpayne@69
|
1507 quality scores and the sequence are not the same.
|
jpayne@69
|
1508
|
jpayne@69
|
1509 """
|
jpayne@69
|
1510 def __get__(self):
|
jpayne@69
|
1511
|
jpayne@69
|
1512 if self.cache_query_qualities:
|
jpayne@69
|
1513 return self.cache_query_qualities
|
jpayne@69
|
1514
|
jpayne@69
|
1515 cdef bam1_t * src
|
jpayne@69
|
1516 cdef char * q
|
jpayne@69
|
1517
|
jpayne@69
|
1518 src = self._delegate
|
jpayne@69
|
1519
|
jpayne@69
|
1520 if src.core.l_qseq == 0:
|
jpayne@69
|
1521 return None
|
jpayne@69
|
1522
|
jpayne@69
|
1523 self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
|
jpayne@69
|
1524 return self.cache_query_qualities
|
jpayne@69
|
1525
|
jpayne@69
|
1526 def __set__(self, qual):
|
jpayne@69
|
1527
|
jpayne@69
|
1528 # note that memory is already allocated via setting the sequence
|
jpayne@69
|
1529 # hence length match of sequence and quality needs is checked.
|
jpayne@69
|
1530 cdef bam1_t * src
|
jpayne@69
|
1531 cdef uint8_t * p
|
jpayne@69
|
1532 cdef int l
|
jpayne@69
|
1533
|
jpayne@69
|
1534 src = self._delegate
|
jpayne@69
|
1535 p = pysam_bam_get_qual(src)
|
jpayne@69
|
1536 if qual is None or len(qual) == 0:
|
jpayne@69
|
1537 # if absent and there is a sequence: set to 0xff
|
jpayne@69
|
1538 memset(p, 0xff, src.core.l_qseq)
|
jpayne@69
|
1539 return
|
jpayne@69
|
1540
|
jpayne@69
|
1541 # check for length match
|
jpayne@69
|
1542 l = len(qual)
|
jpayne@69
|
1543 if src.core.l_qseq != l:
|
jpayne@69
|
1544 raise ValueError(
|
jpayne@69
|
1545 "quality and sequence mismatch: %i != %i" %
|
jpayne@69
|
1546 (l, src.core.l_qseq))
|
jpayne@69
|
1547
|
jpayne@69
|
1548 # create a python array object filling it
|
jpayne@69
|
1549 # with the quality scores
|
jpayne@69
|
1550
|
jpayne@69
|
1551 # NB: should avoid this copying if qual is
|
jpayne@69
|
1552 # already of the correct type.
|
jpayne@69
|
1553 cdef c_array.array result = c_array.array('B', qual)
|
jpayne@69
|
1554
|
jpayne@69
|
1555 # copy data
|
jpayne@69
|
1556 memcpy(p, result.data.as_voidptr, l)
|
jpayne@69
|
1557
|
jpayne@69
|
1558 # save in cache
|
jpayne@69
|
1559 self.cache_query_qualities = qual
|
jpayne@69
|
1560
|
jpayne@69
|
1561 property bin:
|
jpayne@69
|
1562 """properties bin"""
|
jpayne@69
|
1563 def __get__(self):
|
jpayne@69
|
1564 return self._delegate.core.bin
|
jpayne@69
|
1565 def __set__(self, bin):
|
jpayne@69
|
1566 self._delegate.core.bin = bin
|
jpayne@69
|
1567
|
jpayne@69
|
1568
|
jpayne@69
|
1569 ##########################################################
|
jpayne@69
|
1570 # Derived simple attributes. These are simple attributes of
|
jpayne@69
|
1571 # AlignedSegment getting and setting values.
|
jpayne@69
|
1572 ##########################################################
|
jpayne@69
|
1573 # 1. Flags
|
jpayne@69
|
1574 ##########################################################
|
jpayne@69
|
1575 property is_paired:
|
jpayne@69
|
1576 """true if read is paired in sequencing"""
|
jpayne@69
|
1577 def __get__(self):
|
jpayne@69
|
1578 return (self.flag & BAM_FPAIRED) != 0
|
jpayne@69
|
1579 def __set__(self,val):
|
jpayne@69
|
1580 pysam_update_flag(self._delegate, val, BAM_FPAIRED)
|
jpayne@69
|
1581
|
jpayne@69
|
1582 property is_proper_pair:
|
jpayne@69
|
1583 """true if read is mapped in a proper pair"""
|
jpayne@69
|
1584 def __get__(self):
|
jpayne@69
|
1585 return (self.flag & BAM_FPROPER_PAIR) != 0
|
jpayne@69
|
1586 def __set__(self,val):
|
jpayne@69
|
1587 pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
|
jpayne@69
|
1588
|
jpayne@69
|
1589 property is_unmapped:
|
jpayne@69
|
1590 """true if read itself is unmapped"""
|
jpayne@69
|
1591 def __get__(self):
|
jpayne@69
|
1592 return (self.flag & BAM_FUNMAP) != 0
|
jpayne@69
|
1593 def __set__(self, val):
|
jpayne@69
|
1594 pysam_update_flag(self._delegate, val, BAM_FUNMAP)
|
jpayne@69
|
1595 # setting the unmapped flag requires recalculation of
|
jpayne@69
|
1596 # bin as alignment length is now implicitly 1
|
jpayne@69
|
1597 update_bin(self._delegate)
|
jpayne@69
|
1598
|
jpayne@69
|
1599 property is_mapped:
|
jpayne@69
|
1600 """true if read itself is mapped
|
jpayne@69
|
1601 (implemented in terms of :attr:`is_unmapped`)"""
|
jpayne@69
|
1602 def __get__(self):
|
jpayne@69
|
1603 return (self.flag & BAM_FUNMAP) == 0
|
jpayne@69
|
1604 def __set__(self, val):
|
jpayne@69
|
1605 pysam_update_flag(self._delegate, not val, BAM_FUNMAP)
|
jpayne@69
|
1606 update_bin(self._delegate)
|
jpayne@69
|
1607
|
jpayne@69
|
1608 property mate_is_unmapped:
|
jpayne@69
|
1609 """true if the mate is unmapped"""
|
jpayne@69
|
1610 def __get__(self):
|
jpayne@69
|
1611 return (self.flag & BAM_FMUNMAP) != 0
|
jpayne@69
|
1612 def __set__(self,val):
|
jpayne@69
|
1613 pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
|
jpayne@69
|
1614
|
jpayne@69
|
1615 property mate_is_mapped:
|
jpayne@69
|
1616 """true if the mate is mapped
|
jpayne@69
|
1617 (implemented in terms of :attr:`mate_is_unmapped`)"""
|
jpayne@69
|
1618 def __get__(self):
|
jpayne@69
|
1619 return (self.flag & BAM_FMUNMAP) == 0
|
jpayne@69
|
1620 def __set__(self,val):
|
jpayne@69
|
1621 pysam_update_flag(self._delegate, not val, BAM_FMUNMAP)
|
jpayne@69
|
1622
|
jpayne@69
|
1623 property is_reverse:
|
jpayne@69
|
1624 """true if read is mapped to reverse strand"""
|
jpayne@69
|
1625 def __get__(self):
|
jpayne@69
|
1626 return (self.flag & BAM_FREVERSE) != 0
|
jpayne@69
|
1627 def __set__(self,val):
|
jpayne@69
|
1628 pysam_update_flag(self._delegate, val, BAM_FREVERSE)
|
jpayne@69
|
1629
|
jpayne@69
|
1630 property is_forward:
|
jpayne@69
|
1631 """true if read is mapped to forward strand
|
jpayne@69
|
1632 (implemented in terms of :attr:`is_reverse`)"""
|
jpayne@69
|
1633 def __get__(self):
|
jpayne@69
|
1634 return (self.flag & BAM_FREVERSE) == 0
|
jpayne@69
|
1635 def __set__(self,val):
|
jpayne@69
|
1636 pysam_update_flag(self._delegate, not val, BAM_FREVERSE)
|
jpayne@69
|
1637
|
jpayne@69
|
1638 property mate_is_reverse:
|
jpayne@69
|
1639 """true if the mate is mapped to reverse strand"""
|
jpayne@69
|
1640 def __get__(self):
|
jpayne@69
|
1641 return (self.flag & BAM_FMREVERSE) != 0
|
jpayne@69
|
1642 def __set__(self,val):
|
jpayne@69
|
1643 pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
|
jpayne@69
|
1644
|
jpayne@69
|
1645 property mate_is_forward:
|
jpayne@69
|
1646 """true if the mate is mapped to forward strand
|
jpayne@69
|
1647 (implemented in terms of :attr:`mate_is_reverse`)"""
|
jpayne@69
|
1648 def __get__(self):
|
jpayne@69
|
1649 return (self.flag & BAM_FMREVERSE) == 0
|
jpayne@69
|
1650 def __set__(self,val):
|
jpayne@69
|
1651 pysam_update_flag(self._delegate, not val, BAM_FMREVERSE)
|
jpayne@69
|
1652
|
jpayne@69
|
1653 property is_read1:
|
jpayne@69
|
1654 """true if this is read1"""
|
jpayne@69
|
1655 def __get__(self):
|
jpayne@69
|
1656 return (self.flag & BAM_FREAD1) != 0
|
jpayne@69
|
1657 def __set__(self,val):
|
jpayne@69
|
1658 pysam_update_flag(self._delegate, val, BAM_FREAD1)
|
jpayne@69
|
1659 property is_read2:
|
jpayne@69
|
1660 """true if this is read2"""
|
jpayne@69
|
1661 def __get__(self):
|
jpayne@69
|
1662 return (self.flag & BAM_FREAD2) != 0
|
jpayne@69
|
1663 def __set__(self, val):
|
jpayne@69
|
1664 pysam_update_flag(self._delegate, val, BAM_FREAD2)
|
jpayne@69
|
1665 property is_secondary:
|
jpayne@69
|
1666 """true if not primary alignment"""
|
jpayne@69
|
1667 def __get__(self):
|
jpayne@69
|
1668 return (self.flag & BAM_FSECONDARY) != 0
|
jpayne@69
|
1669 def __set__(self, val):
|
jpayne@69
|
1670 pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
|
jpayne@69
|
1671 property is_qcfail:
|
jpayne@69
|
1672 """true if QC failure"""
|
jpayne@69
|
1673 def __get__(self):
|
jpayne@69
|
1674 return (self.flag & BAM_FQCFAIL) != 0
|
jpayne@69
|
1675 def __set__(self, val):
|
jpayne@69
|
1676 pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
|
jpayne@69
|
1677 property is_duplicate:
|
jpayne@69
|
1678 """true if optical or PCR duplicate"""
|
jpayne@69
|
1679 def __get__(self):
|
jpayne@69
|
1680 return (self.flag & BAM_FDUP) != 0
|
jpayne@69
|
1681 def __set__(self, val):
|
jpayne@69
|
1682 pysam_update_flag(self._delegate, val, BAM_FDUP)
|
jpayne@69
|
1683 property is_supplementary:
|
jpayne@69
|
1684 """true if this is a supplementary alignment"""
|
jpayne@69
|
1685 def __get__(self):
|
jpayne@69
|
1686 return (self.flag & BAM_FSUPPLEMENTARY) != 0
|
jpayne@69
|
1687 def __set__(self, val):
|
jpayne@69
|
1688 pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
|
jpayne@69
|
1689
|
jpayne@69
|
1690 # 2. Coordinates and lengths
|
jpayne@69
|
1691 property reference_end:
|
jpayne@69
|
1692 '''aligned reference position of the read on the reference genome.
|
jpayne@69
|
1693
|
jpayne@69
|
1694 reference_end points to one past the last aligned residue.
|
jpayne@69
|
1695 Returns None if not available (read is unmapped or no cigar
|
jpayne@69
|
1696 alignment present).
|
jpayne@69
|
1697
|
jpayne@69
|
1698 '''
|
jpayne@69
|
1699 def __get__(self):
|
jpayne@69
|
1700 cdef bam1_t * src
|
jpayne@69
|
1701 src = self._delegate
|
jpayne@69
|
1702 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
|
jpayne@69
|
1703 return None
|
jpayne@69
|
1704 return bam_endpos(src)
|
jpayne@69
|
1705
|
jpayne@69
|
1706 property reference_length:
|
jpayne@69
|
1707 '''aligned length of the read on the reference genome.
|
jpayne@69
|
1708
|
jpayne@69
|
1709 This is equal to `reference_end - reference_start`.
|
jpayne@69
|
1710 Returns None if not available.
|
jpayne@69
|
1711 '''
|
jpayne@69
|
1712 def __get__(self):
|
jpayne@69
|
1713 cdef bam1_t * src
|
jpayne@69
|
1714 src = self._delegate
|
jpayne@69
|
1715 if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
|
jpayne@69
|
1716 return None
|
jpayne@69
|
1717 return bam_endpos(src) - \
|
jpayne@69
|
1718 self._delegate.core.pos
|
jpayne@69
|
1719
|
jpayne@69
|
1720 property query_alignment_sequence:
|
jpayne@69
|
1721 """aligned portion of the read.
|
jpayne@69
|
1722
|
jpayne@69
|
1723 This is a substring of :attr:`query_sequence` that excludes flanking
|
jpayne@69
|
1724 bases that were :term:`soft clipped` (None if not present). It
|
jpayne@69
|
1725 is equal to ``query_sequence[query_alignment_start:query_alignment_end]``.
|
jpayne@69
|
1726
|
jpayne@69
|
1727 SAM/BAM files may include extra flanking bases that are not
|
jpayne@69
|
1728 part of the alignment. These bases may be the result of the
|
jpayne@69
|
1729 Smith-Waterman or other algorithms, which may not require
|
jpayne@69
|
1730 alignments that begin at the first residue or end at the last.
|
jpayne@69
|
1731 In addition, extra sequencing adapters, multiplex identifiers,
|
jpayne@69
|
1732 and low-quality bases that were not considered for alignment
|
jpayne@69
|
1733 may have been retained.
|
jpayne@69
|
1734
|
jpayne@69
|
1735 """
|
jpayne@69
|
1736
|
jpayne@69
|
1737 def __get__(self):
|
jpayne@69
|
1738 if self.cache_query_alignment_sequence:
|
jpayne@69
|
1739 return self.cache_query_alignment_sequence
|
jpayne@69
|
1740
|
jpayne@69
|
1741 cdef bam1_t * src
|
jpayne@69
|
1742 cdef uint32_t start, end
|
jpayne@69
|
1743
|
jpayne@69
|
1744 src = self._delegate
|
jpayne@69
|
1745
|
jpayne@69
|
1746 if src.core.l_qseq == 0:
|
jpayne@69
|
1747 return None
|
jpayne@69
|
1748
|
jpayne@69
|
1749 start = getQueryStart(src)
|
jpayne@69
|
1750 end = getQueryEnd(src)
|
jpayne@69
|
1751
|
jpayne@69
|
1752 self.cache_query_alignment_sequence = force_str(
|
jpayne@69
|
1753 getSequenceInRange(src, start, end))
|
jpayne@69
|
1754 return self.cache_query_alignment_sequence
|
jpayne@69
|
1755
|
jpayne@69
|
1756 property query_alignment_qualities:
|
jpayne@69
|
1757 """aligned query sequence quality values (None if not present). These
|
jpayne@69
|
1758 are the quality values that correspond to
|
jpayne@69
|
1759 :attr:`query_alignment_sequence`, that is, they exclude qualities of
|
jpayne@69
|
1760 :term:`soft clipped` bases. This is equal to
|
jpayne@69
|
1761 ``query_qualities[query_alignment_start:query_alignment_end]``.
|
jpayne@69
|
1762
|
jpayne@69
|
1763 Quality scores are returned as a python array of unsigned
|
jpayne@69
|
1764 chars. Note that this is not the ASCII-encoded value typically
|
jpayne@69
|
1765 seen in FASTQ or SAM formatted files. Thus, no offset of 33
|
jpayne@69
|
1766 needs to be subtracted.
|
jpayne@69
|
1767
|
jpayne@69
|
1768 This property is read-only.
|
jpayne@69
|
1769
|
jpayne@69
|
1770 """
|
jpayne@69
|
1771 def __get__(self):
|
jpayne@69
|
1772
|
jpayne@69
|
1773 if self.cache_query_alignment_qualities:
|
jpayne@69
|
1774 return self.cache_query_alignment_qualities
|
jpayne@69
|
1775
|
jpayne@69
|
1776 cdef bam1_t * src
|
jpayne@69
|
1777 cdef uint32_t start, end
|
jpayne@69
|
1778
|
jpayne@69
|
1779 src = self._delegate
|
jpayne@69
|
1780
|
jpayne@69
|
1781 if src.core.l_qseq == 0:
|
jpayne@69
|
1782 return None
|
jpayne@69
|
1783
|
jpayne@69
|
1784 start = getQueryStart(src)
|
jpayne@69
|
1785 end = getQueryEnd(src)
|
jpayne@69
|
1786 self.cache_query_alignment_qualities = \
|
jpayne@69
|
1787 getQualitiesInRange(src, start, end)
|
jpayne@69
|
1788 return self.cache_query_alignment_qualities
|
jpayne@69
|
1789
|
jpayne@69
|
1790 property query_alignment_start:
|
jpayne@69
|
1791 """start index of the aligned query portion of the sequence (0-based,
|
jpayne@69
|
1792 inclusive).
|
jpayne@69
|
1793
|
jpayne@69
|
1794 This the index of the first base in :attr:`query_sequence`
|
jpayne@69
|
1795 that is not soft-clipped.
|
jpayne@69
|
1796 """
|
jpayne@69
|
1797 def __get__(self):
|
jpayne@69
|
1798 return getQueryStart(self._delegate)
|
jpayne@69
|
1799
|
jpayne@69
|
1800 property query_alignment_end:
|
jpayne@69
|
1801 """end index of the aligned query portion of the sequence (0-based,
|
jpayne@69
|
1802 exclusive)
|
jpayne@69
|
1803
|
jpayne@69
|
1804 This the index just past the last base in :attr:`query_sequence`
|
jpayne@69
|
1805 that is not soft-clipped.
|
jpayne@69
|
1806 """
|
jpayne@69
|
1807 def __get__(self):
|
jpayne@69
|
1808 return getQueryEnd(self._delegate)
|
jpayne@69
|
1809
|
jpayne@69
|
1810 property modified_bases:
|
jpayne@69
|
1811 """Modified bases annotations from Ml/Mm tags. The output is
|
jpayne@69
|
1812 Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...]
|
jpayne@69
|
1813 with qual being (256*probability), or -1 if unknown.
|
jpayne@69
|
1814 Strand==0 for forward and 1 for reverse strand modification
|
jpayne@69
|
1815 """
|
jpayne@69
|
1816 def __get__(self):
|
jpayne@69
|
1817 cdef bam1_t * src
|
jpayne@69
|
1818 cdef hts_base_mod_state *m = hts_base_mod_state_alloc()
|
jpayne@69
|
1819 cdef hts_base_mod mods[5]
|
jpayne@69
|
1820 cdef int pos
|
jpayne@69
|
1821
|
jpayne@69
|
1822 ret = {}
|
jpayne@69
|
1823 src = self._delegate
|
jpayne@69
|
1824
|
jpayne@69
|
1825 if bam_parse_basemod(src, m) < 0:
|
jpayne@69
|
1826 return None
|
jpayne@69
|
1827
|
jpayne@69
|
1828 n = bam_next_basemod(src, m, mods, 5, &pos)
|
jpayne@69
|
1829
|
jpayne@69
|
1830 while n>0:
|
jpayne@69
|
1831 for i in range(n):
|
jpayne@69
|
1832 mod_code = chr(mods[i].modified_base) if mods[i].modified_base>0 else -mods[i].modified_base
|
jpayne@69
|
1833 mod_strand = mods[i].strand
|
jpayne@69
|
1834 if self.is_reverse:
|
jpayne@69
|
1835 mod_strand = 1 - mod_strand
|
jpayne@69
|
1836 key = (chr(mods[i].canonical_base),
|
jpayne@69
|
1837 mod_strand,
|
jpayne@69
|
1838 mod_code )
|
jpayne@69
|
1839 ret.setdefault(key,[]).append((pos,mods[i].qual))
|
jpayne@69
|
1840
|
jpayne@69
|
1841 n = bam_next_basemod(src, m, mods, 5, &pos)
|
jpayne@69
|
1842
|
jpayne@69
|
1843 if n<0:
|
jpayne@69
|
1844 return None
|
jpayne@69
|
1845
|
jpayne@69
|
1846 hts_base_mod_state_free(m)
|
jpayne@69
|
1847 return ret
|
jpayne@69
|
1848
|
jpayne@69
|
1849 property modified_bases_forward:
|
jpayne@69
|
1850 """Modified bases annotations from Ml/Mm tags. The output is
|
jpayne@69
|
1851 Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...]
|
jpayne@69
|
1852 with qual being (256*probability), or -1 if unknown.
|
jpayne@69
|
1853 Strand==0 for forward and 1 for reverse strand modification.
|
jpayne@69
|
1854 The positions are with respect to the original sequence from get_forward_sequence()
|
jpayne@69
|
1855 """
|
jpayne@69
|
1856 def __get__(self):
|
jpayne@69
|
1857 pmods = self.modified_bases
|
jpayne@69
|
1858 if pmods and self.is_reverse:
|
jpayne@69
|
1859 rmod = {}
|
jpayne@69
|
1860
|
jpayne@69
|
1861 # Try to find the length of the original sequence
|
jpayne@69
|
1862 rlen = self.infer_read_length()
|
jpayne@69
|
1863 if rlen is None and self.query_sequence is None:
|
jpayne@69
|
1864 return rmod
|
jpayne@69
|
1865 else:
|
jpayne@69
|
1866 rlen = len(self.query_sequence)
|
jpayne@69
|
1867
|
jpayne@69
|
1868 for k,mods in pmods.items():
|
jpayne@69
|
1869 nk = k[0],1 - k[1],k[2]
|
jpayne@69
|
1870 for i in range(len(mods)):
|
jpayne@69
|
1871
|
jpayne@69
|
1872 mods[i] = (rlen - 1 -mods[i][0], mods[i][1])
|
jpayne@69
|
1873 rmod[nk] = mods
|
jpayne@69
|
1874 return rmod
|
jpayne@69
|
1875
|
jpayne@69
|
1876 return pmods
|
jpayne@69
|
1877
|
jpayne@69
|
1878
|
jpayne@69
|
1879 property query_alignment_length:
|
jpayne@69
|
1880 """length of the aligned query sequence.
|
jpayne@69
|
1881
|
jpayne@69
|
1882 This is equal to :attr:`query_alignment_end` -
|
jpayne@69
|
1883 :attr:`query_alignment_start`
|
jpayne@69
|
1884 """
|
jpayne@69
|
1885 def __get__(self):
|
jpayne@69
|
1886 cdef bam1_t * src
|
jpayne@69
|
1887 src = self._delegate
|
jpayne@69
|
1888 return getQueryEnd(src) - getQueryStart(src)
|
jpayne@69
|
1889
|
jpayne@69
|
1890 #####################################################
|
jpayne@69
|
1891 # Computed properties
|
jpayne@69
|
1892
|
jpayne@69
|
1893 def get_reference_positions(self, full_length=False):
|
jpayne@69
|
1894 """a list of reference positions that this read aligns to.
|
jpayne@69
|
1895
|
jpayne@69
|
1896 By default, this method returns the (0-based) positions on the
|
jpayne@69
|
1897 reference that are within the read's alignment, leaving gaps
|
jpayne@69
|
1898 corresponding to deletions and other reference skips.
|
jpayne@69
|
1899
|
jpayne@69
|
1900 When *full_length* is True, the returned list is the same length
|
jpayne@69
|
1901 as the read and additionally includes None values corresponding
|
jpayne@69
|
1902 to insertions or soft-clipping, i.e., to bases of the read that
|
jpayne@69
|
1903 are not aligned to a reference position.
|
jpayne@69
|
1904 (See also :meth:`get_aligned_pairs` which additionally returns
|
jpayne@69
|
1905 the corresponding positions along the read.)
|
jpayne@69
|
1906 """
|
jpayne@69
|
1907 cdef uint32_t k, i, l, pos
|
jpayne@69
|
1908 cdef int op
|
jpayne@69
|
1909 cdef uint32_t * cigar_p
|
jpayne@69
|
1910 cdef bam1_t * src
|
jpayne@69
|
1911 cdef bint _full = full_length
|
jpayne@69
|
1912
|
jpayne@69
|
1913 src = self._delegate
|
jpayne@69
|
1914 if pysam_get_n_cigar(src) == 0:
|
jpayne@69
|
1915 return []
|
jpayne@69
|
1916
|
jpayne@69
|
1917 result = []
|
jpayne@69
|
1918 pos = src.core.pos
|
jpayne@69
|
1919 cigar_p = pysam_bam_get_cigar(src)
|
jpayne@69
|
1920
|
jpayne@69
|
1921 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
1922 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
1923 l = cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
1924
|
jpayne@69
|
1925 if op == BAM_CSOFT_CLIP or op == BAM_CINS:
|
jpayne@69
|
1926 if _full:
|
jpayne@69
|
1927 for i from 0 <= i < l:
|
jpayne@69
|
1928 result.append(None)
|
jpayne@69
|
1929 elif op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
|
jpayne@69
|
1930 for i from pos <= i < pos + l:
|
jpayne@69
|
1931 result.append(i)
|
jpayne@69
|
1932 pos += l
|
jpayne@69
|
1933 elif op == BAM_CDEL or op == BAM_CREF_SKIP:
|
jpayne@69
|
1934 pos += l
|
jpayne@69
|
1935
|
jpayne@69
|
1936 return result
|
jpayne@69
|
1937
|
jpayne@69
|
1938 def infer_query_length(self, always=False):
|
jpayne@69
|
1939 """infer query length from CIGAR alignment.
|
jpayne@69
|
1940
|
jpayne@69
|
1941 This method deduces the query length from the CIGAR alignment
|
jpayne@69
|
1942 but does not include hard-clipped bases.
|
jpayne@69
|
1943
|
jpayne@69
|
1944 Returns None if CIGAR alignment is not present.
|
jpayne@69
|
1945
|
jpayne@69
|
1946 If *always* is set to True, `infer_read_length` is used instead.
|
jpayne@69
|
1947 This is deprecated and only present for backward compatibility.
|
jpayne@69
|
1948 """
|
jpayne@69
|
1949 if always is True:
|
jpayne@69
|
1950 return self.infer_read_length()
|
jpayne@69
|
1951 cdef int32_t l = calculateQueryLengthWithoutHardClipping(self._delegate)
|
jpayne@69
|
1952 if l > 0:
|
jpayne@69
|
1953 return l
|
jpayne@69
|
1954 else:
|
jpayne@69
|
1955 return None
|
jpayne@69
|
1956
|
jpayne@69
|
1957 def infer_read_length(self):
|
jpayne@69
|
1958 """infer read length from CIGAR alignment.
|
jpayne@69
|
1959
|
jpayne@69
|
1960 This method deduces the read length from the CIGAR alignment
|
jpayne@69
|
1961 including hard-clipped bases.
|
jpayne@69
|
1962
|
jpayne@69
|
1963 Returns None if CIGAR alignment is not present.
|
jpayne@69
|
1964 """
|
jpayne@69
|
1965 cdef int32_t l = calculateQueryLengthWithHardClipping(self._delegate)
|
jpayne@69
|
1966 if l > 0:
|
jpayne@69
|
1967 return l
|
jpayne@69
|
1968 else:
|
jpayne@69
|
1969 return None
|
jpayne@69
|
1970
|
jpayne@69
|
1971 def get_reference_sequence(self):
|
jpayne@69
|
1972 """return the reference sequence in the region that is covered by the
|
jpayne@69
|
1973 alignment of the read to the reference.
|
jpayne@69
|
1974
|
jpayne@69
|
1975 This method requires the MD tag to be set.
|
jpayne@69
|
1976
|
jpayne@69
|
1977 """
|
jpayne@69
|
1978 return force_str(build_reference_sequence(self._delegate))
|
jpayne@69
|
1979
|
jpayne@69
|
1980 def get_forward_sequence(self):
|
jpayne@69
|
1981 """return the original read sequence.
|
jpayne@69
|
1982
|
jpayne@69
|
1983 Reads mapped to the reverse strand are stored reverse complemented in
|
jpayne@69
|
1984 the BAM file. This method returns such reads reverse complemented back
|
jpayne@69
|
1985 to their original orientation.
|
jpayne@69
|
1986
|
jpayne@69
|
1987 Returns None if the record has no query sequence.
|
jpayne@69
|
1988 """
|
jpayne@69
|
1989 if self.query_sequence is None:
|
jpayne@69
|
1990 return None
|
jpayne@69
|
1991 s = force_str(self.query_sequence)
|
jpayne@69
|
1992 if self.is_reverse:
|
jpayne@69
|
1993 s = s.translate(str.maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1]
|
jpayne@69
|
1994 return s
|
jpayne@69
|
1995
|
jpayne@69
|
1996 def get_forward_qualities(self):
|
jpayne@69
|
1997 """return the original base qualities of the read sequence,
|
jpayne@69
|
1998 in the same format as the :attr:`query_qualities` property.
|
jpayne@69
|
1999
|
jpayne@69
|
2000 Reads mapped to the reverse strand have their base qualities stored
|
jpayne@69
|
2001 reversed in the BAM file. This method returns such reads' base qualities
|
jpayne@69
|
2002 reversed back to their original orientation.
|
jpayne@69
|
2003 """
|
jpayne@69
|
2004 if self.is_reverse:
|
jpayne@69
|
2005 return self.query_qualities[::-1]
|
jpayne@69
|
2006 else:
|
jpayne@69
|
2007 return self.query_qualities
|
jpayne@69
|
2008
|
jpayne@69
|
2009 def get_aligned_pairs(self, matches_only=False, with_seq=False, with_cigar=False):
|
jpayne@69
|
2010 """a list of aligned read (query) and reference positions.
|
jpayne@69
|
2011
|
jpayne@69
|
2012 Each item in the returned list is a tuple consisting of
|
jpayne@69
|
2013 the 0-based offset from the start of the read sequence
|
jpayne@69
|
2014 followed by the 0-based reference position.
|
jpayne@69
|
2015
|
jpayne@69
|
2016 For inserts, deletions, skipping either query or reference
|
jpayne@69
|
2017 position may be None.
|
jpayne@69
|
2018
|
jpayne@69
|
2019 For padding in the reference, the reference position will
|
jpayne@69
|
2020 always be None.
|
jpayne@69
|
2021
|
jpayne@69
|
2022 Parameters
|
jpayne@69
|
2023 ----------
|
jpayne@69
|
2024
|
jpayne@69
|
2025 matches_only : bool
|
jpayne@69
|
2026 If True, only matched bases are returned --- no None on either
|
jpayne@69
|
2027 side.
|
jpayne@69
|
2028 with_seq : bool
|
jpayne@69
|
2029 If True, return a third element in the tuple containing the
|
jpayne@69
|
2030 reference sequence. For CIGAR 'P' (padding in the reference)
|
jpayne@69
|
2031 operations, the third tuple element will be None. Substitutions
|
jpayne@69
|
2032 are lower-case. This option requires an MD tag to be present.
|
jpayne@69
|
2033 with_cigar : bool
|
jpayne@69
|
2034 If True, return an extra element in the tuple containing the
|
jpayne@69
|
2035 CIGAR operator corresponding to this position tuple.
|
jpayne@69
|
2036
|
jpayne@69
|
2037 Returns
|
jpayne@69
|
2038 -------
|
jpayne@69
|
2039
|
jpayne@69
|
2040 aligned_pairs : list of tuples
|
jpayne@69
|
2041
|
jpayne@69
|
2042 """
|
jpayne@69
|
2043 cdef uint32_t k, i, pos, qpos, r_idx, l
|
jpayne@69
|
2044 cdef int op
|
jpayne@69
|
2045 cdef uint32_t * cigar_p
|
jpayne@69
|
2046 cdef bam1_t * src = self._delegate
|
jpayne@69
|
2047 cdef bint _matches_only = bool(matches_only)
|
jpayne@69
|
2048 cdef bint _with_seq = bool(with_seq)
|
jpayne@69
|
2049 cdef bint _with_cigar = bool(with_cigar)
|
jpayne@69
|
2050 cdef object (*make_tuple)(object, object, object, uint32_t, int)
|
jpayne@69
|
2051
|
jpayne@69
|
2052 # TODO: this method performs no checking and assumes that
|
jpayne@69
|
2053 # read sequence, cigar and MD tag are consistent.
|
jpayne@69
|
2054
|
jpayne@69
|
2055 if _with_seq:
|
jpayne@69
|
2056 # force_str required for py2/py3 compatibility
|
jpayne@69
|
2057 ref_seq = force_str(build_reference_sequence(src))
|
jpayne@69
|
2058 if ref_seq is None:
|
jpayne@69
|
2059 raise ValueError("MD tag not present")
|
jpayne@69
|
2060 make_tuple = _alignedpairs_with_seq_cigar if _with_cigar else _alignedpairs_with_seq
|
jpayne@69
|
2061 else:
|
jpayne@69
|
2062 ref_seq = None
|
jpayne@69
|
2063 make_tuple = _alignedpairs_with_cigar if _with_cigar else _alignedpairs_positions
|
jpayne@69
|
2064
|
jpayne@69
|
2065 r_idx = 0
|
jpayne@69
|
2066
|
jpayne@69
|
2067 if pysam_get_n_cigar(src) == 0:
|
jpayne@69
|
2068 return []
|
jpayne@69
|
2069
|
jpayne@69
|
2070 result = []
|
jpayne@69
|
2071 pos = src.core.pos
|
jpayne@69
|
2072 qpos = 0
|
jpayne@69
|
2073 cigar_p = pysam_bam_get_cigar(src)
|
jpayne@69
|
2074 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
2075 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
2076 l = cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
2077
|
jpayne@69
|
2078 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
|
jpayne@69
|
2079 for i from pos <= i < pos + l:
|
jpayne@69
|
2080 result.append(make_tuple(qpos, i, ref_seq, r_idx, op))
|
jpayne@69
|
2081 r_idx += 1
|
jpayne@69
|
2082 qpos += 1
|
jpayne@69
|
2083 pos += l
|
jpayne@69
|
2084
|
jpayne@69
|
2085 elif op == BAM_CINS or op == BAM_CSOFT_CLIP or op == BAM_CPAD:
|
jpayne@69
|
2086 if not _matches_only:
|
jpayne@69
|
2087 for i from pos <= i < pos + l:
|
jpayne@69
|
2088 result.append(make_tuple(qpos, None, None, 0, op))
|
jpayne@69
|
2089 qpos += 1
|
jpayne@69
|
2090 else:
|
jpayne@69
|
2091 qpos += l
|
jpayne@69
|
2092
|
jpayne@69
|
2093 elif op == BAM_CDEL:
|
jpayne@69
|
2094 if not _matches_only:
|
jpayne@69
|
2095 for i from pos <= i < pos + l:
|
jpayne@69
|
2096 result.append(make_tuple(None, i, ref_seq, r_idx, op))
|
jpayne@69
|
2097 r_idx += 1
|
jpayne@69
|
2098 else:
|
jpayne@69
|
2099 r_idx += l
|
jpayne@69
|
2100 pos += l
|
jpayne@69
|
2101
|
jpayne@69
|
2102 elif op == BAM_CHARD_CLIP:
|
jpayne@69
|
2103 pass # advances neither
|
jpayne@69
|
2104
|
jpayne@69
|
2105 elif op == BAM_CREF_SKIP:
|
jpayne@69
|
2106 if not _matches_only:
|
jpayne@69
|
2107 for i from pos <= i < pos + l:
|
jpayne@69
|
2108 result.append(make_tuple(None, i, None, 0, op))
|
jpayne@69
|
2109
|
jpayne@69
|
2110 pos += l
|
jpayne@69
|
2111
|
jpayne@69
|
2112 return result
|
jpayne@69
|
2113
|
jpayne@69
|
2114 def get_blocks(self):
|
jpayne@69
|
2115 """ a list of start and end positions of
|
jpayne@69
|
2116 aligned gapless blocks.
|
jpayne@69
|
2117
|
jpayne@69
|
2118 The start and end positions are in genomic
|
jpayne@69
|
2119 coordinates.
|
jpayne@69
|
2120
|
jpayne@69
|
2121 Blocks are not normalized, i.e. two blocks
|
jpayne@69
|
2122 might be directly adjacent. This happens if
|
jpayne@69
|
2123 the two blocks are separated by an insertion
|
jpayne@69
|
2124 in the read.
|
jpayne@69
|
2125 """
|
jpayne@69
|
2126
|
jpayne@69
|
2127 cdef uint32_t k, pos, l
|
jpayne@69
|
2128 cdef int op
|
jpayne@69
|
2129 cdef uint32_t * cigar_p
|
jpayne@69
|
2130 cdef bam1_t * src
|
jpayne@69
|
2131
|
jpayne@69
|
2132 src = self._delegate
|
jpayne@69
|
2133 if pysam_get_n_cigar(src) == 0:
|
jpayne@69
|
2134 return []
|
jpayne@69
|
2135
|
jpayne@69
|
2136 result = []
|
jpayne@69
|
2137 pos = src.core.pos
|
jpayne@69
|
2138 cigar_p = pysam_bam_get_cigar(src)
|
jpayne@69
|
2139 l = 0
|
jpayne@69
|
2140
|
jpayne@69
|
2141 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
2142 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
2143 l = cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
2144 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
|
jpayne@69
|
2145 result.append((pos, pos + l))
|
jpayne@69
|
2146 pos += l
|
jpayne@69
|
2147 elif op == BAM_CDEL or op == BAM_CREF_SKIP:
|
jpayne@69
|
2148 pos += l
|
jpayne@69
|
2149
|
jpayne@69
|
2150 return result
|
jpayne@69
|
2151
|
jpayne@69
|
2152 def get_overlap(self, uint32_t start, uint32_t end):
|
jpayne@69
|
2153 """return number of aligned bases of read overlapping the interval
|
jpayne@69
|
2154 *start* and *end* on the reference sequence.
|
jpayne@69
|
2155
|
jpayne@69
|
2156 Return None if cigar alignment is not available.
|
jpayne@69
|
2157 """
|
jpayne@69
|
2158 cdef uint32_t k, i, pos, overlap
|
jpayne@69
|
2159 cdef int op, o
|
jpayne@69
|
2160 cdef uint32_t * cigar_p
|
jpayne@69
|
2161 cdef bam1_t * src
|
jpayne@69
|
2162
|
jpayne@69
|
2163 overlap = 0
|
jpayne@69
|
2164
|
jpayne@69
|
2165 src = self._delegate
|
jpayne@69
|
2166 if pysam_get_n_cigar(src) == 0:
|
jpayne@69
|
2167 return None
|
jpayne@69
|
2168 pos = src.core.pos
|
jpayne@69
|
2169 o = 0
|
jpayne@69
|
2170
|
jpayne@69
|
2171 cigar_p = pysam_bam_get_cigar(src)
|
jpayne@69
|
2172 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
2173 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
2174 l = cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
2175
|
jpayne@69
|
2176 if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
|
jpayne@69
|
2177 o = min( pos + l, end) - max( pos, start )
|
jpayne@69
|
2178 if o > 0: overlap += o
|
jpayne@69
|
2179
|
jpayne@69
|
2180 if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP or op == BAM_CEQUAL or op == BAM_CDIFF:
|
jpayne@69
|
2181 pos += l
|
jpayne@69
|
2182
|
jpayne@69
|
2183 return overlap
|
jpayne@69
|
2184
|
jpayne@69
|
2185 def get_cigar_stats(self):
|
jpayne@69
|
2186 """summary of operations in cigar string.
|
jpayne@69
|
2187
|
jpayne@69
|
2188 The output order in the array is "MIDNSHP=X" followed by a
|
jpayne@69
|
2189 field for the NM tag. If the NM tag is not present, this
|
jpayne@69
|
2190 field will always be 0. (Accessing this field via index -1
|
jpayne@69
|
2191 avoids changes if more CIGAR operators are added in future.)
|
jpayne@69
|
2192
|
jpayne@69
|
2193 +-----+----------------+--------+
|
jpayne@69
|
2194 |M |pysam.CMATCH |0 |
|
jpayne@69
|
2195 +-----+----------------+--------+
|
jpayne@69
|
2196 |I |pysam.CINS |1 |
|
jpayne@69
|
2197 +-----+----------------+--------+
|
jpayne@69
|
2198 |D |pysam.CDEL |2 |
|
jpayne@69
|
2199 +-----+----------------+--------+
|
jpayne@69
|
2200 |N |pysam.CREF_SKIP |3 |
|
jpayne@69
|
2201 +-----+----------------+--------+
|
jpayne@69
|
2202 |S |pysam.CSOFT_CLIP|4 |
|
jpayne@69
|
2203 +-----+----------------+--------+
|
jpayne@69
|
2204 |H |pysam.CHARD_CLIP|5 |
|
jpayne@69
|
2205 +-----+----------------+--------+
|
jpayne@69
|
2206 |P |pysam.CPAD |6 |
|
jpayne@69
|
2207 +-----+----------------+--------+
|
jpayne@69
|
2208 |= |pysam.CEQUAL |7 |
|
jpayne@69
|
2209 +-----+----------------+--------+
|
jpayne@69
|
2210 |X |pysam.CDIFF |8 |
|
jpayne@69
|
2211 +-----+----------------+--------+
|
jpayne@69
|
2212 |B |pysam.CBACK |9 |
|
jpayne@69
|
2213 +-----+----------------+--------+
|
jpayne@69
|
2214 |NM |NM tag |10 or -1|
|
jpayne@69
|
2215 +-----+----------------+--------+
|
jpayne@69
|
2216
|
jpayne@69
|
2217 If no cigar string is present, empty arrays will be returned.
|
jpayne@69
|
2218
|
jpayne@69
|
2219 Returns:
|
jpayne@69
|
2220 arrays :
|
jpayne@69
|
2221 two arrays. The first contains the nucleotide counts within
|
jpayne@69
|
2222 each cigar operation, the second contains the number of blocks
|
jpayne@69
|
2223 for each cigar operation.
|
jpayne@69
|
2224
|
jpayne@69
|
2225 """
|
jpayne@69
|
2226
|
jpayne@69
|
2227 cdef int nfields = NCIGAR_CODES + 1
|
jpayne@69
|
2228
|
jpayne@69
|
2229 cdef c_array.array base_counts = array.array(
|
jpayne@69
|
2230 "I",
|
jpayne@69
|
2231 [0] * nfields)
|
jpayne@69
|
2232 cdef uint32_t [:] base_view = base_counts
|
jpayne@69
|
2233 cdef c_array.array block_counts = array.array(
|
jpayne@69
|
2234 "I",
|
jpayne@69
|
2235 [0] * nfields)
|
jpayne@69
|
2236 cdef uint32_t [:] block_view = block_counts
|
jpayne@69
|
2237
|
jpayne@69
|
2238 cdef bam1_t * src = self._delegate
|
jpayne@69
|
2239 cdef int op
|
jpayne@69
|
2240 cdef uint32_t l
|
jpayne@69
|
2241 cdef int32_t k
|
jpayne@69
|
2242 cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
|
jpayne@69
|
2243
|
jpayne@69
|
2244 if cigar_p == NULL:
|
jpayne@69
|
2245 return None
|
jpayne@69
|
2246
|
jpayne@69
|
2247 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
2248 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
2249 l = cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
2250 base_view[op] += l
|
jpayne@69
|
2251 block_view[op] += 1
|
jpayne@69
|
2252
|
jpayne@69
|
2253 cdef uint8_t * v = bam_aux_get(src, 'NM')
|
jpayne@69
|
2254 if v != NULL:
|
jpayne@69
|
2255 base_view[nfields - 1] = <int32_t>bam_aux2i(v)
|
jpayne@69
|
2256
|
jpayne@69
|
2257 return base_counts, block_counts
|
jpayne@69
|
2258
|
jpayne@69
|
2259 #####################################################
|
jpayne@69
|
2260 ## Unsorted as yet
|
jpayne@69
|
2261 # TODO: capture in CIGAR object
|
jpayne@69
|
2262 property cigartuples:
|
jpayne@69
|
2263 """the :term:`cigar` alignment. The alignment
|
jpayne@69
|
2264 is returned as a list of tuples of (operation, length).
|
jpayne@69
|
2265
|
jpayne@69
|
2266 If the alignment is not present, None is returned.
|
jpayne@69
|
2267
|
jpayne@69
|
2268 The operations are:
|
jpayne@69
|
2269
|
jpayne@69
|
2270 +-----+----------------+-----+
|
jpayne@69
|
2271 |M |pysam.CMATCH |0 |
|
jpayne@69
|
2272 +-----+----------------+-----+
|
jpayne@69
|
2273 |I |pysam.CINS |1 |
|
jpayne@69
|
2274 +-----+----------------+-----+
|
jpayne@69
|
2275 |D |pysam.CDEL |2 |
|
jpayne@69
|
2276 +-----+----------------+-----+
|
jpayne@69
|
2277 |N |pysam.CREF_SKIP |3 |
|
jpayne@69
|
2278 +-----+----------------+-----+
|
jpayne@69
|
2279 |S |pysam.CSOFT_CLIP|4 |
|
jpayne@69
|
2280 +-----+----------------+-----+
|
jpayne@69
|
2281 |H |pysam.CHARD_CLIP|5 |
|
jpayne@69
|
2282 +-----+----------------+-----+
|
jpayne@69
|
2283 |P |pysam.CPAD |6 |
|
jpayne@69
|
2284 +-----+----------------+-----+
|
jpayne@69
|
2285 |= |pysam.CEQUAL |7 |
|
jpayne@69
|
2286 +-----+----------------+-----+
|
jpayne@69
|
2287 |X |pysam.CDIFF |8 |
|
jpayne@69
|
2288 +-----+----------------+-----+
|
jpayne@69
|
2289 |B |pysam.CBACK |9 |
|
jpayne@69
|
2290 +-----+----------------+-----+
|
jpayne@69
|
2291
|
jpayne@69
|
2292 .. note::
|
jpayne@69
|
2293 The output is a list of (operation, length) tuples, such as
|
jpayne@69
|
2294 ``[(0, 30)]``.
|
jpayne@69
|
2295 This is different from the SAM specification and
|
jpayne@69
|
2296 the :attr:`cigarstring` property, which uses a
|
jpayne@69
|
2297 (length, operation) order, for example: ``30M``.
|
jpayne@69
|
2298
|
jpayne@69
|
2299 To unset the cigar property, assign an empty list
|
jpayne@69
|
2300 or None.
|
jpayne@69
|
2301 """
|
jpayne@69
|
2302 def __get__(self):
|
jpayne@69
|
2303 cdef uint32_t * cigar_p
|
jpayne@69
|
2304 cdef bam1_t * src
|
jpayne@69
|
2305 cdef uint32_t op, l
|
jpayne@69
|
2306 cdef uint32_t k
|
jpayne@69
|
2307
|
jpayne@69
|
2308 src = self._delegate
|
jpayne@69
|
2309 if pysam_get_n_cigar(src) == 0:
|
jpayne@69
|
2310 return None
|
jpayne@69
|
2311
|
jpayne@69
|
2312 cigar = []
|
jpayne@69
|
2313
|
jpayne@69
|
2314 cigar_p = pysam_bam_get_cigar(src);
|
jpayne@69
|
2315 for k from 0 <= k < pysam_get_n_cigar(src):
|
jpayne@69
|
2316 op = cigar_p[k] & BAM_CIGAR_MASK
|
jpayne@69
|
2317 l = cigar_p[k] >> BAM_CIGAR_SHIFT
|
jpayne@69
|
2318 cigar.append((op, l))
|
jpayne@69
|
2319 return cigar
|
jpayne@69
|
2320
|
jpayne@69
|
2321 def __set__(self, values):
|
jpayne@69
|
2322 cdef uint32_t * p
|
jpayne@69
|
2323 cdef bam1_t * src
|
jpayne@69
|
2324 cdef op, l
|
jpayne@69
|
2325 cdef int k
|
jpayne@69
|
2326
|
jpayne@69
|
2327 k = 0
|
jpayne@69
|
2328
|
jpayne@69
|
2329 src = self._delegate
|
jpayne@69
|
2330
|
jpayne@69
|
2331 # get location of cigar string
|
jpayne@69
|
2332 p = pysam_bam_get_cigar(src)
|
jpayne@69
|
2333
|
jpayne@69
|
2334 # empty values for cigar string
|
jpayne@69
|
2335 if values is None:
|
jpayne@69
|
2336 values = []
|
jpayne@69
|
2337
|
jpayne@69
|
2338 cdef uint32_t ncigar = len(values)
|
jpayne@69
|
2339
|
jpayne@69
|
2340 cdef bam1_t * retval = pysam_bam_update(src,
|
jpayne@69
|
2341 pysam_get_n_cigar(src) * 4,
|
jpayne@69
|
2342 ncigar * 4,
|
jpayne@69
|
2343 <uint8_t*>p)
|
jpayne@69
|
2344
|
jpayne@69
|
2345 if retval == NULL:
|
jpayne@69
|
2346 raise MemoryError("could not allocate memory")
|
jpayne@69
|
2347
|
jpayne@69
|
2348 # length is number of cigar operations, not bytes
|
jpayne@69
|
2349 pysam_set_n_cigar(src, ncigar)
|
jpayne@69
|
2350
|
jpayne@69
|
2351 # re-acquire pointer to location in memory
|
jpayne@69
|
2352 # as it might have moved
|
jpayne@69
|
2353 p = pysam_bam_get_cigar(src)
|
jpayne@69
|
2354
|
jpayne@69
|
2355 # insert cigar operations
|
jpayne@69
|
2356 for op, l in values:
|
jpayne@69
|
2357 p[k] = l << BAM_CIGAR_SHIFT | op
|
jpayne@69
|
2358 k += 1
|
jpayne@69
|
2359
|
jpayne@69
|
2360 ## setting the cigar string requires updating the bin
|
jpayne@69
|
2361 update_bin(src)
|
jpayne@69
|
2362
|
jpayne@69
|
2363 cpdef set_tag(self,
|
jpayne@69
|
2364 tag,
|
jpayne@69
|
2365 value,
|
jpayne@69
|
2366 value_type=None,
|
jpayne@69
|
2367 replace=True):
|
jpayne@69
|
2368 """sets a particular field *tag* to *value* in the optional alignment
|
jpayne@69
|
2369 section.
|
jpayne@69
|
2370
|
jpayne@69
|
2371 *value_type* describes the type of *value* that is to entered
|
jpayne@69
|
2372 into the alignment record. It can be set explicitly to one of
|
jpayne@69
|
2373 the valid one-letter type codes. If unset, an appropriate type
|
jpayne@69
|
2374 will be chosen automatically based on the python type of
|
jpayne@69
|
2375 *value*.
|
jpayne@69
|
2376
|
jpayne@69
|
2377 An existing value of the same *tag* will be overwritten unless
|
jpayne@69
|
2378 *replace* is set to False. This is usually not recommended as a
|
jpayne@69
|
2379 tag may only appear once in the optional alignment section.
|
jpayne@69
|
2380
|
jpayne@69
|
2381 If *value* is `None`, the tag will be deleted.
|
jpayne@69
|
2382
|
jpayne@69
|
2383 This method accepts valid SAM specification value types, which
|
jpayne@69
|
2384 are::
|
jpayne@69
|
2385
|
jpayne@69
|
2386 A: printable char
|
jpayne@69
|
2387 i: signed int
|
jpayne@69
|
2388 f: float
|
jpayne@69
|
2389 Z: printable string
|
jpayne@69
|
2390 H: Byte array in hex format
|
jpayne@69
|
2391 B: Integer or numeric array
|
jpayne@69
|
2392
|
jpayne@69
|
2393 Additionally, it will accept the integer BAM types ('cCsSI')
|
jpayne@69
|
2394
|
jpayne@69
|
2395 For htslib compatibility, 'a' is synonymous with 'A' and the
|
jpayne@69
|
2396 method accepts a 'd' type code for a double precision float.
|
jpayne@69
|
2397
|
jpayne@69
|
2398 When deducing the type code by the python type of *value*, the
|
jpayne@69
|
2399 following mapping is applied::
|
jpayne@69
|
2400
|
jpayne@69
|
2401 i: python int
|
jpayne@69
|
2402 f: python float
|
jpayne@69
|
2403 Z: python str or bytes
|
jpayne@69
|
2404 B: python array.array, list or tuple
|
jpayne@69
|
2405
|
jpayne@69
|
2406 Note that a single character string will be output as 'Z' and
|
jpayne@69
|
2407 not 'A' as the former is the more general type.
|
jpayne@69
|
2408 """
|
jpayne@69
|
2409
|
jpayne@69
|
2410 cdef int value_size
|
jpayne@69
|
2411 cdef uint8_t tc
|
jpayne@69
|
2412 cdef uint8_t * value_ptr
|
jpayne@69
|
2413 cdef uint8_t *existing_ptr
|
jpayne@69
|
2414 cdef float float_value
|
jpayne@69
|
2415 cdef double double_value
|
jpayne@69
|
2416 cdef int32_t int32_t_value
|
jpayne@69
|
2417 cdef uint32_t uint32_t_value
|
jpayne@69
|
2418 cdef int16_t int16_t_value
|
jpayne@69
|
2419 cdef uint16_t uint16_t_value
|
jpayne@69
|
2420 cdef int8_t int8_t_value
|
jpayne@69
|
2421 cdef uint8_t uint8_t_value
|
jpayne@69
|
2422 cdef bam1_t * src = self._delegate
|
jpayne@69
|
2423 cdef char * _value_type
|
jpayne@69
|
2424 cdef c_array.array array_value
|
jpayne@69
|
2425 cdef object buffer
|
jpayne@69
|
2426
|
jpayne@69
|
2427 if len(tag) != 2:
|
jpayne@69
|
2428 raise ValueError('Invalid tag: %s' % tag)
|
jpayne@69
|
2429
|
jpayne@69
|
2430 tag = force_bytes(tag)
|
jpayne@69
|
2431 if replace:
|
jpayne@69
|
2432 existing_ptr = bam_aux_get(src, tag)
|
jpayne@69
|
2433 if existing_ptr:
|
jpayne@69
|
2434 bam_aux_del(src, existing_ptr)
|
jpayne@69
|
2435
|
jpayne@69
|
2436 # setting value to None deletes a tag
|
jpayne@69
|
2437 if value is None:
|
jpayne@69
|
2438 return
|
jpayne@69
|
2439
|
jpayne@69
|
2440 cdef uint8_t typecode = get_tag_typecode(value, value_type)
|
jpayne@69
|
2441 if typecode == 0:
|
jpayne@69
|
2442 raise ValueError("can't guess type or invalid type code specified: {} {}".format(
|
jpayne@69
|
2443 value, value_type))
|
jpayne@69
|
2444
|
jpayne@69
|
2445 # sam_format1 for typecasting
|
jpayne@69
|
2446 if typecode == b'Z':
|
jpayne@69
|
2447 value = force_bytes(value)
|
jpayne@69
|
2448 value_ptr = <uint8_t*><char*>value
|
jpayne@69
|
2449 value_size = len(value)+1
|
jpayne@69
|
2450 elif typecode == b'H':
|
jpayne@69
|
2451 # Note that hex tags are stored the very same
|
jpayne@69
|
2452 # way as Z string.s
|
jpayne@69
|
2453 value = force_bytes(value)
|
jpayne@69
|
2454 value_ptr = <uint8_t*><char*>value
|
jpayne@69
|
2455 value_size = len(value)+1
|
jpayne@69
|
2456 elif typecode == b'A' or typecode == b'a':
|
jpayne@69
|
2457 value = force_bytes(value)
|
jpayne@69
|
2458 value_ptr = <uint8_t*><char*>value
|
jpayne@69
|
2459 value_size = sizeof(char)
|
jpayne@69
|
2460 typecode = b'A'
|
jpayne@69
|
2461 elif typecode == b'i':
|
jpayne@69
|
2462 int32_t_value = value
|
jpayne@69
|
2463 value_ptr = <uint8_t*>&int32_t_value
|
jpayne@69
|
2464 value_size = sizeof(int32_t)
|
jpayne@69
|
2465 elif typecode == b'I':
|
jpayne@69
|
2466 uint32_t_value = value
|
jpayne@69
|
2467 value_ptr = <uint8_t*>&uint32_t_value
|
jpayne@69
|
2468 value_size = sizeof(uint32_t)
|
jpayne@69
|
2469 elif typecode == b's':
|
jpayne@69
|
2470 int16_t_value = value
|
jpayne@69
|
2471 value_ptr = <uint8_t*>&int16_t_value
|
jpayne@69
|
2472 value_size = sizeof(int16_t)
|
jpayne@69
|
2473 elif typecode == b'S':
|
jpayne@69
|
2474 uint16_t_value = value
|
jpayne@69
|
2475 value_ptr = <uint8_t*>&uint16_t_value
|
jpayne@69
|
2476 value_size = sizeof(uint16_t)
|
jpayne@69
|
2477 elif typecode == b'c':
|
jpayne@69
|
2478 int8_t_value = value
|
jpayne@69
|
2479 value_ptr = <uint8_t*>&int8_t_value
|
jpayne@69
|
2480 value_size = sizeof(int8_t)
|
jpayne@69
|
2481 elif typecode == b'C':
|
jpayne@69
|
2482 uint8_t_value = value
|
jpayne@69
|
2483 value_ptr = <uint8_t*>&uint8_t_value
|
jpayne@69
|
2484 value_size = sizeof(uint8_t)
|
jpayne@69
|
2485 elif typecode == b'd':
|
jpayne@69
|
2486 double_value = value
|
jpayne@69
|
2487 value_ptr = <uint8_t*>&double_value
|
jpayne@69
|
2488 value_size = sizeof(double)
|
jpayne@69
|
2489 elif typecode == b'f':
|
jpayne@69
|
2490 float_value = value
|
jpayne@69
|
2491 value_ptr = <uint8_t*>&float_value
|
jpayne@69
|
2492 value_size = sizeof(float)
|
jpayne@69
|
2493 elif typecode == b'B':
|
jpayne@69
|
2494 # the following goes through python, needs to be cleaned up
|
jpayne@69
|
2495 # pack array using struct
|
jpayne@69
|
2496 fmt, args = pack_tags([(tag, value, value_type)])
|
jpayne@69
|
2497
|
jpayne@69
|
2498 # remove tag and type code as set by bam_aux_append
|
jpayne@69
|
2499 # first four chars of format (<2sB)
|
jpayne@69
|
2500 fmt = '<' + fmt[4:]
|
jpayne@69
|
2501 # first two values to pack
|
jpayne@69
|
2502 args = args[2:]
|
jpayne@69
|
2503 value_size = struct.calcsize(fmt)
|
jpayne@69
|
2504 # buffer will be freed when object goes out of scope
|
jpayne@69
|
2505 buffer = ctypes.create_string_buffer(value_size)
|
jpayne@69
|
2506 struct.pack_into(fmt, buffer, 0, *args)
|
jpayne@69
|
2507 # bam_aux_append copies data from value_ptr
|
jpayne@69
|
2508 bam_aux_append(src,
|
jpayne@69
|
2509 tag,
|
jpayne@69
|
2510 typecode,
|
jpayne@69
|
2511 value_size,
|
jpayne@69
|
2512 <uint8_t*>buffer.raw)
|
jpayne@69
|
2513 return
|
jpayne@69
|
2514 else:
|
jpayne@69
|
2515 raise ValueError('unsupported value_type {} in set_option'.format(typecode))
|
jpayne@69
|
2516
|
jpayne@69
|
2517 bam_aux_append(src,
|
jpayne@69
|
2518 tag,
|
jpayne@69
|
2519 typecode,
|
jpayne@69
|
2520 value_size,
|
jpayne@69
|
2521 value_ptr)
|
jpayne@69
|
2522
|
jpayne@69
|
2523 cpdef has_tag(self, tag):
|
jpayne@69
|
2524 """returns true if the optional alignment section
|
jpayne@69
|
2525 contains a given *tag*."""
|
jpayne@69
|
2526 cdef uint8_t * v
|
jpayne@69
|
2527 cdef int nvalues
|
jpayne@69
|
2528 btag = force_bytes(tag)
|
jpayne@69
|
2529 v = bam_aux_get(self._delegate, btag)
|
jpayne@69
|
2530 return v != NULL
|
jpayne@69
|
2531
|
jpayne@69
|
2532 cpdef get_tag(self, tag, with_value_type=False):
|
jpayne@69
|
2533 """
|
jpayne@69
|
2534 retrieves data from the optional alignment section
|
jpayne@69
|
2535 given a two-letter *tag* denoting the field.
|
jpayne@69
|
2536
|
jpayne@69
|
2537 The returned value is cast into an appropriate python type.
|
jpayne@69
|
2538
|
jpayne@69
|
2539 This method is the fastest way to access the optional
|
jpayne@69
|
2540 alignment section if only few tags need to be retrieved.
|
jpayne@69
|
2541
|
jpayne@69
|
2542 Possible value types are "AcCsSiIfZHB" (see BAM format
|
jpayne@69
|
2543 specification) as well as additional value type 'd' as
|
jpayne@69
|
2544 implemented in htslib.
|
jpayne@69
|
2545
|
jpayne@69
|
2546 Parameters:
|
jpayne@69
|
2547
|
jpayne@69
|
2548 tag :
|
jpayne@69
|
2549 data tag.
|
jpayne@69
|
2550
|
jpayne@69
|
2551 with_value_type : Optional[bool]
|
jpayne@69
|
2552 if set to True, the return value is a tuple of (tag value, type
|
jpayne@69
|
2553 code). (default False)
|
jpayne@69
|
2554
|
jpayne@69
|
2555 Returns:
|
jpayne@69
|
2556
|
jpayne@69
|
2557 A python object with the value of the `tag`. The type of the
|
jpayne@69
|
2558 object depends on the data type in the data record.
|
jpayne@69
|
2559
|
jpayne@69
|
2560 Raises:
|
jpayne@69
|
2561
|
jpayne@69
|
2562 KeyError
|
jpayne@69
|
2563 If `tag` is not present, a KeyError is raised.
|
jpayne@69
|
2564
|
jpayne@69
|
2565 """
|
jpayne@69
|
2566 cdef uint8_t * v
|
jpayne@69
|
2567 cdef int nvalues
|
jpayne@69
|
2568 btag = force_bytes(tag)
|
jpayne@69
|
2569 v = bam_aux_get(self._delegate, btag)
|
jpayne@69
|
2570 if v == NULL:
|
jpayne@69
|
2571 raise KeyError("tag '%s' not present" % tag)
|
jpayne@69
|
2572 if chr(v[0]) == "B":
|
jpayne@69
|
2573 auxtype = chr(v[0]) + chr(v[1])
|
jpayne@69
|
2574 else:
|
jpayne@69
|
2575 auxtype = chr(v[0])
|
jpayne@69
|
2576
|
jpayne@69
|
2577 if auxtype in "iIcCsS":
|
jpayne@69
|
2578 value = bam_aux2i(v)
|
jpayne@69
|
2579 elif auxtype == 'f' or auxtype == 'F':
|
jpayne@69
|
2580 value = bam_aux2f(v)
|
jpayne@69
|
2581 elif auxtype == 'd' or auxtype == 'D':
|
jpayne@69
|
2582 value = bam_aux2f(v)
|
jpayne@69
|
2583 elif auxtype == 'A' or auxtype == 'a':
|
jpayne@69
|
2584 # force A to a
|
jpayne@69
|
2585 v[0] = b'A'
|
jpayne@69
|
2586 # there might a more efficient way
|
jpayne@69
|
2587 # to convert a char into a string
|
jpayne@69
|
2588 value = '%c' % <char>bam_aux2A(v)
|
jpayne@69
|
2589 elif auxtype == 'Z' or auxtype == 'H':
|
jpayne@69
|
2590 # Z and H are treated equally as strings in htslib
|
jpayne@69
|
2591 value = charptr_to_str(<char*>bam_aux2Z(v))
|
jpayne@69
|
2592 elif auxtype[0] == 'B':
|
jpayne@69
|
2593 bytesize, nvalues, values = convert_binary_tag(v + 1)
|
jpayne@69
|
2594 value = values
|
jpayne@69
|
2595 else:
|
jpayne@69
|
2596 raise ValueError("unknown auxiliary type '%s'" % auxtype)
|
jpayne@69
|
2597
|
jpayne@69
|
2598 if with_value_type:
|
jpayne@69
|
2599 return (value, auxtype)
|
jpayne@69
|
2600 else:
|
jpayne@69
|
2601 return value
|
jpayne@69
|
2602
|
jpayne@69
|
2603 def get_tags(self, with_value_type=False):
|
jpayne@69
|
2604 """the fields in the optional alignment section.
|
jpayne@69
|
2605
|
jpayne@69
|
2606 Returns a list of all fields in the optional
|
jpayne@69
|
2607 alignment section. Values are converted to appropriate python
|
jpayne@69
|
2608 values. For example: ``[(NM, 2), (RG, "GJP00TM04")]``
|
jpayne@69
|
2609
|
jpayne@69
|
2610 If *with_value_type* is set, the value type as encode in
|
jpayne@69
|
2611 the AlignedSegment record will be returned as well:
|
jpayne@69
|
2612
|
jpayne@69
|
2613 [(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
|
jpayne@69
|
2614
|
jpayne@69
|
2615 This method will convert all values in the optional alignment
|
jpayne@69
|
2616 section. When getting only one or few tags, please see
|
jpayne@69
|
2617 :meth:`get_tag` for a quicker way to achieve this.
|
jpayne@69
|
2618
|
jpayne@69
|
2619 """
|
jpayne@69
|
2620
|
jpayne@69
|
2621 cdef char * ctag
|
jpayne@69
|
2622 cdef bam1_t * src
|
jpayne@69
|
2623 cdef uint8_t * s
|
jpayne@69
|
2624 cdef char auxtag[3]
|
jpayne@69
|
2625 cdef char auxtype
|
jpayne@69
|
2626 cdef uint8_t byte_size
|
jpayne@69
|
2627 cdef int32_t nvalues
|
jpayne@69
|
2628
|
jpayne@69
|
2629 src = self._delegate
|
jpayne@69
|
2630 if src.l_data == 0:
|
jpayne@69
|
2631 return []
|
jpayne@69
|
2632 s = pysam_bam_get_aux(src)
|
jpayne@69
|
2633 result = []
|
jpayne@69
|
2634 auxtag[2] = 0
|
jpayne@69
|
2635 while s < (src.data + src.l_data):
|
jpayne@69
|
2636 # get tag
|
jpayne@69
|
2637 auxtag[0] = s[0]
|
jpayne@69
|
2638 auxtag[1] = s[1]
|
jpayne@69
|
2639 s += 2
|
jpayne@69
|
2640 auxtype = s[0]
|
jpayne@69
|
2641 if auxtype in (b'c', b'C'):
|
jpayne@69
|
2642 value = <int>bam_aux2i(s)
|
jpayne@69
|
2643 s += 1
|
jpayne@69
|
2644 elif auxtype in (b's', b'S'):
|
jpayne@69
|
2645 value = <int>bam_aux2i(s)
|
jpayne@69
|
2646 s += 2
|
jpayne@69
|
2647 elif auxtype in (b'i', b'I'):
|
jpayne@69
|
2648 value = <int32_t>bam_aux2i(s)
|
jpayne@69
|
2649 s += 4
|
jpayne@69
|
2650 elif auxtype == b'f':
|
jpayne@69
|
2651 value = <float>bam_aux2f(s)
|
jpayne@69
|
2652 s += 4
|
jpayne@69
|
2653 elif auxtype == b'd':
|
jpayne@69
|
2654 value = <double>bam_aux2f(s)
|
jpayne@69
|
2655 s += 8
|
jpayne@69
|
2656 elif auxtype in (b'A', b'a'):
|
jpayne@69
|
2657 value = "%c" % <char>bam_aux2A(s)
|
jpayne@69
|
2658 s += 1
|
jpayne@69
|
2659 elif auxtype in (b'Z', b'H'):
|
jpayne@69
|
2660 value = charptr_to_str(<char*>bam_aux2Z(s))
|
jpayne@69
|
2661 # +1 for NULL terminated string
|
jpayne@69
|
2662 s += len(value) + 1
|
jpayne@69
|
2663 elif auxtype == b'B':
|
jpayne@69
|
2664 s += 1
|
jpayne@69
|
2665 byte_size, nvalues, value = convert_binary_tag(s)
|
jpayne@69
|
2666 # 5 for 1 char and 1 int
|
jpayne@69
|
2667 s += 5 + (nvalues * byte_size) - 1
|
jpayne@69
|
2668 else:
|
jpayne@69
|
2669 raise KeyError("unknown type '%s'" % auxtype)
|
jpayne@69
|
2670
|
jpayne@69
|
2671 s += 1
|
jpayne@69
|
2672
|
jpayne@69
|
2673 if with_value_type:
|
jpayne@69
|
2674 result.append((charptr_to_str(auxtag), value, chr(auxtype)))
|
jpayne@69
|
2675 else:
|
jpayne@69
|
2676 result.append((charptr_to_str(auxtag), value))
|
jpayne@69
|
2677
|
jpayne@69
|
2678 return result
|
jpayne@69
|
2679
|
jpayne@69
|
2680 def set_tags(self, tags):
|
jpayne@69
|
2681 """sets the fields in the optional alignment section with
|
jpayne@69
|
2682 a list of (tag, value) tuples.
|
jpayne@69
|
2683
|
jpayne@69
|
2684 The value type of the values is determined from the
|
jpayne@69
|
2685 python type. Optionally, a type may be given explicitly as
|
jpayne@69
|
2686 a third value in the tuple, For example:
|
jpayne@69
|
2687
|
jpayne@69
|
2688 x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
|
jpayne@69
|
2689
|
jpayne@69
|
2690 This method will not enforce the rule that the same tag may appear
|
jpayne@69
|
2691 only once in the optional alignment section.
|
jpayne@69
|
2692 """
|
jpayne@69
|
2693
|
jpayne@69
|
2694 cdef bam1_t * src
|
jpayne@69
|
2695 cdef uint8_t * s
|
jpayne@69
|
2696 cdef char * temp
|
jpayne@69
|
2697 cdef int new_size = 0
|
jpayne@69
|
2698 cdef int old_size
|
jpayne@69
|
2699 src = self._delegate
|
jpayne@69
|
2700
|
jpayne@69
|
2701 # convert and pack the data
|
jpayne@69
|
2702 if tags is not None and len(tags) > 0:
|
jpayne@69
|
2703 fmt, args = pack_tags(tags)
|
jpayne@69
|
2704 new_size = struct.calcsize(fmt)
|
jpayne@69
|
2705 buffer = ctypes.create_string_buffer(new_size)
|
jpayne@69
|
2706 struct.pack_into(fmt,
|
jpayne@69
|
2707 buffer,
|
jpayne@69
|
2708 0,
|
jpayne@69
|
2709 *args)
|
jpayne@69
|
2710
|
jpayne@69
|
2711
|
jpayne@69
|
2712 # delete the old data and allocate new space.
|
jpayne@69
|
2713 # If total_size == 0, the aux field will be
|
jpayne@69
|
2714 # empty
|
jpayne@69
|
2715 old_size = pysam_bam_get_l_aux(src)
|
jpayne@69
|
2716 cdef bam1_t * retval = pysam_bam_update(src,
|
jpayne@69
|
2717 old_size,
|
jpayne@69
|
2718 new_size,
|
jpayne@69
|
2719 pysam_bam_get_aux(src))
|
jpayne@69
|
2720 if retval == NULL:
|
jpayne@69
|
2721 raise MemoryError("could not allocate memory")
|
jpayne@69
|
2722
|
jpayne@69
|
2723 # copy data only if there is any
|
jpayne@69
|
2724 if new_size > 0:
|
jpayne@69
|
2725
|
jpayne@69
|
2726 # get location of new data
|
jpayne@69
|
2727 s = pysam_bam_get_aux(src)
|
jpayne@69
|
2728
|
jpayne@69
|
2729 # check if there is direct path from buffer.raw to tmp
|
jpayne@69
|
2730 p = buffer.raw
|
jpayne@69
|
2731 # create handle to make sure buffer stays alive long
|
jpayne@69
|
2732 # enough for memcpy, see issue 129
|
jpayne@69
|
2733 temp = p
|
jpayne@69
|
2734 memcpy(s, temp, new_size)
|
jpayne@69
|
2735
|
jpayne@69
|
2736
|
jpayne@69
|
2737 ########################################################
|
jpayne@69
|
2738 # Compatibility Accessors
|
jpayne@69
|
2739 # Functions, properties for compatibility with pysam < 0.8
|
jpayne@69
|
2740 #
|
jpayne@69
|
2741 # Several options
|
jpayne@69
|
2742 # change the factory functions according to API
|
jpayne@69
|
2743 # * requires code changes throughout, incl passing
|
jpayne@69
|
2744 # handles to factory functions
|
jpayne@69
|
2745 # subclass functions and add attributes at runtime
|
jpayne@69
|
2746 # e.g.: AlignedSegments.qname = AlignedSegments.query_name
|
jpayne@69
|
2747 # * will slow down the default interface
|
jpayne@69
|
2748 # explicit declaration of getters/setters
|
jpayne@69
|
2749 ########################################################
|
jpayne@69
|
2750 property qname:
|
jpayne@69
|
2751 """deprecated, use :attr:`query_name` instead."""
|
jpayne@69
|
2752 def __get__(self): return self.query_name
|
jpayne@69
|
2753 def __set__(self, v): self.query_name = v
|
jpayne@69
|
2754 property tid:
|
jpayne@69
|
2755 """deprecated, use :attr:`reference_id` instead."""
|
jpayne@69
|
2756 def __get__(self): return self.reference_id
|
jpayne@69
|
2757 def __set__(self, v): self.reference_id = v
|
jpayne@69
|
2758 property pos:
|
jpayne@69
|
2759 """deprecated, use :attr:`reference_start` instead."""
|
jpayne@69
|
2760 def __get__(self): return self.reference_start
|
jpayne@69
|
2761 def __set__(self, v): self.reference_start = v
|
jpayne@69
|
2762 property mapq:
|
jpayne@69
|
2763 """deprecated, use :attr:`mapping_quality` instead."""
|
jpayne@69
|
2764 def __get__(self): return self.mapping_quality
|
jpayne@69
|
2765 def __set__(self, v): self.mapping_quality = v
|
jpayne@69
|
2766 property rnext:
|
jpayne@69
|
2767 """deprecated, use :attr:`next_reference_id` instead."""
|
jpayne@69
|
2768 def __get__(self): return self.next_reference_id
|
jpayne@69
|
2769 def __set__(self, v): self.next_reference_id = v
|
jpayne@69
|
2770 property pnext:
|
jpayne@69
|
2771 """deprecated, use :attr:`next_reference_start` instead."""
|
jpayne@69
|
2772 def __get__(self):
|
jpayne@69
|
2773 return self.next_reference_start
|
jpayne@69
|
2774 def __set__(self, v):
|
jpayne@69
|
2775 self.next_reference_start = v
|
jpayne@69
|
2776 property cigar:
|
jpayne@69
|
2777 """deprecated, use :attr:`cigarstring` or :attr:`cigartuples` instead."""
|
jpayne@69
|
2778 def __get__(self):
|
jpayne@69
|
2779 r = self.cigartuples
|
jpayne@69
|
2780 if r is None:
|
jpayne@69
|
2781 r = []
|
jpayne@69
|
2782 return r
|
jpayne@69
|
2783 def __set__(self, v): self.cigartuples = v
|
jpayne@69
|
2784 property tlen:
|
jpayne@69
|
2785 """deprecated, use :attr:`template_length` instead."""
|
jpayne@69
|
2786 def __get__(self):
|
jpayne@69
|
2787 return self.template_length
|
jpayne@69
|
2788 def __set__(self, v):
|
jpayne@69
|
2789 self.template_length = v
|
jpayne@69
|
2790 property seq:
|
jpayne@69
|
2791 """deprecated, use :attr:`query_sequence` instead."""
|
jpayne@69
|
2792 def __get__(self):
|
jpayne@69
|
2793 return self.query_sequence
|
jpayne@69
|
2794 def __set__(self, v):
|
jpayne@69
|
2795 self.query_sequence = v
|
jpayne@69
|
2796 property qual:
|
jpayne@69
|
2797 """deprecated, use :attr:`query_qualities` instead."""
|
jpayne@69
|
2798 def __get__(self):
|
jpayne@69
|
2799 return array_to_qualitystring(self.query_qualities)
|
jpayne@69
|
2800 def __set__(self, v):
|
jpayne@69
|
2801 self.query_qualities = qualitystring_to_array(v)
|
jpayne@69
|
2802 property alen:
|
jpayne@69
|
2803 """deprecated, use :attr:`reference_length` instead."""
|
jpayne@69
|
2804 def __get__(self):
|
jpayne@69
|
2805 return self.reference_length
|
jpayne@69
|
2806 def __set__(self, v):
|
jpayne@69
|
2807 self.reference_length = v
|
jpayne@69
|
2808 property aend:
|
jpayne@69
|
2809 """deprecated, use :attr:`reference_end` instead."""
|
jpayne@69
|
2810 def __get__(self):
|
jpayne@69
|
2811 return self.reference_end
|
jpayne@69
|
2812 def __set__(self, v):
|
jpayne@69
|
2813 self.reference_end = v
|
jpayne@69
|
2814 property rlen:
|
jpayne@69
|
2815 """deprecated, use :attr:`query_length` instead."""
|
jpayne@69
|
2816 def __get__(self):
|
jpayne@69
|
2817 return self.query_length
|
jpayne@69
|
2818 def __set__(self, v):
|
jpayne@69
|
2819 self.query_length = v
|
jpayne@69
|
2820 property query:
|
jpayne@69
|
2821 """deprecated, use :attr:`query_alignment_sequence`
|
jpayne@69
|
2822 instead."""
|
jpayne@69
|
2823 def __get__(self):
|
jpayne@69
|
2824 return self.query_alignment_sequence
|
jpayne@69
|
2825 def __set__(self, v):
|
jpayne@69
|
2826 self.query_alignment_sequence = v
|
jpayne@69
|
2827 property qqual:
|
jpayne@69
|
2828 """deprecated, use :attr:`query_alignment_qualities`
|
jpayne@69
|
2829 instead."""
|
jpayne@69
|
2830 def __get__(self):
|
jpayne@69
|
2831 return array_to_qualitystring(self.query_alignment_qualities)
|
jpayne@69
|
2832 def __set__(self, v):
|
jpayne@69
|
2833 self.query_alignment_qualities = qualitystring_to_array(v)
|
jpayne@69
|
2834 property qstart:
|
jpayne@69
|
2835 """deprecated, use :attr:`query_alignment_start` instead."""
|
jpayne@69
|
2836 def __get__(self):
|
jpayne@69
|
2837 return self.query_alignment_start
|
jpayne@69
|
2838 def __set__(self, v):
|
jpayne@69
|
2839 self.query_alignment_start = v
|
jpayne@69
|
2840 property qend:
|
jpayne@69
|
2841 """deprecated, use :attr:`query_alignment_end` instead."""
|
jpayne@69
|
2842 def __get__(self):
|
jpayne@69
|
2843 return self.query_alignment_end
|
jpayne@69
|
2844 def __set__(self, v):
|
jpayne@69
|
2845 self.query_alignment_end = v
|
jpayne@69
|
2846 property qlen:
|
jpayne@69
|
2847 """deprecated, use :attr:`query_alignment_length`
|
jpayne@69
|
2848 instead."""
|
jpayne@69
|
2849 def __get__(self):
|
jpayne@69
|
2850 return self.query_alignment_length
|
jpayne@69
|
2851 def __set__(self, v):
|
jpayne@69
|
2852 self.query_alignment_length = v
|
jpayne@69
|
2853 property mrnm:
|
jpayne@69
|
2854 """deprecated, use :attr:`next_reference_id` instead."""
|
jpayne@69
|
2855 def __get__(self):
|
jpayne@69
|
2856 return self.next_reference_id
|
jpayne@69
|
2857 def __set__(self, v):
|
jpayne@69
|
2858 self.next_reference_id = v
|
jpayne@69
|
2859 property mpos:
|
jpayne@69
|
2860 """deprecated, use :attr:`next_reference_start`
|
jpayne@69
|
2861 instead."""
|
jpayne@69
|
2862 def __get__(self):
|
jpayne@69
|
2863 return self.next_reference_start
|
jpayne@69
|
2864 def __set__(self, v):
|
jpayne@69
|
2865 self.next_reference_start = v
|
jpayne@69
|
2866 property rname:
|
jpayne@69
|
2867 """deprecated, use :attr:`reference_id` instead."""
|
jpayne@69
|
2868 def __get__(self):
|
jpayne@69
|
2869 return self.reference_id
|
jpayne@69
|
2870 def __set__(self, v):
|
jpayne@69
|
2871 self.reference_id = v
|
jpayne@69
|
2872 property isize:
|
jpayne@69
|
2873 """deprecated, use :attr:`template_length` instead."""
|
jpayne@69
|
2874 def __get__(self):
|
jpayne@69
|
2875 return self.template_length
|
jpayne@69
|
2876 def __set__(self, v):
|
jpayne@69
|
2877 self.template_length = v
|
jpayne@69
|
2878 property blocks:
|
jpayne@69
|
2879 """deprecated, use :meth:`get_blocks()` instead."""
|
jpayne@69
|
2880 def __get__(self):
|
jpayne@69
|
2881 return self.get_blocks()
|
jpayne@69
|
2882 property aligned_pairs:
|
jpayne@69
|
2883 """deprecated, use :meth:`get_aligned_pairs()` instead."""
|
jpayne@69
|
2884 def __get__(self):
|
jpayne@69
|
2885 return self.get_aligned_pairs()
|
jpayne@69
|
2886 property inferred_length:
|
jpayne@69
|
2887 """deprecated, use :meth:`infer_query_length()` instead."""
|
jpayne@69
|
2888 def __get__(self):
|
jpayne@69
|
2889 return self.infer_query_length()
|
jpayne@69
|
2890 property positions:
|
jpayne@69
|
2891 """deprecated, use :meth:`get_reference_positions()` instead."""
|
jpayne@69
|
2892 def __get__(self):
|
jpayne@69
|
2893 return self.get_reference_positions()
|
jpayne@69
|
2894 property tags:
|
jpayne@69
|
2895 """deprecated, use :meth:`get_tags()` instead."""
|
jpayne@69
|
2896 def __get__(self):
|
jpayne@69
|
2897 return self.get_tags()
|
jpayne@69
|
2898 def __set__(self, tags):
|
jpayne@69
|
2899 self.set_tags(tags)
|
jpayne@69
|
2900 def overlap(self):
|
jpayne@69
|
2901 """deprecated, use :meth:`get_overlap()` instead."""
|
jpayne@69
|
2902 return self.get_overlap()
|
jpayne@69
|
2903 def opt(self, tag):
|
jpayne@69
|
2904 """deprecated, use :meth:`get_tag()` instead."""
|
jpayne@69
|
2905 return self.get_tag(tag)
|
jpayne@69
|
2906 def setTag(self, tag, value, value_type=None, replace=True):
|
jpayne@69
|
2907 """deprecated, use :meth:`set_tag()` instead."""
|
jpayne@69
|
2908 return self.set_tag(tag, value, value_type, replace)
|
jpayne@69
|
2909
|
jpayne@69
|
2910
|
jpayne@69
|
2911 cdef class PileupColumn:
|
jpayne@69
|
2912 '''A pileup of reads at a particular reference sequence position
|
jpayne@69
|
2913 (:term:`column`). A pileup column contains all the reads that map
|
jpayne@69
|
2914 to a certain target base.
|
jpayne@69
|
2915
|
jpayne@69
|
2916 This class is a proxy for results returned by the samtools pileup
|
jpayne@69
|
2917 engine. If the underlying engine iterator advances, the results
|
jpayne@69
|
2918 of this column will change.
|
jpayne@69
|
2919 '''
|
jpayne@69
|
2920 def __init__(self):
|
jpayne@69
|
2921 raise TypeError("this class cannot be instantiated from Python")
|
jpayne@69
|
2922
|
jpayne@69
|
2923 def __str__(self):
|
jpayne@69
|
2924 return "\t".join(map(str,
|
jpayne@69
|
2925 (self.reference_id,
|
jpayne@69
|
2926 self.reference_pos,
|
jpayne@69
|
2927 self.nsegments))) +\
|
jpayne@69
|
2928 "\n" +\
|
jpayne@69
|
2929 "\n".join(map(str, self.pileups))
|
jpayne@69
|
2930
|
jpayne@69
|
2931 def __dealloc__(self):
|
jpayne@69
|
2932 free(self.buf.s)
|
jpayne@69
|
2933
|
jpayne@69
|
2934 def set_min_base_quality(self, min_base_quality):
|
jpayne@69
|
2935 """set the minimum base quality for this pileup column.
|
jpayne@69
|
2936 """
|
jpayne@69
|
2937 self.min_base_quality = min_base_quality
|
jpayne@69
|
2938
|
jpayne@69
|
2939 def __len__(self):
|
jpayne@69
|
2940 """return number of reads aligned to this column.
|
jpayne@69
|
2941
|
jpayne@69
|
2942 see :meth:`get_num_aligned`
|
jpayne@69
|
2943 """
|
jpayne@69
|
2944 return self.get_num_aligned()
|
jpayne@69
|
2945
|
jpayne@69
|
2946 property reference_id:
|
jpayne@69
|
2947 '''the reference sequence number as defined in the header'''
|
jpayne@69
|
2948 def __get__(self):
|
jpayne@69
|
2949 return self.tid
|
jpayne@69
|
2950
|
jpayne@69
|
2951 property reference_name:
|
jpayne@69
|
2952 """:term:`reference` name (None if no AlignmentFile is associated)"""
|
jpayne@69
|
2953 def __get__(self):
|
jpayne@69
|
2954 if self.header is not None:
|
jpayne@69
|
2955 return self.header.get_reference_name(self.tid)
|
jpayne@69
|
2956 return None
|
jpayne@69
|
2957
|
jpayne@69
|
2958 property nsegments:
|
jpayne@69
|
2959 '''number of reads mapping to this column.
|
jpayne@69
|
2960
|
jpayne@69
|
2961 Note that this number ignores the base quality filter.'''
|
jpayne@69
|
2962 def __get__(self):
|
jpayne@69
|
2963 return self.n_pu
|
jpayne@69
|
2964 def __set__(self, n):
|
jpayne@69
|
2965 self.n_pu = n
|
jpayne@69
|
2966
|
jpayne@69
|
2967 property reference_pos:
|
jpayne@69
|
2968 '''the position in the reference sequence (0-based).'''
|
jpayne@69
|
2969 def __get__(self):
|
jpayne@69
|
2970 return self.pos
|
jpayne@69
|
2971
|
jpayne@69
|
2972 property pileups:
|
jpayne@69
|
2973 '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
|
jpayne@69
|
2974 def __get__(self):
|
jpayne@69
|
2975 if self.plp == NULL or self.plp[0] == NULL:
|
jpayne@69
|
2976 raise ValueError("PileupColumn accessed after iterator finished")
|
jpayne@69
|
2977
|
jpayne@69
|
2978 cdef int x
|
jpayne@69
|
2979 cdef const bam_pileup1_t * p = NULL
|
jpayne@69
|
2980 pileups = []
|
jpayne@69
|
2981
|
jpayne@69
|
2982 # warning: there could be problems if self.n and self.buf are
|
jpayne@69
|
2983 # out of sync.
|
jpayne@69
|
2984 for x from 0 <= x < self.n_pu:
|
jpayne@69
|
2985 p = &(self.plp[0][x])
|
jpayne@69
|
2986 if p == NULL:
|
jpayne@69
|
2987 raise ValueError(
|
jpayne@69
|
2988 "pileup buffer out of sync - most likely use of iterator "
|
jpayne@69
|
2989 "outside loop")
|
jpayne@69
|
2990 if pileup_base_qual_skip(p, self.min_base_quality):
|
jpayne@69
|
2991 continue
|
jpayne@69
|
2992 pileups.append(makePileupRead(p, self.header))
|
jpayne@69
|
2993 return pileups
|
jpayne@69
|
2994
|
jpayne@69
|
2995 ########################################################
|
jpayne@69
|
2996 # Compatibility Accessors
|
jpayne@69
|
2997 # Functions, properties for compatibility with pysam < 0.8
|
jpayne@69
|
2998 ########################################################
|
jpayne@69
|
2999 property pos:
|
jpayne@69
|
3000 """deprecated, use :attr:`reference_pos` instead."""
|
jpayne@69
|
3001 def __get__(self):
|
jpayne@69
|
3002 return self.reference_pos
|
jpayne@69
|
3003 def __set__(self, v):
|
jpayne@69
|
3004 self.reference_pos = v
|
jpayne@69
|
3005
|
jpayne@69
|
3006 property tid:
|
jpayne@69
|
3007 """deprecated, use :attr:`reference_id` instead."""
|
jpayne@69
|
3008 def __get__(self):
|
jpayne@69
|
3009 return self.reference_id
|
jpayne@69
|
3010 def __set__(self, v):
|
jpayne@69
|
3011 self.reference_id = v
|
jpayne@69
|
3012
|
jpayne@69
|
3013 property n:
|
jpayne@69
|
3014 """deprecated, use :attr:`nsegments` instead."""
|
jpayne@69
|
3015 def __get__(self):
|
jpayne@69
|
3016 return self.nsegments
|
jpayne@69
|
3017 def __set__(self, v):
|
jpayne@69
|
3018 self.nsegments = v
|
jpayne@69
|
3019
|
jpayne@69
|
3020 def get_num_aligned(self):
|
jpayne@69
|
3021 """return number of aligned bases at pileup column position.
|
jpayne@69
|
3022
|
jpayne@69
|
3023 This method applies a base quality filter and the number is
|
jpayne@69
|
3024 equal to the size of :meth:`get_query_sequences`,
|
jpayne@69
|
3025 :meth:`get_mapping_qualities`, etc.
|
jpayne@69
|
3026
|
jpayne@69
|
3027 """
|
jpayne@69
|
3028 cdef uint32_t x = 0
|
jpayne@69
|
3029 cdef uint32_t c = 0
|
jpayne@69
|
3030 cdef uint32_t cnt = 0
|
jpayne@69
|
3031 cdef const bam_pileup1_t * p = NULL
|
jpayne@69
|
3032 if self.plp == NULL or self.plp[0] == NULL:
|
jpayne@69
|
3033 raise ValueError("PileupColumn accessed after iterator finished")
|
jpayne@69
|
3034
|
jpayne@69
|
3035 for x from 0 <= x < self.n_pu:
|
jpayne@69
|
3036 p = &(self.plp[0][x])
|
jpayne@69
|
3037 if p == NULL:
|
jpayne@69
|
3038 raise ValueError(
|
jpayne@69
|
3039 "pileup buffer out of sync - most likely use of iterator "
|
jpayne@69
|
3040 "outside loop")
|
jpayne@69
|
3041 if pileup_base_qual_skip(p, self.min_base_quality):
|
jpayne@69
|
3042 continue
|
jpayne@69
|
3043 cnt += 1
|
jpayne@69
|
3044 return cnt
|
jpayne@69
|
3045
|
jpayne@69
|
3046 def get_query_sequences(self, bint mark_matches=False, bint mark_ends=False, bint add_indels=False):
|
jpayne@69
|
3047 """query bases/sequences at pileup column position.
|
jpayne@69
|
3048
|
jpayne@69
|
3049 Optionally, the bases/sequences can be annotated according to the samtools
|
jpayne@69
|
3050 mpileup format. This is the format description from the samtools mpileup tool::
|
jpayne@69
|
3051
|
jpayne@69
|
3052 Information on match, mismatch, indel, strand, mapping
|
jpayne@69
|
3053 quality and start and end of a read are all encoded at the
|
jpayne@69
|
3054 read base column. At this column, a dot stands for a match
|
jpayne@69
|
3055 to the reference base on the forward strand, a comma for a
|
jpayne@69
|
3056 match on the reverse strand, a '>' or '<' for a reference
|
jpayne@69
|
3057 skip, `ACGTN' for a mismatch on the forward strand and
|
jpayne@69
|
3058 `acgtn' for a mismatch on the reverse strand. A pattern
|
jpayne@69
|
3059 `\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion
|
jpayne@69
|
3060 between this reference position and the next reference
|
jpayne@69
|
3061 position. The length of the insertion is given by the
|
jpayne@69
|
3062 integer in the pattern, followed by the inserted
|
jpayne@69
|
3063 sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
|
jpayne@69
|
3064 represents a deletion from the reference. The deleted bases
|
jpayne@69
|
3065 will be presented as `*' in the following lines. Also at
|
jpayne@69
|
3066 the read base column, a symbol `^' marks the start of a
|
jpayne@69
|
3067 read. The ASCII of the character following `^' minus 33
|
jpayne@69
|
3068 gives the mapping quality. A symbol `$' marks the end of a
|
jpayne@69
|
3069 read segment
|
jpayne@69
|
3070
|
jpayne@69
|
3071 To reproduce samtools mpileup format, set all of mark_matches,
|
jpayne@69
|
3072 mark_ends and add_indels to True.
|
jpayne@69
|
3073
|
jpayne@69
|
3074 Parameters
|
jpayne@69
|
3075 ----------
|
jpayne@69
|
3076
|
jpayne@69
|
3077 mark_matches: bool
|
jpayne@69
|
3078
|
jpayne@69
|
3079 If True, output bases matching the reference as "." or ","
|
jpayne@69
|
3080 for forward and reverse strand, respectively. This mark
|
jpayne@69
|
3081 requires the reference sequence. If no reference is
|
jpayne@69
|
3082 present, this option is ignored.
|
jpayne@69
|
3083
|
jpayne@69
|
3084 mark_ends : bool
|
jpayne@69
|
3085
|
jpayne@69
|
3086 If True, add markers "^" and "$" for read start and end, respectively.
|
jpayne@69
|
3087
|
jpayne@69
|
3088 add_indels : bool
|
jpayne@69
|
3089
|
jpayne@69
|
3090 If True, add bases for bases inserted into or skipped from the
|
jpayne@69
|
3091 reference. The latter requires a reference sequence file to have
|
jpayne@69
|
3092 been given, e.g. via `pileup(fastafile = ...)`. If no reference
|
jpayne@69
|
3093 sequence is available, skipped bases are represented as 'N's.
|
jpayne@69
|
3094
|
jpayne@69
|
3095 Returns
|
jpayne@69
|
3096 -------
|
jpayne@69
|
3097
|
jpayne@69
|
3098 a list of bases/sequences per read at pileup column position. : list
|
jpayne@69
|
3099
|
jpayne@69
|
3100 """
|
jpayne@69
|
3101 cdef uint32_t x = 0
|
jpayne@69
|
3102 cdef uint32_t j = 0
|
jpayne@69
|
3103 cdef uint32_t c = 0
|
jpayne@69
|
3104 cdef uint8_t cc = 0
|
jpayne@69
|
3105 cdef uint8_t rb = 0
|
jpayne@69
|
3106 cdef kstring_t * buf = &self.buf
|
jpayne@69
|
3107 cdef const bam_pileup1_t * p = NULL
|
jpayne@69
|
3108
|
jpayne@69
|
3109 if self.plp == NULL or self.plp[0] == NULL:
|
jpayne@69
|
3110 raise ValueError("PileupColumn accessed after iterator finished")
|
jpayne@69
|
3111
|
jpayne@69
|
3112 buf.l = 0
|
jpayne@69
|
3113
|
jpayne@69
|
3114 # todo: reference sequence to count matches/mismatches
|
jpayne@69
|
3115 # todo: convert assertions to exceptions
|
jpayne@69
|
3116 for x from 0 <= x < self.n_pu:
|
jpayne@69
|
3117 p = &(self.plp[0][x])
|
jpayne@69
|
3118 if p == NULL:
|
jpayne@69
|
3119 raise ValueError(
|
jpayne@69
|
3120 "pileup buffer out of sync - most likely use of iterator "
|
jpayne@69
|
3121 "outside loop")
|
jpayne@69
|
3122 if pileup_base_qual_skip(p, self.min_base_quality):
|
jpayne@69
|
3123 continue
|
jpayne@69
|
3124 # see samtools pileup_seq
|
jpayne@69
|
3125 if mark_ends and p.is_head:
|
jpayne@69
|
3126 kputc(b'^', buf)
|
jpayne@69
|
3127
|
jpayne@69
|
3128 if p.b.core.qual > 93:
|
jpayne@69
|
3129 kputc(126, buf)
|
jpayne@69
|
3130 else:
|
jpayne@69
|
3131 kputc(p.b.core.qual + 33, buf)
|
jpayne@69
|
3132 if not p.is_del:
|
jpayne@69
|
3133 if p.qpos < p.b.core.l_qseq:
|
jpayne@69
|
3134 cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
|
jpayne@69
|
3135 else:
|
jpayne@69
|
3136 cc = b'N'
|
jpayne@69
|
3137
|
jpayne@69
|
3138 if mark_matches and self.reference_sequence != NULL:
|
jpayne@69
|
3139 rb = self.reference_sequence[self.reference_pos]
|
jpayne@69
|
3140 if seq_nt16_table[cc] == seq_nt16_table[rb]:
|
jpayne@69
|
3141 cc = b'='
|
jpayne@69
|
3142 kputc(strand_mark_char(cc, p.b), buf)
|
jpayne@69
|
3143 elif add_indels:
|
jpayne@69
|
3144 if p.is_refskip:
|
jpayne@69
|
3145 if bam_is_rev(p.b):
|
jpayne@69
|
3146 kputc(b'<', buf)
|
jpayne@69
|
3147 else:
|
jpayne@69
|
3148 kputc(b'>', buf)
|
jpayne@69
|
3149 else:
|
jpayne@69
|
3150 kputc(b'*', buf)
|
jpayne@69
|
3151 if add_indels:
|
jpayne@69
|
3152 if p.indel > 0:
|
jpayne@69
|
3153 kputc(b'+', buf)
|
jpayne@69
|
3154 kputw(p.indel, buf)
|
jpayne@69
|
3155 for j from 1 <= j <= p.indel:
|
jpayne@69
|
3156 cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
|
jpayne@69
|
3157 kputc(strand_mark_char(cc, p.b), buf)
|
jpayne@69
|
3158 elif p.indel < 0:
|
jpayne@69
|
3159 kputc(b'-', buf)
|
jpayne@69
|
3160 kputw(-p.indel, buf)
|
jpayne@69
|
3161 for j from 1 <= j <= -p.indel:
|
jpayne@69
|
3162 # TODO: out-of-range check here?
|
jpayne@69
|
3163 if self.reference_sequence == NULL:
|
jpayne@69
|
3164 cc = b'N'
|
jpayne@69
|
3165 else:
|
jpayne@69
|
3166 cc = self.reference_sequence[self.reference_pos + j]
|
jpayne@69
|
3167 kputc(strand_mark_char(cc, p.b), buf)
|
jpayne@69
|
3168 if mark_ends and p.is_tail:
|
jpayne@69
|
3169 kputc(b'$', buf)
|
jpayne@69
|
3170
|
jpayne@69
|
3171 kputc(b':', buf)
|
jpayne@69
|
3172
|
jpayne@69
|
3173 if buf.l == 0:
|
jpayne@69
|
3174 # could be zero if all qualities are too low
|
jpayne@69
|
3175 return ""
|
jpayne@69
|
3176 else:
|
jpayne@69
|
3177 # quicker to ensemble all and split than to encode all separately.
|
jpayne@69
|
3178 # ignore last ":"
|
jpayne@69
|
3179 return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":")
|
jpayne@69
|
3180
|
jpayne@69
|
3181 def get_query_qualities(self):
|
jpayne@69
|
3182 """query base quality scores at pileup column position.
|
jpayne@69
|
3183
|
jpayne@69
|
3184 Returns
|
jpayne@69
|
3185 -------
|
jpayne@69
|
3186
|
jpayne@69
|
3187 a list of quality scores : list
|
jpayne@69
|
3188 """
|
jpayne@69
|
3189 cdef uint32_t x = 0
|
jpayne@69
|
3190 cdef const bam_pileup1_t * p = NULL
|
jpayne@69
|
3191 cdef uint32_t c = 0
|
jpayne@69
|
3192 result = []
|
jpayne@69
|
3193 for x from 0 <= x < self.n_pu:
|
jpayne@69
|
3194 p = &(self.plp[0][x])
|
jpayne@69
|
3195 if p == NULL:
|
jpayne@69
|
3196 raise ValueError(
|
jpayne@69
|
3197 "pileup buffer out of sync - most likely use of iterator "
|
jpayne@69
|
3198 "outside loop")
|
jpayne@69
|
3199
|
jpayne@69
|
3200 if p.qpos < p.b.core.l_qseq:
|
jpayne@69
|
3201 c = bam_get_qual(p.b)[p.qpos]
|
jpayne@69
|
3202 else:
|
jpayne@69
|
3203 c = 0
|
jpayne@69
|
3204 if c < self.min_base_quality:
|
jpayne@69
|
3205 continue
|
jpayne@69
|
3206 result.append(c)
|
jpayne@69
|
3207 return result
|
jpayne@69
|
3208
|
jpayne@69
|
3209 def get_mapping_qualities(self):
|
jpayne@69
|
3210 """query mapping quality scores at pileup column position.
|
jpayne@69
|
3211
|
jpayne@69
|
3212 Returns
|
jpayne@69
|
3213 -------
|
jpayne@69
|
3214
|
jpayne@69
|
3215 a list of quality scores : list
|
jpayne@69
|
3216 """
|
jpayne@69
|
3217 if self.plp == NULL or self.plp[0] == NULL:
|
jpayne@69
|
3218 raise ValueError("PileupColumn accessed after iterator finished")
|
jpayne@69
|
3219
|
jpayne@69
|
3220 cdef uint32_t x = 0
|
jpayne@69
|
3221 cdef const bam_pileup1_t * p = NULL
|
jpayne@69
|
3222 result = []
|
jpayne@69
|
3223 for x from 0 <= x < self.n_pu:
|
jpayne@69
|
3224 p = &(self.plp[0][x])
|
jpayne@69
|
3225 if p == NULL:
|
jpayne@69
|
3226 raise ValueError(
|
jpayne@69
|
3227 "pileup buffer out of sync - most likely use of iterator "
|
jpayne@69
|
3228 "outside loop")
|
jpayne@69
|
3229
|
jpayne@69
|
3230 if pileup_base_qual_skip(p, self.min_base_quality):
|
jpayne@69
|
3231 continue
|
jpayne@69
|
3232 result.append(p.b.core.qual)
|
jpayne@69
|
3233 return result
|
jpayne@69
|
3234
|
jpayne@69
|
3235 def get_query_positions(self):
|
jpayne@69
|
3236 """positions in read at pileup column position.
|
jpayne@69
|
3237
|
jpayne@69
|
3238 Returns
|
jpayne@69
|
3239 -------
|
jpayne@69
|
3240
|
jpayne@69
|
3241 a list of read positions : list
|
jpayne@69
|
3242 """
|
jpayne@69
|
3243 if self.plp == NULL or self.plp[0] == NULL:
|
jpayne@69
|
3244 raise ValueError("PileupColumn accessed after iterator finished")
|
jpayne@69
|
3245
|
jpayne@69
|
3246 cdef uint32_t x = 0
|
jpayne@69
|
3247 cdef const bam_pileup1_t * p = NULL
|
jpayne@69
|
3248 result = []
|
jpayne@69
|
3249 for x from 0 <= x < self.n_pu:
|
jpayne@69
|
3250 p = &(self.plp[0][x])
|
jpayne@69
|
3251 if p == NULL:
|
jpayne@69
|
3252 raise ValueError(
|
jpayne@69
|
3253 "pileup buffer out of sync - most likely use of iterator "
|
jpayne@69
|
3254 "outside loop")
|
jpayne@69
|
3255
|
jpayne@69
|
3256 if pileup_base_qual_skip(p, self.min_base_quality):
|
jpayne@69
|
3257 continue
|
jpayne@69
|
3258 result.append(p.qpos)
|
jpayne@69
|
3259 return result
|
jpayne@69
|
3260
|
jpayne@69
|
3261 def get_query_names(self):
|
jpayne@69
|
3262 """query/read names aligned at pileup column position.
|
jpayne@69
|
3263
|
jpayne@69
|
3264 Returns
|
jpayne@69
|
3265 -------
|
jpayne@69
|
3266
|
jpayne@69
|
3267 a list of query names at pileup column position. : list
|
jpayne@69
|
3268 """
|
jpayne@69
|
3269 if self.plp == NULL or self.plp[0] == NULL:
|
jpayne@69
|
3270 raise ValueError("PileupColumn accessed after iterator finished")
|
jpayne@69
|
3271
|
jpayne@69
|
3272 cdef uint32_t x = 0
|
jpayne@69
|
3273 cdef const bam_pileup1_t * p = NULL
|
jpayne@69
|
3274 result = []
|
jpayne@69
|
3275 for x from 0 <= x < self.n_pu:
|
jpayne@69
|
3276 p = &(self.plp[0][x])
|
jpayne@69
|
3277 if p == NULL:
|
jpayne@69
|
3278 raise ValueError(
|
jpayne@69
|
3279 "pileup buffer out of sync - most likely use of iterator "
|
jpayne@69
|
3280 "outside loop")
|
jpayne@69
|
3281
|
jpayne@69
|
3282 if pileup_base_qual_skip(p, self.min_base_quality):
|
jpayne@69
|
3283 continue
|
jpayne@69
|
3284 result.append(charptr_to_str(pysam_bam_get_qname(p.b)))
|
jpayne@69
|
3285 return result
|
jpayne@69
|
3286
|
jpayne@69
|
3287
|
jpayne@69
|
3288 cdef class PileupRead:
|
jpayne@69
|
3289 '''Representation of a read aligned to a particular position in the
|
jpayne@69
|
3290 reference sequence.
|
jpayne@69
|
3291
|
jpayne@69
|
3292 '''
|
jpayne@69
|
3293
|
jpayne@69
|
3294 def __init__(self):
|
jpayne@69
|
3295 raise TypeError(
|
jpayne@69
|
3296 "this class cannot be instantiated from Python")
|
jpayne@69
|
3297
|
jpayne@69
|
3298 def __str__(self):
|
jpayne@69
|
3299 return "\t".join(
|
jpayne@69
|
3300 map(str,
|
jpayne@69
|
3301 (self.alignment, self.query_position,
|
jpayne@69
|
3302 self.indel, self.level,
|
jpayne@69
|
3303 self.is_del, self.is_head,
|
jpayne@69
|
3304 self.is_tail, self.is_refskip)))
|
jpayne@69
|
3305
|
jpayne@69
|
3306 property alignment:
|
jpayne@69
|
3307 """a :class:`pysam.AlignedSegment` object of the aligned read"""
|
jpayne@69
|
3308 def __get__(self):
|
jpayne@69
|
3309 return self._alignment
|
jpayne@69
|
3310
|
jpayne@69
|
3311 property query_position:
|
jpayne@69
|
3312 """position of the read base at the pileup site, 0-based.
|
jpayne@69
|
3313 None if :attr:`is_del` or :attr:`is_refskip` is set.
|
jpayne@69
|
3314
|
jpayne@69
|
3315 """
|
jpayne@69
|
3316 def __get__(self):
|
jpayne@69
|
3317 if self.is_del or self.is_refskip:
|
jpayne@69
|
3318 return None
|
jpayne@69
|
3319 else:
|
jpayne@69
|
3320 return self._qpos
|
jpayne@69
|
3321
|
jpayne@69
|
3322 property query_position_or_next:
|
jpayne@69
|
3323 """position of the read base at the pileup site, 0-based.
|
jpayne@69
|
3324
|
jpayne@69
|
3325 If the current position is a deletion, returns the next
|
jpayne@69
|
3326 aligned base.
|
jpayne@69
|
3327
|
jpayne@69
|
3328 """
|
jpayne@69
|
3329 def __get__(self):
|
jpayne@69
|
3330 return self._qpos
|
jpayne@69
|
3331
|
jpayne@69
|
3332 property indel:
|
jpayne@69
|
3333 """indel length for the position following the current pileup site.
|
jpayne@69
|
3334
|
jpayne@69
|
3335 This quantity peeks ahead to the next cigar operation in this
|
jpayne@69
|
3336 alignment. If the next operation is an insertion, indel will
|
jpayne@69
|
3337 be positive. If the next operation is a deletion, it will be
|
jpayne@69
|
3338 negation. 0 if the next operation is not an indel.
|
jpayne@69
|
3339
|
jpayne@69
|
3340 """
|
jpayne@69
|
3341 def __get__(self):
|
jpayne@69
|
3342 return self._indel
|
jpayne@69
|
3343
|
jpayne@69
|
3344 property level:
|
jpayne@69
|
3345 """the level of the read in the "viewer" mode. Note that this value
|
jpayne@69
|
3346 is currently not computed."""
|
jpayne@69
|
3347 def __get__(self):
|
jpayne@69
|
3348 return self._level
|
jpayne@69
|
3349
|
jpayne@69
|
3350 property is_del:
|
jpayne@69
|
3351 """1 iff the base on the padded read is a deletion"""
|
jpayne@69
|
3352 def __get__(self):
|
jpayne@69
|
3353 return self._is_del
|
jpayne@69
|
3354
|
jpayne@69
|
3355 property is_head:
|
jpayne@69
|
3356 """1 iff the base on the padded read is the left-most base."""
|
jpayne@69
|
3357 def __get__(self):
|
jpayne@69
|
3358 return self._is_head
|
jpayne@69
|
3359
|
jpayne@69
|
3360 property is_tail:
|
jpayne@69
|
3361 """1 iff the base on the padded read is the right-most base."""
|
jpayne@69
|
3362 def __get__(self):
|
jpayne@69
|
3363 return self._is_tail
|
jpayne@69
|
3364
|
jpayne@69
|
3365 property is_refskip:
|
jpayne@69
|
3366 """1 iff the base on the padded read is part of CIGAR N op."""
|
jpayne@69
|
3367 def __get__(self):
|
jpayne@69
|
3368 return self._is_refskip
|
jpayne@69
|
3369
|
jpayne@69
|
3370
|
jpayne@69
|
3371
|
jpayne@69
|
3372 cpdef enum CIGAR_OPS:
|
jpayne@69
|
3373 CMATCH = 0
|
jpayne@69
|
3374 CINS = 1
|
jpayne@69
|
3375 CDEL = 2
|
jpayne@69
|
3376 CREF_SKIP = 3
|
jpayne@69
|
3377 CSOFT_CLIP = 4
|
jpayne@69
|
3378 CHARD_CLIP = 5
|
jpayne@69
|
3379 CPAD = 6
|
jpayne@69
|
3380 CEQUAL = 7
|
jpayne@69
|
3381 CDIFF = 8
|
jpayne@69
|
3382 CBACK = 9
|
jpayne@69
|
3383
|
jpayne@69
|
3384
|
jpayne@69
|
3385 cpdef enum SAM_FLAGS:
|
jpayne@69
|
3386 # the read is paired in sequencing, no matter whether it is mapped in a pair
|
jpayne@69
|
3387 FPAIRED = 1
|
jpayne@69
|
3388 # the read is mapped in a proper pair
|
jpayne@69
|
3389 FPROPER_PAIR = 2
|
jpayne@69
|
3390 # the read itself is unmapped; conflictive with FPROPER_PAIR
|
jpayne@69
|
3391 FUNMAP = 4
|
jpayne@69
|
3392 # the mate is unmapped
|
jpayne@69
|
3393 FMUNMAP = 8
|
jpayne@69
|
3394 # the read is mapped to the reverse strand
|
jpayne@69
|
3395 FREVERSE = 16
|
jpayne@69
|
3396 # the mate is mapped to the reverse strand
|
jpayne@69
|
3397 FMREVERSE = 32
|
jpayne@69
|
3398 # this is read1
|
jpayne@69
|
3399 FREAD1 = 64
|
jpayne@69
|
3400 # this is read2
|
jpayne@69
|
3401 FREAD2 = 128
|
jpayne@69
|
3402 # not primary alignment
|
jpayne@69
|
3403 FSECONDARY = 256
|
jpayne@69
|
3404 # QC failure
|
jpayne@69
|
3405 FQCFAIL = 512
|
jpayne@69
|
3406 # optical or PCR duplicate
|
jpayne@69
|
3407 FDUP = 1024
|
jpayne@69
|
3408 # supplementary alignment
|
jpayne@69
|
3409 FSUPPLEMENTARY = 2048
|
jpayne@69
|
3410
|
jpayne@69
|
3411
|
jpayne@69
|
3412 __all__ = [
|
jpayne@69
|
3413 "AlignedSegment",
|
jpayne@69
|
3414 "PileupColumn",
|
jpayne@69
|
3415 "PileupRead",
|
jpayne@69
|
3416 "CMATCH",
|
jpayne@69
|
3417 "CINS",
|
jpayne@69
|
3418 "CDEL",
|
jpayne@69
|
3419 "CREF_SKIP",
|
jpayne@69
|
3420 "CSOFT_CLIP",
|
jpayne@69
|
3421 "CHARD_CLIP",
|
jpayne@69
|
3422 "CPAD",
|
jpayne@69
|
3423 "CEQUAL",
|
jpayne@69
|
3424 "CDIFF",
|
jpayne@69
|
3425 "CBACK",
|
jpayne@69
|
3426 "FPAIRED",
|
jpayne@69
|
3427 "FPROPER_PAIR",
|
jpayne@69
|
3428 "FUNMAP",
|
jpayne@69
|
3429 "FMUNMAP",
|
jpayne@69
|
3430 "FREVERSE",
|
jpayne@69
|
3431 "FMREVERSE",
|
jpayne@69
|
3432 "FREAD1",
|
jpayne@69
|
3433 "FREAD2",
|
jpayne@69
|
3434 "FSECONDARY",
|
jpayne@69
|
3435 "FQCFAIL",
|
jpayne@69
|
3436 "FDUP",
|
jpayne@69
|
3437 "FSUPPLEMENTARY",
|
jpayne@69
|
3438 "KEY_NAMES"]
|