Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 # cython: embedsignature=True | |
2 # cython: profile=True | |
3 ######################################################## | |
4 ######################################################## | |
5 # Cython wrapper for SAM/BAM/CRAM files based on htslib | |
6 ######################################################## | |
7 # The principal classes defined in this module are: | |
8 # | |
9 # class AlignmentFile read/write access to SAM/BAM/CRAM formatted files | |
10 # | |
11 # class AlignmentHeader manage SAM/BAM/CRAM header data | |
12 # | |
13 # class IndexedReads index a SAM/BAM/CRAM file by query name while keeping | |
14 # the original sort order intact | |
15 # | |
16 # Additionally this module defines numerous additional classes that | |
17 # are part of the internal API. These are: | |
18 # | |
19 # Various iterator classes to iterate over alignments in sequential | |
20 # (IteratorRow) or in a stacked fashion (IteratorColumn): | |
21 # | |
22 # class IteratorRow | |
23 # class IteratorRowRegion | |
24 # class IteratorRowHead | |
25 # class IteratorRowAll | |
26 # class IteratorRowAllRefs | |
27 # class IteratorRowSelection | |
28 # class IteratorColumn | |
29 # class IteratorColumnRegion | |
30 # class IteratorColumnAll | |
31 # class IteratorColumnAllRefs | |
32 # | |
33 ######################################################## | |
34 # | |
35 # The MIT License | |
36 # | |
37 # Copyright (c) 2015 Andreas Heger | |
38 # | |
39 # Permission is hereby granted, free of charge, to any person obtaining a | |
40 # copy of this software and associated documentation files (the "Software"), | |
41 # to deal in the Software without restriction, including without limitation | |
42 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
43 # and/or sell copies of the Software, and to permit persons to whom the | |
44 # Software is furnished to do so, subject to the following conditions: | |
45 # | |
46 # The above copyright notice and this permission notice shall be included in | |
47 # all copies or substantial portions of the Software. | |
48 # | |
49 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
50 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
51 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
52 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
53 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
54 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
55 # DEALINGS IN THE SOFTWARE. | |
56 # | |
57 ######################################################## | |
58 import os | |
59 import collections | |
60 try: | |
61 from collections.abc import Sequence, Mapping # noqa | |
62 except ImportError: | |
63 from collections import Sequence, Mapping # noqa | |
64 import re | |
65 import warnings | |
66 import array | |
67 from libc.errno cimport errno, EPIPE | |
68 from libc.string cimport strcmp, strpbrk, strerror | |
69 from libc.stdint cimport INT32_MAX | |
70 | |
71 from cpython cimport array as c_array | |
72 | |
73 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str | |
74 from pysam.libcutils cimport encode_filename, from_string_and_size | |
75 from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn | |
76 from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \ | |
77 HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL | |
78 | |
79 from io import StringIO | |
80 | |
81 cimport cython | |
82 | |
83 | |
__all__ = [
    "AlignmentFile",
    "AlignmentHeader",
    "IteratorRow",
    "IteratorColumn",
    "IndexedReads"]

# per-contig mapping statistics (contig name, mapped/unmapped/total read counts)
IndexStats = collections.namedtuple("IndexStats",
                                    ("contig",
                                     "mapped",
                                     "unmapped",
                                     "total"))

########################################################
## global variables
# maximum genomic coordinate: 2**31 - 1, the largest signed 32-bit value
# for some reason, using 'int' causes overflow
cdef int MAX_POS = (1 << 31) - 1

# valid types for SAM headers: 'HD' may occur at most once (a single
# mapping); all other record types may repeat (a sequence of mappings)
VALID_HEADER_TYPES = {"HD" : Mapping,
                      "SQ" : Sequence,
                      "RG" : Sequence,
                      "PG" : Sequence,
                      "CO" : Sequence}

# canonical order of records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")

# default type conversions within SAM header records; tags not listed
# here default to str when parsed in AlignmentHeader.to_dict()
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str,
                               "SS" : str,},
                       "SQ" : {"SN" : str, "LN" : int, "AS" : str,
                               "M5" : str, "SP" : str, "UR" : str,
                               "AH" : str, "TP" : str, "DS" : str,
                               "AN" : str,},
                       "RG" : {"ID" : str, "CN" : str, "DS" : str,
                               "DT" : str, "FO" : str, "KS" : str,
                               "LB" : str, "PG" : str, "PI" : str,
                               "PL" : str, "PM" : str, "PU" : str,
                               "SM" : str, "BC" : str,},
                       "PG" : {"ID" : str, "PN" : str, "CL" : str,
                               "PP" : str, "DS" : str, "VN" : str,},}

# output order of fields within records. Ensure that CL is at
# the end as parsing a CL will ignore any subsequent records.
VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"),
                      "SQ" : ("SN", "LN", "AS", "M5",
                              "UR", "SP", "AH", "TP",
                              "DS", "AN"),
                      "RG" : ("ID", "CN", "SM", "LB",
                              "PU", "PI", "DT", "DS",
                              "PL", "FO", "KS", "PG",
                              "PM", "BC"),
                      "PG" : ("PN", "ID", "VN", "PP",
                              "DS", "CL"),}
140 | |
141 | |
def build_header_line(fields, record):
    '''build a header line from `fields` dictionary for `record`'''

    # TODO: add checking for field and sort order
    parts = ["@%s" % record]
    if record == "CO":
        # comment records carry free text rather than tag:value pairs
        parts.append(fields)
    elif record.islower():
        # user-defined record types: all tags, alphabetically
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in sorted(fields))
    else:
        # spec-defined record types: canonical tags first, in the
        # order mandated by VALID_HEADER_ORDER ...
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in VALID_HEADER_ORDER[record]
                     if tag in fields)
        # ... followed by any user (non-uppercase) tags
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in fields
                     if not tag.isupper())

    return "\t".join(parts)
166 | |
167 | |
cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
    """C-level factory: wrap an existing bam_hdr_t in an AlignmentHeader.

    Ownership of `hdr` passes to the returned object (freed in
    AlignmentHeader.__dealloc__). Raises ValueError on a NULL pointer.
    """
    if hdr == NULL:
        raise ValueError('cannot create AlignmentHeader, received NULL pointer')

    # bypass __init__ so no fresh bam_hdr_t is allocated
    # check: is AlignmentHeader.__cinit__ called?
    cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader)
    header.ptr = hdr
    return header
177 | |
def read_failure_reason(code):
    """Map a negative htslib read return code to a human-readable message.

    -2 signals a truncated file; any other code is reported verbatim.
    """
    return 'truncated file' if code == -2 else "error {} while reading file".format(code)
183 | |
184 | |
# the following should be class-method for AlignmentHeader, but cdef @classmethods
# are not implemented in cython.
cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
                                        reference_names,
                                        reference_lengths,
                                        add_sq_text=True,
                                        text=None) except -1:
    """build header from list of reference names and lengths.

    NOTE(review): in this revision the function body consists of the
    docstring only -- it implicitly returns 0 without writing to `dest`.
    Confirm against upstream pysam whether the implementation was lost
    when this file was copied.
    """
194 | |
cdef class AlignmentHeader(object):
    """header information for a :class:`AlignmentFile` object

    Parameters
    ----------
    header_dict : dict
        build header from a multi-level dictionary. The
        first level are the four types ('HD', 'SQ', ...). The second
        level are a list of lines, with each line being a list of
        tag-value pairs. The header is constructed first from all the
        defined fields, followed by user tags in alphabetical
        order. Alternatively, an :class:`~pysam.AlignmentHeader`
        object can be passed directly.

    text : string
        use the string provided as the header

    reference_names : list
        see reference_lengths

    reference_lengths : list
        build header from list of chromosome names and lengths. By
        default, 'SQ' and 'LN' tags will be added to the header
        text. This option can be changed by unsetting the flag
        `add_sq_text`.

    add_sq_text : bool
        do not add 'SQ' and 'LN' tags to header. This option permits
        construction :term:`SAM` formatted files without a header.

    """

    # See makeAlignmentHeader for the C-level constructor
    def __cinit__(self):
        self.ptr = NULL

    # Python constructor
    def __init__(self):
        self.ptr = bam_hdr_init()
        if self.ptr is NULL:
            raise MemoryError("could not create header")

    @classmethod
    def _from_text_and_lengths(cls, text, reference_names, reference_lengths):
        """build a header from its textual representation plus reference data.

        `text` may be None. Populates both the textual part and the
        binary reference-name/length arrays of the underlying bam_hdr_t.
        """
        cdef AlignmentHeader self = AlignmentHeader()
        cdef char *ctext
        cdef int l_text
        cdef int n, x
        if text is not None:
            btext = force_bytes(text)
            ctext = btext
            l_text = len(btext)
            self.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
            if self.ptr.text == NULL:
                # BUGFIX: sizeof(char) was passed as a stray second
                # exception argument instead of being part of the message
                raise MemoryError("could not allocate {} bytes".format(
                    (l_text + 1) * sizeof(char)))
            self.ptr.l_text = l_text
            # copy including the trailing NUL written by calloc
            memcpy(self.ptr.text, ctext, l_text + 1)

        if reference_names and reference_lengths:
            reference_names = [force_bytes(ref) for ref in reference_names]

            self.ptr.n_targets = len(reference_names)

            # allocate one extra, zero-initialized slot beyond n_targets
            # (was: n = sum([len(reference_names) + 1]), a no-op wrapper)
            n = len(reference_names) + 1
            self.ptr.target_name = <char**>calloc(n, sizeof(char*))
            if self.ptr.target_name == NULL:
                raise MemoryError("could not allocate {} bytes".format(
                    n * sizeof(char *)))

            self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
            if self.ptr.target_len == NULL:
                raise MemoryError("could not allocate {} bytes".format(
                    n * sizeof(uint32_t)))

            for x from 0 <= x < self.ptr.n_targets:
                self.ptr.target_len[x] = reference_lengths[x]
                name = reference_names[x]
                self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
                if self.ptr.target_name[x] == NULL:
                    raise MemoryError("could not allocate {} bytes".format(
                        (len(name) + 1) * sizeof(char)))
                strncpy(self.ptr.target_name[x], name, len(name))

        return self

    @classmethod
    def from_text(cls, text):
        """create a new AlignmentHeader from textual SAM header `text`.

        Reference names and lengths are extracted from any @SQ lines
        present in `text`.
        """
        reference_names, reference_lengths = [], []
        for line in text.splitlines():
            if line.startswith("@SQ"):
                fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(int(fields["LN"]))
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))
                except ValueError:
                    raise ValueError("wrong sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_dict(cls, header_dict):
        """create a new AlignmentHeader from a two-level dictionary as
        produced by :meth:`to_dict`.
        """
        cdef list lines = []
        # first: defined tags, in canonical record order
        for record in VALID_HEADERS:
            if record in header_dict:
                data = header_dict[record]
                if not isinstance(data, VALID_HEADER_TYPES[record]):
                    # BUGFIX: message used %s placeholders with
                    # str.format(), which never interpolated the values
                    raise ValueError(
                        "invalid type for record %s: %s, expected %s" %
                        (record, type(data), VALID_HEADER_TYPES[record]))
                if isinstance(data, Mapping):
                    lines.append(build_header_line(data, record))
                else:
                    for fields in header_dict[record]:
                        lines.append(build_header_line(fields, record))

        # then: user tags (lower case), sorted alphabetically
        for record, data in sorted(header_dict.items()):
            if record in VALID_HEADERS:
                continue
            if isinstance(data, Mapping):
                lines.append(build_header_line(data, record))
            else:
                for fields in header_dict[record]:
                    lines.append(build_header_line(fields, record))

        text = "\n".join(lines) + "\n"

        reference_names, reference_lengths = [], []
        if "SQ" in header_dict:
            for fields in header_dict["SQ"]:
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(fields["LN"])
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):
        """create a new AlignmentHeader from lists of reference names and
        lengths. If `text` is not given and `add_sq_text` is set, a
        minimal @SQ-only header text is synthesized.
        """
        if len(reference_names) != len(reference_lengths):
            raise ValueError("number of reference names and lengths do not match")

        # optionally, if there is no text, add a SAM compatible header to output file.
        if text is None and add_sq_text:
            text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
                reference_names, reference_lengths)])

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    def __dealloc__(self):
        bam_hdr_destroy(self.ptr)
        self.ptr = NULL

    def __bool__(self):
        return self.ptr != NULL

    def copy(self):
        """return a deep copy of this header."""
        return makeAlignmentHeader(bam_hdr_dup(self.ptr))

    property nreferences:
        """int with the number of :term:`reference` sequences in the file.

        This is a read-only attribute."""
        def __get__(self):
            return self.ptr.n_targets

    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(charptr_to_str(self.ptr.target_name[x]))
            return tuple(t)

    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`
        """
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(self.ptr.target_len[x])
            return tuple(t)

    def _build_sequence_section(self):
        """return sequence section of header.

        The sequence section is built from the list of reference names and
        lengths stored in the BAM-file and not from any @SQ entries that
        are part of the header's text section.
        """

        cdef int x
        text = []
        for x in range(self.ptr.n_targets):
            text.append("@SQ\tSN:{}\tLN:{}\n".format(
                force_str(self.ptr.target_name[x]),
                self.ptr.target_len[x]))
        return "".join(text)

    def to_dict(self):
        """return two-level dictionary with header information from the file.

        The first level contains the record (``HD``, ``SQ``, etc) and
        the second level contains the fields (``VN``, ``LN``, etc).

        The parser is validating and will raise an AssertionError if
        if encounters any record or field tags that are not part of
        the SAM specification. Use the
        :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
        header.

        The parsing follows the SAM format specification with the
        exception of the ``CL`` field. This option will consume the
        rest of a header line irrespective of any additional fields.
        This behaviour has been added to accommodate command line
        options that contain characters that are not valid field
        separators.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.
        """
        result = collections.OrderedDict()

        # convert to python string
        t = self.__str__()
        for line in t.split("\n"):
            line = line.strip(' \0')
            if not line:
                continue
            assert line.startswith("@"), \
                "header line without '@': '%s'" % line
            fields = line[1:].split("\t")
            record = fields[0]
            assert record in VALID_HEADER_TYPES, \
                "header line with invalid type '%s': '%s'" % (record, line)

            # treat comments: stored verbatim, not parsed as tag:value
            if record == "CO":
                if record not in result:
                    result[record] = []
                result[record].append("\t".join( fields[1:]))
                continue
            # the following is clumsy as generators do not work?
            x = {}

            for idx, field in enumerate(fields[1:]):
                if ":" not in field:
                    raise ValueError("malformatted header: no ':' in field" )
                key, value = field.split(":", 1)
                if key in ("CL",):
                    # special treatment for command line
                    # statements (CL). These might contain
                    # characters that are non-conformant with
                    # the valid field separators in the SAM
                    # header. Thus, in contravention to the
                    # SAM API, consume the rest of the line.
                    key, value = "\t".join(fields[idx+1:]).split(":", 1)
                    x[key] = KNOWN_HEADER_FIELDS[record][key](value)
                    break

                # interpret type of known header record tags, default to str
                x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)

            if VALID_HEADER_TYPES[record] == Mapping:
                if record in result:
                    raise ValueError(
                        "multiple '%s' lines are not permitted" % record)

                result[record] = x
            elif VALID_HEADER_TYPES[record] == Sequence:
                if record not in result: result[record] = []
                result[record].append(x)

        # if there are no SQ lines in the header, add the
        # reference names from the information in the bam
        # file.
        #
        # Background: c-samtools keeps the textual part of the
        # header separate from the list of reference names and
        # lengths. Thus, if a header contains only SQ lines,
        # the SQ information is not part of the textual header
        # and thus are missing from the output. See issue 84.
        if "SQ" not in result:
            sq = []
            for ref, length in zip(self.references, self.lengths):
                sq.append({'LN': length, 'SN': ref })
            result["SQ"] = sq

        return result

    def as_dict(self):
        """deprecated, use :meth:`to_dict()` instead"""
        return self.to_dict()

    def get_reference_name(self, tid):
        """return the reference name for numerical `tid`; None for tid == -1."""
        if tid == -1:
            return None
        if not 0 <= tid < self.ptr.n_targets:
            raise ValueError("reference_id %i out of range 0<=tid<%i" %
                             (tid, self.ptr.n_targets))
        return charptr_to_str(self.ptr.target_name[tid])

    def get_reference_length(self, reference):
        """return the length of `reference`; raises KeyError if unknown."""
        cdef int tid = self.get_tid(reference)
        if tid < 0:
            raise KeyError("unknown reference {}".format(reference))
        else:
            return self.ptr.target_len[tid]

    def is_valid_tid(self, int tid):
        """
        return True if the numerical :term:`tid` is valid; False otherwise.

        Note that the unmapped tid code (-1) counts as an invalid.
        """
        return 0 <= tid < self.ptr.n_targets

    def get_tid(self, reference):
        """
        return the numerical :term:`tid` corresponding to
        :term:`reference`

        returns -1 if reference is not known.
        """
        reference = force_bytes(reference)
        tid = bam_name2id(self.ptr, reference)
        if tid < -1:
            raise ValueError('could not parse header')
        return tid

    def __str__(self):
        '''string with the full contents of the :term:`sam file` header as a
        string.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.

        See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
        representation of the header.
        '''
        text = from_string_and_size(self.ptr.text, self.ptr.l_text)
        if "@SQ" not in text:
            text += "\n" + self._build_sequence_section()
        return text

    # dictionary access methods, for backwards compatibility.
    def __setitem__(self, key, value):
        # BUGFIX: closed the unbalanced parenthesis in the message
        raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict())")

    def __getitem__(self, key):
        return self.to_dict().__getitem__(key)

    def items(self):
        return self.to_dict().items()

    # PY2 compatibility
    def iteritems(self):
        return self.to_dict().items()

    def keys(self):
        return self.to_dict().keys()

    def values(self):
        return self.to_dict().values()

    def get(self, *args):
        return self.to_dict().get(*args)

    def __len__(self):
        return self.to_dict().__len__()

    def __contains__(self, key):
        return self.to_dict().__contains__(key)
581 | |
582 cdef class AlignmentFile(HTSFile): | |
583 """AlignmentFile(filepath_or_object, mode=None, template=None, | |
584 reference_names=None, reference_lengths=None, text=NULL, | |
585 header=None, add_sq_text=False, check_header=True, check_sq=True, | |
586 reference_filename=None, filename=None, index_filename=None, | |
587 filepath_index=None, require_index=False, duplicate_filehandle=True, | |
588 ignore_truncation=False, threads=1) | |
589 | |
590 A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file. | |
591 | |
592 If `filepath_or_object` is a string, the file is automatically | |
593 opened. If `filepath_or_object` is a python File object, the | |
594 already opened file will be used. | |
595 | |
596 If the file is opened for reading and an index exists (if file is BAM, a | |
597 .bai file or if CRAM a .crai file), it will be opened automatically. | |
598 `index_filename` may be specified explicitly. If the index is not named | |
599 in the standard manner, not located in the same directory as the | |
600 BAM/CRAM file, or is remote. Without an index, random access via | |
601 :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup` | |
602 is disabled. | |
603 | |
604 For writing, the header of a :term:`SAM` file/:term:`BAM` file can | |
605 be constituted from several sources (see also the samtools format | |
606 specification): | |
607 | |
608 1. If `template` is given, the header is copied from another | |
609 `AlignmentFile` (`template` must be a | |
610 :class:`~pysam.AlignmentFile`). | |
611 | |
612 2. If `header` is given, the header is built from a | |
613 multi-level dictionary. | |
614 | |
615 3. If `text` is given, new header text is copied from raw | |
616 text. | |
617 | |
618 4. The names (`reference_names`) and lengths | |
619 (`reference_lengths`) are supplied directly as lists. | |
620 | |
621 When reading or writing a CRAM file, the filename of a FASTA-formatted | |
622 reference can be specified with `reference_filename`. | |
623 | |
624 By default, if a file is opened in mode 'r', it is checked | |
625 for a valid header (`check_header` = True) and a definition of | |
626 chromosome names (`check_sq` = True). | |
627 | |
628 Parameters | |
629 ---------- | |
630 mode : string | |
631 `mode` should be ``r`` for reading or ``w`` for writing. The | |
632 default is text mode (:term:`SAM`). For binary (:term:`BAM`) | |
633 I/O you should append ``b`` for compressed or ``u`` for | |
634 uncompressed :term:`BAM` output. Use ``h`` to output header | |
635 information in text (:term:`TAM`) mode. Use ``c`` for | |
636 :term:`CRAM` formatted files. | |
637 | |
638 If ``b`` is present, it must immediately follow ``r`` or | |
639 ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, | |
640 ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a | |
641 :term:`BAM` formatted file for reading, type:: | |
642 | |
643 f = pysam.AlignmentFile('ex1.bam','rb') | |
644 | |
645 If mode is not specified, the method will try to auto-detect | |
646 in the order 'rb', 'r', thus both the following should work:: | |
647 | |
648 f1 = pysam.AlignmentFile('ex1.bam') | |
649 f2 = pysam.AlignmentFile('ex1.sam') | |
650 | |
651 template : AlignmentFile | |
652 when writing, copy header from file `template`. | |
653 | |
654 header : dict or AlignmentHeader | |
655 when writing, build header from a multi-level dictionary. The | |
656 first level are the four types ('HD', 'SQ', ...). The second | |
657 level are a list of lines, with each line being a list of | |
658 tag-value pairs. The header is constructed first from all the | |
659 defined fields, followed by user tags in alphabetical | |
660 order. Alternatively, an :class:`~pysam.AlignmentHeader` | |
661 object can be passed directly. | |
662 | |
663 text : string | |
664 when writing, use the string provided as the header | |
665 | |
666 reference_names : list | |
667 see reference_lengths | |
668 | |
669 reference_lengths : list | |
670 when writing or opening a SAM file without header build header | |
671 from list of chromosome names and lengths. By default, 'SQ' | |
672 and 'LN' tags will be added to the header text. This option | |
673 can be changed by unsetting the flag `add_sq_text`. | |
674 | |
675 add_sq_text : bool | |
676 do not add 'SQ' and 'LN' tags to header. This option permits | |
677 construction :term:`SAM` formatted files without a header. | |
678 | |
679 add_sam_header : bool | |
680 when outputting SAM the default is to output a header. This is | |
681 equivalent to opening the file in 'wh' mode. If this option is | |
682 set to False, no header will be output. To read such a file, | |
683 set `check_header=False`. | |
684 | |
685 check_header : bool | |
686 obsolete: when reading a SAM file, check if header is present | |
687 (default=True) | |
688 | |
689 check_sq : bool | |
690 when reading, check if SQ entries are present in header | |
691 (default=True) | |
692 | |
693 reference_filename : string | |
694 Path to a FASTA-formatted reference file. Valid only for CRAM files. | |
695 When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL | |
696 specified in the header (``UR`` tag), which are normally used to find | |
697 the reference. | |
698 | |
699 index_filename : string | |
700 Explicit path to the index file. Only needed if the index is not | |
701 named in the standard manner, not located in the same directory as | |
702 the BAM/CRAM file, or is remote. An IOError is raised if the index | |
703 cannot be found or is invalid. | |
704 | |
705 filepath_index : string | |
706 Alias for `index_filename`. | |
707 | |
708 require_index : bool | |
709 When reading, require that an index file is present and is valid or | |
710 raise an IOError. (default=False) | |
711 | |
712 filename : string | |
713 Alternative to filepath_or_object. Filename of the file | |
714 to be opened. | |
715 | |
716 duplicate_filehandle: bool | |
717 By default, file handles passed either directly or through | |
718 File-like objects will be duplicated before passing them to | |
719 htslib. The duplication prevents issues where the same stream | |
720 will be closed by htslib and through destruction of the | |
721 high-level python object. Set to False to turn off | |
722 duplication. | |
723 | |
724 ignore_truncation: bool | |
725 Issue a warning, instead of raising an error if the current file | |
726 appears to be truncated due to a missing EOF marker. Only applies | |
727 to bgzipped formats. (Default=False) | |
728 | |
729 format_options: list | |
730 A list of key=value strings, as accepted by --input-fmt-option and | |
731 --output-fmt-option in samtools. | |
732 threads: integer | |
733 Number of threads to use for compressing/decompressing BAM/CRAM files. | |
734 Setting threads to > 1 cannot be combined with `ignore_truncation`. | |
735 (Default=1) | |
736 """ | |
737 | |
738 def __cinit__(self, *args, **kwargs): | |
739 self.htsfile = NULL | |
740 self.filename = None | |
741 self.mode = None | |
742 self.threads = 1 | |
743 self.is_stream = False | |
744 self.is_remote = False | |
745 self.index = NULL | |
746 | |
747 if "filename" in kwargs: | |
748 args = [kwargs["filename"]] | |
749 del kwargs["filename"] | |
750 | |
751 self._open(*args, **kwargs) | |
752 | |
753 # allocate memory for iterator | |
754 self.b = <bam1_t*>calloc(1, sizeof(bam1_t)) | |
755 if self.b == NULL: | |
756 raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t))) | |
757 | |
758 def has_index(self): | |
759 """return true if htsfile has an existing (and opened) index. | |
760 """ | |
761 return self.index != NULL | |
762 | |
763 def check_index(self): | |
764 """return True if index is present. | |
765 | |
766 Raises | |
767 ------ | |
768 | |
769 AttributeError | |
770 if htsfile is :term:`SAM` formatted and thus has no index. | |
771 | |
772 ValueError | |
773 if htsfile is closed or index could not be opened. | |
774 """ | |
775 | |
776 if not self.is_open: | |
777 raise ValueError("I/O operation on closed file") | |
778 if not self.is_bam and not self.is_cram: | |
779 raise AttributeError( | |
780 "AlignmentFile.mapped only available in bam files") | |
781 if self.index == NULL: | |
782 raise ValueError( | |
783 "mapping information not recorded in index " | |
784 "or index not available") | |
785 return True | |
786 | |
787 def _open(self, | |
788 filepath_or_object, | |
789 mode=None, | |
790 AlignmentFile template=None, | |
791 reference_names=None, | |
792 reference_lengths=None, | |
793 reference_filename=None, | |
794 text=None, | |
795 header=None, | |
796 port=None, | |
797 add_sq_text=True, | |
798 add_sam_header=True, | |
799 check_header=True, | |
800 check_sq=True, | |
801 index_filename=None, | |
802 filepath_index=None, | |
803 require_index=False, | |
804 referencenames=None, | |
805 referencelengths=None, | |
806 duplicate_filehandle=True, | |
807 ignore_truncation=False, | |
808 format_options=None, | |
809 threads=1): | |
810 '''open a sam, bam or cram formatted file. | |
811 | |
812 If _open is called on an existing file, the current file | |
813 will be closed and a new file will be opened. | |
814 | |
815 ''' | |
816 cdef char *cfilename = NULL | |
817 cdef char *creference_filename = NULL | |
818 cdef char *cindexname = NULL | |
819 cdef char *cmode = NULL | |
820 cdef bam_hdr_t * hdr = NULL | |
821 | |
822 if threads > 1 and ignore_truncation: | |
823 # This won't raise errors if reaching a truncated alignment, | |
824 # because bgzf_mt_reader in htslib does not deal with | |
825 # bgzf_mt_read_block returning non-zero values, contrary | |
826 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888) | |
827 # Better to avoid this (for now) than to produce seemingly correct results. | |
828 raise ValueError('Cannot add extra threads when "ignore_truncation" is True') | |
829 self.threads = threads | |
830 | |
831 # for backwards compatibility: | |
832 if referencenames is not None: | |
833 reference_names = referencenames | |
834 if referencelengths is not None: | |
835 reference_lengths = referencelengths | |
836 | |
837 # close a previously opened file | |
838 if self.is_open: | |
839 self.close() | |
840 | |
841 # autodetection for read | |
842 if mode is None: | |
843 mode = "r" | |
844 | |
845 if add_sam_header and mode == "w": | |
846 mode = "wh" | |
847 | |
848 assert mode in ("r", "w", "rb", "wb", "wh", | |
849 "wbu", "rU", "wb0", | |
850 "rc", "wc"), \ | |
851 "invalid file opening mode `%s`" % mode | |
852 | |
853 self.duplicate_filehandle = duplicate_filehandle | |
854 | |
855 # StringIO not supported | |
856 if isinstance(filepath_or_object, StringIO): | |
857 raise NotImplementedError( | |
858 "access from StringIO objects not supported") | |
859 # reading from a file descriptor | |
860 elif isinstance(filepath_or_object, int): | |
861 self.filename = filepath_or_object | |
862 filename = None | |
863 self.is_remote = False | |
864 self.is_stream = True | |
865 # reading from a File object or other object with fileno | |
866 elif hasattr(filepath_or_object, "fileno"): | |
867 if filepath_or_object.closed: | |
868 raise ValueError('I/O operation on closed file') | |
869 self.filename = filepath_or_object | |
870 # .name can be TextIOWrapper | |
871 try: | |
872 filename = encode_filename(str(filepath_or_object.name)) | |
873 cfilename = filename | |
874 except AttributeError: | |
875 filename = None | |
876 self.is_remote = False | |
877 self.is_stream = True | |
878 # what remains is a filename | |
879 else: | |
880 self.filename = filename = encode_filename(filepath_or_object) | |
881 cfilename = filename | |
882 self.is_remote = hisremote(cfilename) | |
883 self.is_stream = self.filename == b'-' | |
884 | |
885 # for htslib, wbu seems to not work | |
886 if mode == "wbu": | |
887 mode = "wb0" | |
888 | |
889 self.mode = force_bytes(mode) | |
890 self.reference_filename = reference_filename = encode_filename( | |
891 reference_filename) | |
892 | |
893 if mode[0] == 'w': | |
894 # open file for writing | |
895 | |
896 if not (template or header or text or (reference_names and reference_lengths)): | |
897 raise ValueError( | |
898 "either supply options `template`, `header`, `text` or both `reference_names` " | |
899 "and `reference_lengths` for writing") | |
900 | |
901 if template: | |
902 # header is copied, though at the moment not strictly | |
903 # necessary as AlignmentHeader is immutable. | |
904 self.header = template.header.copy() | |
905 elif isinstance(header, AlignmentHeader): | |
906 self.header = header.copy() | |
907 elif isinstance(header, Mapping): | |
908 self.header = AlignmentHeader.from_dict(header) | |
909 elif reference_names and reference_lengths: | |
910 self.header = AlignmentHeader.from_references( | |
911 reference_names, | |
912 reference_lengths, | |
913 add_sq_text=add_sq_text, | |
914 text=text) | |
915 elif text: | |
916 self.header = AlignmentHeader.from_text(text) | |
917 else: | |
918 raise ValueError("not enough information to construct header. Please provide template, " | |
919 "header, text or reference_names/reference_lengths") | |
920 self.htsfile = self._open_htsfile() | |
921 | |
922 if self.htsfile == NULL: | |
923 if errno: | |
924 raise IOError(errno, "could not open alignment file `{}`: {}".format( | |
925 force_str(filename), | |
926 force_str(strerror(errno)))) | |
927 else: | |
928 raise ValueError("could not open alignment file `{}`".format(force_str(filename))) | |
929 if format_options and len(format_options): | |
930 self.add_hts_options(format_options) | |
931 # set filename with reference sequences. If no filename | |
932 # is given, the CRAM reference arrays will be built from | |
933 # the @SQ header in the header | |
934 if "c" in mode and reference_filename: | |
935 if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0): | |
936 raise ValueError("failure when setting reference filename") | |
937 | |
938 # write header to htsfile | |
939 if "b" in mode or "c" in mode or "h" in mode: | |
940 hdr = self.header.ptr | |
941 with nogil: | |
942 sam_hdr_write(self.htsfile, hdr) | |
943 | |
944 elif mode[0] == "r": | |
945 # open file for reading | |
946 self.htsfile = self._open_htsfile() | |
947 | |
948 if self.htsfile == NULL: | |
949 if errno: | |
950 raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename), | |
951 force_str(strerror(errno)))) | |
952 else: | |
953 raise ValueError("could not open alignment file `{}`".format(force_str(filename))) | |
954 | |
955 if self.htsfile.format.category != sequence_data: | |
956 raise ValueError("file does not contain alignment data") | |
957 | |
958 if format_options and len(format_options): | |
959 self.add_hts_options(format_options) | |
960 | |
961 self.check_truncation(ignore_truncation) | |
962 | |
963 # bam/cram files require a valid header | |
964 if self.is_bam or self.is_cram: | |
965 with nogil: | |
966 hdr = sam_hdr_read(self.htsfile) | |
967 if hdr == NULL: | |
968 raise ValueError( | |
969 "file does not have a valid header (mode='%s') " | |
970 "- is it BAM/CRAM format?" % mode) | |
971 self.header = makeAlignmentHeader(hdr) | |
972 else: | |
973 # in sam files a header is optional. If not given, | |
974 # user may provide reference names and lengths to built | |
975 # an on-the-fly header. | |
976 if reference_names and reference_lengths: | |
977 # build header from a target names and lengths | |
978 self.header = AlignmentHeader.from_references( | |
979 reference_names=reference_names, | |
980 reference_lengths=reference_lengths, | |
981 add_sq_text=add_sq_text, | |
982 text=text) | |
983 else: | |
984 with nogil: | |
985 hdr = sam_hdr_read(self.htsfile) | |
986 if hdr == NULL: | |
987 raise ValueError( | |
988 "SAM? file does not have a valid header (mode='%s'), " | |
989 "please provide reference_names and reference_lengths") | |
990 self.header = makeAlignmentHeader(hdr) | |
991 | |
992 # set filename with reference sequences | |
993 if self.is_cram and reference_filename: | |
994 creference_filename = self.reference_filename | |
995 hts_set_opt(self.htsfile, | |
996 CRAM_OPT_REFERENCE, | |
997 creference_filename) | |
998 | |
999 if check_sq and self.header.nreferences == 0: | |
1000 raise ValueError( | |
1001 ("file has no sequences defined (mode='%s') - " | |
1002 "is it SAM/BAM format? Consider opening with " | |
1003 "check_sq=False") % mode) | |
1004 | |
1005 if self.is_bam or self.is_cram: | |
1006 self.index_filename = index_filename or filepath_index | |
1007 if self.index_filename: | |
1008 cindexname = bfile_name = encode_filename(self.index_filename) | |
1009 | |
1010 if cfilename or cindexname: | |
1011 with nogil: | |
1012 self.index = sam_index_load3(self.htsfile, cfilename, cindexname, | |
1013 HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL) | |
1014 | |
1015 if not self.index and (cindexname or require_index): | |
1016 if errno: | |
1017 raise IOError(errno, force_str(strerror(errno))) | |
1018 else: | |
1019 raise IOError('unable to open index file `%s`' % self.index_filename) | |
1020 | |
1021 elif require_index: | |
1022 raise IOError('unable to open index file') | |
1023 | |
1024 # save start of data section | |
1025 if not self.is_stream: | |
1026 self.start_offset = self.tell() | |
1027 | |
1028 def fetch(self, | |
1029 contig=None, | |
1030 start=None, | |
1031 stop=None, | |
1032 region=None, | |
1033 tid=None, | |
1034 until_eof=False, | |
1035 multiple_iterators=False, | |
1036 reference=None, | |
1037 end=None): | |
1038 """fetch reads aligned in a :term:`region`. | |
1039 | |
1040 See :meth:`~pysam.HTSFile.parse_region` for more information | |
1041 on how genomic regions can be specified. :term:`reference` and | |
1042 `end` are also accepted for backward compatibility as synonyms | |
1043 for :term:`contig` and `stop`, respectively. | |
1044 | |
1045 Without a `contig` or `region` all mapped reads in the file | |
1046 will be fetched. The reads will be returned ordered by reference | |
1047 sequence, which will not necessarily be the order within the | |
1048 file. This mode of iteration still requires an index. If there is | |
1049 no index, use `until_eof=True`. | |
1050 | |
1051 If only `contig` is set, all reads aligned to `contig` | |
1052 will be fetched. | |
1053 | |
1054 A :term:`SAM` file does not allow random access. If `region` | |
1055 or `contig` are given, an exception is raised. | |
1056 | |
1057 Parameters | |
1058 ---------- | |
1059 | |
1060 until_eof : bool | |
1061 | |
1062 If `until_eof` is True, all reads from the current file | |
1063 position will be returned in order as they are within the | |
1064 file. Using this option will also fetch unmapped reads. | |
1065 | |
1066 multiple_iterators : bool | |
1067 | |
1068 If `multiple_iterators` is True, multiple | |
1069 iterators on the same file can be used at the same time. The | |
1070 iterator returned will receive its own copy of a filehandle to | |
1071 the file effectively re-opening the file. Re-opening a file | |
1072 creates some overhead, so beware. | |
1073 | |
1074 Returns | |
1075 ------- | |
1076 | |
1077 An iterator over a collection of reads. : IteratorRow | |
1078 | |
1079 Raises | |
1080 ------ | |
1081 | |
1082 ValueError | |
1083 if the genomic coordinates are out of range or invalid or the | |
1084 file does not permit random access to genomic coordinates. | |
1085 | |
1086 """ | |
1087 cdef int rtid, rstart, rstop, has_coord | |
1088 | |
1089 if not self.is_open: | |
1090 raise ValueError( "I/O operation on closed file" ) | |
1091 | |
1092 has_coord, rtid, rstart, rstop = self.parse_region( | |
1093 contig, start, stop, region, tid, | |
1094 end=end, reference=reference) | |
1095 | |
1096 # Turn of re-opening if htsfile is a stream | |
1097 if self.is_stream: | |
1098 multiple_iterators = False | |
1099 | |
1100 if self.is_bam or self.is_cram: | |
1101 if not until_eof and not self.is_remote: | |
1102 if not self.has_index(): | |
1103 raise ValueError( | |
1104 "fetch called on bamfile without index") | |
1105 | |
1106 if has_coord: | |
1107 return IteratorRowRegion( | |
1108 self, rtid, rstart, rstop, | |
1109 multiple_iterators=multiple_iterators) | |
1110 else: | |
1111 if until_eof: | |
1112 return IteratorRowAll( | |
1113 self, | |
1114 multiple_iterators=multiple_iterators) | |
1115 else: | |
1116 # AH: check - reason why no multiple_iterators for | |
1117 # AllRefs? | |
1118 return IteratorRowAllRefs( | |
1119 self, | |
1120 multiple_iterators=multiple_iterators) | |
1121 else: | |
1122 if has_coord: | |
1123 raise ValueError( | |
1124 "fetching by region is not available for SAM files") | |
1125 | |
1126 if multiple_iterators == True: | |
1127 raise ValueError( | |
1128 "multiple iterators not implemented for SAM files") | |
1129 | |
1130 return IteratorRowAll(self, | |
1131 multiple_iterators=multiple_iterators) | |
1132 | |
1133 def head(self, n, multiple_iterators=True): | |
1134 '''return an iterator over the first n alignments. | |
1135 | |
1136 This iterator is is useful for inspecting the bam-file. | |
1137 | |
1138 Parameters | |
1139 ---------- | |
1140 | |
1141 multiple_iterators : bool | |
1142 | |
1143 is set to True by default in order to | |
1144 avoid changing the current file position. | |
1145 | |
1146 Returns | |
1147 ------- | |
1148 | |
1149 an iterator over a collection of reads : IteratorRowHead | |
1150 | |
1151 ''' | |
1152 return IteratorRowHead(self, n, | |
1153 multiple_iterators=multiple_iterators) | |
1154 | |
    def mate(self, AlignedSegment read):
        '''return the mate of :class:`pysam.AlignedSegment` `read`.

        .. note::

            Calling this method will change the file position.
            This might interfere with any iterators that have
            not re-opened the file.

        .. note::

            This method is too slow for high-throughput processing.
            If a read needs to be processed with its mate, work
            from a read name sorted file or, better, cache reads.

        Returns
        -------

        the mate : AlignedSegment

        Raises
        ------

        ValueError
            if the read is unpaired or the mate is unmapped

        '''
        cdef uint32_t flag = read._delegate.core.flag

        # a mate lookup only makes sense for a paired read whose mate
        # is actually mapped
        if flag & BAM_FPAIRED == 0:
            raise ValueError("read %s: is unpaired" %
                             (read.query_name))
        if flag & BAM_FMUNMAP != 0:
            raise ValueError("mate %s: is unmapped" %
                             (read.query_name))

        # xor flags to get the other mate: afterwards `flag` holds
        # exactly the READ1/READ2 bit that the mate must carry
        cdef int x = BAM_FREAD1 + BAM_FREAD2
        flag = (flag ^ x) & x

        # Make sure to use a separate file to jump around
        # to mate as otherwise the original file position
        # will be lost
        # The following code is not using the C API and
        # could thus be made much quicker, for example
        # by using tell and seek.
        # NOTE(review): mpos and mpos + 1 are passed positionally into
        # fetch's first two parameters while the reference is selected
        # via `tid`; this relies on parse_region's handling of those
        # arguments when `tid` is given - confirm against parse_region.
        for mate in self.fetch(
                read._delegate.core.mpos,
                read._delegate.core.mpos + 1,
                tid=read._delegate.core.mtid,
                multiple_iterators=True):
            # candidate must carry the mate's READ1/READ2 bit and share
            # the query name
            if mate.flag & flag != 0 and \
               mate.query_name == read.query_name:
                break
        else:
            raise ValueError("mate not found")

        return mate
1213 | |
1214 def pileup(self, | |
1215 contig=None, | |
1216 start=None, | |
1217 stop=None, | |
1218 region=None, | |
1219 reference=None, | |
1220 end=None, | |
1221 **kwargs): | |
1222 """perform a :term:`pileup` within a :term:`region`. The region is | |
1223 specified by :term:`contig`, `start` and `stop` (using | |
1224 0-based indexing). :term:`reference` and `end` are also accepted for | |
1225 backward compatibility as synonyms for :term:`contig` and `stop`, | |
1226 respectively. Alternatively, a samtools 'region' string | |
1227 can be supplied. | |
1228 | |
1229 Without 'contig' or 'region' all reads will be used for the | |
1230 pileup. The reads will be returned ordered by | |
1231 :term:`contig` sequence, which will not necessarily be the | |
1232 order within the file. | |
1233 | |
1234 Note that :term:`SAM` formatted files do not allow random | |
1235 access. In these files, if a 'region' or 'contig' are | |
1236 given an exception is raised. | |
1237 | |
1238 .. note:: | |
1239 | |
1240 'all' reads which overlap the region are returned. The | |
1241 first base returned will be the first base of the first | |
1242 read 'not' necessarily the first base of the region used | |
1243 in the query. | |
1244 | |
1245 Parameters | |
1246 ---------- | |
1247 | |
1248 truncate : bool | |
1249 | |
1250 By default, the samtools pileup engine outputs all reads | |
1251 overlapping a region. If truncate is True and a region is | |
1252 given, only columns in the exact region specified are | |
1253 returned. | |
1254 | |
1255 max_depth : int | |
1256 Maximum read depth permitted. The default limit is '8000'. | |
1257 | |
1258 stepper : string | |
1259 The stepper controls how the iterator advances. | |
1260 Possible options for the stepper are | |
1261 | |
1262 ``all`` | |
1263 skip reads in which any of the following flags are set: | |
1264 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP | |
1265 | |
1266 ``nofilter`` | |
1267 uses every single read turning off any filtering. | |
1268 | |
1269 ``samtools`` | |
1270 same filter and read processing as in samtools | |
1271 pileup. For full compatibility, this requires a | |
1272 'fastafile' to be given. The following options all pertain | |
1273 to filtering of the ``samtools`` stepper. | |
1274 | |
1275 fastafile : :class:`~pysam.FastaFile` object. | |
1276 | |
1277 This is required for some of the steppers. | |
1278 | |
1279 ignore_overlaps: bool | |
1280 | |
1281 If set to True, detect if read pairs overlap and only take | |
1282 the higher quality base. This is the default. | |
1283 | |
1284 flag_filter : int | |
1285 | |
1286 ignore reads where any of the bits in the flag are set. The default is | |
1287 BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP. | |
1288 | |
1289 flag_require : int | |
1290 | |
1291 only use reads where certain flags are set. The default is 0. | |
1292 | |
1293 ignore_orphans: bool | |
1294 | |
1295 ignore orphans (paired reads that are not in a proper pair). | |
1296 The default is to ignore orphans. | |
1297 | |
1298 min_base_quality: int | |
1299 | |
1300 Minimum base quality. Bases below the minimum quality will | |
1301 not be output. The default is 13. | |
1302 | |
1303 adjust_capq_threshold: int | |
1304 | |
1305 adjust mapping quality. The default is 0 for no | |
1306 adjustment. The recommended value for adjustment is 50. | |
1307 | |
1308 min_mapping_quality : int | |
1309 | |
1310 only use reads above a minimum mapping quality. The default is 0. | |
1311 | |
1312 compute_baq: bool | |
1313 | |
1314 re-alignment computing per-Base Alignment Qualities (BAQ). The | |
1315 default is to do re-alignment. Realignment requires a reference | |
1316 sequence. If none is present, no realignment will be performed. | |
1317 | |
1318 redo_baq: bool | |
1319 | |
1320 recompute per-Base Alignment Quality on the fly ignoring | |
1321 existing base qualities. The default is False (use existing | |
1322 base qualities). | |
1323 | |
1324 Returns | |
1325 ------- | |
1326 | |
1327 an iterator over genomic positions. : IteratorColumn | |
1328 | |
1329 """ | |
1330 cdef int rtid, has_coord | |
1331 cdef int32_t rstart, rstop | |
1332 | |
1333 if not self.is_open: | |
1334 raise ValueError("I/O operation on closed file") | |
1335 | |
1336 has_coord, rtid, rstart, rstop = self.parse_region( | |
1337 contig, start, stop, region, reference=reference, end=end) | |
1338 | |
1339 if has_coord: | |
1340 if not self.has_index(): | |
1341 raise ValueError("no index available for pileup") | |
1342 | |
1343 return IteratorColumnRegion(self, | |
1344 tid=rtid, | |
1345 start=rstart, | |
1346 stop=rstop, | |
1347 **kwargs) | |
1348 else: | |
1349 if self.has_index(): | |
1350 return IteratorColumnAllRefs(self, **kwargs) | |
1351 else: | |
1352 return IteratorColumnAll(self, **kwargs) | |
1353 | |
1354 def count(self, | |
1355 contig=None, | |
1356 start=None, | |
1357 stop=None, | |
1358 region=None, | |
1359 until_eof=False, | |
1360 read_callback="nofilter", | |
1361 reference=None, | |
1362 end=None): | |
1363 '''count the number of reads in :term:`region` | |
1364 | |
1365 The region is specified by :term:`contig`, `start` and `stop`. | |
1366 :term:`reference` and `end` are also accepted for backward | |
1367 compatibility as synonyms for :term:`contig` and `stop`, | |
1368 respectively. Alternatively, a `samtools`_ :term:`region` | |
1369 string can be supplied. | |
1370 | |
1371 A :term:`SAM` file does not allow random access and if | |
1372 `region` or `contig` are given, an exception is raised. | |
1373 | |
1374 Parameters | |
1375 ---------- | |
1376 | |
1377 contig : string | |
1378 reference_name of the genomic region (chromosome) | |
1379 | |
1380 start : int | |
1381 start of the genomic region (0-based inclusive) | |
1382 | |
1383 stop : int | |
1384 end of the genomic region (0-based exclusive) | |
1385 | |
1386 region : string | |
1387 a region string in samtools format. | |
1388 | |
1389 until_eof : bool | |
1390 count until the end of the file, possibly including | |
1391 unmapped reads as well. | |
1392 | |
1393 read_callback: string or function | |
1394 | |
1395 select a call-back to ignore reads when counting. It can | |
1396 be either a string with the following values: | |
1397 | |
1398 ``all`` | |
1399 skip reads in which any of the following | |
1400 flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, | |
1401 BAM_FDUP | |
1402 | |
1403 ``nofilter`` | |
1404 uses every single read | |
1405 | |
1406 Alternatively, `read_callback` can be a function | |
1407 ``check_read(read)`` that should return True only for | |
1408 those reads that shall be included in the counting. | |
1409 | |
1410 reference : string | |
1411 backward compatible synonym for `contig` | |
1412 | |
1413 end : int | |
1414 backward compatible synonym for `stop` | |
1415 | |
1416 Raises | |
1417 ------ | |
1418 | |
1419 ValueError | |
1420 if the genomic coordinates are out of range or invalid. | |
1421 | |
1422 ''' | |
1423 cdef AlignedSegment read | |
1424 cdef long counter = 0 | |
1425 | |
1426 if not self.is_open: | |
1427 raise ValueError("I/O operation on closed file") | |
1428 | |
1429 cdef int filter_method = 0 | |
1430 if read_callback == "all": | |
1431 filter_method = 1 | |
1432 elif read_callback == "nofilter": | |
1433 filter_method = 2 | |
1434 | |
1435 for read in self.fetch(contig=contig, | |
1436 start=start, | |
1437 stop=stop, | |
1438 reference=reference, | |
1439 end=end, | |
1440 region=region, | |
1441 until_eof=until_eof): | |
1442 # apply filter | |
1443 if filter_method == 1: | |
1444 # filter = "all" | |
1445 if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)): | |
1446 continue | |
1447 elif filter_method == 2: | |
1448 # filter = "nofilter" | |
1449 pass | |
1450 else: | |
1451 if not read_callback(read): | |
1452 continue | |
1453 counter += 1 | |
1454 | |
1455 return counter | |
1456 | |
    @cython.boundscheck(False)  # we do manual bounds checking
    def count_coverage(self,
                       contig,
                       start=None,
                       stop=None,
                       region=None,
                       quality_threshold=15,
                       read_callback='all',
                       reference=None,
                       end=None):
        """count the coverage of genomic positions by reads in :term:`region`.

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a `samtools`_ :term:`region`
        string can be supplied. The coverage is computed per-base [ACGT].

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive). If not
            given, count from the start of the chromosome.

        stop : int
            end of the genomic region (0-based exclusive). If not given,
            count to the end of the chromosome.

        region : string
            a region string.

        quality_threshold : int
            quality_threshold is the minimum quality score (in phred) a
            base has to reach to be counted.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        Returns
        -------

        four array.arrays of the same length in order A C G T : tuple

        """

        # clip the requested window to the reference length
        cdef uint32_t contig_length = self.get_reference_length(contig)
        cdef int _start = start if start is not None else 0
        cdef int _stop = stop if stop is not None else contig_length
        _stop = _stop if _stop < contig_length else contig_length

        if _stop == _start:
            raise ValueError("interval of size 0")
        if _stop < _start:
            raise ValueError("interval of size less than 0")

        # one zero-initialised unsigned-long counter array per base;
        # typecode 'L' matches the .data.as_ulongs accesses below
        cdef int length = _stop - _start
        cdef c_array.array int_array_template = array.array('L', [])
        cdef c_array.array count_a
        cdef c_array.array count_c
        cdef c_array.array count_g
        cdef c_array.array count_t
        count_a = c_array.clone(int_array_template, length, zero=True)
        count_c = c_array.clone(int_array_template, length, zero=True)
        count_g = c_array.clone(int_array_template, length, zero=True)
        count_t = c_array.clone(int_array_template, length, zero=True)

        cdef AlignedSegment read
        cdef cython.str seq
        cdef c_array.array quality
        cdef int qpos
        cdef int refpos
        cdef int c = 0
        cdef int filter_method = 0


        # resolve the string shortcuts once, before the loop
        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        cdef int _threshold = quality_threshold or 0
        for read in self.fetch(contig=contig,
                               reference=reference,
                               start=start,
                               stop=stop,
                               end=end,
                               region=region):
            # apply filter
            if filter_method == 1:
                # filter = "all": drop unmapped/secondary/qcfail/duplicate
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue

            # count
            seq = read.seq
            if seq is None:
                # no sequence stored for this read - nothing to count
                continue
            quality = read.query_qualities

            # matches_only=True: only pairs where both qpos and refpos
            # are aligned are returned
            for qpos, refpos in read.get_aligned_pairs(True):
                if qpos is not None and refpos is not None and \
                   _start <= refpos < _stop:

                    # only check base quality if _threshold > 0
                    # (a read without qualities is skipped when a
                    # threshold is set)
                    if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
                        if seq[qpos] == 'A':
                            count_a.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'C':
                            count_c.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'G':
                            count_g.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'T':
                            count_t.data.as_ulongs[refpos - _start] += 1

        return count_a, count_c, count_g, count_t
1608 | |
1609 def find_introns_slow(self, read_iterator): | |
1610 """Return a dictionary {(start, stop): count} | |
1611 Listing the intronic sites in the reads (identified by 'N' in the cigar strings), | |
1612 and their support ( = number of reads ). | |
1613 | |
1614 read_iterator can be the result of a .fetch(...) call. | |
1615 Or it can be a generator filtering such reads. Example | |
1616 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) | |
1617 """ | |
1618 res = collections.Counter() | |
1619 for r in read_iterator: | |
1620 if 'N' in r.cigarstring: | |
1621 last_read_pos = False | |
1622 for read_loc, genome_loc in r.get_aligned_pairs(): | |
1623 if read_loc is None and last_read_pos: | |
1624 start = genome_loc | |
1625 elif read_loc and last_read_pos is None: | |
1626 stop = genome_loc # we are right exclusive ,so this is correct | |
1627 res[(start, stop)] += 1 | |
1628 del start | |
1629 del stop | |
1630 last_read_pos = read_loc | |
1631 return res | |
1632 | |
1633 def find_introns(self, read_iterator): | |
1634 """Return a dictionary {(start, stop): count} | |
1635 Listing the intronic sites in the reads (identified by 'N' in the cigar strings), | |
1636 and their support ( = number of reads ). | |
1637 | |
1638 read_iterator can be the result of a .fetch(...) call. | |
1639 Or it can be a generator filtering such reads. Example | |
1640 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) | |
1641 """ | |
1642 cdef: | |
1643 uint32_t base_position, junc_start, nt | |
1644 int op | |
1645 AlignedSegment r | |
1646 int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP | |
1647 | |
1648 res = collections.Counter() | |
1649 | |
1650 match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position | |
1651 for r in read_iterator: | |
1652 base_position = r.pos | |
1653 cigar = r.cigartuples | |
1654 if cigar is None: | |
1655 continue | |
1656 | |
1657 for op, nt in cigar: | |
1658 if op in match_or_deletion: | |
1659 base_position += nt | |
1660 elif op == BAM_CREF_SKIP: | |
1661 junc_start = base_position | |
1662 base_position += nt | |
1663 res[(junc_start, base_position)] += 1 | |
1664 return res | |
1665 | |
1666 | |
    def close(self):
        '''closes the :class:`pysam.AlignmentFile`.

        Closing an already closed file is a no-op. Raises IOError if
        htslib reports an error on close (other than a broken pipe).
        '''

        if self.htsfile == NULL:
            # already closed
            return

        if self.index != NULL:
            # release the index before the file it belongs to
            hts_idx_destroy(self.index)
            self.index = NULL

        cdef int ret = hts_close(self.htsfile)
        self.htsfile = NULL

        self.header = None

        if ret < 0:
            global errno
            if errno == EPIPE:
                # a broken pipe (e.g. the downstream reader went away)
                # is treated as a benign close; clear errno
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))
1688 | |
    def __dealloc__(self):
        # Deallocation mirrors close(): release the index, then the
        # htsfile, then the per-file alignment buffer, and finally
        # surface any close error (except a broken pipe).
        cdef int ret = 0

        if self.index != NULL:
            hts_idx_destroy(self.index)
            self.index = NULL

        if self.htsfile != NULL:
            ret = hts_close(self.htsfile)
            self.htsfile = NULL

        self.header = None

        if self.b:
            # free the working bam1_t buffer
            bam_destroy1(self.b)
            self.b = NULL

        if ret < 0:
            global errno
            if errno == EPIPE:
                # benign: downstream reader closed first
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))
1712 | |
    cpdef int write(self, AlignedSegment read) except -1:
        '''
        write a single :class:`pysam.AlignedSegment` to disk.

        Raises:
            ValueError
                if the read refers to a reference id not present in
                the header
            IOError
                if the writing failed

        Returns:
            int :
                the number of bytes written. If the file is closed,
                this will be 0.
        '''
        if not self.is_open:
            # writing to a closed file is a silent no-op by contract
            return 0

        # guard against a read whose reference id is outside this
        # file's header
        if self.header.ptr.n_targets <= read._delegate.core.tid:
            raise ValueError(
                "AlignedSegment refers to reference number {} that "
                "is larger than the number of references ({}) in the header".format(
                    read._delegate.core.tid, self.header.ptr.n_targets))

        cdef int ret
        with nogil:
            ret = sam_write1(self.htsfile,
                             self.header.ptr,
                             read._delegate)

        # kbj: Still need to raise an exception with except -1. Otherwise
        # when ret == -1 we get a "SystemError: error return without
        # exception set".
        if ret < 0:
            raise IOError(
                "sam_write1 failed with error code {}".format(ret))

        return ret
1749 | |
1750 # context manager interface | |
    def __enter__(self):
        """enter the context manager: return the file object itself."""
        return self
1753 | |
    def __exit__(self, exc_type, exc_value, traceback):
        """exit the context manager: close the file.

        Returns False so that an exception raised inside the ``with``
        block propagates to the caller.
        """
        self.close()
        return False
1757 | |
1758 ############################################################### | |
1759 ############################################################### | |
1760 ############################################################### | |
1761 ## properties | |
1762 ############################################################### | |
    property mapped:
        """int with total number of mapped alignments according to the
        statistics recorded in the index. This is a read-only
        attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            # requires an open bam/cram file with a loaded index
            self.check_index()
            cdef int tid
            cdef uint64_t total = 0
            cdef uint64_t mapped, unmapped
            # sum the per-reference mapped counts stored in the index
            for tid from 0 <= tid < self.header.nreferences:
                with nogil:
                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
                total += mapped
            return total
1780 | |
    property unmapped:
        """int with total number of unmapped reads according to the statistics
        recorded in the index. This number of reads includes the number of reads
        without coordinates. This is a read-only attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            # raises if no index has been loaded
            self.check_index()
            cdef int tid
            # start with reads that have no coordinates at all ("*")
            cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
            cdef uint64_t mapped, unmapped
            # add the per-reference unmapped counts from the index
            for tid from 0 <= tid < self.header.nreferences:
                with nogil:
                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
                total += unmapped
            return total
1798 | |
    property nocoordinate:
        """int with total number of reads without coordinates according to the
        statistics recorded in the index, i.e., the statistic printed for "*"
        by the ``samtools idxstats`` command. This is a read-only attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            # raises if no index has been loaded
            self.check_index()
            cdef uint64_t n
            with nogil:
                n = hts_idx_get_n_no_coor(self.index)
            return n
1812 | |
    def get_index_statistics(self):
        """return statistics about mapped/unmapped reads per chromosome as
        they are stored in the index, similarly to the statistics printed
        by the ``samtools idxstats`` command.

        CRAI indexes do not record these statistics, so for a CRAM file
        with a .crai index the returned statistics will all be 0.

        Returns:
            list :
                a list of records for each chromosome. Each record has the
                attributes 'contig', 'mapped', 'unmapped' and 'total'.
        """

        # raises if no index has been loaded
        self.check_index()
        cdef int tid
        cdef uint64_t mapped, unmapped
        results = []
        # TODO: use header
        for tid from 0 <= tid < self.nreferences:
            with nogil:
                hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
            # IndexStats fields: (contig, mapped, unmapped, total)
            results.append(
                IndexStats._make((
                    self.get_reference_name(tid),
                    mapped,
                    unmapped,
                    mapped + unmapped)))

        return results
1843 | |
1844 ############################################################### | |
1845 ## file-object like iterator access | |
1846 ## note: concurrent access will cause errors (see IteratorRow | |
1847 ## and multiple_iterators) | |
1848 ## Possible solutions: deprecate or open new file handle | |
1849 def __iter__(self): | |
1850 if not self.is_open: | |
1851 raise ValueError("I/O operation on closed file") | |
1852 | |
1853 if not self.is_bam and self.header.nreferences == 0: | |
1854 raise NotImplementedError( | |
1855 "can not iterate over samfile without header") | |
1856 return self | |
1857 | |
    cdef bam1_t * getCurrent(self):
        # expose the internal alignment buffer (C-level callers only)
        return self.b
1860 | |
    cdef int cnext(self):
        '''
        cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
        '''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        # sam_read1 return code: >= 0 success, -1 EOF, < -1 error
        return ret
1872 | |
    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            # wrap the reusable C record in a Python AlignedSegment
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # -1 from sam_read1 signals normal end of file
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
1881 | |
1882 ########################################### | |
1883 # methods/properties referencing the header | |
1884 def is_valid_tid(self, int tid): | |
1885 """ | |
1886 return True if the numerical :term:`tid` is valid; False otherwise. | |
1887 | |
1888 Note that the unmapped tid code (-1) counts as an invalid. | |
1889 """ | |
1890 if self.header is None: | |
1891 raise ValueError("header not available in closed files") | |
1892 return self.header.is_valid_tid(tid) | |
1893 | |
1894 def get_tid(self, reference): | |
1895 """ | |
1896 return the numerical :term:`tid` corresponding to | |
1897 :term:`reference` | |
1898 | |
1899 returns -1 if reference is not known. | |
1900 """ | |
1901 if self.header is None: | |
1902 raise ValueError("header not available in closed files") | |
1903 return self.header.get_tid(reference) | |
1904 | |
1905 def get_reference_name(self, tid): | |
1906 """ | |
1907 return :term:`reference` name corresponding to numerical :term:`tid` | |
1908 """ | |
1909 if self.header is None: | |
1910 raise ValueError("header not available in closed files") | |
1911 return self.header.get_reference_name(tid) | |
1912 | |
1913 def get_reference_length(self, reference): | |
1914 """ | |
1915 return :term:`reference` length corresponding to numerical :term:`tid` | |
1916 """ | |
1917 if self.header is None: | |
1918 raise ValueError("header not available in closed files") | |
1919 return self.header.get_reference_length(reference) | |
1920 | |
1921 property nreferences: | |
1922 """int with the number of :term:`reference` sequences in the file. | |
1923 This is a read-only attribute.""" | |
1924 def __get__(self): | |
1925 if self.header: | |
1926 return self.header.nreferences | |
1927 else: | |
1928 raise ValueError("header not available in closed files") | |
1929 | |
1930 property references: | |
1931 """tuple with the names of :term:`reference` sequences. This is a | |
1932 read-only attribute""" | |
1933 def __get__(self): | |
1934 if self.header: | |
1935 return self.header.references | |
1936 else: | |
1937 raise ValueError("header not available in closed files") | |
1938 | |
1939 property lengths: | |
1940 """tuple of the lengths of the :term:`reference` sequences. This is a | |
1941 read-only attribute. The lengths are in the same order as | |
1942 :attr:`pysam.AlignmentFile.references` | |
1943 | |
1944 """ | |
1945 def __get__(self): | |
1946 if self.header: | |
1947 return self.header.lengths | |
1948 else: | |
1949 raise ValueError("header not available in closed files") | |
1950 | |
1951 # Compatibility functions for pysam < 0.14 | |
1952 property text: | |
1953 """deprecated, use :attr:`references` and :attr:`lengths` instead""" | |
1954 def __get__(self): | |
1955 if self.header: | |
1956 return self.header.__str__() | |
1957 else: | |
1958 raise ValueError("header not available in closed files") | |
1959 | |
1960 # Compatibility functions for pysam < 0.8.3 | |
    def gettid(self, reference):
        """deprecated, use :meth:`get_tid` instead"""
        # pre-0.8.3 compatibility shim; simply delegates
        return self.get_tid(reference)
1964 | |
    def getrname(self, tid):
        """deprecated, use :meth:`get_reference_name` instead"""
        # pre-0.8.3 compatibility shim; simply delegates
        return self.get_reference_name(tid)
1968 | |
1969 | |
cdef class IteratorRow:
    '''abstract base class for iterators over mapped reads.

    Various iterators implement different behaviours for wrapping around
    contig boundaries. Examples include:

    :class:`pysam.IteratorRowRegion`
        iterate within a single contig and a defined region.

    :class:`pysam.IteratorRowAll`
        iterate until EOF. This iterator will also include unmapped reads.

    :class:`pysam.IteratorRowAllRefs`
        iterate over all reads in all reference sequences.

    The method :meth:`AlignmentFile.fetch` returns an IteratorRow.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    '''

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        cdef char *cfilename
        cdef char *creference_filename
        cdef char *cindexname = NULL

        if not samfile.is_open:
            raise ValueError("I/O operation on closed file")

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.samfile = samfile

        # reopen the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:

            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            assert self.htsfile != NULL

            if samfile.has_index():
                if samfile.index_filename:
                    # keep a reference to the bytes object so the char
                    # pointer stays valid during the load call below
                    cindexname = bindex_filename = encode_filename(samfile.index_filename)
                with nogil:
                    self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
            else:
                self.index = NULL

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr is NULL:
                raise IOError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)

            # this iterator owns its handles and must close them on dealloc
            self.owns_samfile = True

            # options specific to CRAM files
            if samfile.is_cram and samfile.reference_filename:
                creference_filename = samfile.reference_filename
                hts_set_opt(self.htsfile,
                            CRAM_OPT_REFERENCE,
                            creference_filename)

        else:
            # share the already-open handles; do not close them on dealloc
            self.htsfile = samfile.htsfile
            self.index = samfile.index
            self.owns_samfile = False
            self.header = samfile.header

        self.retval = 0

        # reusable C alignment buffer for the read loop
        self.b = bam_init1()

    def __dealloc__(self):
        bam_destroy1(self.b)
        # only destroy handles this iterator opened itself
        if self.owns_samfile:
            hts_idx_destroy(self.index)
            hts_close(self.htsfile)
2056 | |
2057 | |
cdef class IteratorRowRegion(IteratorRow):
    """*(AlignmentFile samfile, int tid, int beg, int stop,
    int multiple_iterators=False)*

    iterate over mapped reads in a region.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile,
                 int tid, int beg, int stop,
                 int multiple_iterators=False):

        if not samfile.has_index():
            raise ValueError("no index available for iteration")

        super().__init__(samfile, multiple_iterators=multiple_iterators)
        # create the htslib region iterator over [beg, stop) on tid
        with nogil:
            self.iter = sam_itr_queryi(
                self.index,
                tid,
                beg,
                stop)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # the return code is stored in self.retval so callers
        # (e.g. IteratorRowAllRefs) can inspect it after the call
        with nogil:
            self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
                                       self.iter,
                                       self.b,
                                       self.htsfile)

    def __next__(self):
        self.cnext()
        if self.retval >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif self.retval == -1:
            # -1 signals normal end of iteration
            raise StopIteration
        elif self.retval == -2:
            # Note: it is currently not the case that hts_iter_next
            # returns -2 for a truncated file.
            # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
            raise IOError('truncated file')
        else:
            raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval))

    def __dealloc__(self):
        hts_itr_destroy(self.iter)
2117 | |
2118 | |
cdef class IteratorRowHead(IteratorRow):
    """*(AlignmentFile samfile, n, int multiple_iterators=False)*

    iterate over first n reads in `samfile`

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.head`.

    """

    def __init__(self,
                 AlignmentFile samfile,
                 int n,
                 int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # yield at most max_rows reads, counted in current_row
        self.max_rows = n
        self.current_row = 0

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        # stop once the requested number of reads has been returned
        if self.current_row >= self.max_rows:
            raise StopIteration

        cdef int ret = self.cnext()
        if ret >= 0:
            self.current_row += 1
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # end of file before n reads were seen
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2168 | |
2169 | |
cdef class IteratorRowAll(IteratorRow):
    """*(AlignmentFile samfile, int multiple_iterators=False)*

    iterate over all reads in `samfile`

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        # sequential read of the next record, no index involved
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # -1 from sam_read1 signals normal end of file
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2210 | |
2211 | |
cdef class IteratorRowAllRefs(IteratorRow):
    """iterates over all mapped reads by chaining iterators over each
    reference

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        if not samfile.has_index():
            raise ValueError("no index available for fetch")

        # -1 marks "no per-reference iterator created yet"
        self.tid = -1

    def nextiter(self):
        # get a new iterator for a chromosome. The file
        # will not be re-opened.
        self.rowiter = IteratorRowRegion(self.samfile,
                                         self.tid,
                                         0,
                                         MAX_POS)
        # set htsfile and header of the rowiter
        # to the values in this iterator to reflect multiple_iterators
        self.rowiter.htsfile = self.htsfile
        self.rowiter.header = self.header

        # make sure the iterator understand that IteratorRowAllRefs
        # has ownership
        self.rowiter.owns_samfile = False

    def __iter__(self):
        return self

    def __next__(self):
        # Create an initial iterator
        if self.tid == -1:
            if not self.samfile.nreferences:
                raise StopIteration
            self.tid = 0
            self.nextiter()

        while 1:
            self.rowiter.cnext()

            # If current iterator is not exhausted, return aligned read
            # NOTE(review): this tests retval > 0 whereas IteratorRowRegion
            # treats retval >= 0 as success - confirm retval == 0 cannot
            # occur for hts_itr_next here
            if self.rowiter.retval > 0:
                return makeAlignedSegment(self.rowiter.b, self.header)

            self.tid += 1

            # Otherwise, proceed to next reference or stop
            if self.tid < self.samfile.nreferences:
                self.nextiter()
            else:
                raise StopIteration
2272 | |
2273 | |
cdef class IteratorRowSelection(IteratorRow):
    """*(AlignmentFile samfile)*

    iterate over reads in `samfile` at a given list of file positions.

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
    """

    def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # positions are bgzf virtual file offsets, visited in order
        self.positions = positions
        self.current_pos = 0

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator'''
        # end iteration if out of positions
        if self.current_pos >= len(self.positions): return -1

        # seek to the next stored virtual offset, then read one record
        cdef uint64_t pos = self.positions[self.current_pos]
        with nogil:
            bgzf_seek(hts_get_bgzfp(self.htsfile),
                      pos,
                      0)
        self.current_pos += 1

        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # list of positions exhausted (or EOF)
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2324 | |
2325 | |
cdef int __advance_nofilter(void *data, bam1_t *b):
    '''advance without any read filtering.
    '''
    # pileup callback: fetch the next read from the region iterator
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    with nogil:
        ret = sam_itr_next(d.htsfile, d.iter, b)
    return ret
2334 | |
2335 | |
cdef int __advance_raw_nofilter(void *data, bam1_t *b):
    '''advance (without iterator) without any read filtering.
    '''
    # pileup callback: sequential sam_read1, used when no region iterator exists
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    with nogil:
        ret = sam_read1(d.htsfile, d.header, b)
    return ret
2344 | |
2345 | |
cdef int __advance_all(void *data, bam1_t *b):
    '''only use reads for pileup passing basic filters such as

    BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
    '''
    # pileup callback: loop until a read passes the configured flag
    # filter (d.flag_filter, which defaults to the flags named above)
    # or input is exhausted (ret < 0).
    # (A previously computed local `mask` was never used - the filter
    # actually applied is d.flag_filter - so it has been removed.)
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    while 1:
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b)
        if ret < 0:
            break
        if b.core.flag & d.flag_filter:
            continue
        break
    return ret
2364 | |
2365 | |
cdef int __advance_raw_all(void *data, bam1_t *b):
    '''only use reads for pileup passing basic filters such as

    BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
    '''
    # pileup callback: like __advance_all but reads sequentially with
    # sam_read1 instead of a region iterator
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    while 1:
        with nogil:
            ret = sam_read1(d.htsfile, d.header, b)
        if ret < 0:
            break
        # skip reads with any excluded flag set
        if b.core.flag & d.flag_filter:
            continue
        break
    return ret
2383 | |
2384 | |
cdef int __advance_samtools(void * data, bam1_t * b):
    '''advance using same filter and read processing as in
    the samtools pileup.
    '''
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    cdef int q

    while 1:
        # use the region iterator when present, plain sequential reads otherwise
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
        if ret < 0:
            break
        # skip reads with any excluded flag set
        if b.core.flag & d.flag_filter:
            continue
        # skip reads missing a required flag
        if d.flag_require and not (b.core.flag & d.flag_require):
            continue

        # reload sequence
        if d.fastafile != NULL and b.core.tid != d.tid:
            # free the previously cached reference sequence, then fetch
            # the one for the read's contig
            if d.seq != NULL:
                free(d.seq)
            d.tid = b.core.tid
            with nogil:
                d.seq = faidx_fetch_seq(
                    d.fastafile,
                    d.header.target_name[d.tid],
                    0, MAX_POS,
                    &d.seq_len)

            if d.seq == NULL:
                raise ValueError(
                    "reference sequence for '{}' (tid={}) not found".format(
                        d.header.target_name[d.tid], d.tid))

        # realign read - changes base qualities
        if d.seq != NULL and d.compute_baq:
            # 4th option to realign is flag:
            # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4
            if d.redo_baq:
                sam_prob_realn(b, d.seq, d.seq_len, 7)
            else:
                sam_prob_realn(b, d.seq, d.seq_len, 3)

        # optionally cap the mapping quality; a negative capped value
        # drops the read entirely
        if d.seq != NULL and d.adjust_capq_threshold > 10:
            q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold)
            if q < 0:
                continue
            elif b.core.qual > q:
                b.core.qual = q

        if b.core.qual < d.min_mapping_quality:
            continue
        # drop paired reads that are not in a proper pair if requested
        if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR):
            continue

        break

    return ret
2444 | |
2445 | |
cdef class IteratorColumn:
    '''abstract base class for iterators over columns.

    IteratorColumn objects wrap the pileup functionality of samtools.

    For reasons of efficiency, the iterator points to the current
    pileup buffer. The pileup buffer is updated at every iteration.
    This might cause some unexpected behaviour. For example,
    consider the conversion to a list::

        f = AlignmentFile("file.bam", "rb")
        result = list(f.pileup())

    Here, ``result`` will contain ``n`` objects of type
    :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
    ``result`` will contain the same information.

    The desired behaviour can be achieved by list comprehension::

        result = [x.pileups() for x in f.pileup()]

    ``result`` will be a list of ``n`` lists of objects of type
    :class:`~pysam.PileupRead`.

    If the iterator is associated with a :class:`~pysam.Fastafile`
    using the :meth:`add_reference` method, then the iterator will
    export the current sequence via the methods :meth:`get_sequence`
    and :meth:`seq_len`.

    See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
    '''

    def __cinit__( self, AlignmentFile samfile, **kwargs):
        # configuration comes in via kwargs with samtools-like defaults
        self.samfile = samfile
        self.fastafile = kwargs.get("fastafile", None)
        self.stepper = kwargs.get("stepper", "samtools")
        self.max_depth = kwargs.get("max_depth", 8000)
        self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
        self.min_base_quality = kwargs.get("min_base_quality", 13)
        # iterdata is the C struct handed to the __advance_* callbacks
        self.iterdata.seq = NULL
        self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
        self.iterdata.flag_require = kwargs.get("flag_require", 0)
        self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
        self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
        self.iterdata.compute_baq = kwargs.get("compute_baq", True)
        self.iterdata.redo_baq = kwargs.get("redo_baq", False)
        self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)

        # current column state, updated by cnext()
        self.tid = 0
        self.pos = 0
        self.n_plp = 0
        self.plp = NULL
        self.pileup_iter = <bam_mplp_t>NULL

    def __iter__(self):
        return self

    cdef int cnext(self):
        '''perform next iteration.
        '''
        # do not release gil here because of call-backs
        cdef int ret = bam_mplp_auto(self.pileup_iter,
                                     &self.tid,
                                     &self.pos,
                                     &self.n_plp,
                                     &self.plp)
        return ret

    cdef char * get_sequence(self):
        '''return current reference sequence underlying the iterator.
        '''
        return self.iterdata.seq

    property seq_len:
        '''current sequence length.'''
        def __get__(self):
            return self.iterdata.seq_len

    def add_reference(self, FastaFile fastafile):
        '''
        add reference sequences in `fastafile` to iterator.'''
        self.fastafile = fastafile
        # drop any cached sequence; tid = -1 forces a re-fetch on next use
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
        self.iterdata.tid = -1
        self.iterdata.fastafile = self.fastafile.fastafile

    def has_reference(self):
        '''
        return true if iterator is associated with a reference'''
        return self.fastafile

    cdef _setup_iterator(self,
                         int tid,
                         int start,
                         int stop,
                         int multiple_iterators=0):
        '''setup the iterator structure'''

        # region-based row iterator feeding the pileup callbacks
        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.iter = self.iter.iter
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        # choose the advance callback according to the stepper
        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef _setup_raw_rest_iterator(self):
        '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST'''

        # no region iterator: the raw advance callbacks use sam_read1
        self.iter = None
        self.iterdata.iter = NULL
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        # choose the raw (iterator-less) advance callback per stepper;
        # the "samtools" stepper handles both cases internally
        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef reset(self, tid, start, stop):
        '''reset iterator position.

        This permits using the iterator multiple times without
        having to incur the full set-up costs.
        '''
        if self.iter is None:
            raise TypeError("Raw iterator set up without region cannot be reset")

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
        self.iterdata.iter = self.iter.iter

        # invalidate sequence if different tid
        if self.tid != tid:
            if self.iterdata.seq != NULL:
                free(self.iterdata.seq)
            self.iterdata.seq = NULL
            self.iterdata.tid = -1

        # self.pileup_iter = bam_mplp_init(1
        #                                  &__advancepileup,
        #                                  &self.iterdata)
        with nogil:
            bam_mplp_reset(self.pileup_iter)

    cdef _free_pileup_iter(self):
        '''free the memory alloc'd by bam_plp_init.

        This is needed before setup_iterator allocates another
        pileup_iter, or else memory will be lost. '''
        if self.pileup_iter != <bam_mplp_t>NULL:
            with nogil:
                bam_mplp_reset(self.pileup_iter)
                bam_mplp_destroy(self.pileup_iter)
                self.pileup_iter = <bam_mplp_t>NULL

    def __dealloc__(self):
        # reset in order to avoid memory leak messages for iterators
        # that have not been fully consumed
        self._free_pileup_iter()
        self.plp = <const bam_pileup1_t*>NULL

        # release the cached reference sequence, if any
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            self.iterdata.seq = NULL

    # backwards compatibility

    def hasReference(self):
        # deprecated alias for has_reference()
        return self.has_reference()
    cdef char * getSequence(self):
        # deprecated alias for get_sequence()
        return self.get_sequence()
    def addReference(self, FastaFile fastafile):
        # deprecated alias for add_reference()
        return self.add_reference(fastafile)
2694 | |
2695 | |
cdef class IteratorColumnRegion(IteratorColumn):
    '''iterates over a region only.
    '''
    def __cinit__(self,
                  AlignmentFile samfile,
                  int tid = 0,
                  int start = 0,
                  int stop = MAX_POS,
                  int truncate = False,
                  int multiple_iterators = True,
                  **kwargs ):

        # initialize iterator. Multiple iterators not available
        # for CRAM.
        if multiple_iterators and samfile.is_cram:
            warnings.warn("multiple_iterators not implemented for CRAM")
            multiple_iterators = False

        self._setup_iterator(tid, start, stop, multiple_iterators)
        self.start = start
        self.stop = stop
        self.truncate = truncate

    def __next__(self):

        cdef int n

        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration" )

            # n == 0 means the pileup is exhausted
            if n == 0:
                raise StopIteration

            # with truncate, skip columns before start and end at stop
            if self.truncate:
                if self.start > self.pos:
                    continue
                if self.pos >= self.stop:
                    raise StopIteration

            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)
2744 | |
2745 | |
cdef class IteratorColumnAllRefs(IteratorColumn):
    """iterates over all columns by chaining iterators over each reference
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # no iteration over empty files
        if not samfile.nreferences:
            raise StopIteration

        # initialize iterator on the first reference; the final
        # argument (1) requests a re-opened file handle, matching the
        # multiple_iterators parameter used by IteratorColumnRegion.
        # NOTE(review): self.tid is presumably 0 here (first reference)
        # — set/initialized in the base class, confirm there.
        self._setup_iterator(self.tid, 0, MAX_POS, 1)

    def __next__(self):

        cdef int n
        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration")

            # proceed to next reference or stop
            if n == 0:
                self.tid += 1
                if self.tid < self.samfile.nreferences:
                    # chain onto the next reference; pass 0 so the
                    # already-opened handle is reused
                    self._setup_iterator(self.tid, 0, MAX_POS, 0)
                else:
                    raise StopIteration
                continue

            # return result, if within same reference
            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)
2786 | |
2787 | |
2788 cdef class IteratorColumnAll(IteratorColumn): | |
2789 """iterates over all columns, without using an index | |
2790 """ | |
2791 | |
2792 def __cinit__(self, | |
2793 AlignmentFile samfile, | |
2794 **kwargs): | |
2795 | |
2796 self._setup_raw_rest_iterator() | |
2797 | |
2798 def __next__(self): | |
2799 | |
2800 cdef int n | |
2801 n = self.cnext() | |
2802 if n < 0: | |
2803 raise ValueError("error during iteration") | |
2804 | |
2805 if n == 0: | |
2806 raise StopIteration | |
2807 | |
2808 return makePileupColumn(&self.plp, | |
2809 self.tid, | |
2810 self.pos, | |
2811 self.n_plp, | |
2812 self.min_base_quality, | |
2813 self.iterdata.seq, | |
2814 self.samfile.header) | |
2815 | |
2816 | |
cdef class SNPCall:
    '''the results of a SNP call.'''
    # C-level storage for the call; exposed read-only through the
    # legacy `property` blocks below
    cdef int _tid
    cdef int _pos
    cdef char _reference_base
    cdef char _genotype
    cdef int _consensus_quality
    cdef int _snp_quality
    cdef int _rms_mapping_quality
    cdef int _coverage

    property tid:
        '''the chromosome ID as is defined in the header'''
        def __get__(self):
            return self._tid

    property pos:
        '''nucleotide position of SNP.'''
        def __get__(self): return self._pos

    property reference_base:
        '''reference base at pos. ``N`` if no reference sequence supplied.'''
        # stored as a single C char; converted to a length-1 string
        def __get__(self): return from_string_and_size( &self._reference_base, 1 )

    property genotype:
        '''the genotype called.'''
        # stored as a single C char; converted to a length-1 string
        def __get__(self): return from_string_and_size( &self._genotype, 1 )

    property consensus_quality:
        '''the genotype quality (Phred-scaled).'''
        def __get__(self): return self._consensus_quality

    property snp_quality:
        '''the snp quality (Phred scaled) - probability of consensus being
        identical to reference sequence.'''
        def __get__(self): return self._snp_quality

    property mapping_quality:
        '''the root mean square (rms) of the mapping quality of all reads
        involved in the call.'''
        def __get__(self): return self._rms_mapping_quality

    property coverage:
        '''coverage or read depth - the number of reads involved in the call.'''
        def __get__(self): return self._coverage

    def __str__(self):
        # tab-separated summary of all fields, in declaration order
        return "\t".join( map(str, (
            self.tid,
            self.pos,
            self.reference_base,
            self.genotype,
            self.consensus_quality,
            self.snp_quality,
            self.mapping_quality,
            self.coverage ) ) )
2874 | |
2875 | |
cdef class IndexedReads:
    """Index a Sam/BAM-file by query name while keeping the
    original sort order intact.

    The index is kept in memory and can be substantial.

    By default, the file is re-opened to avoid conflicts if multiple
    operators work on the same file. Set `multiple_iterators` = False
    to not re-open `samfile`.

    Parameters
    ----------

    samfile : AlignmentFile
        File to be indexed.

    multiple_iterators : bool
        Flag indicating whether the file should be reopened. Reopening prevents
        existing iterators being affected by the indexing.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
        cdef char *cfilename

        # makes sure that samfile stays alive as long as this
        # object is alive.
        self.samfile = samfile
        cdef bam_hdr_t * hdr = NULL
        # the byte-offset index below relies on BGZF virtual offsets,
        # hence BAM only
        assert samfile.is_bam, "can only apply IndexReads on bam files"

        # multiple_iterators the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:
            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            if self.htsfile == NULL:
                raise OSError("unable to reopen htsfile")

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr == NULL:
                raise OSError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)
            # we opened the handle, so __dealloc__ must close it
            self.owns_samfile = True
        else:
            # share the caller's handle; do NOT close it on dealloc
            self.htsfile = self.samfile.htsfile
            self.header = samfile.header
            self.owns_samfile = False

    def build(self):
        '''build the index.'''

        # maps query name -> list of BGZF virtual file offsets
        self.index = collections.defaultdict(list)

        # this method will start indexing from the current file position
        cdef int ret = 1
        cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
        if b == NULL:
            raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))

        cdef uint64_t pos
        cdef bam_hdr_t * hdr = self.header.ptr

        while ret > 0:
            with nogil:
                # record the offset BEFORE reading, so it points at the
                # start of the record just about to be read
                pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
                ret = sam_read1(self.htsfile,
                                hdr,
                                b)

            if ret > 0:
                qname = charptr_to_str(pysam_bam_get_qname(b))
                self.index[qname].append(pos)

        # frees b and its internal buffers
        bam_destroy1(b)

    def find(self, query_name):
        '''find `query_name` in index.

        Returns
        -------

        IteratorRowSelection
            Returns an iterator over all reads with query_name.

        Raises
        ------

        KeyError
            if the `query_name` is not in the index.

        '''
        if query_name in self.index:
            # multiple_iterators=False: the selection iterator seeks on
            # the handle held by this object
            return IteratorRowSelection(
                self.samfile,
                self.index[query_name],
                multiple_iterators = False)
        else:
            raise KeyError("read %s not found" % query_name)

    def __dealloc__(self):
        # close the handle only if we re-opened it in __init__
        if self.owns_samfile:
            hts_close(self.htsfile)