comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 # cython: embedsignature=True
2 # cython: profile=True
3 ########################################################
4 ########################################################
5 # Cython wrapper for SAM/BAM/CRAM files based on htslib
6 ########################################################
7 # The principal classes defined in this module are:
8 #
9 # class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
10 #
11 # class AlignmentHeader manage SAM/BAM/CRAM header data
12 #
13 # class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
14 # the original sort order intact
15 #
16 # Additionally this module defines numerous additional classes that
17 # are part of the internal API. These are:
18 #
19 # Various iterator classes to iterate over alignments in sequential
20 # (IteratorRow) or in a stacked fashion (IteratorColumn):
21 #
22 # class IteratorRow
23 # class IteratorRowRegion
24 # class IteratorRowHead
25 # class IteratorRowAll
26 # class IteratorRowAllRefs
27 # class IteratorRowSelection
28 # class IteratorColumn
29 # class IteratorColumnRegion
30 # class IteratorColumnAll
31 # class IteratorColumnAllRefs
32 #
33 ########################################################
34 #
35 # The MIT License
36 #
37 # Copyright (c) 2015 Andreas Heger
38 #
39 # Permission is hereby granted, free of charge, to any person obtaining a
40 # copy of this software and associated documentation files (the "Software"),
41 # to deal in the Software without restriction, including without limitation
42 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
43 # and/or sell copies of the Software, and to permit persons to whom the
44 # Software is furnished to do so, subject to the following conditions:
45 #
46 # The above copyright notice and this permission notice shall be included in
47 # all copies or substantial portions of the Software.
48 #
49 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
52 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
54 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
55 # DEALINGS IN THE SOFTWARE.
56 #
57 ########################################################
58 import os
59 import collections
60 try:
61 from collections.abc import Sequence, Mapping # noqa
62 except ImportError:
63 from collections import Sequence, Mapping # noqa
64 import re
65 import warnings
66 import array
67 from libc.errno cimport errno, EPIPE
68 from libc.string cimport strcmp, strpbrk, strerror
69 from libc.stdint cimport INT32_MAX
70
71 from cpython cimport array as c_array
72
73 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
74 from pysam.libcutils cimport encode_filename, from_string_and_size
75 from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
76 from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
77 HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL
78
79 from io import StringIO
80
81 cimport cython
82
83
84 __all__ = [
85 "AlignmentFile",
86 "AlignmentHeader",
87 "IteratorRow",
88 "IteratorColumn",
89 "IndexedReads"]
90
# Named record of per-contig index statistics: the contig name plus
# its mapped, unmapped and total counts.
IndexStats = collections.namedtuple("IndexStats",
                                    ("contig",
                                     "mapped",
                                     "unmapped",
                                     "total"))

########################################################
## global variables
# maximum genomic coordinate
# for some reason, using 'int' causes overflow
cdef int MAX_POS = (1 << 31) - 1

# valid types for SAM headers: record types that may occur at most
# once ("HD") map to Mapping; repeatable record types map to Sequence.
VALID_HEADER_TYPES = {"HD" : Mapping,
                      "SQ" : Sequence,
                      "RG" : Sequence,
                      "PG" : Sequence,
                      "CO" : Sequence}

# order of records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")

# default type conversions within SAM header records
# (tags not listed here are parsed as str)
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str,
                               "SS" : str,},
                       "SQ" : {"SN" : str, "LN" : int, "AS" : str,
                               "M5" : str, "SP" : str, "UR" : str,
                               "AH" : str, "TP" : str, "DS" : str,
                               "AN" : str,},
                       "RG" : {"ID" : str, "CN" : str, "DS" : str,
                               "DT" : str, "FO" : str, "KS" : str,
                               "LB" : str, "PG" : str, "PI" : str,
                               "PL" : str, "PM" : str, "PU" : str,
                               "SM" : str, "BC" : str,},
                       "PG" : {"ID" : str, "PN" : str, "CL" : str,
                               "PP" : str, "DS" : str, "VN" : str,},}

# output order of fields within records. Ensure that CL is at
# the end as parsing a CL will ignore any subsequent records.
VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"),
                      "SQ" : ("SN", "LN", "AS", "M5",
                              "UR", "SP", "AH", "TP",
                              "DS", "AN"),
                      "RG" : ("ID", "CN", "SM", "LB",
                              "PU", "PI", "DT", "DS",
                              "PL", "FO", "KS", "PG",
                              "PM", "BC"),
                      "PG" : ("PN", "ID", "VN", "PP",
                              "DS", "CL"),}
141
def build_header_line(fields, record):
    '''build a header line from `fields` dictionary for `record`'''

    # TODO: add checking for field and sort order
    parts = ["@%s" % record]

    if record == "CO":
        # comment records carry a free-text payload, not tag:value pairs
        parts.append(fields)
    elif record.islower():
        # user-defined (lower-case) record types: tags in alphabetical order
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in sorted(fields))
    else:
        # spec-defined record types: first the specified tags in their
        # canonical output order, then any user (non-upper-case) tags
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in VALID_HEADER_ORDER[record]
                     if tag in fields)
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in fields
                     if not tag.isupper())

    return "\t".join(parts)
166
167
cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
    """C-level factory: wrap an existing bam_hdr_t* in an AlignmentHeader.

    The wrapper takes ownership of `hdr` (it is destroyed in
    AlignmentHeader.__dealloc__). Raises ValueError on a NULL pointer.
    """
    if not hdr:
        raise ValueError('cannot create AlignmentHeader, received NULL pointer')

    # check: is AlignmentHeader.__cinit__ called?
    # (__new__ bypasses __init__, so bam_hdr_init is not called here)
    cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader)
    header.ptr = hdr

    return header
177
def read_failure_reason(code):
    """Translate a negative htslib read return code into a human-readable reason."""
    if code == -2:
        return 'truncated file'
    return "error {} while reading file".format(code)
183
184
# the following should be a class-method of AlignmentHeader, but cdef
# @classmethods are not implemented in cython.
# NOTE(review): in this changeset excerpt the function consists only of
# its docstring — the implementation appears to have been elided by the
# changeset viewer; confirm against the full file before editing.
cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
                                        reference_names,
                                        reference_lengths,
                                        add_sq_text=True,
                                        text=None) except -1:
    """build header from list of reference names and lengths.
    """
194
cdef class AlignmentHeader(object):
    """header information for a :class:`AlignmentFile` object

    Parameters
    ----------
    header_dict : dict
        build header from a multi-level dictionary. The
        first level are the four types ('HD', 'SQ', ...). The second
        level are a list of lines, with each line being a list of
        tag-value pairs. The header is constructed first from all the
        defined fields, followed by user tags in alphabetical
        order. Alternatively, an :class:`~pysam.AlignmentHeader`
        object can be passed directly.

    text : string
        use the string provided as the header

    reference_names : list
        see reference_lengths

    reference_lengths : list
        build header from list of chromosome names and lengths. By
        default, 'SQ' and 'LN' tags will be added to the header
        text. This option can be changed by unsetting the flag
        `add_sq_text`.

    add_sq_text : bool
        do not add 'SQ' and 'LN' tags to header. This option permits
        construction :term:`SAM` formatted files without a header.

    """

    # See makeAlignmentHeader for the C-level constructor that wraps
    # an existing bam_hdr_t*.
    def __cinit__(self):
        self.ptr = NULL

    # Python constructor
    def __init__(self):
        self.ptr = bam_hdr_init()
        if self.ptr is NULL:
            raise MemoryError("could not create header")

    @classmethod
    def _from_text_and_lengths(cls, text, reference_names, reference_lengths):
        """build a header from raw `text` plus parallel lists of reference
        names and lengths. Either part may be empty/None."""
        cdef AlignmentHeader self = AlignmentHeader()
        cdef char *ctext
        cdef int l_text
        cdef int n, x
        if text is not None:
            btext = force_bytes(text)
            ctext = btext
            l_text = len(btext)
            # copy the text section including the terminating NUL
            self.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
            if self.ptr.text == NULL:
                raise MemoryError("could not allocate {} bytes".format(l_text + 1))
            self.ptr.l_text = l_text
            memcpy(self.ptr.text, ctext, l_text + 1)

        if reference_names and reference_lengths:
            reference_names = [force_bytes(ref) for ref in reference_names]

            self.ptr.n_targets = len(reference_names)

            # allocate one extra zero-filled slot (presumably a NULL
            # sentinel — calloc zero-fills)
            n = len(reference_names) + 1
            self.ptr.target_name = <char**>calloc(n, sizeof(char*))
            if self.ptr.target_name == NULL:
                raise MemoryError("could not allocate {} bytes".format(n * sizeof(char*)))

            self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
            if self.ptr.target_len == NULL:
                raise MemoryError("could not allocate {} bytes".format(n * sizeof(uint32_t)))

            for x from 0 <= x < self.ptr.n_targets:
                self.ptr.target_len[x] = reference_lengths[x]
                name = reference_names[x]
                self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
                if self.ptr.target_name[x] == NULL:
                    raise MemoryError("could not allocate {} bytes".format(len(name) + 1))
                strncpy(self.ptr.target_name[x], name, len(name))

        return self

    @classmethod
    def from_text(cls, text):
        """build an AlignmentHeader from the text of a SAM header,
        extracting reference names and lengths from its @SQ lines."""
        reference_names, reference_lengths = [], []
        for line in text.splitlines():
            if line.startswith("@SQ"):
                fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(int(fields["LN"]))
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))
                except ValueError:
                    raise ValueError("wrong sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_dict(cls, header_dict):
        """build an AlignmentHeader from a two-level dictionary (see the
        class docstring for the expected layout)."""
        cdef list lines = []
        # first: defined tags, in canonical record order
        for record in VALID_HEADERS:
            if record in header_dict:
                data = header_dict[record]
                if not isinstance(data, VALID_HEADER_TYPES[record]):
                    # BUGFIX: was "...%s...".format(...) which mixed
                    # %-placeholders with str.format and emitted the
                    # template unfilled; use {} placeholders.
                    raise ValueError(
                        "invalid type for record {}: {}, expected {}".format(
                            record, type(data), VALID_HEADER_TYPES[record]))
                if isinstance(data, Mapping):
                    lines.append(build_header_line(data, record))
                else:
                    for fields in header_dict[record]:
                        lines.append(build_header_line(fields, record))

        # then: user tags (lower case), sorted alphabetically
        for record, data in sorted(header_dict.items()):
            if record in VALID_HEADERS:
                continue
            if isinstance(data, Mapping):
                lines.append(build_header_line(data, record))
            else:
                for fields in header_dict[record]:
                    lines.append(build_header_line(fields, record))

        text = "\n".join(lines) + "\n"

        # collect reference names/lengths from the SQ records so the
        # binary part of the header agrees with the text section
        reference_names, reference_lengths = [], []
        if "SQ" in header_dict:
            for fields in header_dict["SQ"]:
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(fields["LN"])
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):
        """build an AlignmentHeader from parallel lists of reference names
        and lengths, optionally generating @SQ text lines."""
        if len(reference_names) != len(reference_lengths):
            raise ValueError("number of reference names and lengths do not match")

        # optionally, if there is no text, add a SAM compatible header to output file.
        if text is None and add_sq_text:
            text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
                reference_names, reference_lengths)])

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    def __dealloc__(self):
        bam_hdr_destroy(self.ptr)
        self.ptr = NULL

    def __bool__(self):
        return self.ptr != NULL

    def copy(self):
        """return an independent copy of this header."""
        return makeAlignmentHeader(bam_hdr_dup(self.ptr))

    property nreferences:
        """int with the number of :term:`reference` sequences in the file.

        This is a read-only attribute."""
        def __get__(self):
            return self.ptr.n_targets

    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(charptr_to_str(self.ptr.target_name[x]))
            return tuple(t)

    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`
        """
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(self.ptr.target_len[x])
            return tuple(t)

    def _build_sequence_section(self):
        """return sequence section of header.

        The sequence section is built from the list of reference names and
        lengths stored in the BAM-file and not from any @SQ entries that
        are part of the header's text section.
        """

        cdef int x
        text = []
        for x in range(self.ptr.n_targets):
            text.append("@SQ\tSN:{}\tLN:{}\n".format(
                force_str(self.ptr.target_name[x]),
                self.ptr.target_len[x]))
        return "".join(text)

    def to_dict(self):
        """return two-level dictionary with header information from the file.

        The first level contains the record (``HD``, ``SQ``, etc) and
        the second level contains the fields (``VN``, ``LN``, etc).

        The parser is validating and will raise an AssertionError if
        it encounters any record or field tags that are not part of
        the SAM specification. Use the
        :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
        header.

        The parsing follows the SAM format specification with the
        exception of the ``CL`` field. This option will consume the
        rest of a header line irrespective of any additional fields.
        This behaviour has been added to accommodate command line
        options that contain characters that are not valid field
        separators.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.
        """
        result = collections.OrderedDict()

        # convert to python string
        t = self.__str__()
        for line in t.split("\n"):
            line = line.strip(' \0')
            if not line:
                continue
            assert line.startswith("@"), \
                "header line without '@': '%s'" % line
            fields = line[1:].split("\t")
            record = fields[0]
            assert record in VALID_HEADER_TYPES, \
                "header line with invalid type '%s': '%s'" % (record, line)

            # treat comments: free text, no tag:value parsing
            if record == "CO":
                if record not in result:
                    result[record] = []
                result[record].append("\t".join( fields[1:]))
                continue
            # the following is clumsy as generators do not work?
            x = {}

            for idx, field in enumerate(fields[1:]):
                if ":" not in field:
                    raise ValueError("malformatted header: no ':' in field" )
                key, value = field.split(":", 1)
                if key in ("CL",):
                    # special treatment for command line
                    # statements (CL). These might contain
                    # characters that are non-conformant with
                    # the valid field separators in the SAM
                    # header. Thus, in contravention to the
                    # SAM API, consume the rest of the line.
                    key, value = "\t".join(fields[idx+1:]).split(":", 1)
                    x[key] = KNOWN_HEADER_FIELDS[record][key](value)
                    break

                # interpret type of known header record tags, default to str
                x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)

            if VALID_HEADER_TYPES[record] == Mapping:
                if record in result:
                    raise ValueError(
                        "multiple '%s' lines are not permitted" % record)

                result[record] = x
            elif VALID_HEADER_TYPES[record] == Sequence:
                if record not in result: result[record] = []
                result[record].append(x)

        # if there are no SQ lines in the header, add the
        # reference names from the information in the bam
        # file.
        #
        # Background: c-samtools keeps the textual part of the
        # header separate from the list of reference names and
        # lengths. Thus, if a header contains no SQ lines,
        # the SQ information is not part of the textual header
        # and thus would be missing from the output. See issue 84.
        if "SQ" not in result:
            sq = []
            for ref, length in zip(self.references, self.lengths):
                sq.append({'LN': length, 'SN': ref })
            result["SQ"] = sq

        return result

    def as_dict(self):
        """deprecated, use :meth:`to_dict()` instead"""
        return self.to_dict()

    def get_reference_name(self, tid):
        """return the reference name for numerical `tid`, or None for the
        unmapped code (-1). Raises ValueError for out-of-range tids."""
        if tid == -1:
            return None
        if not 0 <= tid < self.ptr.n_targets:
            raise ValueError("reference_id %i out of range 0<=tid<%i" %
                             (tid, self.ptr.n_targets))
        return charptr_to_str(self.ptr.target_name[tid])

    def get_reference_length(self, reference):
        """return the length of `reference`. Raises KeyError if the
        reference is not known."""
        cdef int tid = self.get_tid(reference)
        if tid < 0:
            raise KeyError("unknown reference {}".format(reference))
        else:
            return self.ptr.target_len[tid]

    def is_valid_tid(self, int tid):
        """
        return True if the numerical :term:`tid` is valid; False otherwise.

        Note that the unmapped tid code (-1) counts as an invalid.
        """
        return 0 <= tid < self.ptr.n_targets

    def get_tid(self, reference):
        """
        return the numerical :term:`tid` corresponding to
        :term:`reference`

        returns -1 if reference is not known.
        """
        reference = force_bytes(reference)
        tid = bam_name2id(self.ptr, reference)
        if tid < -1:
            raise ValueError('could not parse header')
        return tid

    def __str__(self):
        '''string with the full contents of the :term:`sam file` header as a
        string.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.

        See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
        representation of the header.
        '''
        text = from_string_and_size(self.ptr.text, self.ptr.l_text)
        if "@SQ" not in text:
            text += "\n" + self._build_sequence_section()
        return text

    # dictionary access methods, for backwards compatibility.
    def __setitem__(self, key, value):
        raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict()")

    def __getitem__(self, key):
        return self.to_dict().__getitem__(key)

    def items(self):
        return self.to_dict().items()

    # PY2 compatibility
    def iteritems(self):
        return self.to_dict().items()

    def keys(self):
        return self.to_dict().keys()

    def values(self):
        return self.to_dict().values()

    def get(self, *args):
        return self.to_dict().get(*args)

    def __len__(self):
        return self.to_dict().__len__()

    def __contains__(self, key):
        return self.to_dict().__contains__(key)
580
581
582 cdef class AlignmentFile(HTSFile):
583 """AlignmentFile(filepath_or_object, mode=None, template=None,
584 reference_names=None, reference_lengths=None, text=NULL,
585 header=None, add_sq_text=False, check_header=True, check_sq=True,
586 reference_filename=None, filename=None, index_filename=None,
587 filepath_index=None, require_index=False, duplicate_filehandle=True,
588 ignore_truncation=False, threads=1)
589
590 A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.
591
592 If `filepath_or_object` is a string, the file is automatically
593 opened. If `filepath_or_object` is a python File object, the
594 already opened file will be used.
595
596 If the file is opened for reading and an index exists (if file is BAM, a
597 .bai file or if CRAM a .crai file), it will be opened automatically.
598 `index_filename` may be specified explicitly. If the index is not named
599 in the standard manner, not located in the same directory as the
600 BAM/CRAM file, or is remote. Without an index, random access via
601 :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup`
602 is disabled.
603
604 For writing, the header of a :term:`SAM` file/:term:`BAM` file can
605 be constituted from several sources (see also the samtools format
606 specification):
607
608 1. If `template` is given, the header is copied from another
609 `AlignmentFile` (`template` must be a
610 :class:`~pysam.AlignmentFile`).
611
612 2. If `header` is given, the header is built from a
613 multi-level dictionary.
614
615 3. If `text` is given, new header text is copied from raw
616 text.
617
618 4. The names (`reference_names`) and lengths
619 (`reference_lengths`) are supplied directly as lists.
620
621 When reading or writing a CRAM file, the filename of a FASTA-formatted
622 reference can be specified with `reference_filename`.
623
624 By default, if a file is opened in mode 'r', it is checked
625 for a valid header (`check_header` = True) and a definition of
626 chromosome names (`check_sq` = True).
627
628 Parameters
629 ----------
630 mode : string
631 `mode` should be ``r`` for reading or ``w`` for writing. The
632 default is text mode (:term:`SAM`). For binary (:term:`BAM`)
633 I/O you should append ``b`` for compressed or ``u`` for
634 uncompressed :term:`BAM` output. Use ``h`` to output header
635 information in text (:term:`TAM`) mode. Use ``c`` for
636 :term:`CRAM` formatted files.
637
638 If ``b`` is present, it must immediately follow ``r`` or
639 ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
640 ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
641 :term:`BAM` formatted file for reading, type::
642
643 f = pysam.AlignmentFile('ex1.bam','rb')
644
645 If mode is not specified, the method will try to auto-detect
646 in the order 'rb', 'r', thus both the following should work::
647
648 f1 = pysam.AlignmentFile('ex1.bam')
649 f2 = pysam.AlignmentFile('ex1.sam')
650
651 template : AlignmentFile
652 when writing, copy header from file `template`.
653
654 header : dict or AlignmentHeader
655 when writing, build header from a multi-level dictionary. The
656 first level are the four types ('HD', 'SQ', ...). The second
657 level are a list of lines, with each line being a list of
658 tag-value pairs. The header is constructed first from all the
659 defined fields, followed by user tags in alphabetical
660 order. Alternatively, an :class:`~pysam.AlignmentHeader`
661 object can be passed directly.
662
663 text : string
664 when writing, use the string provided as the header
665
666 reference_names : list
667 see reference_lengths
668
669 reference_lengths : list
670 when writing or opening a SAM file without header build header
671 from list of chromosome names and lengths. By default, 'SQ'
672 and 'LN' tags will be added to the header text. This option
673 can be changed by unsetting the flag `add_sq_text`.
674
675 add_sq_text : bool
676 do not add 'SQ' and 'LN' tags to header. This option permits
677 construction :term:`SAM` formatted files without a header.
678
679 add_sam_header : bool
680 when outputting SAM the default is to output a header. This is
681 equivalent to opening the file in 'wh' mode. If this option is
682 set to False, no header will be output. To read such a file,
683 set `check_header=False`.
684
685 check_header : bool
686 obsolete: when reading a SAM file, check if header is present
687 (default=True)
688
689 check_sq : bool
690 when reading, check if SQ entries are present in header
691 (default=True)
692
693 reference_filename : string
694 Path to a FASTA-formatted reference file. Valid only for CRAM files.
695 When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
696 specified in the header (``UR`` tag), which are normally used to find
697 the reference.
698
699 index_filename : string
700 Explicit path to the index file. Only needed if the index is not
701 named in the standard manner, not located in the same directory as
702 the BAM/CRAM file, or is remote. An IOError is raised if the index
703 cannot be found or is invalid.
704
705 filepath_index : string
706 Alias for `index_filename`.
707
708 require_index : bool
709 When reading, require that an index file is present and is valid or
710 raise an IOError. (default=False)
711
712 filename : string
713 Alternative to filepath_or_object. Filename of the file
714 to be opened.
715
716 duplicate_filehandle: bool
717 By default, file handles passed either directly or through
718 File-like objects will be duplicated before passing them to
719 htslib. The duplication prevents issues where the same stream
720 will be closed by htslib and through destruction of the
721 high-level python object. Set to False to turn off
722 duplication.
723
724 ignore_truncation: bool
725 Issue a warning, instead of raising an error if the current file
726 appears to be truncated due to a missing EOF marker. Only applies
727 to bgzipped formats. (Default=False)
728
729 format_options: list
730 A list of key=value strings, as accepted by --input-fmt-option and
731 --output-fmt-option in samtools.
732 threads: integer
733 Number of threads to use for compressing/decompressing BAM/CRAM files.
734 Setting threads to > 1 cannot be combined with `ignore_truncation`.
735 (Default=1)
736 """
737
738 def __cinit__(self, *args, **kwargs):
739 self.htsfile = NULL
740 self.filename = None
741 self.mode = None
742 self.threads = 1
743 self.is_stream = False
744 self.is_remote = False
745 self.index = NULL
746
747 if "filename" in kwargs:
748 args = [kwargs["filename"]]
749 del kwargs["filename"]
750
751 self._open(*args, **kwargs)
752
753 # allocate memory for iterator
754 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
755 if self.b == NULL:
756 raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t)))
757
758 def has_index(self):
759 """return true if htsfile has an existing (and opened) index.
760 """
761 return self.index != NULL
762
763 def check_index(self):
764 """return True if index is present.
765
766 Raises
767 ------
768
769 AttributeError
770 if htsfile is :term:`SAM` formatted and thus has no index.
771
772 ValueError
773 if htsfile is closed or index could not be opened.
774 """
775
776 if not self.is_open:
777 raise ValueError("I/O operation on closed file")
778 if not self.is_bam and not self.is_cram:
779 raise AttributeError(
780 "AlignmentFile.mapped only available in bam files")
781 if self.index == NULL:
782 raise ValueError(
783 "mapping information not recorded in index "
784 "or index not available")
785 return True
786
787 def _open(self,
788 filepath_or_object,
789 mode=None,
790 AlignmentFile template=None,
791 reference_names=None,
792 reference_lengths=None,
793 reference_filename=None,
794 text=None,
795 header=None,
796 port=None,
797 add_sq_text=True,
798 add_sam_header=True,
799 check_header=True,
800 check_sq=True,
801 index_filename=None,
802 filepath_index=None,
803 require_index=False,
804 referencenames=None,
805 referencelengths=None,
806 duplicate_filehandle=True,
807 ignore_truncation=False,
808 format_options=None,
809 threads=1):
810 '''open a sam, bam or cram formatted file.
811
812 If _open is called on an existing file, the current file
813 will be closed and a new file will be opened.
814
815 '''
816 cdef char *cfilename = NULL
817 cdef char *creference_filename = NULL
818 cdef char *cindexname = NULL
819 cdef char *cmode = NULL
820 cdef bam_hdr_t * hdr = NULL
821
822 if threads > 1 and ignore_truncation:
823 # This won't raise errors if reaching a truncated alignment,
824 # because bgzf_mt_reader in htslib does not deal with
825 # bgzf_mt_read_block returning non-zero values, contrary
826 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
827 # Better to avoid this (for now) than to produce seemingly correct results.
828 raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
829 self.threads = threads
830
831 # for backwards compatibility:
832 if referencenames is not None:
833 reference_names = referencenames
834 if referencelengths is not None:
835 reference_lengths = referencelengths
836
837 # close a previously opened file
838 if self.is_open:
839 self.close()
840
841 # autodetection for read
842 if mode is None:
843 mode = "r"
844
845 if add_sam_header and mode == "w":
846 mode = "wh"
847
848 assert mode in ("r", "w", "rb", "wb", "wh",
849 "wbu", "rU", "wb0",
850 "rc", "wc"), \
851 "invalid file opening mode `%s`" % mode
852
853 self.duplicate_filehandle = duplicate_filehandle
854
855 # StringIO not supported
856 if isinstance(filepath_or_object, StringIO):
857 raise NotImplementedError(
858 "access from StringIO objects not supported")
859 # reading from a file descriptor
860 elif isinstance(filepath_or_object, int):
861 self.filename = filepath_or_object
862 filename = None
863 self.is_remote = False
864 self.is_stream = True
865 # reading from a File object or other object with fileno
866 elif hasattr(filepath_or_object, "fileno"):
867 if filepath_or_object.closed:
868 raise ValueError('I/O operation on closed file')
869 self.filename = filepath_or_object
870 # .name can be TextIOWrapper
871 try:
872 filename = encode_filename(str(filepath_or_object.name))
873 cfilename = filename
874 except AttributeError:
875 filename = None
876 self.is_remote = False
877 self.is_stream = True
878 # what remains is a filename
879 else:
880 self.filename = filename = encode_filename(filepath_or_object)
881 cfilename = filename
882 self.is_remote = hisremote(cfilename)
883 self.is_stream = self.filename == b'-'
884
885 # for htslib, wbu seems to not work
886 if mode == "wbu":
887 mode = "wb0"
888
889 self.mode = force_bytes(mode)
890 self.reference_filename = reference_filename = encode_filename(
891 reference_filename)
892
893 if mode[0] == 'w':
894 # open file for writing
895
896 if not (template or header or text or (reference_names and reference_lengths)):
897 raise ValueError(
898 "either supply options `template`, `header`, `text` or both `reference_names` "
899 "and `reference_lengths` for writing")
900
901 if template:
902 # header is copied, though at the moment not strictly
903 # necessary as AlignmentHeader is immutable.
904 self.header = template.header.copy()
905 elif isinstance(header, AlignmentHeader):
906 self.header = header.copy()
907 elif isinstance(header, Mapping):
908 self.header = AlignmentHeader.from_dict(header)
909 elif reference_names and reference_lengths:
910 self.header = AlignmentHeader.from_references(
911 reference_names,
912 reference_lengths,
913 add_sq_text=add_sq_text,
914 text=text)
915 elif text:
916 self.header = AlignmentHeader.from_text(text)
917 else:
918 raise ValueError("not enough information to construct header. Please provide template, "
919 "header, text or reference_names/reference_lengths")
920 self.htsfile = self._open_htsfile()
921
922 if self.htsfile == NULL:
923 if errno:
924 raise IOError(errno, "could not open alignment file `{}`: {}".format(
925 force_str(filename),
926 force_str(strerror(errno))))
927 else:
928 raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
929 if format_options and len(format_options):
930 self.add_hts_options(format_options)
931 # set filename with reference sequences. If no filename
932 # is given, the CRAM reference arrays will be built from
933 # the @SQ header in the header
934 if "c" in mode and reference_filename:
935 if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0):
936 raise ValueError("failure when setting reference filename")
937
938 # write header to htsfile
939 if "b" in mode or "c" in mode or "h" in mode:
940 hdr = self.header.ptr
941 with nogil:
942 sam_hdr_write(self.htsfile, hdr)
943
944 elif mode[0] == "r":
945 # open file for reading
946 self.htsfile = self._open_htsfile()
947
948 if self.htsfile == NULL:
949 if errno:
950 raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
951 force_str(strerror(errno))))
952 else:
953 raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
954
955 if self.htsfile.format.category != sequence_data:
956 raise ValueError("file does not contain alignment data")
957
958 if format_options and len(format_options):
959 self.add_hts_options(format_options)
960
961 self.check_truncation(ignore_truncation)
962
963 # bam/cram files require a valid header
964 if self.is_bam or self.is_cram:
965 with nogil:
966 hdr = sam_hdr_read(self.htsfile)
967 if hdr == NULL:
968 raise ValueError(
969 "file does not have a valid header (mode='%s') "
970 "- is it BAM/CRAM format?" % mode)
971 self.header = makeAlignmentHeader(hdr)
972 else:
973 # in sam files a header is optional. If not given,
974 # user may provide reference names and lengths to built
975 # an on-the-fly header.
976 if reference_names and reference_lengths:
977 # build header from a target names and lengths
978 self.header = AlignmentHeader.from_references(
979 reference_names=reference_names,
980 reference_lengths=reference_lengths,
981 add_sq_text=add_sq_text,
982 text=text)
983 else:
984 with nogil:
985 hdr = sam_hdr_read(self.htsfile)
986 if hdr == NULL:
987 raise ValueError(
988 "SAM? file does not have a valid header (mode='%s'), "
989 "please provide reference_names and reference_lengths")
990 self.header = makeAlignmentHeader(hdr)
991
992 # set filename with reference sequences
993 if self.is_cram and reference_filename:
994 creference_filename = self.reference_filename
995 hts_set_opt(self.htsfile,
996 CRAM_OPT_REFERENCE,
997 creference_filename)
998
999 if check_sq and self.header.nreferences == 0:
1000 raise ValueError(
1001 ("file has no sequences defined (mode='%s') - "
1002 "is it SAM/BAM format? Consider opening with "
1003 "check_sq=False") % mode)
1004
1005 if self.is_bam or self.is_cram:
1006 self.index_filename = index_filename or filepath_index
1007 if self.index_filename:
1008 cindexname = bfile_name = encode_filename(self.index_filename)
1009
1010 if cfilename or cindexname:
1011 with nogil:
1012 self.index = sam_index_load3(self.htsfile, cfilename, cindexname,
1013 HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL)
1014
1015 if not self.index and (cindexname or require_index):
1016 if errno:
1017 raise IOError(errno, force_str(strerror(errno)))
1018 else:
1019 raise IOError('unable to open index file `%s`' % self.index_filename)
1020
1021 elif require_index:
1022 raise IOError('unable to open index file')
1023
1024 # save start of data section
1025 if not self.is_stream:
1026 self.start_offset = self.tell()
1027
    def fetch(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              tid=None,
              until_eof=False,
              multiple_iterators=False,
              reference=None,
              end=None):
        """fetch reads aligned in a :term:`region`.

        See :meth:`~pysam.HTSFile.parse_region` for more information
        on how genomic regions can be specified. :term:`reference` and
        `end` are also accepted for backward compatibility as synonyms
        for :term:`contig` and `stop`, respectively.

        Without a `contig` or `region` all mapped reads in the file
        will be fetched. The reads will be returned ordered by reference
        sequence, which will not necessarily be the order within the
        file. This mode of iteration still requires an index. If there is
        no index, use `until_eof=True`.

        If only `contig` is set, all reads aligned to `contig`
        will be fetched.

        A :term:`SAM` file does not allow random access. If `region`
        or `contig` are given, an exception is raised.

        Parameters
        ----------

        until_eof : bool

           If `until_eof` is True, all reads from the current file
           position will be returned in order as they are within the
           file. Using this option will also fetch unmapped reads.

        multiple_iterators : bool

           If `multiple_iterators` is True, multiple
           iterators on the same file can be used at the same time. The
           iterator returned will receive its own copy of a filehandle to
           the file effectively re-opening the file. Re-opening a file
           creates some overhead, so beware.

        Returns
        -------

        An iterator over a collection of reads. : IteratorRow

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid or the
            file does not permit random access to genomic coordinates.

        """
        cdef int rtid, rstart, rstop, has_coord

        if not self.is_open:
            raise ValueError( "I/O operation on closed file" )

        # normalize contig/start/stop/region/tid (plus the backward
        # compatible reference/end synonyms) into numeric coordinates
        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, tid,
            end=end, reference=reference)

        # Turn off re-opening if htsfile is a stream: a stream cannot be
        # re-opened, so independent iterators are impossible
        if self.is_stream:
            multiple_iterators = False

        if self.is_bam or self.is_cram:
            # region queries and by-reference iteration need an index
            # unless we simply stream through until EOF
            if not until_eof and not self.is_remote:
                if not self.has_index():
                    raise ValueError(
                        "fetch called on bamfile without index")

            if has_coord:
                return IteratorRowRegion(
                    self, rtid, rstart, rstop,
                    multiple_iterators=multiple_iterators)
            else:
                if until_eof:
                    return IteratorRowAll(
                        self,
                        multiple_iterators=multiple_iterators)
                else:
                    # AH: check - reason why no multiple_iterators for
                    # AllRefs?
                    return IteratorRowAllRefs(
                        self,
                        multiple_iterators=multiple_iterators)
        else:
            # plain SAM: sequential access only
            if has_coord:
                raise ValueError(
                    "fetching by region is not available for SAM files")

            if multiple_iterators == True:
                raise ValueError(
                    "multiple iterators not implemented for SAM files")

            return IteratorRowAll(self,
                                  multiple_iterators=multiple_iterators)
1132
1133 def head(self, n, multiple_iterators=True):
1134 '''return an iterator over the first n alignments.
1135
1136 This iterator is is useful for inspecting the bam-file.
1137
1138 Parameters
1139 ----------
1140
1141 multiple_iterators : bool
1142
1143 is set to True by default in order to
1144 avoid changing the current file position.
1145
1146 Returns
1147 -------
1148
1149 an iterator over a collection of reads : IteratorRowHead
1150
1151 '''
1152 return IteratorRowHead(self, n,
1153 multiple_iterators=multiple_iterators)
1154
1155 def mate(self, AlignedSegment read):
1156 '''return the mate of :class:`pysam.AlignedSegment` `read`.
1157
1158 .. note::
1159
1160 Calling this method will change the file position.
1161 This might interfere with any iterators that have
1162 not re-opened the file.
1163
1164 .. note::
1165
1166 This method is too slow for high-throughput processing.
1167 If a read needs to be processed with its mate, work
1168 from a read name sorted file or, better, cache reads.
1169
1170 Returns
1171 -------
1172
1173 the mate : AlignedSegment
1174
1175 Raises
1176 ------
1177
1178 ValueError
1179 if the read is unpaired or the mate is unmapped
1180
1181 '''
1182 cdef uint32_t flag = read._delegate.core.flag
1183
1184 if flag & BAM_FPAIRED == 0:
1185 raise ValueError("read %s: is unpaired" %
1186 (read.query_name))
1187 if flag & BAM_FMUNMAP != 0:
1188 raise ValueError("mate %s: is unmapped" %
1189 (read.query_name))
1190
1191 # xor flags to get the other mate
1192 cdef int x = BAM_FREAD1 + BAM_FREAD2
1193 flag = (flag ^ x) & x
1194
1195 # Make sure to use a separate file to jump around
1196 # to mate as otherwise the original file position
1197 # will be lost
1198 # The following code is not using the C API and
1199 # could thus be made much quicker, for example
1200 # by using tell and seek.
1201 for mate in self.fetch(
1202 read._delegate.core.mpos,
1203 read._delegate.core.mpos + 1,
1204 tid=read._delegate.core.mtid,
1205 multiple_iterators=True):
1206 if mate.flag & flag != 0 and \
1207 mate.query_name == read.query_name:
1208 break
1209 else:
1210 raise ValueError("mate not found")
1211
1212 return mate
1213
    def pileup(self,
               contig=None,
               start=None,
               stop=None,
               region=None,
               reference=None,
               end=None,
               **kwargs):
        """perform a :term:`pileup` within a :term:`region`. The region is
        specified by :term:`contig`, `start` and `stop` (using
        0-based indexing). :term:`reference` and `end` are also accepted for
        backward compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a samtools 'region' string
        can be supplied.

        Without 'contig' or 'region' all reads will be used for the
        pileup. The reads will be returned ordered by
        :term:`contig` sequence, which will not necessarily be the
        order within the file.

        Note that :term:`SAM` formatted files do not allow random
        access. In these files, if a 'region' or 'contig' are
        given an exception is raised.

        .. note::

            'all' reads which overlap the region are returned. The
            first base returned will be the first base of the first
            read 'not' necessarily the first base of the region used
            in the query.

        Parameters
        ----------

        truncate : bool

           By default, the samtools pileup engine outputs all reads
           overlapping a region. If truncate is True and a region is
           given, only columns in the exact region specified are
           returned.

        max_depth : int
           Maximum read depth permitted. The default limit is '8000'.

        stepper : string
           The stepper controls how the iterator advances.
           Possible options for the stepper are

           ``all``
              skip reads in which any of the following flags are set:
              BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP

           ``nofilter``
              uses every single read turning off any filtering.

           ``samtools``
              same filter and read processing as in samtools
              pileup. For full compatibility, this requires a
              'fastafile' to be given. The following options all pertain
              to filtering of the ``samtools`` stepper.

        fastafile : :class:`~pysam.FastaFile` object.

           This is required for some of the steppers.

        ignore_overlaps: bool

           If set to True, detect if read pairs overlap and only take
           the higher quality base. This is the default.

        flag_filter : int

           ignore reads where any of the bits in the flag are set. The default is
           BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP.

        flag_require : int

           only use reads where certain flags are set. The default is 0.

        ignore_orphans: bool

            ignore orphans (paired reads that are not in a proper pair).
            The default is to ignore orphans.

        min_base_quality: int

            Minimum base quality. Bases below the minimum quality will
            not be output. The default is 13.

        adjust_capq_threshold: int

            adjust mapping quality. The default is 0 for no
            adjustment. The recommended value for adjustment is 50.

        min_mapping_quality : int

            only use reads above a minimum mapping quality. The default is 0.

        compute_baq: bool

            re-alignment computing per-Base Alignment Qualities (BAQ). The
            default is to do re-alignment. Realignment requires a reference
            sequence. If none is present, no realignment will be performed.

        redo_baq: bool

            recompute per-Base Alignment Quality on the fly ignoring
            existing base qualities. The default is False (use existing
            base qualities).

        Returns
        -------

        an iterator over genomic positions. : IteratorColumn

        """
        cdef int rtid, has_coord
        cdef int32_t rstart, rstop

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        # normalize the region specification into numeric coordinates
        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, reference=reference, end=end)

        if has_coord:
            # pileup over an explicit region requires random access
            if not self.has_index():
                raise ValueError("no index available for pileup")

            return IteratorColumnRegion(self,
                                        tid=rtid,
                                        start=rstart,
                                        stop=rstop,
                                        **kwargs)
        else:
            # whole-file pileup: iterate reference by reference when an
            # index is available, otherwise stream straight through
            if self.has_index():
                return IteratorColumnAllRefs(self, **kwargs)
            else:
                return IteratorColumnAll(self, **kwargs)
1353
    def count(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              until_eof=False,
              read_callback="nofilter",
              reference=None,
              end=None):
        '''count the number of reads in :term:`region`

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a `samtools`_ :term:`region`
        string can be supplied.

        A :term:`SAM` file does not allow random access and if
        `region` or `contig` are given, an exception is raised.

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive)

        stop : int
            end of the genomic region (0-based exclusive)

        region : string
            a region string in samtools format.

        until_eof : bool
            count until the end of the file, possibly including
            unmapped reads as well.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        '''
        cdef AlignedSegment read
        cdef long counter = 0

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        # resolve the string callbacks once, outside the read loop;
        # any other value is assumed to be a callable applied per read
        cdef int filter_method = 0
        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        for read in self.fetch(contig=contig,
                               start=start,
                               stop=stop,
                               reference=reference,
                               end=end,
                               region=region,
                               until_eof=until_eof):
            # apply filter
            if filter_method == 1:
                # filter = "all": skip unmapped/secondary/qcfail/duplicate
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue
            counter += 1

        return counter
1456
    @cython.boundscheck(False)  # we do manual bounds checking
    def count_coverage(self,
                       contig,
                       start=None,
                       stop=None,
                       region=None,
                       quality_threshold=15,
                       read_callback='all',
                       reference=None,
                       end=None):
        """count the coverage of genomic positions by reads in :term:`region`.

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a `samtools`_ :term:`region`
        string can be supplied. The coverage is computed per-base [ACGT].

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive). If not
            given, count from the start of the chromosome.

        stop : int
            end of the genomic region (0-based exclusive). If not given,
            count to the end of the chromosome.

        region : string
            a region string.

        quality_threshold : int
            quality_threshold is the minimum quality score (in phred) a
            base has to reach to be counted.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        Returns
        -------

        four array.arrays of the same length in order A C G T : tuple

        """

        # clamp the window to [0, contig_length)
        cdef uint32_t contig_length = self.get_reference_length(contig)
        cdef int _start = start if start is not None else 0
        cdef int _stop = stop if stop is not None else contig_length
        _stop = _stop if _stop < contig_length else contig_length

        if _stop == _start:
            raise ValueError("interval of size 0")
        if _stop < _start:
            raise ValueError("interval of size less than 0")

        # one unsigned-long counter per base per position in the window
        cdef int length = _stop - _start
        cdef c_array.array int_array_template = array.array('L', [])
        cdef c_array.array count_a
        cdef c_array.array count_c
        cdef c_array.array count_g
        cdef c_array.array count_t
        count_a = c_array.clone(int_array_template, length, zero=True)
        count_c = c_array.clone(int_array_template, length, zero=True)
        count_g = c_array.clone(int_array_template, length, zero=True)
        count_t = c_array.clone(int_array_template, length, zero=True)

        cdef AlignedSegment read
        cdef cython.str seq
        cdef c_array.array quality
        cdef int qpos
        cdef int refpos
        cdef int c = 0
        cdef int filter_method = 0


        # resolve the string callbacks once, outside the read loop
        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        cdef int _threshold = quality_threshold or 0
        for read in self.fetch(contig=contig,
                               reference=reference,
                               start=start,
                               stop=stop,
                               end=end,
                               region=region):
            # apply filter
            if filter_method == 1:
                # filter = "all": skip unmapped/secondary/qcfail/duplicate
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue

            # count
            seq = read.seq
            if seq is None:
                continue
            quality = read.query_qualities

            # matches_only=True: only (qpos, refpos) pairs where both
            # sides are aligned are returned
            for qpos, refpos in read.get_aligned_pairs(True):
                if qpos is not None and refpos is not None and \
                   _start <= refpos < _stop:

                    # only check base quality if _threshold > 0
                    if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
                        if seq[qpos] == 'A':
                            count_a.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'C':
                            count_c.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'G':
                            count_g.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'T':
                            count_t.data.as_ulongs[refpos - _start] += 1

        return count_a, count_c, count_g, count_t
1608
1609 def find_introns_slow(self, read_iterator):
1610 """Return a dictionary {(start, stop): count}
1611 Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
1612 and their support ( = number of reads ).
1613
1614 read_iterator can be the result of a .fetch(...) call.
1615 Or it can be a generator filtering such reads. Example
1616 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
1617 """
1618 res = collections.Counter()
1619 for r in read_iterator:
1620 if 'N' in r.cigarstring:
1621 last_read_pos = False
1622 for read_loc, genome_loc in r.get_aligned_pairs():
1623 if read_loc is None and last_read_pos:
1624 start = genome_loc
1625 elif read_loc and last_read_pos is None:
1626 stop = genome_loc # we are right exclusive ,so this is correct
1627 res[(start, stop)] += 1
1628 del start
1629 del stop
1630 last_read_pos = read_loc
1631 return res
1632
    def find_introns(self, read_iterator):
        """Return a dictionary {(start, stop): count}
        Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
        and their support ( = number of reads ).

        read_iterator can be the result of a .fetch(...) call.
        Or it can be a generator filtering such reads. Example
        samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
        """
        cdef:
            uint32_t base_position, junc_start, nt
            int op
            AlignedSegment r
            int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP

        res = collections.Counter()

        # cigar operations that advance the reference coordinate
        match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
        for r in read_iterator:
            base_position = r.pos
            cigar = r.cigartuples
            # unmapped reads carry no cigar; skip them
            if cigar is None:
                continue

            for op, nt in cigar:
                if op in match_or_deletion:
                    base_position += nt
                elif op == BAM_CREF_SKIP:
                    # an N operation skips the reference interval
                    # [junc_start, junc_start + nt), i.e. the intron
                    junc_start = base_position
                    base_position += nt
                    res[(junc_start, base_position)] += 1
        return res
1665
1666
    def close(self):
        '''closes the :class:`pysam.AlignmentFile`.'''

        # already closed (or never opened): nothing to do
        if self.htsfile == NULL:
            return

        # release the index before the file handle
        if self.index != NULL:
            hts_idx_destroy(self.index)
            self.index = NULL

        cdef int ret = hts_close(self.htsfile)
        self.htsfile = NULL

        self.header = None

        if ret < 0:
            global errno
            # a broken pipe on close (e.g. output piped into `head`) is
            # expected; clear errno instead of raising
            if errno == EPIPE:
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))
1688
    def __dealloc__(self):
        # C-level destructor: mirror close(), and additionally free the
        # reusable alignment buffer self.b
        cdef int ret = 0

        if self.index != NULL:
            hts_idx_destroy(self.index)
            self.index = NULL

        if self.htsfile != NULL:
            ret = hts_close(self.htsfile)
            self.htsfile = NULL

        self.header = None

        if self.b:
            bam_destroy1(self.b)
            self.b = NULL

        if ret < 0:
            global errno
            # ignore broken-pipe errors on teardown, as in close()
            if errno == EPIPE:
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))
1712
    cpdef int write(self, AlignedSegment read) except -1:
        '''
        write a single :class:`pysam.AlignedSegment` to disk.

        Raises:
            ValueError
                if the read refers to a reference id that is not
                present in the header
            IOError
                if the writing failed

        Returns:
            int :
                the number of bytes written. If the file is closed,
                this will be 0.
        '''
        if not self.is_open:
            return 0

        # guard against records whose tid points outside this file's header
        if self.header.ptr.n_targets <= read._delegate.core.tid:
            raise ValueError(
                "AlignedSegment refers to reference number {} that "
                "is larger than the number of references ({}) in the header".format(
                    read._delegate.core.tid, self.header.ptr.n_targets))

        cdef int ret
        with nogil:
            ret = sam_write1(self.htsfile,
                             self.header.ptr,
                             read._delegate)

        # kbj: Still need to raise an exception with except -1. Otherwise
        # when ret == -1 we get a "SystemError: error return without
        # exception set".
        if ret < 0:
            raise IOError(
                "sam_write1 failed with error code {}".format(ret))

        return ret
1749
1750 # context manager interface
1751 def __enter__(self):
1752 return self
1753
1754 def __exit__(self, exc_type, exc_value, traceback):
1755 self.close()
1756 return False
1757
1758 ###############################################################
1759 ###############################################################
1760 ###############################################################
1761 ## properties
1762 ###############################################################
    property mapped:
        """int with total number of mapped alignments according to the
        statistics recorded in the index. This is a read-only
        attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            self.check_index()
            cdef int tid
            cdef uint64_t total = 0
            cdef uint64_t mapped, unmapped
            # sum the per-contig mapped counts stored in the index
            for tid from 0 <= tid < self.header.nreferences:
                with nogil:
                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
                total += mapped
            return total
1780
    property unmapped:
        """int with total number of unmapped reads according to the statistics
        recorded in the index. This number of reads includes the number of reads
        without coordinates. This is a read-only attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            self.check_index()
            cdef int tid
            # start from the no-coordinate count, then add the
            # per-contig unmapped counts from the index
            cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
            cdef uint64_t mapped, unmapped
            for tid from 0 <= tid < self.header.nreferences:
                with nogil:
                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
                total += unmapped
            return total
1798
    property nocoordinate:
        """int with total number of reads without coordinates according to the
        statistics recorded in the index, i.e., the statistic printed for "*"
        by the ``samtools idxstats`` command. This is a read-only attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            self.check_index()
            cdef uint64_t n
            with nogil:
                n = hts_idx_get_n_no_coor(self.index)
            return n
1812
    def get_index_statistics(self):
        """return statistics about mapped/unmapped reads per chromosome as
        they are stored in the index, similarly to the statistics printed
        by the ``samtools idxstats`` command.

        CRAI indexes do not record these statistics, so for a CRAM file
        with a .crai index the returned statistics will all be 0.

        Returns:
            list :
                a list of records for each chromosome. Each record has the
                attributes 'contig', 'mapped', 'unmapped' and 'total'.
        """

        self.check_index()
        cdef int tid
        cdef uint64_t mapped, unmapped
        results = []
        # TODO: use header
        for tid from 0 <= tid < self.nreferences:
            with nogil:
                hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
            # one IndexStats namedtuple per reference sequence
            results.append(
                IndexStats._make((
                    self.get_reference_name(tid),
                    mapped,
                    unmapped,
                    mapped + unmapped)))

        return results
1843
1844 ###############################################################
1845 ## file-object like iterator access
1846 ## note: concurrent access will cause errors (see IteratorRow
1847 ## and multiple_iterators)
1848 ## Possible solutions: deprecate or open new file handle
1849 def __iter__(self):
1850 if not self.is_open:
1851 raise ValueError("I/O operation on closed file")
1852
1853 if not self.is_bam and self.header.nreferences == 0:
1854 raise NotImplementedError(
1855 "can not iterate over samfile without header")
1856 return self
1857
    cdef bam1_t * getCurrent(self):
        # C-level accessor for the current alignment record buffer.
        return self.b
1860
    cdef int cnext(self):
        '''
        cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
        '''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        # read the next record into the reusable buffer self.b;
        # sam_read1 returns >= 0 on success, -1 on EOF, < -1 on error
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret
1872
    def __next__(self):
        # advance via the C-level reader and wrap the buffer in a
        # Python-level AlignedSegment on success
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # normal end of file
            raise StopIteration
        else:
            # read error or truncated file
            raise IOError(read_failure_reason(ret))
1881
1882 ###########################################
1883 # methods/properties referencing the header
1884 def is_valid_tid(self, int tid):
1885 """
1886 return True if the numerical :term:`tid` is valid; False otherwise.
1887
1888 Note that the unmapped tid code (-1) counts as an invalid.
1889 """
1890 if self.header is None:
1891 raise ValueError("header not available in closed files")
1892 return self.header.is_valid_tid(tid)
1893
1894 def get_tid(self, reference):
1895 """
1896 return the numerical :term:`tid` corresponding to
1897 :term:`reference`
1898
1899 returns -1 if reference is not known.
1900 """
1901 if self.header is None:
1902 raise ValueError("header not available in closed files")
1903 return self.header.get_tid(reference)
1904
1905 def get_reference_name(self, tid):
1906 """
1907 return :term:`reference` name corresponding to numerical :term:`tid`
1908 """
1909 if self.header is None:
1910 raise ValueError("header not available in closed files")
1911 return self.header.get_reference_name(tid)
1912
1913 def get_reference_length(self, reference):
1914 """
1915 return :term:`reference` length corresponding to numerical :term:`tid`
1916 """
1917 if self.header is None:
1918 raise ValueError("header not available in closed files")
1919 return self.header.get_reference_length(reference)
1920
    property nreferences:
        """int with the number of :term:`reference` sequences in the file.
        This is a read-only attribute."""
        def __get__(self):
            # header is set to None on close(); treat that as a closed file
            if self.header:
                return self.header.nreferences
            else:
                raise ValueError("header not available in closed files")
1929
    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            # header is set to None on close(); treat that as a closed file
            if self.header:
                return self.header.references
            else:
                raise ValueError("header not available in closed files")
1938
    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`

        """
        def __get__(self):
            # header is set to None on close(); treat that as a closed file
            if self.header:
                return self.header.lengths
            else:
                raise ValueError("header not available in closed files")
1950
    # Compatibility functions for pysam < 0.14
    property text:
        """deprecated, use :attr:`references` and :attr:`lengths` instead"""
        def __get__(self):
            # returns the full SAM header rendered as text
            if self.header:
                return self.header.__str__()
            else:
                raise ValueError("header not available in closed files")
1959
1960 # Compatibility functions for pysam < 0.8.3
1961 def gettid(self, reference):
1962 """deprecated, use :meth:`get_tid` instead"""
1963 return self.get_tid(reference)
1964
1965 def getrname(self, tid):
1966 """deprecated, use :meth:`get_reference_name` instead"""
1967 return self.get_reference_name(tid)
1968
1969
cdef class IteratorRow:
    '''abstract base class for iterators over mapped reads.

    Various iterators implement different behaviours for wrapping around
    contig boundaries. Examples include:

    :class:`pysam.IteratorRowRegion`
        iterate within a single contig and a defined region.

    :class:`pysam.IteratorRowAll`
        iterate until EOF. This iterator will also include unmapped reads.

    :class:`pysam.IteratorRowAllRefs`
        iterate over all reads in all reference sequences.

    The method :meth:`AlignmentFile.fetch` returns an IteratorRow.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    '''

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        cdef char *cfilename
        cdef char *creference_filename
        cdef char *cindexname = NULL

        if not samfile.is_open:
            raise ValueError("I/O operation on closed file")

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.samfile = samfile

        # reopen the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:

            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            assert self.htsfile != NULL

            # load the index for the private handle as well, preferring
            # the explicitly supplied index filename if one was given
            if samfile.has_index():
                if samfile.index_filename:
                    cindexname = bindex_filename = encode_filename(samfile.index_filename)
                with nogil:
                    self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
            else:
                self.index = NULL

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr is NULL:
                raise IOError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)

            # this iterator owns its handle/index and must close them
            self.owns_samfile = True

            # options specific to CRAM files
            if samfile.is_cram and samfile.reference_filename:
                creference_filename = samfile.reference_filename
                hts_set_opt(self.htsfile,
                            CRAM_OPT_REFERENCE,
                            creference_filename)

        else:
            # share the parent's handle and index; the parent remains
            # responsible for closing them
            self.htsfile = samfile.htsfile
            self.index = samfile.index
            self.owns_samfile = False
            self.header = samfile.header

        self.retval = 0

        # reusable alignment record buffer
        self.b = bam_init1()

    def __dealloc__(self):
        bam_destroy1(self.b)
        # only tear down resources created by this iterator
        # (multiple_iterators mode)
        if self.owns_samfile:
            hts_idx_destroy(self.index)
            hts_close(self.htsfile)
2056
2057
cdef class IteratorRowRegion(IteratorRow):
    """*(AlignmentFile samfile, int tid, int beg, int stop,
    int multiple_iterators=False)*

    iterate over mapped reads in a region.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile,
                 int tid, int beg, int stop,
                 int multiple_iterators=False):

        if not samfile.has_index():
            raise ValueError("no index available for iteration")

        super().__init__(samfile, multiple_iterators=multiple_iterators)
        # create the htslib region iterator over [beg, stop) on tid
        with nogil:
            self.iter = sam_itr_queryi(
                self.index,
                tid,
                beg,
                stop)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # expose the current alignment record buffer
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # NOTE(review): declared `cdef int` but has no return statement;
        # callers inspect self.retval instead of the return value
        with nogil:
            self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
                                       self.iter,
                                       self.b,
                                       self.htsfile)

    def __next__(self):
        self.cnext()
        if self.retval >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif self.retval == -1:
            # normal end of iteration
            raise StopIteration
        elif self.retval == -2:
            # Note: it is currently not the case that hts_iter_next
            # returns -2 for a truncated file.
            # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
            raise IOError('truncated file')
        else:
            raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval))

    def __dealloc__(self):
        hts_itr_destroy(self.iter)
2117
2118
cdef class IteratorRowHead(IteratorRow):
    """*(AlignmentFile samfile, n, int multiple_iterators=False)*

    iterate over first n reads in `samfile`

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.head`.

    """

    def __init__(self,
                 AlignmentFile samfile,
                 int n,
                 int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # stop after max_rows successfully read records
        self.max_rows = n
        self.current_row = 0

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # expose the current alignment record buffer
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        if self.current_row >= self.max_rows:
            raise StopIteration

        cdef int ret = self.cnext()
        if ret >= 0:
            self.current_row += 1
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # end of file before n reads were seen
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2168
2169
cdef class IteratorRowAll(IteratorRow):
    """*(AlignmentFile samfile, int multiple_iterators=False)*

    iterate over all reads in `samfile`

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # expose the current alignment record buffer
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # end of file
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2210
2211
cdef class IteratorRowAllRefs(IteratorRow):
    """iterates over all mapped reads by chaining iterators over each
    reference

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        if not samfile.has_index():
            raise ValueError("no index available for fetch")

        # -1 signals that no per-reference iterator has been created yet
        self.tid = -1

    def nextiter(self):
        # get a new iterator for a chromosome. The file
        # will not be re-opened.
        self.rowiter = IteratorRowRegion(self.samfile,
                                         self.tid,
                                         0,
                                         MAX_POS)
        # set htsfile and header of the rowiter
        # to the values in this iterator to reflect multiple_iterators
        self.rowiter.htsfile = self.htsfile
        self.rowiter.header = self.header

        # make sure the iterator understand that IteratorRowAllRefs
        # has ownership
        self.rowiter.owns_samfile = False

    def __iter__(self):
        return self

    def __next__(self):
        # Create an initial iterator
        if self.tid == -1:
            if not self.samfile.nreferences:
                raise StopIteration
            self.tid = 0
            self.nextiter()

        while 1:
            self.rowiter.cnext()

            # If current iterator is not exhausted, return aligned read
            # NOTE(review): tests retval > 0, whereas IteratorRowRegion
            # accepts retval >= 0 - confirm against hts_itr_next semantics
            if self.rowiter.retval > 0:
                return makeAlignedSegment(self.rowiter.b, self.header)

            self.tid += 1

            # Otherwise, proceed to next reference or stop
            if self.tid < self.samfile.nreferences:
                self.nextiter()
            else:
                raise StopIteration
2272
2273
cdef class IteratorRowSelection(IteratorRow):
    """*(AlignmentFile samfile)*

    iterate over reads in `samfile` at a given list of file positions.

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
    """

    def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # bgzf virtual file offsets of the reads to visit, in order
        self.positions = positions
        self.current_pos = 0

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # expose the current alignment record buffer
        return self.b

    cdef int cnext(self):
        '''cversion of iterator'''
        # end iteration if out of positions
        if self.current_pos >= len(self.positions): return -1

        # seek to the next stored offset ...
        cdef uint64_t pos = self.positions[self.current_pos]
        with nogil:
            bgzf_seek(hts_get_bgzfp(self.htsfile),
                      pos,
                      0)
        self.current_pos += 1

        # ... and read the single record located there
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2324
2325
cdef int __advance_nofilter(void *data, bam1_t *b):
    '''advance without any read filtering.

    Pileup callback: fetches the next read from the region iterator
    and returns the htslib return code unchanged.
    '''
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    with nogil:
        ret = sam_itr_next(d.htsfile, d.iter, b)
    return ret
2334
2335
cdef int __advance_raw_nofilter(void *data, bam1_t *b):
    '''advance (without iterator) without any read filtering.

    Pileup callback: sequential sam_read1() instead of an indexed
    iterator; returns the htslib return code unchanged.
    '''
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    with nogil:
        ret = sam_read1(d.htsfile, d.header, b)
    return ret
2344
2345
cdef int __advance_all(void *data, bam1_t *b):
    '''pileup callback: advance through the region iterator, skipping
    reads whose flags match the configured ``flag_filter`` (which
    defaults to BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP).

    Returns the htslib return code of the last read attempt
    (negative on EOF or error).
    '''
    # The previous version also computed an unused local `mask` with the
    # default flag combination; the effective filter has always been
    # d.flag_filter, so the dead variable has been removed.
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    while 1:
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b)
        if ret < 0:
            # EOF (-1) or error (< -1): propagate to the pileup engine
            break
        if b.core.flag & d.flag_filter:
            # read fails the flag filter - fetch the next one
            continue
        break
    return ret
2364
2365
cdef int __advance_raw_all(void *data, bam1_t *b):
    '''only use reads for pileup passing basic filters such as

    BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP

    Sequential (non-indexed) variant of __advance_all: reads with
    sam_read1() and skips reads matching d.flag_filter.
    '''

    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    while 1:
        with nogil:
            ret = sam_read1(d.htsfile, d.header, b)
        if ret < 0:
            # EOF or read error: propagate to the pileup engine
            break
        if b.core.flag & d.flag_filter:
            # read fails the flag filter - fetch the next one
            continue
        break
    return ret
2383
2384
cdef int __advance_samtools(void * data, bam1_t * b):
    '''advance using same filter and read processing as in
    the samtools pileup.

    Applies, in order: flag filtering, reference-sequence caching,
    optional BAQ realignment, optional mapping-quality capping,
    minimum mapping quality and orphan-read filtering.
    '''
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    cdef int q

    while 1:
        # use the region iterator when one is set up, otherwise read
        # sequentially
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
        if ret < 0:
            break
        # skip reads with any excluded flag set
        if b.core.flag & d.flag_filter:
            continue
        # skip reads missing a required flag
        if d.flag_require and not (b.core.flag & d.flag_require):
            continue

        # reload sequence
        # cache the reference sequence for the read's contig; refreshed
        # only when the contig (tid) changes
        if d.fastafile != NULL and b.core.tid != d.tid:
            if d.seq != NULL:
                free(d.seq)
            d.tid = b.core.tid
            with nogil:
                d.seq = faidx_fetch_seq(
                    d.fastafile,
                    d.header.target_name[d.tid],
                    0, MAX_POS,
                    &d.seq_len)

            if d.seq == NULL:
                raise ValueError(
                    "reference sequence for '{}' (tid={}) not found".format(
                        d.header.target_name[d.tid], d.tid))

        # realign read - changes base qualities
        if d.seq != NULL and d.compute_baq:
            # 4th option to realign is flag:
            # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4
            if d.redo_baq:
                sam_prob_realn(b, d.seq, d.seq_len, 7)
            else:
                sam_prob_realn(b, d.seq, d.seq_len, 3)

        # optionally cap the mapping quality, as samtools -C does
        if d.seq != NULL and d.adjust_capq_threshold > 10:
            q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold)
            if q < 0:
                # capping rejected the read entirely
                continue
            elif b.core.qual > q:
                b.core.qual = q

        if b.core.qual < d.min_mapping_quality:
            continue
        # drop paired reads that are not in a proper pair
        if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR):
            continue

        break

    return ret
2444
2445
cdef class IteratorColumn:
    '''abstract base class for iterators over columns.

    IteratorColumn objects wrap the pileup functionality of samtools.

    For reasons of efficiency, the iterator points to the current
    pileup buffer. The pileup buffer is updated at every iteration.
    This might cause some unexpected behaviour. For example,
    consider the conversion to a list::

       f = AlignmentFile("file.bam", "rb")
       result = list(f.pileup())

    Here, ``result`` will contain ``n`` objects of type
    :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
    ``result`` will contain the same information.

    The desired behaviour can be achieved by list comprehension::

       result = [x.pileups() for x in f.pileup()]

    ``result`` will be a list of ``n`` lists of objects of type
    :class:`~pysam.PileupRead`.

    If the iterator is associated with a :class:`~pysam.Fastafile`
    using the :meth:`add_reference` method, then the iterator will
    export the current sequence via the methods :meth:`get_sequence`
    and :meth:`seq_len`.

    See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
    '''

    def __cinit__( self, AlignmentFile samfile, **kwargs):
        self.samfile = samfile
        # optional FastaFile; enables reference-aware processing (BAQ etc.)
        self.fastafile = kwargs.get("fastafile", None)
        self.stepper = kwargs.get("stepper", "samtools")
        self.max_depth = kwargs.get("max_depth", 8000)
        self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
        self.min_base_quality = kwargs.get("min_base_quality", 13)
        # iterdata is the C-side state struct shared with the
        # __advance_* callbacks
        self.iterdata.seq = NULL
        self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
        self.iterdata.flag_require = kwargs.get("flag_require", 0)
        self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
        self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
        self.iterdata.compute_baq = kwargs.get("compute_baq", True)
        self.iterdata.redo_baq = kwargs.get("redo_baq", False)
        self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)

        # current column state, refreshed by cnext()
        self.tid = 0
        self.pos = 0
        self.n_plp = 0
        self.plp = NULL
        self.pileup_iter = <bam_mplp_t>NULL

    def __iter__(self):
        return self

    cdef int cnext(self):
        '''perform next iteration.
        '''
        # do not release gil here because of call-backs
        cdef int ret = bam_mplp_auto(self.pileup_iter,
                                     &self.tid,
                                     &self.pos,
                                     &self.n_plp,
                                     &self.plp)
        return ret

    cdef char * get_sequence(self):
        '''return current reference sequence underlying the iterator.
        '''
        return self.iterdata.seq

    property seq_len:
        '''current sequence length.'''
        def __get__(self):
            return self.iterdata.seq_len

    def add_reference(self, FastaFile fastafile):
        '''
        add reference sequences in `fastafile` to iterator.'''
        self.fastafile = fastafile
        # drop any cached sequence so it is re-fetched from the new file
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
        self.iterdata.tid = -1
        self.iterdata.fastafile = self.fastafile.fastafile

    def has_reference(self):
        '''
        return true if iterator is associated with a reference'''
        return self.fastafile

    cdef _setup_iterator(self,
                         int tid,
                         int start,
                         int stop,
                         int multiple_iterators=0):
        '''setup the iterator structure'''

        # region iterator supplying reads to the pileup engine
        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.iter = self.iter.iter
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        # choose the read-advancing callback according to the stepper
        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef _setup_raw_rest_iterator(self):
        '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST'''

        # no region iterator: the __advance_raw_* callbacks read sequentially
        self.iter = None
        self.iterdata.iter = NULL
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        # same stepper dispatch as _setup_iterator, but with the
        # sequential (raw) callbacks where they differ
        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef reset(self, tid, start, stop):
        '''reset iterator position.

        This permits using the iterator multiple times without
        having to incur the full set-up costs.
        '''
        if self.iter is None:
            raise TypeError("Raw iterator set up without region cannot be reset")

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
        self.iterdata.iter = self.iter.iter

        # invalidate sequence if different tid
        if self.tid != tid:
            if self.iterdata.seq != NULL:
                free(self.iterdata.seq)
            self.iterdata.seq = NULL
            self.iterdata.tid = -1

        # self.pileup_iter = bam_mplp_init(1
        #                                  &__advancepileup,
        #                                  &self.iterdata)
        with nogil:
            bam_mplp_reset(self.pileup_iter)

    cdef _free_pileup_iter(self):
        '''free the memory alloc'd by bam_plp_init.

        This is needed before setup_iterator allocates another
        pileup_iter, or else memory will be lost. '''
        if self.pileup_iter != <bam_mplp_t>NULL:
            with nogil:
                bam_mplp_reset(self.pileup_iter)
                bam_mplp_destroy(self.pileup_iter)
                self.pileup_iter = <bam_mplp_t>NULL

    def __dealloc__(self):
        # reset in order to avoid memory leak messages for iterators
        # that have not been fully consumed
        self._free_pileup_iter()
        self.plp = <const bam_pileup1_t*>NULL

        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            self.iterdata.seq = NULL

    # backwards compatibility

    def hasReference(self):
        # deprecated alias for has_reference()
        return self.has_reference()
    cdef char * getSequence(self):
        # deprecated alias for get_sequence()
        return self.get_sequence()
    def addReference(self, FastaFile fastafile):
        # deprecated alias for add_reference()
        return self.add_reference(fastafile)
2694
2695
cdef class IteratorColumnRegion(IteratorColumn):
    '''iterates over a region only.
    '''
    def __cinit__(self,
                  AlignmentFile samfile,
                  int tid = 0,
                  int start = 0,
                  int stop = MAX_POS,
                  int truncate = False,
                  int multiple_iterators = True,
                  **kwargs ):

        # initialize iterator. Multiple iterators not available
        # for CRAM.
        if multiple_iterators and samfile.is_cram:
            warnings.warn("multiple_iterators not implemented for CRAM")
            multiple_iterators = False

        self._setup_iterator(tid, start, stop, multiple_iterators)
        self.start = start
        self.stop = stop
        self.truncate = truncate

    def __next__(self):

        cdef int n

        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration" )

            if n == 0:
                # pileup exhausted
                raise StopIteration

            # with truncate, only emit columns inside [start, stop)
            if self.truncate:
                if self.start > self.pos:
                    continue
                if self.pos >= self.stop:
                    raise StopIteration

            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)
2744
2745
cdef class IteratorColumnAllRefs(IteratorColumn):
    """iterates over all columns by chaining iterators over each reference
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # no iteration over empty files
        # NOTE(review): raising StopIteration from a constructor is
        # unusual; callers appear to rely on it to signal an empty
        # file - confirm
        if not samfile.nreferences:
            raise StopIteration

        # initialize iterator
        self._setup_iterator(self.tid, 0, MAX_POS, 1)

    def __next__(self):

        cdef int n
        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration")

            # proceed to next reference or stop
            if n == 0:
                self.tid += 1
                if self.tid < self.samfile.nreferences:
                    self._setup_iterator(self.tid, 0, MAX_POS, 0)
                else:
                    raise StopIteration
                continue

            # return result, if within same reference
            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)
2786
2787
cdef class IteratorColumnAll(IteratorColumn):
    """iterates over all columns, without using an index
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # sequential read through the whole file; no index required
        self._setup_raw_rest_iterator()

    def __next__(self):

        cdef int n
        n = self.cnext()
        if n < 0:
            raise ValueError("error during iteration")

        if n == 0:
            # pileup exhausted
            raise StopIteration

        return makePileupColumn(&self.plp,
                                self.tid,
                                self.pos,
                                self.n_plp,
                                self.min_base_quality,
                                self.iterdata.seq,
                                self.samfile.header)
2815
2816
cdef class SNPCall:
    '''the results of a SNP call.'''
    # backing storage for the read-only properties below
    cdef int _tid
    cdef int _pos
    cdef char _reference_base
    cdef char _genotype
    cdef int _consensus_quality
    cdef int _snp_quality
    cdef int _rms_mapping_quality
    cdef int _coverage

    property tid:
        '''the chromosome ID as is defined in the header'''
        def __get__(self):
            return self._tid

    property pos:
        '''nucleotide position of SNP.'''
        def __get__(self):
            return self._pos

    property reference_base:
        '''reference base at pos. ``N`` if no reference sequence supplied.'''
        def __get__(self):
            return from_string_and_size(&self._reference_base, 1)

    property genotype:
        '''the genotype called.'''
        def __get__(self):
            return from_string_and_size(&self._genotype, 1)

    property consensus_quality:
        '''the genotype quality (Phred-scaled).'''
        def __get__(self):
            return self._consensus_quality

    property snp_quality:
        '''the snp quality (Phred scaled) - probability of consensus being
        identical to reference sequence.'''
        def __get__(self):
            return self._snp_quality

    property mapping_quality:
        '''the root mean square (rms) of the mapping quality of all reads
        involved in the call.'''
        def __get__(self):
            return self._rms_mapping_quality

    property coverage:
        '''coverage or read depth - the number of reads involved in the call.'''
        def __get__(self):
            return self._coverage

    def __str__(self):
        # tab-separated rendering of all call attributes in a fixed order
        columns = (self.tid,
                   self.pos,
                   self.reference_base,
                   self.genotype,
                   self.consensus_quality,
                   self.snp_quality,
                   self.mapping_quality,
                   self.coverage)
        return "\t".join(str(column) for column in columns)
2874
2875
cdef class IndexedReads:
    """Index a Sam/BAM-file by query name while keeping the
    original sort order intact.

    The index is kept in memory and can be substantial.

    By default, the file is re-opened to avoid conflicts if multiple
    operators work on the same file. Set `multiple_iterators` = False
    to not re-open `samfile`.

    Parameters
    ----------

    samfile : AlignmentFile
        File to be indexed.

    multiple_iterators : bool
        Flag indicating whether the file should be reopened. Reopening prevents
        existing iterators being affected by the indexing.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
        cdef char *cfilename

        # makes sure that samfile stays alive as long as this
        # object is alive.
        self.samfile = samfile
        cdef bam_hdr_t * hdr = NULL
        assert samfile.is_bam, "can only apply IndexReads on bam files"

        # multiple_iterators the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:
            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            if self.htsfile == NULL:
                raise OSError("unable to reopen htsfile")

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr == NULL:
                raise OSError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)
            # this object owns the re-opened handle and must close it
            self.owns_samfile = True
        else:
            # share the existing handles; nothing to close on dealloc
            self.htsfile = self.samfile.htsfile
            self.header = samfile.header
            self.owns_samfile = False

    def build(self):
        '''build the index.'''

        # maps query name -> list of bgzf offsets of its reads
        self.index = collections.defaultdict(list)

        # this method will start indexing from the current file position
        cdef int ret = 1
        cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
        if b == NULL:
            raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))

        cdef uint64_t pos
        cdef bam_hdr_t * hdr = self.header.ptr

        while ret > 0:
            # record the offset *before* each read so it can be
            # seeked back to later by IteratorRowSelection
            with nogil:
                pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
                ret = sam_read1(self.htsfile,
                                hdr,
                                b)

            if ret > 0:
                qname = charptr_to_str(pysam_bam_get_qname(b))
                self.index[qname].append(pos)

        bam_destroy1(b)

    def find(self, query_name):
        '''find `query_name` in index.

        Returns
        -------

        IteratorRowSelection
            Returns an iterator over all reads with query_name.

        Raises
        ------

        KeyError
            if the `query_name` is not in the index.

        '''
        if query_name in self.index:
            return IteratorRowSelection(
                self.samfile,
                self.index[query_name],
                multiple_iterators = False)
        else:
            raise KeyError("read %s not found" % query_name)

    def __dealloc__(self):
        if self.owns_samfile:
            hts_close(self.htsfile)