Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 # cython: embedsignature=True | |
2 # cython: profile=True | |
3 ######################################################## | |
4 ######################################################## | |
5 # Cython wrapper for SAM/BAM/CRAM files based on htslib | |
6 ######################################################## | |
7 # The principal classes defined in this module are: | |
8 # | |
9 # class AlignmentFile read/write access to SAM/BAM/CRAM formatted files | |
10 # | |
11 # class AlignmentHeader manage SAM/BAM/CRAM header data | |
12 # | |
13 # class IndexedReads index a SAM/BAM/CRAM file by query name while keeping | |
14 # the original sort order intact | |
15 # | |
16 # Additionally this module defines numerous additional classes that | |
17 # are part of the internal API. These are: | |
18 # | |
19 # Various iterator classes to iterate over alignments in sequential | |
20 # (IteratorRow) or in a stacked fashion (IteratorColumn): | |
21 # | |
22 # class IteratorRow | |
23 # class IteratorRowRegion | |
24 # class IteratorRowHead | |
25 # class IteratorRowAll | |
26 # class IteratorRowAllRefs | |
27 # class IteratorRowSelection | |
28 # class IteratorColumn | |
29 # class IteratorColumnRegion | |
30 # class IteratorColumnAll | |
31 # class IteratorColumnAllRefs | |
32 # | |
33 ######################################################## | |
34 # | |
35 # The MIT License | |
36 # | |
37 # Copyright (c) 2015 Andreas Heger | |
38 # | |
39 # Permission is hereby granted, free of charge, to any person obtaining a | |
40 # copy of this software and associated documentation files (the "Software"), | |
41 # to deal in the Software without restriction, including without limitation | |
42 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
43 # and/or sell copies of the Software, and to permit persons to whom the | |
44 # Software is furnished to do so, subject to the following conditions: | |
45 # | |
46 # The above copyright notice and this permission notice shall be included in | |
47 # all copies or substantial portions of the Software. | |
48 # | |
49 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
50 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
51 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
52 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
53 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
54 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
55 # DEALINGS IN THE SOFTWARE. | |
56 # | |
57 ######################################################## | |
58 import os | |
59 import collections | |
60 try: | |
61 from collections.abc import Sequence, Mapping # noqa | |
62 except ImportError: | |
63 from collections import Sequence, Mapping # noqa | |
64 import re | |
65 import warnings | |
66 import array | |
67 from libc.errno cimport errno, EPIPE | |
68 from libc.string cimport strcmp, strpbrk, strerror | |
69 from libc.stdint cimport INT32_MAX | |
70 | |
71 from cpython cimport array as c_array | |
72 | |
73 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str | |
74 from pysam.libcutils cimport encode_filename, from_string_and_size | |
75 from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn | |
76 from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \ | |
77 HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL | |
78 | |
79 from io import StringIO | |
80 | |
81 cimport cython | |
82 | |
83 | |
__all__ = [
    "AlignmentFile",
    "AlignmentHeader",
    "IteratorRow",
    "IteratorColumn",
    "IndexedReads"]

# per-contig mapping statistics (contig name, mapped/unmapped/total read counts)
IndexStats = collections.namedtuple("IndexStats",
                                    ("contig",
                                     "mapped",
                                     "unmapped",
                                     "total"))

########################################################
## global variables
# maximum genomic coordinate: 2**31 - 1, the largest signed 32-bit value
# for some reason, using 'int' causes overflow
cdef int MAX_POS = (1 << 31) - 1

# valid types for SAM headers: 'HD' may occur at most once (a single
# mapping); all other record types may repeat (a sequence of mappings)
VALID_HEADER_TYPES = {"HD" : Mapping,
                      "SQ" : Sequence,
                      "RG" : Sequence,
                      "PG" : Sequence,
                      "CO" : Sequence}

# canonical order of records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")

# default type conversions within SAM header records; tags not listed
# here default to str when parsed in AlignmentHeader.to_dict()
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str,
                               "SS" : str,},
                       "SQ" : {"SN" : str, "LN" : int, "AS" : str,
                               "M5" : str, "SP" : str, "UR" : str,
                               "AH" : str, "TP" : str, "DS" : str,
                               "AN" : str,},
                       "RG" : {"ID" : str, "CN" : str, "DS" : str,
                               "DT" : str, "FO" : str, "KS" : str,
                               "LB" : str, "PG" : str, "PI" : str,
                               "PL" : str, "PM" : str, "PU" : str,
                               "SM" : str, "BC" : str,},
                       "PG" : {"ID" : str, "PN" : str, "CL" : str,
                               "PP" : str, "DS" : str, "VN" : str,},}

# output order of fields within records. Ensure that CL is at
# the end as parsing a CL will ignore any subsequent records.
VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"),
                      "SQ" : ("SN", "LN", "AS", "M5",
                              "UR", "SP", "AH", "TP",
                              "DS", "AN"),
                      "RG" : ("ID", "CN", "SM", "LB",
                              "PU", "PI", "DT", "DS",
                              "PL", "FO", "KS", "PG",
                              "PM", "BC"),
                      "PG" : ("PN", "ID", "VN", "PP",
                              "DS", "CL"),}
140 | |
141 | |
def build_header_line(fields, record):
    '''build a header line from `fields` dictionary for `record`'''

    # TODO: add checking for field and sort order
    parts = ["@%s" % record]
    if record == "CO":
        # comment records carry free text rather than tag:value pairs
        parts.append(fields)
    elif record.islower():
        # user-defined record types: all tags, alphabetically
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in sorted(fields))
    else:
        # spec-defined record types: canonical tags first, in the
        # order mandated by VALID_HEADER_ORDER ...
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in VALID_HEADER_ORDER[record]
                     if tag in fields)
        # ... followed by any user (non-uppercase) tags
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in fields
                     if not tag.isupper())

    return "\t".join(parts)
166 | |
167 | |
cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
    """C-level factory: wrap an existing bam_hdr_t in an AlignmentHeader.

    Ownership of `hdr` passes to the returned object (freed in
    AlignmentHeader.__dealloc__). Raises ValueError on a NULL pointer.
    """
    if hdr == NULL:
        raise ValueError('cannot create AlignmentHeader, received NULL pointer')

    # bypass __init__ so no fresh bam_hdr_t is allocated
    # check: is AlignmentHeader.__cinit__ called?
    cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader)
    header.ptr = hdr
    return header
177 | |
def read_failure_reason(code):
    """Map a negative htslib read return code to a human-readable message.

    -2 signals a truncated file; any other code is reported verbatim.
    """
    return 'truncated file' if code == -2 else "error {} while reading file".format(code)
183 | |
184 | |
# the following should be class-method for AlignmentHeader, but cdef @classmethods
# are not implemented in cython.
cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
                                        reference_names,
                                        reference_lengths,
                                        add_sq_text=True,
                                        text=None) except -1:
    """build header from list of reference names and lengths.

    NOTE(review): in this revision the function body consists of the
    docstring only -- it implicitly returns 0 without writing to `dest`.
    Confirm against upstream pysam whether the implementation was lost
    when this file was copied.
    """
194 | |
cdef class AlignmentHeader(object):
    """header information for a :class:`AlignmentFile` object

    Parameters
    ----------
    header_dict : dict
        build header from a multi-level dictionary. The
        first level are the four types ('HD', 'SQ', ...). The second
        level are a list of lines, with each line being a list of
        tag-value pairs. The header is constructed first from all the
        defined fields, followed by user tags in alphabetical
        order. Alternatively, an :class:`~pysam.AlignmentHeader`
        object can be passed directly.

    text : string
        use the string provided as the header

    reference_names : list
        see reference_lengths

    reference_lengths : list
        build header from list of chromosome names and lengths. By
        default, 'SQ' and 'LN' tags will be added to the header
        text. This option can be changed by unsetting the flag
        `add_sq_text`.

    add_sq_text : bool
        do not add 'SQ' and 'LN' tags to header. This option permits
        construction :term:`SAM` formatted files without a header.

    """

    # See makeAlignmentHeader for the C-level constructor
    def __cinit__(self):
        self.ptr = NULL

    # Python constructor
    def __init__(self):
        self.ptr = bam_hdr_init()
        if self.ptr is NULL:
            raise MemoryError("could not create header")

    @classmethod
    def _from_text_and_lengths(cls, text, reference_names, reference_lengths):
        """build a header from its textual representation plus reference data.

        `text` may be None. Populates both the textual part and the
        binary reference-name/length arrays of the underlying bam_hdr_t.
        """
        cdef AlignmentHeader self = AlignmentHeader()
        cdef char *ctext
        cdef int l_text
        cdef int n, x
        if text is not None:
            btext = force_bytes(text)
            ctext = btext
            l_text = len(btext)
            self.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
            if self.ptr.text == NULL:
                # BUGFIX: sizeof(char) was passed as a stray second
                # exception argument instead of being part of the message
                raise MemoryError("could not allocate {} bytes".format(
                    (l_text + 1) * sizeof(char)))
            self.ptr.l_text = l_text
            # copy including the trailing NUL written by calloc
            memcpy(self.ptr.text, ctext, l_text + 1)

        if reference_names and reference_lengths:
            reference_names = [force_bytes(ref) for ref in reference_names]

            self.ptr.n_targets = len(reference_names)

            # allocate one extra, zero-initialized slot beyond n_targets
            # (was: n = sum([len(reference_names) + 1]), a no-op wrapper)
            n = len(reference_names) + 1
            self.ptr.target_name = <char**>calloc(n, sizeof(char*))
            if self.ptr.target_name == NULL:
                raise MemoryError("could not allocate {} bytes".format(
                    n * sizeof(char *)))

            self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
            if self.ptr.target_len == NULL:
                raise MemoryError("could not allocate {} bytes".format(
                    n * sizeof(uint32_t)))

            for x from 0 <= x < self.ptr.n_targets:
                self.ptr.target_len[x] = reference_lengths[x]
                name = reference_names[x]
                self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
                if self.ptr.target_name[x] == NULL:
                    raise MemoryError("could not allocate {} bytes".format(
                        (len(name) + 1) * sizeof(char)))
                strncpy(self.ptr.target_name[x], name, len(name))

        return self

    @classmethod
    def from_text(cls, text):
        """create a new AlignmentHeader from textual SAM header `text`.

        Reference names and lengths are extracted from any @SQ lines
        present in `text`.
        """
        reference_names, reference_lengths = [], []
        for line in text.splitlines():
            if line.startswith("@SQ"):
                fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(int(fields["LN"]))
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))
                except ValueError:
                    raise ValueError("wrong sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_dict(cls, header_dict):
        """create a new AlignmentHeader from a two-level dictionary as
        produced by :meth:`to_dict`.
        """
        cdef list lines = []
        # first: defined tags, in canonical record order
        for record in VALID_HEADERS:
            if record in header_dict:
                data = header_dict[record]
                if not isinstance(data, VALID_HEADER_TYPES[record]):
                    # BUGFIX: message used %s placeholders with
                    # str.format(), which never interpolated the values
                    raise ValueError(
                        "invalid type for record %s: %s, expected %s" %
                        (record, type(data), VALID_HEADER_TYPES[record]))
                if isinstance(data, Mapping):
                    lines.append(build_header_line(data, record))
                else:
                    for fields in header_dict[record]:
                        lines.append(build_header_line(fields, record))

        # then: user tags (lower case), sorted alphabetically
        for record, data in sorted(header_dict.items()):
            if record in VALID_HEADERS:
                continue
            if isinstance(data, Mapping):
                lines.append(build_header_line(data, record))
            else:
                for fields in header_dict[record]:
                    lines.append(build_header_line(fields, record))

        text = "\n".join(lines) + "\n"

        reference_names, reference_lengths = [], []
        if "SQ" in header_dict:
            for fields in header_dict["SQ"]:
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(fields["LN"])
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):
        """create a new AlignmentHeader from lists of reference names and
        lengths. If `text` is not given and `add_sq_text` is set, a
        minimal @SQ-only header text is synthesized.
        """
        if len(reference_names) != len(reference_lengths):
            raise ValueError("number of reference names and lengths do not match")

        # optionally, if there is no text, add a SAM compatible header to output file.
        if text is None and add_sq_text:
            text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
                reference_names, reference_lengths)])

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    def __dealloc__(self):
        bam_hdr_destroy(self.ptr)
        self.ptr = NULL

    def __bool__(self):
        return self.ptr != NULL

    def copy(self):
        """return a deep copy of this header."""
        return makeAlignmentHeader(bam_hdr_dup(self.ptr))

    property nreferences:
        """int with the number of :term:`reference` sequences in the file.

        This is a read-only attribute."""
        def __get__(self):
            return self.ptr.n_targets

    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(charptr_to_str(self.ptr.target_name[x]))
            return tuple(t)

    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`
        """
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(self.ptr.target_len[x])
            return tuple(t)

    def _build_sequence_section(self):
        """return sequence section of header.

        The sequence section is built from the list of reference names and
        lengths stored in the BAM-file and not from any @SQ entries that
        are part of the header's text section.
        """

        cdef int x
        text = []
        for x in range(self.ptr.n_targets):
            text.append("@SQ\tSN:{}\tLN:{}\n".format(
                force_str(self.ptr.target_name[x]),
                self.ptr.target_len[x]))
        return "".join(text)

    def to_dict(self):
        """return two-level dictionary with header information from the file.

        The first level contains the record (``HD``, ``SQ``, etc) and
        the second level contains the fields (``VN``, ``LN``, etc).

        The parser is validating and will raise an AssertionError if
        if encounters any record or field tags that are not part of
        the SAM specification. Use the
        :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
        header.

        The parsing follows the SAM format specification with the
        exception of the ``CL`` field. This option will consume the
        rest of a header line irrespective of any additional fields.
        This behaviour has been added to accommodate command line
        options that contain characters that are not valid field
        separators.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.
        """
        result = collections.OrderedDict()

        # convert to python string
        t = self.__str__()
        for line in t.split("\n"):
            line = line.strip(' \0')
            if not line:
                continue
            assert line.startswith("@"), \
                "header line without '@': '%s'" % line
            fields = line[1:].split("\t")
            record = fields[0]
            assert record in VALID_HEADER_TYPES, \
                "header line with invalid type '%s': '%s'" % (record, line)

            # treat comments: stored verbatim, not parsed as tag:value
            if record == "CO":
                if record not in result:
                    result[record] = []
                result[record].append("\t".join( fields[1:]))
                continue
            # the following is clumsy as generators do not work?
            x = {}

            for idx, field in enumerate(fields[1:]):
                if ":" not in field:
                    raise ValueError("malformatted header: no ':' in field" )
                key, value = field.split(":", 1)
                if key in ("CL",):
                    # special treatment for command line
                    # statements (CL). These might contain
                    # characters that are non-conformant with
                    # the valid field separators in the SAM
                    # header. Thus, in contravention to the
                    # SAM API, consume the rest of the line.
                    key, value = "\t".join(fields[idx+1:]).split(":", 1)
                    x[key] = KNOWN_HEADER_FIELDS[record][key](value)
                    break

                # interpret type of known header record tags, default to str
                x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)

            if VALID_HEADER_TYPES[record] == Mapping:
                if record in result:
                    raise ValueError(
                        "multiple '%s' lines are not permitted" % record)

                result[record] = x
            elif VALID_HEADER_TYPES[record] == Sequence:
                if record not in result: result[record] = []
                result[record].append(x)

        # if there are no SQ lines in the header, add the
        # reference names from the information in the bam
        # file.
        #
        # Background: c-samtools keeps the textual part of the
        # header separate from the list of reference names and
        # lengths. Thus, if a header contains only SQ lines,
        # the SQ information is not part of the textual header
        # and thus are missing from the output. See issue 84.
        if "SQ" not in result:
            sq = []
            for ref, length in zip(self.references, self.lengths):
                sq.append({'LN': length, 'SN': ref })
            result["SQ"] = sq

        return result

    def as_dict(self):
        """deprecated, use :meth:`to_dict()` instead"""
        return self.to_dict()

    def get_reference_name(self, tid):
        """return the reference name for numerical `tid`; None for tid == -1."""
        if tid == -1:
            return None
        if not 0 <= tid < self.ptr.n_targets:
            raise ValueError("reference_id %i out of range 0<=tid<%i" %
                             (tid, self.ptr.n_targets))
        return charptr_to_str(self.ptr.target_name[tid])

    def get_reference_length(self, reference):
        """return the length of `reference`; raises KeyError if unknown."""
        cdef int tid = self.get_tid(reference)
        if tid < 0:
            raise KeyError("unknown reference {}".format(reference))
        else:
            return self.ptr.target_len[tid]

    def is_valid_tid(self, int tid):
        """
        return True if the numerical :term:`tid` is valid; False otherwise.

        Note that the unmapped tid code (-1) counts as an invalid.
        """
        return 0 <= tid < self.ptr.n_targets

    def get_tid(self, reference):
        """
        return the numerical :term:`tid` corresponding to
        :term:`reference`

        returns -1 if reference is not known.
        """
        reference = force_bytes(reference)
        tid = bam_name2id(self.ptr, reference)
        if tid < -1:
            raise ValueError('could not parse header')
        return tid

    def __str__(self):
        '''string with the full contents of the :term:`sam file` header as a
        string.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.

        See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
        representation of the header.
        '''
        text = from_string_and_size(self.ptr.text, self.ptr.l_text)
        if "@SQ" not in text:
            text += "\n" + self._build_sequence_section()
        return text

    # dictionary access methods, for backwards compatibility.
    def __setitem__(self, key, value):
        # BUGFIX: closed the unbalanced parenthesis in the message
        raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict())")

    def __getitem__(self, key):
        return self.to_dict().__getitem__(key)

    def items(self):
        return self.to_dict().items()

    # PY2 compatibility
    def iteritems(self):
        return self.to_dict().items()

    def keys(self):
        return self.to_dict().keys()

    def values(self):
        return self.to_dict().values()

    def get(self, *args):
        return self.to_dict().get(*args)

    def __len__(self):
        return self.to_dict().__len__()

    def __contains__(self, key):
        return self.to_dict().__contains__(key)
581 | |
582 cdef class AlignmentFile(HTSFile): | |
583 """AlignmentFile(filepath_or_object, mode=None, template=None, | |
584 reference_names=None, reference_lengths=None, text=NULL, | |
585 header=None, add_sq_text=False, check_header=True, check_sq=True, | |
586 reference_filename=None, filename=None, index_filename=None, | |
587 filepath_index=None, require_index=False, duplicate_filehandle=True, | |
588 ignore_truncation=False, threads=1) | |
589 | |
590 A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file. | |
591 | |
592 If `filepath_or_object` is a string, the file is automatically | |
593 opened. If `filepath_or_object` is a python File object, the | |
594 already opened file will be used. | |
595 | |
596 If the file is opened for reading and an index exists (if file is BAM, a | |
597 .bai file or if CRAM a .crai file), it will be opened automatically. | |
598 `index_filename` may be specified explicitly. If the index is not named | |
599 in the standard manner, not located in the same directory as the | |
600 BAM/CRAM file, or is remote. Without an index, random access via | |
601 :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup` | |
602 is disabled. | |
603 | |
604 For writing, the header of a :term:`SAM` file/:term:`BAM` file can | |
605 be constituted from several sources (see also the samtools format | |
606 specification): | |
607 | |
608 1. If `template` is given, the header is copied from another | |
609 `AlignmentFile` (`template` must be a | |
610 :class:`~pysam.AlignmentFile`). | |
611 | |
612 2. If `header` is given, the header is built from a | |
613 multi-level dictionary. | |
614 | |
615 3. If `text` is given, new header text is copied from raw | |
616 text. | |
617 | |
618 4. The names (`reference_names`) and lengths | |
619 (`reference_lengths`) are supplied directly as lists. | |
620 | |
621 When reading or writing a CRAM file, the filename of a FASTA-formatted | |
622 reference can be specified with `reference_filename`. | |
623 | |
624 By default, if a file is opened in mode 'r', it is checked | |
625 for a valid header (`check_header` = True) and a definition of | |
626 chromosome names (`check_sq` = True). | |
627 | |
628 Parameters | |
629 ---------- | |
630 mode : string | |
631 `mode` should be ``r`` for reading or ``w`` for writing. The | |
632 default is text mode (:term:`SAM`). For binary (:term:`BAM`) | |
633 I/O you should append ``b`` for compressed or ``u`` for | |
634 uncompressed :term:`BAM` output. Use ``h`` to output header | |
635 information in text (:term:`TAM`) mode. Use ``c`` for | |
636 :term:`CRAM` formatted files. | |
637 | |
638 If ``b`` is present, it must immediately follow ``r`` or | |
639 ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, | |
640 ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a | |
641 :term:`BAM` formatted file for reading, type:: | |
642 | |
643 f = pysam.AlignmentFile('ex1.bam','rb') | |
644 | |
645 If mode is not specified, the method will try to auto-detect | |
646 in the order 'rb', 'r', thus both the following should work:: | |
647 | |
648 f1 = pysam.AlignmentFile('ex1.bam') | |
649 f2 = pysam.AlignmentFile('ex1.sam') | |
650 | |
651 template : AlignmentFile | |
652 when writing, copy header from file `template`. | |
653 | |
654 header : dict or AlignmentHeader | |
655 when writing, build header from a multi-level dictionary. The | |
656 first level are the four types ('HD', 'SQ', ...). The second | |
657 level are a list of lines, with each line being a list of | |
658 tag-value pairs. The header is constructed first from all the | |
659 defined fields, followed by user tags in alphabetical | |
660 order. Alternatively, an :class:`~pysam.AlignmentHeader` | |
661 object can be passed directly. | |
662 | |
663 text : string | |
664 when writing, use the string provided as the header | |
665 | |
666 reference_names : list | |
667 see reference_lengths | |
668 | |
669 reference_lengths : list | |
670 when writing or opening a SAM file without header build header | |
671 from list of chromosome names and lengths. By default, 'SQ' | |
672 and 'LN' tags will be added to the header text. This option | |
673 can be changed by unsetting the flag `add_sq_text`. | |
674 | |
675 add_sq_text : bool | |
676 do not add 'SQ' and 'LN' tags to header. This option permits | |
677 construction :term:`SAM` formatted files without a header. | |
678 | |
679 add_sam_header : bool | |
680 when outputting SAM the default is to output a header. This is | |
681 equivalent to opening the file in 'wh' mode. If this option is | |
682 set to False, no header will be output. To read such a file, | |
683 set `check_header=False`. | |
684 | |
685 check_header : bool | |
686 obsolete: when reading a SAM file, check if header is present | |
687 (default=True) | |
688 | |
689 check_sq : bool | |
690 when reading, check if SQ entries are present in header | |
691 (default=True) | |
692 | |
693 reference_filename : string | |
694 Path to a FASTA-formatted reference file. Valid only for CRAM files. | |
695 When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL | |
696 specified in the header (``UR`` tag), which are normally used to find | |
697 the reference. | |
698 | |
699 index_filename : string | |
700 Explicit path to the index file. Only needed if the index is not | |
701 named in the standard manner, not located in the same directory as | |
702 the BAM/CRAM file, or is remote. An IOError is raised if the index | |
703 cannot be found or is invalid. | |
704 | |
705 filepath_index : string | |
706 Alias for `index_filename`. | |
707 | |
708 require_index : bool | |
709 When reading, require that an index file is present and is valid or | |
710 raise an IOError. (default=False) | |
711 | |
712 filename : string | |
713 Alternative to filepath_or_object. Filename of the file | |
714 to be opened. | |
715 | |
716 duplicate_filehandle: bool | |
717 By default, file handles passed either directly or through | |
718 File-like objects will be duplicated before passing them to | |
719 htslib. The duplication prevents issues where the same stream | |
720 will be closed by htslib and through destruction of the | |
721 high-level python object. Set to False to turn off | |
722 duplication. | |
723 | |
724 ignore_truncation: bool | |
725 Issue a warning, instead of raising an error if the current file | |
726 appears to be truncated due to a missing EOF marker. Only applies | |
727 to bgzipped formats. (Default=False) | |
728 | |
729 format_options: list | |
730 A list of key=value strings, as accepted by --input-fmt-option and | |
731 --output-fmt-option in samtools. | |
732 threads: integer | |
733 Number of threads to use for compressing/decompressing BAM/CRAM files. | |
734 Setting threads to > 1 cannot be combined with `ignore_truncation`. | |
735 (Default=1) | |
736 """ | |
737 | |
738 def __cinit__(self, *args, **kwargs): | |
739 self.htsfile = NULL | |
740 self.filename = None | |
741 self.mode = None | |
742 self.threads = 1 | |
743 self.is_stream = False | |
744 self.is_remote = False | |
745 self.index = NULL | |
746 | |
747 if "filename" in kwargs: | |
748 args = [kwargs["filename"]] | |
749 del kwargs["filename"] | |
750 | |
751 self._open(*args, **kwargs) | |
752 | |
753 # allocate memory for iterator | |
754 self.b = <bam1_t*>calloc(1, sizeof(bam1_t)) | |
755 if self.b == NULL: | |
756 raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t))) | |
757 | |
758 def has_index(self): | |
759 """return true if htsfile has an existing (and opened) index. | |
760 """ | |
761 return self.index != NULL | |
762 | |
763 def check_index(self): | |
764 """return True if index is present. | |
765 | |
766 Raises | |
767 ------ | |
768 | |
769 AttributeError | |
770 if htsfile is :term:`SAM` formatted and thus has no index. | |
771 | |
772 ValueError | |
773 if htsfile is closed or index could not be opened. | |
774 """ | |
775 | |
776 if not self.is_open: | |
777 raise ValueError("I/O operation on closed file") | |
778 if not self.is_bam and not self.is_cram: | |
779 raise AttributeError( | |
780 "AlignmentFile.mapped only available in bam files") | |
781 if self.index == NULL: | |
782 raise ValueError( | |
783 "mapping information not recorded in index " | |
784 "or index not available") | |
785 return True | |
786 | |
787 def _open(self, | |
788 filepath_or_object, | |
789 mode=None, | |
790 AlignmentFile template=None, | |
791 reference_names=None, | |
792 reference_lengths=None, | |
793 reference_filename=None, | |
794 text=None, | |
795 header=None, | |
796 port=None, | |
797 add_sq_text=True, | |
798 add_sam_header=True, | |
799 check_header=True, | |
800 check_sq=True, | |
801 index_filename=None, | |
802 filepath_index=None, | |
803 require_index=False, | |
804 referencenames=None, | |
805 referencelengths=None, | |
806 duplicate_filehandle=True, | |
807 ignore_truncation=False, | |
808 format_options=None, | |
809 threads=1): | |
810 '''open a sam, bam or cram formatted file. | |
811 | |
812 If _open is called on an existing file, the current file | |
813 will be closed and a new file will be opened. | |
814 | |
815 ''' | |
816 cdef char *cfilename = NULL | |
817 cdef char *creference_filename = NULL | |
818 cdef char *cindexname = NULL | |
819 cdef char *cmode = NULL | |
820 cdef bam_hdr_t * hdr = NULL | |
821 | |
822 if threads > 1 and ignore_truncation: | |
823 # This won't raise errors if reaching a truncated alignment, | |
824 # because bgzf_mt_reader in htslib does not deal with | |
825 # bgzf_mt_read_block returning non-zero values, contrary | |
826 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888) | |
827 # Better to avoid this (for now) than to produce seemingly correct results. | |
828 raise ValueError('Cannot add extra threads when "ignore_truncation" is True') | |
829 self.threads = threads | |
830 | |
831 # for backwards compatibility: | |
832 if referencenames is not None: | |
833 reference_names = referencenames | |
834 if referencelengths is not None: | |
835 reference_lengths = referencelengths | |
836 | |
837 # close a previously opened file | |
838 if self.is_open: | |
839 self.close() | |
840 | |
841 # autodetection for read | |
842 if mode is None: | |
843 mode = "r" | |
844 | |
845 if add_sam_header and mode == "w": | |
846 mode = "wh" | |
847 | |
848 assert mode in ("r", "w", "rb", "wb", "wh", | |
849 "wbu", "rU", "wb0", | |
850 "rc", "wc"), \ | |
851 "invalid file opening mode `%s`" % mode | |
852 | |
853 self.duplicate_filehandle = duplicate_filehandle | |
854 | |
855 # StringIO not supported | |
856 if isinstance(filepath_or_object, StringIO): | |
857 raise NotImplementedError( | |
858 "access from StringIO objects not supported") | |
859 # reading from a file descriptor | |
860 elif isinstance(filepath_or_object, int): | |
861 self.filename = filepath_or_object | |
862 filename = None | |
863 self.is_remote = False | |
864 self.is_stream = True | |
865 # reading from a File object or other object with fileno | |
866 elif hasattr(filepath_or_object, "fileno"): | |
867 if filepath_or_object.closed: | |
868 raise ValueError('I/O operation on closed file') | |
869 self.filename = filepath_or_object | |
870 # .name can be TextIOWrapper | |
871 try: | |
872 filename = encode_filename(str(filepath_or_object.name)) | |
873 cfilename = filename | |
874 except AttributeError: | |
875 filename = None | |
876 self.is_remote = False | |
877 self.is_stream = True | |
878 # what remains is a filename | |
879 else: | |
880 self.filename = filename = encode_filename(filepath_or_object) | |
881 cfilename = filename | |
882 self.is_remote = hisremote(cfilename) | |
883 self.is_stream = self.filename == b'-' | |
884 | |
885 # for htslib, wbu seems to not work | |
886 if mode == "wbu": | |
887 mode = "wb0" | |
888 | |
889 self.mode = force_bytes(mode) | |
890 self.reference_filename = reference_filename = encode_filename( | |
891 reference_filename) | |
892 | |
893 if mode[0] == 'w': | |
894 # open file for writing | |
895 | |
896 if not (template or header or text or (reference_names and reference_lengths)): | |
897 raise ValueError( | |
898 "either supply options `template`, `header`, `text` or both `reference_names` " | |
899 "and `reference_lengths` for writing") | |
900 | |
901 if template: | |
902 # header is copied, though at the moment not strictly | |
903 # necessary as AlignmentHeader is immutable. | |
904 self.header = template.header.copy() | |
905 elif isinstance(header, AlignmentHeader): | |
906 self.header = header.copy() | |
907 elif isinstance(header, Mapping): | |
908 self.header = AlignmentHeader.from_dict(header) | |
909 elif reference_names and reference_lengths: | |
910 self.header = AlignmentHeader.from_references( | |
911 reference_names, | |
912 reference_lengths, | |
913 add_sq_text=add_sq_text, | |
914 text=text) | |
915 elif text: | |
916 self.header = AlignmentHeader.from_text(text) | |
917 else: | |
918 raise ValueError("not enough information to construct header. Please provide template, " | |
919 "header, text or reference_names/reference_lengths") | |
920 self.htsfile = self._open_htsfile() | |
921 | |
922 if self.htsfile == NULL: | |
923 if errno: | |
924 raise IOError(errno, "could not open alignment file `{}`: {}".format( | |
925 force_str(filename), | |
926 force_str(strerror(errno)))) | |
927 else: | |
928 raise ValueError("could not open alignment file `{}`".format(force_str(filename))) | |
929 if format_options and len(format_options): | |
930 self.add_hts_options(format_options) | |
931 # set filename with reference sequences. If no filename | |
932 # is given, the CRAM reference arrays will be built from | |
933 # the @SQ header in the header | |
934 if "c" in mode and reference_filename: | |
935 if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0): | |
936 raise ValueError("failure when setting reference filename") | |
937 | |
938 # write header to htsfile | |
939 if "b" in mode or "c" in mode or "h" in mode: | |
940 hdr = self.header.ptr | |
941 with nogil: | |
942 sam_hdr_write(self.htsfile, hdr) | |
943 | |
944 elif mode[0] == "r": | |
945 # open file for reading | |
946 self.htsfile = self._open_htsfile() | |
947 | |
948 if self.htsfile == NULL: | |
949 if errno: | |
950 raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename), | |
951 force_str(strerror(errno)))) | |
952 else: | |
953 raise ValueError("could not open alignment file `{}`".format(force_str(filename))) | |
954 | |
955 if self.htsfile.format.category != sequence_data: | |
956 raise ValueError("file does not contain alignment data") | |
957 | |
958 if format_options and len(format_options): | |
959 self.add_hts_options(format_options) | |
960 | |
961 self.check_truncation(ignore_truncation) | |
962 | |
963 # bam/cram files require a valid header | |
964 if self.is_bam or self.is_cram: | |
965 with nogil: | |
966 hdr = sam_hdr_read(self.htsfile) | |
967 if hdr == NULL: | |
968 raise ValueError( | |
969 "file does not have a valid header (mode='%s') " | |
970 "- is it BAM/CRAM format?" % mode) | |
971 self.header = makeAlignmentHeader(hdr) | |
972 else: | |
973 # in sam files a header is optional. If not given, | |
974 # user may provide reference names and lengths to built | |
975 # an on-the-fly header. | |
976 if reference_names and reference_lengths: | |
977 # build header from a target names and lengths | |
978 self.header = AlignmentHeader.from_references( | |
979 reference_names=reference_names, | |
980 reference_lengths=reference_lengths, | |
981 add_sq_text=add_sq_text, | |
982 text=text) | |
983 else: | |
984 with nogil: | |
985 hdr = sam_hdr_read(self.htsfile) | |
986 if hdr == NULL: | |
987 raise ValueError( | |
988 "SAM? file does not have a valid header (mode='%s'), " | |
989 "please provide reference_names and reference_lengths") | |
990 self.header = makeAlignmentHeader(hdr) | |
991 | |
992 # set filename with reference sequences | |
993 if self.is_cram and reference_filename: | |
994 creference_filename = self.reference_filename | |
995 hts_set_opt(self.htsfile, | |
996 CRAM_OPT_REFERENCE, | |
997 creference_filename) | |
998 | |
999 if check_sq and self.header.nreferences == 0: | |
1000 raise ValueError( | |
1001 ("file has no sequences defined (mode='%s') - " | |
1002 "is it SAM/BAM format? Consider opening with " | |
1003 "check_sq=False") % mode) | |
1004 | |
1005 if self.is_bam or self.is_cram: | |
1006 self.index_filename = index_filename or filepath_index | |
1007 if self.index_filename: | |
1008 cindexname = bfile_name = encode_filename(self.index_filename) | |
1009 | |
1010 if cfilename or cindexname: | |
1011 with nogil: | |
1012 self.index = sam_index_load3(self.htsfile, cfilename, cindexname, | |
1013 HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL) | |
1014 | |
1015 if not self.index and (cindexname or require_index): | |
1016 if errno: | |
1017 raise IOError(errno, force_str(strerror(errno))) | |
1018 else: | |
1019 raise IOError('unable to open index file `%s`' % self.index_filename) | |
1020 | |
1021 elif require_index: | |
1022 raise IOError('unable to open index file') | |
1023 | |
1024 # save start of data section | |
1025 if not self.is_stream: | |
1026 self.start_offset = self.tell() | |
1027 | |
1028 def fetch(self, | |
1029 contig=None, | |
1030 start=None, | |
1031 stop=None, | |
1032 region=None, | |
1033 tid=None, | |
1034 until_eof=False, | |
1035 multiple_iterators=False, | |
1036 reference=None, | |
1037 end=None): | |
1038 """fetch reads aligned in a :term:`region`. | |
1039 | |
1040 See :meth:`~pysam.HTSFile.parse_region` for more information | |
1041 on how genomic regions can be specified. :term:`reference` and | |
1042 `end` are also accepted for backward compatibility as synonyms | |
1043 for :term:`contig` and `stop`, respectively. | |
1044 | |
1045 Without a `contig` or `region` all mapped reads in the file | |
1046 will be fetched. The reads will be returned ordered by reference | |
1047 sequence, which will not necessarily be the order within the | |
1048 file. This mode of iteration still requires an index. If there is | |
1049 no index, use `until_eof=True`. | |
1050 | |
1051 If only `contig` is set, all reads aligned to `contig` | |
1052 will be fetched. | |
1053 | |
1054 A :term:`SAM` file does not allow random access. If `region` | |
1055 or `contig` are given, an exception is raised. | |
1056 | |
1057 Parameters | |
1058 ---------- | |
1059 | |
1060 until_eof : bool | |
1061 | |
1062 If `until_eof` is True, all reads from the current file | |
1063 position will be returned in order as they are within the | |
1064 file. Using this option will also fetch unmapped reads. | |
1065 | |
1066 multiple_iterators : bool | |
1067 | |
1068 If `multiple_iterators` is True, multiple | |
1069 iterators on the same file can be used at the same time. The | |
1070 iterator returned will receive its own copy of a filehandle to | |
1071 the file effectively re-opening the file. Re-opening a file | |
1072 creates some overhead, so beware. | |
1073 | |
1074 Returns | |
1075 ------- | |
1076 | |
1077 An iterator over a collection of reads. : IteratorRow | |
1078 | |
1079 Raises | |
1080 ------ | |
1081 | |
1082 ValueError | |
1083 if the genomic coordinates are out of range or invalid or the | |
1084 file does not permit random access to genomic coordinates. | |
1085 | |
1086 """ | |
1087 cdef int rtid, rstart, rstop, has_coord | |
1088 | |
1089 if not self.is_open: | |
1090 raise ValueError( "I/O operation on closed file" ) | |
1091 | |
1092 has_coord, rtid, rstart, rstop = self.parse_region( | |
1093 contig, start, stop, region, tid, | |
1094 end=end, reference=reference) | |
1095 | |
1096 # Turn of re-opening if htsfile is a stream | |
1097 if self.is_stream: | |
1098 multiple_iterators = False | |
1099 | |
1100 if self.is_bam or self.is_cram: | |
1101 if not until_eof and not self.is_remote: | |
1102 if not self.has_index(): | |
1103 raise ValueError( | |
1104 "fetch called on bamfile without index") | |
1105 | |
1106 if has_coord: | |
1107 return IteratorRowRegion( | |
1108 self, rtid, rstart, rstop, | |
1109 multiple_iterators=multiple_iterators) | |
1110 else: | |
1111 if until_eof: | |
1112 return IteratorRowAll( | |
1113 self, | |
1114 multiple_iterators=multiple_iterators) | |
1115 else: | |
1116 # AH: check - reason why no multiple_iterators for | |
1117 # AllRefs? | |
1118 return IteratorRowAllRefs( | |
1119 self, | |
1120 multiple_iterators=multiple_iterators) | |
1121 else: | |
1122 if has_coord: | |
1123 raise ValueError( | |
1124 "fetching by region is not available for SAM files") | |
1125 | |
1126 if multiple_iterators == True: | |
1127 raise ValueError( | |
1128 "multiple iterators not implemented for SAM files") | |
1129 | |
1130 return IteratorRowAll(self, | |
1131 multiple_iterators=multiple_iterators) | |
1132 | |
1133 def head(self, n, multiple_iterators=True): | |
1134 '''return an iterator over the first n alignments. | |
1135 | |
1136 This iterator is is useful for inspecting the bam-file. | |
1137 | |
1138 Parameters | |
1139 ---------- | |
1140 | |
1141 multiple_iterators : bool | |
1142 | |
1143 is set to True by default in order to | |
1144 avoid changing the current file position. | |
1145 | |
1146 Returns | |
1147 ------- | |
1148 | |
1149 an iterator over a collection of reads : IteratorRowHead | |
1150 | |
1151 ''' | |
1152 return IteratorRowHead(self, n, | |
1153 multiple_iterators=multiple_iterators) | |
1154 | |
    def mate(self, AlignedSegment read):
        '''return the mate of :class:`pysam.AlignedSegment` `read`.

        .. note::

            Calling this method will change the file position.
            This might interfere with any iterators that have
            not re-opened the file.

        .. note::

            This method is too slow for high-throughput processing.
            If a read needs to be processed with its mate, work
            from a read name sorted file or, better, cache reads.

        Returns
        -------

        the mate : AlignedSegment

        Raises
        ------

        ValueError
            if the read is unpaired or the mate is unmapped

        '''
        cdef uint32_t flag = read._delegate.core.flag

        # a mate lookup only makes sense for a paired read whose mate
        # is actually mapped
        if flag & BAM_FPAIRED == 0:
            raise ValueError("read %s: is unpaired" %
                             (read.query_name))
        if flag & BAM_FMUNMAP != 0:
            raise ValueError("mate %s: is unmapped" %
                             (read.query_name))

        # xor flags to get the other mate: afterwards `flag` holds
        # exactly the READ1/READ2 bit that the mate must carry
        cdef int x = BAM_FREAD1 + BAM_FREAD2
        flag = (flag ^ x) & x

        # Make sure to use a separate file to jump around
        # to mate as otherwise the original file position
        # will be lost
        # The following code is not using the C API and
        # could thus be made much quicker, for example
        # by using tell and seek.
        # NOTE(review): mpos and mpos + 1 are passed positionally into
        # fetch's first two parameters while the reference is selected
        # via `tid`; this relies on parse_region's handling of those
        # arguments when `tid` is given - confirm against parse_region.
        for mate in self.fetch(
                read._delegate.core.mpos,
                read._delegate.core.mpos + 1,
                tid=read._delegate.core.mtid,
                multiple_iterators=True):
            # candidate must carry the mate's READ1/READ2 bit and share
            # the query name
            if mate.flag & flag != 0 and \
               mate.query_name == read.query_name:
                break
        else:
            raise ValueError("mate not found")

        return mate
1213 | |
1214 def pileup(self, | |
1215 contig=None, | |
1216 start=None, | |
1217 stop=None, | |
1218 region=None, | |
1219 reference=None, | |
1220 end=None, | |
1221 **kwargs): | |
1222 """perform a :term:`pileup` within a :term:`region`. The region is | |
1223 specified by :term:`contig`, `start` and `stop` (using | |
1224 0-based indexing). :term:`reference` and `end` are also accepted for | |
1225 backward compatibility as synonyms for :term:`contig` and `stop`, | |
1226 respectively. Alternatively, a samtools 'region' string | |
1227 can be supplied. | |
1228 | |
1229 Without 'contig' or 'region' all reads will be used for the | |
1230 pileup. The reads will be returned ordered by | |
1231 :term:`contig` sequence, which will not necessarily be the | |
1232 order within the file. | |
1233 | |
1234 Note that :term:`SAM` formatted files do not allow random | |
1235 access. In these files, if a 'region' or 'contig' are | |
1236 given an exception is raised. | |
1237 | |
1238 .. note:: | |
1239 | |
1240 'all' reads which overlap the region are returned. The | |
1241 first base returned will be the first base of the first | |
1242 read 'not' necessarily the first base of the region used | |
1243 in the query. | |
1244 | |
1245 Parameters | |
1246 ---------- | |
1247 | |
1248 truncate : bool | |
1249 | |
1250 By default, the samtools pileup engine outputs all reads | |
1251 overlapping a region. If truncate is True and a region is | |
1252 given, only columns in the exact region specified are | |
1253 returned. | |
1254 | |
1255 max_depth : int | |
1256 Maximum read depth permitted. The default limit is '8000'. | |
1257 | |
1258 stepper : string | |
1259 The stepper controls how the iterator advances. | |
1260 Possible options for the stepper are | |
1261 | |
1262 ``all`` | |
1263 skip reads in which any of the following flags are set: | |
1264 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP | |
1265 | |
1266 ``nofilter`` | |
1267 uses every single read turning off any filtering. | |
1268 | |
1269 ``samtools`` | |
1270 same filter and read processing as in samtools | |
1271 pileup. For full compatibility, this requires a | |
1272 'fastafile' to be given. The following options all pertain | |
1273 to filtering of the ``samtools`` stepper. | |
1274 | |
1275 fastafile : :class:`~pysam.FastaFile` object. | |
1276 | |
1277 This is required for some of the steppers. | |
1278 | |
1279 ignore_overlaps: bool | |
1280 | |
1281 If set to True, detect if read pairs overlap and only take | |
1282 the higher quality base. This is the default. | |
1283 | |
1284 flag_filter : int | |
1285 | |
1286 ignore reads where any of the bits in the flag are set. The default is | |
1287 BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP. | |
1288 | |
1289 flag_require : int | |
1290 | |
1291 only use reads where certain flags are set. The default is 0. | |
1292 | |
1293 ignore_orphans: bool | |
1294 | |
1295 ignore orphans (paired reads that are not in a proper pair). | |
1296 The default is to ignore orphans. | |
1297 | |
1298 min_base_quality: int | |
1299 | |
1300 Minimum base quality. Bases below the minimum quality will | |
1301 not be output. The default is 13. | |
1302 | |
1303 adjust_capq_threshold: int | |
1304 | |
1305 adjust mapping quality. The default is 0 for no | |
1306 adjustment. The recommended value for adjustment is 50. | |
1307 | |
1308 min_mapping_quality : int | |
1309 | |
1310 only use reads above a minimum mapping quality. The default is 0. | |
1311 | |
1312 compute_baq: bool | |
1313 | |
1314 re-alignment computing per-Base Alignment Qualities (BAQ). The | |
1315 default is to do re-alignment. Realignment requires a reference | |
1316 sequence. If none is present, no realignment will be performed. | |
1317 | |
1318 redo_baq: bool | |
1319 | |
1320 recompute per-Base Alignment Quality on the fly ignoring | |
1321 existing base qualities. The default is False (use existing | |
1322 base qualities). | |
1323 | |
1324 Returns | |
1325 ------- | |
1326 | |
1327 an iterator over genomic positions. : IteratorColumn | |
1328 | |
1329 """ | |
1330 cdef int rtid, has_coord | |
1331 cdef int32_t rstart, rstop | |
1332 | |
1333 if not self.is_open: | |
1334 raise ValueError("I/O operation on closed file") | |
1335 | |
1336 has_coord, rtid, rstart, rstop = self.parse_region( | |
1337 contig, start, stop, region, reference=reference, end=end) | |
1338 | |
1339 if has_coord: | |
1340 if not self.has_index(): | |
1341 raise ValueError("no index available for pileup") | |
1342 | |
1343 return IteratorColumnRegion(self, | |
1344 tid=rtid, | |
1345 start=rstart, | |
1346 stop=rstop, | |
1347 **kwargs) | |
1348 else: | |
1349 if self.has_index(): | |
1350 return IteratorColumnAllRefs(self, **kwargs) | |
1351 else: | |
1352 return IteratorColumnAll(self, **kwargs) | |
1353 | |
1354 def count(self, | |
1355 contig=None, | |
1356 start=None, | |
1357 stop=None, | |
1358 region=None, | |
1359 until_eof=False, | |
1360 read_callback="nofilter", | |
1361 reference=None, | |
1362 end=None): | |
1363 '''count the number of reads in :term:`region` | |
1364 | |
1365 The region is specified by :term:`contig`, `start` and `stop`. | |
1366 :term:`reference` and `end` are also accepted for backward | |
1367 compatibility as synonyms for :term:`contig` and `stop`, | |
1368 respectively. Alternatively, a `samtools`_ :term:`region` | |
1369 string can be supplied. | |
1370 | |
1371 A :term:`SAM` file does not allow random access and if | |
1372 `region` or `contig` are given, an exception is raised. | |
1373 | |
1374 Parameters | |
1375 ---------- | |
1376 | |
1377 contig : string | |
1378 reference_name of the genomic region (chromosome) | |
1379 | |
1380 start : int | |
1381 start of the genomic region (0-based inclusive) | |
1382 | |
1383 stop : int | |
1384 end of the genomic region (0-based exclusive) | |
1385 | |
1386 region : string | |
1387 a region string in samtools format. | |
1388 | |
1389 until_eof : bool | |
1390 count until the end of the file, possibly including | |
1391 unmapped reads as well. | |
1392 | |
1393 read_callback: string or function | |
1394 | |
1395 select a call-back to ignore reads when counting. It can | |
1396 be either a string with the following values: | |
1397 | |
1398 ``all`` | |
1399 skip reads in which any of the following | |
1400 flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, | |
1401 BAM_FDUP | |
1402 | |
1403 ``nofilter`` | |
1404 uses every single read | |
1405 | |
1406 Alternatively, `read_callback` can be a function | |
1407 ``check_read(read)`` that should return True only for | |
1408 those reads that shall be included in the counting. | |
1409 | |
1410 reference : string | |
1411 backward compatible synonym for `contig` | |
1412 | |
1413 end : int | |
1414 backward compatible synonym for `stop` | |
1415 | |
1416 Raises | |
1417 ------ | |
1418 | |
1419 ValueError | |
1420 if the genomic coordinates are out of range or invalid. | |
1421 | |
1422 ''' | |
1423 cdef AlignedSegment read | |
1424 cdef long counter = 0 | |
1425 | |
1426 if not self.is_open: | |
1427 raise ValueError("I/O operation on closed file") | |
1428 | |
1429 cdef int filter_method = 0 | |
1430 if read_callback == "all": | |
1431 filter_method = 1 | |
1432 elif read_callback == "nofilter": | |
1433 filter_method = 2 | |
1434 | |
1435 for read in self.fetch(contig=contig, | |
1436 start=start, | |
1437 stop=stop, | |
1438 reference=reference, | |
1439 end=end, | |
1440 region=region, | |
1441 until_eof=until_eof): | |
1442 # apply filter | |
1443 if filter_method == 1: | |
1444 # filter = "all" | |
1445 if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)): | |
1446 continue | |
1447 elif filter_method == 2: | |
1448 # filter = "nofilter" | |
1449 pass | |
1450 else: | |
1451 if not read_callback(read): | |
1452 continue | |
1453 counter += 1 | |
1454 | |
1455 return counter | |
1456 | |
    @cython.boundscheck(False)  # we do manual bounds checking
    def count_coverage(self,
                       contig,
                       start=None,
                       stop=None,
                       region=None,
                       quality_threshold=15,
                       read_callback='all',
                       reference=None,
                       end=None):
        """count the coverage of genomic positions by reads in :term:`region`.

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a `samtools`_ :term:`region`
        string can be supplied. The coverage is computed per-base [ACGT].

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive). If not
            given, count from the start of the chromosome.

        stop : int
            end of the genomic region (0-based exclusive). If not given,
            count to the end of the chromosome.

        region : string
            a region string.

        quality_threshold : int
            quality_threshold is the minimum quality score (in phred) a
            base has to reach to be counted.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        Returns
        -------

        four array.arrays of the same length in order A C G T : tuple

        """

        # clip the requested window to the reference length
        cdef uint32_t contig_length = self.get_reference_length(contig)
        cdef int _start = start if start is not None else 0
        cdef int _stop = stop if stop is not None else contig_length
        _stop = _stop if _stop < contig_length else contig_length

        if _stop == _start:
            raise ValueError("interval of size 0")
        if _stop < _start:
            raise ValueError("interval of size less than 0")

        # one zero-initialised unsigned-long counter array per base;
        # typecode 'L' matches the .data.as_ulongs accesses below
        cdef int length = _stop - _start
        cdef c_array.array int_array_template = array.array('L', [])
        cdef c_array.array count_a
        cdef c_array.array count_c
        cdef c_array.array count_g
        cdef c_array.array count_t
        count_a = c_array.clone(int_array_template, length, zero=True)
        count_c = c_array.clone(int_array_template, length, zero=True)
        count_g = c_array.clone(int_array_template, length, zero=True)
        count_t = c_array.clone(int_array_template, length, zero=True)

        cdef AlignedSegment read
        cdef cython.str seq
        cdef c_array.array quality
        cdef int qpos
        cdef int refpos
        cdef int c = 0
        cdef int filter_method = 0


        # resolve the string shortcuts once, before the loop
        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        cdef int _threshold = quality_threshold or 0
        for read in self.fetch(contig=contig,
                               reference=reference,
                               start=start,
                               stop=stop,
                               end=end,
                               region=region):
            # apply filter
            if filter_method == 1:
                # filter = "all": drop unmapped/secondary/qcfail/duplicate
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue

            # count
            seq = read.seq
            if seq is None:
                # no sequence stored for this read - nothing to count
                continue
            quality = read.query_qualities

            # matches_only=True: only pairs where both qpos and refpos
            # are aligned are returned
            for qpos, refpos in read.get_aligned_pairs(True):
                if qpos is not None and refpos is not None and \
                   _start <= refpos < _stop:

                    # only check base quality if _threshold > 0
                    # (a read without qualities is skipped when a
                    # threshold is set)
                    if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
                        if seq[qpos] == 'A':
                            count_a.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'C':
                            count_c.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'G':
                            count_g.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'T':
                            count_t.data.as_ulongs[refpos - _start] += 1

        return count_a, count_c, count_g, count_t
1608 | |
1609 def find_introns_slow(self, read_iterator): | |
1610 """Return a dictionary {(start, stop): count} | |
1611 Listing the intronic sites in the reads (identified by 'N' in the cigar strings), | |
1612 and their support ( = number of reads ). | |
1613 | |
1614 read_iterator can be the result of a .fetch(...) call. | |
1615 Or it can be a generator filtering such reads. Example | |
1616 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) | |
1617 """ | |
1618 res = collections.Counter() | |
1619 for r in read_iterator: | |
1620 if 'N' in r.cigarstring: | |
1621 last_read_pos = False | |
1622 for read_loc, genome_loc in r.get_aligned_pairs(): | |
1623 if read_loc is None and last_read_pos: | |
1624 start = genome_loc | |
1625 elif read_loc and last_read_pos is None: | |
1626 stop = genome_loc # we are right exclusive ,so this is correct | |
1627 res[(start, stop)] += 1 | |
1628 del start | |
1629 del stop | |
1630 last_read_pos = read_loc | |
1631 return res | |
1632 | |
1633 def find_introns(self, read_iterator): | |
1634 """Return a dictionary {(start, stop): count} | |
1635 Listing the intronic sites in the reads (identified by 'N' in the cigar strings), | |
1636 and their support ( = number of reads ). | |
1637 | |
1638 read_iterator can be the result of a .fetch(...) call. | |
1639 Or it can be a generator filtering such reads. Example | |
1640 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) | |
1641 """ | |
1642 cdef: | |
1643 uint32_t base_position, junc_start, nt | |
1644 int op | |
1645 AlignedSegment r | |
1646 int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP | |
1647 | |
1648 res = collections.Counter() | |
1649 | |
1650 match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position | |
1651 for r in read_iterator: | |
1652 base_position = r.pos | |
1653 cigar = r.cigartuples | |
1654 if cigar is None: | |
1655 continue | |
1656 | |
1657 for op, nt in cigar: | |
1658 if op in match_or_deletion: | |
1659 base_position += nt | |
1660 elif op == BAM_CREF_SKIP: | |
1661 junc_start = base_position | |
1662 base_position += nt | |
1663 res[(junc_start, base_position)] += 1 | |
1664 return res | |
1665 | |
1666 | |
    def close(self):
        '''closes the :class:`pysam.AlignmentFile`.

        Closing an already closed file is a no-op. Raises IOError if
        htslib reports an error on close (other than a broken pipe).
        '''

        if self.htsfile == NULL:
            # already closed
            return

        if self.index != NULL:
            # release the index before the file it belongs to
            hts_idx_destroy(self.index)
            self.index = NULL

        cdef int ret = hts_close(self.htsfile)
        self.htsfile = NULL

        self.header = None

        if ret < 0:
            global errno
            if errno == EPIPE:
                # a broken pipe (e.g. the downstream reader went away)
                # is treated as a benign close; clear errno
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))
1688 | |
    def __dealloc__(self):
        # Deallocation mirrors close(): release the index, then the
        # htsfile, then the per-file alignment buffer, and finally
        # surface any close error (except a broken pipe).
        cdef int ret = 0

        if self.index != NULL:
            hts_idx_destroy(self.index)
            self.index = NULL

        if self.htsfile != NULL:
            ret = hts_close(self.htsfile)
            self.htsfile = NULL

        self.header = None

        if self.b:
            # free the working bam1_t buffer
            bam_destroy1(self.b)
            self.b = NULL

        if ret < 0:
            global errno
            if errno == EPIPE:
                # benign: downstream reader closed first
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))
1712 | |
    cpdef int write(self, AlignedSegment read) except -1:
        '''
        write a single :class:`pysam.AlignedSegment` to disk.

        Raises:
            ValueError
                if the read refers to a reference id not present in
                the header
            IOError
                if the writing failed

        Returns:
            int :
                the number of bytes written. If the file is closed,
                this will be 0.
        '''
        if not self.is_open:
            # writing to a closed file is a silent no-op by contract
            return 0

        # guard against a read whose reference id is outside this
        # file's header
        if self.header.ptr.n_targets <= read._delegate.core.tid:
            raise ValueError(
                "AlignedSegment refers to reference number {} that "
                "is larger than the number of references ({}) in the header".format(
                    read._delegate.core.tid, self.header.ptr.n_targets))

        cdef int ret
        with nogil:
            ret = sam_write1(self.htsfile,
                             self.header.ptr,
                             read._delegate)

        # kbj: Still need to raise an exception with except -1. Otherwise
        # when ret == -1 we get a "SystemError: error return without
        # exception set".
        if ret < 0:
            raise IOError(
                "sam_write1 failed with error code {}".format(ret))

        return ret
1749 | |
1750 # context manager interface | |
    def __enter__(self):
        """enter the context manager: return the file object itself."""
        return self
1753 | |
    def __exit__(self, exc_type, exc_value, traceback):
        """exit the context manager: close the file.

        Returns False so that an exception raised inside the ``with``
        block propagates to the caller.
        """
        self.close()
        return False
1757 | |
1758 ############################################################### | |
1759 ############################################################### | |
1760 ############################################################### | |
1761 ## properties | |
1762 ############################################################### | |
    property mapped:
        """int with total number of mapped alignments according to the
        statistics recorded in the index. This is a read-only
        attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            # requires an open bam/cram file with a loaded index
            self.check_index()
            cdef int tid
            cdef uint64_t total = 0
            cdef uint64_t mapped, unmapped
            # sum the per-reference mapped counts stored in the index
            for tid from 0 <= tid < self.header.nreferences:
                with nogil:
                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
                total += mapped
            return total
1780 | |
    property unmapped:
        """int with total number of unmapped reads according to the statistics
        recorded in the index. This number of reads includes the number of reads
        without coordinates. This is a read-only attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            # raises if no index has been loaded
            self.check_index()
            cdef int tid
            # start with reads that have no coordinates at all ("*")
            cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
            cdef uint64_t mapped, unmapped
            # add the per-reference unmapped counts from the index
            for tid from 0 <= tid < self.header.nreferences:
                with nogil:
                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
                total += unmapped
            return total
1798 | |
    property nocoordinate:
        """int with total number of reads without coordinates according to the
        statistics recorded in the index, i.e., the statistic printed for "*"
        by the ``samtools idxstats`` command. This is a read-only attribute.
        (This will be 0 for a CRAM file indexed by a .crai index, as that
        index format does not record these statistics.)
        """
        def __get__(self):
            # raises if no index has been loaded
            self.check_index()
            cdef uint64_t n
            with nogil:
                n = hts_idx_get_n_no_coor(self.index)
            return n
1812 | |
    def get_index_statistics(self):
        """return statistics about mapped/unmapped reads per chromosome as
        they are stored in the index, similarly to the statistics printed
        by the ``samtools idxstats`` command.

        CRAI indexes do not record these statistics, so for a CRAM file
        with a .crai index the returned statistics will all be 0.

        Returns:
            list :
                a list of records for each chromosome. Each record has the
                attributes 'contig', 'mapped', 'unmapped' and 'total'.
        """

        # raises if no index has been loaded
        self.check_index()
        cdef int tid
        cdef uint64_t mapped, unmapped
        results = []
        # TODO: use header
        for tid from 0 <= tid < self.nreferences:
            with nogil:
                hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
            # IndexStats fields: (contig, mapped, unmapped, total)
            results.append(
                IndexStats._make((
                    self.get_reference_name(tid),
                    mapped,
                    unmapped,
                    mapped + unmapped)))

        return results
1843 | |
1844 ############################################################### | |
1845 ## file-object like iterator access | |
1846 ## note: concurrent access will cause errors (see IteratorRow | |
1847 ## and multiple_iterators) | |
1848 ## Possible solutions: deprecate or open new file handle | |
1849 def __iter__(self): | |
1850 if not self.is_open: | |
1851 raise ValueError("I/O operation on closed file") | |
1852 | |
1853 if not self.is_bam and self.header.nreferences == 0: | |
1854 raise NotImplementedError( | |
1855 "can not iterate over samfile without header") | |
1856 return self | |
1857 | |
    cdef bam1_t * getCurrent(self):
        # expose the internal alignment buffer (C-level callers only)
        return self.b
1860 | |
    cdef int cnext(self):
        '''
        cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
        '''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        # sam_read1 return code: >= 0 success, -1 EOF, < -1 error
        return ret
1872 | |
    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            # wrap the reusable C record in a Python AlignedSegment
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # -1 from sam_read1 signals normal end of file
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
1881 | |
1882 ########################################### | |
1883 # methods/properties referencing the header | |
1884 def is_valid_tid(self, int tid): | |
1885 """ | |
1886 return True if the numerical :term:`tid` is valid; False otherwise. | |
1887 | |
1888 Note that the unmapped tid code (-1) counts as an invalid. | |
1889 """ | |
1890 if self.header is None: | |
1891 raise ValueError("header not available in closed files") | |
1892 return self.header.is_valid_tid(tid) | |
1893 | |
1894 def get_tid(self, reference): | |
1895 """ | |
1896 return the numerical :term:`tid` corresponding to | |
1897 :term:`reference` | |
1898 | |
1899 returns -1 if reference is not known. | |
1900 """ | |
1901 if self.header is None: | |
1902 raise ValueError("header not available in closed files") | |
1903 return self.header.get_tid(reference) | |
1904 | |
1905 def get_reference_name(self, tid): | |
1906 """ | |
1907 return :term:`reference` name corresponding to numerical :term:`tid` | |
1908 """ | |
1909 if self.header is None: | |
1910 raise ValueError("header not available in closed files") | |
1911 return self.header.get_reference_name(tid) | |
1912 | |
1913 def get_reference_length(self, reference): | |
1914 """ | |
1915 return :term:`reference` length corresponding to numerical :term:`tid` | |
1916 """ | |
1917 if self.header is None: | |
1918 raise ValueError("header not available in closed files") | |
1919 return self.header.get_reference_length(reference) | |
1920 | |
1921 property nreferences: | |
1922 """int with the number of :term:`reference` sequences in the file. | |
1923 This is a read-only attribute.""" | |
1924 def __get__(self): | |
1925 if self.header: | |
1926 return self.header.nreferences | |
1927 else: | |
1928 raise ValueError("header not available in closed files") | |
1929 | |
1930 property references: | |
1931 """tuple with the names of :term:`reference` sequences. This is a | |
1932 read-only attribute""" | |
1933 def __get__(self): | |
1934 if self.header: | |
1935 return self.header.references | |
1936 else: | |
1937 raise ValueError("header not available in closed files") | |
1938 | |
1939 property lengths: | |
1940 """tuple of the lengths of the :term:`reference` sequences. This is a | |
1941 read-only attribute. The lengths are in the same order as | |
1942 :attr:`pysam.AlignmentFile.references` | |
1943 | |
1944 """ | |
1945 def __get__(self): | |
1946 if self.header: | |
1947 return self.header.lengths | |
1948 else: | |
1949 raise ValueError("header not available in closed files") | |
1950 | |
1951 # Compatibility functions for pysam < 0.14 | |
1952 property text: | |
1953 """deprecated, use :attr:`references` and :attr:`lengths` instead""" | |
1954 def __get__(self): | |
1955 if self.header: | |
1956 return self.header.__str__() | |
1957 else: | |
1958 raise ValueError("header not available in closed files") | |
1959 | |
1960 # Compatibility functions for pysam < 0.8.3 | |
    def gettid(self, reference):
        """deprecated, use :meth:`get_tid` instead"""
        # pre-0.8.3 compatibility shim; simply delegates
        return self.get_tid(reference)
1964 | |
    def getrname(self, tid):
        """deprecated, use :meth:`get_reference_name` instead"""
        # pre-0.8.3 compatibility shim; simply delegates
        return self.get_reference_name(tid)
1968 | |
1969 | |
cdef class IteratorRow:
    '''abstract base class for iterators over mapped reads.

    Various iterators implement different behaviours for wrapping around
    contig boundaries. Examples include:

    :class:`pysam.IteratorRowRegion`
        iterate within a single contig and a defined region.

    :class:`pysam.IteratorRowAll`
        iterate until EOF. This iterator will also include unmapped reads.

    :class:`pysam.IteratorRowAllRefs`
        iterate over all reads in all reference sequences.

    The method :meth:`AlignmentFile.fetch` returns an IteratorRow.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    '''

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        cdef char *cfilename
        cdef char *creference_filename
        cdef char *cindexname = NULL

        if not samfile.is_open:
            raise ValueError("I/O operation on closed file")

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.samfile = samfile

        # reopen the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:

            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            assert self.htsfile != NULL

            if samfile.has_index():
                if samfile.index_filename:
                    # keep a reference to the bytes object so the char
                    # pointer stays valid during the load call below
                    cindexname = bindex_filename = encode_filename(samfile.index_filename)
                with nogil:
                    self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
            else:
                self.index = NULL

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr is NULL:
                raise IOError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)

            # this iterator owns its handles and must close them on dealloc
            self.owns_samfile = True

            # options specific to CRAM files
            if samfile.is_cram and samfile.reference_filename:
                creference_filename = samfile.reference_filename
                hts_set_opt(self.htsfile,
                            CRAM_OPT_REFERENCE,
                            creference_filename)

        else:
            # share the already-open handles; do not close them on dealloc
            self.htsfile = samfile.htsfile
            self.index = samfile.index
            self.owns_samfile = False
            self.header = samfile.header

        self.retval = 0

        # reusable C alignment buffer for the read loop
        self.b = bam_init1()

    def __dealloc__(self):
        bam_destroy1(self.b)
        # only destroy handles this iterator opened itself
        if self.owns_samfile:
            hts_idx_destroy(self.index)
            hts_close(self.htsfile)
2056 | |
2057 | |
cdef class IteratorRowRegion(IteratorRow):
    """*(AlignmentFile samfile, int tid, int beg, int stop,
    int multiple_iterators=False)*

    iterate over mapped reads in a region.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile,
                 int tid, int beg, int stop,
                 int multiple_iterators=False):

        if not samfile.has_index():
            raise ValueError("no index available for iteration")

        super().__init__(samfile, multiple_iterators=multiple_iterators)
        # create the htslib region iterator over [beg, stop) on tid
        with nogil:
            self.iter = sam_itr_queryi(
                self.index,
                tid,
                beg,
                stop)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # the return code is stored in self.retval so callers
        # (e.g. IteratorRowAllRefs) can inspect it after the call
        with nogil:
            self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
                                       self.iter,
                                       self.b,
                                       self.htsfile)

    def __next__(self):
        self.cnext()
        if self.retval >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif self.retval == -1:
            # -1 signals normal end of iteration
            raise StopIteration
        elif self.retval == -2:
            # Note: it is currently not the case that hts_iter_next
            # returns -2 for a truncated file.
            # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
            raise IOError('truncated file')
        else:
            raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval))

    def __dealloc__(self):
        hts_itr_destroy(self.iter)
2117 | |
2118 | |
cdef class IteratorRowHead(IteratorRow):
    """*(AlignmentFile samfile, n, int multiple_iterators=False)*

    iterate over first n reads in `samfile`

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.head`.

    """

    def __init__(self,
                 AlignmentFile samfile,
                 int n,
                 int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # yield at most max_rows reads, counted in current_row
        self.max_rows = n
        self.current_row = 0

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        # stop once the requested number of reads has been returned
        if self.current_row >= self.max_rows:
            raise StopIteration

        cdef int ret = self.cnext()
        if ret >= 0:
            self.current_row += 1
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # end of file before n reads were seen
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2168 | |
2169 | |
cdef class IteratorRowAll(IteratorRow):
    """*(AlignmentFile samfile, int multiple_iterators=False)*

    iterate over all reads in `samfile`

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        # sequential read of the next record, no index involved
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # -1 from sam_read1 signals normal end of file
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2210 | |
2211 | |
cdef class IteratorRowAllRefs(IteratorRow):
    """iterates over all mapped reads by chaining iterators over each
    reference

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        if not samfile.has_index():
            raise ValueError("no index available for fetch")

        # -1 marks "no per-reference iterator created yet"
        self.tid = -1

    def nextiter(self):
        # get a new iterator for a chromosome. The file
        # will not be re-opened.
        self.rowiter = IteratorRowRegion(self.samfile,
                                         self.tid,
                                         0,
                                         MAX_POS)
        # set htsfile and header of the rowiter
        # to the values in this iterator to reflect multiple_iterators
        self.rowiter.htsfile = self.htsfile
        self.rowiter.header = self.header

        # make sure the iterator understand that IteratorRowAllRefs
        # has ownership
        self.rowiter.owns_samfile = False

    def __iter__(self):
        return self

    def __next__(self):
        # Create an initial iterator
        if self.tid == -1:
            if not self.samfile.nreferences:
                raise StopIteration
            self.tid = 0
            self.nextiter()

        while 1:
            self.rowiter.cnext()

            # If current iterator is not exhausted, return aligned read
            # NOTE(review): this tests retval > 0 whereas IteratorRowRegion
            # treats retval >= 0 as success - confirm retval == 0 cannot
            # occur for hts_itr_next here
            if self.rowiter.retval > 0:
                return makeAlignedSegment(self.rowiter.b, self.header)

            self.tid += 1

            # Otherwise, proceed to next reference or stop
            if self.tid < self.samfile.nreferences:
                self.nextiter()
            else:
                raise StopIteration
2272 | |
2273 | |
cdef class IteratorRowSelection(IteratorRow):
    """*(AlignmentFile samfile)*

    iterate over reads in `samfile` at a given list of file positions.

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
    """

    def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # positions are bgzf virtual file offsets, visited in order
        self.positions = positions
        self.current_pos = 0

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator'''
        # end iteration if out of positions
        if self.current_pos >= len(self.positions): return -1

        # seek to the next stored virtual offset, then read one record
        cdef uint64_t pos = self.positions[self.current_pos]
        with nogil:
            bgzf_seek(hts_get_bgzfp(self.htsfile),
                      pos,
                      0)
        self.current_pos += 1

        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            # list of positions exhausted (or EOF)
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))
2324 | |
2325 | |
cdef int __advance_nofilter(void *data, bam1_t *b):
    '''advance without any read filtering.
    '''
    # pileup callback: fetch the next read from the region iterator
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    with nogil:
        ret = sam_itr_next(d.htsfile, d.iter, b)
    return ret
2334 | |
2335 | |
cdef int __advance_raw_nofilter(void *data, bam1_t *b):
    '''advance (without iterator) without any read filtering.
    '''
    # pileup callback: sequential sam_read1, used when no region iterator exists
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    with nogil:
        ret = sam_read1(d.htsfile, d.header, b)
    return ret
2344 | |
2345 | |
cdef int __advance_all(void *data, bam1_t *b):
    '''only use reads for pileup passing basic filters such as

    BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
    '''
    # pileup callback: loop until a read passes the configured flag
    # filter (d.flag_filter, which defaults to the flags named above)
    # or input is exhausted (ret < 0).
    # (A previously computed local `mask` was never used - the filter
    # actually applied is d.flag_filter - so it has been removed.)
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    while 1:
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b)
        if ret < 0:
            break
        if b.core.flag & d.flag_filter:
            continue
        break
    return ret
2364 | |
2365 | |
cdef int __advance_raw_all(void *data, bam1_t *b):
    '''only use reads for pileup passing basic filters such as

    BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
    '''
    # pileup callback: like __advance_all but reads sequentially with
    # sam_read1 instead of a region iterator
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    while 1:
        with nogil:
            ret = sam_read1(d.htsfile, d.header, b)
        if ret < 0:
            break
        # skip reads with any excluded flag set
        if b.core.flag & d.flag_filter:
            continue
        break
    return ret
2383 | |
2384 | |
cdef int __advance_samtools(void * data, bam1_t * b):
    '''advance using same filter and read processing as in
    the samtools pileup.
    '''
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    cdef int q

    while 1:
        # use the region iterator when present, plain sequential reads otherwise
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
        if ret < 0:
            break
        # skip reads with any excluded flag set
        if b.core.flag & d.flag_filter:
            continue
        # skip reads missing a required flag
        if d.flag_require and not (b.core.flag & d.flag_require):
            continue

        # reload sequence
        if d.fastafile != NULL and b.core.tid != d.tid:
            # free the previously cached reference sequence, then fetch
            # the one for the read's contig
            if d.seq != NULL:
                free(d.seq)
            d.tid = b.core.tid
            with nogil:
                d.seq = faidx_fetch_seq(
                    d.fastafile,
                    d.header.target_name[d.tid],
                    0, MAX_POS,
                    &d.seq_len)

            if d.seq == NULL:
                raise ValueError(
                    "reference sequence for '{}' (tid={}) not found".format(
                        d.header.target_name[d.tid], d.tid))

        # realign read - changes base qualities
        if d.seq != NULL and d.compute_baq:
            # 4th option to realign is flag:
            # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4
            if d.redo_baq:
                sam_prob_realn(b, d.seq, d.seq_len, 7)
            else:
                sam_prob_realn(b, d.seq, d.seq_len, 3)

        # optionally cap the mapping quality; a negative capped value
        # drops the read entirely
        if d.seq != NULL and d.adjust_capq_threshold > 10:
            q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold)
            if q < 0:
                continue
            elif b.core.qual > q:
                b.core.qual = q

        if b.core.qual < d.min_mapping_quality:
            continue
        # drop paired reads that are not in a proper pair if requested
        if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR):
            continue

        break

    return ret
2444 | |
2445 | |
cdef class IteratorColumn:
    '''abstract base class for iterators over columns.

    IteratorColumn objects wrap the pileup functionality of samtools.

    For reasons of efficiency, the iterator points to the current
    pileup buffer. The pileup buffer is updated at every iteration.
    This might cause some unexpected behaviour. For example,
    consider the conversion to a list::

        f = AlignmentFile("file.bam", "rb")
        result = list(f.pileup())

    Here, ``result`` will contain ``n`` objects of type
    :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
    ``result`` will contain the same information.

    The desired behaviour can be achieved by list comprehension::

        result = [x.pileups() for x in f.pileup()]

    ``result`` will be a list of ``n`` lists of objects of type
    :class:`~pysam.PileupRead`.

    If the iterator is associated with a :class:`~pysam.Fastafile`
    using the :meth:`add_reference` method, then the iterator will
    export the current sequence via the methods :meth:`get_sequence`
    and :meth:`seq_len`.

    See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
    '''

    def __cinit__( self, AlignmentFile samfile, **kwargs):
        # configuration comes in via kwargs with samtools-like defaults
        self.samfile = samfile
        self.fastafile = kwargs.get("fastafile", None)
        self.stepper = kwargs.get("stepper", "samtools")
        self.max_depth = kwargs.get("max_depth", 8000)
        self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
        self.min_base_quality = kwargs.get("min_base_quality", 13)
        # iterdata is the C struct handed to the __advance_* callbacks
        self.iterdata.seq = NULL
        self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
        self.iterdata.flag_require = kwargs.get("flag_require", 0)
        self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
        self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
        self.iterdata.compute_baq = kwargs.get("compute_baq", True)
        self.iterdata.redo_baq = kwargs.get("redo_baq", False)
        self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)

        # current column state, updated by cnext()
        self.tid = 0
        self.pos = 0
        self.n_plp = 0
        self.plp = NULL
        self.pileup_iter = <bam_mplp_t>NULL

    def __iter__(self):
        return self

    cdef int cnext(self):
        '''perform next iteration.
        '''
        # do not release gil here because of call-backs
        cdef int ret = bam_mplp_auto(self.pileup_iter,
                                     &self.tid,
                                     &self.pos,
                                     &self.n_plp,
                                     &self.plp)
        return ret

    cdef char * get_sequence(self):
        '''return current reference sequence underlying the iterator.
        '''
        return self.iterdata.seq

    property seq_len:
        '''current sequence length.'''
        def __get__(self):
            return self.iterdata.seq_len

    def add_reference(self, FastaFile fastafile):
        '''
        add reference sequences in `fastafile` to iterator.'''
        self.fastafile = fastafile
        # drop any cached sequence; tid = -1 forces a re-fetch on next use
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
        self.iterdata.tid = -1
        self.iterdata.fastafile = self.fastafile.fastafile

    def has_reference(self):
        '''
        return true if iterator is associated with a reference'''
        return self.fastafile

    cdef _setup_iterator(self,
                         int tid,
                         int start,
                         int stop,
                         int multiple_iterators=0):
        '''setup the iterator structure'''

        # region-based row iterator feeding the pileup callbacks
        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.iter = self.iter.iter
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        # choose the advance callback according to the stepper
        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef _setup_raw_rest_iterator(self):
        '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST'''

        # no region iterator: the raw advance callbacks use sam_read1
        self.iter = None
        self.iterdata.iter = NULL
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        # choose the raw (iterator-less) advance callback per stepper;
        # the "samtools" stepper handles both cases internally
        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef reset(self, tid, start, stop):
        '''reset iterator position.

        This permits using the iterator multiple times without
        having to incur the full set-up costs.
        '''
        if self.iter is None:
            raise TypeError("Raw iterator set up without region cannot be reset")

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
        self.iterdata.iter = self.iter.iter

        # invalidate sequence if different tid
        if self.tid != tid:
            if self.iterdata.seq != NULL:
                free(self.iterdata.seq)
            self.iterdata.seq = NULL
            self.iterdata.tid = -1

        # self.pileup_iter = bam_mplp_init(1
        #                                  &__advancepileup,
        #                                  &self.iterdata)
        with nogil:
            bam_mplp_reset(self.pileup_iter)

    cdef _free_pileup_iter(self):
        '''free the memory alloc'd by bam_plp_init.

        This is needed before setup_iterator allocates another
        pileup_iter, or else memory will be lost. '''
        if self.pileup_iter != <bam_mplp_t>NULL:
            with nogil:
                bam_mplp_reset(self.pileup_iter)
                bam_mplp_destroy(self.pileup_iter)
                self.pileup_iter = <bam_mplp_t>NULL

    def __dealloc__(self):
        # reset in order to avoid memory leak messages for iterators
        # that have not been fully consumed
        self._free_pileup_iter()
        self.plp = <const bam_pileup1_t*>NULL

        # release the cached reference sequence, if any
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            self.iterdata.seq = NULL

    # backwards compatibility

    def hasReference(self):
        # deprecated alias for has_reference()
        return self.has_reference()
    cdef char * getSequence(self):
        # deprecated alias for get_sequence()
        return self.get_sequence()
    def addReference(self, FastaFile fastafile):
        # deprecated alias for add_reference()
        return self.add_reference(fastafile)
2694 | |
2695 | |
cdef class IteratorColumnRegion(IteratorColumn):
    '''iterates over a region only.
    '''
    def __cinit__(self,
                  AlignmentFile samfile,
                  int tid = 0,
                  int start = 0,
                  int stop = MAX_POS,
                  int truncate = False,
                  int multiple_iterators = True,
                  **kwargs ):

        # initialize iterator. Multiple iterators not available
        # for CRAM.
        if multiple_iterators and samfile.is_cram:
            warnings.warn("multiple_iterators not implemented for CRAM")
            multiple_iterators = False

        self._setup_iterator(tid, start, stop, multiple_iterators)
        self.start = start
        self.stop = stop
        self.truncate = truncate

    def __next__(self):

        cdef int n

        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration" )

            # n == 0 means the pileup is exhausted
            if n == 0:
                raise StopIteration

            # with truncate, skip columns before start and end at stop
            if self.truncate:
                if self.start > self.pos:
                    continue
                if self.pos >= self.stop:
                    raise StopIteration

            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)
2744 | |
2745 | |
cdef class IteratorColumnAllRefs(IteratorColumn):
    """iterates over all columns by chaining iterators over each reference
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # no iteration over empty files
        if not samfile.nreferences:
            raise StopIteration

        # initialize iterator on the first reference; the final
        # argument (1) requests a re-opened file handle, matching the
        # multiple_iterators parameter used by IteratorColumnRegion.
        # NOTE(review): self.tid is presumably 0 here (first reference)
        # — set/initialized in the base class, confirm there.
        self._setup_iterator(self.tid, 0, MAX_POS, 1)

    def __next__(self):

        cdef int n
        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration")

            # proceed to next reference or stop
            if n == 0:
                self.tid += 1
                if self.tid < self.samfile.nreferences:
                    # chain onto the next reference; pass 0 so the
                    # already-opened handle is reused
                    self._setup_iterator(self.tid, 0, MAX_POS, 0)
                else:
                    raise StopIteration
                continue

            # return result, if within same reference
            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)
2786 | |
2787 | |
2788 cdef class IteratorColumnAll(IteratorColumn): | |
2789 """iterates over all columns, without using an index | |
2790 """ | |
2791 | |
2792 def __cinit__(self, | |
2793 AlignmentFile samfile, | |
2794 **kwargs): | |
2795 | |
2796 self._setup_raw_rest_iterator() | |
2797 | |
2798 def __next__(self): | |
2799 | |
2800 cdef int n | |
2801 n = self.cnext() | |
2802 if n < 0: | |
2803 raise ValueError("error during iteration") | |
2804 | |
2805 if n == 0: | |
2806 raise StopIteration | |
2807 | |
2808 return makePileupColumn(&self.plp, | |
2809 self.tid, | |
2810 self.pos, | |
2811 self.n_plp, | |
2812 self.min_base_quality, | |
2813 self.iterdata.seq, | |
2814 self.samfile.header) | |
2815 | |
2816 | |
cdef class SNPCall:
    '''the results of a SNP call.'''
    # C-level storage for the call; exposed read-only through the
    # legacy `property` blocks below
    cdef int _tid
    cdef int _pos
    cdef char _reference_base
    cdef char _genotype
    cdef int _consensus_quality
    cdef int _snp_quality
    cdef int _rms_mapping_quality
    cdef int _coverage

    property tid:
        '''the chromosome ID as is defined in the header'''
        def __get__(self):
            return self._tid

    property pos:
        '''nucleotide position of SNP.'''
        def __get__(self): return self._pos

    property reference_base:
        '''reference base at pos. ``N`` if no reference sequence supplied.'''
        # stored as a single C char; converted to a length-1 string
        def __get__(self): return from_string_and_size( &self._reference_base, 1 )

    property genotype:
        '''the genotype called.'''
        # stored as a single C char; converted to a length-1 string
        def __get__(self): return from_string_and_size( &self._genotype, 1 )

    property consensus_quality:
        '''the genotype quality (Phred-scaled).'''
        def __get__(self): return self._consensus_quality

    property snp_quality:
        '''the snp quality (Phred scaled) - probability of consensus being
        identical to reference sequence.'''
        def __get__(self): return self._snp_quality

    property mapping_quality:
        '''the root mean square (rms) of the mapping quality of all reads
        involved in the call.'''
        def __get__(self): return self._rms_mapping_quality

    property coverage:
        '''coverage or read depth - the number of reads involved in the call.'''
        def __get__(self): return self._coverage

    def __str__(self):
        # tab-separated summary of all fields, in declaration order
        return "\t".join( map(str, (
            self.tid,
            self.pos,
            self.reference_base,
            self.genotype,
            self.consensus_quality,
            self.snp_quality,
            self.mapping_quality,
            self.coverage ) ) )
2874 | |
2875 | |
cdef class IndexedReads:
    """Index a Sam/BAM-file by query name while keeping the
    original sort order intact.

    The index is kept in memory and can be substantial.

    By default, the file is re-opened to avoid conflicts if multiple
    operators work on the same file. Set `multiple_iterators` = False
    to not re-open `samfile`.

    Parameters
    ----------

    samfile : AlignmentFile
        File to be indexed.

    multiple_iterators : bool
        Flag indicating whether the file should be reopened. Reopening prevents
        existing iterators being affected by the indexing.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
        cdef char *cfilename

        # makes sure that samfile stays alive as long as this
        # object is alive.
        self.samfile = samfile
        cdef bam_hdr_t * hdr = NULL
        # the byte-offset index below relies on BGZF virtual offsets,
        # hence BAM only
        assert samfile.is_bam, "can only apply IndexReads on bam files"

        # multiple_iterators the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:
            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            if self.htsfile == NULL:
                raise OSError("unable to reopen htsfile")

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr == NULL:
                raise OSError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)
            # we opened the handle, so __dealloc__ must close it
            self.owns_samfile = True
        else:
            # share the caller's handle; do NOT close it on dealloc
            self.htsfile = self.samfile.htsfile
            self.header = samfile.header
            self.owns_samfile = False

    def build(self):
        '''build the index.'''

        # maps query name -> list of BGZF virtual file offsets
        self.index = collections.defaultdict(list)

        # this method will start indexing from the current file position
        cdef int ret = 1
        cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
        if b == NULL:
            raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))

        cdef uint64_t pos
        cdef bam_hdr_t * hdr = self.header.ptr

        while ret > 0:
            with nogil:
                # record the offset BEFORE reading, so it points at the
                # start of the record just about to be read
                pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
                ret = sam_read1(self.htsfile,
                                hdr,
                                b)

            if ret > 0:
                qname = charptr_to_str(pysam_bam_get_qname(b))
                self.index[qname].append(pos)

        # frees b and its internal buffers
        bam_destroy1(b)

    def find(self, query_name):
        '''find `query_name` in index.

        Returns
        -------

        IteratorRowSelection
            Returns an iterator over all reads with query_name.

        Raises
        ------

        KeyError
            if the `query_name` is not in the index.

        '''
        if query_name in self.index:
            # multiple_iterators=False: the selection iterator seeks on
            # the handle held by this object
            return IteratorRowSelection(
                self.samfile,
                self.index[query_name],
                multiple_iterators = False)
        else:
            raise KeyError("read %s not found" % query_name)

    def __dealloc__(self):
        # close the handle only if we re-opened it in __init__
        if self.owns_samfile:
            hts_close(self.htsfile)