Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,2982 @@ +# cython: embedsignature=True +# cython: profile=True +######################################################## +######################################################## +# Cython wrapper for SAM/BAM/CRAM files based on htslib +######################################################## +# The principal classes defined in this module are: +# +# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files +# +# class AlignmentHeader manage SAM/BAM/CRAM header data +# +# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping +# the original sort order intact +# +# Additionally this module defines numerous additional classes that +# are part of the internal API. These are: +# +# Various iterator classes to iterate over alignments in sequential +# (IteratorRow) or in a stacked fashion (IteratorColumn): +# +# class IteratorRow +# class IteratorRowRegion +# class IteratorRowHead +# class IteratorRowAll +# class IteratorRowAllRefs +# class IteratorRowSelection +# class IteratorColumn +# class IteratorColumnRegion +# class IteratorColumnAll +# class IteratorColumnAllRefs +# +######################################################## +# +# The MIT License +# +# Copyright (c) 2015 Andreas Heger +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all 
copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +######################################################## +import os +import collections +try: + from collections.abc import Sequence, Mapping # noqa +except ImportError: + from collections import Sequence, Mapping # noqa +import re +import warnings +import array +from libc.errno cimport errno, EPIPE +from libc.string cimport strcmp, strpbrk, strerror +from libc.stdint cimport INT32_MAX + +from cpython cimport array as c_array + +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str +from pysam.libcutils cimport encode_filename, from_string_and_size +from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn +from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \ + HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL + +from io import StringIO + +cimport cython + + +__all__ = [ + "AlignmentFile", + "AlignmentHeader", + "IteratorRow", + "IteratorColumn", + "IndexedReads"] + +IndexStats = collections.namedtuple("IndexStats", + ("contig", + "mapped", + "unmapped", + "total")) + +######################################################## +## global variables +# maximum genomic coordinace +# for some reason, using 'int' causes overflow +cdef int MAX_POS = (1 << 31) - 1 + +# valid types for SAM headers +VALID_HEADER_TYPES = {"HD" : Mapping, + "SQ" : Sequence, + "RG" : Sequence, + "PG" : Sequence, + "CO" : Sequence} + +# order of records within SAM headers +VALID_HEADERS = ("HD", "SQ", "RG", "PG", 
"CO") + +# default type conversions within SAM header records +KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str, + "SS" : str,}, + "SQ" : {"SN" : str, "LN" : int, "AS" : str, + "M5" : str, "SP" : str, "UR" : str, + "AH" : str, "TP" : str, "DS" : str, + "AN" : str,}, + "RG" : {"ID" : str, "CN" : str, "DS" : str, + "DT" : str, "FO" : str, "KS" : str, + "LB" : str, "PG" : str, "PI" : str, + "PL" : str, "PM" : str, "PU" : str, + "SM" : str, "BC" : str,}, + "PG" : {"ID" : str, "PN" : str, "CL" : str, + "PP" : str, "DS" : str, "VN" : str,},} + +# output order of fields within records. Ensure that CL is at +# the end as parsing a CL will ignore any subsequent records. +VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"), + "SQ" : ("SN", "LN", "AS", "M5", + "UR", "SP", "AH", "TP", + "DS", "AN"), + "RG" : ("ID", "CN", "SM", "LB", + "PU", "PI", "DT", "DS", + "PL", "FO", "KS", "PG", + "PM", "BC"), + "PG" : ("PN", "ID", "VN", "PP", + "DS", "CL"),} + + +def build_header_line(fields, record): + '''build a header line from `fields` dictionary for `record`''' + + # TODO: add checking for field and sort order + line = ["@%s" % record] + # comment + if record == "CO": + line.append(fields) + # user tags + elif record.islower(): + for key in sorted(fields): + line.append("%s:%s" % (key, str(fields[key]))) + # defined tags + else: + # write fields of the specification + for key in VALID_HEADER_ORDER[record]: + if key in fields: + line.append("%s:%s" % (key, str(fields[key]))) + # write user fields + for key in fields: + if not key.isupper(): + line.append("%s:%s" % (key, str(fields[key]))) + + return "\t".join(line) + + +cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr): + if not hdr: + raise ValueError('cannot create AlignmentHeader, received NULL pointer') + + # check: is AlignmetHeader.__cinit__ called? 
+ cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader) + header.ptr = hdr + + return header + +def read_failure_reason(code): + if code == -2: + return 'truncated file' + else: + return "error {} while reading file".format(code) + + +# the following should be class-method for VariantHeader, but cdef @classmethods +# are not implemented in cython. +cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest, + reference_names, + reference_lengths, + add_sq_text=True, + text=None) except -1: + """build header from list of reference names and lengths. + """ + +cdef class AlignmentHeader(object): + """header information for a :class:`AlignmentFile` object + + Parameters + ---------- + header_dict : dict + build header from a multi-level dictionary. The + first level are the four types ('HD', 'SQ', ...). The second + level are a list of lines, with each line being a list of + tag-value pairs. The header is constructed first from all the + defined fields, followed by user tags in alphabetical + order. Alternatively, an :class:`~pysam.AlignmentHeader` + object can be passed directly. + + text : string + use the string provided as the header + + reference_names : list + see reference_lengths + + reference_lengths : list + build header from list of chromosome names and lengths. By + default, 'SQ' and 'LN' tags will be added to the header + text. This option can be changed by unsetting the flag + `add_sq_text`. + + add_sq_text : bool + do not add 'SQ' and 'LN' tags to header. This option permits + construction :term:`SAM` formatted files without a header. 
+ + """ + + # See makeVariantHeader for C constructor + def __cinit__(self): + self.ptr = NULL + + # Python constructor + def __init__(self): + self.ptr = bam_hdr_init() + if self.ptr is NULL: + raise MemoryError("could not create header") + + @classmethod + def _from_text_and_lengths(cls, text, reference_names, reference_lengths): + + cdef AlignmentHeader self = AlignmentHeader() + cdef char *ctext + cdef int l_text + cdef int n, x + if text is not None: + btext = force_bytes(text) + ctext = btext + l_text = len(btext) + self.ptr.text = <char*>calloc(l_text + 1, sizeof(char)) + if self.ptr.text == NULL: + raise MemoryError("could not allocate {} bytes".format(l_text + 1), sizeof(char)) + self.ptr.l_text = l_text + memcpy(self.ptr.text, ctext, l_text + 1) + + if reference_names and reference_lengths: + reference_names = [force_bytes(ref) for ref in reference_names] + + self.ptr.n_targets = len(reference_names) + + n = sum([len(reference_names) + 1]) + self.ptr.target_name = <char**>calloc(n, sizeof(char*)) + if self.ptr.target_name == NULL: + raise MemoryError("could not allocate {} bytes".format(n, sizeof(char *))) + + self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t)) + if self.ptr.target_len == NULL: + raise MemoryError("could not allocate {} bytes".format(n, sizeof(uint32_t))) + + for x from 0 <= x < self.ptr.n_targets: + self.ptr.target_len[x] = reference_lengths[x] + name = reference_names[x] + self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char)) + if self.ptr.target_name[x] == NULL: + raise MemoryError("could not allocate {} bytes".format(len(name) + 1, sizeof(char))) + strncpy(self.ptr.target_name[x], name, len(name)) + + return self + + @classmethod + def from_text(cls, text): + + reference_names, reference_lengths = [], [] + for line in text.splitlines(): + if line.startswith("@SQ"): + fields = dict([x.split(":", 1) for x in line.split("\t")[1:]]) + try: + reference_names.append(fields["SN"]) + 
reference_lengths.append(int(fields["LN"])) + except KeyError: + raise KeyError("incomplete sequence information in '%s'" % str(fields)) + except ValueError: + raise ValueError("wrong sequence information in '%s'" % str(fields)) + + return cls._from_text_and_lengths(text, reference_names, reference_lengths) + + @classmethod + def from_dict(cls, header_dict): + + cdef list lines = [] + # first: defined tags + for record in VALID_HEADERS: + if record in header_dict: + data = header_dict[record] + if not isinstance(data, VALID_HEADER_TYPES[record]): + raise ValueError( + "invalid type for record %s: %s, expected %s".format( + record, type(data), VALID_HEADER_TYPES[record])) + if isinstance(data, Mapping): + lines.append(build_header_line(data, record)) + else: + for fields in header_dict[record]: + lines.append(build_header_line(fields, record)) + + # then: user tags (lower case), sorted alphabetically + for record, data in sorted(header_dict.items()): + if record in VALID_HEADERS: + continue + if isinstance(data, Mapping): + lines.append(build_header_line(data, record)) + else: + for fields in header_dict[record]: + lines.append(build_header_line(fields, record)) + + text = "\n".join(lines) + "\n" + + reference_names, reference_lengths = [], [] + if "SQ" in header_dict: + for fields in header_dict["SQ"]: + try: + reference_names.append(fields["SN"]) + reference_lengths.append(fields["LN"]) + except KeyError: + raise KeyError("incomplete sequence information in '%s'" % str(fields)) + + return cls._from_text_and_lengths(text, reference_names, reference_lengths) + + @classmethod + def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True): + + if len(reference_names) != len(reference_lengths): + raise ValueError("number of reference names and lengths do not match") + + # optionally, if there is no text, add a SAM compatible header to output file. 
+ if text is None and add_sq_text: + text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip( + reference_names, reference_lengths)]) + + return cls._from_text_and_lengths(text, reference_names, reference_lengths) + + def __dealloc__(self): + bam_hdr_destroy(self.ptr) + self.ptr = NULL + + def __bool__(self): + return self.ptr != NULL + + def copy(self): + return makeAlignmentHeader(bam_hdr_dup(self.ptr)) + + property nreferences: + """int with the number of :term:`reference` sequences in the file. + + This is a read-only attribute.""" + def __get__(self): + return self.ptr.n_targets + + property references: + """tuple with the names of :term:`reference` sequences. This is a + read-only attribute""" + def __get__(self): + t = [] + cdef int x + for x in range(self.ptr.n_targets): + t.append(charptr_to_str(self.ptr.target_name[x])) + return tuple(t) + + property lengths: + """tuple of the lengths of the :term:`reference` sequences. This is a + read-only attribute. The lengths are in the same order as + :attr:`pysam.AlignmentFile.references` + """ + def __get__(self): + t = [] + cdef int x + for x in range(self.ptr.n_targets): + t.append(self.ptr.target_len[x]) + return tuple(t) + + def _build_sequence_section(self): + """return sequence section of header. + + The sequence section is built from the list of reference names and + lengths stored in the BAM-file and not from any @SQ entries that + are part of the header's text section. + """ + + cdef int x + text = [] + for x in range(self.ptr.n_targets): + text.append("@SQ\tSN:{}\tLN:{}\n".format( + force_str(self.ptr.target_name[x]), + self.ptr.target_len[x])) + return "".join(text) + + def to_dict(self): + """return two-level dictionary with header information from the file. + + The first level contains the record (``HD``, ``SQ``, etc) and + the second level contains the fields (``VN``, ``LN``, etc). 
+ + The parser is validating and will raise an AssertionError if + if encounters any record or field tags that are not part of + the SAM specification. Use the + :attr:`pysam.AlignmentFile.text` attribute to get the unparsed + header. + + The parsing follows the SAM format specification with the + exception of the ``CL`` field. This option will consume the + rest of a header line irrespective of any additional fields. + This behaviour has been added to accommodate command line + options that contain characters that are not valid field + separators. + + If no @SQ entries are within the text section of the header, + this will be automatically added from the reference names and + lengths stored in the binary part of the header. + """ + result = collections.OrderedDict() + + # convert to python string + t = self.__str__() + for line in t.split("\n"): + line = line.strip(' \0') + if not line: + continue + assert line.startswith("@"), \ + "header line without '@': '%s'" % line + fields = line[1:].split("\t") + record = fields[0] + assert record in VALID_HEADER_TYPES, \ + "header line with invalid type '%s': '%s'" % (record, line) + + # treat comments + if record == "CO": + if record not in result: + result[record] = [] + result[record].append("\t".join( fields[1:])) + continue + # the following is clumsy as generators do not work? + x = {} + + for idx, field in enumerate(fields[1:]): + if ":" not in field: + raise ValueError("malformatted header: no ':' in field" ) + key, value = field.split(":", 1) + if key in ("CL",): + # special treatment for command line + # statements (CL). These might contain + # characters that are non-conformant with + # the valid field separators in the SAM + # header. Thus, in contravention to the + # SAM API, consume the rest of the line. 
+ key, value = "\t".join(fields[idx+1:]).split(":", 1) + x[key] = KNOWN_HEADER_FIELDS[record][key](value) + break + + # interpret type of known header record tags, default to str + x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value) + + if VALID_HEADER_TYPES[record] == Mapping: + if record in result: + raise ValueError( + "multiple '%s' lines are not permitted" % record) + + result[record] = x + elif VALID_HEADER_TYPES[record] == Sequence: + if record not in result: result[record] = [] + result[record].append(x) + + # if there are no SQ lines in the header, add the + # reference names from the information in the bam + # file. + # + # Background: c-samtools keeps the textual part of the + # header separate from the list of reference names and + # lengths. Thus, if a header contains only SQ lines, + # the SQ information is not part of the textual header + # and thus are missing from the output. See issue 84. + if "SQ" not in result: + sq = [] + for ref, length in zip(self.references, self.lengths): + sq.append({'LN': length, 'SN': ref }) + result["SQ"] = sq + + return result + + def as_dict(self): + """deprecated, use :meth:`to_dict()` instead""" + return self.to_dict() + + def get_reference_name(self, tid): + if tid == -1: + return None + if not 0 <= tid < self.ptr.n_targets: + raise ValueError("reference_id %i out of range 0<=tid<%i" % + (tid, self.ptr.n_targets)) + return charptr_to_str(self.ptr.target_name[tid]) + + def get_reference_length(self, reference): + cdef int tid = self.get_tid(reference) + if tid < 0: + raise KeyError("unknown reference {}".format(reference)) + else: + return self.ptr.target_len[tid] + + def is_valid_tid(self, int tid): + """ + return True if the numerical :term:`tid` is valid; False otherwise. + + Note that the unmapped tid code (-1) counts as an invalid. 
+ """ + return 0 <= tid < self.ptr.n_targets + + def get_tid(self, reference): + """ + return the numerical :term:`tid` corresponding to + :term:`reference` + + returns -1 if reference is not known. + """ + reference = force_bytes(reference) + tid = bam_name2id(self.ptr, reference) + if tid < -1: + raise ValueError('could not parse header') + return tid + + def __str__(self): + '''string with the full contents of the :term:`sam file` header as a + string. + + If no @SQ entries are within the text section of the header, + this will be automatically added from the reference names and + lengths stored in the binary part of the header. + + See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed + representation of the header. + ''' + text = from_string_and_size(self.ptr.text, self.ptr.l_text) + if "@SQ" not in text: + text += "\n" + self._build_sequence_section() + return text + + # dictionary access methods, for backwards compatibility. + def __setitem__(self, key, value): + raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict()") + + def __getitem__(self, key): + return self.to_dict().__getitem__(key) + + def items(self): + return self.to_dict().items() + + # PY2 compatibility + def iteritems(self): + return self.to_dict().items() + + def keys(self): + return self.to_dict().keys() + + def values(self): + return self.to_dict().values() + + def get(self, *args): + return self.to_dict().get(*args) + + def __len__(self): + return self.to_dict().__len__() + + def __contains__(self, key): + return self.to_dict().__contains__(key) + + +cdef class AlignmentFile(HTSFile): + """AlignmentFile(filepath_or_object, mode=None, template=None, + reference_names=None, reference_lengths=None, text=NULL, + header=None, add_sq_text=False, check_header=True, check_sq=True, + reference_filename=None, filename=None, index_filename=None, + filepath_index=None, require_index=False, duplicate_filehandle=True, + ignore_truncation=False, threads=1) + + A 
:term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file. + + If `filepath_or_object` is a string, the file is automatically + opened. If `filepath_or_object` is a python File object, the + already opened file will be used. + + If the file is opened for reading and an index exists (if file is BAM, a + .bai file or if CRAM a .crai file), it will be opened automatically. + `index_filename` may be specified explicitly. If the index is not named + in the standard manner, not located in the same directory as the + BAM/CRAM file, or is remote. Without an index, random access via + :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup` + is disabled. + + For writing, the header of a :term:`SAM` file/:term:`BAM` file can + be constituted from several sources (see also the samtools format + specification): + + 1. If `template` is given, the header is copied from another + `AlignmentFile` (`template` must be a + :class:`~pysam.AlignmentFile`). + + 2. If `header` is given, the header is built from a + multi-level dictionary. + + 3. If `text` is given, new header text is copied from raw + text. + + 4. The names (`reference_names`) and lengths + (`reference_lengths`) are supplied directly as lists. + + When reading or writing a CRAM file, the filename of a FASTA-formatted + reference can be specified with `reference_filename`. + + By default, if a file is opened in mode 'r', it is checked + for a valid header (`check_header` = True) and a definition of + chromosome names (`check_sq` = True). + + Parameters + ---------- + mode : string + `mode` should be ``r`` for reading or ``w`` for writing. The + default is text mode (:term:`SAM`). For binary (:term:`BAM`) + I/O you should append ``b`` for compressed or ``u`` for + uncompressed :term:`BAM` output. Use ``h`` to output header + information in text (:term:`TAM`) mode. Use ``c`` for + :term:`CRAM` formatted files. + + If ``b`` is present, it must immediately follow ``r`` or + ``w``. 
Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, + ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a + :term:`BAM` formatted file for reading, type:: + + f = pysam.AlignmentFile('ex1.bam','rb') + + If mode is not specified, the method will try to auto-detect + in the order 'rb', 'r', thus both the following should work:: + + f1 = pysam.AlignmentFile('ex1.bam') + f2 = pysam.AlignmentFile('ex1.sam') + + template : AlignmentFile + when writing, copy header from file `template`. + + header : dict or AlignmentHeader + when writing, build header from a multi-level dictionary. The + first level are the four types ('HD', 'SQ', ...). The second + level are a list of lines, with each line being a list of + tag-value pairs. The header is constructed first from all the + defined fields, followed by user tags in alphabetical + order. Alternatively, an :class:`~pysam.AlignmentHeader` + object can be passed directly. + + text : string + when writing, use the string provided as the header + + reference_names : list + see reference_lengths + + reference_lengths : list + when writing or opening a SAM file without header build header + from list of chromosome names and lengths. By default, 'SQ' + and 'LN' tags will be added to the header text. This option + can be changed by unsetting the flag `add_sq_text`. + + add_sq_text : bool + do not add 'SQ' and 'LN' tags to header. This option permits + construction :term:`SAM` formatted files without a header. + + add_sam_header : bool + when outputting SAM the default is to output a header. This is + equivalent to opening the file in 'wh' mode. If this option is + set to False, no header will be output. To read such a file, + set `check_header=False`. + + check_header : bool + obsolete: when reading a SAM file, check if header is present + (default=True) + + check_sq : bool + when reading, check if SQ entries are present in header + (default=True) + + reference_filename : string + Path to a FASTA-formatted reference file. 
Valid only for CRAM files. + When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL + specified in the header (``UR`` tag), which are normally used to find + the reference. + + index_filename : string + Explicit path to the index file. Only needed if the index is not + named in the standard manner, not located in the same directory as + the BAM/CRAM file, or is remote. An IOError is raised if the index + cannot be found or is invalid. + + filepath_index : string + Alias for `index_filename`. + + require_index : bool + When reading, require that an index file is present and is valid or + raise an IOError. (default=False) + + filename : string + Alternative to filepath_or_object. Filename of the file + to be opened. + + duplicate_filehandle: bool + By default, file handles passed either directly or through + File-like objects will be duplicated before passing them to + htslib. The duplication prevents issues where the same stream + will be closed by htslib and through destruction of the + high-level python object. Set to False to turn off + duplication. + + ignore_truncation: bool + Issue a warning, instead of raising an error if the current file + appears to be truncated due to a missing EOF marker. Only applies + to bgzipped formats. (Default=False) + + format_options: list + A list of key=value strings, as accepted by --input-fmt-option and + --output-fmt-option in samtools. + threads: integer + Number of threads to use for compressing/decompressing BAM/CRAM files. + Setting threads to > 1 cannot be combined with `ignore_truncation`. 
+ (Default=1) + """ + + def __cinit__(self, *args, **kwargs): + self.htsfile = NULL + self.filename = None + self.mode = None + self.threads = 1 + self.is_stream = False + self.is_remote = False + self.index = NULL + + if "filename" in kwargs: + args = [kwargs["filename"]] + del kwargs["filename"] + + self._open(*args, **kwargs) + + # allocate memory for iterator + self.b = <bam1_t*>calloc(1, sizeof(bam1_t)) + if self.b == NULL: + raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t))) + + def has_index(self): + """return true if htsfile has an existing (and opened) index. + """ + return self.index != NULL + + def check_index(self): + """return True if index is present. + + Raises + ------ + + AttributeError + if htsfile is :term:`SAM` formatted and thus has no index. + + ValueError + if htsfile is closed or index could not be opened. + """ + + if not self.is_open: + raise ValueError("I/O operation on closed file") + if not self.is_bam and not self.is_cram: + raise AttributeError( + "AlignmentFile.mapped only available in bam files") + if self.index == NULL: + raise ValueError( + "mapping information not recorded in index " + "or index not available") + return True + + def _open(self, + filepath_or_object, + mode=None, + AlignmentFile template=None, + reference_names=None, + reference_lengths=None, + reference_filename=None, + text=None, + header=None, + port=None, + add_sq_text=True, + add_sam_header=True, + check_header=True, + check_sq=True, + index_filename=None, + filepath_index=None, + require_index=False, + referencenames=None, + referencelengths=None, + duplicate_filehandle=True, + ignore_truncation=False, + format_options=None, + threads=1): + '''open a sam, bam or cram formatted file. + + If _open is called on an existing file, the current file + will be closed and a new file will be opened. 
+ + ''' + cdef char *cfilename = NULL + cdef char *creference_filename = NULL + cdef char *cindexname = NULL + cdef char *cmode = NULL + cdef bam_hdr_t * hdr = NULL + + if threads > 1 and ignore_truncation: + # This won't raise errors if reaching a truncated alignment, + # because bgzf_mt_reader in htslib does not deal with + # bgzf_mt_read_block returning non-zero values, contrary + # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888) + # Better to avoid this (for now) than to produce seemingly correct results. + raise ValueError('Cannot add extra threads when "ignore_truncation" is True') + self.threads = threads + + # for backwards compatibility: + if referencenames is not None: + reference_names = referencenames + if referencelengths is not None: + reference_lengths = referencelengths + + # close a previously opened file + if self.is_open: + self.close() + + # autodetection for read + if mode is None: + mode = "r" + + if add_sam_header and mode == "w": + mode = "wh" + + assert mode in ("r", "w", "rb", "wb", "wh", + "wbu", "rU", "wb0", + "rc", "wc"), \ + "invalid file opening mode `%s`" % mode + + self.duplicate_filehandle = duplicate_filehandle + + # StringIO not supported + if isinstance(filepath_or_object, StringIO): + raise NotImplementedError( + "access from StringIO objects not supported") + # reading from a file descriptor + elif isinstance(filepath_or_object, int): + self.filename = filepath_or_object + filename = None + self.is_remote = False + self.is_stream = True + # reading from a File object or other object with fileno + elif hasattr(filepath_or_object, "fileno"): + if filepath_or_object.closed: + raise ValueError('I/O operation on closed file') + self.filename = filepath_or_object + # .name can be TextIOWrapper + try: + filename = encode_filename(str(filepath_or_object.name)) + cfilename = filename + except AttributeError: + filename = None + self.is_remote = False + self.is_stream = True + # what remains is a filename + else: 
+ self.filename = filename = encode_filename(filepath_or_object) + cfilename = filename + self.is_remote = hisremote(cfilename) + self.is_stream = self.filename == b'-' + + # for htslib, wbu seems to not work + if mode == "wbu": + mode = "wb0" + + self.mode = force_bytes(mode) + self.reference_filename = reference_filename = encode_filename( + reference_filename) + + if mode[0] == 'w': + # open file for writing + + if not (template or header or text or (reference_names and reference_lengths)): + raise ValueError( + "either supply options `template`, `header`, `text` or both `reference_names` " + "and `reference_lengths` for writing") + + if template: + # header is copied, though at the moment not strictly + # necessary as AlignmentHeader is immutable. + self.header = template.header.copy() + elif isinstance(header, AlignmentHeader): + self.header = header.copy() + elif isinstance(header, Mapping): + self.header = AlignmentHeader.from_dict(header) + elif reference_names and reference_lengths: + self.header = AlignmentHeader.from_references( + reference_names, + reference_lengths, + add_sq_text=add_sq_text, + text=text) + elif text: + self.header = AlignmentHeader.from_text(text) + else: + raise ValueError("not enough information to construct header. Please provide template, " + "header, text or reference_names/reference_lengths") + self.htsfile = self._open_htsfile() + + if self.htsfile == NULL: + if errno: + raise IOError(errno, "could not open alignment file `{}`: {}".format( + force_str(filename), + force_str(strerror(errno)))) + else: + raise ValueError("could not open alignment file `{}`".format(force_str(filename))) + if format_options and len(format_options): + self.add_hts_options(format_options) + # set filename with reference sequences. 
If no filename + # is given, the CRAM reference arrays will be built from + # the @SQ header in the header + if "c" in mode and reference_filename: + if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0): + raise ValueError("failure when setting reference filename") + + # write header to htsfile + if "b" in mode or "c" in mode or "h" in mode: + hdr = self.header.ptr + with nogil: + sam_hdr_write(self.htsfile, hdr) + + elif mode[0] == "r": + # open file for reading + self.htsfile = self._open_htsfile() + + if self.htsfile == NULL: + if errno: + raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename), + force_str(strerror(errno)))) + else: + raise ValueError("could not open alignment file `{}`".format(force_str(filename))) + + if self.htsfile.format.category != sequence_data: + raise ValueError("file does not contain alignment data") + + if format_options and len(format_options): + self.add_hts_options(format_options) + + self.check_truncation(ignore_truncation) + + # bam/cram files require a valid header + if self.is_bam or self.is_cram: + with nogil: + hdr = sam_hdr_read(self.htsfile) + if hdr == NULL: + raise ValueError( + "file does not have a valid header (mode='%s') " + "- is it BAM/CRAM format?" % mode) + self.header = makeAlignmentHeader(hdr) + else: + # in sam files a header is optional. If not given, + # user may provide reference names and lengths to built + # an on-the-fly header. + if reference_names and reference_lengths: + # build header from a target names and lengths + self.header = AlignmentHeader.from_references( + reference_names=reference_names, + reference_lengths=reference_lengths, + add_sq_text=add_sq_text, + text=text) + else: + with nogil: + hdr = sam_hdr_read(self.htsfile) + if hdr == NULL: + raise ValueError( + "SAM? 
file does not have a valid header (mode='%s'), " + "please provide reference_names and reference_lengths") + self.header = makeAlignmentHeader(hdr) + + # set filename with reference sequences + if self.is_cram and reference_filename: + creference_filename = self.reference_filename + hts_set_opt(self.htsfile, + CRAM_OPT_REFERENCE, + creference_filename) + + if check_sq and self.header.nreferences == 0: + raise ValueError( + ("file has no sequences defined (mode='%s') - " + "is it SAM/BAM format? Consider opening with " + "check_sq=False") % mode) + + if self.is_bam or self.is_cram: + self.index_filename = index_filename or filepath_index + if self.index_filename: + cindexname = bfile_name = encode_filename(self.index_filename) + + if cfilename or cindexname: + with nogil: + self.index = sam_index_load3(self.htsfile, cfilename, cindexname, + HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL) + + if not self.index and (cindexname or require_index): + if errno: + raise IOError(errno, force_str(strerror(errno))) + else: + raise IOError('unable to open index file `%s`' % self.index_filename) + + elif require_index: + raise IOError('unable to open index file') + + # save start of data section + if not self.is_stream: + self.start_offset = self.tell() + + def fetch(self, + contig=None, + start=None, + stop=None, + region=None, + tid=None, + until_eof=False, + multiple_iterators=False, + reference=None, + end=None): + """fetch reads aligned in a :term:`region`. + + See :meth:`~pysam.HTSFile.parse_region` for more information + on how genomic regions can be specified. :term:`reference` and + `end` are also accepted for backward compatibility as synonyms + for :term:`contig` and `stop`, respectively. + + Without a `contig` or `region` all mapped reads in the file + will be fetched. The reads will be returned ordered by reference + sequence, which will not necessarily be the order within the + file. This mode of iteration still requires an index. 
If there is + no index, use `until_eof=True`. + + If only `contig` is set, all reads aligned to `contig` + will be fetched. + + A :term:`SAM` file does not allow random access. If `region` + or `contig` are given, an exception is raised. + + Parameters + ---------- + + until_eof : bool + + If `until_eof` is True, all reads from the current file + position will be returned in order as they are within the + file. Using this option will also fetch unmapped reads. + + multiple_iterators : bool + + If `multiple_iterators` is True, multiple + iterators on the same file can be used at the same time. The + iterator returned will receive its own copy of a filehandle to + the file effectively re-opening the file. Re-opening a file + creates some overhead, so beware. + + Returns + ------- + + An iterator over a collection of reads. : IteratorRow + + Raises + ------ + + ValueError + if the genomic coordinates are out of range or invalid or the + file does not permit random access to genomic coordinates. + + """ + cdef int rtid, rstart, rstop, has_coord + + if not self.is_open: + raise ValueError( "I/O operation on closed file" ) + + has_coord, rtid, rstart, rstop = self.parse_region( + contig, start, stop, region, tid, + end=end, reference=reference) + + # Turn of re-opening if htsfile is a stream + if self.is_stream: + multiple_iterators = False + + if self.is_bam or self.is_cram: + if not until_eof and not self.is_remote: + if not self.has_index(): + raise ValueError( + "fetch called on bamfile without index") + + if has_coord: + return IteratorRowRegion( + self, rtid, rstart, rstop, + multiple_iterators=multiple_iterators) + else: + if until_eof: + return IteratorRowAll( + self, + multiple_iterators=multiple_iterators) + else: + # AH: check - reason why no multiple_iterators for + # AllRefs? 
+ return IteratorRowAllRefs( + self, + multiple_iterators=multiple_iterators) + else: + if has_coord: + raise ValueError( + "fetching by region is not available for SAM files") + + if multiple_iterators == True: + raise ValueError( + "multiple iterators not implemented for SAM files") + + return IteratorRowAll(self, + multiple_iterators=multiple_iterators) + + def head(self, n, multiple_iterators=True): + '''return an iterator over the first n alignments. + + This iterator is is useful for inspecting the bam-file. + + Parameters + ---------- + + multiple_iterators : bool + + is set to True by default in order to + avoid changing the current file position. + + Returns + ------- + + an iterator over a collection of reads : IteratorRowHead + + ''' + return IteratorRowHead(self, n, + multiple_iterators=multiple_iterators) + + def mate(self, AlignedSegment read): + '''return the mate of :class:`pysam.AlignedSegment` `read`. + + .. note:: + + Calling this method will change the file position. + This might interfere with any iterators that have + not re-opened the file. + + .. note:: + + This method is too slow for high-throughput processing. + If a read needs to be processed with its mate, work + from a read name sorted file or, better, cache reads. + + Returns + ------- + + the mate : AlignedSegment + + Raises + ------ + + ValueError + if the read is unpaired or the mate is unmapped + + ''' + cdef uint32_t flag = read._delegate.core.flag + + if flag & BAM_FPAIRED == 0: + raise ValueError("read %s: is unpaired" % + (read.query_name)) + if flag & BAM_FMUNMAP != 0: + raise ValueError("mate %s: is unmapped" % + (read.query_name)) + + # xor flags to get the other mate + cdef int x = BAM_FREAD1 + BAM_FREAD2 + flag = (flag ^ x) & x + + # Make sure to use a separate file to jump around + # to mate as otherwise the original file position + # will be lost + # The following code is not using the C API and + # could thus be made much quicker, for example + # by using tell and seek. 
+ for mate in self.fetch( + read._delegate.core.mpos, + read._delegate.core.mpos + 1, + tid=read._delegate.core.mtid, + multiple_iterators=True): + if mate.flag & flag != 0 and \ + mate.query_name == read.query_name: + break + else: + raise ValueError("mate not found") + + return mate + + def pileup(self, + contig=None, + start=None, + stop=None, + region=None, + reference=None, + end=None, + **kwargs): + """perform a :term:`pileup` within a :term:`region`. The region is + specified by :term:`contig`, `start` and `stop` (using + 0-based indexing). :term:`reference` and `end` are also accepted for + backward compatibility as synonyms for :term:`contig` and `stop`, + respectively. Alternatively, a samtools 'region' string + can be supplied. + + Without 'contig' or 'region' all reads will be used for the + pileup. The reads will be returned ordered by + :term:`contig` sequence, which will not necessarily be the + order within the file. + + Note that :term:`SAM` formatted files do not allow random + access. In these files, if a 'region' or 'contig' are + given an exception is raised. + + .. note:: + + 'all' reads which overlap the region are returned. The + first base returned will be the first base of the first + read 'not' necessarily the first base of the region used + in the query. + + Parameters + ---------- + + truncate : bool + + By default, the samtools pileup engine outputs all reads + overlapping a region. If truncate is True and a region is + given, only columns in the exact region specified are + returned. + + max_depth : int + Maximum read depth permitted. The default limit is '8000'. + + stepper : string + The stepper controls how the iterator advances. + Possible options for the stepper are + + ``all`` + skip reads in which any of the following flags are set: + BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP + + ``nofilter`` + uses every single read turning off any filtering. + + ``samtools`` + same filter and read processing as in samtools + pileup. 
For full compatibility, this requires a + 'fastafile' to be given. The following options all pertain + to filtering of the ``samtools`` stepper. + + fastafile : :class:`~pysam.FastaFile` object. + + This is required for some of the steppers. + + ignore_overlaps: bool + + If set to True, detect if read pairs overlap and only take + the higher quality base. This is the default. + + flag_filter : int + + ignore reads where any of the bits in the flag are set. The default is + BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP. + + flag_require : int + + only use reads where certain flags are set. The default is 0. + + ignore_orphans: bool + + ignore orphans (paired reads that are not in a proper pair). + The default is to ignore orphans. + + min_base_quality: int + + Minimum base quality. Bases below the minimum quality will + not be output. The default is 13. + + adjust_capq_threshold: int + + adjust mapping quality. The default is 0 for no + adjustment. The recommended value for adjustment is 50. + + min_mapping_quality : int + + only use reads above a minimum mapping quality. The default is 0. + + compute_baq: bool + + re-alignment computing per-Base Alignment Qualities (BAQ). The + default is to do re-alignment. Realignment requires a reference + sequence. If none is present, no realignment will be performed. + + redo_baq: bool + + recompute per-Base Alignment Quality on the fly ignoring + existing base qualities. The default is False (use existing + base qualities). + + Returns + ------- + + an iterator over genomic positions. 
: IteratorColumn + + """ + cdef int rtid, has_coord + cdef int32_t rstart, rstop + + if not self.is_open: + raise ValueError("I/O operation on closed file") + + has_coord, rtid, rstart, rstop = self.parse_region( + contig, start, stop, region, reference=reference, end=end) + + if has_coord: + if not self.has_index(): + raise ValueError("no index available for pileup") + + return IteratorColumnRegion(self, + tid=rtid, + start=rstart, + stop=rstop, + **kwargs) + else: + if self.has_index(): + return IteratorColumnAllRefs(self, **kwargs) + else: + return IteratorColumnAll(self, **kwargs) + + def count(self, + contig=None, + start=None, + stop=None, + region=None, + until_eof=False, + read_callback="nofilter", + reference=None, + end=None): + '''count the number of reads in :term:`region` + + The region is specified by :term:`contig`, `start` and `stop`. + :term:`reference` and `end` are also accepted for backward + compatibility as synonyms for :term:`contig` and `stop`, + respectively. Alternatively, a `samtools`_ :term:`region` + string can be supplied. + + A :term:`SAM` file does not allow random access and if + `region` or `contig` are given, an exception is raised. + + Parameters + ---------- + + contig : string + reference_name of the genomic region (chromosome) + + start : int + start of the genomic region (0-based inclusive) + + stop : int + end of the genomic region (0-based exclusive) + + region : string + a region string in samtools format. + + until_eof : bool + count until the end of the file, possibly including + unmapped reads as well. + + read_callback: string or function + + select a call-back to ignore reads when counting. 
It can + be either a string with the following values: + + ``all`` + skip reads in which any of the following + flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, + BAM_FDUP + + ``nofilter`` + uses every single read + + Alternatively, `read_callback` can be a function + ``check_read(read)`` that should return True only for + those reads that shall be included in the counting. + + reference : string + backward compatible synonym for `contig` + + end : int + backward compatible synonym for `stop` + + Raises + ------ + + ValueError + if the genomic coordinates are out of range or invalid. + + ''' + cdef AlignedSegment read + cdef long counter = 0 + + if not self.is_open: + raise ValueError("I/O operation on closed file") + + cdef int filter_method = 0 + if read_callback == "all": + filter_method = 1 + elif read_callback == "nofilter": + filter_method = 2 + + for read in self.fetch(contig=contig, + start=start, + stop=stop, + reference=reference, + end=end, + region=region, + until_eof=until_eof): + # apply filter + if filter_method == 1: + # filter = "all" + if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)): + continue + elif filter_method == 2: + # filter = "nofilter" + pass + else: + if not read_callback(read): + continue + counter += 1 + + return counter + + @cython.boundscheck(False) # we do manual bounds checking + def count_coverage(self, + contig, + start=None, + stop=None, + region=None, + quality_threshold=15, + read_callback='all', + reference=None, + end=None): + """count the coverage of genomic positions by reads in :term:`region`. + + The region is specified by :term:`contig`, `start` and `stop`. + :term:`reference` and `end` are also accepted for backward + compatibility as synonyms for :term:`contig` and `stop`, + respectively. Alternatively, a `samtools`_ :term:`region` + string can be supplied. The coverage is computed per-base [ACGT]. 
+ + Parameters + ---------- + + contig : string + reference_name of the genomic region (chromosome) + + start : int + start of the genomic region (0-based inclusive). If not + given, count from the start of the chromosome. + + stop : int + end of the genomic region (0-based exclusive). If not given, + count to the end of the chromosome. + + region : string + a region string. + + quality_threshold : int + quality_threshold is the minimum quality score (in phred) a + base has to reach to be counted. + + read_callback: string or function + + select a call-back to ignore reads when counting. It can + be either a string with the following values: + + ``all`` + skip reads in which any of the following + flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, + BAM_FDUP + + ``nofilter`` + uses every single read + + Alternatively, `read_callback` can be a function + ``check_read(read)`` that should return True only for + those reads that shall be included in the counting. + + reference : string + backward compatible synonym for `contig` + + end : int + backward compatible synonym for `stop` + + Raises + ------ + + ValueError + if the genomic coordinates are out of range or invalid. 
+ + Returns + ------- + + four array.arrays of the same length in order A C G T : tuple + + """ + + cdef uint32_t contig_length = self.get_reference_length(contig) + cdef int _start = start if start is not None else 0 + cdef int _stop = stop if stop is not None else contig_length + _stop = _stop if _stop < contig_length else contig_length + + if _stop == _start: + raise ValueError("interval of size 0") + if _stop < _start: + raise ValueError("interval of size less than 0") + + cdef int length = _stop - _start + cdef c_array.array int_array_template = array.array('L', []) + cdef c_array.array count_a + cdef c_array.array count_c + cdef c_array.array count_g + cdef c_array.array count_t + count_a = c_array.clone(int_array_template, length, zero=True) + count_c = c_array.clone(int_array_template, length, zero=True) + count_g = c_array.clone(int_array_template, length, zero=True) + count_t = c_array.clone(int_array_template, length, zero=True) + + cdef AlignedSegment read + cdef cython.str seq + cdef c_array.array quality + cdef int qpos + cdef int refpos + cdef int c = 0 + cdef int filter_method = 0 + + + if read_callback == "all": + filter_method = 1 + elif read_callback == "nofilter": + filter_method = 2 + + cdef int _threshold = quality_threshold or 0 + for read in self.fetch(contig=contig, + reference=reference, + start=start, + stop=stop, + end=end, + region=region): + # apply filter + if filter_method == 1: + # filter = "all" + if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)): + continue + elif filter_method == 2: + # filter = "nofilter" + pass + else: + if not read_callback(read): + continue + + # count + seq = read.seq + if seq is None: + continue + quality = read.query_qualities + + for qpos, refpos in read.get_aligned_pairs(True): + if qpos is not None and refpos is not None and \ + _start <= refpos < _stop: + + # only check base quality if _threshold > 0 + if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold: + if seq[qpos] == 'A': + 
count_a.data.as_ulongs[refpos - _start] += 1 + if seq[qpos] == 'C': + count_c.data.as_ulongs[refpos - _start] += 1 + if seq[qpos] == 'G': + count_g.data.as_ulongs[refpos - _start] += 1 + if seq[qpos] == 'T': + count_t.data.as_ulongs[refpos - _start] += 1 + + return count_a, count_c, count_g, count_t + + def find_introns_slow(self, read_iterator): + """Return a dictionary {(start, stop): count} + Listing the intronic sites in the reads (identified by 'N' in the cigar strings), + and their support ( = number of reads ). + + read_iterator can be the result of a .fetch(...) call. + Or it can be a generator filtering such reads. Example + samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) + """ + res = collections.Counter() + for r in read_iterator: + if 'N' in r.cigarstring: + last_read_pos = False + for read_loc, genome_loc in r.get_aligned_pairs(): + if read_loc is None and last_read_pos: + start = genome_loc + elif read_loc and last_read_pos is None: + stop = genome_loc # we are right exclusive ,so this is correct + res[(start, stop)] += 1 + del start + del stop + last_read_pos = read_loc + return res + + def find_introns(self, read_iterator): + """Return a dictionary {(start, stop): count} + Listing the intronic sites in the reads (identified by 'N' in the cigar strings), + and their support ( = number of reads ). + + read_iterator can be the result of a .fetch(...) call. + Or it can be a generator filtering such reads. Example + samfile.find_introns((read for read in samfile.fetch(...) 
if read.is_reverse) + """ + cdef: + uint32_t base_position, junc_start, nt + int op + AlignedSegment r + int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP + + res = collections.Counter() + + match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position + for r in read_iterator: + base_position = r.pos + cigar = r.cigartuples + if cigar is None: + continue + + for op, nt in cigar: + if op in match_or_deletion: + base_position += nt + elif op == BAM_CREF_SKIP: + junc_start = base_position + base_position += nt + res[(junc_start, base_position)] += 1 + return res + + + def close(self): + '''closes the :class:`pysam.AlignmentFile`.''' + + if self.htsfile == NULL: + return + + if self.index != NULL: + hts_idx_destroy(self.index) + self.index = NULL + + cdef int ret = hts_close(self.htsfile) + self.htsfile = NULL + + self.header = None + + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise IOError(errno, force_str(strerror(errno))) + + def __dealloc__(self): + cdef int ret = 0 + + if self.index != NULL: + hts_idx_destroy(self.index) + self.index = NULL + + if self.htsfile != NULL: + ret = hts_close(self.htsfile) + self.htsfile = NULL + + self.header = None + + if self.b: + bam_destroy1(self.b) + self.b = NULL + + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise IOError(errno, force_str(strerror(errno))) + + cpdef int write(self, AlignedSegment read) except -1: + ''' + write a single :class:`pysam.AlignedSegment` to disk. + + Raises: + ValueError + if the writing failed + + Returns: + int : + the number of bytes written. If the file is closed, + this will be 0. 
+ ''' + if not self.is_open: + return 0 + + if self.header.ptr.n_targets <= read._delegate.core.tid: + raise ValueError( + "AlignedSegment refers to reference number {} that " + "is larger than the number of references ({}) in the header".format( + read._delegate.core.tid, self.header.ptr.n_targets)) + + cdef int ret + with nogil: + ret = sam_write1(self.htsfile, + self.header.ptr, + read._delegate) + + # kbj: Still need to raise an exception with except -1. Otherwise + # when ret == -1 we get a "SystemError: error return without + # exception set". + if ret < 0: + raise IOError( + "sam_write1 failed with error code {}".format(ret)) + + return ret + + # context manager interface + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + return False + + ############################################################### + ############################################################### + ############################################################### + ## properties + ############################################################### + property mapped: + """int with total number of mapped alignments according to the + statistics recorded in the index. This is a read-only + attribute. + (This will be 0 for a CRAM file indexed by a .crai index, as that + index format does not record these statistics.) + """ + def __get__(self): + self.check_index() + cdef int tid + cdef uint64_t total = 0 + cdef uint64_t mapped, unmapped + for tid from 0 <= tid < self.header.nreferences: + with nogil: + hts_idx_get_stat(self.index, tid, &mapped, &unmapped) + total += mapped + return total + + property unmapped: + """int with total number of unmapped reads according to the statistics + recorded in the index. This number of reads includes the number of reads + without coordinates. This is a read-only attribute. + (This will be 0 for a CRAM file indexed by a .crai index, as that + index format does not record these statistics.) 
+ """ + def __get__(self): + self.check_index() + cdef int tid + cdef uint64_t total = hts_idx_get_n_no_coor(self.index) + cdef uint64_t mapped, unmapped + for tid from 0 <= tid < self.header.nreferences: + with nogil: + hts_idx_get_stat(self.index, tid, &mapped, &unmapped) + total += unmapped + return total + + property nocoordinate: + """int with total number of reads without coordinates according to the + statistics recorded in the index, i.e., the statistic printed for "*" + by the ``samtools idxstats`` command. This is a read-only attribute. + (This will be 0 for a CRAM file indexed by a .crai index, as that + index format does not record these statistics.) + """ + def __get__(self): + self.check_index() + cdef uint64_t n + with nogil: + n = hts_idx_get_n_no_coor(self.index) + return n + + def get_index_statistics(self): + """return statistics about mapped/unmapped reads per chromosome as + they are stored in the index, similarly to the statistics printed + by the ``samtools idxstats`` command. + + CRAI indexes do not record these statistics, so for a CRAM file + with a .crai index the returned statistics will all be 0. + + Returns: + list : + a list of records for each chromosome. Each record has the + attributes 'contig', 'mapped', 'unmapped' and 'total'. 
+ """ + + self.check_index() + cdef int tid + cdef uint64_t mapped, unmapped + results = [] + # TODO: use header + for tid from 0 <= tid < self.nreferences: + with nogil: + hts_idx_get_stat(self.index, tid, &mapped, &unmapped) + results.append( + IndexStats._make(( + self.get_reference_name(tid), + mapped, + unmapped, + mapped + unmapped))) + + return results + + ############################################################### + ## file-object like iterator access + ## note: concurrent access will cause errors (see IteratorRow + ## and multiple_iterators) + ## Possible solutions: deprecate or open new file handle + def __iter__(self): + if not self.is_open: + raise ValueError("I/O operation on closed file") + + if not self.is_bam and self.header.nreferences == 0: + raise NotImplementedError( + "can not iterate over samfile without header") + return self + + cdef bam1_t * getCurrent(self): + return self.b + + cdef int cnext(self): + ''' + cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`. + ''' + cdef int ret + cdef bam_hdr_t * hdr = self.header.ptr + with nogil: + ret = sam_read1(self.htsfile, + hdr, + self.b) + return ret + + def __next__(self): + cdef int ret = self.cnext() + if ret >= 0: + return makeAlignedSegment(self.b, self.header) + elif ret == -1: + raise StopIteration + else: + raise IOError(read_failure_reason(ret)) + + ########################################### + # methods/properties referencing the header + def is_valid_tid(self, int tid): + """ + return True if the numerical :term:`tid` is valid; False otherwise. + + Note that the unmapped tid code (-1) counts as an invalid. + """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.is_valid_tid(tid) + + def get_tid(self, reference): + """ + return the numerical :term:`tid` corresponding to + :term:`reference` + + returns -1 if reference is not known. 
+ """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.get_tid(reference) + + def get_reference_name(self, tid): + """ + return :term:`reference` name corresponding to numerical :term:`tid` + """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.get_reference_name(tid) + + def get_reference_length(self, reference): + """ + return :term:`reference` length corresponding to numerical :term:`tid` + """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.get_reference_length(reference) + + property nreferences: + """int with the number of :term:`reference` sequences in the file. + This is a read-only attribute.""" + def __get__(self): + if self.header: + return self.header.nreferences + else: + raise ValueError("header not available in closed files") + + property references: + """tuple with the names of :term:`reference` sequences. This is a + read-only attribute""" + def __get__(self): + if self.header: + return self.header.references + else: + raise ValueError("header not available in closed files") + + property lengths: + """tuple of the lengths of the :term:`reference` sequences. This is a + read-only attribute. 
The lengths are in the same order as + :attr:`pysam.AlignmentFile.references` + + """ + def __get__(self): + if self.header: + return self.header.lengths + else: + raise ValueError("header not available in closed files") + + # Compatibility functions for pysam < 0.14 + property text: + """deprecated, use :attr:`references` and :attr:`lengths` instead""" + def __get__(self): + if self.header: + return self.header.__str__() + else: + raise ValueError("header not available in closed files") + + # Compatibility functions for pysam < 0.8.3 + def gettid(self, reference): + """deprecated, use :meth:`get_tid` instead""" + return self.get_tid(reference) + + def getrname(self, tid): + """deprecated, use :meth:`get_reference_name` instead""" + return self.get_reference_name(tid) + + +cdef class IteratorRow: + '''abstract base class for iterators over mapped reads. + + Various iterators implement different behaviours for wrapping around + contig boundaries. Examples include: + + :class:`pysam.IteratorRowRegion` + iterate within a single contig and a defined region. + + :class:`pysam.IteratorRowAll` + iterate until EOF. This iterator will also include unmapped reads. + + :class:`pysam.IteratorRowAllRefs` + iterate over all reads in all reference sequences. + + The method :meth:`AlignmentFile.fetch` returns an IteratorRow. + + .. note:: + + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a + :meth:`AlignmentFile.fetch`. + + ''' + + def __init__(self, AlignmentFile samfile, int multiple_iterators=False): + cdef char *cfilename + cdef char *creference_filename + cdef char *cindexname = NULL + + if not samfile.is_open: + raise ValueError("I/O operation on closed file") + + # makes sure that samfile stays alive as long as the + # iterator is alive + self.samfile = samfile + + # reopen the file - note that this makes the iterator + # slow and causes pileup to slow down significantly. 
+ if multiple_iterators: + + cfilename = samfile.filename + with nogil: + self.htsfile = hts_open(cfilename, 'r') + assert self.htsfile != NULL + + if samfile.has_index(): + if samfile.index_filename: + cindexname = bindex_filename = encode_filename(samfile.index_filename) + with nogil: + self.index = sam_index_load2(self.htsfile, cfilename, cindexname) + else: + self.index = NULL + + # need to advance in newly opened file to position after header + # better: use seek/tell? + with nogil: + hdr = sam_hdr_read(self.htsfile) + if hdr is NULL: + raise IOError("unable to read header information") + self.header = makeAlignmentHeader(hdr) + + self.owns_samfile = True + + # options specific to CRAM files + if samfile.is_cram and samfile.reference_filename: + creference_filename = samfile.reference_filename + hts_set_opt(self.htsfile, + CRAM_OPT_REFERENCE, + creference_filename) + + else: + self.htsfile = samfile.htsfile + self.index = samfile.index + self.owns_samfile = False + self.header = samfile.header + + self.retval = 0 + + self.b = bam_init1() + + def __dealloc__(self): + bam_destroy1(self.b) + if self.owns_samfile: + hts_idx_destroy(self.index) + hts_close(self.htsfile) + + +cdef class IteratorRowRegion(IteratorRow): + """*(AlignmentFile samfile, int tid, int beg, int stop, + int multiple_iterators=False)* + + iterate over mapped reads in a region. + + .. note:: + + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a + :meth:`AlignmentFile.fetch`. 
    """

    def __init__(self, AlignmentFile samfile,
                 int tid, int beg, int stop,
                 int multiple_iterators=False):

        # a region iterator requires a BAM/CRAM index to seek
        if not samfile.has_index():
            raise ValueError("no index available for iteration")

        super().__init__(samfile, multiple_iterators=multiple_iterators)
        # sam_itr_queryi builds an htslib iterator over the half-open
        # region [beg, stop) on reference `tid`
        with nogil:
            self.iter = sam_itr_queryi(
                self.index,
                tid,
                beg,
                stop)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # expose the internal read buffer; the buffer is reused on
        # every iteration, so callers must copy it to keep the data
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # result is stored in self.retval rather than returned, so
        # __next__ (and IteratorRowAllRefs) can inspect it afterwards
        with nogil:
            self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
                                       self.iter,
                                       self.b,
                                       self.htsfile)

    def __next__(self):
        self.cnext()
        # htslib convention: >= 0 success, -1 end of data, < -1 error
        if self.retval >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif self.retval == -1:
            raise StopIteration
        elif self.retval == -2:
            # Note: it is currently not the case that hts_iter_next
            # returns -2 for a truncated file.
            # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
            raise IOError('truncated file')
        else:
            raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval))

    def __dealloc__(self):
        hts_itr_destroy(self.iter)


cdef class IteratorRowHead(IteratorRow):
    """*(AlignmentFile samfile, n, int multiple_iterators=False)*

    iterate over first n reads in `samfile`

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.head`.

    """

    def __init__(self,
                 AlignmentFile samfile,
                 int n,
                 int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # stop after `n` reads even if the file holds more
        self.max_rows = n
        self.current_row = 0

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        # plain sequential read from the current file position; no
        # index/region iterator is involved
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        if self.current_row >= self.max_rows:
            raise StopIteration

        cdef int ret = self.cnext()
        if ret >= 0:
            self.current_row += 1
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))


cdef class IteratorRowAll(IteratorRow):
    """*(AlignmentFile samfile, int multiple_iterators=False)*

    iterate over all reads in `samfile`

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))


cdef class IteratorRowAllRefs(IteratorRow):
    """iterates over all mapped reads by chaining iterators over each
    reference

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        if not samfile.has_index():
            raise ValueError("no index available for fetch")

        # tid == -1 marks "no per-reference iterator created yet"
        self.tid = -1

    def nextiter(self):
        # get a new iterator for a chromosome. The file
        # will not be re-opened.
        self.rowiter = IteratorRowRegion(self.samfile,
                                         self.tid,
                                         0,
                                         MAX_POS)
        # set htsfile and header of the rowiter
        # to the values in this iterator to reflect multiple_iterators
        self.rowiter.htsfile = self.htsfile
        self.rowiter.header = self.header

        # make sure the iterator understand that IteratorRowAllRefs
        # has ownership
        self.rowiter.owns_samfile = False

    def __iter__(self):
        return self

    def __next__(self):
        # Create an initial iterator
        if self.tid == -1:
            if not self.samfile.nreferences:
                raise StopIteration
            self.tid = 0
            self.nextiter()

        while 1:
            self.rowiter.cnext()

            # If current iterator is not exhausted, return aligned read
            if self.rowiter.retval > 0:
                return makeAlignedSegment(self.rowiter.b, self.header)

            self.tid += 1

            # Otherwise, proceed to next reference or stop
            if self.tid < self.samfile.nreferences:
                self.nextiter()
            else:
                raise StopIteration


cdef class IteratorRowSelection(IteratorRow):
    """*(AlignmentFile samfile)*

    iterate over reads in `samfile` at a given list of file positions.

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
+ """ + + def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True): + super().__init__(samfile, multiple_iterators=multiple_iterators) + + self.positions = positions + self.current_pos = 0 + + def __iter__(self): + return self + + cdef bam1_t * getCurrent(self): + return self.b + + cdef int cnext(self): + '''cversion of iterator''' + # end iteration if out of positions + if self.current_pos >= len(self.positions): return -1 + + cdef uint64_t pos = self.positions[self.current_pos] + with nogil: + bgzf_seek(hts_get_bgzfp(self.htsfile), + pos, + 0) + self.current_pos += 1 + + cdef int ret + cdef bam_hdr_t * hdr = self.header.ptr + with nogil: + ret = sam_read1(self.htsfile, + hdr, + self.b) + return ret + + def __next__(self): + cdef int ret = self.cnext() + if ret >= 0: + return makeAlignedSegment(self.b, self.header) + elif ret == -1: + raise StopIteration + else: + raise IOError(read_failure_reason(ret)) + + +cdef int __advance_nofilter(void *data, bam1_t *b): + '''advance without any read filtering. + ''' + cdef __iterdata * d = <__iterdata*>data + cdef int ret + with nogil: + ret = sam_itr_next(d.htsfile, d.iter, b) + return ret + + +cdef int __advance_raw_nofilter(void *data, bam1_t *b): + '''advance (without iterator) without any read filtering. 
+ ''' + cdef __iterdata * d = <__iterdata*>data + cdef int ret + with nogil: + ret = sam_read1(d.htsfile, d.header, b) + return ret + + +cdef int __advance_all(void *data, bam1_t *b): + '''only use reads for pileup passing basic filters such as + + BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP + ''' + + cdef __iterdata * d = <__iterdata*>data + cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP + cdef int ret + while 1: + with nogil: + ret = sam_itr_next(d.htsfile, d.iter, b) + if ret < 0: + break + if b.core.flag & d.flag_filter: + continue + break + return ret + + +cdef int __advance_raw_all(void *data, bam1_t *b): + '''only use reads for pileup passing basic filters such as + + BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP + ''' + + cdef __iterdata * d = <__iterdata*>data + cdef int ret + while 1: + with nogil: + ret = sam_read1(d.htsfile, d.header, b) + if ret < 0: + break + if b.core.flag & d.flag_filter: + continue + break + return ret + + +cdef int __advance_samtools(void * data, bam1_t * b): + '''advance using same filter and read processing as in + the samtools pileup. 
+ ''' + cdef __iterdata * d = <__iterdata*>data + cdef int ret + cdef int q + + while 1: + with nogil: + ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b) + if ret < 0: + break + if b.core.flag & d.flag_filter: + continue + if d.flag_require and not (b.core.flag & d.flag_require): + continue + + # reload sequence + if d.fastafile != NULL and b.core.tid != d.tid: + if d.seq != NULL: + free(d.seq) + d.tid = b.core.tid + with nogil: + d.seq = faidx_fetch_seq( + d.fastafile, + d.header.target_name[d.tid], + 0, MAX_POS, + &d.seq_len) + + if d.seq == NULL: + raise ValueError( + "reference sequence for '{}' (tid={}) not found".format( + d.header.target_name[d.tid], d.tid)) + + # realign read - changes base qualities + if d.seq != NULL and d.compute_baq: + # 4th option to realign is flag: + # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4 + if d.redo_baq: + sam_prob_realn(b, d.seq, d.seq_len, 7) + else: + sam_prob_realn(b, d.seq, d.seq_len, 3) + + if d.seq != NULL and d.adjust_capq_threshold > 10: + q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold) + if q < 0: + continue + elif b.core.qual > q: + b.core.qual = q + + if b.core.qual < d.min_mapping_quality: + continue + if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR): + continue + + break + + return ret + + +cdef class IteratorColumn: + '''abstract base class for iterators over columns. + + IteratorColumn objects wrap the pileup functionality of samtools. + + For reasons of efficiency, the iterator points to the current + pileup buffer. The pileup buffer is updated at every iteration. + This might cause some unexpected behaviour. For example, + consider the conversion to a list:: + + f = AlignmentFile("file.bam", "rb") + result = list(f.pileup()) + + Here, ``result`` will contain ``n`` objects of type + :class:`~pysam.PileupColumn` for ``n`` columns, but each object in + ``result`` will contain the same information. 
    The desired behaviour can be achieved by list comprehension::

        result = [x.pileups() for x in f.pileup()]

    ``result`` will be a list of ``n`` lists of objects of type
    :class:`~pysam.PileupRead`.

    If the iterator is associated with a :class:`~pysam.Fastafile`
    using the :meth:`add_reference` method, then the iterator will
    export the current sequence via the methods :meth:`get_sequence`
    and :meth:`seq_len`.

    See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
    '''

    def __cinit__( self, AlignmentFile samfile, **kwargs):
        # copy pileup configuration from kwargs into Python-level
        # attributes and into the C-level __iterdata struct that the
        # __advance_* callbacks read
        self.samfile = samfile
        self.fastafile = kwargs.get("fastafile", None)
        self.stepper = kwargs.get("stepper", "samtools")
        self.max_depth = kwargs.get("max_depth", 8000)
        self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
        self.min_base_quality = kwargs.get("min_base_quality", 13)
        self.iterdata.seq = NULL
        self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
        self.iterdata.flag_require = kwargs.get("flag_require", 0)
        self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
        self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
        self.iterdata.compute_baq = kwargs.get("compute_baq", True)
        self.iterdata.redo_baq = kwargs.get("redo_baq", False)
        self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)

        # current pileup position and buffer; filled in by cnext()
        self.tid = 0
        self.pos = 0
        self.n_plp = 0
        self.plp = NULL
        self.pileup_iter = <bam_mplp_t>NULL

    def __iter__(self):
        return self

    cdef int cnext(self):
        '''perform next iteration.
        '''
        # do not release gil here because of call-backs
        cdef int ret = bam_mplp_auto(self.pileup_iter,
                                     &self.tid,
                                     &self.pos,
                                     &self.n_plp,
                                     &self.plp)
        return ret

    cdef char * get_sequence(self):
        '''return current reference sequence underlying the iterator.
        '''
        return self.iterdata.seq

    property seq_len:
        '''current sequence length.'''
        def __get__(self):
            return self.iterdata.seq_len

    def add_reference(self, FastaFile fastafile):
        '''
        add reference sequences in `fastafile` to iterator.'''
        self.fastafile = fastafile
        # drop any cached sequence; tid = -1 forces a reload on the
        # next advance
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
        self.iterdata.tid = -1
        self.iterdata.fastafile = self.fastafile.fastafile

    def has_reference(self):
        '''
        return true if iterator is associated with a reference'''
        return self.fastafile

    cdef _setup_iterator(self,
                         int tid,
                         int start,
                         int stop,
                         int multiple_iterators=0):
        '''setup the iterator structure'''

        # region-based setup: wrap an IteratorRowRegion and hand its
        # htslib iterator to the __advance_* callbacks via iterdata
        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.iter = self.iter.iter
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        # pick the advance callback matching the requested stepper
        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef _setup_raw_rest_iterator(self):
        '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST'''

        # no region iterator: iterdata.iter stays NULL and the
        # "__advance_raw_*" callbacks (or __advance_samtools's
        # sam_read1 branch) read the file sequentially
        self.iter = None
        self.iterdata.iter = NULL
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef reset(self, tid, start, stop):
        '''reset iterator position.

        This permits using the iterator multiple times without
        having to incur the full set-up costs.
        '''
        # resetting only makes sense for region-based setups
        if self.iter is None:
            raise TypeError("Raw iterator set up without region cannot be reset")

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
        self.iterdata.iter = self.iter.iter

        # invalidate sequence if different tid
        if self.tid != tid:
            if self.iterdata.seq != NULL:
                free(self.iterdata.seq)
            self.iterdata.seq = NULL
            self.iterdata.tid = -1

        # self.pileup_iter = bam_mplp_init(1
        #                          &__advancepileup,
        #                          &self.iterdata)
        with nogil:
            bam_mplp_reset(self.pileup_iter)

    cdef _free_pileup_iter(self):
        '''free the memory alloc'd by bam_plp_init.

        This is needed before setup_iterator allocates another
        pileup_iter, or else memory will be lost. '''
        if self.pileup_iter != <bam_mplp_t>NULL:
            with nogil:
                bam_mplp_reset(self.pileup_iter)
                bam_mplp_destroy(self.pileup_iter)
                self.pileup_iter = <bam_mplp_t>NULL

    def __dealloc__(self):
        # reset in order to avoid memory leak messages for iterators
        # that have not been fully consumed
        self._free_pileup_iter()
        self.plp = <const bam_pileup1_t*>NULL

        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            self.iterdata.seq = NULL

    # backwards compatibility

    def hasReference(self):
        return self.has_reference()
    cdef char * getSequence(self):
        return self.get_sequence()
    def addReference(self, FastaFile fastafile):
        return self.add_reference(fastafile)


cdef class IteratorColumnRegion(IteratorColumn):
    '''iterates over a region only.
    '''
    def __cinit__(self,
                  AlignmentFile samfile,
                  int tid = 0,
                  int start = 0,
                  int stop = MAX_POS,
                  int truncate = False,
                  int multiple_iterators = True,
                  **kwargs ):

        # initialize iterator. Multiple iterators not available
        # for CRAM.
        if multiple_iterators and samfile.is_cram:
            warnings.warn("multiple_iterators not implemented for CRAM")
            multiple_iterators = False

        self._setup_iterator(tid, start, stop, multiple_iterators)
        self.start = start
        self.stop = stop
        self.truncate = truncate

    def __next__(self):

        cdef int n

        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration" )

            if n == 0:
                raise StopIteration

            # with truncate, only yield columns strictly inside
            # [start, stop); columns before start are skipped
            if self.truncate:
                if self.start > self.pos:
                    continue
                if self.pos >= self.stop:
                    raise StopIteration

            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)


cdef class IteratorColumnAllRefs(IteratorColumn):
    """iterates over all columns by chaining iterators over each reference
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # no iteration over empty files
        if not samfile.nreferences:
            raise StopIteration

        # initialize iterator
        self._setup_iterator(self.tid, 0, MAX_POS, 1)

    def __next__(self):

        cdef int n
        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration")

            # proceed to next reference or stop
            if n == 0:
                self.tid += 1
                if self.tid < self.samfile.nreferences:
                    self._setup_iterator(self.tid, 0, MAX_POS, 0)
                else:
                    raise StopIteration
                continue

            # return result, if within same reference
            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)


cdef class IteratorColumnAll(IteratorColumn):
    """iterates over all columns, without using an index
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # sequential sam_read1-based setup; works without an index
        self._setup_raw_rest_iterator()

    def __next__(self):

        cdef int n
        n = self.cnext()
        if n < 0:
            raise ValueError("error during iteration")

        if n == 0:
            raise StopIteration

        return makePileupColumn(&self.plp,
                                self.tid,
                                self.pos,
                                self.n_plp,
                                self.min_base_quality,
                                self.iterdata.seq,
                                self.samfile.header)


cdef class SNPCall:
    '''the results of a SNP call.'''
    # plain C storage; values are exposed read-only via the
    # properties below
    cdef int _tid
    cdef int _pos
    cdef char _reference_base
    cdef char _genotype
    cdef int _consensus_quality
    cdef int _snp_quality
    cdef int _rms_mapping_quality
    cdef int _coverage

    property tid:
        '''the chromosome ID as is defined in the header'''
        def __get__(self):
            return self._tid

    property pos:
        '''nucleotide position of SNP.'''
        def __get__(self): return self._pos

    property reference_base:
        '''reference base at pos. ``N`` if no reference sequence supplied.'''
        def __get__(self): return from_string_and_size( &self._reference_base, 1 )

    property genotype:
        '''the genotype called.'''
        def __get__(self): return from_string_and_size( &self._genotype, 1 )

    property consensus_quality:
        '''the genotype quality (Phred-scaled).'''
        def __get__(self): return self._consensus_quality

    property snp_quality:
        '''the snp quality (Phred scaled) - probability of consensus being
        identical to reference sequence.'''
        def __get__(self): return self._snp_quality

    property mapping_quality:
        '''the root mean square (rms) of the mapping quality of all reads
        involved in the call.'''
        def __get__(self): return self._rms_mapping_quality

    property coverage:
        '''coverage or read depth - the number of reads involved in the call.'''
        def __get__(self): return self._coverage

    def __str__(self):
        # tab-separated representation of all call fields
        return "\t".join( map(str, (
            self.tid,
            self.pos,
            self.reference_base,
            self.genotype,
            self.consensus_quality,
            self.snp_quality,
            self.mapping_quality,
            self.coverage ) ) )


cdef class IndexedReads:
    """Index a Sam/BAM-file by query name while keeping the
    original sort order intact.

    The index is kept in memory and can be substantial.

    By default, the file is re-opened to avoid conflicts if multiple
    operators work on the same file.
    Set `multiple_iterators` = False
    to not re-open `samfile`.

    Parameters
    ----------

    samfile : AlignmentFile
        File to be indexed.

    multiple_iterators : bool
        Flag indicating whether the file should be reopened. Reopening prevents
        existing iterators being affected by the indexing.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
        cdef char *cfilename

        # makes sure that samfile stays alive as long as this
        # object is alive.
        self.samfile = samfile
        cdef bam_hdr_t * hdr = NULL
        assert samfile.is_bam, "can only apply IndexReads on bam files"

        # multiple_iterators the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:
            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            if self.htsfile == NULL:
                raise OSError("unable to reopen htsfile")

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr == NULL:
                raise OSError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)
            # this object owns the reopened handle and must close it
            self.owns_samfile = True
        else:
            # share the caller's handle; do not close it in __dealloc__
            self.htsfile = self.samfile.htsfile
            self.header = samfile.header
            self.owns_samfile = False

    def build(self):
        '''build the index.'''

        self.index = collections.defaultdict(list)

        # this method will start indexing from the current file position
        cdef int ret = 1
        cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
        if b == NULL:
            raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))

        cdef uint64_t pos
        cdef bam_hdr_t * hdr = self.header.ptr

        # record the BGZF virtual offset of each record *before*
        # reading it, keyed by query name
        while ret > 0:
            with nogil:
                pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
                ret = sam_read1(self.htsfile,
                                hdr,
                                b)

            if ret > 0:
                qname = charptr_to_str(pysam_bam_get_qname(b))
                self.index[qname].append(pos)

        bam_destroy1(b)

    def find(self, query_name):
        '''find `query_name` in index.

        Returns
        -------

        IteratorRowSelection
            Returns an iterator over all reads with query_name.

        Raises
        ------

        KeyError
            if the `query_name` is not in the index.

        '''
        if query_name in self.index:
            return IteratorRowSelection(
                self.samfile,
                self.index[query_name],
                multiple_iterators = False)
        else:
            raise KeyError("read %s not found" % query_name)

    def __dealloc__(self):
        # close only if we reopened the file in __init__
        if self.owns_samfile:
            hts_close(self.htsfile)