# cython: embedsignature=True
# cython: profile=True
########################################################
########################################################
# Cython wrapper for SAM/BAM/CRAM files based on htslib
########################################################
# The principal classes defined in this module are:
#
# class AlignmentFile   read/write access to SAM/BAM/CRAM formatted files
#
# class AlignmentHeader manage SAM/BAM/CRAM header data
#
# class IndexedReads    index a SAM/BAM/CRAM file by query name while keeping
#                       the original sort order intact
#
# Additionally this module defines numerous additional classes that
# are part of the internal API. These are:
#
# Various iterator classes to iterate over alignments in sequential
# (IteratorRow) or in a stacked fashion (IteratorColumn):
#
# class IteratorRow
# class IteratorRowRegion
# class IteratorRowHead
# class IteratorRowAll
# class IteratorRowAllRefs
# class IteratorRowSelection
# class IteratorColumn
# class IteratorColumnRegion
# class IteratorColumnAll
# class IteratorColumnAllRefs
#
########################################################
#
# The MIT License
#
# Copyright (c) 2015 Andreas Heger
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
########################################################
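
# Illustrative usage sketch (comments only, never executed). The file name,
# index name and region below are made-up examples; the calls shown map onto
# the public classes and methods defined in this module.
#
#   import pysam
#
#   with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
#       # random access requires an index (e.g. ex1.bam.bai)
#       for read in samfile.fetch("chr1", 100, 200):
#           print(read.query_name, read.reference_start)
#
#       # column-wise (pileup) iteration over the same region
#       for column in samfile.pileup("chr1", 100, 200):
#           print(column.reference_pos, column.nsegments)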

import os
import collections
try:
    from collections.abc import Sequence, Mapping  # noqa
except ImportError:
    from collections import Sequence, Mapping  # noqa
import re
import warnings
import array
from libc.errno cimport errno, EPIPE
from libc.string cimport strcmp, strpbrk, strerror
from libc.stdint cimport INT32_MAX

from cpython cimport array as c_array

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size
from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
    HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL

from io import StringIO

cimport cython


__all__ = [
    "AlignmentFile",
    "AlignmentHeader",
    "IteratorRow",
    "IteratorColumn",
    "IndexedReads"]

IndexStats = collections.namedtuple("IndexStats",
                                    ("contig",
                                     "mapped",
                                     "unmapped",
                                     "total"))

########################################################
## global variables
# maximum genomic coordinate
# for some reason, using 'int' causes overflow
cdef int MAX_POS = (1 << 31) - 1

# valid types for SAM headers
VALID_HEADER_TYPES = {"HD" : Mapping,
                      "SQ" : Sequence,
                      "RG" : Sequence,
                      "PG" : Sequence,
                      "CO" : Sequence}

# order of records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")

# default type conversions within SAM header records
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str,
                               "SS" : str,},
                       "SQ" : {"SN" : str, "LN" : int, "AS" : str,
                               "M5" : str, "SP" : str, "UR" : str,
                               "AH" : str, "TP" : str, "DS" : str,
                               "AN" : str,},
                       "RG" : {"ID" : str, "CN" : str, "DS" : str,
                               "DT" : str, "FO" : str, "KS" : str,
                               "LB" : str, "PG" : str, "PI" : str,
                               "PL" : str, "PM" : str, "PU" : str,
                               "SM" : str, "BC" : str,},
                       "PG" : {"ID" : str, "PN" : str, "CL" : str,
                               "PP" : str, "DS" : str, "VN" : str,},}
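
# For example (illustrative values): with the conversions above, an @SQ line
# such as "@SQ\tSN:chr1\tLN:1575" is returned by AlignmentHeader.to_dict()
# as {"SN": "chr1", "LN": 1575}, i.e. with LN already converted to int.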

# output order of fields within records. Ensure that CL is at
# the end as parsing a CL will ignore any subsequent records.
VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"),
                      "SQ" : ("SN", "LN", "AS", "M5",
                              "UR", "SP", "AH", "TP",
                              "DS", "AN"),
                      "RG" : ("ID", "CN", "SM", "LB",
                              "PU", "PI", "DT", "DS",
                              "PL", "FO", "KS", "PG",
                              "PM", "BC"),
                      "PG" : ("PN", "ID", "VN", "PP",
                              "DS", "CL"),}


def build_header_line(fields, record):
    '''build a header line from `fields` dictionary for `record`'''

    # TODO: add checking for field and sort order
    line = ["@%s" % record]
    # comment
    if record == "CO":
        line.append(fields)
    # user tags
    elif record.islower():
        for key in sorted(fields):
            line.append("%s:%s" % (key, str(fields[key])))
    # defined tags
    else:
        # write fields of the specification
        for key in VALID_HEADER_ORDER[record]:
            if key in fields:
                line.append("%s:%s" % (key, str(fields[key])))
        # write user fields
        for key in fields:
            if not key.isupper():
                line.append("%s:%s" % (key, str(fields[key])))

    return "\t".join(line)


cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
    if not hdr:
        raise ValueError('cannot create AlignmentHeader, received NULL pointer')

    # check: is AlignmentHeader.__cinit__ called?
    cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader)
    header.ptr = hdr

    return header

def read_failure_reason(code):
    if code == -2:
        return 'truncated file'
    else:
        return "error {} while reading file".format(code)


# the following should be class-method for VariantHeader, but cdef @classmethods
# are not implemented in cython.
cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
                                        reference_names,
                                        reference_lengths,
                                        add_sq_text=True,
                                        text=None) except -1:
    """build header from list of reference names and lengths.
    """

cdef class AlignmentHeader(object):
    """header information for a :class:`AlignmentFile` object

    Parameters
    ----------
    header_dict : dict
        build header from a multi-level dictionary. The
        first level are the four types ('HD', 'SQ', ...). The second
        level are a list of lines, with each line being a list of
        tag-value pairs. The header is constructed first from all the
        defined fields, followed by user tags in alphabetical
        order. Alternatively, an :class:`~pysam.AlignmentHeader`
        object can be passed directly.

    text : string
        use the string provided as the header

    reference_names : list
        see reference_lengths

    reference_lengths : list
        build header from list of chromosome names and lengths. By
        default, 'SQ' and 'LN' tags will be added to the header
        text. This option can be changed by unsetting the flag
        `add_sq_text`.

    add_sq_text : bool
        do not add 'SQ' and 'LN' tags to header. This option permits
        construction of :term:`SAM` formatted files without a header.

    """
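
    # Illustrative usage sketch (comments only, not part of the class
    # definition): a header is typically built via one of the factory
    # classmethods below. The reference names and lengths are made-up values.
    #
    #   header = AlignmentHeader.from_references(
    #       reference_names=["chr1", "chr2"],
    #       reference_lengths=[1575, 1584])
    #
    #   header = AlignmentHeader.from_dict(
    #       {"HD": {"VN": "1.6", "SO": "coordinate"},
    #        "SQ": [{"SN": "chr1", "LN": 1575},
    #               {"SN": "chr2", "LN": 1584}]})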

    # See makeVariantHeader for C constructor
    def __cinit__(self):
        self.ptr = NULL

    # Python constructor
    def __init__(self):
        self.ptr = bam_hdr_init()
        if self.ptr is NULL:
            raise MemoryError("could not create header")

    @classmethod
    def _from_text_and_lengths(cls, text, reference_names, reference_lengths):

        cdef AlignmentHeader self = AlignmentHeader()
        cdef char *ctext
        cdef int l_text
        cdef int n, x
        if text is not None:
            btext = force_bytes(text)
            ctext = btext
            l_text = len(btext)
            self.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
            if self.ptr.text == NULL:
                raise MemoryError("could not allocate {} bytes".format(l_text + 1), sizeof(char))
            self.ptr.l_text = l_text
            memcpy(self.ptr.text, ctext, l_text + 1)

        if reference_names and reference_lengths:
            reference_names = [force_bytes(ref) for ref in reference_names]

            self.ptr.n_targets = len(reference_names)

            n = sum([len(reference_names) + 1])
            self.ptr.target_name = <char**>calloc(n, sizeof(char*))
            if self.ptr.target_name == NULL:
                raise MemoryError("could not allocate {} bytes".format(n, sizeof(char *)))

            self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
            if self.ptr.target_len == NULL:
                raise MemoryError("could not allocate {} bytes".format(n, sizeof(uint32_t)))

            for x from 0 <= x < self.ptr.n_targets:
                self.ptr.target_len[x] = reference_lengths[x]
                name = reference_names[x]
                self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
                if self.ptr.target_name[x] == NULL:
                    raise MemoryError("could not allocate {} bytes".format(len(name) + 1, sizeof(char)))
                strncpy(self.ptr.target_name[x], name, len(name))

        return self

    @classmethod
    def from_text(cls, text):

        reference_names, reference_lengths = [], []
        for line in text.splitlines():
            if line.startswith("@SQ"):
                fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(int(fields["LN"]))
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))
                except ValueError:
                    raise ValueError("wrong sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_dict(cls, header_dict):

        cdef list lines = []
        # first: defined tags
        for record in VALID_HEADERS:
            if record in header_dict:
                data = header_dict[record]
                if not isinstance(data, VALID_HEADER_TYPES[record]):
                    raise ValueError(
                        "invalid type for record %s: %s, expected %s" %
                        (record, type(data), VALID_HEADER_TYPES[record]))
                if isinstance(data, Mapping):
                    lines.append(build_header_line(data, record))
                else:
                    for fields in header_dict[record]:
                        lines.append(build_header_line(fields, record))

        # then: user tags (lower case), sorted alphabetically
        for record, data in sorted(header_dict.items()):
            if record in VALID_HEADERS:
                continue
            if isinstance(data, Mapping):
                lines.append(build_header_line(data, record))
            else:
                for fields in header_dict[record]:
                    lines.append(build_header_line(fields, record))

        text = "\n".join(lines) + "\n"

        reference_names, reference_lengths = [], []
        if "SQ" in header_dict:
            for fields in header_dict["SQ"]:
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(fields["LN"])
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):

        if len(reference_names) != len(reference_lengths):
            raise ValueError("number of reference names and lengths do not match")

        # optionally, if there is no text, add a SAM compatible header to output file.
        if text is None and add_sq_text:
            text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
                reference_names, reference_lengths)])

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    def __dealloc__(self):
        bam_hdr_destroy(self.ptr)
        self.ptr = NULL

    def __bool__(self):
        return self.ptr != NULL

    def copy(self):
        return makeAlignmentHeader(bam_hdr_dup(self.ptr))

    property nreferences:
        """int with the number of :term:`reference` sequences in the file.

        This is a read-only attribute."""
        def __get__(self):
            return self.ptr.n_targets

    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(charptr_to_str(self.ptr.target_name[x]))
            return tuple(t)

    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`
        """
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(self.ptr.target_len[x])
            return tuple(t)

    def _build_sequence_section(self):
        """return sequence section of header.

        The sequence section is built from the list of reference names and
        lengths stored in the BAM-file and not from any @SQ entries that
        are part of the header's text section.
        """
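        # Illustrative return value (made-up names and lengths):
        #   "@SQ\tSN:chr1\tLN:1575\n@SQ\tSN:chr2\tLN:1584\n"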

        cdef int x
        text = []
        for x in range(self.ptr.n_targets):
            text.append("@SQ\tSN:{}\tLN:{}\n".format(
                force_str(self.ptr.target_name[x]),
                self.ptr.target_len[x]))
        return "".join(text)

    def to_dict(self):
        """return two-level dictionary with header information from the file.

        The first level contains the record (``HD``, ``SQ``, etc) and
        the second level contains the fields (``VN``, ``LN``, etc).

        The parser is validating and will raise an AssertionError if
        it encounters any record or field tags that are not part of
        the SAM specification. Use the
        :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
        header.

        The parsing follows the SAM format specification with the
        exception of the ``CL`` field, which consumes the rest of a
        header line irrespective of any additional fields.
        This behaviour has been added to accommodate command line
        options that contain characters that are not valid field
        separators.

        If no @SQ entries are within the text section of the header,
        they will be automatically added from the reference names and
        lengths stored in the binary part of the header.
        """
        result = collections.OrderedDict()

        # convert to python string
        t = self.__str__()
        for line in t.split("\n"):
            line = line.strip(' \0')
            if not line:
                continue
            assert line.startswith("@"), \
                "header line without '@': '%s'" % line
            fields = line[1:].split("\t")
            record = fields[0]
            assert record in VALID_HEADER_TYPES, \
                "header line with invalid type '%s': '%s'" % (record, line)

            # treat comments
            if record == "CO":
                if record not in result:
                    result[record] = []
                result[record].append("\t".join(fields[1:]))
                continue
            # the following is clumsy as generators do not work?
            x = {}

            for idx, field in enumerate(fields[1:]):
                if ":" not in field:
                    raise ValueError("malformatted header: no ':' in field")
                key, value = field.split(":", 1)
                if key in ("CL",):
                    # special treatment for command line
                    # statements (CL). These might contain
                    # characters that are non-conformant with
                    # the valid field separators in the SAM
                    # header. Thus, in contravention to the
                    # SAM API, consume the rest of the line.
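                    # For example (illustrative values): given the
                    # tab-separated header line
                    #   @PG  ID:bwa  PN:bwa  CL:bwa mem -t 8 ref.fa reads.fq
                    # everything after "CL:" becomes the value, even if it
                    # contains tabs or further "TAG:value"-like text, because
                    # the remaining tab-split fields are re-joined before the
                    # single split on ":" in the statement below.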
                    key, value = "\t".join(fields[idx+1:]).split(":", 1)
                    x[key] = KNOWN_HEADER_FIELDS[record][key](value)
                    break

                # interpret type of known header record tags, default to str
                x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)

            if VALID_HEADER_TYPES[record] == Mapping:
                if record in result:
                    raise ValueError(
                        "multiple '%s' lines are not permitted" % record)

                result[record] = x
            elif VALID_HEADER_TYPES[record] == Sequence:
                if record not in result: result[record] = []
                result[record].append(x)

        # if there are no SQ lines in the header, add the
        # reference names from the information in the bam
        # file.
        #
        # Background: c-samtools keeps the textual part of the
        # header separate from the list of reference names and
        # lengths. Thus, if a header contains only SQ lines,
        # the SQ information is not part of the textual header
        # and thus is missing from the output. See issue 84.
        if "SQ" not in result:
            sq = []
            for ref, length in zip(self.references, self.lengths):
                sq.append({'LN': length, 'SN': ref })
            result["SQ"] = sq

        return result

    def as_dict(self):
        """deprecated, use :meth:`to_dict()` instead"""
        return self.to_dict()

    def get_reference_name(self, tid):
        if tid == -1:
            return None
        if not 0 <= tid < self.ptr.n_targets:
            raise ValueError("reference_id %i out of range 0<=tid<%i" %
                             (tid, self.ptr.n_targets))
        return charptr_to_str(self.ptr.target_name[tid])

    def get_reference_length(self, reference):
        cdef int tid = self.get_tid(reference)
        if tid < 0:
            raise KeyError("unknown reference {}".format(reference))
        else:
            return self.ptr.target_len[tid]

    def is_valid_tid(self, int tid):
        """
        return True if the numerical :term:`tid` is valid; False otherwise.

        Note that the unmapped tid code (-1) counts as invalid.
        """
        return 0 <= tid < self.ptr.n_targets

    def get_tid(self, reference):
        """
        return the numerical :term:`tid` corresponding to
        :term:`reference`

        returns -1 if reference is not known.
        """
        reference = force_bytes(reference)
        tid = bam_name2id(self.ptr, reference)
        if tid < -1:
            raise ValueError('could not parse header')
        return tid

    def __str__(self):
        '''string with the full contents of the :term:`sam file` header as a
        string.

        If no @SQ entries are within the text section of the header,
        they will be automatically added from the reference names and
        lengths stored in the binary part of the header.

        See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
        representation of the header.
        '''
        text = from_string_and_size(self.ptr.text, self.ptr.l_text)
        if "@SQ" not in text:
            text += "\n" + self._build_sequence_section()
        return text

    # dictionary access methods, for backwards compatibility.
    def __setitem__(self, key, value):
        raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict())")

    def __getitem__(self, key):
        return self.to_dict().__getitem__(key)

    def items(self):
        return self.to_dict().items()

    # PY2 compatibility
    def iteritems(self):
        return self.to_dict().items()

    def keys(self):
        return self.to_dict().keys()

    def values(self):
        return self.to_dict().values()

    def get(self, *args):
        return self.to_dict().get(*args)

    def __len__(self):
        return self.to_dict().__len__()

    def __contains__(self, key):
        return self.to_dict().__contains__(key)


cdef class AlignmentFile(HTSFile):
    """AlignmentFile(filepath_or_object, mode=None, template=None,
    reference_names=None, reference_lengths=None, text=NULL,
    header=None, add_sq_text=False, check_header=True, check_sq=True,
    reference_filename=None, filename=None, index_filename=None,
    filepath_index=None, require_index=False, duplicate_filehandle=True,
    ignore_truncation=False, threads=1)

    A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.

    If `filepath_or_object` is a string, the file is automatically
    opened. If `filepath_or_object` is a python File object, the
    already opened file will be used.

    If the file is opened for reading and an index exists (if file is BAM, a
    .bai file or if CRAM a .crai file), it will be opened automatically.
    `index_filename` may be specified explicitly if the index is not named
    in the standard manner, is not located in the same directory as the
    BAM/CRAM file, or is remote. Without an index, random access via
    :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup`
    is disabled.

    For writing, the header of a :term:`SAM` file/:term:`BAM` file can
    be constituted from several sources (see also the samtools format
    specification):

        1. If `template` is given, the header is copied from another
           `AlignmentFile` (`template` must be a
           :class:`~pysam.AlignmentFile`).

        2. If `header` is given, the header is built from a
           multi-level dictionary.

        3. If `text` is given, new header text is copied from raw
           text.

        4. The names (`reference_names`) and lengths
           (`reference_lengths`) are supplied directly as lists.

    When reading or writing a CRAM file, the filename of a FASTA-formatted
    reference can be specified with `reference_filename`.

    By default, if a file is opened in mode 'r', it is checked
    for a valid header (`check_header` = True) and a definition of
    chromosome names (`check_sq` = True).

    Parameters
    ----------
    mode : string
        `mode` should be ``r`` for reading or ``w`` for writing. The
        default is text mode (:term:`SAM`). For binary (:term:`BAM`)
        I/O you should append ``b`` for compressed or ``u`` for
        uncompressed :term:`BAM` output. Use ``h`` to output header
        information in text (:term:`TAM`) mode. Use ``c`` for
        :term:`CRAM` formatted files.

        If ``b`` is present, it must immediately follow ``r`` or
        ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
        ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
        :term:`BAM` formatted file for reading, type::

            f = pysam.AlignmentFile('ex1.bam','rb')

        If mode is not specified, the method will try to auto-detect
        in the order 'rb', 'r', thus both the following should work::

            f1 = pysam.AlignmentFile('ex1.bam')
            f2 = pysam.AlignmentFile('ex1.sam')

    template : AlignmentFile
        when writing, copy header from file `template`.

    header : dict or AlignmentHeader
        when writing, build header from a multi-level dictionary. The
        first level are the four types ('HD', 'SQ', ...). The second
        level are a list of lines, with each line being a list of
        tag-value pairs. The header is constructed first from all the
        defined fields, followed by user tags in alphabetical
        order. Alternatively, an :class:`~pysam.AlignmentHeader`
        object can be passed directly.

    text : string
        when writing, use the string provided as the header

    reference_names : list
        see reference_lengths

    reference_lengths : list
        when writing or opening a SAM file without a header, build the
        header from a list of chromosome names and lengths. By default,
        'SQ' and 'LN' tags will be added to the header text. This option
        can be changed by unsetting the flag `add_sq_text`.

    add_sq_text : bool
        do not add 'SQ' and 'LN' tags to header. This option permits
        construction of :term:`SAM` formatted files without a header.

    add_sam_header : bool
        when outputting SAM the default is to output a header. This is
        equivalent to opening the file in 'wh' mode. If this option is
        set to False, no header will be output. To read such a file,
        set `check_header=False`.

    check_header : bool
        obsolete: when reading a SAM file, check if header is present
        (default=True)

    check_sq : bool
        when reading, check if SQ entries are present in header
        (default=True)

    reference_filename : string
        Path to a FASTA-formatted reference file. Valid only for CRAM files.
        When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
        specified in the header (``UR`` tag), which are normally used to find
        the reference.

    index_filename : string
        Explicit path to the index file. Only needed if the index is not
        named in the standard manner, not located in the same directory as
        the BAM/CRAM file, or is remote.
        An IOError is raised if the index cannot be found or is invalid.

    filepath_index : string
        Alias for `index_filename`.

    require_index : bool
        When reading, require that an index file is present and is valid or
        raise an IOError. (default=False)

    filename : string
        Alternative to filepath_or_object. Filename of the file
        to be opened.

    duplicate_filehandle: bool
        By default, file handles passed either directly or through
        File-like objects will be duplicated before passing them to
        htslib. The duplication prevents issues where the same stream
        will be closed by htslib and through destruction of the
        high-level python object. Set to False to turn off
        duplication.

    ignore_truncation: bool
        Issue a warning, instead of raising an error if the current file
        appears to be truncated due to a missing EOF marker. Only applies
        to bgzipped formats. (Default=False)

    format_options: list
        A list of key=value strings, as accepted by --input-fmt-option and
        --output-fmt-option in samtools.
    threads: integer
        Number of threads to use for compressing/decompressing BAM/CRAM files.
        Setting threads to > 1 cannot be combined with `ignore_truncation`.
        (Default=1)
    """

    def __cinit__(self, *args, **kwargs):
        self.htsfile = NULL
        self.filename = None
        self.mode = None
        self.threads = 1
        self.is_stream = False
        self.is_remote = False
        self.index = NULL

        if "filename" in kwargs:
            args = [kwargs["filename"]]
            del kwargs["filename"]

        self._open(*args, **kwargs)

        # allocate memory for iterator
        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
        if self.b == NULL:
            raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t)))

    def has_index(self):
        """return true if htsfile has an existing (and opened) index.
        """
        return self.index != NULL

    def check_index(self):
        """return True if index is present.

        Raises
        ------

        AttributeError
            if htsfile is :term:`SAM` formatted and thus has no index.

        ValueError
            if htsfile is closed or index could not be opened.
        """

        if not self.is_open:
            raise ValueError("I/O operation on closed file")
        if not self.is_bam and not self.is_cram:
            raise AttributeError(
                "AlignmentFile.mapped only available in bam files")
        if self.index == NULL:
            raise ValueError(
                "mapping information not recorded in index "
                "or index not available")
        return True

    def _open(self,
              filepath_or_object,
              mode=None,
              AlignmentFile template=None,
              reference_names=None,
              reference_lengths=None,
              reference_filename=None,
              text=None,
              header=None,
              port=None,
              add_sq_text=True,
              add_sam_header=True,
              check_header=True,
              check_sq=True,
              index_filename=None,
              filepath_index=None,
              require_index=False,
              referencenames=None,
              referencelengths=None,
              duplicate_filehandle=True,
              ignore_truncation=False,
              format_options=None,
              threads=1):
        '''open a sam, bam or cram formatted file.

        If _open is called on an existing file, the current file
        will be closed and a new file will be opened.

        '''
        cdef char *cfilename = NULL
        cdef char *creference_filename = NULL
        cdef char *cindexname = NULL
        cdef char *cmode = NULL
        cdef bam_hdr_t * hdr = NULL

        if threads > 1 and ignore_truncation:
            # This won't raise errors if reaching a truncated alignment,
            # because bgzf_mt_reader in htslib does not deal with
            # bgzf_mt_read_block returning non-zero values, contrary
            # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
            # Better to avoid this (for now) than to produce seemingly correct results.
            raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
        self.threads = threads

        # for backwards compatibility:
        if referencenames is not None:
            reference_names = referencenames
        if referencelengths is not None:
            reference_lengths = referencelengths

        # close a previously opened file
        if self.is_open:
            self.close()

        # autodetection for read
        if mode is None:
            mode = "r"

        if add_sam_header and mode == "w":
            mode = "wh"

        assert mode in ("r", "w", "rb", "wb", "wh",
                        "wbu", "rU", "wb0",
                        "rc", "wc"), \
            "invalid file opening mode `%s`" % mode

        self.duplicate_filehandle = duplicate_filehandle

        # StringIO not supported
        if isinstance(filepath_or_object, StringIO):
            raise NotImplementedError(
                "access from StringIO objects not supported")
        # reading from a file descriptor
        elif isinstance(filepath_or_object, int):
            self.filename = filepath_or_object
            filename = None
            self.is_remote = False
            self.is_stream = True
        # reading from a File object or other object with fileno
        elif hasattr(filepath_or_object, "fileno"):
            if filepath_or_object.closed:
                raise ValueError('I/O operation on closed file')
            self.filename = filepath_or_object
            # .name can be TextIOWrapper
            try:
                filename = encode_filename(str(filepath_or_object.name))
                cfilename = filename
            except AttributeError:
                filename = None
            self.is_remote = False
            self.is_stream = True
        # what remains is a filename
        else:
            self.filename = filename = encode_filename(filepath_or_object)
            cfilename = filename
            self.is_remote = hisremote(cfilename)
            self.is_stream = self.filename == b'-'

        # for htslib, wbu seems to not work
        if mode == "wbu":
            mode = "wb0"

        self.mode = force_bytes(mode)
        self.reference_filename = reference_filename = encode_filename(
            reference_filename)

        if mode[0] == 'w':
            # open file for writing

            if not (template or header or text or (reference_names and reference_lengths)):
                raise ValueError(
                    "either supply options `template`, `header`, `text` or both `reference_names` "
                    "and `reference_lengths` for writing")

            if template:
                # header is copied, though at the moment not strictly
                # necessary as AlignmentHeader is immutable.
                self.header = template.header.copy()
            elif isinstance(header, AlignmentHeader):
                self.header = header.copy()
            elif isinstance(header, Mapping):
                self.header = AlignmentHeader.from_dict(header)
            elif reference_names and reference_lengths:
                self.header = AlignmentHeader.from_references(
                    reference_names,
                    reference_lengths,
                    add_sq_text=add_sq_text,
                    text=text)
            elif text:
                self.header = AlignmentHeader.from_text(text)
            else:
                raise ValueError("not enough information to construct header. Please provide template, "
                                 "header, text or reference_names/reference_lengths")

            self.htsfile = self._open_htsfile()

            if self.htsfile == NULL:
                if errno:
                    raise IOError(errno, "could not open alignment file `{}`: {}".format(
                        force_str(filename),
                        force_str(strerror(errno))))
                else:
                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
            if format_options and len(format_options):
                self.add_hts_options(format_options)
            # set filename with reference sequences. If no filename
            # is given, the CRAM reference arrays will be built from
            # the @SQ lines in the header
            if "c" in mode and reference_filename:
                if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0):
                    raise ValueError("failure when setting reference filename")

            # write header to htsfile
            if "b" in mode or "c" in mode or "h" in mode:
                hdr = self.header.ptr
                with nogil:
                    sam_hdr_write(self.htsfile, hdr)

        elif mode[0] == "r":
            # open file for reading
            self.htsfile = self._open_htsfile()

            if self.htsfile == NULL:
                if errno:
                    raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
                                                                                         force_str(strerror(errno))))
                else:
                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))

            if self.htsfile.format.category != sequence_data:
                raise ValueError("file does not contain alignment data")

            if format_options and len(format_options):
                self.add_hts_options(format_options)

            self.check_truncation(ignore_truncation)

            # bam/cram files require a valid header
            if self.is_bam or self.is_cram:
                with nogil:
                    hdr = sam_hdr_read(self.htsfile)
                if hdr == NULL:
                    raise ValueError(
                        "file does not have a valid header (mode='%s') "
                        "- is it BAM/CRAM format?" % mode)
                self.header = makeAlignmentHeader(hdr)
            else:
                # in sam files a header is optional. If not given,
                # user may provide reference names and lengths to build
                # an on-the-fly header.
                if reference_names and reference_lengths:
                    # build header from target names and lengths
                    self.header = AlignmentHeader.from_references(
                        reference_names=reference_names,
                        reference_lengths=reference_lengths,
                        add_sq_text=add_sq_text,
                        text=text)
                else:
                    with nogil:
                        hdr = sam_hdr_read(self.htsfile)
                    if hdr == NULL:
                        raise ValueError(
                            "SAM? file does not have a valid header (mode='%s'), "
                            "please provide reference_names and reference_lengths")
                    self.header = makeAlignmentHeader(hdr)

            # set filename with reference sequences
            if self.is_cram and reference_filename:
                creference_filename = self.reference_filename
                hts_set_opt(self.htsfile,
                            CRAM_OPT_REFERENCE,
                            creference_filename)

            if check_sq and self.header.nreferences == 0:
                raise ValueError(
                    ("file has no sequences defined (mode='%s') - "
                     "is it SAM/BAM format? Consider opening with "
                     "check_sq=False") % mode)

            if self.is_bam or self.is_cram:
                self.index_filename = index_filename or filepath_index
                if self.index_filename:
                    cindexname = bfile_name = encode_filename(self.index_filename)

                if cfilename or cindexname:
                    with nogil:
                        self.index = sam_index_load3(self.htsfile, cfilename, cindexname,
                                                     HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL)

                    if not self.index and (cindexname or require_index):
                        if errno:
                            raise IOError(errno, force_str(strerror(errno)))
                        else:
                            raise IOError('unable to open index file `%s`' % self.index_filename)

                elif require_index:
                    raise IOError('unable to open index file')

        # save start of data section
        if not self.is_stream:
            self.start_offset = self.tell()

    def fetch(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              tid=None,
              until_eof=False,
              multiple_iterators=False,
              reference=None,
              end=None):
        """fetch reads aligned in a :term:`region`.

        See :meth:`~pysam.HTSFile.parse_region` for more information
        on how genomic regions can be specified. :term:`reference` and
        `end` are also accepted for backward compatibility as synonyms
        for :term:`contig` and `stop`, respectively.

        Without a `contig` or `region` all mapped reads in the file
        will be fetched. The reads will be returned ordered by reference
        sequence, which will not necessarily be the order within the
        file. This mode of iteration still requires an index. If there is
        no index, use `until_eof=True`.

        If only `contig` is set, all reads aligned to `contig`
        will be fetched.

        A :term:`SAM` file does not allow random access. If `region`
        or `contig` are given, an exception is raised.

        Parameters
        ----------

        until_eof : bool

            If `until_eof` is True, all reads from the current file
            position will be returned in order as they are within the
            file. Using this option will also fetch unmapped reads.

        multiple_iterators : bool

            If `multiple_iterators` is True, multiple
            iterators on the same file can be used at the same time. The
            iterator returned will receive its own copy of a filehandle to
            the file effectively re-opening the file. Re-opening a file
            creates some overhead, so beware.

        Returns
        -------

        An iterator over a collection of reads. : IteratorRow

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid or the
            file does not permit random access to genomic coordinates.

        """
        cdef int rtid, rstart, rstop, has_coord

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, tid,
            end=end, reference=reference)

        # Turn off re-opening if htsfile is a stream
        if self.is_stream:
            multiple_iterators = False

        if self.is_bam or self.is_cram:
            if not until_eof and not self.is_remote:
                if not self.has_index():
                    raise ValueError(
                        "fetch called on bamfile without index")

            if has_coord:
                return IteratorRowRegion(
                    self, rtid, rstart, rstop,
                    multiple_iterators=multiple_iterators)
            else:
                if until_eof:
                    return IteratorRowAll(
                        self,
                        multiple_iterators=multiple_iterators)
                else:
                    # AH: check - reason why no multiple_iterators for
                    # AllRefs?
                    return IteratorRowAllRefs(
                        self,
                        multiple_iterators=multiple_iterators)
        else:
            if has_coord:
                raise ValueError(
                    "fetching by region is not available for SAM files")

            if multiple_iterators == True:
                raise ValueError(
                    "multiple iterators not implemented for SAM files")

            return IteratorRowAll(self,
                                  multiple_iterators=multiple_iterators)

    def head(self, n, multiple_iterators=True):
        '''return an iterator over the first n alignments.

        This iterator is useful for inspecting the bam-file.

        Parameters
        ----------

        multiple_iterators : bool

            is set to True by default in order to
            avoid changing the current file position.

        Returns
        -------

        an iterator over a collection of reads : IteratorRowHead

        '''
        return IteratorRowHead(self, n,
                               multiple_iterators=multiple_iterators)

    def mate(self, AlignedSegment read):
        '''return the mate of :class:`pysam.AlignedSegment` `read`.

        .. note::

            Calling this method will change the file position.
            This might interfere with any iterators that have
            not re-opened the file.

        .. note::

            This method is too slow for high-throughput processing.
            If a read needs to be processed with its mate, work
            from a read name sorted file or, better, cache reads.

        Returns
        -------

        the mate : AlignedSegment

        Raises
        ------

        ValueError
            if the read is unpaired or the mate is unmapped

        '''
        cdef uint32_t flag = read._delegate.core.flag

        if flag & BAM_FPAIRED == 0:
            raise ValueError("read %s: is unpaired" %
                             (read.query_name))
        if flag & BAM_FMUNMAP != 0:
            raise ValueError("mate %s: is unmapped" %
                             (read.query_name))

        # xor flags to get the other mate
        cdef int x = BAM_FREAD1 + BAM_FREAD2
        flag = (flag ^ x) & x

        # Make sure to use a separate file to jump around
        # to mate as otherwise the original file position
        # will be lost
        # The following code is not using the C API and
        # could thus be made much quicker, for example
        # by using tell and seek.
        for mate in self.fetch(
                read._delegate.core.mpos,
                read._delegate.core.mpos + 1,
                tid=read._delegate.core.mtid,
                multiple_iterators=True):
            if mate.flag & flag != 0 and \
               mate.query_name == read.query_name:
                break
        else:
            raise ValueError("mate not found")

        return mate

    def pileup(self,
               contig=None,
               start=None,
               stop=None,
               region=None,
               reference=None,
               end=None,
               **kwargs):
        """perform a :term:`pileup` within a :term:`region`. The region is
        specified by :term:`contig`, `start` and `stop` (using
        0-based indexing). :term:`reference` and `end` are also accepted for
        backward compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a samtools 'region' string
        can be supplied.

        Without 'contig' or 'region' all reads will be used for the
        pileup. The reads will be returned ordered by
        :term:`contig` sequence, which will not necessarily be the
        order within the file.

        Note that :term:`SAM` formatted files do not allow random
        access. In these files, if a 'region' or 'contig' are
        given, an exception is raised.

        .. note::

            'all' reads which overlap the region are returned. The
            first base returned will be the first base of the first
            read 'not' necessarily the first base of the region used
            in the query.

        Parameters
        ----------

        truncate : bool

            By default, the samtools pileup engine outputs all reads
            overlapping a region. If truncate is True and a region is
            given, only columns in the exact region specified are
            returned.

        max_depth : int
            Maximum read depth permitted. The default limit is '8000'.

        stepper : string
            The stepper controls how the iterator advances.
            Possible options for the stepper are

            ``all``
                skip reads in which any of the following flags are set:
                BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP

            ``nofilter``
                uses every single read turning off any filtering.

            ``samtools``
                same filter and read processing as in samtools
                pileup. For full compatibility, this requires a
                'fastafile' to be given. The following options all pertain
                to filtering of the ``samtools`` stepper.

        fastafile : :class:`~pysam.FastaFile` object.

            This is required for some of the steppers.

        ignore_overlaps: bool

            If set to True, detect if read pairs overlap and only take
            the higher quality base. This is the default.

        flag_filter : int

            ignore reads where any of the bits in the flag are set. The default is
            BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP.

        flag_require : int

            only use reads where certain flags are set. The default is 0.

        ignore_orphans: bool

            ignore orphans (paired reads that are not in a proper pair).
            The default is to ignore orphans.

        min_base_quality: int

            Minimum base quality. Bases below the minimum quality will
            not be output. The default is 13.

        adjust_capq_threshold: int

            adjust mapping quality. The default is 0 for no
            adjustment. The recommended value for adjustment is 50.

        min_mapping_quality : int

            only use reads above a minimum mapping quality. The default is 0.

        compute_baq: bool

            re-alignment computing per-Base Alignment Qualities (BAQ). The
            default is to do re-alignment. Realignment requires a reference
            sequence. If none is present, no realignment will be performed.

        redo_baq: bool

            recompute per-Base Alignment Quality on the fly ignoring
            existing base qualities. The default is False (use existing
            base qualities).

        Returns
        -------

        an iterator over genomic positions. : IteratorColumn

        """
        cdef int rtid, has_coord
        cdef int32_t rstart, rstop

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, reference=reference, end=end)

        if has_coord:
            if not self.has_index():
                raise ValueError("no index available for pileup")

            return IteratorColumnRegion(self,
                                        tid=rtid,
                                        start=rstart,
                                        stop=rstop,
                                        **kwargs)
        else:
            if self.has_index():
                return IteratorColumnAllRefs(self, **kwargs)
            else:
                return IteratorColumnAll(self, **kwargs)

    def count(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              until_eof=False,
              read_callback="nofilter",
              reference=None,
              end=None):
        '''count the number of reads in :term:`region`

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively.
        Alternatively, a `samtools`_ :term:`region`
        string can be supplied.

        A :term:`SAM` file does not allow random access and if
        `region` or `contig` are given, an exception is raised.

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive)

        stop : int
            end of the genomic region (0-based exclusive)

        region : string
            a region string in samtools format.

        until_eof : bool
            count until the end of the file, possibly including
            unmapped reads as well.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        '''
        cdef AlignedSegment read
        cdef long counter = 0

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        cdef int filter_method = 0
        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        for read in self.fetch(contig=contig,
                               start=start,
                               stop=stop,
                               reference=reference,
                               end=end,
                               region=region,
                               until_eof=until_eof):
            # apply filter
            if filter_method == 1:
                # filter = "all"
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue
            counter += 1

        return counter

    @cython.boundscheck(False)  # we do manual bounds checking
    def count_coverage(self,
                       contig,
                       start=None,
                       stop=None,
                       region=None,
                       quality_threshold=15,
                       read_callback='all',
                       reference=None,
                       end=None):
        """count the coverage of genomic positions by reads in :term:`region`.

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a `samtools`_ :term:`region`
        string can be supplied. The coverage is computed per-base [ACGT].
jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: jpayne@68: contig : string jpayne@68: reference_name of the genomic region (chromosome) jpayne@68: jpayne@68: start : int jpayne@68: start of the genomic region (0-based inclusive). If not jpayne@68: given, count from the start of the chromosome. jpayne@68: jpayne@68: stop : int jpayne@68: end of the genomic region (0-based exclusive). If not given, jpayne@68: count to the end of the chromosome. jpayne@68: jpayne@68: region : string jpayne@68: a region string. jpayne@68: jpayne@68: quality_threshold : int jpayne@68: quality_threshold is the minimum quality score (in phred) a jpayne@68: base has to reach to be counted. jpayne@68: jpayne@68: read_callback: string or function jpayne@68: jpayne@68: select a call-back to ignore reads when counting. It can jpayne@68: be either a string with the following values: jpayne@68: jpayne@68: ``all`` jpayne@68: skip reads in which any of the following jpayne@68: flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, jpayne@68: BAM_FDUP jpayne@68: jpayne@68: ``nofilter`` jpayne@68: uses every single read jpayne@68: jpayne@68: Alternatively, `read_callback` can be a function jpayne@68: ``check_read(read)`` that should return True only for jpayne@68: those reads that shall be included in the counting. jpayne@68: jpayne@68: reference : string jpayne@68: backward compatible synonym for `contig` jpayne@68: jpayne@68: end : int jpayne@68: backward compatible synonym for `stop` jpayne@68: jpayne@68: Raises jpayne@68: ------ jpayne@68: jpayne@68: ValueError jpayne@68: if the genomic coordinates are out of range or invalid. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: jpayne@68: four array.arrays of the same length in order A C G T : tuple jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: cdef uint32_t contig_length = self.get_reference_length(contig) jpayne@68: cdef int _start = start if start is not None else 0 jpayne@68: cdef int _stop = stop if stop is not None else contig_length jpayne@68: _stop = _stop if _stop < contig_length else contig_length jpayne@68: jpayne@68: if _stop == _start: jpayne@68: raise ValueError("interval of size 0") jpayne@68: if _stop < _start: jpayne@68: raise ValueError("interval of size less than 0") jpayne@68: jpayne@68: cdef int length = _stop - _start jpayne@68: cdef c_array.array int_array_template = array.array('L', []) jpayne@68: cdef c_array.array count_a jpayne@68: cdef c_array.array count_c jpayne@68: cdef c_array.array count_g jpayne@68: cdef c_array.array count_t jpayne@68: count_a = c_array.clone(int_array_template, length, zero=True) jpayne@68: count_c = c_array.clone(int_array_template, length, zero=True) jpayne@68: count_g = c_array.clone(int_array_template, length, zero=True) jpayne@68: count_t = c_array.clone(int_array_template, length, zero=True) jpayne@68: jpayne@68: cdef AlignedSegment read jpayne@68: cdef cython.str seq jpayne@68: cdef c_array.array quality jpayne@68: cdef int qpos jpayne@68: cdef int refpos jpayne@68: cdef int c = 0 jpayne@68: cdef int filter_method = 0 jpayne@68: jpayne@68: jpayne@68: if read_callback == "all": jpayne@68: filter_method = 1 jpayne@68: elif read_callback == "nofilter": jpayne@68: filter_method = 2 jpayne@68: jpayne@68: cdef int _threshold = quality_threshold or 0 jpayne@68: for read in self.fetch(contig=contig, jpayne@68: reference=reference, jpayne@68: start=start, jpayne@68: stop=stop, jpayne@68: end=end, jpayne@68: region=region): jpayne@68: # apply filter jpayne@68: if filter_method == 1: 
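jpayne@68:                 # flag bits tested below: 0x4 = BAM_FUNMAP,
jpayne@68:                 # 0x100 = BAM_FSECONDARY, 0x200 = BAM_FQCFAIL, 0x400 = BAM_FDUP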
jpayne@68: # filter = "all" jpayne@68: if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)): jpayne@68: continue jpayne@68: elif filter_method == 2: jpayne@68: # filter = "nofilter" jpayne@68: pass jpayne@68: else: jpayne@68: if not read_callback(read): jpayne@68: continue jpayne@68: jpayne@68: # count jpayne@68: seq = read.seq jpayne@68: if seq is None: jpayne@68: continue jpayne@68: quality = read.query_qualities jpayne@68: jpayne@68: for qpos, refpos in read.get_aligned_pairs(True): jpayne@68: if qpos is not None and refpos is not None and \ jpayne@68: _start <= refpos < _stop: jpayne@68: jpayne@68: # only check base quality if _threshold > 0 jpayne@68: if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold: jpayne@68: if seq[qpos] == 'A': jpayne@68: count_a.data.as_ulongs[refpos - _start] += 1 jpayne@68: if seq[qpos] == 'C': jpayne@68: count_c.data.as_ulongs[refpos - _start] += 1 jpayne@68: if seq[qpos] == 'G': jpayne@68: count_g.data.as_ulongs[refpos - _start] += 1 jpayne@68: if seq[qpos] == 'T': jpayne@68: count_t.data.as_ulongs[refpos - _start] += 1 jpayne@68: jpayne@68: return count_a, count_c, count_g, count_t jpayne@68: jpayne@68: def find_introns_slow(self, read_iterator): jpayne@68: """Return a dictionary {(start, stop): count} jpayne@68: Listing the intronic sites in the reads (identified by 'N' in the cigar strings), jpayne@68: and their support ( = number of reads ). jpayne@68: jpayne@68: read_iterator can be the result of a .fetch(...) call. jpayne@68: Or it can be a generator filtering such reads. Example jpayne@68: samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) jpayne@68: """ jpayne@68: res = collections.Counter() jpayne@68: for r in read_iterator: jpayne@68: if 'N' in r.cigarstring: jpayne@68: last_read_pos = False jpayne@68: for read_loc, genome_loc in r.get_aligned_pairs(): jpayne@68: if read_loc is None and last_read_pos: jpayne@68: start = genome_loc jpayne@68: elif read_loc and last_read_pos is None: jpayne@68: stop = genome_loc # we are right exclusive ,so this is correct jpayne@68: res[(start, stop)] += 1 jpayne@68: del start jpayne@68: del stop jpayne@68: last_read_pos = read_loc jpayne@68: return res jpayne@68: jpayne@68: def find_introns(self, read_iterator): jpayne@68: """Return a dictionary {(start, stop): count} jpayne@68: Listing the intronic sites in the reads (identified by 'N' in the cigar strings), jpayne@68: and their support ( = number of reads ). jpayne@68: jpayne@68: read_iterator can be the result of a .fetch(...) call. jpayne@68: Or it can be a generator filtering such reads. Example jpayne@68: samfile.find_introns((read for read in samfile.fetch(...) 
if read.is_reverse) jpayne@68: """ jpayne@68: cdef: jpayne@68: uint32_t base_position, junc_start, nt jpayne@68: int op jpayne@68: AlignedSegment r jpayne@68: int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP jpayne@68: jpayne@68: res = collections.Counter() jpayne@68: jpayne@68: match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position jpayne@68: for r in read_iterator: jpayne@68: base_position = r.pos jpayne@68: cigar = r.cigartuples jpayne@68: if cigar is None: jpayne@68: continue jpayne@68: jpayne@68: for op, nt in cigar: jpayne@68: if op in match_or_deletion: jpayne@68: base_position += nt jpayne@68: elif op == BAM_CREF_SKIP: jpayne@68: junc_start = base_position jpayne@68: base_position += nt jpayne@68: res[(junc_start, base_position)] += 1 jpayne@68: return res jpayne@68: jpayne@68: jpayne@68: def close(self): jpayne@68: '''closes the :class:`pysam.AlignmentFile`.''' jpayne@68: jpayne@68: if self.htsfile == NULL: jpayne@68: return jpayne@68: jpayne@68: if self.index != NULL: jpayne@68: hts_idx_destroy(self.index) jpayne@68: self.index = NULL jpayne@68: jpayne@68: cdef int ret = hts_close(self.htsfile) jpayne@68: self.htsfile = NULL jpayne@68: jpayne@68: self.header = None jpayne@68: jpayne@68: if ret < 0: jpayne@68: global errno jpayne@68: if errno == EPIPE: jpayne@68: errno = 0 jpayne@68: else: jpayne@68: raise IOError(errno, force_str(strerror(errno))) jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: cdef int ret = 0 jpayne@68: jpayne@68: if self.index != NULL: jpayne@68: hts_idx_destroy(self.index) jpayne@68: self.index = NULL jpayne@68: jpayne@68: if self.htsfile != NULL: jpayne@68: ret = hts_close(self.htsfile) jpayne@68: self.htsfile = NULL jpayne@68: jpayne@68: self.header = None jpayne@68: jpayne@68: if self.b: jpayne@68: bam_destroy1(self.b) jpayne@68: self.b = NULL jpayne@68: jpayne@68: if ret < 0: jpayne@68: global errno jpayne@68: if errno == EPIPE: jpayne@68: errno = 0 jpayne@68: else: jpayne@68: raise IOError(errno, force_str(strerror(errno))) jpayne@68: jpayne@68: cpdef int write(self, AlignedSegment read) except -1: jpayne@68: ''' jpayne@68: write a single :class:`pysam.AlignedSegment` to disk. jpayne@68: jpayne@68: Raises: jpayne@68: ValueError jpayne@68: if the writing failed jpayne@68: jpayne@68: Returns: jpayne@68: int : jpayne@68: the number of bytes written. If the file is closed, jpayne@68: this will be 0. jpayne@68: ''' jpayne@68: if not self.is_open: jpayne@68: return 0 jpayne@68: jpayne@68: if self.header.ptr.n_targets <= read._delegate.core.tid: jpayne@68: raise ValueError( jpayne@68: "AlignedSegment refers to reference number {} that " jpayne@68: "is larger than the number of references ({}) in the header".format( jpayne@68: read._delegate.core.tid, self.header.ptr.n_targets)) jpayne@68: jpayne@68: cdef int ret jpayne@68: with nogil: jpayne@68: ret = sam_write1(self.htsfile, jpayne@68: self.header.ptr, jpayne@68: read._delegate) jpayne@68: jpayne@68: # kbj: Still need to raise an exception with except -1. Otherwise jpayne@68: # when ret == -1 we get a "SystemError: error return without jpayne@68: # exception set". 
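jpayne@68:         # As documented above, a non-negative return value from sam_write1
jpayne@68:         # is the number of bytes written; a negative value signals an error.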
jpayne@68: if ret < 0: jpayne@68: raise IOError( jpayne@68: "sam_write1 failed with error code {}".format(ret)) jpayne@68: jpayne@68: return ret jpayne@68: jpayne@68: # context manager interface jpayne@68: def __enter__(self): jpayne@68: return self jpayne@68: jpayne@68: def __exit__(self, exc_type, exc_value, traceback): jpayne@68: self.close() jpayne@68: return False jpayne@68: jpayne@68: ############################################################### jpayne@68: ############################################################### jpayne@68: ############################################################### jpayne@68: ## properties jpayne@68: ############################################################### jpayne@68: property mapped: jpayne@68: """int with total number of mapped alignments according to the jpayne@68: statistics recorded in the index. This is a read-only jpayne@68: attribute. jpayne@68: (This will be 0 for a CRAM file indexed by a .crai index, as that jpayne@68: index format does not record these statistics.) jpayne@68: """ jpayne@68: def __get__(self): jpayne@68: self.check_index() jpayne@68: cdef int tid jpayne@68: cdef uint64_t total = 0 jpayne@68: cdef uint64_t mapped, unmapped jpayne@68: for tid from 0 <= tid < self.header.nreferences: jpayne@68: with nogil: jpayne@68: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) jpayne@68: total += mapped jpayne@68: return total jpayne@68: jpayne@68: property unmapped: jpayne@68: """int with total number of unmapped reads according to the statistics jpayne@68: recorded in the index. This number of reads includes the number of reads jpayne@68: without coordinates. This is a read-only attribute. jpayne@68: (This will be 0 for a CRAM file indexed by a .crai index, as that jpayne@68: index format does not record these statistics.) jpayne@68: """ jpayne@68: def __get__(self): jpayne@68: self.check_index() jpayne@68: cdef int tid jpayne@68: cdef uint64_t total = hts_idx_get_n_no_coor(self.index) jpayne@68: cdef uint64_t mapped, unmapped jpayne@68: for tid from 0 <= tid < self.header.nreferences: jpayne@68: with nogil: jpayne@68: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) jpayne@68: total += unmapped jpayne@68: return total jpayne@68: jpayne@68: property nocoordinate: jpayne@68: """int with total number of reads without coordinates according to the jpayne@68: statistics recorded in the index, i.e., the statistic printed for "*" jpayne@68: by the ``samtools idxstats`` command. This is a read-only attribute. jpayne@68: (This will be 0 for a CRAM file indexed by a .crai index, as that jpayne@68: index format does not record these statistics.) jpayne@68: """ jpayne@68: def __get__(self): jpayne@68: self.check_index() jpayne@68: cdef uint64_t n jpayne@68: with nogil: jpayne@68: n = hts_idx_get_n_no_coor(self.index) jpayne@68: return n jpayne@68: jpayne@68: def get_index_statistics(self): jpayne@68: """return statistics about mapped/unmapped reads per chromosome as jpayne@68: they are stored in the index, similarly to the statistics printed jpayne@68: by the ``samtools idxstats`` command. jpayne@68: jpayne@68: CRAI indexes do not record these statistics, so for a CRAM file jpayne@68: with a .crai index the returned statistics will all be 0. jpayne@68: jpayne@68: Returns: jpayne@68: list : jpayne@68: a list of records for each chromosome. Each record has the jpayne@68: attributes 'contig', 'mapped', 'unmapped' and 'total'. 
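jpayne@68:
jpayne@68:         For example (the file name is illustrative only)::
jpayne@68:
jpayne@68:             with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
jpayne@68:                 for stat in samfile.get_index_statistics():
jpayne@68:                     print(stat.contig, stat.mapped, stat.unmapped, stat.total)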
jpayne@68: """ jpayne@68: jpayne@68: self.check_index() jpayne@68: cdef int tid jpayne@68: cdef uint64_t mapped, unmapped jpayne@68: results = [] jpayne@68: # TODO: use header jpayne@68: for tid from 0 <= tid < self.nreferences: jpayne@68: with nogil: jpayne@68: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) jpayne@68: results.append( jpayne@68: IndexStats._make(( jpayne@68: self.get_reference_name(tid), jpayne@68: mapped, jpayne@68: unmapped, jpayne@68: mapped + unmapped))) jpayne@68: jpayne@68: return results jpayne@68: jpayne@68: ############################################################### jpayne@68: ## file-object like iterator access jpayne@68: ## note: concurrent access will cause errors (see IteratorRow jpayne@68: ## and multiple_iterators) jpayne@68: ## Possible solutions: deprecate or open new file handle jpayne@68: def __iter__(self): jpayne@68: if not self.is_open: jpayne@68: raise ValueError("I/O operation on closed file") jpayne@68: jpayne@68: if not self.is_bam and self.header.nreferences == 0: jpayne@68: raise NotImplementedError( jpayne@68: "can not iterate over samfile without header") jpayne@68: return self jpayne@68: jpayne@68: cdef bam1_t * getCurrent(self): jpayne@68: return self.b jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: ''' jpayne@68: cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`. jpayne@68: ''' jpayne@68: cdef int ret jpayne@68: cdef bam_hdr_t * hdr = self.header.ptr jpayne@68: with nogil: jpayne@68: ret = sam_read1(self.htsfile, jpayne@68: hdr, jpayne@68: self.b) jpayne@68: return ret jpayne@68: jpayne@68: def __next__(self): jpayne@68: cdef int ret = self.cnext() jpayne@68: if ret >= 0: jpayne@68: return makeAlignedSegment(self.b, self.header) jpayne@68: elif ret == -1: jpayne@68: raise StopIteration jpayne@68: else: jpayne@68: raise IOError(read_failure_reason(ret)) jpayne@68: jpayne@68: ########################################### jpayne@68: # methods/properties referencing the header jpayne@68: def is_valid_tid(self, int tid): jpayne@68: """ jpayne@68: return True if the numerical :term:`tid` is valid; False otherwise. jpayne@68: jpayne@68: Note that the unmapped tid code (-1) counts as an invalid. jpayne@68: """ jpayne@68: if self.header is None: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: return self.header.is_valid_tid(tid) jpayne@68: jpayne@68: def get_tid(self, reference): jpayne@68: """ jpayne@68: return the numerical :term:`tid` corresponding to jpayne@68: :term:`reference` jpayne@68: jpayne@68: returns -1 if reference is not known. jpayne@68: """ jpayne@68: if self.header is None: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: return self.header.get_tid(reference) jpayne@68: jpayne@68: def get_reference_name(self, tid): jpayne@68: """ jpayne@68: return :term:`reference` name corresponding to numerical :term:`tid` jpayne@68: """ jpayne@68: if self.header is None: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: return self.header.get_reference_name(tid) jpayne@68: jpayne@68: def get_reference_length(self, reference): jpayne@68: """ jpayne@68: return :term:`reference` length corresponding to numerical :term:`tid` jpayne@68: """ jpayne@68: if self.header is None: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: return self.header.get_reference_length(reference) jpayne@68: jpayne@68: property nreferences: jpayne@68: """int with the number of :term:`reference` sequences in the file. 
jpayne@68: This is a read-only attribute.""" jpayne@68: def __get__(self): jpayne@68: if self.header: jpayne@68: return self.header.nreferences jpayne@68: else: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: jpayne@68: property references: jpayne@68: """tuple with the names of :term:`reference` sequences. This is a jpayne@68: read-only attribute""" jpayne@68: def __get__(self): jpayne@68: if self.header: jpayne@68: return self.header.references jpayne@68: else: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: jpayne@68: property lengths: jpayne@68: """tuple of the lengths of the :term:`reference` sequences. This is a jpayne@68: read-only attribute. The lengths are in the same order as jpayne@68: :attr:`pysam.AlignmentFile.references` jpayne@68: jpayne@68: """ jpayne@68: def __get__(self): jpayne@68: if self.header: jpayne@68: return self.header.lengths jpayne@68: else: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: jpayne@68: # Compatibility functions for pysam < 0.14 jpayne@68: property text: jpayne@68: """deprecated, use :attr:`references` and :attr:`lengths` instead""" jpayne@68: def __get__(self): jpayne@68: if self.header: jpayne@68: return self.header.__str__() jpayne@68: else: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: jpayne@68: # Compatibility functions for pysam < 0.8.3 jpayne@68: def gettid(self, reference): jpayne@68: """deprecated, use :meth:`get_tid` instead""" jpayne@68: return self.get_tid(reference) jpayne@68: jpayne@68: def getrname(self, tid): jpayne@68: """deprecated, use :meth:`get_reference_name` instead""" jpayne@68: return self.get_reference_name(tid) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRow: jpayne@68: '''abstract base class for iterators over mapped reads. jpayne@68: jpayne@68: Various iterators implement different behaviours for wrapping around jpayne@68: contig boundaries. Examples include: jpayne@68: jpayne@68: :class:`pysam.IteratorRowRegion` jpayne@68: iterate within a single contig and a defined region. jpayne@68: jpayne@68: :class:`pysam.IteratorRowAll` jpayne@68: iterate until EOF. This iterator will also include unmapped reads. jpayne@68: jpayne@68: :class:`pysam.IteratorRowAllRefs` jpayne@68: iterate over all reads in all reference sequences. jpayne@68: jpayne@68: The method :meth:`AlignmentFile.fetch` returns an IteratorRow. jpayne@68: jpayne@68: .. note:: jpayne@68: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a jpayne@68: :meth:`AlignmentFile.fetch`. jpayne@68: jpayne@68: ''' jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, int multiple_iterators=False): jpayne@68: cdef char *cfilename jpayne@68: cdef char *creference_filename jpayne@68: cdef char *cindexname = NULL jpayne@68: jpayne@68: if not samfile.is_open: jpayne@68: raise ValueError("I/O operation on closed file") jpayne@68: jpayne@68: # makes sure that samfile stays alive as long as the jpayne@68: # iterator is alive jpayne@68: self.samfile = samfile jpayne@68: jpayne@68: # reopen the file - note that this makes the iterator jpayne@68: # slow and causes pileup to slow down significantly. 
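jpayne@68:         # Re-opening gives this iterator its own htsfile handle and file
jpayne@68:         # position, so that it does not interfere with other iterators or
jpayne@68:         # reads on the same AlignmentFile (see multiple_iterators).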
jpayne@68: if multiple_iterators: jpayne@68: jpayne@68: cfilename = samfile.filename jpayne@68: with nogil: jpayne@68: self.htsfile = hts_open(cfilename, 'r') jpayne@68: assert self.htsfile != NULL jpayne@68: jpayne@68: if samfile.has_index(): jpayne@68: if samfile.index_filename: jpayne@68: cindexname = bindex_filename = encode_filename(samfile.index_filename) jpayne@68: with nogil: jpayne@68: self.index = sam_index_load2(self.htsfile, cfilename, cindexname) jpayne@68: else: jpayne@68: self.index = NULL jpayne@68: jpayne@68: # need to advance in newly opened file to position after header jpayne@68: # better: use seek/tell? jpayne@68: with nogil: jpayne@68: hdr = sam_hdr_read(self.htsfile) jpayne@68: if hdr is NULL: jpayne@68: raise IOError("unable to read header information") jpayne@68: self.header = makeAlignmentHeader(hdr) jpayne@68: jpayne@68: self.owns_samfile = True jpayne@68: jpayne@68: # options specific to CRAM files jpayne@68: if samfile.is_cram and samfile.reference_filename: jpayne@68: creference_filename = samfile.reference_filename jpayne@68: hts_set_opt(self.htsfile, jpayne@68: CRAM_OPT_REFERENCE, jpayne@68: creference_filename) jpayne@68: jpayne@68: else: jpayne@68: self.htsfile = samfile.htsfile jpayne@68: self.index = samfile.index jpayne@68: self.owns_samfile = False jpayne@68: self.header = samfile.header jpayne@68: jpayne@68: self.retval = 0 jpayne@68: jpayne@68: self.b = bam_init1() jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: bam_destroy1(self.b) jpayne@68: if self.owns_samfile: jpayne@68: hts_idx_destroy(self.index) jpayne@68: hts_close(self.htsfile) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRowRegion(IteratorRow): jpayne@68: """*(AlignmentFile samfile, int tid, int beg, int stop, jpayne@68: int multiple_iterators=False)* jpayne@68: jpayne@68: iterate over mapped reads in a region. jpayne@68: jpayne@68: .. note:: jpayne@68: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a jpayne@68: :meth:`AlignmentFile.fetch`. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, jpayne@68: int tid, int beg, int stop, jpayne@68: int multiple_iterators=False): jpayne@68: jpayne@68: if not samfile.has_index(): jpayne@68: raise ValueError("no index available for iteration") jpayne@68: jpayne@68: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@68: with nogil: jpayne@68: self.iter = sam_itr_queryi( jpayne@68: self.index, jpayne@68: tid, jpayne@68: beg, jpayne@68: stop) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: cdef bam1_t * getCurrent(self): jpayne@68: return self.b jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: '''cversion of iterator. Used by IteratorColumn''' jpayne@68: with nogil: jpayne@68: self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile), jpayne@68: self.iter, jpayne@68: self.b, jpayne@68: self.htsfile) jpayne@68: jpayne@68: def __next__(self): jpayne@68: self.cnext() jpayne@68: if self.retval >= 0: jpayne@68: return makeAlignedSegment(self.b, self.header) jpayne@68: elif self.retval == -1: jpayne@68: raise StopIteration jpayne@68: elif self.retval == -2: jpayne@68: # Note: it is currently not the case that hts_iter_next jpayne@68: # returns -2 for a truncated file. 
jpayne@68: # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625 jpayne@68: raise IOError('truncated file') jpayne@68: else: jpayne@68: raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval)) jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: hts_itr_destroy(self.iter) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRowHead(IteratorRow): jpayne@68: """*(AlignmentFile samfile, n, int multiple_iterators=False)* jpayne@68: jpayne@68: iterate over first n reads in `samfile` jpayne@68: jpayne@68: .. note:: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a jpayne@68: :meth:`AlignmentFile.head`. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, jpayne@68: AlignmentFile samfile, jpayne@68: int n, jpayne@68: int multiple_iterators=False): jpayne@68: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@68: jpayne@68: self.max_rows = n jpayne@68: self.current_row = 0 jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: cdef bam1_t * getCurrent(self): jpayne@68: return self.b jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: '''cversion of iterator. Used by IteratorColumn''' jpayne@68: cdef int ret jpayne@68: cdef bam_hdr_t * hdr = self.header.ptr jpayne@68: with nogil: jpayne@68: ret = sam_read1(self.htsfile, jpayne@68: hdr, jpayne@68: self.b) jpayne@68: return ret jpayne@68: jpayne@68: def __next__(self): jpayne@68: if self.current_row >= self.max_rows: jpayne@68: raise StopIteration jpayne@68: jpayne@68: cdef int ret = self.cnext() jpayne@68: if ret >= 0: jpayne@68: self.current_row += 1 jpayne@68: return makeAlignedSegment(self.b, self.header) jpayne@68: elif ret == -1: jpayne@68: raise StopIteration jpayne@68: else: jpayne@68: raise IOError(read_failure_reason(ret)) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRowAll(IteratorRow): jpayne@68: """*(AlignmentFile samfile, int multiple_iterators=False)* jpayne@68: jpayne@68: iterate over all reads in `samfile` jpayne@68: jpayne@68: .. note:: jpayne@68: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a jpayne@68: :meth:`AlignmentFile.fetch`. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, int multiple_iterators=False): jpayne@68: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: cdef bam1_t * getCurrent(self): jpayne@68: return self.b jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: '''cversion of iterator. Used by IteratorColumn''' jpayne@68: cdef int ret jpayne@68: cdef bam_hdr_t * hdr = self.header.ptr jpayne@68: with nogil: jpayne@68: ret = sam_read1(self.htsfile, jpayne@68: hdr, jpayne@68: self.b) jpayne@68: return ret jpayne@68: jpayne@68: def __next__(self): jpayne@68: cdef int ret = self.cnext() jpayne@68: if ret >= 0: jpayne@68: return makeAlignedSegment(self.b, self.header) jpayne@68: elif ret == -1: jpayne@68: raise StopIteration jpayne@68: else: jpayne@68: raise IOError(read_failure_reason(ret)) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRowAllRefs(IteratorRow): jpayne@68: """iterates over all mapped reads by chaining iterators over each jpayne@68: reference jpayne@68: jpayne@68: .. 
note:: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a jpayne@68: :meth:`AlignmentFile.fetch`. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, multiple_iterators=False): jpayne@68: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@68: jpayne@68: if not samfile.has_index(): jpayne@68: raise ValueError("no index available for fetch") jpayne@68: jpayne@68: self.tid = -1 jpayne@68: jpayne@68: def nextiter(self): jpayne@68: # get a new iterator for a chromosome. The file jpayne@68: # will not be re-opened. jpayne@68: self.rowiter = IteratorRowRegion(self.samfile, jpayne@68: self.tid, jpayne@68: 0, jpayne@68: MAX_POS) jpayne@68: # set htsfile and header of the rowiter jpayne@68: # to the values in this iterator to reflect multiple_iterators jpayne@68: self.rowiter.htsfile = self.htsfile jpayne@68: self.rowiter.header = self.header jpayne@68: jpayne@68: # make sure the iterator understand that IteratorRowAllRefs jpayne@68: # has ownership jpayne@68: self.rowiter.owns_samfile = False jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: def __next__(self): jpayne@68: # Create an initial iterator jpayne@68: if self.tid == -1: jpayne@68: if not self.samfile.nreferences: jpayne@68: raise StopIteration jpayne@68: self.tid = 0 jpayne@68: self.nextiter() jpayne@68: jpayne@68: while 1: jpayne@68: self.rowiter.cnext() jpayne@68: jpayne@68: # If current iterator is not exhausted, return aligned read jpayne@68: if self.rowiter.retval > 0: jpayne@68: return makeAlignedSegment(self.rowiter.b, self.header) jpayne@68: jpayne@68: self.tid += 1 jpayne@68: jpayne@68: # Otherwise, proceed to next reference or stop jpayne@68: if self.tid < self.samfile.nreferences: jpayne@68: self.nextiter() jpayne@68: else: jpayne@68: raise StopIteration jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRowSelection(IteratorRow): jpayne@68: """*(AlignmentFile samfile)* jpayne@68: jpayne@68: iterate over reads in `samfile` at a given list of file positions. jpayne@68: jpayne@68: .. note:: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`. 
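jpayne@68:
jpayne@68:     It is also the iterator type returned by :meth:`IndexedReads.find`.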
jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True): jpayne@68: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@68: jpayne@68: self.positions = positions jpayne@68: self.current_pos = 0 jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: cdef bam1_t * getCurrent(self): jpayne@68: return self.b jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: '''cversion of iterator''' jpayne@68: # end iteration if out of positions jpayne@68: if self.current_pos >= len(self.positions): return -1 jpayne@68: jpayne@68: cdef uint64_t pos = self.positions[self.current_pos] jpayne@68: with nogil: jpayne@68: bgzf_seek(hts_get_bgzfp(self.htsfile), jpayne@68: pos, jpayne@68: 0) jpayne@68: self.current_pos += 1 jpayne@68: jpayne@68: cdef int ret jpayne@68: cdef bam_hdr_t * hdr = self.header.ptr jpayne@68: with nogil: jpayne@68: ret = sam_read1(self.htsfile, jpayne@68: hdr, jpayne@68: self.b) jpayne@68: return ret jpayne@68: jpayne@68: def __next__(self): jpayne@68: cdef int ret = self.cnext() jpayne@68: if ret >= 0: jpayne@68: return makeAlignedSegment(self.b, self.header) jpayne@68: elif ret == -1: jpayne@68: raise StopIteration jpayne@68: else: jpayne@68: raise IOError(read_failure_reason(ret)) jpayne@68: jpayne@68: jpayne@68: cdef int __advance_nofilter(void *data, bam1_t *b): jpayne@68: '''advance without any read filtering. jpayne@68: ''' jpayne@68: cdef __iterdata * d = <__iterdata*>data jpayne@68: cdef int ret jpayne@68: with nogil: jpayne@68: ret = sam_itr_next(d.htsfile, d.iter, b) jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef int __advance_raw_nofilter(void *data, bam1_t *b): jpayne@68: '''advance (without iterator) without any read filtering. jpayne@68: ''' jpayne@68: cdef __iterdata * d = <__iterdata*>data jpayne@68: cdef int ret jpayne@68: with nogil: jpayne@68: ret = sam_read1(d.htsfile, d.header, b) jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef int __advance_all(void *data, bam1_t *b): jpayne@68: '''only use reads for pileup passing basic filters such as jpayne@68: jpayne@68: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP jpayne@68: ''' jpayne@68: jpayne@68: cdef __iterdata * d = <__iterdata*>data jpayne@68: cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP jpayne@68: cdef int ret jpayne@68: while 1: jpayne@68: with nogil: jpayne@68: ret = sam_itr_next(d.htsfile, d.iter, b) jpayne@68: if ret < 0: jpayne@68: break jpayne@68: if b.core.flag & d.flag_filter: jpayne@68: continue jpayne@68: break jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef int __advance_raw_all(void *data, bam1_t *b): jpayne@68: '''only use reads for pileup passing basic filters such as jpayne@68: jpayne@68: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP jpayne@68: ''' jpayne@68: jpayne@68: cdef __iterdata * d = <__iterdata*>data jpayne@68: cdef int ret jpayne@68: while 1: jpayne@68: with nogil: jpayne@68: ret = sam_read1(d.htsfile, d.header, b) jpayne@68: if ret < 0: jpayne@68: break jpayne@68: if b.core.flag & d.flag_filter: jpayne@68: continue jpayne@68: break jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef int __advance_samtools(void * data, bam1_t * b): jpayne@68: '''advance using same filter and read processing as in jpayne@68: the samtools pileup. 
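jpayne@68:
jpayne@68:     In outline: reads failing flag_filter/flag_require are skipped; if a
jpayne@68:     reference sequence is available, reads may be BAQ-realigned and their
jpayne@68:     mapping quality capped (adjust_capq_threshold); reads below
jpayne@68:     min_mapping_quality and, if ignore_orphans is set, orphan reads are
jpayne@68:     skipped as well.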
jpayne@68: ''' jpayne@68: cdef __iterdata * d = <__iterdata*>data jpayne@68: cdef int ret jpayne@68: cdef int q jpayne@68: jpayne@68: while 1: jpayne@68: with nogil: jpayne@68: ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b) jpayne@68: if ret < 0: jpayne@68: break jpayne@68: if b.core.flag & d.flag_filter: jpayne@68: continue jpayne@68: if d.flag_require and not (b.core.flag & d.flag_require): jpayne@68: continue jpayne@68: jpayne@68: # reload sequence jpayne@68: if d.fastafile != NULL and b.core.tid != d.tid: jpayne@68: if d.seq != NULL: jpayne@68: free(d.seq) jpayne@68: d.tid = b.core.tid jpayne@68: with nogil: jpayne@68: d.seq = faidx_fetch_seq( jpayne@68: d.fastafile, jpayne@68: d.header.target_name[d.tid], jpayne@68: 0, MAX_POS, jpayne@68: &d.seq_len) jpayne@68: jpayne@68: if d.seq == NULL: jpayne@68: raise ValueError( jpayne@68: "reference sequence for '{}' (tid={}) not found".format( jpayne@68: d.header.target_name[d.tid], d.tid)) jpayne@68: jpayne@68: # realign read - changes base qualities jpayne@68: if d.seq != NULL and d.compute_baq: jpayne@68: # 4th option to realign is flag: jpayne@68: # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4 jpayne@68: if d.redo_baq: jpayne@68: sam_prob_realn(b, d.seq, d.seq_len, 7) jpayne@68: else: jpayne@68: sam_prob_realn(b, d.seq, d.seq_len, 3) jpayne@68: jpayne@68: if d.seq != NULL and d.adjust_capq_threshold > 10: jpayne@68: q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold) jpayne@68: if q < 0: jpayne@68: continue jpayne@68: elif b.core.qual > q: jpayne@68: b.core.qual = q jpayne@68: jpayne@68: if b.core.qual < d.min_mapping_quality: jpayne@68: continue jpayne@68: if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR): jpayne@68: continue jpayne@68: jpayne@68: break jpayne@68: jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef class IteratorColumn: jpayne@68: '''abstract base class for iterators over columns. jpayne@68: jpayne@68: IteratorColumn objects wrap the pileup functionality of samtools. jpayne@68: jpayne@68: For reasons of efficiency, the iterator points to the current jpayne@68: pileup buffer. The pileup buffer is updated at every iteration. jpayne@68: This might cause some unexpected behaviour. For example, jpayne@68: consider the conversion to a list:: jpayne@68: jpayne@68: f = AlignmentFile("file.bam", "rb") jpayne@68: result = list(f.pileup()) jpayne@68: jpayne@68: Here, ``result`` will contain ``n`` objects of type jpayne@68: :class:`~pysam.PileupColumn` for ``n`` columns, but each object in jpayne@68: ``result`` will contain the same information. jpayne@68: jpayne@68: The desired behaviour can be achieved by list comprehension:: jpayne@68: jpayne@68: result = [x.pileups() for x in f.pileup()] jpayne@68: jpayne@68: ``result`` will be a list of ``n`` lists of objects of type jpayne@68: :class:`~pysam.PileupRead`. jpayne@68: jpayne@68: If the iterator is associated with a :class:`~pysam.Fastafile` jpayne@68: using the :meth:`add_reference` method, then the iterator will jpayne@68: export the current sequence via the methods :meth:`get_sequence` jpayne@68: and :meth:`seq_len`. jpayne@68: jpayne@68: See :class:`~AlignmentFile.pileup` for kwargs to the iterator. 
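jpayne@68:
jpayne@68:     A typical iteration over the columns of a region, using the file ``f``
jpayne@68:     opened above (contig name and coordinates are illustrative only)::
jpayne@68:
jpayne@68:         for column in f.pileup("chr1", 100, 200):
jpayne@68:             print(column.reference_pos, column.nsegments)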
jpayne@68: ''' jpayne@68: jpayne@68: def __cinit__( self, AlignmentFile samfile, **kwargs): jpayne@68: self.samfile = samfile jpayne@68: self.fastafile = kwargs.get("fastafile", None) jpayne@68: self.stepper = kwargs.get("stepper", "samtools") jpayne@68: self.max_depth = kwargs.get("max_depth", 8000) jpayne@68: self.ignore_overlaps = kwargs.get("ignore_overlaps", True) jpayne@68: self.min_base_quality = kwargs.get("min_base_quality", 13) jpayne@68: self.iterdata.seq = NULL jpayne@68: self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0) jpayne@68: self.iterdata.flag_require = kwargs.get("flag_require", 0) jpayne@68: self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) jpayne@68: self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0) jpayne@68: self.iterdata.compute_baq = kwargs.get("compute_baq", True) jpayne@68: self.iterdata.redo_baq = kwargs.get("redo_baq", False) jpayne@68: self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True) jpayne@68: jpayne@68: self.tid = 0 jpayne@68: self.pos = 0 jpayne@68: self.n_plp = 0 jpayne@68: self.plp = NULL jpayne@68: self.pileup_iter = NULL jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: '''perform next iteration. jpayne@68: ''' jpayne@68: # do not release gil here because of call-backs jpayne@68: cdef int ret = bam_mplp_auto(self.pileup_iter, jpayne@68: &self.tid, jpayne@68: &self.pos, jpayne@68: &self.n_plp, jpayne@68: &self.plp) jpayne@68: return ret jpayne@68: jpayne@68: cdef char * get_sequence(self): jpayne@68: '''return current reference sequence underlying the iterator. jpayne@68: ''' jpayne@68: return self.iterdata.seq jpayne@68: jpayne@68: property seq_len: jpayne@68: '''current sequence length.''' jpayne@68: def __get__(self): jpayne@68: return self.iterdata.seq_len jpayne@68: jpayne@68: def add_reference(self, FastaFile fastafile): jpayne@68: ''' jpayne@68: add reference sequences in `fastafile` to iterator.''' jpayne@68: self.fastafile = fastafile jpayne@68: if self.iterdata.seq != NULL: jpayne@68: free(self.iterdata.seq) jpayne@68: self.iterdata.tid = -1 jpayne@68: self.iterdata.fastafile = self.fastafile.fastafile jpayne@68: jpayne@68: def has_reference(self): jpayne@68: ''' jpayne@68: return true if iterator is associated with a reference''' jpayne@68: return self.fastafile jpayne@68: jpayne@68: cdef _setup_iterator(self, jpayne@68: int tid, jpayne@68: int start, jpayne@68: int stop, jpayne@68: int multiple_iterators=0): jpayne@68: '''setup the iterator structure''' jpayne@68: jpayne@68: self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators) jpayne@68: self.iterdata.htsfile = self.samfile.htsfile jpayne@68: self.iterdata.iter = self.iter.iter jpayne@68: self.iterdata.seq = NULL jpayne@68: self.iterdata.tid = -1 jpayne@68: self.iterdata.header = self.samfile.header.ptr jpayne@68: jpayne@68: if self.fastafile is not None: jpayne@68: self.iterdata.fastafile = self.fastafile.fastafile jpayne@68: else: jpayne@68: self.iterdata.fastafile = NULL jpayne@68: jpayne@68: # Free any previously allocated memory before reassigning jpayne@68: # pileup_iter jpayne@68: self._free_pileup_iter() jpayne@68: jpayne@68: cdef void * data[1] jpayne@68: data[0] = &self.iterdata jpayne@68: jpayne@68: if self.stepper is None or self.stepper == "all": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_all, jpayne@68: data) 
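jpayne@68:         # The remaining steppers: "nofilter" passes every read straight
jpayne@68:         # through (__advance_nofilter), while "samtools" applies the full
jpayne@68:         # samtools-style read processing (__advance_samtools).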
jpayne@68: elif self.stepper == "nofilter": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_nofilter, jpayne@68: data) jpayne@68: elif self.stepper == "samtools": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_samtools, jpayne@68: data) jpayne@68: else: jpayne@68: raise ValueError( jpayne@68: "unknown stepper option `%s` in IteratorColumn" % self.stepper) jpayne@68: jpayne@68: if self.max_depth: jpayne@68: with nogil: jpayne@68: bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth) jpayne@68: jpayne@68: if self.ignore_overlaps: jpayne@68: with nogil: jpayne@68: bam_mplp_init_overlaps(self.pileup_iter) jpayne@68: jpayne@68: cdef _setup_raw_rest_iterator(self): jpayne@68: '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST''' jpayne@68: jpayne@68: self.iter = None jpayne@68: self.iterdata.iter = NULL jpayne@68: self.iterdata.htsfile = self.samfile.htsfile jpayne@68: self.iterdata.seq = NULL jpayne@68: self.iterdata.tid = -1 jpayne@68: self.iterdata.header = self.samfile.header.ptr jpayne@68: jpayne@68: if self.fastafile is not None: jpayne@68: self.iterdata.fastafile = self.fastafile.fastafile jpayne@68: else: jpayne@68: self.iterdata.fastafile = NULL jpayne@68: jpayne@68: # Free any previously allocated memory before reassigning jpayne@68: # pileup_iter jpayne@68: self._free_pileup_iter() jpayne@68: jpayne@68: cdef void * data[1] jpayne@68: data[0] = &self.iterdata jpayne@68: jpayne@68: if self.stepper is None or self.stepper == "all": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_raw_all, jpayne@68: data) jpayne@68: elif self.stepper == "nofilter": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_raw_nofilter, jpayne@68: data) jpayne@68: elif self.stepper == "samtools": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_samtools, jpayne@68: data) jpayne@68: else: jpayne@68: raise ValueError( jpayne@68: "unknown stepper option `%s` in IteratorColumn" % self.stepper) jpayne@68: jpayne@68: if self.max_depth: jpayne@68: with nogil: jpayne@68: bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth) jpayne@68: jpayne@68: if self.ignore_overlaps: jpayne@68: with nogil: jpayne@68: bam_mplp_init_overlaps(self.pileup_iter) jpayne@68: jpayne@68: cdef reset(self, tid, start, stop): jpayne@68: '''reset iterator position. jpayne@68: jpayne@68: This permits using the iterator multiple times without jpayne@68: having to incur the full set-up costs. jpayne@68: ''' jpayne@68: if self.iter is None: jpayne@68: raise TypeError("Raw iterator set up without region cannot be reset") jpayne@68: jpayne@68: self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0) jpayne@68: self.iterdata.iter = self.iter.iter jpayne@68: jpayne@68: # invalidate sequence if different tid jpayne@68: if self.tid != tid: jpayne@68: if self.iterdata.seq != NULL: jpayne@68: free(self.iterdata.seq) jpayne@68: self.iterdata.seq = NULL jpayne@68: self.iterdata.tid = -1 jpayne@68: jpayne@68: # self.pileup_iter = bam_mplp_init(1 jpayne@68: # &__advancepileup, jpayne@68: # &self.iterdata) jpayne@68: with nogil: jpayne@68: bam_mplp_reset(self.pileup_iter) jpayne@68: jpayne@68: cdef _free_pileup_iter(self): jpayne@68: '''free the memory alloc'd by bam_plp_init. 
jpayne@68: jpayne@68: This is needed before setup_iterator allocates another jpayne@68: pileup_iter, or else memory will be lost. ''' jpayne@68: if self.pileup_iter != NULL: jpayne@68: with nogil: jpayne@68: bam_mplp_reset(self.pileup_iter) jpayne@68: bam_mplp_destroy(self.pileup_iter) jpayne@68: self.pileup_iter = NULL jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: # reset in order to avoid memory leak messages for iterators jpayne@68: # that have not been fully consumed jpayne@68: self._free_pileup_iter() jpayne@68: self.plp = NULL jpayne@68: jpayne@68: if self.iterdata.seq != NULL: jpayne@68: free(self.iterdata.seq) jpayne@68: self.iterdata.seq = NULL jpayne@68: jpayne@68: # backwards compatibility jpayne@68: jpayne@68: def hasReference(self): jpayne@68: return self.has_reference() jpayne@68: cdef char * getSequence(self): jpayne@68: return self.get_sequence() jpayne@68: def addReference(self, FastaFile fastafile): jpayne@68: return self.add_reference(fastafile) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorColumnRegion(IteratorColumn): jpayne@68: '''iterates over a region only. jpayne@68: ''' jpayne@68: def __cinit__(self, jpayne@68: AlignmentFile samfile, jpayne@68: int tid = 0, jpayne@68: int start = 0, jpayne@68: int stop = MAX_POS, jpayne@68: int truncate = False, jpayne@68: int multiple_iterators = True, jpayne@68: **kwargs ): jpayne@68: jpayne@68: # initialize iterator. Multiple iterators not available jpayne@68: # for CRAM. jpayne@68: if multiple_iterators and samfile.is_cram: jpayne@68: warnings.warn("multiple_iterators not implemented for CRAM") jpayne@68: multiple_iterators = False jpayne@68: jpayne@68: self._setup_iterator(tid, start, stop, multiple_iterators) jpayne@68: self.start = start jpayne@68: self.stop = stop jpayne@68: self.truncate = truncate jpayne@68: jpayne@68: def __next__(self): jpayne@68: jpayne@68: cdef int n jpayne@68: jpayne@68: while 1: jpayne@68: n = self.cnext() jpayne@68: if n < 0: jpayne@68: raise ValueError("error during iteration" ) jpayne@68: jpayne@68: if n == 0: jpayne@68: raise StopIteration jpayne@68: jpayne@68: if self.truncate: jpayne@68: if self.start > self.pos: jpayne@68: continue jpayne@68: if self.pos >= self.stop: jpayne@68: raise StopIteration jpayne@68: jpayne@68: return makePileupColumn(&self.plp, jpayne@68: self.tid, jpayne@68: self.pos, jpayne@68: self.n_plp, jpayne@68: self.min_base_quality, jpayne@68: self.iterdata.seq, jpayne@68: self.samfile.header) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorColumnAllRefs(IteratorColumn): jpayne@68: """iterates over all columns by chaining iterators over each reference jpayne@68: """ jpayne@68: jpayne@68: def __cinit__(self, jpayne@68: AlignmentFile samfile, jpayne@68: **kwargs): jpayne@68: jpayne@68: # no iteration over empty files jpayne@68: if not samfile.nreferences: jpayne@68: raise StopIteration jpayne@68: jpayne@68: # initialize iterator jpayne@68: self._setup_iterator(self.tid, 0, MAX_POS, 1) jpayne@68: jpayne@68: def __next__(self): jpayne@68: jpayne@68: cdef int n jpayne@68: while 1: jpayne@68: n = self.cnext() jpayne@68: if n < 0: jpayne@68: raise ValueError("error during iteration") jpayne@68: jpayne@68: # proceed to next reference or stop jpayne@68: if n == 0: jpayne@68: self.tid += 1 jpayne@68: if self.tid < self.samfile.nreferences: jpayne@68: self._setup_iterator(self.tid, 0, MAX_POS, 0) jpayne@68: else: jpayne@68: raise StopIteration jpayne@68: continue jpayne@68: jpayne@68: # return result, if within same reference jpayne@68: return 
makePileupColumn(&self.plp, jpayne@68: self.tid, jpayne@68: self.pos, jpayne@68: self.n_plp, jpayne@68: self.min_base_quality, jpayne@68: self.iterdata.seq, jpayne@68: self.samfile.header) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorColumnAll(IteratorColumn): jpayne@68: """iterates over all columns, without using an index jpayne@68: """ jpayne@68: jpayne@68: def __cinit__(self, jpayne@68: AlignmentFile samfile, jpayne@68: **kwargs): jpayne@68: jpayne@68: self._setup_raw_rest_iterator() jpayne@68: jpayne@68: def __next__(self): jpayne@68: jpayne@68: cdef int n jpayne@68: n = self.cnext() jpayne@68: if n < 0: jpayne@68: raise ValueError("error during iteration") jpayne@68: jpayne@68: if n == 0: jpayne@68: raise StopIteration jpayne@68: jpayne@68: return makePileupColumn(&self.plp, jpayne@68: self.tid, jpayne@68: self.pos, jpayne@68: self.n_plp, jpayne@68: self.min_base_quality, jpayne@68: self.iterdata.seq, jpayne@68: self.samfile.header) jpayne@68: jpayne@68: jpayne@68: cdef class SNPCall: jpayne@68: '''the results of a SNP call.''' jpayne@68: cdef int _tid jpayne@68: cdef int _pos jpayne@68: cdef char _reference_base jpayne@68: cdef char _genotype jpayne@68: cdef int _consensus_quality jpayne@68: cdef int _snp_quality jpayne@68: cdef int _rms_mapping_quality jpayne@68: cdef int _coverage jpayne@68: jpayne@68: property tid: jpayne@68: '''the chromosome ID as is defined in the header''' jpayne@68: def __get__(self): jpayne@68: return self._tid jpayne@68: jpayne@68: property pos: jpayne@68: '''nucleotide position of SNP.''' jpayne@68: def __get__(self): return self._pos jpayne@68: jpayne@68: property reference_base: jpayne@68: '''reference base at pos. ``N`` if no reference sequence supplied.''' jpayne@68: def __get__(self): return from_string_and_size( &self._reference_base, 1 ) jpayne@68: jpayne@68: property genotype: jpayne@68: '''the genotype called.''' jpayne@68: def __get__(self): return from_string_and_size( &self._genotype, 1 ) jpayne@68: jpayne@68: property consensus_quality: jpayne@68: '''the genotype quality (Phred-scaled).''' jpayne@68: def __get__(self): return self._consensus_quality jpayne@68: jpayne@68: property snp_quality: jpayne@68: '''the snp quality (Phred scaled) - probability of consensus being jpayne@68: identical to reference sequence.''' jpayne@68: def __get__(self): return self._snp_quality jpayne@68: jpayne@68: property mapping_quality: jpayne@68: '''the root mean square (rms) of the mapping quality of all reads jpayne@68: involved in the call.''' jpayne@68: def __get__(self): return self._rms_mapping_quality jpayne@68: jpayne@68: property coverage: jpayne@68: '''coverage or read depth - the number of reads involved in the call.''' jpayne@68: def __get__(self): return self._coverage jpayne@68: jpayne@68: def __str__(self): jpayne@68: jpayne@68: return "\t".join( map(str, ( jpayne@68: self.tid, jpayne@68: self.pos, jpayne@68: self.reference_base, jpayne@68: self.genotype, jpayne@68: self.consensus_quality, jpayne@68: self.snp_quality, jpayne@68: self.mapping_quality, jpayne@68: self.coverage ) ) ) jpayne@68: jpayne@68: jpayne@68: cdef class IndexedReads: jpayne@68: """Index a Sam/BAM-file by query name while keeping the jpayne@68: original sort order intact. jpayne@68: jpayne@68: The index is kept in memory and can be substantial. jpayne@68: jpayne@68: By default, the file is re-opened to avoid conflicts if multiple jpayne@68: operators work on the same file. Set `multiple_iterators` = False jpayne@68: to not re-open `samfile`. 
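jpayne@68:
jpayne@68:     A minimal usage sketch (file and query names are illustrative only)::
jpayne@68:
jpayne@68:         samfile = pysam.AlignmentFile("ex1.bam", "rb")
jpayne@68:         name_index = pysam.IndexedReads(samfile)
jpayne@68:         name_index.build()
jpayne@68:         # find() raises KeyError if the query name is absent
jpayne@68:         for read in name_index.find("read_001"):
jpayne@68:             print(read.query_name)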
jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: jpayne@68: samfile : AlignmentFile jpayne@68: File to be indexed. jpayne@68: jpayne@68: multiple_iterators : bool jpayne@68: Flag indicating whether the file should be reopened. Reopening prevents jpayne@68: existing iterators being affected by the indexing. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, int multiple_iterators=True): jpayne@68: cdef char *cfilename jpayne@68: jpayne@68: # makes sure that samfile stays alive as long as this jpayne@68: # object is alive. jpayne@68: self.samfile = samfile jpayne@68: cdef bam_hdr_t * hdr = NULL jpayne@68: assert samfile.is_bam, "can only apply IndexReads on bam files" jpayne@68: jpayne@68: # multiple_iterators the file - note that this makes the iterator jpayne@68: # slow and causes pileup to slow down significantly. jpayne@68: if multiple_iterators: jpayne@68: cfilename = samfile.filename jpayne@68: with nogil: jpayne@68: self.htsfile = hts_open(cfilename, 'r') jpayne@68: if self.htsfile == NULL: jpayne@68: raise OSError("unable to reopen htsfile") jpayne@68: jpayne@68: # need to advance in newly opened file to position after header jpayne@68: # better: use seek/tell? jpayne@68: with nogil: jpayne@68: hdr = sam_hdr_read(self.htsfile) jpayne@68: if hdr == NULL: jpayne@68: raise OSError("unable to read header information") jpayne@68: self.header = makeAlignmentHeader(hdr) jpayne@68: self.owns_samfile = True jpayne@68: else: jpayne@68: self.htsfile = self.samfile.htsfile jpayne@68: self.header = samfile.header jpayne@68: self.owns_samfile = False jpayne@68: jpayne@68: def build(self): jpayne@68: '''build the index.''' jpayne@68: jpayne@68: self.index = collections.defaultdict(list) jpayne@68: jpayne@68: # this method will start indexing from the current file position jpayne@68: cdef int ret = 1 jpayne@68: cdef bam1_t * b = calloc(1, sizeof( bam1_t)) jpayne@68: if b == NULL: jpayne@68: raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t))) jpayne@68: jpayne@68: cdef uint64_t pos jpayne@68: cdef bam_hdr_t * hdr = self.header.ptr jpayne@68: jpayne@68: while ret > 0: jpayne@68: with nogil: jpayne@68: pos = bgzf_tell(hts_get_bgzfp(self.htsfile)) jpayne@68: ret = sam_read1(self.htsfile, jpayne@68: hdr, jpayne@68: b) jpayne@68: jpayne@68: if ret > 0: jpayne@68: qname = charptr_to_str(pysam_bam_get_qname(b)) jpayne@68: self.index[qname].append(pos) jpayne@68: jpayne@68: bam_destroy1(b) jpayne@68: jpayne@68: def find(self, query_name): jpayne@68: '''find `query_name` in index. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: jpayne@68: IteratorRowSelection jpayne@68: Returns an iterator over all reads with query_name. jpayne@68: jpayne@68: Raises jpayne@68: ------ jpayne@68: jpayne@68: KeyError jpayne@68: if the `query_name` is not in the index. jpayne@68: jpayne@68: ''' jpayne@68: if query_name in self.index: jpayne@68: return IteratorRowSelection( jpayne@68: self.samfile, jpayne@68: self.index[query_name], jpayne@68: multiple_iterators = False) jpayne@68: else: jpayne@68: raise KeyError("read %s not found" % query_name) jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: if self.owns_samfile: jpayne@68: hts_close(self.htsfile)