Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,2982 @@ +# cython: embedsignature=True +# cython: profile=True +######################################################## +######################################################## +# Cython wrapper for SAM/BAM/CRAM files based on htslib +######################################################## +# The principal classes defined in this module are: +# +# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files +# +# class AlignmentHeader manage SAM/BAM/CRAM header data +# +# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping +# the original sort order intact +# +# Additionally this module defines numerous additional classes that +# are part of the internal API. These are: +# +# Various iterator classes to iterate over alignments in sequential +# (IteratorRow) or in a stacked fashion (IteratorColumn): +# +# class IteratorRow +# class IteratorRowRegion +# class IteratorRowHead +# class IteratorRowAll +# class IteratorRowAllRefs +# class IteratorRowSelection +# class IteratorColumn +# class IteratorColumnRegion +# class IteratorColumnAll +# class IteratorColumnAllRefs +# +######################################################## +# +# The MIT License +# +# Copyright (c) 2015 Andreas Heger +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all 
copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +######################################################## +import os +import collections +try: + from collections.abc import Sequence, Mapping # noqa +except ImportError: + from collections import Sequence, Mapping # noqa +import re +import warnings +import array +from libc.errno cimport errno, EPIPE +from libc.string cimport strcmp, strpbrk, strerror +from libc.stdint cimport INT32_MAX + +from cpython cimport array as c_array + +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str +from pysam.libcutils cimport encode_filename, from_string_and_size +from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn +from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \ + HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL + +from io import StringIO + +cimport cython + + +__all__ = [ + "AlignmentFile", + "AlignmentHeader", + "IteratorRow", + "IteratorColumn", + "IndexedReads"] + +IndexStats = collections.namedtuple("IndexStats", + ("contig", + "mapped", + "unmapped", + "total")) + +######################################################## +## global variables +# maximum genomic coordinace +# for some reason, using 'int' causes overflow +cdef int MAX_POS = (1 << 31) - 1 + +# valid types for SAM headers +VALID_HEADER_TYPES = {"HD" : Mapping, + "SQ" : Sequence, + "RG" : Sequence, + "PG" : Sequence, + "CO" : Sequence} + +# order of records within SAM headers +VALID_HEADERS = ("HD", "SQ", "RG", "PG", 
"CO") + +# default type conversions within SAM header records +KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str, + "SS" : str,}, + "SQ" : {"SN" : str, "LN" : int, "AS" : str, + "M5" : str, "SP" : str, "UR" : str, + "AH" : str, "TP" : str, "DS" : str, + "AN" : str,}, + "RG" : {"ID" : str, "CN" : str, "DS" : str, + "DT" : str, "FO" : str, "KS" : str, + "LB" : str, "PG" : str, "PI" : str, + "PL" : str, "PM" : str, "PU" : str, + "SM" : str, "BC" : str,}, + "PG" : {"ID" : str, "PN" : str, "CL" : str, + "PP" : str, "DS" : str, "VN" : str,},} + +# output order of fields within records. Ensure that CL is at +# the end as parsing a CL will ignore any subsequent records. +VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"), + "SQ" : ("SN", "LN", "AS", "M5", + "UR", "SP", "AH", "TP", + "DS", "AN"), + "RG" : ("ID", "CN", "SM", "LB", + "PU", "PI", "DT", "DS", + "PL", "FO", "KS", "PG", + "PM", "BC"), + "PG" : ("PN", "ID", "VN", "PP", + "DS", "CL"),} + + +def build_header_line(fields, record): + '''build a header line from `fields` dictionary for `record`''' + + # TODO: add checking for field and sort order + line = ["@%s" % record] + # comment + if record == "CO": + line.append(fields) + # user tags + elif record.islower(): + for key in sorted(fields): + line.append("%s:%s" % (key, str(fields[key]))) + # defined tags + else: + # write fields of the specification + for key in VALID_HEADER_ORDER[record]: + if key in fields: + line.append("%s:%s" % (key, str(fields[key]))) + # write user fields + for key in fields: + if not key.isupper(): + line.append("%s:%s" % (key, str(fields[key]))) + + return "\t".join(line) + + +cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr): + if not hdr: + raise ValueError('cannot create AlignmentHeader, received NULL pointer') + + # check: is AlignmetHeader.__cinit__ called? 
+ cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader) + header.ptr = hdr + + return header + +def read_failure_reason(code): + if code == -2: + return 'truncated file' + else: + return "error {} while reading file".format(code) + + +# the following should be class-method for VariantHeader, but cdef @classmethods +# are not implemented in cython. +cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest, + reference_names, + reference_lengths, + add_sq_text=True, + text=None) except -1: + """build header from list of reference names and lengths. + """ + +cdef class AlignmentHeader(object): + """header information for a :class:`AlignmentFile` object + + Parameters + ---------- + header_dict : dict + build header from a multi-level dictionary. The + first level are the four types ('HD', 'SQ', ...). The second + level are a list of lines, with each line being a list of + tag-value pairs. The header is constructed first from all the + defined fields, followed by user tags in alphabetical + order. Alternatively, an :class:`~pysam.AlignmentHeader` + object can be passed directly. + + text : string + use the string provided as the header + + reference_names : list + see reference_lengths + + reference_lengths : list + build header from list of chromosome names and lengths. By + default, 'SQ' and 'LN' tags will be added to the header + text. This option can be changed by unsetting the flag + `add_sq_text`. + + add_sq_text : bool + do not add 'SQ' and 'LN' tags to header. This option permits + construction :term:`SAM` formatted files without a header. 
+ + """ + + # See makeVariantHeader for C constructor + def __cinit__(self): + self.ptr = NULL + + # Python constructor + def __init__(self): + self.ptr = bam_hdr_init() + if self.ptr is NULL: + raise MemoryError("could not create header") + + @classmethod + def _from_text_and_lengths(cls, text, reference_names, reference_lengths): + + cdef AlignmentHeader self = AlignmentHeader() + cdef char *ctext + cdef int l_text + cdef int n, x + if text is not None: + btext = force_bytes(text) + ctext = btext + l_text = len(btext) + self.ptr.text = <char*>calloc(l_text + 1, sizeof(char)) + if self.ptr.text == NULL: + raise MemoryError("could not allocate {} bytes".format(l_text + 1), sizeof(char)) + self.ptr.l_text = l_text + memcpy(self.ptr.text, ctext, l_text + 1) + + if reference_names and reference_lengths: + reference_names = [force_bytes(ref) for ref in reference_names] + + self.ptr.n_targets = len(reference_names) + + n = sum([len(reference_names) + 1]) + self.ptr.target_name = <char**>calloc(n, sizeof(char*)) + if self.ptr.target_name == NULL: + raise MemoryError("could not allocate {} bytes".format(n, sizeof(char *))) + + self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t)) + if self.ptr.target_len == NULL: + raise MemoryError("could not allocate {} bytes".format(n, sizeof(uint32_t))) + + for x from 0 <= x < self.ptr.n_targets: + self.ptr.target_len[x] = reference_lengths[x] + name = reference_names[x] + self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char)) + if self.ptr.target_name[x] == NULL: + raise MemoryError("could not allocate {} bytes".format(len(name) + 1, sizeof(char))) + strncpy(self.ptr.target_name[x], name, len(name)) + + return self + + @classmethod + def from_text(cls, text): + + reference_names, reference_lengths = [], [] + for line in text.splitlines(): + if line.startswith("@SQ"): + fields = dict([x.split(":", 1) for x in line.split("\t")[1:]]) + try: + reference_names.append(fields["SN"]) + 
reference_lengths.append(int(fields["LN"])) + except KeyError: + raise KeyError("incomplete sequence information in '%s'" % str(fields)) + except ValueError: + raise ValueError("wrong sequence information in '%s'" % str(fields)) + + return cls._from_text_and_lengths(text, reference_names, reference_lengths) + + @classmethod + def from_dict(cls, header_dict): + + cdef list lines = [] + # first: defined tags + for record in VALID_HEADERS: + if record in header_dict: + data = header_dict[record] + if not isinstance(data, VALID_HEADER_TYPES[record]): + raise ValueError( + "invalid type for record %s: %s, expected %s".format( + record, type(data), VALID_HEADER_TYPES[record])) + if isinstance(data, Mapping): + lines.append(build_header_line(data, record)) + else: + for fields in header_dict[record]: + lines.append(build_header_line(fields, record)) + + # then: user tags (lower case), sorted alphabetically + for record, data in sorted(header_dict.items()): + if record in VALID_HEADERS: + continue + if isinstance(data, Mapping): + lines.append(build_header_line(data, record)) + else: + for fields in header_dict[record]: + lines.append(build_header_line(fields, record)) + + text = "\n".join(lines) + "\n" + + reference_names, reference_lengths = [], [] + if "SQ" in header_dict: + for fields in header_dict["SQ"]: + try: + reference_names.append(fields["SN"]) + reference_lengths.append(fields["LN"]) + except KeyError: + raise KeyError("incomplete sequence information in '%s'" % str(fields)) + + return cls._from_text_and_lengths(text, reference_names, reference_lengths) + + @classmethod + def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True): + + if len(reference_names) != len(reference_lengths): + raise ValueError("number of reference names and lengths do not match") + + # optionally, if there is no text, add a SAM compatible header to output file. 
+ if text is None and add_sq_text: + text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip( + reference_names, reference_lengths)]) + + return cls._from_text_and_lengths(text, reference_names, reference_lengths) + + def __dealloc__(self): + bam_hdr_destroy(self.ptr) + self.ptr = NULL + + def __bool__(self): + return self.ptr != NULL + + def copy(self): + return makeAlignmentHeader(bam_hdr_dup(self.ptr)) + + property nreferences: + """int with the number of :term:`reference` sequences in the file. + + This is a read-only attribute.""" + def __get__(self): + return self.ptr.n_targets + + property references: + """tuple with the names of :term:`reference` sequences. This is a + read-only attribute""" + def __get__(self): + t = [] + cdef int x + for x in range(self.ptr.n_targets): + t.append(charptr_to_str(self.ptr.target_name[x])) + return tuple(t) + + property lengths: + """tuple of the lengths of the :term:`reference` sequences. This is a + read-only attribute. The lengths are in the same order as + :attr:`pysam.AlignmentFile.references` + """ + def __get__(self): + t = [] + cdef int x + for x in range(self.ptr.n_targets): + t.append(self.ptr.target_len[x]) + return tuple(t) + + def _build_sequence_section(self): + """return sequence section of header. + + The sequence section is built from the list of reference names and + lengths stored in the BAM-file and not from any @SQ entries that + are part of the header's text section. + """ + + cdef int x + text = [] + for x in range(self.ptr.n_targets): + text.append("@SQ\tSN:{}\tLN:{}\n".format( + force_str(self.ptr.target_name[x]), + self.ptr.target_len[x])) + return "".join(text) + + def to_dict(self): + """return two-level dictionary with header information from the file. + + The first level contains the record (``HD``, ``SQ``, etc) and + the second level contains the fields (``VN``, ``LN``, etc). 
+ + The parser is validating and will raise an AssertionError if + if encounters any record or field tags that are not part of + the SAM specification. Use the + :attr:`pysam.AlignmentFile.text` attribute to get the unparsed + header. + + The parsing follows the SAM format specification with the + exception of the ``CL`` field. This option will consume the + rest of a header line irrespective of any additional fields. + This behaviour has been added to accommodate command line + options that contain characters that are not valid field + separators. + + If no @SQ entries are within the text section of the header, + this will be automatically added from the reference names and + lengths stored in the binary part of the header. + """ + result = collections.OrderedDict() + + # convert to python string + t = self.__str__() + for line in t.split("\n"): + line = line.strip(' \0') + if not line: + continue + assert line.startswith("@"), \ + "header line without '@': '%s'" % line + fields = line[1:].split("\t") + record = fields[0] + assert record in VALID_HEADER_TYPES, \ + "header line with invalid type '%s': '%s'" % (record, line) + + # treat comments + if record == "CO": + if record not in result: + result[record] = [] + result[record].append("\t".join( fields[1:])) + continue + # the following is clumsy as generators do not work? + x = {} + + for idx, field in enumerate(fields[1:]): + if ":" not in field: + raise ValueError("malformatted header: no ':' in field" ) + key, value = field.split(":", 1) + if key in ("CL",): + # special treatment for command line + # statements (CL). These might contain + # characters that are non-conformant with + # the valid field separators in the SAM + # header. Thus, in contravention to the + # SAM API, consume the rest of the line. 
+ key, value = "\t".join(fields[idx+1:]).split(":", 1) + x[key] = KNOWN_HEADER_FIELDS[record][key](value) + break + + # interpret type of known header record tags, default to str + x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value) + + if VALID_HEADER_TYPES[record] == Mapping: + if record in result: + raise ValueError( + "multiple '%s' lines are not permitted" % record) + + result[record] = x + elif VALID_HEADER_TYPES[record] == Sequence: + if record not in result: result[record] = [] + result[record].append(x) + + # if there are no SQ lines in the header, add the + # reference names from the information in the bam + # file. + # + # Background: c-samtools keeps the textual part of the + # header separate from the list of reference names and + # lengths. Thus, if a header contains only SQ lines, + # the SQ information is not part of the textual header + # and thus are missing from the output. See issue 84. + if "SQ" not in result: + sq = [] + for ref, length in zip(self.references, self.lengths): + sq.append({'LN': length, 'SN': ref }) + result["SQ"] = sq + + return result + + def as_dict(self): + """deprecated, use :meth:`to_dict()` instead""" + return self.to_dict() + + def get_reference_name(self, tid): + if tid == -1: + return None + if not 0 <= tid < self.ptr.n_targets: + raise ValueError("reference_id %i out of range 0<=tid<%i" % + (tid, self.ptr.n_targets)) + return charptr_to_str(self.ptr.target_name[tid]) + + def get_reference_length(self, reference): + cdef int tid = self.get_tid(reference) + if tid < 0: + raise KeyError("unknown reference {}".format(reference)) + else: + return self.ptr.target_len[tid] + + def is_valid_tid(self, int tid): + """ + return True if the numerical :term:`tid` is valid; False otherwise. + + Note that the unmapped tid code (-1) counts as an invalid. 
+ """ + return 0 <= tid < self.ptr.n_targets + + def get_tid(self, reference): + """ + return the numerical :term:`tid` corresponding to + :term:`reference` + + returns -1 if reference is not known. + """ + reference = force_bytes(reference) + tid = bam_name2id(self.ptr, reference) + if tid < -1: + raise ValueError('could not parse header') + return tid + + def __str__(self): + '''string with the full contents of the :term:`sam file` header as a + string. + + If no @SQ entries are within the text section of the header, + this will be automatically added from the reference names and + lengths stored in the binary part of the header. + + See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed + representation of the header. + ''' + text = from_string_and_size(self.ptr.text, self.ptr.l_text) + if "@SQ" not in text: + text += "\n" + self._build_sequence_section() + return text + + # dictionary access methods, for backwards compatibility. + def __setitem__(self, key, value): + raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict()") + + def __getitem__(self, key): + return self.to_dict().__getitem__(key) + + def items(self): + return self.to_dict().items() + + # PY2 compatibility + def iteritems(self): + return self.to_dict().items() + + def keys(self): + return self.to_dict().keys() + + def values(self): + return self.to_dict().values() + + def get(self, *args): + return self.to_dict().get(*args) + + def __len__(self): + return self.to_dict().__len__() + + def __contains__(self, key): + return self.to_dict().__contains__(key) + + +cdef class AlignmentFile(HTSFile): + """AlignmentFile(filepath_or_object, mode=None, template=None, + reference_names=None, reference_lengths=None, text=NULL, + header=None, add_sq_text=False, check_header=True, check_sq=True, + reference_filename=None, filename=None, index_filename=None, + filepath_index=None, require_index=False, duplicate_filehandle=True, + ignore_truncation=False, threads=1) + + A 
:term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file. + + If `filepath_or_object` is a string, the file is automatically + opened. If `filepath_or_object` is a python File object, the + already opened file will be used. + + If the file is opened for reading and an index exists (if file is BAM, a + .bai file or if CRAM a .crai file), it will be opened automatically. + `index_filename` may be specified explicitly. If the index is not named + in the standard manner, not located in the same directory as the + BAM/CRAM file, or is remote. Without an index, random access via + :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup` + is disabled. + + For writing, the header of a :term:`SAM` file/:term:`BAM` file can + be constituted from several sources (see also the samtools format + specification): + + 1. If `template` is given, the header is copied from another + `AlignmentFile` (`template` must be a + :class:`~pysam.AlignmentFile`). + + 2. If `header` is given, the header is built from a + multi-level dictionary. + + 3. If `text` is given, new header text is copied from raw + text. + + 4. The names (`reference_names`) and lengths + (`reference_lengths`) are supplied directly as lists. + + When reading or writing a CRAM file, the filename of a FASTA-formatted + reference can be specified with `reference_filename`. + + By default, if a file is opened in mode 'r', it is checked + for a valid header (`check_header` = True) and a definition of + chromosome names (`check_sq` = True). + + Parameters + ---------- + mode : string + `mode` should be ``r`` for reading or ``w`` for writing. The + default is text mode (:term:`SAM`). For binary (:term:`BAM`) + I/O you should append ``b`` for compressed or ``u`` for + uncompressed :term:`BAM` output. Use ``h`` to output header + information in text (:term:`TAM`) mode. Use ``c`` for + :term:`CRAM` formatted files. + + If ``b`` is present, it must immediately follow ``r`` or + ``w``. 
Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, + ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a + :term:`BAM` formatted file for reading, type:: + + f = pysam.AlignmentFile('ex1.bam','rb') + + If mode is not specified, the method will try to auto-detect + in the order 'rb', 'r', thus both the following should work:: + + f1 = pysam.AlignmentFile('ex1.bam') + f2 = pysam.AlignmentFile('ex1.sam') + + template : AlignmentFile + when writing, copy header from file `template`. + + header : dict or AlignmentHeader + when writing, build header from a multi-level dictionary. The + first level are the four types ('HD', 'SQ', ...). The second + level are a list of lines, with each line being a list of + tag-value pairs. The header is constructed first from all the + defined fields, followed by user tags in alphabetical + order. Alternatively, an :class:`~pysam.AlignmentHeader` + object can be passed directly. + + text : string + when writing, use the string provided as the header + + reference_names : list + see reference_lengths + + reference_lengths : list + when writing or opening a SAM file without header build header + from list of chromosome names and lengths. By default, 'SQ' + and 'LN' tags will be added to the header text. This option + can be changed by unsetting the flag `add_sq_text`. + + add_sq_text : bool + do not add 'SQ' and 'LN' tags to header. This option permits + construction :term:`SAM` formatted files without a header. + + add_sam_header : bool + when outputting SAM the default is to output a header. This is + equivalent to opening the file in 'wh' mode. If this option is + set to False, no header will be output. To read such a file, + set `check_header=False`. + + check_header : bool + obsolete: when reading a SAM file, check if header is present + (default=True) + + check_sq : bool + when reading, check if SQ entries are present in header + (default=True) + + reference_filename : string + Path to a FASTA-formatted reference file. 
Valid only for CRAM files. + When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL + specified in the header (``UR`` tag), which are normally used to find + the reference. + + index_filename : string + Explicit path to the index file. Only needed if the index is not + named in the standard manner, not located in the same directory as + the BAM/CRAM file, or is remote. An IOError is raised if the index + cannot be found or is invalid. + + filepath_index : string + Alias for `index_filename`. + + require_index : bool + When reading, require that an index file is present and is valid or + raise an IOError. (default=False) + + filename : string + Alternative to filepath_or_object. Filename of the file + to be opened. + + duplicate_filehandle: bool + By default, file handles passed either directly or through + File-like objects will be duplicated before passing them to + htslib. The duplication prevents issues where the same stream + will be closed by htslib and through destruction of the + high-level python object. Set to False to turn off + duplication. + + ignore_truncation: bool + Issue a warning, instead of raising an error if the current file + appears to be truncated due to a missing EOF marker. Only applies + to bgzipped formats. (Default=False) + + format_options: list + A list of key=value strings, as accepted by --input-fmt-option and + --output-fmt-option in samtools. + threads: integer + Number of threads to use for compressing/decompressing BAM/CRAM files. + Setting threads to > 1 cannot be combined with `ignore_truncation`. 
+ (Default=1) + """ + + def __cinit__(self, *args, **kwargs): + self.htsfile = NULL + self.filename = None + self.mode = None + self.threads = 1 + self.is_stream = False + self.is_remote = False + self.index = NULL + + if "filename" in kwargs: + args = [kwargs["filename"]] + del kwargs["filename"] + + self._open(*args, **kwargs) + + # allocate memory for iterator + self.b = <bam1_t*>calloc(1, sizeof(bam1_t)) + if self.b == NULL: + raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t))) + + def has_index(self): + """return true if htsfile has an existing (and opened) index. + """ + return self.index != NULL + + def check_index(self): + """return True if index is present. + + Raises + ------ + + AttributeError + if htsfile is :term:`SAM` formatted and thus has no index. + + ValueError + if htsfile is closed or index could not be opened. + """ + + if not self.is_open: + raise ValueError("I/O operation on closed file") + if not self.is_bam and not self.is_cram: + raise AttributeError( + "AlignmentFile.mapped only available in bam files") + if self.index == NULL: + raise ValueError( + "mapping information not recorded in index " + "or index not available") + return True + + def _open(self, + filepath_or_object, + mode=None, + AlignmentFile template=None, + reference_names=None, + reference_lengths=None, + reference_filename=None, + text=None, + header=None, + port=None, + add_sq_text=True, + add_sam_header=True, + check_header=True, + check_sq=True, + index_filename=None, + filepath_index=None, + require_index=False, + referencenames=None, + referencelengths=None, + duplicate_filehandle=True, + ignore_truncation=False, + format_options=None, + threads=1): + '''open a sam, bam or cram formatted file. + + If _open is called on an existing file, the current file + will be closed and a new file will be opened. 
+ + ''' + cdef char *cfilename = NULL + cdef char *creference_filename = NULL + cdef char *cindexname = NULL + cdef char *cmode = NULL + cdef bam_hdr_t * hdr = NULL + + if threads > 1 and ignore_truncation: + # This won't raise errors if reaching a truncated alignment, + # because bgzf_mt_reader in htslib does not deal with + # bgzf_mt_read_block returning non-zero values, contrary + # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888) + # Better to avoid this (for now) than to produce seemingly correct results. + raise ValueError('Cannot add extra threads when "ignore_truncation" is True') + self.threads = threads + + # for backwards compatibility: + if referencenames is not None: + reference_names = referencenames + if referencelengths is not None: + reference_lengths = referencelengths + + # close a previously opened file + if self.is_open: + self.close() + + # autodetection for read + if mode is None: + mode = "r" + + if add_sam_header and mode == "w": + mode = "wh" + + assert mode in ("r", "w", "rb", "wb", "wh", + "wbu", "rU", "wb0", + "rc", "wc"), \ + "invalid file opening mode `%s`" % mode + + self.duplicate_filehandle = duplicate_filehandle + + # StringIO not supported + if isinstance(filepath_or_object, StringIO): + raise NotImplementedError( + "access from StringIO objects not supported") + # reading from a file descriptor + elif isinstance(filepath_or_object, int): + self.filename = filepath_or_object + filename = None + self.is_remote = False + self.is_stream = True + # reading from a File object or other object with fileno + elif hasattr(filepath_or_object, "fileno"): + if filepath_or_object.closed: + raise ValueError('I/O operation on closed file') + self.filename = filepath_or_object + # .name can be TextIOWrapper + try: + filename = encode_filename(str(filepath_or_object.name)) + cfilename = filename + except AttributeError: + filename = None + self.is_remote = False + self.is_stream = True + # what remains is a filename + else: 
+ self.filename = filename = encode_filename(filepath_or_object) + cfilename = filename + self.is_remote = hisremote(cfilename) + self.is_stream = self.filename == b'-' + + # for htslib, wbu seems to not work + if mode == "wbu": + mode = "wb0" + + self.mode = force_bytes(mode) + self.reference_filename = reference_filename = encode_filename( + reference_filename) + + if mode[0] == 'w': + # open file for writing + + if not (template or header or text or (reference_names and reference_lengths)): + raise ValueError( + "either supply options `template`, `header`, `text` or both `reference_names` " + "and `reference_lengths` for writing") + + if template: + # header is copied, though at the moment not strictly + # necessary as AlignmentHeader is immutable. + self.header = template.header.copy() + elif isinstance(header, AlignmentHeader): + self.header = header.copy() + elif isinstance(header, Mapping): + self.header = AlignmentHeader.from_dict(header) + elif reference_names and reference_lengths: + self.header = AlignmentHeader.from_references( + reference_names, + reference_lengths, + add_sq_text=add_sq_text, + text=text) + elif text: + self.header = AlignmentHeader.from_text(text) + else: + raise ValueError("not enough information to construct header. Please provide template, " + "header, text or reference_names/reference_lengths") + self.htsfile = self._open_htsfile() + + if self.htsfile == NULL: + if errno: + raise IOError(errno, "could not open alignment file `{}`: {}".format( + force_str(filename), + force_str(strerror(errno)))) + else: + raise ValueError("could not open alignment file `{}`".format(force_str(filename))) + if format_options and len(format_options): + self.add_hts_options(format_options) + # set filename with reference sequences. 
If no filename + # is given, the CRAM reference arrays will be built from + # the @SQ header in the header + if "c" in mode and reference_filename: + if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0): + raise ValueError("failure when setting reference filename") + + # write header to htsfile + if "b" in mode or "c" in mode or "h" in mode: + hdr = self.header.ptr + with nogil: + sam_hdr_write(self.htsfile, hdr) + + elif mode[0] == "r": + # open file for reading + self.htsfile = self._open_htsfile() + + if self.htsfile == NULL: + if errno: + raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename), + force_str(strerror(errno)))) + else: + raise ValueError("could not open alignment file `{}`".format(force_str(filename))) + + if self.htsfile.format.category != sequence_data: + raise ValueError("file does not contain alignment data") + + if format_options and len(format_options): + self.add_hts_options(format_options) + + self.check_truncation(ignore_truncation) + + # bam/cram files require a valid header + if self.is_bam or self.is_cram: + with nogil: + hdr = sam_hdr_read(self.htsfile) + if hdr == NULL: + raise ValueError( + "file does not have a valid header (mode='%s') " + "- is it BAM/CRAM format?" % mode) + self.header = makeAlignmentHeader(hdr) + else: + # in sam files a header is optional. If not given, + # user may provide reference names and lengths to built + # an on-the-fly header. + if reference_names and reference_lengths: + # build header from a target names and lengths + self.header = AlignmentHeader.from_references( + reference_names=reference_names, + reference_lengths=reference_lengths, + add_sq_text=add_sq_text, + text=text) + else: + with nogil: + hdr = sam_hdr_read(self.htsfile) + if hdr == NULL: + raise ValueError( + "SAM? 
file does not have a valid header (mode='%s'), " + "please provide reference_names and reference_lengths") + self.header = makeAlignmentHeader(hdr) + + # set filename with reference sequences + if self.is_cram and reference_filename: + creference_filename = self.reference_filename + hts_set_opt(self.htsfile, + CRAM_OPT_REFERENCE, + creference_filename) + + if check_sq and self.header.nreferences == 0: + raise ValueError( + ("file has no sequences defined (mode='%s') - " + "is it SAM/BAM format? Consider opening with " + "check_sq=False") % mode) + + if self.is_bam or self.is_cram: + self.index_filename = index_filename or filepath_index + if self.index_filename: + cindexname = bfile_name = encode_filename(self.index_filename) + + if cfilename or cindexname: + with nogil: + self.index = sam_index_load3(self.htsfile, cfilename, cindexname, + HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL) + + if not self.index and (cindexname or require_index): + if errno: + raise IOError(errno, force_str(strerror(errno))) + else: + raise IOError('unable to open index file `%s`' % self.index_filename) + + elif require_index: + raise IOError('unable to open index file') + + # save start of data section + if not self.is_stream: + self.start_offset = self.tell() + + def fetch(self, + contig=None, + start=None, + stop=None, + region=None, + tid=None, + until_eof=False, + multiple_iterators=False, + reference=None, + end=None): + """fetch reads aligned in a :term:`region`. + + See :meth:`~pysam.HTSFile.parse_region` for more information + on how genomic regions can be specified. :term:`reference` and + `end` are also accepted for backward compatibility as synonyms + for :term:`contig` and `stop`, respectively. + + Without a `contig` or `region` all mapped reads in the file + will be fetched. The reads will be returned ordered by reference + sequence, which will not necessarily be the order within the + file. This mode of iteration still requires an index. 
If there is + no index, use `until_eof=True`. + + If only `contig` is set, all reads aligned to `contig` + will be fetched. + + A :term:`SAM` file does not allow random access. If `region` + or `contig` are given, an exception is raised. + + Parameters + ---------- + + until_eof : bool + + If `until_eof` is True, all reads from the current file + position will be returned in order as they are within the + file. Using this option will also fetch unmapped reads. + + multiple_iterators : bool + + If `multiple_iterators` is True, multiple + iterators on the same file can be used at the same time. The + iterator returned will receive its own copy of a filehandle to + the file effectively re-opening the file. Re-opening a file + creates some overhead, so beware. + + Returns + ------- + + An iterator over a collection of reads. : IteratorRow + + Raises + ------ + + ValueError + if the genomic coordinates are out of range or invalid or the + file does not permit random access to genomic coordinates. + + """ + cdef int rtid, rstart, rstop, has_coord + + if not self.is_open: + raise ValueError( "I/O operation on closed file" ) + + has_coord, rtid, rstart, rstop = self.parse_region( + contig, start, stop, region, tid, + end=end, reference=reference) + + # Turn of re-opening if htsfile is a stream + if self.is_stream: + multiple_iterators = False + + if self.is_bam or self.is_cram: + if not until_eof and not self.is_remote: + if not self.has_index(): + raise ValueError( + "fetch called on bamfile without index") + + if has_coord: + return IteratorRowRegion( + self, rtid, rstart, rstop, + multiple_iterators=multiple_iterators) + else: + if until_eof: + return IteratorRowAll( + self, + multiple_iterators=multiple_iterators) + else: + # AH: check - reason why no multiple_iterators for + # AllRefs? 
+ return IteratorRowAllRefs( + self, + multiple_iterators=multiple_iterators) + else: + if has_coord: + raise ValueError( + "fetching by region is not available for SAM files") + + if multiple_iterators == True: + raise ValueError( + "multiple iterators not implemented for SAM files") + + return IteratorRowAll(self, + multiple_iterators=multiple_iterators) + + def head(self, n, multiple_iterators=True): + '''return an iterator over the first n alignments. + + This iterator is is useful for inspecting the bam-file. + + Parameters + ---------- + + multiple_iterators : bool + + is set to True by default in order to + avoid changing the current file position. + + Returns + ------- + + an iterator over a collection of reads : IteratorRowHead + + ''' + return IteratorRowHead(self, n, + multiple_iterators=multiple_iterators) + + def mate(self, AlignedSegment read): + '''return the mate of :class:`pysam.AlignedSegment` `read`. + + .. note:: + + Calling this method will change the file position. + This might interfere with any iterators that have + not re-opened the file. + + .. note:: + + This method is too slow for high-throughput processing. + If a read needs to be processed with its mate, work + from a read name sorted file or, better, cache reads. + + Returns + ------- + + the mate : AlignedSegment + + Raises + ------ + + ValueError + if the read is unpaired or the mate is unmapped + + ''' + cdef uint32_t flag = read._delegate.core.flag + + if flag & BAM_FPAIRED == 0: + raise ValueError("read %s: is unpaired" % + (read.query_name)) + if flag & BAM_FMUNMAP != 0: + raise ValueError("mate %s: is unmapped" % + (read.query_name)) + + # xor flags to get the other mate + cdef int x = BAM_FREAD1 + BAM_FREAD2 + flag = (flag ^ x) & x + + # Make sure to use a separate file to jump around + # to mate as otherwise the original file position + # will be lost + # The following code is not using the C API and + # could thus be made much quicker, for example + # by using tell and seek. 
+ for mate in self.fetch( + read._delegate.core.mpos, + read._delegate.core.mpos + 1, + tid=read._delegate.core.mtid, + multiple_iterators=True): + if mate.flag & flag != 0 and \ + mate.query_name == read.query_name: + break + else: + raise ValueError("mate not found") + + return mate + + def pileup(self, + contig=None, + start=None, + stop=None, + region=None, + reference=None, + end=None, + **kwargs): + """perform a :term:`pileup` within a :term:`region`. The region is + specified by :term:`contig`, `start` and `stop` (using + 0-based indexing). :term:`reference` and `end` are also accepted for + backward compatibility as synonyms for :term:`contig` and `stop`, + respectively. Alternatively, a samtools 'region' string + can be supplied. + + Without 'contig' or 'region' all reads will be used for the + pileup. The reads will be returned ordered by + :term:`contig` sequence, which will not necessarily be the + order within the file. + + Note that :term:`SAM` formatted files do not allow random + access. In these files, if a 'region' or 'contig' are + given an exception is raised. + + .. note:: + + 'all' reads which overlap the region are returned. The + first base returned will be the first base of the first + read 'not' necessarily the first base of the region used + in the query. + + Parameters + ---------- + + truncate : bool + + By default, the samtools pileup engine outputs all reads + overlapping a region. If truncate is True and a region is + given, only columns in the exact region specified are + returned. + + max_depth : int + Maximum read depth permitted. The default limit is '8000'. + + stepper : string + The stepper controls how the iterator advances. + Possible options for the stepper are + + ``all`` + skip reads in which any of the following flags are set: + BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP + + ``nofilter`` + uses every single read turning off any filtering. + + ``samtools`` + same filter and read processing as in samtools + pileup. 
For full compatibility, this requires a + 'fastafile' to be given. The following options all pertain + to filtering of the ``samtools`` stepper. + + fastafile : :class:`~pysam.FastaFile` object. + + This is required for some of the steppers. + + ignore_overlaps: bool + + If set to True, detect if read pairs overlap and only take + the higher quality base. This is the default. + + flag_filter : int + + ignore reads where any of the bits in the flag are set. The default is + BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP. + + flag_require : int + + only use reads where certain flags are set. The default is 0. + + ignore_orphans: bool + + ignore orphans (paired reads that are not in a proper pair). + The default is to ignore orphans. + + min_base_quality: int + + Minimum base quality. Bases below the minimum quality will + not be output. The default is 13. + + adjust_capq_threshold: int + + adjust mapping quality. The default is 0 for no + adjustment. The recommended value for adjustment is 50. + + min_mapping_quality : int + + only use reads above a minimum mapping quality. The default is 0. + + compute_baq: bool + + re-alignment computing per-Base Alignment Qualities (BAQ). The + default is to do re-alignment. Realignment requires a reference + sequence. If none is present, no realignment will be performed. + + redo_baq: bool + + recompute per-Base Alignment Quality on the fly ignoring + existing base qualities. The default is False (use existing + base qualities). + + Returns + ------- + + an iterator over genomic positions. 
: IteratorColumn + + """ + cdef int rtid, has_coord + cdef int32_t rstart, rstop + + if not self.is_open: + raise ValueError("I/O operation on closed file") + + has_coord, rtid, rstart, rstop = self.parse_region( + contig, start, stop, region, reference=reference, end=end) + + if has_coord: + if not self.has_index(): + raise ValueError("no index available for pileup") + + return IteratorColumnRegion(self, + tid=rtid, + start=rstart, + stop=rstop, + **kwargs) + else: + if self.has_index(): + return IteratorColumnAllRefs(self, **kwargs) + else: + return IteratorColumnAll(self, **kwargs) + + def count(self, + contig=None, + start=None, + stop=None, + region=None, + until_eof=False, + read_callback="nofilter", + reference=None, + end=None): + '''count the number of reads in :term:`region` + + The region is specified by :term:`contig`, `start` and `stop`. + :term:`reference` and `end` are also accepted for backward + compatibility as synonyms for :term:`contig` and `stop`, + respectively. Alternatively, a `samtools`_ :term:`region` + string can be supplied. + + A :term:`SAM` file does not allow random access and if + `region` or `contig` are given, an exception is raised. + + Parameters + ---------- + + contig : string + reference_name of the genomic region (chromosome) + + start : int + start of the genomic region (0-based inclusive) + + stop : int + end of the genomic region (0-based exclusive) + + region : string + a region string in samtools format. + + until_eof : bool + count until the end of the file, possibly including + unmapped reads as well. + + read_callback: string or function + + select a call-back to ignore reads when counting. 
It can + be either a string with the following values: + + ``all`` + skip reads in which any of the following + flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, + BAM_FDUP + + ``nofilter`` + uses every single read + + Alternatively, `read_callback` can be a function + ``check_read(read)`` that should return True only for + those reads that shall be included in the counting. + + reference : string + backward compatible synonym for `contig` + + end : int + backward compatible synonym for `stop` + + Raises + ------ + + ValueError + if the genomic coordinates are out of range or invalid. + + ''' + cdef AlignedSegment read + cdef long counter = 0 + + if not self.is_open: + raise ValueError("I/O operation on closed file") + + cdef int filter_method = 0 + if read_callback == "all": + filter_method = 1 + elif read_callback == "nofilter": + filter_method = 2 + + for read in self.fetch(contig=contig, + start=start, + stop=stop, + reference=reference, + end=end, + region=region, + until_eof=until_eof): + # apply filter + if filter_method == 1: + # filter = "all" + if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)): + continue + elif filter_method == 2: + # filter = "nofilter" + pass + else: + if not read_callback(read): + continue + counter += 1 + + return counter + + @cython.boundscheck(False) # we do manual bounds checking + def count_coverage(self, + contig, + start=None, + stop=None, + region=None, + quality_threshold=15, + read_callback='all', + reference=None, + end=None): + """count the coverage of genomic positions by reads in :term:`region`. + + The region is specified by :term:`contig`, `start` and `stop`. + :term:`reference` and `end` are also accepted for backward + compatibility as synonyms for :term:`contig` and `stop`, + respectively. Alternatively, a `samtools`_ :term:`region` + string can be supplied. The coverage is computed per-base [ACGT]. 
+ + Parameters + ---------- + + contig : string + reference_name of the genomic region (chromosome) + + start : int + start of the genomic region (0-based inclusive). If not + given, count from the start of the chromosome. + + stop : int + end of the genomic region (0-based exclusive). If not given, + count to the end of the chromosome. + + region : string + a region string. + + quality_threshold : int + quality_threshold is the minimum quality score (in phred) a + base has to reach to be counted. + + read_callback: string or function + + select a call-back to ignore reads when counting. It can + be either a string with the following values: + + ``all`` + skip reads in which any of the following + flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, + BAM_FDUP + + ``nofilter`` + uses every single read + + Alternatively, `read_callback` can be a function + ``check_read(read)`` that should return True only for + those reads that shall be included in the counting. + + reference : string + backward compatible synonym for `contig` + + end : int + backward compatible synonym for `stop` + + Raises + ------ + + ValueError + if the genomic coordinates are out of range or invalid. 
+ + Returns + ------- + + four array.arrays of the same length in order A C G T : tuple + + """ + + cdef uint32_t contig_length = self.get_reference_length(contig) + cdef int _start = start if start is not None else 0 + cdef int _stop = stop if stop is not None else contig_length + _stop = _stop if _stop < contig_length else contig_length + + if _stop == _start: + raise ValueError("interval of size 0") + if _stop < _start: + raise ValueError("interval of size less than 0") + + cdef int length = _stop - _start + cdef c_array.array int_array_template = array.array('L', []) + cdef c_array.array count_a + cdef c_array.array count_c + cdef c_array.array count_g + cdef c_array.array count_t + count_a = c_array.clone(int_array_template, length, zero=True) + count_c = c_array.clone(int_array_template, length, zero=True) + count_g = c_array.clone(int_array_template, length, zero=True) + count_t = c_array.clone(int_array_template, length, zero=True) + + cdef AlignedSegment read + cdef cython.str seq + cdef c_array.array quality + cdef int qpos + cdef int refpos + cdef int c = 0 + cdef int filter_method = 0 + + + if read_callback == "all": + filter_method = 1 + elif read_callback == "nofilter": + filter_method = 2 + + cdef int _threshold = quality_threshold or 0 + for read in self.fetch(contig=contig, + reference=reference, + start=start, + stop=stop, + end=end, + region=region): + # apply filter + if filter_method == 1: + # filter = "all" + if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)): + continue + elif filter_method == 2: + # filter = "nofilter" + pass + else: + if not read_callback(read): + continue + + # count + seq = read.seq + if seq is None: + continue + quality = read.query_qualities + + for qpos, refpos in read.get_aligned_pairs(True): + if qpos is not None and refpos is not None and \ + _start <= refpos < _stop: + + # only check base quality if _threshold > 0 + if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold: + if seq[qpos] == 'A': + 
count_a.data.as_ulongs[refpos - _start] += 1 + if seq[qpos] == 'C': + count_c.data.as_ulongs[refpos - _start] += 1 + if seq[qpos] == 'G': + count_g.data.as_ulongs[refpos - _start] += 1 + if seq[qpos] == 'T': + count_t.data.as_ulongs[refpos - _start] += 1 + + return count_a, count_c, count_g, count_t + + def find_introns_slow(self, read_iterator): + """Return a dictionary {(start, stop): count} + Listing the intronic sites in the reads (identified by 'N' in the cigar strings), + and their support ( = number of reads ). + + read_iterator can be the result of a .fetch(...) call. + Or it can be a generator filtering such reads. Example + samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) + """ + res = collections.Counter() + for r in read_iterator: + if 'N' in r.cigarstring: + last_read_pos = False + for read_loc, genome_loc in r.get_aligned_pairs(): + if read_loc is None and last_read_pos: + start = genome_loc + elif read_loc and last_read_pos is None: + stop = genome_loc # we are right exclusive ,so this is correct + res[(start, stop)] += 1 + del start + del stop + last_read_pos = read_loc + return res + + def find_introns(self, read_iterator): + """Return a dictionary {(start, stop): count} + Listing the intronic sites in the reads (identified by 'N' in the cigar strings), + and their support ( = number of reads ). + + read_iterator can be the result of a .fetch(...) call. + Or it can be a generator filtering such reads. Example + samfile.find_introns((read for read in samfile.fetch(...) 
if read.is_reverse) + """ + cdef: + uint32_t base_position, junc_start, nt + int op + AlignedSegment r + int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP + + res = collections.Counter() + + match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position + for r in read_iterator: + base_position = r.pos + cigar = r.cigartuples + if cigar is None: + continue + + for op, nt in cigar: + if op in match_or_deletion: + base_position += nt + elif op == BAM_CREF_SKIP: + junc_start = base_position + base_position += nt + res[(junc_start, base_position)] += 1 + return res + + + def close(self): + '''closes the :class:`pysam.AlignmentFile`.''' + + if self.htsfile == NULL: + return + + if self.index != NULL: + hts_idx_destroy(self.index) + self.index = NULL + + cdef int ret = hts_close(self.htsfile) + self.htsfile = NULL + + self.header = None + + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise IOError(errno, force_str(strerror(errno))) + + def __dealloc__(self): + cdef int ret = 0 + + if self.index != NULL: + hts_idx_destroy(self.index) + self.index = NULL + + if self.htsfile != NULL: + ret = hts_close(self.htsfile) + self.htsfile = NULL + + self.header = None + + if self.b: + bam_destroy1(self.b) + self.b = NULL + + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise IOError(errno, force_str(strerror(errno))) + + cpdef int write(self, AlignedSegment read) except -1: + ''' + write a single :class:`pysam.AlignedSegment` to disk. + + Raises: + ValueError + if the writing failed + + Returns: + int : + the number of bytes written. If the file is closed, + this will be 0. 
+ ''' + if not self.is_open: + return 0 + + if self.header.ptr.n_targets <= read._delegate.core.tid: + raise ValueError( + "AlignedSegment refers to reference number {} that " + "is larger than the number of references ({}) in the header".format( + read._delegate.core.tid, self.header.ptr.n_targets)) + + cdef int ret + with nogil: + ret = sam_write1(self.htsfile, + self.header.ptr, + read._delegate) + + # kbj: Still need to raise an exception with except -1. Otherwise + # when ret == -1 we get a "SystemError: error return without + # exception set". + if ret < 0: + raise IOError( + "sam_write1 failed with error code {}".format(ret)) + + return ret + + # context manager interface + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + return False + + ############################################################### + ############################################################### + ############################################################### + ## properties + ############################################################### + property mapped: + """int with total number of mapped alignments according to the + statistics recorded in the index. This is a read-only + attribute. + (This will be 0 for a CRAM file indexed by a .crai index, as that + index format does not record these statistics.) + """ + def __get__(self): + self.check_index() + cdef int tid + cdef uint64_t total = 0 + cdef uint64_t mapped, unmapped + for tid from 0 <= tid < self.header.nreferences: + with nogil: + hts_idx_get_stat(self.index, tid, &mapped, &unmapped) + total += mapped + return total + + property unmapped: + """int with total number of unmapped reads according to the statistics + recorded in the index. This number of reads includes the number of reads + without coordinates. This is a read-only attribute. + (This will be 0 for a CRAM file indexed by a .crai index, as that + index format does not record these statistics.) 
+ """ + def __get__(self): + self.check_index() + cdef int tid + cdef uint64_t total = hts_idx_get_n_no_coor(self.index) + cdef uint64_t mapped, unmapped + for tid from 0 <= tid < self.header.nreferences: + with nogil: + hts_idx_get_stat(self.index, tid, &mapped, &unmapped) + total += unmapped + return total + + property nocoordinate: + """int with total number of reads without coordinates according to the + statistics recorded in the index, i.e., the statistic printed for "*" + by the ``samtools idxstats`` command. This is a read-only attribute. + (This will be 0 for a CRAM file indexed by a .crai index, as that + index format does not record these statistics.) + """ + def __get__(self): + self.check_index() + cdef uint64_t n + with nogil: + n = hts_idx_get_n_no_coor(self.index) + return n + + def get_index_statistics(self): + """return statistics about mapped/unmapped reads per chromosome as + they are stored in the index, similarly to the statistics printed + by the ``samtools idxstats`` command. + + CRAI indexes do not record these statistics, so for a CRAM file + with a .crai index the returned statistics will all be 0. + + Returns: + list : + a list of records for each chromosome. Each record has the + attributes 'contig', 'mapped', 'unmapped' and 'total'. 
+ """ + + self.check_index() + cdef int tid + cdef uint64_t mapped, unmapped + results = [] + # TODO: use header + for tid from 0 <= tid < self.nreferences: + with nogil: + hts_idx_get_stat(self.index, tid, &mapped, &unmapped) + results.append( + IndexStats._make(( + self.get_reference_name(tid), + mapped, + unmapped, + mapped + unmapped))) + + return results + + ############################################################### + ## file-object like iterator access + ## note: concurrent access will cause errors (see IteratorRow + ## and multiple_iterators) + ## Possible solutions: deprecate or open new file handle + def __iter__(self): + if not self.is_open: + raise ValueError("I/O operation on closed file") + + if not self.is_bam and self.header.nreferences == 0: + raise NotImplementedError( + "can not iterate over samfile without header") + return self + + cdef bam1_t * getCurrent(self): + return self.b + + cdef int cnext(self): + ''' + cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`. + ''' + cdef int ret + cdef bam_hdr_t * hdr = self.header.ptr + with nogil: + ret = sam_read1(self.htsfile, + hdr, + self.b) + return ret + + def __next__(self): + cdef int ret = self.cnext() + if ret >= 0: + return makeAlignedSegment(self.b, self.header) + elif ret == -1: + raise StopIteration + else: + raise IOError(read_failure_reason(ret)) + + ########################################### + # methods/properties referencing the header + def is_valid_tid(self, int tid): + """ + return True if the numerical :term:`tid` is valid; False otherwise. + + Note that the unmapped tid code (-1) counts as an invalid. + """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.is_valid_tid(tid) + + def get_tid(self, reference): + """ + return the numerical :term:`tid` corresponding to + :term:`reference` + + returns -1 if reference is not known. 
+ """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.get_tid(reference) + + def get_reference_name(self, tid): + """ + return :term:`reference` name corresponding to numerical :term:`tid` + """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.get_reference_name(tid) + + def get_reference_length(self, reference): + """ + return :term:`reference` length corresponding to numerical :term:`tid` + """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.get_reference_length(reference) + + property nreferences: + """int with the number of :term:`reference` sequences in the file. + This is a read-only attribute.""" + def __get__(self): + if self.header: + return self.header.nreferences + else: + raise ValueError("header not available in closed files") + + property references: + """tuple with the names of :term:`reference` sequences. This is a + read-only attribute""" + def __get__(self): + if self.header: + return self.header.references + else: + raise ValueError("header not available in closed files") + + property lengths: + """tuple of the lengths of the :term:`reference` sequences. This is a + read-only attribute. 
The lengths are in the same order as + :attr:`pysam.AlignmentFile.references` + + """ + def __get__(self): + if self.header: + return self.header.lengths + else: + raise ValueError("header not available in closed files") + + # Compatibility functions for pysam < 0.14 + property text: + """deprecated, use :attr:`references` and :attr:`lengths` instead""" + def __get__(self): + if self.header: + return self.header.__str__() + else: + raise ValueError("header not available in closed files") + + # Compatibility functions for pysam < 0.8.3 + def gettid(self, reference): + """deprecated, use :meth:`get_tid` instead""" + return self.get_tid(reference) + + def getrname(self, tid): + """deprecated, use :meth:`get_reference_name` instead""" + return self.get_reference_name(tid) + + +cdef class IteratorRow: + '''abstract base class for iterators over mapped reads. + + Various iterators implement different behaviours for wrapping around + contig boundaries. Examples include: + + :class:`pysam.IteratorRowRegion` + iterate within a single contig and a defined region. + + :class:`pysam.IteratorRowAll` + iterate until EOF. This iterator will also include unmapped reads. + + :class:`pysam.IteratorRowAllRefs` + iterate over all reads in all reference sequences. + + The method :meth:`AlignmentFile.fetch` returns an IteratorRow. + + .. note:: + + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a + :meth:`AlignmentFile.fetch`. + + ''' + + def __init__(self, AlignmentFile samfile, int multiple_iterators=False): + cdef char *cfilename + cdef char *creference_filename + cdef char *cindexname = NULL + + if not samfile.is_open: + raise ValueError("I/O operation on closed file") + + # makes sure that samfile stays alive as long as the + # iterator is alive + self.samfile = samfile + + # reopen the file - note that this makes the iterator + # slow and causes pileup to slow down significantly. 
+ if multiple_iterators: + + cfilename = samfile.filename + with nogil: + self.htsfile = hts_open(cfilename, 'r') + assert self.htsfile != NULL + + if samfile.has_index(): + if samfile.index_filename: + cindexname = bindex_filename = encode_filename(samfile.index_filename) + with nogil: + self.index = sam_index_load2(self.htsfile, cfilename, cindexname) + else: + self.index = NULL + + # need to advance in newly opened file to position after header + # better: use seek/tell? + with nogil: + hdr = sam_hdr_read(self.htsfile) + if hdr is NULL: + raise IOError("unable to read header information") + self.header = makeAlignmentHeader(hdr) + + self.owns_samfile = True + + # options specific to CRAM files + if samfile.is_cram and samfile.reference_filename: + creference_filename = samfile.reference_filename + hts_set_opt(self.htsfile, + CRAM_OPT_REFERENCE, + creference_filename) + + else: + self.htsfile = samfile.htsfile + self.index = samfile.index + self.owns_samfile = False + self.header = samfile.header + + self.retval = 0 + + self.b = bam_init1() + + def __dealloc__(self): + bam_destroy1(self.b) + if self.owns_samfile: + hts_idx_destroy(self.index) + hts_close(self.htsfile) + + +cdef class IteratorRowRegion(IteratorRow): + """*(AlignmentFile samfile, int tid, int beg, int stop, + int multiple_iterators=False)* + + iterate over mapped reads in a region. + + .. note:: + + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a + :meth:`AlignmentFile.fetch`. 
    """

    def __init__(self, AlignmentFile samfile,
                 int tid, int beg, int stop,
                 int multiple_iterators=False):

        # a region iterator requires a BAM/CRAM index to seek
        if not samfile.has_index():
            raise ValueError("no index available for iteration")

        super().__init__(samfile, multiple_iterators=multiple_iterators)
        # sam_itr_queryi builds an htslib iterator over the half-open
        # region [beg, stop) on reference `tid`
        with nogil:
            self.iter = sam_itr_queryi(
                self.index,
                tid,
                beg,
                stop)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        # expose the internal read buffer; the buffer is reused on
        # every iteration, so callers must copy it to keep the data
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        # result is stored in self.retval rather than returned, so
        # __next__ (and IteratorRowAllRefs) can inspect it afterwards
        with nogil:
            self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
                                       self.iter,
                                       self.b,
                                       self.htsfile)

    def __next__(self):
        self.cnext()
        # htslib convention: >= 0 success, -1 end of data, < -1 error
        if self.retval >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif self.retval == -1:
            raise StopIteration
        elif self.retval == -2:
            # Note: it is currently not the case that hts_iter_next
            # returns -2 for a truncated file.
            # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
            raise IOError('truncated file')
        else:
            raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval))

    def __dealloc__(self):
        hts_itr_destroy(self.iter)


cdef class IteratorRowHead(IteratorRow):
    """*(AlignmentFile samfile, n, int multiple_iterators=False)*

    iterate over first n reads in `samfile`

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.head`.

    """

    def __init__(self,
                 AlignmentFile samfile,
                 int n,
                 int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # stop after `n` reads even if the file holds more
        self.max_rows = n
        self.current_row = 0

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        # plain sequential read from the current file position; no
        # index/region iterator is involved
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        if self.current_row >= self.max_rows:
            raise StopIteration

        cdef int ret = self.cnext()
        if ret >= 0:
            self.current_row += 1
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))


cdef class IteratorRowAll(IteratorRow):
    """*(AlignmentFile samfile, int multiple_iterators=False)*

    iterate over all reads in `samfile`

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn'''
        cdef int ret
        cdef bam_hdr_t * hdr = self.header.ptr
        with nogil:
            ret = sam_read1(self.htsfile,
                            hdr,
                            self.b)
        return ret

    def __next__(self):
        cdef int ret = self.cnext()
        if ret >= 0:
            return makeAlignedSegment(self.b, self.header)
        elif ret == -1:
            raise StopIteration
        else:
            raise IOError(read_failure_reason(ret))


cdef class IteratorRowAllRefs(IteratorRow):
    """iterates over all mapped reads by chaining iterators over each
    reference

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        if not samfile.has_index():
            raise ValueError("no index available for fetch")

        # tid == -1 marks "no per-reference iterator created yet"
        self.tid = -1

    def nextiter(self):
        # get a new iterator for a chromosome. The file
        # will not be re-opened.
        self.rowiter = IteratorRowRegion(self.samfile,
                                         self.tid,
                                         0,
                                         MAX_POS)
        # set htsfile and header of the rowiter
        # to the values in this iterator to reflect multiple_iterators
        self.rowiter.htsfile = self.htsfile
        self.rowiter.header = self.header

        # make sure the iterator understand that IteratorRowAllRefs
        # has ownership
        self.rowiter.owns_samfile = False

    def __iter__(self):
        return self

    def __next__(self):
        # Create an initial iterator
        if self.tid == -1:
            if not self.samfile.nreferences:
                raise StopIteration
            self.tid = 0
            self.nextiter()

        while 1:
            self.rowiter.cnext()

            # If current iterator is not exhausted, return aligned read
            if self.rowiter.retval > 0:
                return makeAlignedSegment(self.rowiter.b, self.header)

            self.tid += 1

            # Otherwise, proceed to next reference or stop
            if self.tid < self.samfile.nreferences:
                self.nextiter()
            else:
                raise StopIteration


cdef class IteratorRowSelection(IteratorRow):
    """*(AlignmentFile samfile)*

    iterate over reads in `samfile` at a given list of file positions.

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
+ """ + + def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True): + super().__init__(samfile, multiple_iterators=multiple_iterators) + + self.positions = positions + self.current_pos = 0 + + def __iter__(self): + return self + + cdef bam1_t * getCurrent(self): + return self.b + + cdef int cnext(self): + '''cversion of iterator''' + # end iteration if out of positions + if self.current_pos >= len(self.positions): return -1 + + cdef uint64_t pos = self.positions[self.current_pos] + with nogil: + bgzf_seek(hts_get_bgzfp(self.htsfile), + pos, + 0) + self.current_pos += 1 + + cdef int ret + cdef bam_hdr_t * hdr = self.header.ptr + with nogil: + ret = sam_read1(self.htsfile, + hdr, + self.b) + return ret + + def __next__(self): + cdef int ret = self.cnext() + if ret >= 0: + return makeAlignedSegment(self.b, self.header) + elif ret == -1: + raise StopIteration + else: + raise IOError(read_failure_reason(ret)) + + +cdef int __advance_nofilter(void *data, bam1_t *b): + '''advance without any read filtering. + ''' + cdef __iterdata * d = <__iterdata*>data + cdef int ret + with nogil: + ret = sam_itr_next(d.htsfile, d.iter, b) + return ret + + +cdef int __advance_raw_nofilter(void *data, bam1_t *b): + '''advance (without iterator) without any read filtering. 
+ ''' + cdef __iterdata * d = <__iterdata*>data + cdef int ret + with nogil: + ret = sam_read1(d.htsfile, d.header, b) + return ret + + +cdef int __advance_all(void *data, bam1_t *b): + '''only use reads for pileup passing basic filters such as + + BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP + ''' + + cdef __iterdata * d = <__iterdata*>data + cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP + cdef int ret + while 1: + with nogil: + ret = sam_itr_next(d.htsfile, d.iter, b) + if ret < 0: + break + if b.core.flag & d.flag_filter: + continue + break + return ret + + +cdef int __advance_raw_all(void *data, bam1_t *b): + '''only use reads for pileup passing basic filters such as + + BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP + ''' + + cdef __iterdata * d = <__iterdata*>data + cdef int ret + while 1: + with nogil: + ret = sam_read1(d.htsfile, d.header, b) + if ret < 0: + break + if b.core.flag & d.flag_filter: + continue + break + return ret + + +cdef int __advance_samtools(void * data, bam1_t * b): + '''advance using same filter and read processing as in + the samtools pileup. 
+ ''' + cdef __iterdata * d = <__iterdata*>data + cdef int ret + cdef int q + + while 1: + with nogil: + ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b) + if ret < 0: + break + if b.core.flag & d.flag_filter: + continue + if d.flag_require and not (b.core.flag & d.flag_require): + continue + + # reload sequence + if d.fastafile != NULL and b.core.tid != d.tid: + if d.seq != NULL: + free(d.seq) + d.tid = b.core.tid + with nogil: + d.seq = faidx_fetch_seq( + d.fastafile, + d.header.target_name[d.tid], + 0, MAX_POS, + &d.seq_len) + + if d.seq == NULL: + raise ValueError( + "reference sequence for '{}' (tid={}) not found".format( + d.header.target_name[d.tid], d.tid)) + + # realign read - changes base qualities + if d.seq != NULL and d.compute_baq: + # 4th option to realign is flag: + # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4 + if d.redo_baq: + sam_prob_realn(b, d.seq, d.seq_len, 7) + else: + sam_prob_realn(b, d.seq, d.seq_len, 3) + + if d.seq != NULL and d.adjust_capq_threshold > 10: + q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold) + if q < 0: + continue + elif b.core.qual > q: + b.core.qual = q + + if b.core.qual < d.min_mapping_quality: + continue + if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR): + continue + + break + + return ret + + +cdef class IteratorColumn: + '''abstract base class for iterators over columns. + + IteratorColumn objects wrap the pileup functionality of samtools. + + For reasons of efficiency, the iterator points to the current + pileup buffer. The pileup buffer is updated at every iteration. + This might cause some unexpected behaviour. For example, + consider the conversion to a list:: + + f = AlignmentFile("file.bam", "rb") + result = list(f.pileup()) + + Here, ``result`` will contain ``n`` objects of type + :class:`~pysam.PileupColumn` for ``n`` columns, but each object in + ``result`` will contain the same information. 
    The desired behaviour can be achieved by list comprehension::

        result = [x.pileups() for x in f.pileup()]

    ``result`` will be a list of ``n`` lists of objects of type
    :class:`~pysam.PileupRead`.

    If the iterator is associated with a :class:`~pysam.Fastafile`
    using the :meth:`add_reference` method, then the iterator will
    export the current sequence via the methods :meth:`get_sequence`
    and :meth:`seq_len`.

    See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
    '''

    def __cinit__( self, AlignmentFile samfile, **kwargs):
        # copy pileup configuration from kwargs into Python-level
        # attributes and into the C-level __iterdata struct that the
        # __advance_* callbacks read
        self.samfile = samfile
        self.fastafile = kwargs.get("fastafile", None)
        self.stepper = kwargs.get("stepper", "samtools")
        self.max_depth = kwargs.get("max_depth", 8000)
        self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
        self.min_base_quality = kwargs.get("min_base_quality", 13)
        self.iterdata.seq = NULL
        self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
        self.iterdata.flag_require = kwargs.get("flag_require", 0)
        self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
        self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
        self.iterdata.compute_baq = kwargs.get("compute_baq", True)
        self.iterdata.redo_baq = kwargs.get("redo_baq", False)
        self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)

        # current pileup position and buffer; filled in by cnext()
        self.tid = 0
        self.pos = 0
        self.n_plp = 0
        self.plp = NULL
        self.pileup_iter = <bam_mplp_t>NULL

    def __iter__(self):
        return self

    cdef int cnext(self):
        '''perform next iteration.
        '''
        # do not release gil here because of call-backs
        cdef int ret = bam_mplp_auto(self.pileup_iter,
                                     &self.tid,
                                     &self.pos,
                                     &self.n_plp,
                                     &self.plp)
        return ret

    cdef char * get_sequence(self):
        '''return current reference sequence underlying the iterator.
        '''
        return self.iterdata.seq

    property seq_len:
        '''current sequence length.'''
        def __get__(self):
            return self.iterdata.seq_len

    def add_reference(self, FastaFile fastafile):
        '''
        add reference sequences in `fastafile` to iterator.'''
        self.fastafile = fastafile
        # drop any cached sequence; tid = -1 forces a reload on the
        # next advance
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
        self.iterdata.tid = -1
        self.iterdata.fastafile = self.fastafile.fastafile

    def has_reference(self):
        '''
        return true if iterator is associated with a reference'''
        return self.fastafile

    cdef _setup_iterator(self,
                         int tid,
                         int start,
                         int stop,
                         int multiple_iterators=0):
        '''setup the iterator structure'''

        # region-based setup: wrap an IteratorRowRegion and hand its
        # htslib iterator to the __advance_* callbacks via iterdata
        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.iter = self.iter.iter
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        # pick the advance callback matching the requested stepper
        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef _setup_raw_rest_iterator(self):
        '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST'''

        # no region iterator: iterdata.iter stays NULL and the
        # "__advance_raw_*" callbacks (or __advance_samtools's
        # sam_read1 branch) read the file sequentially
        self.iter = None
        self.iterdata.iter = NULL
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef reset(self, tid, start, stop):
        '''reset iterator position.

        This permits using the iterator multiple times without
        having to incur the full set-up costs.
        '''
        # resetting only makes sense for region-based setups
        if self.iter is None:
            raise TypeError("Raw iterator set up without region cannot be reset")

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
        self.iterdata.iter = self.iter.iter

        # invalidate sequence if different tid
        if self.tid != tid:
            if self.iterdata.seq != NULL:
                free(self.iterdata.seq)
            self.iterdata.seq = NULL
            self.iterdata.tid = -1

        # self.pileup_iter = bam_mplp_init(1
        #                          &__advancepileup,
        #                          &self.iterdata)
        with nogil:
            bam_mplp_reset(self.pileup_iter)

    cdef _free_pileup_iter(self):
        '''free the memory alloc'd by bam_plp_init.

        This is needed before setup_iterator allocates another
        pileup_iter, or else memory will be lost. '''
        if self.pileup_iter != <bam_mplp_t>NULL:
            with nogil:
                bam_mplp_reset(self.pileup_iter)
                bam_mplp_destroy(self.pileup_iter)
                self.pileup_iter = <bam_mplp_t>NULL

    def __dealloc__(self):
        # reset in order to avoid memory leak messages for iterators
        # that have not been fully consumed
        self._free_pileup_iter()
        self.plp = <const bam_pileup1_t*>NULL

        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            self.iterdata.seq = NULL

    # backwards compatibility

    def hasReference(self):
        return self.has_reference()
    cdef char * getSequence(self):
        return self.get_sequence()
    def addReference(self, FastaFile fastafile):
        return self.add_reference(fastafile)


cdef class IteratorColumnRegion(IteratorColumn):
    '''iterates over a region only.
    '''
    def __cinit__(self,
                  AlignmentFile samfile,
                  int tid = 0,
                  int start = 0,
                  int stop = MAX_POS,
                  int truncate = False,
                  int multiple_iterators = True,
                  **kwargs ):

        # initialize iterator. Multiple iterators not available
        # for CRAM.
        if multiple_iterators and samfile.is_cram:
            warnings.warn("multiple_iterators not implemented for CRAM")
            multiple_iterators = False

        self._setup_iterator(tid, start, stop, multiple_iterators)
        self.start = start
        self.stop = stop
        self.truncate = truncate

    def __next__(self):

        cdef int n

        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration" )

            if n == 0:
                raise StopIteration

            # with truncate, only yield columns strictly inside
            # [start, stop); columns before start are skipped
            if self.truncate:
                if self.start > self.pos:
                    continue
                if self.pos >= self.stop:
                    raise StopIteration

            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)


cdef class IteratorColumnAllRefs(IteratorColumn):
    """iterates over all columns by chaining iterators over each reference
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # no iteration over empty files
        if not samfile.nreferences:
            raise StopIteration

        # initialize iterator
        self._setup_iterator(self.tid, 0, MAX_POS, 1)

    def __next__(self):

        cdef int n
        while 1:
            n = self.cnext()
            if n < 0:
                raise ValueError("error during iteration")

            # proceed to next reference or stop
            if n == 0:
                self.tid += 1
                if self.tid < self.samfile.nreferences:
                    self._setup_iterator(self.tid, 0, MAX_POS, 0)
                else:
                    raise StopIteration
                continue

            # return result, if within same reference
            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)


cdef class IteratorColumnAll(IteratorColumn):
    """iterates over all columns, without using an index
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # sequential sam_read1-based setup; works without an index
        self._setup_raw_rest_iterator()

    def __next__(self):

        cdef int n
        n = self.cnext()
        if n < 0:
            raise ValueError("error during iteration")

        if n == 0:
            raise StopIteration

        return makePileupColumn(&self.plp,
                                self.tid,
                                self.pos,
                                self.n_plp,
                                self.min_base_quality,
                                self.iterdata.seq,
                                self.samfile.header)


cdef class SNPCall:
    '''the results of a SNP call.'''
    # plain C storage; values are exposed read-only via the
    # properties below
    cdef int _tid
    cdef int _pos
    cdef char _reference_base
    cdef char _genotype
    cdef int _consensus_quality
    cdef int _snp_quality
    cdef int _rms_mapping_quality
    cdef int _coverage

    property tid:
        '''the chromosome ID as is defined in the header'''
        def __get__(self):
            return self._tid

    property pos:
        '''nucleotide position of SNP.'''
        def __get__(self): return self._pos

    property reference_base:
        '''reference base at pos. ``N`` if no reference sequence supplied.'''
        def __get__(self): return from_string_and_size( &self._reference_base, 1 )

    property genotype:
        '''the genotype called.'''
        def __get__(self): return from_string_and_size( &self._genotype, 1 )

    property consensus_quality:
        '''the genotype quality (Phred-scaled).'''
        def __get__(self): return self._consensus_quality

    property snp_quality:
        '''the snp quality (Phred scaled) - probability of consensus being
        identical to reference sequence.'''
        def __get__(self): return self._snp_quality

    property mapping_quality:
        '''the root mean square (rms) of the mapping quality of all reads
        involved in the call.'''
        def __get__(self): return self._rms_mapping_quality

    property coverage:
        '''coverage or read depth - the number of reads involved in the call.'''
        def __get__(self): return self._coverage

    def __str__(self):
        # tab-separated representation of all call fields
        return "\t".join( map(str, (
            self.tid,
            self.pos,
            self.reference_base,
            self.genotype,
            self.consensus_quality,
            self.snp_quality,
            self.mapping_quality,
            self.coverage ) ) )


cdef class IndexedReads:
    """Index a Sam/BAM-file by query name while keeping the
    original sort order intact.

    The index is kept in memory and can be substantial.

    By default, the file is re-opened to avoid conflicts if multiple
    operators work on the same file.
    Set `multiple_iterators` = False
    to not re-open `samfile`.

    Parameters
    ----------

    samfile : AlignmentFile
        File to be indexed.

    multiple_iterators : bool
        Flag indicating whether the file should be reopened. Reopening prevents
        existing iterators being affected by the indexing.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
        cdef char *cfilename

        # makes sure that samfile stays alive as long as this
        # object is alive.
        self.samfile = samfile
        cdef bam_hdr_t * hdr = NULL
        assert samfile.is_bam, "can only apply IndexReads on bam files"

        # multiple_iterators the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:
            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            if self.htsfile == NULL:
                raise OSError("unable to reopen htsfile")

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr == NULL:
                raise OSError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)
            # this object owns the reopened handle and must close it
            self.owns_samfile = True
        else:
            # share the caller's handle; do not close it in __dealloc__
            self.htsfile = self.samfile.htsfile
            self.header = samfile.header
            self.owns_samfile = False

    def build(self):
        '''build the index.'''

        self.index = collections.defaultdict(list)

        # this method will start indexing from the current file position
        cdef int ret = 1
        cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
        if b == NULL:
            raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))

        cdef uint64_t pos
        cdef bam_hdr_t * hdr = self.header.ptr

        # record the BGZF virtual offset of each record *before*
        # reading it, keyed by query name
        while ret > 0:
            with nogil:
                pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
                ret = sam_read1(self.htsfile,
                                hdr,
                                b)

            if ret > 0:
                qname = charptr_to_str(pysam_bam_get_qname(b))
                self.index[qname].append(pos)

        bam_destroy1(b)

    def find(self, query_name):
        '''find `query_name` in index.

        Returns
        -------

        IteratorRowSelection
            Returns an iterator over all reads with query_name.

        Raises
        ------

        KeyError
            if the `query_name` is not in the index.

        '''
        if query_name in self.index:
            return IteratorRowSelection(
                self.samfile,
                self.index[query_name],
                multiple_iterators = False)
        else:
            raise KeyError("read %s not found" % query_name)

    def __dealloc__(self):
        # close only if we reopened the file in __init__
        if self.owns_samfile:
            hts_close(self.htsfile)