# cython: embedsignature=True
# cython: profile=True
########################################################
########################################################
# Cython wrapper for SAM/BAM/CRAM files based on htslib
########################################################
# The principal classes defined in this module are:
#
# class AlignmentFile   read/write access to SAM/BAM/CRAM formatted files
#
# class AlignmentHeader manage SAM/BAM/CRAM header data
#
# class IndexedReads    index a SAM/BAM/CRAM file by query name while keeping
#                       the original sort order intact
#
# Additionally this module defines numerous additional classes that
# are part of the internal API. These are:
#
# Various iterator classes to iterate over alignments in sequential
# (IteratorRow) or in a stacked fashion (IteratorColumn):
#
# class IteratorRow
# class IteratorRowRegion
# class IteratorRowHead
# class IteratorRowAll
# class IteratorRowAllRefs
# class IteratorRowSelection
# class IteratorColumn
# class IteratorColumnRegion
# class IteratorColumnAll
# class IteratorColumnAllRefs
#
########################################################
#
# The MIT License
#
# Copyright (c) 2015 Andreas Heger
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
########################################################
import os
import collections
try:
    from collections.abc import Sequence, Mapping  # noqa
except ImportError:
    from collections import Sequence, Mapping  # noqa
import re
import warnings
import array
from libc.errno cimport errno, EPIPE
from libc.string cimport strcmp, strpbrk, strerror
from libc.stdint cimport INT32_MAX

from cpython cimport array as c_array

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size
from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
    HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL

from io import StringIO

cimport cython


__all__ = [
    "AlignmentFile",
    "AlignmentHeader",
    "IteratorRow",
    "IteratorColumn",
    "IndexedReads"]

# per-contig mapping statistics as returned by an index lookup
IndexStats = collections.namedtuple("IndexStats",
                                    ("contig",
                                     "mapped",
                                     "unmapped",
                                     "total"))

########################################################
## global variables
# maximum genomic coordinate
# for some reason, using 'int' causes overflow
cdef int MAX_POS = (1 << 31) - 1

# valid types for SAM headers: "HD" is a single mapping, all
# other record types may occur multiple times (sequence of lines)
VALID_HEADER_TYPES = {"HD" : Mapping,
                      "SQ" : Sequence,
                      "RG" : Sequence,
                      "PG" : Sequence,
                      "CO" : Sequence}

# order of records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")

# default type conversions within SAM header records
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str,
                               "SS" : str,},
                       "SQ" : {"SN" : str, "LN" : int, "AS" : str,
                               "M5" : str, "SP" : str, "UR" : str,
                               "AH" : str, "TP" : str, "DS" : str,
                               "AN" : str,},
                       "RG" : {"ID" : str, "CN" : str, "DS" : str,
                               "DT" : str, "FO" : str, "KS" : str,
                               "LB" : str, "PG" : str, "PI" : str,
                               "PL" : str, "PM" : str, "PU" : str,
                               "SM" : str, "BC" : str,},
                       "PG" : {"ID" : str, "PN" : str, "CL" : str,
                               "PP" : str, "DS" : str, "VN" : str,},}

# output order of fields within records. Ensure that CL is at
# the end as parsing a CL will ignore any subsequent records.
VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"),
                      "SQ" : ("SN", "LN", "AS", "M5",
                              "UR", "SP", "AH", "TP",
                              "DS", "AN"),
                      "RG" : ("ID", "CN", "SM", "LB",
                              "PU", "PI", "DT", "DS",
                              "PL", "FO", "KS", "PG",
                              "PM", "BC"),
                      "PG" : ("PN", "ID", "VN", "PP",
                              "DS", "CL"),}


def build_header_line(fields, record):
    '''build a header line from `fields` dictionary for `record`.

    For "CO" records `fields` is the comment string itself; otherwise
    it is a mapping of tag -> value. Known tags are emitted in the
    order given by VALID_HEADER_ORDER, followed by any user tags
    (non upper-case) in iteration order.
    '''
    # TODO: add checking for field and sort order
    line = ["@%s" % record]
    # comment lines carry free text, not tag:value pairs
    if record == "CO":
        line.append(fields)
    # user-defined record type: emit tags alphabetically
    elif record.islower():
        for key in sorted(fields):
            line.append("%s:%s" % (key, str(fields[key])))
    # defined tags
    else:
        # write fields of the specification first ...
        for key in VALID_HEADER_ORDER[record]:
            if key in fields:
                line.append("%s:%s" % (key, str(fields[key])))
        # ... then any user fields
        for key in fields:
            if not key.isupper():
                line.append("%s:%s" % (key, str(fields[key])))

    return "\t".join(line)


cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
    """C-level constructor: wrap an existing bam_hdr_t in an
    AlignmentHeader without copying it. The wrapper takes ownership
    of the pointer (freed in __dealloc__)."""
    if not hdr:
        raise ValueError('cannot create AlignmentHeader, received NULL pointer')

    # bypass __init__ so no fresh bam_hdr_t is allocated;
    # AlignmentHeader.__cinit__ is still invoked by __new__
    cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader)
    header.ptr = hdr

    return header


def read_failure_reason(code):
    """translate a negative read return code into a human-readable message."""
    if code == -2:
        return 'truncated file'
    else:
        return "error {} while reading file".format(code)


# the following should be class-method for VariantHeader, but cdef @classmethods
# are not implemented in cython.
cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
                                        reference_names,
                                        reference_lengths,
                                        add_sq_text=True,
                                        text=None) except -1:
    """build header from list of reference names and lengths.
    """


cdef class AlignmentHeader(object):
    """header information for a :class:`AlignmentFile` object

    Parameters
    ----------
    header_dict : dict
        build header from a multi-level dictionary. The
        first level are the four types ('HD', 'SQ', ...). The second
        level are a list of lines, with each line being a list of
        tag-value pairs. The header is constructed first from all the
        defined fields, followed by user tags in alphabetical
        order. Alternatively, an :class:`~pysam.AlignmentHeader`
        object can be passed directly.

    text : string
        use the string provided as the header

    reference_names : list
        see reference_lengths

    reference_lengths : list
        build header from list of chromosome names and lengths. By
        default, 'SQ' and 'LN' tags will be added to the header
        text. This option can be changed by unsetting the flag
        `add_sq_text`.

    add_sq_text : bool
        do not add 'SQ' and 'LN' tags to header. This option permits
        construction :term:`SAM` formatted files without a header.

    """

    # See makeAlignmentHeader for the C-level constructor
    def __cinit__(self):
        self.ptr = NULL

    # Python constructor
    def __init__(self):
        self.ptr = bam_hdr_init()
        if self.ptr is NULL:
            raise MemoryError("could not create header")

    @classmethod
    def _from_text_and_lengths(cls, text, reference_names, reference_lengths):
        """build a header from raw `text` (may be None) plus parallel
        lists of reference names and lengths (may be empty)."""
        cdef AlignmentHeader self = AlignmentHeader()
        cdef char *ctext
        cdef int l_text
        cdef int n, x
        if text is not None:
            btext = force_bytes(text)
            ctext = btext
            l_text = len(btext)
            # copy the text including the terminating NUL byte
            self.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
            if self.ptr.text == NULL:
                raise MemoryError("could not allocate {} bytes".format(l_text + 1))
            self.ptr.l_text = l_text
            memcpy(self.ptr.text, ctext, l_text + 1)

        if reference_names and reference_lengths:
            reference_names = [force_bytes(ref) for ref in reference_names]

            self.ptr.n_targets = len(reference_names)

            # allocate one extra, zero-initialized slot as a guard entry
            n = len(reference_names) + 1
            self.ptr.target_name = <char**>calloc(n, sizeof(char*))
            if self.ptr.target_name == NULL:
                raise MemoryError("could not allocate {} bytes".format(n * sizeof(char*)))

            self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
            if self.ptr.target_len == NULL:
                raise MemoryError("could not allocate {} bytes".format(n * sizeof(uint32_t)))

            for x in range(self.ptr.n_targets):
                self.ptr.target_len[x] = reference_lengths[x]
                name = reference_names[x]
                # calloc zero-fills, so the copied name is NUL-terminated
                self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
                if self.ptr.target_name[x] == NULL:
                    raise MemoryError("could not allocate {} bytes".format(len(name) + 1))
                strncpy(self.ptr.target_name[x], name, len(name))

        return self

    @classmethod
    def from_text(cls, text):
        """build an AlignmentHeader from a plain-text SAM header.

        Reference names and lengths are extracted from the @SQ lines.

        Raises
        ------
        KeyError
            if an @SQ line lacks the SN or LN tag.
        ValueError
            if an LN tag is not an integer.
        """
        reference_names, reference_lengths = [], []
        for line in text.splitlines():
            if line.startswith("@SQ"):
                fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(int(fields["LN"]))
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))
                except ValueError:
                    raise ValueError("wrong sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_dict(cls, header_dict):
        """build an AlignmentHeader from a two-level dictionary of the
        shape returned by :meth:`to_dict`."""
        cdef list lines = []
        # first: defined tags, in specification order
        for record in VALID_HEADERS:
            if record in header_dict:
                data = header_dict[record]
                if not isinstance(data, VALID_HEADER_TYPES[record]):
                    # bugfix: the message previously used %-style
                    # placeholders with str.format and was emitted unexpanded
                    raise ValueError(
                        "invalid type for record {}: {}, expected {}".format(
                            record, type(data), VALID_HEADER_TYPES[record]))
                if isinstance(data, Mapping):
                    lines.append(build_header_line(data, record))
                else:
                    for fields in header_dict[record]:
                        lines.append(build_header_line(fields, record))

        # then: user tags (lower case), sorted alphabetically
        for record, data in sorted(header_dict.items()):
            if record in VALID_HEADERS:
                continue
            if isinstance(data, Mapping):
                lines.append(build_header_line(data, record))
            else:
                for fields in header_dict[record]:
                    lines.append(build_header_line(fields, record))

        text = "\n".join(lines) + "\n"

        reference_names, reference_lengths = [], []
        if "SQ" in header_dict:
            for fields in header_dict["SQ"]:
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(fields["LN"])
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):
        """build an AlignmentHeader from parallel lists of reference
        names and lengths; optionally generate @SQ text lines."""
        if len(reference_names) != len(reference_lengths):
            raise ValueError("number of reference names and lengths do not match")

        # optionally, if there is no text, add a SAM compatible header to output file.
        if text is None and add_sq_text:
            text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
                reference_names, reference_lengths)])

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    def __dealloc__(self):
        bam_hdr_destroy(self.ptr)
        self.ptr = NULL

    def __bool__(self):
        return self.ptr != NULL

    def copy(self):
        """return an independent copy of this header."""
        return makeAlignmentHeader(bam_hdr_dup(self.ptr))

    property nreferences:
        """int with the number of :term:`reference` sequences in the file.

        This is a read-only attribute."""
        def __get__(self):
            return self.ptr.n_targets

    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(charptr_to_str(self.ptr.target_name[x]))
            return tuple(t)

    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`
        """
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(self.ptr.target_len[x])
            return tuple(t)

    def _build_sequence_section(self):
        """return sequence section of header.

        The sequence section is built from the list of reference names and
        lengths stored in the BAM-file and not from any @SQ entries that
        are part of the header's text section.
        """
        cdef int x
        text = []
        for x in range(self.ptr.n_targets):
            text.append("@SQ\tSN:{}\tLN:{}\n".format(
                force_str(self.ptr.target_name[x]),
                self.ptr.target_len[x]))
        return "".join(text)

    def to_dict(self):
        """return two-level dictionary with header information from the file.

        The first level contains the record (``HD``, ``SQ``, etc) and
        the second level contains the fields (``VN``, ``LN``, etc).

        The parser is validating and will raise an AssertionError if
        it encounters any record or field tags that are not part of
        the SAM specification. Use the
        :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
        header.

        The parsing follows the SAM format specification with the
        exception of the ``CL`` field. This option will consume the
        rest of a header line irrespective of any additional fields.
        This behaviour has been added to accommodate command line
        options that contain characters that are not valid field
        separators.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.
        """
        result = collections.OrderedDict()

        # convert to python string
        t = self.__str__()
        for line in t.split("\n"):
            line = line.strip(' \0')
            if not line:
                continue
            assert line.startswith("@"), \
                "header line without '@': '%s'" % line
            fields = line[1:].split("\t")
            record = fields[0]
            assert record in VALID_HEADER_TYPES, \
                "header line with invalid type '%s': '%s'" % (record, line)

            # treat comments
            if record == "CO":
                if record not in result:
                    result[record] = []
                result[record].append("\t".join(fields[1:]))
                continue
            # the following is clumsy as generators do not work?
            x = {}

            for idx, field in enumerate(fields[1:]):
                if ":" not in field:
                    raise ValueError("malformatted header: no ':' in field")
                key, value = field.split(":", 1)
                if key in ("CL",):
                    # special treatment for command line
                    # statements (CL). These might contain
                    # characters that are non-conformant with
                    # the valid field separators in the SAM
                    # header. Thus, in contravention to the
                    # SAM API, consume the rest of the line.
                    key, value = "\t".join(fields[idx+1:]).split(":", 1)
                    x[key] = KNOWN_HEADER_FIELDS[record][key](value)
                    break

                # interpret type of known header record tags, default to str
                x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)

            if VALID_HEADER_TYPES[record] == Mapping:
                if record in result:
                    raise ValueError(
                        "multiple '%s' lines are not permitted" % record)

                result[record] = x
            elif VALID_HEADER_TYPES[record] == Sequence:
                if record not in result:
                    result[record] = []
                result[record].append(x)

        # if there are no SQ lines in the header, add the
        # reference names from the information in the bam
        # file.
        #
        # Background: c-samtools keeps the textual part of the
        # header separate from the list of reference names and
        # lengths. Thus, if a header contains only SQ lines,
        # the SQ information is not part of the textual header
        # and thus are missing from the output. See issue 84.
        if "SQ" not in result:
            sq = []
            for ref, length in zip(self.references, self.lengths):
                sq.append({'LN': length, 'SN': ref})
            result["SQ"] = sq

        return result

    def as_dict(self):
        """deprecated, use :meth:`to_dict()` instead"""
        return self.to_dict()

    def get_reference_name(self, tid):
        """return the reference name for numerical :term:`tid`, or None
        for the unmapped code (-1)."""
        if tid == -1:
            return None
        if not 0 <= tid < self.ptr.n_targets:
            raise ValueError("reference_id %i out of range 0<=tid<%i" %
                             (tid, self.ptr.n_targets))
        return charptr_to_str(self.ptr.target_name[tid])

    def get_reference_length(self, reference):
        """return the length of :term:`reference`; raises KeyError if
        the reference is not known."""
        cdef int tid = self.get_tid(reference)
        if tid < 0:
            raise KeyError("unknown reference {}".format(reference))
        else:
            return self.ptr.target_len[tid]

    def is_valid_tid(self, int tid):
        """
        return True if the numerical :term:`tid` is valid; False otherwise.

        Note that the unmapped tid code (-1) counts as an invalid.
        """
        return 0 <= tid < self.ptr.n_targets

    def get_tid(self, reference):
        """
        return the numerical :term:`tid` corresponding to
        :term:`reference`

        returns -1 if reference is not known.
        """
        reference = force_bytes(reference)
        tid = bam_name2id(self.ptr, reference)
        if tid < -1:
            raise ValueError('could not parse header')
        return tid

    def __str__(self):
        '''string with the full contents of the :term:`sam file` header as a
        string.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.

        See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
        representation of the header.
        '''
        text = from_string_and_size(self.ptr.text, self.ptr.l_text)
        if "@SQ" not in text:
            text += "\n" + self._build_sequence_section()
        return text

    # dictionary access methods, for backwards compatibility.
    def __setitem__(self, key, value):
        raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict())")

    def __getitem__(self, key):
        return self.to_dict().__getitem__(key)

    def items(self):
        return self.to_dict().items()

    # PY2 compatibility
    def iteritems(self):
        return self.to_dict().items()

    def keys(self):
        return self.to_dict().keys()

    def values(self):
        return self.to_dict().values()

    def get(self, *args):
        return self.to_dict().get(*args)

    def __len__(self):
        return self.to_dict().__len__()

    def __contains__(self, key):
        return self.to_dict().__contains__(key)


cdef class AlignmentFile(HTSFile):
    """AlignmentFile(filepath_or_object, mode=None, template=None,
    reference_names=None, reference_lengths=None, text=NULL,
    header=None, add_sq_text=False, check_header=True, check_sq=True,
    reference_filename=None, filename=None, index_filename=None,
    filepath_index=None, require_index=False, duplicate_filehandle=True,
    ignore_truncation=False, threads=1)
jpayne@69: A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file. jpayne@69: jpayne@69: If `filepath_or_object` is a string, the file is automatically jpayne@69: opened. If `filepath_or_object` is a python File object, the jpayne@69: already opened file will be used. jpayne@69: jpayne@69: If the file is opened for reading and an index exists (if file is BAM, a jpayne@69: .bai file or if CRAM a .crai file), it will be opened automatically. jpayne@69: `index_filename` may be specified explicitly. If the index is not named jpayne@69: in the standard manner, not located in the same directory as the jpayne@69: BAM/CRAM file, or is remote. Without an index, random access via jpayne@69: :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup` jpayne@69: is disabled. jpayne@69: jpayne@69: For writing, the header of a :term:`SAM` file/:term:`BAM` file can jpayne@69: be constituted from several sources (see also the samtools format jpayne@69: specification): jpayne@69: jpayne@69: 1. If `template` is given, the header is copied from another jpayne@69: `AlignmentFile` (`template` must be a jpayne@69: :class:`~pysam.AlignmentFile`). jpayne@69: jpayne@69: 2. If `header` is given, the header is built from a jpayne@69: multi-level dictionary. jpayne@69: jpayne@69: 3. If `text` is given, new header text is copied from raw jpayne@69: text. jpayne@69: jpayne@69: 4. The names (`reference_names`) and lengths jpayne@69: (`reference_lengths`) are supplied directly as lists. jpayne@69: jpayne@69: When reading or writing a CRAM file, the filename of a FASTA-formatted jpayne@69: reference can be specified with `reference_filename`. jpayne@69: jpayne@69: By default, if a file is opened in mode 'r', it is checked jpayne@69: for a valid header (`check_header` = True) and a definition of jpayne@69: chromosome names (`check_sq` = True). 
jpayne@69: jpayne@69: Parameters jpayne@69: ---------- jpayne@69: mode : string jpayne@69: `mode` should be ``r`` for reading or ``w`` for writing. The jpayne@69: default is text mode (:term:`SAM`). For binary (:term:`BAM`) jpayne@69: I/O you should append ``b`` for compressed or ``u`` for jpayne@69: uncompressed :term:`BAM` output. Use ``h`` to output header jpayne@69: information in text (:term:`TAM`) mode. Use ``c`` for jpayne@69: :term:`CRAM` formatted files. jpayne@69: jpayne@69: If ``b`` is present, it must immediately follow ``r`` or jpayne@69: ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, jpayne@69: ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a jpayne@69: :term:`BAM` formatted file for reading, type:: jpayne@69: jpayne@69: f = pysam.AlignmentFile('ex1.bam','rb') jpayne@69: jpayne@69: If mode is not specified, the method will try to auto-detect jpayne@69: in the order 'rb', 'r', thus both the following should work:: jpayne@69: jpayne@69: f1 = pysam.AlignmentFile('ex1.bam') jpayne@69: f2 = pysam.AlignmentFile('ex1.sam') jpayne@69: jpayne@69: template : AlignmentFile jpayne@69: when writing, copy header from file `template`. jpayne@69: jpayne@69: header : dict or AlignmentHeader jpayne@69: when writing, build header from a multi-level dictionary. The jpayne@69: first level are the four types ('HD', 'SQ', ...). The second jpayne@69: level are a list of lines, with each line being a list of jpayne@69: tag-value pairs. The header is constructed first from all the jpayne@69: defined fields, followed by user tags in alphabetical jpayne@69: order. Alternatively, an :class:`~pysam.AlignmentHeader` jpayne@69: object can be passed directly. 
jpayne@69: jpayne@69: text : string jpayne@69: when writing, use the string provided as the header jpayne@69: jpayne@69: reference_names : list jpayne@69: see reference_lengths jpayne@69: jpayne@69: reference_lengths : list jpayne@69: when writing or opening a SAM file without header build header jpayne@69: from list of chromosome names and lengths. By default, 'SQ' jpayne@69: and 'LN' tags will be added to the header text. This option jpayne@69: can be changed by unsetting the flag `add_sq_text`. jpayne@69: jpayne@69: add_sq_text : bool jpayne@69: do not add 'SQ' and 'LN' tags to header. This option permits jpayne@69: construction :term:`SAM` formatted files without a header. jpayne@69: jpayne@69: add_sam_header : bool jpayne@69: when outputting SAM the default is to output a header. This is jpayne@69: equivalent to opening the file in 'wh' mode. If this option is jpayne@69: set to False, no header will be output. To read such a file, jpayne@69: set `check_header=False`. jpayne@69: jpayne@69: check_header : bool jpayne@69: obsolete: when reading a SAM file, check if header is present jpayne@69: (default=True) jpayne@69: jpayne@69: check_sq : bool jpayne@69: when reading, check if SQ entries are present in header jpayne@69: (default=True) jpayne@69: jpayne@69: reference_filename : string jpayne@69: Path to a FASTA-formatted reference file. Valid only for CRAM files. jpayne@69: When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL jpayne@69: specified in the header (``UR`` tag), which are normally used to find jpayne@69: the reference. jpayne@69: jpayne@69: index_filename : string jpayne@69: Explicit path to the index file. Only needed if the index is not jpayne@69: named in the standard manner, not located in the same directory as jpayne@69: the BAM/CRAM file, or is remote. An IOError is raised if the index jpayne@69: cannot be found or is invalid. jpayne@69: jpayne@69: filepath_index : string jpayne@69: Alias for `index_filename`. 
    require_index : bool
        When reading, require that an index file is present and is valid or
        raise an IOError. (default=False)

    filename : string
        Alternative to filepath_or_object. Filename of the file
        to be opened.

    duplicate_filehandle: bool
        By default, file handles passed either directly or through
        File-like objects will be duplicated before passing them to
        htslib. The duplication prevents issues where the same stream
        will be closed by htslib and through destruction of the
        high-level python object. Set to False to turn off
        duplication.

    ignore_truncation: bool
        Issue a warning, instead of raising an error if the current file
        appears to be truncated due to a missing EOF marker. Only applies
        to bgzipped formats. (Default=False)

    format_options: list
        A list of key=value strings, as accepted by --input-fmt-option and
        --output-fmt-option in samtools.
    threads: integer
        Number of threads to use for compressing/decompressing BAM/CRAM files.
        Setting threads to > 1 cannot be combined with `ignore_truncation`.
    (Default=1)
    """

    def __cinit__(self, *args, **kwargs):
        # C-level initialization: zero all pointers before _open() so that
        # close()/__dealloc__ are safe even if opening fails part-way.
        self.htsfile = NULL
        self.filename = None
        self.mode = None
        self.threads = 1
        self.is_stream = False
        self.is_remote = False
        self.index = NULL

        # `filename` keyword is an alternative spelling of the first
        # positional argument (see class docstring above).
        if "filename" in kwargs:
            args = [kwargs["filename"]]
            del kwargs["filename"]

        self._open(*args, **kwargs)

        # allocate memory for iterator
        # NOTE(review): upstream pysam casts the result to <bam1_t*>; the
        # cast appears to have been lost in this copy — confirm against
        # the original source.
        self.b = calloc(1, sizeof(bam1_t))
        if self.b == NULL:
            raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t)))

    def has_index(self):
        """return true if htsfile has an existing (and opened) index.
        """
        # self.index is loaded (or left NULL) during _open().
        return self.index != NULL

    def check_index(self):
        """return True if index is present.

        Raises
        ------

        AttributeError
            if htsfile is :term:`SAM` formatted and thus has no index.

        ValueError
            if htsfile is closed or index could not be opened.
        """

        if not self.is_open:
            raise ValueError("I/O operation on closed file")
        if not self.is_bam and not self.is_cram:
            raise AttributeError(
                "AlignmentFile.mapped only available in bam files")
        if self.index == NULL:
            raise ValueError(
                "mapping information not recorded in index "
                "or index not available")
        return True

    def _open(self,
              filepath_or_object,
              mode=None,
              AlignmentFile template=None,
              reference_names=None,
              reference_lengths=None,
              reference_filename=None,
              text=None,
              header=None,
              port=None,
              add_sq_text=True,
              add_sam_header=True,
              check_header=True,
              check_sq=True,
              index_filename=None,
              filepath_index=None,
              require_index=False,
              referencenames=None,
              referencelengths=None,
              duplicate_filehandle=True,
              ignore_truncation=False,
              format_options=None,
              threads=1):
        '''open a sam, bam or cram formatted file.

        If _open is called on an existing file, the current file
        will be closed and a new file will be opened.

        '''
        # C pointers for the htslib calls below; kept NULL unless set.
        # NOTE(review): `cmode` and the `port`/`check_header` parameters
        # appear unused in this method — confirm before removing.
        cdef char *cfilename = NULL
        cdef char *creference_filename = NULL
        cdef char *cindexname = NULL
        cdef char *cmode = NULL
        cdef bam_hdr_t * hdr = NULL

        if threads > 1 and ignore_truncation:
            # This won't raise errors if reaching a truncated alignment,
            # because bgzf_mt_reader in htslib does not deal with
            # bgzf_mt_read_block returning non-zero values, contrary
            # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
            # Better to avoid this (for now) than to produce seemingly correct results.
            raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
        self.threads = threads

        # for backwards compatibility:
        if referencenames is not None:
            reference_names = referencenames
        if referencelengths is not None:
            reference_lengths = referencelengths

        # close a previously opened file
        if self.is_open:
            self.close()

        # autodetection for read
        if mode is None:
            mode = "r"

        if add_sam_header and mode == "w":
            mode = "wh"

        assert mode in ("r", "w", "rb", "wb", "wh",
                        "wbu", "rU", "wb0",
                        "rc", "wc"), \
            "invalid file opening mode `%s`" % mode

        self.duplicate_filehandle = duplicate_filehandle

        # StringIO not supported
        if isinstance(filepath_or_object, StringIO):
            raise NotImplementedError(
                "access from StringIO objects not supported")
        # reading from a file descriptor
        elif isinstance(filepath_or_object, int):
            self.filename = filepath_or_object
            filename = None
            self.is_remote = False
            self.is_stream = True
        # reading from a File object or other object with fileno
        elif hasattr(filepath_or_object, "fileno"):
            if filepath_or_object.closed:
                raise ValueError('I/O operation on closed file')
            self.filename = filepath_or_object
            # .name can be TextIOWrapper
            try:
                filename = encode_filename(str(filepath_or_object.name))
                cfilename = filename
            except AttributeError:
                filename = None
            self.is_remote = False
            self.is_stream = True
        # what remains is a filename
        else:
            self.filename = filename = encode_filename(filepath_or_object)
            cfilename = filename
            self.is_remote = hisremote(cfilename)
            self.is_stream = self.filename == b'-'

        # for htslib, wbu seems to not work
        if mode == "wbu":
            mode = "wb0"

        self.mode = force_bytes(mode)
        self.reference_filename = reference_filename = encode_filename(
            reference_filename)

        if mode[0] == 'w':
            # open file for writing

            if not (template or header or text or (reference_names and reference_lengths)):
                raise ValueError(
                    "either supply options `template`, `header`, `text` or both `reference_names` "
                    "and `reference_lengths` for writing")

            if template:
                # header is copied, though at the moment not strictly
                # necessary as AlignmentHeader is immutable.
                self.header = template.header.copy()
            elif isinstance(header, AlignmentHeader):
                self.header = header.copy()
            elif isinstance(header, Mapping):
                self.header = AlignmentHeader.from_dict(header)
            elif reference_names and reference_lengths:
                self.header = AlignmentHeader.from_references(
                    reference_names,
                    reference_lengths,
                    add_sq_text=add_sq_text,
                    text=text)
            elif text:
                self.header = AlignmentHeader.from_text(text)
            else:
                raise ValueError("not enough information to construct header. Please provide template, "
                                 "header, text or reference_names/reference_lengths")
            self.htsfile = self._open_htsfile()

            if self.htsfile == NULL:
                if errno:
                    raise IOError(errno, "could not open alignment file `{}`: {}".format(
                        force_str(filename),
                        force_str(strerror(errno))))
                else:
                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
            if format_options and len(format_options):
                self.add_hts_options(format_options)
            # set filename with reference sequences. If no filename
            # is given, the CRAM reference arrays will be built from
            # the @SQ header in the header
            if "c" in mode and reference_filename:
                if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0):
                    raise ValueError("failure when setting reference filename")

            # write header to htsfile
            if "b" in mode or "c" in mode or "h" in mode:
                hdr = self.header.ptr
                with nogil:
                    sam_hdr_write(self.htsfile, hdr)

        elif mode[0] == "r":
            # open file for reading
            self.htsfile = self._open_htsfile()

            if self.htsfile == NULL:
                if errno:
                    raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
                                                                                         force_str(strerror(errno))))
                else:
                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))

            if self.htsfile.format.category != sequence_data:
                raise ValueError("file does not contain alignment data")

            if format_options and len(format_options):
                self.add_hts_options(format_options)

            self.check_truncation(ignore_truncation)

            # bam/cram files require a valid header
            if self.is_bam or self.is_cram:
                with nogil:
                    hdr = sam_hdr_read(self.htsfile)
                if hdr == NULL:
                    raise ValueError(
                        "file does not have a valid header (mode='%s') "
                        "- is it BAM/CRAM format?" % mode)
                self.header = makeAlignmentHeader(hdr)
            else:
                # in sam files a header is optional. If not given,
                # user may provide reference names and lengths to built
                # an on-the-fly header.
                if reference_names and reference_lengths:
                    # build header from a target names and lengths
                    self.header = AlignmentHeader.from_references(
                        reference_names=reference_names,
                        reference_lengths=reference_lengths,
                        add_sq_text=add_sq_text,
                        text=text)
                else:
                    with nogil:
                        hdr = sam_hdr_read(self.htsfile)
                    if hdr == NULL:
                        # NOTE(review): the '%s' below is never interpolated
                        # (missing `% mode`) — the message is emitted with a
                        # literal '%s'. Confirm and fix upstream.
                        raise ValueError(
                            "SAM? file does not have a valid header (mode='%s'), "
                            "please provide reference_names and reference_lengths")
                    self.header = makeAlignmentHeader(hdr)

            # set filename with reference sequences
            if self.is_cram and reference_filename:
                creference_filename = self.reference_filename
                hts_set_opt(self.htsfile,
                            CRAM_OPT_REFERENCE,
                            creference_filename)

            if check_sq and self.header.nreferences == 0:
                raise ValueError(
                    ("file has no sequences defined (mode='%s') - "
                     "is it SAM/BAM format? Consider opening with "
                     "check_sq=False") % mode)

            if self.is_bam or self.is_cram:
                self.index_filename = index_filename or filepath_index
                if self.index_filename:
                    # keep a Python reference (bfile_name) alive so the
                    # char* stays valid for the htslib call below
                    cindexname = bfile_name = encode_filename(self.index_filename)

                if cfilename or cindexname:
                    with nogil:
                        self.index = sam_index_load3(self.htsfile, cfilename, cindexname,
                                                     HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL)

                    if not self.index and (cindexname or require_index):
                        if errno:
                            raise IOError(errno, force_str(strerror(errno)))
                        else:
                            raise IOError('unable to open index file `%s`' % self.index_filename)

                elif require_index:
                    raise IOError('unable to open index file')

        # save start of data section
        if not self.is_stream:
            self.start_offset = self.tell()

    def fetch(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              tid=None,
              until_eof=False,
              multiple_iterators=False,
              reference=None,
              end=None):
        """fetch reads aligned in a :term:`region`.

        See :meth:`~pysam.HTSFile.parse_region` for more information
        on how genomic regions can be specified. :term:`reference` and
        `end` are also accepted for backward compatibility as synonyms
        for :term:`contig` and `stop`, respectively.

        Without a `contig` or `region` all mapped reads in the file
        will be fetched. The reads will be returned ordered by reference
        sequence, which will not necessarily be the order within the
        file. This mode of iteration still requires an index. If there is
        no index, use `until_eof=True`.

        If only `contig` is set, all reads aligned to `contig`
        will be fetched.

        A :term:`SAM` file does not allow random access. If `region`
        or `contig` are given, an exception is raised.

        Parameters
        ----------

        until_eof : bool

           If `until_eof` is True, all reads from the current file
           position will be returned in order as they are within the
           file. Using this option will also fetch unmapped reads.

        multiple_iterators : bool

           If `multiple_iterators` is True, multiple
           iterators on the same file can be used at the same time. The
           iterator returned will receive its own copy of a filehandle to
           the file effectively re-opening the file. Re-opening a file
           creates some overhead, so beware.

        Returns
        -------

        An iterator over a collection of reads. : IteratorRow

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid or the
            file does not permit random access to genomic coordinates.

        """
        cdef int rtid, rstart, rstop, has_coord

        if not self.is_open:
            raise ValueError( "I/O operation on closed file" )

        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, tid,
            end=end, reference=reference)

        # Turn of re-opening if htsfile is a stream
        if self.is_stream:
            multiple_iterators = False

        if self.is_bam or self.is_cram:
            # random access (or per-reference iteration) requires an index,
            # except when simply streaming to EOF or reading remotely
            if not until_eof and not self.is_remote:
                if not self.has_index():
                    raise ValueError(
                        "fetch called on bamfile without index")

            if has_coord:
                return IteratorRowRegion(
                    self, rtid, rstart, rstop,
                    multiple_iterators=multiple_iterators)
            else:
                if until_eof:
                    return IteratorRowAll(
                        self,
                        multiple_iterators=multiple_iterators)
                else:
                    # AH: check - reason why no multiple_iterators for
                    # AllRefs?
                    return IteratorRowAllRefs(
                        self,
                        multiple_iterators=multiple_iterators)
        else:
            # plain SAM: sequential access only
            if has_coord:
                raise ValueError(
                    "fetching by region is not available for SAM files")

            if multiple_iterators == True:
                raise ValueError(
                    "multiple iterators not implemented for SAM files")

            return IteratorRowAll(self,
                                  multiple_iterators=multiple_iterators)

    def head(self, n, multiple_iterators=True):
        '''return an iterator over the first n alignments.

        This iterator is useful for inspecting the bam-file.

        Parameters
        ----------

        multiple_iterators : bool

            is set to True by default in order to
            avoid changing the current file position.

        Returns
        -------

        an iterator over a collection of reads : IteratorRowHead

        '''
        return IteratorRowHead(self, n,
                               multiple_iterators=multiple_iterators)

    def mate(self, AlignedSegment read):
        '''return the mate of :class:`pysam.AlignedSegment` `read`.

        .. note::

            Calling this method will change the file position.
            This might interfere with any iterators that have
            not re-opened the file.

        .. note::

            This method is too slow for high-throughput processing.
            If a read needs to be processed with its mate, work
            from a read name sorted file or, better, cache reads.

        Returns
        -------

        the mate : AlignedSegment

        Raises
        ------

        ValueError
            if the read is unpaired or the mate is unmapped

        '''
        cdef uint32_t flag = read._delegate.core.flag

        if flag & BAM_FPAIRED == 0:
            raise ValueError("read %s: is unpaired" %
                             (read.query_name))
        if flag & BAM_FMUNMAP != 0:
            raise ValueError("mate %s: is unmapped" %
                             (read.query_name))

        # xor flags to get the other mate
        cdef int x = BAM_FREAD1 + BAM_FREAD2
        flag = (flag ^ x) & x

        # Make sure to use a separate file to jump around
        # to mate as otherwise the original file position
        # will be lost
        # The following code is not using the C API and
        # could thus be made much quicker, for example
        # by using tell and seek.
        # NOTE(review): mpos/mpos+1 are passed positionally (i.e. as the
        # `contig`/`start` slots) while the reference is selected via
        # `tid=` — confirm against HTSFile.parse_region that the tid
        # override makes this resolve to the intended 1-bp window.
        for mate in self.fetch(
                read._delegate.core.mpos,
                read._delegate.core.mpos + 1,
                tid=read._delegate.core.mtid,
                multiple_iterators=True):
            if mate.flag & flag != 0 and \
               mate.query_name == read.query_name:
                break
        else:
            raise ValueError("mate not found")

        return mate

    def pileup(self,
               contig=None,
               start=None,
               stop=None,
               region=None,
               reference=None,
               end=None,
               **kwargs):
        """perform a :term:`pileup` within a :term:`region`. The region is
        specified by :term:`contig`, `start` and `stop` (using
        0-based indexing). :term:`reference` and `end` are also accepted for
        backward compatibility as synonyms for :term:`contig` and `stop`,
        respectively.
Alternatively, a samtools 'region' string jpayne@69: can be supplied. jpayne@69: jpayne@69: Without 'contig' or 'region' all reads will be used for the jpayne@69: pileup. The reads will be returned ordered by jpayne@69: :term:`contig` sequence, which will not necessarily be the jpayne@69: order within the file. jpayne@69: jpayne@69: Note that :term:`SAM` formatted files do not allow random jpayne@69: access. In these files, if a 'region' or 'contig' are jpayne@69: given an exception is raised. jpayne@69: jpayne@69: .. note:: jpayne@69: jpayne@69: 'all' reads which overlap the region are returned. The jpayne@69: first base returned will be the first base of the first jpayne@69: read 'not' necessarily the first base of the region used jpayne@69: in the query. jpayne@69: jpayne@69: Parameters jpayne@69: ---------- jpayne@69: jpayne@69: truncate : bool jpayne@69: jpayne@69: By default, the samtools pileup engine outputs all reads jpayne@69: overlapping a region. If truncate is True and a region is jpayne@69: given, only columns in the exact region specified are jpayne@69: returned. jpayne@69: jpayne@69: max_depth : int jpayne@69: Maximum read depth permitted. The default limit is '8000'. jpayne@69: jpayne@69: stepper : string jpayne@69: The stepper controls how the iterator advances. jpayne@69: Possible options for the stepper are jpayne@69: jpayne@69: ``all`` jpayne@69: skip reads in which any of the following flags are set: jpayne@69: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP jpayne@69: jpayne@69: ``nofilter`` jpayne@69: uses every single read turning off any filtering. jpayne@69: jpayne@69: ``samtools`` jpayne@69: same filter and read processing as in samtools jpayne@69: pileup. For full compatibility, this requires a jpayne@69: 'fastafile' to be given. The following options all pertain jpayne@69: to filtering of the ``samtools`` stepper. jpayne@69: jpayne@69: fastafile : :class:`~pysam.FastaFile` object. 
jpayne@69: jpayne@69: This is required for some of the steppers. jpayne@69: jpayne@69: ignore_overlaps: bool jpayne@69: jpayne@69: If set to True, detect if read pairs overlap and only take jpayne@69: the higher quality base. This is the default. jpayne@69: jpayne@69: flag_filter : int jpayne@69: jpayne@69: ignore reads where any of the bits in the flag are set. The default is jpayne@69: BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP. jpayne@69: jpayne@69: flag_require : int jpayne@69: jpayne@69: only use reads where certain flags are set. The default is 0. jpayne@69: jpayne@69: ignore_orphans: bool jpayne@69: jpayne@69: ignore orphans (paired reads that are not in a proper pair). jpayne@69: The default is to ignore orphans. jpayne@69: jpayne@69: min_base_quality: int jpayne@69: jpayne@69: Minimum base quality. Bases below the minimum quality will jpayne@69: not be output. The default is 13. jpayne@69: jpayne@69: adjust_capq_threshold: int jpayne@69: jpayne@69: adjust mapping quality. The default is 0 for no jpayne@69: adjustment. The recommended value for adjustment is 50. jpayne@69: jpayne@69: min_mapping_quality : int jpayne@69: jpayne@69: only use reads above a minimum mapping quality. The default is 0. jpayne@69: jpayne@69: compute_baq: bool jpayne@69: jpayne@69: re-alignment computing per-Base Alignment Qualities (BAQ). The jpayne@69: default is to do re-alignment. Realignment requires a reference jpayne@69: sequence. If none is present, no realignment will be performed. jpayne@69: jpayne@69: redo_baq: bool jpayne@69: jpayne@69: recompute per-Base Alignment Quality on the fly ignoring jpayne@69: existing base qualities. The default is False (use existing jpayne@69: base qualities). jpayne@69: jpayne@69: Returns jpayne@69: ------- jpayne@69: jpayne@69: an iterator over genomic positions. 
: IteratorColumn jpayne@69: jpayne@69: """ jpayne@69: cdef int rtid, has_coord jpayne@69: cdef int32_t rstart, rstop jpayne@69: jpayne@69: if not self.is_open: jpayne@69: raise ValueError("I/O operation on closed file") jpayne@69: jpayne@69: has_coord, rtid, rstart, rstop = self.parse_region( jpayne@69: contig, start, stop, region, reference=reference, end=end) jpayne@69: jpayne@69: if has_coord: jpayne@69: if not self.has_index(): jpayne@69: raise ValueError("no index available for pileup") jpayne@69: jpayne@69: return IteratorColumnRegion(self, jpayne@69: tid=rtid, jpayne@69: start=rstart, jpayne@69: stop=rstop, jpayne@69: **kwargs) jpayne@69: else: jpayne@69: if self.has_index(): jpayne@69: return IteratorColumnAllRefs(self, **kwargs) jpayne@69: else: jpayne@69: return IteratorColumnAll(self, **kwargs) jpayne@69: jpayne@69: def count(self, jpayne@69: contig=None, jpayne@69: start=None, jpayne@69: stop=None, jpayne@69: region=None, jpayne@69: until_eof=False, jpayne@69: read_callback="nofilter", jpayne@69: reference=None, jpayne@69: end=None): jpayne@69: '''count the number of reads in :term:`region` jpayne@69: jpayne@69: The region is specified by :term:`contig`, `start` and `stop`. jpayne@69: :term:`reference` and `end` are also accepted for backward jpayne@69: compatibility as synonyms for :term:`contig` and `stop`, jpayne@69: respectively. Alternatively, a `samtools`_ :term:`region` jpayne@69: string can be supplied. jpayne@69: jpayne@69: A :term:`SAM` file does not allow random access and if jpayne@69: `region` or `contig` are given, an exception is raised. 

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive)

        stop : int
            end of the genomic region (0-based exclusive)

        region : string
            a region string in samtools format.

        until_eof : bool
            count until the end of the file, possibly including
            unmapped reads as well.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        '''
        cdef AlignedSegment read
        cdef long counter = 0

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        # resolve the string callback once, outside the loop:
        # 1 = "all", 2 = "nofilter", 0 = user-supplied callable
        cdef int filter_method = 0
        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        for read in self.fetch(contig=contig,
                               start=start,
                               stop=stop,
                               reference=reference,
                               end=end,
                               region=region,
                               until_eof=until_eof):
            # apply filter
            if filter_method == 1:
                # filter = "all"
                # 0x4|0x100|0x200|0x400 = FUNMAP|FSECONDARY|FQCFAIL|FDUP
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue
            counter += 1

        return counter

    @cython.boundscheck(False)  # we do manual bounds checking
    def count_coverage(self,
                       contig,
                       start=None,
                       stop=None,
                       region=None,
                       quality_threshold=15,
                       read_callback='all',
                       reference=None,
                       end=None):
        """count the coverage of genomic positions by reads in :term:`region`.

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a `samtools`_ :term:`region`
        string can be supplied. The coverage is computed per-base [ACGT].

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive). If not
            given, count from the start of the chromosome.

        stop : int
            end of the genomic region (0-based exclusive). If not given,
            count to the end of the chromosome.

        region : string
            a region string.

        quality_threshold : int
            quality_threshold is the minimum quality score (in phred) a
            base has to reach to be counted.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        Returns
        -------

        four array.arrays of the same length in order A C G T : tuple

        """

        cdef uint32_t contig_length = self.get_reference_length(contig)
        cdef int _start = start if start is not None else 0
        cdef int _stop = stop if stop is not None else contig_length
        # clamp the requested stop to the contig length
        _stop = _stop if _stop < contig_length else contig_length

        if _stop == _start:
            raise ValueError("interval of size 0")
        if _stop < _start:
            raise ValueError("interval of size less than 0")

        # one unsigned-long counter per position and per base
        cdef int length = _stop - _start
        cdef c_array.array int_array_template = array.array('L', [])
        cdef c_array.array count_a
        cdef c_array.array count_c
        cdef c_array.array count_g
        cdef c_array.array count_t
        count_a = c_array.clone(int_array_template, length, zero=True)
        count_c = c_array.clone(int_array_template, length, zero=True)
        count_g = c_array.clone(int_array_template, length, zero=True)
        count_t = c_array.clone(int_array_template, length, zero=True)

        cdef AlignedSegment read
        cdef cython.str seq
        cdef c_array.array quality
        cdef int qpos
        cdef int refpos
        # NOTE(review): `c` appears unused below — confirm before removing.
        cdef int c = 0
        cdef int filter_method = 0


        # resolve the string callback once, outside the loop:
        # 1 = "all", 2 = "nofilter", 0 = user-supplied callable
        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        cdef int _threshold = quality_threshold or 0
        for read in self.fetch(contig=contig,
                               reference=reference,
                               start=start,
                               stop=stop,
                               end=end,
                               region=region):
            # apply filter
            if filter_method == 1:
                # filter = "all"
                # 0x4|0x100|0x200|0x400 = FUNMAP|FSECONDARY|FQCFAIL|FDUP
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue

            # count
            seq = read.seq
            if seq is None:
                continue
            quality = read.query_qualities

            # matches_only=True: only positions aligned in both read
            # and reference are yielded
            for qpos, refpos in read.get_aligned_pairs(True):
                if qpos is not None and refpos is not None and \
                   _start <= refpos < _stop:

                    # only check base quality if _threshold > 0
                    if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
                        if seq[qpos] == 'A':
                            count_a.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'C':
                            count_c.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'G':
                            count_g.data.as_ulongs[refpos - _start] += 1
                        if seq[qpos] == 'T':
                            count_t.data.as_ulongs[refpos - _start] += 1

        return count_a, count_c, count_g, count_t

    def find_introns_slow(self, read_iterator):
        """Return a dictionary {(start, stop): count}
        Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
        and their support ( = number of reads ).

        read_iterator can be the result of a .fetch(...) call.
        Or it can be a generator filtering such reads. Example
        samfile.find_introns((read for read in samfile.fetch(...)
if read.is_reverse) jpayne@69: """ jpayne@69: res = collections.Counter() jpayne@69: for r in read_iterator: jpayne@69: if 'N' in r.cigarstring: jpayne@69: last_read_pos = False jpayne@69: for read_loc, genome_loc in r.get_aligned_pairs(): jpayne@69: if read_loc is None and last_read_pos: jpayne@69: start = genome_loc jpayne@69: elif read_loc and last_read_pos is None: jpayne@69: stop = genome_loc # we are right exclusive ,so this is correct jpayne@69: res[(start, stop)] += 1 jpayne@69: del start jpayne@69: del stop jpayne@69: last_read_pos = read_loc jpayne@69: return res jpayne@69: jpayne@69: def find_introns(self, read_iterator): jpayne@69: """Return a dictionary {(start, stop): count} jpayne@69: Listing the intronic sites in the reads (identified by 'N' in the cigar strings), jpayne@69: and their support ( = number of reads ). jpayne@69: jpayne@69: read_iterator can be the result of a .fetch(...) call. jpayne@69: Or it can be a generator filtering such reads. Example jpayne@69: samfile.find_introns((read for read in samfile.fetch(...) 
if read.is_reverse) jpayne@69: """ jpayne@69: cdef: jpayne@69: uint32_t base_position, junc_start, nt jpayne@69: int op jpayne@69: AlignedSegment r jpayne@69: int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP jpayne@69: jpayne@69: res = collections.Counter() jpayne@69: jpayne@69: match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position jpayne@69: for r in read_iterator: jpayne@69: base_position = r.pos jpayne@69: cigar = r.cigartuples jpayne@69: if cigar is None: jpayne@69: continue jpayne@69: jpayne@69: for op, nt in cigar: jpayne@69: if op in match_or_deletion: jpayne@69: base_position += nt jpayne@69: elif op == BAM_CREF_SKIP: jpayne@69: junc_start = base_position jpayne@69: base_position += nt jpayne@69: res[(junc_start, base_position)] += 1 jpayne@69: return res jpayne@69: jpayne@69: jpayne@69: def close(self): jpayne@69: '''closes the :class:`pysam.AlignmentFile`.''' jpayne@69: jpayne@69: if self.htsfile == NULL: jpayne@69: return jpayne@69: jpayne@69: if self.index != NULL: jpayne@69: hts_idx_destroy(self.index) jpayne@69: self.index = NULL jpayne@69: jpayne@69: cdef int ret = hts_close(self.htsfile) jpayne@69: self.htsfile = NULL jpayne@69: jpayne@69: self.header = None jpayne@69: jpayne@69: if ret < 0: jpayne@69: global errno jpayne@69: if errno == EPIPE: jpayne@69: errno = 0 jpayne@69: else: jpayne@69: raise IOError(errno, force_str(strerror(errno))) jpayne@69: jpayne@69: def __dealloc__(self): jpayne@69: cdef int ret = 0 jpayne@69: jpayne@69: if self.index != NULL: jpayne@69: hts_idx_destroy(self.index) jpayne@69: self.index = NULL jpayne@69: jpayne@69: if self.htsfile != NULL: jpayne@69: ret = hts_close(self.htsfile) jpayne@69: self.htsfile = NULL jpayne@69: jpayne@69: self.header = None jpayne@69: jpayne@69: if self.b: jpayne@69: bam_destroy1(self.b) jpayne@69: self.b = NULL jpayne@69: jpayne@69: if ret < 0: jpayne@69: global errno jpayne@69: if errno == EPIPE: jpayne@69: errno = 0 jpayne@69: else: jpayne@69: raise 
IOError(errno, force_str(strerror(errno))) jpayne@69: jpayne@69: cpdef int write(self, AlignedSegment read) except -1: jpayne@69: ''' jpayne@69: write a single :class:`pysam.AlignedSegment` to disk. jpayne@69: jpayne@69: Raises: jpayne@69: ValueError jpayne@69: if the writing failed jpayne@69: jpayne@69: Returns: jpayne@69: int : jpayne@69: the number of bytes written. If the file is closed, jpayne@69: this will be 0. jpayne@69: ''' jpayne@69: if not self.is_open: jpayne@69: return 0 jpayne@69: jpayne@69: if self.header.ptr.n_targets <= read._delegate.core.tid: jpayne@69: raise ValueError( jpayne@69: "AlignedSegment refers to reference number {} that " jpayne@69: "is larger than the number of references ({}) in the header".format( jpayne@69: read._delegate.core.tid, self.header.ptr.n_targets)) jpayne@69: jpayne@69: cdef int ret jpayne@69: with nogil: jpayne@69: ret = sam_write1(self.htsfile, jpayne@69: self.header.ptr, jpayne@69: read._delegate) jpayne@69: jpayne@69: # kbj: Still need to raise an exception with except -1. Otherwise jpayne@69: # when ret == -1 we get a "SystemError: error return without jpayne@69: # exception set". 
jpayne@69: if ret < 0: jpayne@69: raise IOError( jpayne@69: "sam_write1 failed with error code {}".format(ret)) jpayne@69: jpayne@69: return ret jpayne@69: jpayne@69: # context manager interface jpayne@69: def __enter__(self): jpayne@69: return self jpayne@69: jpayne@69: def __exit__(self, exc_type, exc_value, traceback): jpayne@69: self.close() jpayne@69: return False jpayne@69: jpayne@69: ############################################################### jpayne@69: ############################################################### jpayne@69: ############################################################### jpayne@69: ## properties jpayne@69: ############################################################### jpayne@69: property mapped: jpayne@69: """int with total number of mapped alignments according to the jpayne@69: statistics recorded in the index. This is a read-only jpayne@69: attribute. jpayne@69: (This will be 0 for a CRAM file indexed by a .crai index, as that jpayne@69: index format does not record these statistics.) jpayne@69: """ jpayne@69: def __get__(self): jpayne@69: self.check_index() jpayne@69: cdef int tid jpayne@69: cdef uint64_t total = 0 jpayne@69: cdef uint64_t mapped, unmapped jpayne@69: for tid from 0 <= tid < self.header.nreferences: jpayne@69: with nogil: jpayne@69: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) jpayne@69: total += mapped jpayne@69: return total jpayne@69: jpayne@69: property unmapped: jpayne@69: """int with total number of unmapped reads according to the statistics jpayne@69: recorded in the index. This number of reads includes the number of reads jpayne@69: without coordinates. This is a read-only attribute. jpayne@69: (This will be 0 for a CRAM file indexed by a .crai index, as that jpayne@69: index format does not record these statistics.) 
jpayne@69: """ jpayne@69: def __get__(self): jpayne@69: self.check_index() jpayne@69: cdef int tid jpayne@69: cdef uint64_t total = hts_idx_get_n_no_coor(self.index) jpayne@69: cdef uint64_t mapped, unmapped jpayne@69: for tid from 0 <= tid < self.header.nreferences: jpayne@69: with nogil: jpayne@69: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) jpayne@69: total += unmapped jpayne@69: return total jpayne@69: jpayne@69: property nocoordinate: jpayne@69: """int with total number of reads without coordinates according to the jpayne@69: statistics recorded in the index, i.e., the statistic printed for "*" jpayne@69: by the ``samtools idxstats`` command. This is a read-only attribute. jpayne@69: (This will be 0 for a CRAM file indexed by a .crai index, as that jpayne@69: index format does not record these statistics.) jpayne@69: """ jpayne@69: def __get__(self): jpayne@69: self.check_index() jpayne@69: cdef uint64_t n jpayne@69: with nogil: jpayne@69: n = hts_idx_get_n_no_coor(self.index) jpayne@69: return n jpayne@69: jpayne@69: def get_index_statistics(self): jpayne@69: """return statistics about mapped/unmapped reads per chromosome as jpayne@69: they are stored in the index, similarly to the statistics printed jpayne@69: by the ``samtools idxstats`` command. jpayne@69: jpayne@69: CRAI indexes do not record these statistics, so for a CRAM file jpayne@69: with a .crai index the returned statistics will all be 0. jpayne@69: jpayne@69: Returns: jpayne@69: list : jpayne@69: a list of records for each chromosome. Each record has the jpayne@69: attributes 'contig', 'mapped', 'unmapped' and 'total'. 
jpayne@69: """ jpayne@69: jpayne@69: self.check_index() jpayne@69: cdef int tid jpayne@69: cdef uint64_t mapped, unmapped jpayne@69: results = [] jpayne@69: # TODO: use header jpayne@69: for tid from 0 <= tid < self.nreferences: jpayne@69: with nogil: jpayne@69: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) jpayne@69: results.append( jpayne@69: IndexStats._make(( jpayne@69: self.get_reference_name(tid), jpayne@69: mapped, jpayne@69: unmapped, jpayne@69: mapped + unmapped))) jpayne@69: jpayne@69: return results jpayne@69: jpayne@69: ############################################################### jpayne@69: ## file-object like iterator access jpayne@69: ## note: concurrent access will cause errors (see IteratorRow jpayne@69: ## and multiple_iterators) jpayne@69: ## Possible solutions: deprecate or open new file handle jpayne@69: def __iter__(self): jpayne@69: if not self.is_open: jpayne@69: raise ValueError("I/O operation on closed file") jpayne@69: jpayne@69: if not self.is_bam and self.header.nreferences == 0: jpayne@69: raise NotImplementedError( jpayne@69: "can not iterate over samfile without header") jpayne@69: return self jpayne@69: jpayne@69: cdef bam1_t * getCurrent(self): jpayne@69: return self.b jpayne@69: jpayne@69: cdef int cnext(self): jpayne@69: ''' jpayne@69: cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`. 
jpayne@69: ''' jpayne@69: cdef int ret jpayne@69: cdef bam_hdr_t * hdr = self.header.ptr jpayne@69: with nogil: jpayne@69: ret = sam_read1(self.htsfile, jpayne@69: hdr, jpayne@69: self.b) jpayne@69: return ret jpayne@69: jpayne@69: def __next__(self): jpayne@69: cdef int ret = self.cnext() jpayne@69: if ret >= 0: jpayne@69: return makeAlignedSegment(self.b, self.header) jpayne@69: elif ret == -1: jpayne@69: raise StopIteration jpayne@69: else: jpayne@69: raise IOError(read_failure_reason(ret)) jpayne@69: jpayne@69: ########################################### jpayne@69: # methods/properties referencing the header jpayne@69: def is_valid_tid(self, int tid): jpayne@69: """ jpayne@69: return True if the numerical :term:`tid` is valid; False otherwise. jpayne@69: jpayne@69: Note that the unmapped tid code (-1) counts as an invalid. jpayne@69: """ jpayne@69: if self.header is None: jpayne@69: raise ValueError("header not available in closed files") jpayne@69: return self.header.is_valid_tid(tid) jpayne@69: jpayne@69: def get_tid(self, reference): jpayne@69: """ jpayne@69: return the numerical :term:`tid` corresponding to jpayne@69: :term:`reference` jpayne@69: jpayne@69: returns -1 if reference is not known. 
jpayne@69: """ jpayne@69: if self.header is None: jpayne@69: raise ValueError("header not available in closed files") jpayne@69: return self.header.get_tid(reference) jpayne@69: jpayne@69: def get_reference_name(self, tid): jpayne@69: """ jpayne@69: return :term:`reference` name corresponding to numerical :term:`tid` jpayne@69: """ jpayne@69: if self.header is None: jpayne@69: raise ValueError("header not available in closed files") jpayne@69: return self.header.get_reference_name(tid) jpayne@69: jpayne@69: def get_reference_length(self, reference): jpayne@69: """ jpayne@69: return :term:`reference` length corresponding to numerical :term:`tid` jpayne@69: """ jpayne@69: if self.header is None: jpayne@69: raise ValueError("header not available in closed files") jpayne@69: return self.header.get_reference_length(reference) jpayne@69: jpayne@69: property nreferences: jpayne@69: """int with the number of :term:`reference` sequences in the file. jpayne@69: This is a read-only attribute.""" jpayne@69: def __get__(self): jpayne@69: if self.header: jpayne@69: return self.header.nreferences jpayne@69: else: jpayne@69: raise ValueError("header not available in closed files") jpayne@69: jpayne@69: property references: jpayne@69: """tuple with the names of :term:`reference` sequences. This is a jpayne@69: read-only attribute""" jpayne@69: def __get__(self): jpayne@69: if self.header: jpayne@69: return self.header.references jpayne@69: else: jpayne@69: raise ValueError("header not available in closed files") jpayne@69: jpayne@69: property lengths: jpayne@69: """tuple of the lengths of the :term:`reference` sequences. This is a jpayne@69: read-only attribute. 
The lengths are in the same order as jpayne@69: :attr:`pysam.AlignmentFile.references` jpayne@69: jpayne@69: """ jpayne@69: def __get__(self): jpayne@69: if self.header: jpayne@69: return self.header.lengths jpayne@69: else: jpayne@69: raise ValueError("header not available in closed files") jpayne@69: jpayne@69: # Compatibility functions for pysam < 0.14 jpayne@69: property text: jpayne@69: """deprecated, use :attr:`references` and :attr:`lengths` instead""" jpayne@69: def __get__(self): jpayne@69: if self.header: jpayne@69: return self.header.__str__() jpayne@69: else: jpayne@69: raise ValueError("header not available in closed files") jpayne@69: jpayne@69: # Compatibility functions for pysam < 0.8.3 jpayne@69: def gettid(self, reference): jpayne@69: """deprecated, use :meth:`get_tid` instead""" jpayne@69: return self.get_tid(reference) jpayne@69: jpayne@69: def getrname(self, tid): jpayne@69: """deprecated, use :meth:`get_reference_name` instead""" jpayne@69: return self.get_reference_name(tid) jpayne@69: jpayne@69: jpayne@69: cdef class IteratorRow: jpayne@69: '''abstract base class for iterators over mapped reads. jpayne@69: jpayne@69: Various iterators implement different behaviours for wrapping around jpayne@69: contig boundaries. Examples include: jpayne@69: jpayne@69: :class:`pysam.IteratorRowRegion` jpayne@69: iterate within a single contig and a defined region. jpayne@69: jpayne@69: :class:`pysam.IteratorRowAll` jpayne@69: iterate until EOF. This iterator will also include unmapped reads. jpayne@69: jpayne@69: :class:`pysam.IteratorRowAllRefs` jpayne@69: iterate over all reads in all reference sequences. jpayne@69: jpayne@69: The method :meth:`AlignmentFile.fetch` returns an IteratorRow. jpayne@69: jpayne@69: .. note:: jpayne@69: jpayne@69: It is usually not necessary to create an object of this class jpayne@69: explicitly. It is returned as a result of call to a jpayne@69: :meth:`AlignmentFile.fetch`. 
cdef class IteratorRow:
    '''abstract base class for iterators over mapped reads.

    Various iterators implement different behaviours for wrapping around
    contig boundaries. Examples include:

    :class:`pysam.IteratorRowRegion`
        iterate within a single contig and a defined region.

    :class:`pysam.IteratorRowAll`
        iterate until EOF. This iterator will also include unmapped reads.

    :class:`pysam.IteratorRowAllRefs`
        iterate over all reads in all reference sequences.

    The method :meth:`AlignmentFile.fetch` returns an IteratorRow.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    '''

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        cdef char *cfilename
        cdef char *creference_filename
        cdef char *cindexname = NULL

        if not samfile.is_open:
            raise ValueError("I/O operation on closed file")

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.samfile = samfile

        # reopen the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:

            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            assert self.htsfile != NULL

            if samfile.has_index():
                if samfile.index_filename:
                    # keep a reference to the bytes object so the char* stays valid
                    cindexname = bindex_filename = encode_filename(samfile.index_filename)
                with nogil:
                    self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
            else:
                self.index = NULL

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr is NULL:
                raise IOError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)

            # this iterator owns (and must clean up) the re-opened handle
            self.owns_samfile = True

            # options specific to CRAM files
            if samfile.is_cram and samfile.reference_filename:
                creference_filename = samfile.reference_filename
                hts_set_opt(self.htsfile,
                            CRAM_OPT_REFERENCE,
                            creference_filename)

        else:
            # share the parent file's handle, index and header
            self.htsfile = samfile.htsfile
            self.index = samfile.index
            self.owns_samfile = False
            self.header = samfile.header

        self.retval = 0

        # per-iterator record buffer
        self.b = bam_init1()

    def __dealloc__(self):
        bam_destroy1(self.b)
        # only close handles we opened ourselves (multiple_iterators case)
        if self.owns_samfile:
            hts_idx_destroy(self.index)
            hts_close(self.htsfile)
jpayne@69: jpayne@69: """ jpayne@69: jpayne@69: def __init__(self, AlignmentFile samfile, jpayne@69: int tid, int beg, int stop, jpayne@69: int multiple_iterators=False): jpayne@69: jpayne@69: if not samfile.has_index(): jpayne@69: raise ValueError("no index available for iteration") jpayne@69: jpayne@69: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@69: with nogil: jpayne@69: self.iter = sam_itr_queryi( jpayne@69: self.index, jpayne@69: tid, jpayne@69: beg, jpayne@69: stop) jpayne@69: jpayne@69: def __iter__(self): jpayne@69: return self jpayne@69: jpayne@69: cdef bam1_t * getCurrent(self): jpayne@69: return self.b jpayne@69: jpayne@69: cdef int cnext(self): jpayne@69: '''cversion of iterator. Used by IteratorColumn''' jpayne@69: with nogil: jpayne@69: self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile), jpayne@69: self.iter, jpayne@69: self.b, jpayne@69: self.htsfile) jpayne@69: jpayne@69: def __next__(self): jpayne@69: self.cnext() jpayne@69: if self.retval >= 0: jpayne@69: return makeAlignedSegment(self.b, self.header) jpayne@69: elif self.retval == -1: jpayne@69: raise StopIteration jpayne@69: elif self.retval == -2: jpayne@69: # Note: it is currently not the case that hts_iter_next jpayne@69: # returns -2 for a truncated file. jpayne@69: # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625 jpayne@69: raise IOError('truncated file') jpayne@69: else: jpayne@69: raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval)) jpayne@69: jpayne@69: def __dealloc__(self): jpayne@69: hts_itr_destroy(self.iter) jpayne@69: jpayne@69: jpayne@69: cdef class IteratorRowHead(IteratorRow): jpayne@69: """*(AlignmentFile samfile, n, int multiple_iterators=False)* jpayne@69: jpayne@69: iterate over first n reads in `samfile` jpayne@69: jpayne@69: .. note:: jpayne@69: It is usually not necessary to create an object of this class jpayne@69: explicitly. 
It is returned as a result of call to a jpayne@69: :meth:`AlignmentFile.head`. jpayne@69: jpayne@69: """ jpayne@69: jpayne@69: def __init__(self, jpayne@69: AlignmentFile samfile, jpayne@69: int n, jpayne@69: int multiple_iterators=False): jpayne@69: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@69: jpayne@69: self.max_rows = n jpayne@69: self.current_row = 0 jpayne@69: jpayne@69: def __iter__(self): jpayne@69: return self jpayne@69: jpayne@69: cdef bam1_t * getCurrent(self): jpayne@69: return self.b jpayne@69: jpayne@69: cdef int cnext(self): jpayne@69: '''cversion of iterator. Used by IteratorColumn''' jpayne@69: cdef int ret jpayne@69: cdef bam_hdr_t * hdr = self.header.ptr jpayne@69: with nogil: jpayne@69: ret = sam_read1(self.htsfile, jpayne@69: hdr, jpayne@69: self.b) jpayne@69: return ret jpayne@69: jpayne@69: def __next__(self): jpayne@69: if self.current_row >= self.max_rows: jpayne@69: raise StopIteration jpayne@69: jpayne@69: cdef int ret = self.cnext() jpayne@69: if ret >= 0: jpayne@69: self.current_row += 1 jpayne@69: return makeAlignedSegment(self.b, self.header) jpayne@69: elif ret == -1: jpayne@69: raise StopIteration jpayne@69: else: jpayne@69: raise IOError(read_failure_reason(ret)) jpayne@69: jpayne@69: jpayne@69: cdef class IteratorRowAll(IteratorRow): jpayne@69: """*(AlignmentFile samfile, int multiple_iterators=False)* jpayne@69: jpayne@69: iterate over all reads in `samfile` jpayne@69: jpayne@69: .. note:: jpayne@69: jpayne@69: It is usually not necessary to create an object of this class jpayne@69: explicitly. It is returned as a result of call to a jpayne@69: :meth:`AlignmentFile.fetch`. 
jpayne@69: jpayne@69: """ jpayne@69: jpayne@69: def __init__(self, AlignmentFile samfile, int multiple_iterators=False): jpayne@69: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@69: jpayne@69: def __iter__(self): jpayne@69: return self jpayne@69: jpayne@69: cdef bam1_t * getCurrent(self): jpayne@69: return self.b jpayne@69: jpayne@69: cdef int cnext(self): jpayne@69: '''cversion of iterator. Used by IteratorColumn''' jpayne@69: cdef int ret jpayne@69: cdef bam_hdr_t * hdr = self.header.ptr jpayne@69: with nogil: jpayne@69: ret = sam_read1(self.htsfile, jpayne@69: hdr, jpayne@69: self.b) jpayne@69: return ret jpayne@69: jpayne@69: def __next__(self): jpayne@69: cdef int ret = self.cnext() jpayne@69: if ret >= 0: jpayne@69: return makeAlignedSegment(self.b, self.header) jpayne@69: elif ret == -1: jpayne@69: raise StopIteration jpayne@69: else: jpayne@69: raise IOError(read_failure_reason(ret)) jpayne@69: jpayne@69: jpayne@69: cdef class IteratorRowAllRefs(IteratorRow): jpayne@69: """iterates over all mapped reads by chaining iterators over each jpayne@69: reference jpayne@69: jpayne@69: .. note:: jpayne@69: It is usually not necessary to create an object of this class jpayne@69: explicitly. It is returned as a result of call to a jpayne@69: :meth:`AlignmentFile.fetch`. jpayne@69: jpayne@69: """ jpayne@69: jpayne@69: def __init__(self, AlignmentFile samfile, multiple_iterators=False): jpayne@69: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@69: jpayne@69: if not samfile.has_index(): jpayne@69: raise ValueError("no index available for fetch") jpayne@69: jpayne@69: self.tid = -1 jpayne@69: jpayne@69: def nextiter(self): jpayne@69: # get a new iterator for a chromosome. The file jpayne@69: # will not be re-opened. 
jpayne@69: self.rowiter = IteratorRowRegion(self.samfile, jpayne@69: self.tid, jpayne@69: 0, jpayne@69: MAX_POS) jpayne@69: # set htsfile and header of the rowiter jpayne@69: # to the values in this iterator to reflect multiple_iterators jpayne@69: self.rowiter.htsfile = self.htsfile jpayne@69: self.rowiter.header = self.header jpayne@69: jpayne@69: # make sure the iterator understand that IteratorRowAllRefs jpayne@69: # has ownership jpayne@69: self.rowiter.owns_samfile = False jpayne@69: jpayne@69: def __iter__(self): jpayne@69: return self jpayne@69: jpayne@69: def __next__(self): jpayne@69: # Create an initial iterator jpayne@69: if self.tid == -1: jpayne@69: if not self.samfile.nreferences: jpayne@69: raise StopIteration jpayne@69: self.tid = 0 jpayne@69: self.nextiter() jpayne@69: jpayne@69: while 1: jpayne@69: self.rowiter.cnext() jpayne@69: jpayne@69: # If current iterator is not exhausted, return aligned read jpayne@69: if self.rowiter.retval > 0: jpayne@69: return makeAlignedSegment(self.rowiter.b, self.header) jpayne@69: jpayne@69: self.tid += 1 jpayne@69: jpayne@69: # Otherwise, proceed to next reference or stop jpayne@69: if self.tid < self.samfile.nreferences: jpayne@69: self.nextiter() jpayne@69: else: jpayne@69: raise StopIteration jpayne@69: jpayne@69: jpayne@69: cdef class IteratorRowSelection(IteratorRow): jpayne@69: """*(AlignmentFile samfile)* jpayne@69: jpayne@69: iterate over reads in `samfile` at a given list of file positions. jpayne@69: jpayne@69: .. note:: jpayne@69: It is usually not necessary to create an object of this class jpayne@69: explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`. 
jpayne@69: """ jpayne@69: jpayne@69: def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True): jpayne@69: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@69: jpayne@69: self.positions = positions jpayne@69: self.current_pos = 0 jpayne@69: jpayne@69: def __iter__(self): jpayne@69: return self jpayne@69: jpayne@69: cdef bam1_t * getCurrent(self): jpayne@69: return self.b jpayne@69: jpayne@69: cdef int cnext(self): jpayne@69: '''cversion of iterator''' jpayne@69: # end iteration if out of positions jpayne@69: if self.current_pos >= len(self.positions): return -1 jpayne@69: jpayne@69: cdef uint64_t pos = self.positions[self.current_pos] jpayne@69: with nogil: jpayne@69: bgzf_seek(hts_get_bgzfp(self.htsfile), jpayne@69: pos, jpayne@69: 0) jpayne@69: self.current_pos += 1 jpayne@69: jpayne@69: cdef int ret jpayne@69: cdef bam_hdr_t * hdr = self.header.ptr jpayne@69: with nogil: jpayne@69: ret = sam_read1(self.htsfile, jpayne@69: hdr, jpayne@69: self.b) jpayne@69: return ret jpayne@69: jpayne@69: def __next__(self): jpayne@69: cdef int ret = self.cnext() jpayne@69: if ret >= 0: jpayne@69: return makeAlignedSegment(self.b, self.header) jpayne@69: elif ret == -1: jpayne@69: raise StopIteration jpayne@69: else: jpayne@69: raise IOError(read_failure_reason(ret)) jpayne@69: jpayne@69: jpayne@69: cdef int __advance_nofilter(void *data, bam1_t *b): jpayne@69: '''advance without any read filtering. jpayne@69: ''' jpayne@69: cdef __iterdata * d = <__iterdata*>data jpayne@69: cdef int ret jpayne@69: with nogil: jpayne@69: ret = sam_itr_next(d.htsfile, d.iter, b) jpayne@69: return ret jpayne@69: jpayne@69: jpayne@69: cdef int __advance_raw_nofilter(void *data, bam1_t *b): jpayne@69: '''advance (without iterator) without any read filtering. 
jpayne@69: ''' jpayne@69: cdef __iterdata * d = <__iterdata*>data jpayne@69: cdef int ret jpayne@69: with nogil: jpayne@69: ret = sam_read1(d.htsfile, d.header, b) jpayne@69: return ret jpayne@69: jpayne@69: jpayne@69: cdef int __advance_all(void *data, bam1_t *b): jpayne@69: '''only use reads for pileup passing basic filters such as jpayne@69: jpayne@69: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP jpayne@69: ''' jpayne@69: jpayne@69: cdef __iterdata * d = <__iterdata*>data jpayne@69: cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP jpayne@69: cdef int ret jpayne@69: while 1: jpayne@69: with nogil: jpayne@69: ret = sam_itr_next(d.htsfile, d.iter, b) jpayne@69: if ret < 0: jpayne@69: break jpayne@69: if b.core.flag & d.flag_filter: jpayne@69: continue jpayne@69: break jpayne@69: return ret jpayne@69: jpayne@69: jpayne@69: cdef int __advance_raw_all(void *data, bam1_t *b): jpayne@69: '''only use reads for pileup passing basic filters such as jpayne@69: jpayne@69: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP jpayne@69: ''' jpayne@69: jpayne@69: cdef __iterdata * d = <__iterdata*>data jpayne@69: cdef int ret jpayne@69: while 1: jpayne@69: with nogil: jpayne@69: ret = sam_read1(d.htsfile, d.header, b) jpayne@69: if ret < 0: jpayne@69: break jpayne@69: if b.core.flag & d.flag_filter: jpayne@69: continue jpayne@69: break jpayne@69: return ret jpayne@69: jpayne@69: jpayne@69: cdef int __advance_samtools(void * data, bam1_t * b): jpayne@69: '''advance using same filter and read processing as in jpayne@69: the samtools pileup. 
jpayne@69: ''' jpayne@69: cdef __iterdata * d = <__iterdata*>data jpayne@69: cdef int ret jpayne@69: cdef int q jpayne@69: jpayne@69: while 1: jpayne@69: with nogil: jpayne@69: ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b) jpayne@69: if ret < 0: jpayne@69: break jpayne@69: if b.core.flag & d.flag_filter: jpayne@69: continue jpayne@69: if d.flag_require and not (b.core.flag & d.flag_require): jpayne@69: continue jpayne@69: jpayne@69: # reload sequence jpayne@69: if d.fastafile != NULL and b.core.tid != d.tid: jpayne@69: if d.seq != NULL: jpayne@69: free(d.seq) jpayne@69: d.tid = b.core.tid jpayne@69: with nogil: jpayne@69: d.seq = faidx_fetch_seq( jpayne@69: d.fastafile, jpayne@69: d.header.target_name[d.tid], jpayne@69: 0, MAX_POS, jpayne@69: &d.seq_len) jpayne@69: jpayne@69: if d.seq == NULL: jpayne@69: raise ValueError( jpayne@69: "reference sequence for '{}' (tid={}) not found".format( jpayne@69: d.header.target_name[d.tid], d.tid)) jpayne@69: jpayne@69: # realign read - changes base qualities jpayne@69: if d.seq != NULL and d.compute_baq: jpayne@69: # 4th option to realign is flag: jpayne@69: # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4 jpayne@69: if d.redo_baq: jpayne@69: sam_prob_realn(b, d.seq, d.seq_len, 7) jpayne@69: else: jpayne@69: sam_prob_realn(b, d.seq, d.seq_len, 3) jpayne@69: jpayne@69: if d.seq != NULL and d.adjust_capq_threshold > 10: jpayne@69: q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold) jpayne@69: if q < 0: jpayne@69: continue jpayne@69: elif b.core.qual > q: jpayne@69: b.core.qual = q jpayne@69: jpayne@69: if b.core.qual < d.min_mapping_quality: jpayne@69: continue jpayne@69: if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR): jpayne@69: continue jpayne@69: jpayne@69: break jpayne@69: jpayne@69: return ret jpayne@69: jpayne@69: jpayne@69: cdef class IteratorColumn: jpayne@69: '''abstract base class for iterators over columns. 
jpayne@69: jpayne@69: IteratorColumn objects wrap the pileup functionality of samtools. jpayne@69: jpayne@69: For reasons of efficiency, the iterator points to the current jpayne@69: pileup buffer. The pileup buffer is updated at every iteration. jpayne@69: This might cause some unexpected behaviour. For example, jpayne@69: consider the conversion to a list:: jpayne@69: jpayne@69: f = AlignmentFile("file.bam", "rb") jpayne@69: result = list(f.pileup()) jpayne@69: jpayne@69: Here, ``result`` will contain ``n`` objects of type jpayne@69: :class:`~pysam.PileupColumn` for ``n`` columns, but each object in jpayne@69: ``result`` will contain the same information. jpayne@69: jpayne@69: The desired behaviour can be achieved by list comprehension:: jpayne@69: jpayne@69: result = [x.pileups() for x in f.pileup()] jpayne@69: jpayne@69: ``result`` will be a list of ``n`` lists of objects of type jpayne@69: :class:`~pysam.PileupRead`. jpayne@69: jpayne@69: If the iterator is associated with a :class:`~pysam.Fastafile` jpayne@69: using the :meth:`add_reference` method, then the iterator will jpayne@69: export the current sequence via the methods :meth:`get_sequence` jpayne@69: and :meth:`seq_len`. jpayne@69: jpayne@69: See :class:`~AlignmentFile.pileup` for kwargs to the iterator. 
jpayne@69: ''' jpayne@69: jpayne@69: def __cinit__( self, AlignmentFile samfile, **kwargs): jpayne@69: self.samfile = samfile jpayne@69: self.fastafile = kwargs.get("fastafile", None) jpayne@69: self.stepper = kwargs.get("stepper", "samtools") jpayne@69: self.max_depth = kwargs.get("max_depth", 8000) jpayne@69: self.ignore_overlaps = kwargs.get("ignore_overlaps", True) jpayne@69: self.min_base_quality = kwargs.get("min_base_quality", 13) jpayne@69: self.iterdata.seq = NULL jpayne@69: self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0) jpayne@69: self.iterdata.flag_require = kwargs.get("flag_require", 0) jpayne@69: self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) jpayne@69: self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0) jpayne@69: self.iterdata.compute_baq = kwargs.get("compute_baq", True) jpayne@69: self.iterdata.redo_baq = kwargs.get("redo_baq", False) jpayne@69: self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True) jpayne@69: jpayne@69: self.tid = 0 jpayne@69: self.pos = 0 jpayne@69: self.n_plp = 0 jpayne@69: self.plp = NULL jpayne@69: self.pileup_iter = NULL jpayne@69: jpayne@69: def __iter__(self): jpayne@69: return self jpayne@69: jpayne@69: cdef int cnext(self): jpayne@69: '''perform next iteration. jpayne@69: ''' jpayne@69: # do not release gil here because of call-backs jpayne@69: cdef int ret = bam_mplp_auto(self.pileup_iter, jpayne@69: &self.tid, jpayne@69: &self.pos, jpayne@69: &self.n_plp, jpayne@69: &self.plp) jpayne@69: return ret jpayne@69: jpayne@69: cdef char * get_sequence(self): jpayne@69: '''return current reference sequence underlying the iterator. 
jpayne@69: ''' jpayne@69: return self.iterdata.seq jpayne@69: jpayne@69: property seq_len: jpayne@69: '''current sequence length.''' jpayne@69: def __get__(self): jpayne@69: return self.iterdata.seq_len jpayne@69: jpayne@69: def add_reference(self, FastaFile fastafile): jpayne@69: ''' jpayne@69: add reference sequences in `fastafile` to iterator.''' jpayne@69: self.fastafile = fastafile jpayne@69: if self.iterdata.seq != NULL: jpayne@69: free(self.iterdata.seq) jpayne@69: self.iterdata.tid = -1 jpayne@69: self.iterdata.fastafile = self.fastafile.fastafile jpayne@69: jpayne@69: def has_reference(self): jpayne@69: ''' jpayne@69: return true if iterator is associated with a reference''' jpayne@69: return self.fastafile jpayne@69: jpayne@69: cdef _setup_iterator(self, jpayne@69: int tid, jpayne@69: int start, jpayne@69: int stop, jpayne@69: int multiple_iterators=0): jpayne@69: '''setup the iterator structure''' jpayne@69: jpayne@69: self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators) jpayne@69: self.iterdata.htsfile = self.samfile.htsfile jpayne@69: self.iterdata.iter = self.iter.iter jpayne@69: self.iterdata.seq = NULL jpayne@69: self.iterdata.tid = -1 jpayne@69: self.iterdata.header = self.samfile.header.ptr jpayne@69: jpayne@69: if self.fastafile is not None: jpayne@69: self.iterdata.fastafile = self.fastafile.fastafile jpayne@69: else: jpayne@69: self.iterdata.fastafile = NULL jpayne@69: jpayne@69: # Free any previously allocated memory before reassigning jpayne@69: # pileup_iter jpayne@69: self._free_pileup_iter() jpayne@69: jpayne@69: cdef void * data[1] jpayne@69: data[0] = &self.iterdata jpayne@69: jpayne@69: if self.stepper is None or self.stepper == "all": jpayne@69: with nogil: jpayne@69: self.pileup_iter = bam_mplp_init(1, jpayne@69: &__advance_all, jpayne@69: data) jpayne@69: elif self.stepper == "nofilter": jpayne@69: with nogil: jpayne@69: self.pileup_iter = bam_mplp_init(1, jpayne@69: &__advance_nofilter, jpayne@69: data) 
jpayne@69: elif self.stepper == "samtools": jpayne@69: with nogil: jpayne@69: self.pileup_iter = bam_mplp_init(1, jpayne@69: &__advance_samtools, jpayne@69: data) jpayne@69: else: jpayne@69: raise ValueError( jpayne@69: "unknown stepper option `%s` in IteratorColumn" % self.stepper) jpayne@69: jpayne@69: if self.max_depth: jpayne@69: with nogil: jpayne@69: bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth) jpayne@69: jpayne@69: if self.ignore_overlaps: jpayne@69: with nogil: jpayne@69: bam_mplp_init_overlaps(self.pileup_iter) jpayne@69: jpayne@69: cdef _setup_raw_rest_iterator(self): jpayne@69: '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST''' jpayne@69: jpayne@69: self.iter = None jpayne@69: self.iterdata.iter = NULL jpayne@69: self.iterdata.htsfile = self.samfile.htsfile jpayne@69: self.iterdata.seq = NULL jpayne@69: self.iterdata.tid = -1 jpayne@69: self.iterdata.header = self.samfile.header.ptr jpayne@69: jpayne@69: if self.fastafile is not None: jpayne@69: self.iterdata.fastafile = self.fastafile.fastafile jpayne@69: else: jpayne@69: self.iterdata.fastafile = NULL jpayne@69: jpayne@69: # Free any previously allocated memory before reassigning jpayne@69: # pileup_iter jpayne@69: self._free_pileup_iter() jpayne@69: jpayne@69: cdef void * data[1] jpayne@69: data[0] = &self.iterdata jpayne@69: jpayne@69: if self.stepper is None or self.stepper == "all": jpayne@69: with nogil: jpayne@69: self.pileup_iter = bam_mplp_init(1, jpayne@69: &__advance_raw_all, jpayne@69: data) jpayne@69: elif self.stepper == "nofilter": jpayne@69: with nogil: jpayne@69: self.pileup_iter = bam_mplp_init(1, jpayne@69: &__advance_raw_nofilter, jpayne@69: data) jpayne@69: elif self.stepper == "samtools": jpayne@69: with nogil: jpayne@69: self.pileup_iter = bam_mplp_init(1, jpayne@69: &__advance_samtools, jpayne@69: data) jpayne@69: else: jpayne@69: raise ValueError( jpayne@69: "unknown stepper option `%s` in IteratorColumn" % self.stepper) jpayne@69: jpayne@69: 
if self.max_depth: jpayne@69: with nogil: jpayne@69: bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth) jpayne@69: jpayne@69: if self.ignore_overlaps: jpayne@69: with nogil: jpayne@69: bam_mplp_init_overlaps(self.pileup_iter) jpayne@69: jpayne@69: cdef reset(self, tid, start, stop): jpayne@69: '''reset iterator position. jpayne@69: jpayne@69: This permits using the iterator multiple times without jpayne@69: having to incur the full set-up costs. jpayne@69: ''' jpayne@69: if self.iter is None: jpayne@69: raise TypeError("Raw iterator set up without region cannot be reset") jpayne@69: jpayne@69: self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0) jpayne@69: self.iterdata.iter = self.iter.iter jpayne@69: jpayne@69: # invalidate sequence if different tid jpayne@69: if self.tid != tid: jpayne@69: if self.iterdata.seq != NULL: jpayne@69: free(self.iterdata.seq) jpayne@69: self.iterdata.seq = NULL jpayne@69: self.iterdata.tid = -1 jpayne@69: jpayne@69: # self.pileup_iter = bam_mplp_init(1 jpayne@69: # &__advancepileup, jpayne@69: # &self.iterdata) jpayne@69: with nogil: jpayne@69: bam_mplp_reset(self.pileup_iter) jpayne@69: jpayne@69: cdef _free_pileup_iter(self): jpayne@69: '''free the memory alloc'd by bam_plp_init. jpayne@69: jpayne@69: This is needed before setup_iterator allocates another jpayne@69: pileup_iter, or else memory will be lost. 
''' jpayne@69: if self.pileup_iter != NULL: jpayne@69: with nogil: jpayne@69: bam_mplp_reset(self.pileup_iter) jpayne@69: bam_mplp_destroy(self.pileup_iter) jpayne@69: self.pileup_iter = NULL jpayne@69: jpayne@69: def __dealloc__(self): jpayne@69: # reset in order to avoid memory leak messages for iterators jpayne@69: # that have not been fully consumed jpayne@69: self._free_pileup_iter() jpayne@69: self.plp = NULL jpayne@69: jpayne@69: if self.iterdata.seq != NULL: jpayne@69: free(self.iterdata.seq) jpayne@69: self.iterdata.seq = NULL jpayne@69: jpayne@69: # backwards compatibility jpayne@69: jpayne@69: def hasReference(self): jpayne@69: return self.has_reference() jpayne@69: cdef char * getSequence(self): jpayne@69: return self.get_sequence() jpayne@69: def addReference(self, FastaFile fastafile): jpayne@69: return self.add_reference(fastafile) jpayne@69: jpayne@69: jpayne@69: cdef class IteratorColumnRegion(IteratorColumn): jpayne@69: '''iterates over a region only. jpayne@69: ''' jpayne@69: def __cinit__(self, jpayne@69: AlignmentFile samfile, jpayne@69: int tid = 0, jpayne@69: int start = 0, jpayne@69: int stop = MAX_POS, jpayne@69: int truncate = False, jpayne@69: int multiple_iterators = True, jpayne@69: **kwargs ): jpayne@69: jpayne@69: # initialize iterator. Multiple iterators not available jpayne@69: # for CRAM. 
jpayne@69: if multiple_iterators and samfile.is_cram: jpayne@69: warnings.warn("multiple_iterators not implemented for CRAM") jpayne@69: multiple_iterators = False jpayne@69: jpayne@69: self._setup_iterator(tid, start, stop, multiple_iterators) jpayne@69: self.start = start jpayne@69: self.stop = stop jpayne@69: self.truncate = truncate jpayne@69: jpayne@69: def __next__(self): jpayne@69: jpayne@69: cdef int n jpayne@69: jpayne@69: while 1: jpayne@69: n = self.cnext() jpayne@69: if n < 0: jpayne@69: raise ValueError("error during iteration" ) jpayne@69: jpayne@69: if n == 0: jpayne@69: raise StopIteration jpayne@69: jpayne@69: if self.truncate: jpayne@69: if self.start > self.pos: jpayne@69: continue jpayne@69: if self.pos >= self.stop: jpayne@69: raise StopIteration jpayne@69: jpayne@69: return makePileupColumn(&self.plp, jpayne@69: self.tid, jpayne@69: self.pos, jpayne@69: self.n_plp, jpayne@69: self.min_base_quality, jpayne@69: self.iterdata.seq, jpayne@69: self.samfile.header) jpayne@69: jpayne@69: jpayne@69: cdef class IteratorColumnAllRefs(IteratorColumn): jpayne@69: """iterates over all columns by chaining iterators over each reference jpayne@69: """ jpayne@69: jpayne@69: def __cinit__(self, jpayne@69: AlignmentFile samfile, jpayne@69: **kwargs): jpayne@69: jpayne@69: # no iteration over empty files jpayne@69: if not samfile.nreferences: jpayne@69: raise StopIteration jpayne@69: jpayne@69: # initialize iterator jpayne@69: self._setup_iterator(self.tid, 0, MAX_POS, 1) jpayne@69: jpayne@69: def __next__(self): jpayne@69: jpayne@69: cdef int n jpayne@69: while 1: jpayne@69: n = self.cnext() jpayne@69: if n < 0: jpayne@69: raise ValueError("error during iteration") jpayne@69: jpayne@69: # proceed to next reference or stop jpayne@69: if n == 0: jpayne@69: self.tid += 1 jpayne@69: if self.tid < self.samfile.nreferences: jpayne@69: self._setup_iterator(self.tid, 0, MAX_POS, 0) jpayne@69: else: jpayne@69: raise StopIteration jpayne@69: continue jpayne@69: jpayne@69: 
# return result, if within same reference jpayne@69: return makePileupColumn(&self.plp, jpayne@69: self.tid, jpayne@69: self.pos, jpayne@69: self.n_plp, jpayne@69: self.min_base_quality, jpayne@69: self.iterdata.seq, jpayne@69: self.samfile.header) jpayne@69: jpayne@69: jpayne@69: cdef class IteratorColumnAll(IteratorColumn): jpayne@69: """iterates over all columns, without using an index jpayne@69: """ jpayne@69: jpayne@69: def __cinit__(self, jpayne@69: AlignmentFile samfile, jpayne@69: **kwargs): jpayne@69: jpayne@69: self._setup_raw_rest_iterator() jpayne@69: jpayne@69: def __next__(self): jpayne@69: jpayne@69: cdef int n jpayne@69: n = self.cnext() jpayne@69: if n < 0: jpayne@69: raise ValueError("error during iteration") jpayne@69: jpayne@69: if n == 0: jpayne@69: raise StopIteration jpayne@69: jpayne@69: return makePileupColumn(&self.plp, jpayne@69: self.tid, jpayne@69: self.pos, jpayne@69: self.n_plp, jpayne@69: self.min_base_quality, jpayne@69: self.iterdata.seq, jpayne@69: self.samfile.header) jpayne@69: jpayne@69: jpayne@69: cdef class SNPCall: jpayne@69: '''the results of a SNP call.''' jpayne@69: cdef int _tid jpayne@69: cdef int _pos jpayne@69: cdef char _reference_base jpayne@69: cdef char _genotype jpayne@69: cdef int _consensus_quality jpayne@69: cdef int _snp_quality jpayne@69: cdef int _rms_mapping_quality jpayne@69: cdef int _coverage jpayne@69: jpayne@69: property tid: jpayne@69: '''the chromosome ID as is defined in the header''' jpayne@69: def __get__(self): jpayne@69: return self._tid jpayne@69: jpayne@69: property pos: jpayne@69: '''nucleotide position of SNP.''' jpayne@69: def __get__(self): return self._pos jpayne@69: jpayne@69: property reference_base: jpayne@69: '''reference base at pos. 
``N`` if no reference sequence supplied.''' jpayne@69: def __get__(self): return from_string_and_size( &self._reference_base, 1 ) jpayne@69: jpayne@69: property genotype: jpayne@69: '''the genotype called.''' jpayne@69: def __get__(self): return from_string_and_size( &self._genotype, 1 ) jpayne@69: jpayne@69: property consensus_quality: jpayne@69: '''the genotype quality (Phred-scaled).''' jpayne@69: def __get__(self): return self._consensus_quality jpayne@69: jpayne@69: property snp_quality: jpayne@69: '''the snp quality (Phred scaled) - probability of consensus being jpayne@69: identical to reference sequence.''' jpayne@69: def __get__(self): return self._snp_quality jpayne@69: jpayne@69: property mapping_quality: jpayne@69: '''the root mean square (rms) of the mapping quality of all reads jpayne@69: involved in the call.''' jpayne@69: def __get__(self): return self._rms_mapping_quality jpayne@69: jpayne@69: property coverage: jpayne@69: '''coverage or read depth - the number of reads involved in the call.''' jpayne@69: def __get__(self): return self._coverage jpayne@69: jpayne@69: def __str__(self): jpayne@69: jpayne@69: return "\t".join( map(str, ( jpayne@69: self.tid, jpayne@69: self.pos, jpayne@69: self.reference_base, jpayne@69: self.genotype, jpayne@69: self.consensus_quality, jpayne@69: self.snp_quality, jpayne@69: self.mapping_quality, jpayne@69: self.coverage ) ) ) jpayne@69: jpayne@69: jpayne@69: cdef class IndexedReads: jpayne@69: """Index a Sam/BAM-file by query name while keeping the jpayne@69: original sort order intact. jpayne@69: jpayne@69: The index is kept in memory and can be substantial. jpayne@69: jpayne@69: By default, the file is re-opened to avoid conflicts if multiple jpayne@69: operators work on the same file. Set `multiple_iterators` = False jpayne@69: to not re-open `samfile`. jpayne@69: jpayne@69: Parameters jpayne@69: ---------- jpayne@69: jpayne@69: samfile : AlignmentFile jpayne@69: File to be indexed. 
jpayne@69: jpayne@69: multiple_iterators : bool jpayne@69: Flag indicating whether the file should be reopened. Reopening prevents jpayne@69: existing iterators being affected by the indexing. jpayne@69: jpayne@69: """ jpayne@69: jpayne@69: def __init__(self, AlignmentFile samfile, int multiple_iterators=True): jpayne@69: cdef char *cfilename jpayne@69: jpayne@69: # makes sure that samfile stays alive as long as this jpayne@69: # object is alive. jpayne@69: self.samfile = samfile jpayne@69: cdef bam_hdr_t * hdr = NULL jpayne@69: assert samfile.is_bam, "can only apply IndexReads on bam files" jpayne@69: jpayne@69: # multiple_iterators the file - note that this makes the iterator jpayne@69: # slow and causes pileup to slow down significantly. jpayne@69: if multiple_iterators: jpayne@69: cfilename = samfile.filename jpayne@69: with nogil: jpayne@69: self.htsfile = hts_open(cfilename, 'r') jpayne@69: if self.htsfile == NULL: jpayne@69: raise OSError("unable to reopen htsfile") jpayne@69: jpayne@69: # need to advance in newly opened file to position after header jpayne@69: # better: use seek/tell? 
jpayne@69: with nogil: jpayne@69: hdr = sam_hdr_read(self.htsfile) jpayne@69: if hdr == NULL: jpayne@69: raise OSError("unable to read header information") jpayne@69: self.header = makeAlignmentHeader(hdr) jpayne@69: self.owns_samfile = True jpayne@69: else: jpayne@69: self.htsfile = self.samfile.htsfile jpayne@69: self.header = samfile.header jpayne@69: self.owns_samfile = False jpayne@69: jpayne@69: def build(self): jpayne@69: '''build the index.''' jpayne@69: jpayne@69: self.index = collections.defaultdict(list) jpayne@69: jpayne@69: # this method will start indexing from the current file position jpayne@69: cdef int ret = 1 jpayne@69: cdef bam1_t * b = calloc(1, sizeof( bam1_t)) jpayne@69: if b == NULL: jpayne@69: raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t))) jpayne@69: jpayne@69: cdef uint64_t pos jpayne@69: cdef bam_hdr_t * hdr = self.header.ptr jpayne@69: jpayne@69: while ret > 0: jpayne@69: with nogil: jpayne@69: pos = bgzf_tell(hts_get_bgzfp(self.htsfile)) jpayne@69: ret = sam_read1(self.htsfile, jpayne@69: hdr, jpayne@69: b) jpayne@69: jpayne@69: if ret > 0: jpayne@69: qname = charptr_to_str(pysam_bam_get_qname(b)) jpayne@69: self.index[qname].append(pos) jpayne@69: jpayne@69: bam_destroy1(b) jpayne@69: jpayne@69: def find(self, query_name): jpayne@69: '''find `query_name` in index. jpayne@69: jpayne@69: Returns jpayne@69: ------- jpayne@69: jpayne@69: IteratorRowSelection jpayne@69: Returns an iterator over all reads with query_name. jpayne@69: jpayne@69: Raises jpayne@69: ------ jpayne@69: jpayne@69: KeyError jpayne@69: if the `query_name` is not in the index. 
jpayne@69: jpayne@69: ''' jpayne@69: if query_name in self.index: jpayne@69: return IteratorRowSelection( jpayne@69: self.samfile, jpayne@69: self.index[query_name], jpayne@69: multiple_iterators = False) jpayne@69: else: jpayne@69: raise KeyError("read %s not found" % query_name) jpayne@69: jpayne@69: def __dealloc__(self): jpayne@69: if self.owns_samfile: jpayne@69: hts_close(self.htsfile)