# cython: embedsignature=True
# cython: profile=True
########################################################
########################################################
# Cython wrapper for SAM/BAM/CRAM files based on htslib
########################################################
# The principal classes defined in this module are:
#
# class AlignmentFile   read/write access to SAM/BAM/CRAM formatted files
#
# class AlignmentHeader manage SAM/BAM/CRAM header data
#
# class IndexedReads    index a SAM/BAM/CRAM file by query name while keeping
#                       the original sort order intact
#
# Additionally this module defines numerous additional classes that
# are part of the internal API. These are:
#
# Various iterator classes to iterate over alignments in sequential
# (IteratorRow) or in a stacked fashion (IteratorColumn):
#
# class IteratorRow
# class IteratorRowRegion
# class IteratorRowHead
# class IteratorRowAll
# class IteratorRowAllRefs
# class IteratorRowSelection
# class IteratorColumn
# class IteratorColumnRegion
# class IteratorColumnAll
# class IteratorColumnAllRefs
#
########################################################
#
# The MIT License
#
# Copyright (c) 2015 Andreas Heger
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
########################################################
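
# Illustrative usage sketch (comments only, never executed). The file name,
# index name and region below are made-up examples; the calls shown map onto
# the public classes and methods defined in this module.
#
#   import pysam
#
#   with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
#       # random access requires an index (e.g. ex1.bam.bai)
#       for read in samfile.fetch("chr1", 100, 200):
#           print(read.query_name, read.reference_start)
#
#       # column-wise (pileup) iteration over the same region
#       for column in samfile.pileup("chr1", 100, 200):
#           print(column.reference_pos, column.nsegments)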

import os
import collections
try:
    from collections.abc import Sequence, Mapping  # noqa
except ImportError:
    from collections import Sequence, Mapping  # noqa
import re
import warnings
import array
from libc.errno cimport errno, EPIPE
from libc.string cimport strcmp, strpbrk, strerror
from libc.stdint cimport INT32_MAX

from cpython cimport array as c_array

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size
from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
    HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL

from io import StringIO

cimport cython


__all__ = [
    "AlignmentFile",
    "AlignmentHeader",
    "IteratorRow",
    "IteratorColumn",
    "IndexedReads"]

IndexStats = collections.namedtuple("IndexStats",
                                    ("contig",
                                     "mapped",
                                     "unmapped",
                                     "total"))

########################################################
## global variables
# maximum genomic coordinate
# for some reason, using 'int' causes overflow
cdef int MAX_POS = (1 << 31) - 1

# valid types for SAM headers
VALID_HEADER_TYPES = {"HD" : Mapping,
                      "SQ" : Sequence,
                      "RG" : Sequence,
                      "PG" : Sequence,
                      "CO" : Sequence}

# order of records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")

# default type conversions within SAM header records
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str,
                               "SS" : str,},
                       "SQ" : {"SN" : str, "LN" : int, "AS" : str,
                               "M5" : str, "SP" : str, "UR" : str,
                               "AH" : str, "TP" : str, "DS" : str,
                               "AN" : str,},
                       "RG" : {"ID" : str, "CN" : str, "DS" : str,
                               "DT" : str, "FO" : str, "KS" : str,
                               "LB" : str, "PG" : str, "PI" : str,
                               "PL" : str, "PM" : str, "PU" : str,
                               "SM" : str, "BC" : str,},
                       "PG" : {"ID" : str, "PN" : str, "CL" : str,
                               "PP" : str, "DS" : str, "VN" : str,},}
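
# For example (illustrative values): with the conversions above, an @SQ line
# such as "@SQ\tSN:chr1\tLN:1575" is returned by AlignmentHeader.to_dict()
# as {"SN": "chr1", "LN": 1575}, i.e. with LN already converted to int.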

# output order of fields within records. Ensure that CL is at
# the end as parsing a CL will ignore any subsequent records.
VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"),
                      "SQ" : ("SN", "LN", "AS", "M5",
                              "UR", "SP", "AH", "TP",
                              "DS", "AN"),
                      "RG" : ("ID", "CN", "SM", "LB",
                              "PU", "PI", "DT", "DS",
                              "PL", "FO", "KS", "PG",
                              "PM", "BC"),
                      "PG" : ("PN", "ID", "VN", "PP",
                              "DS", "CL"),}


def build_header_line(fields, record):
    '''build a header line from `fields` dictionary for `record`'''

    # TODO: add checking for field and sort order
    line = ["@%s" % record]
    # comment
    if record == "CO":
        line.append(fields)
    # user tags
    elif record.islower():
        for key in sorted(fields):
            line.append("%s:%s" % (key, str(fields[key])))
    # defined tags
    else:
        # write fields of the specification
        for key in VALID_HEADER_ORDER[record]:
            if key in fields:
                line.append("%s:%s" % (key, str(fields[key])))
        # write user fields
        for key in fields:
            if not key.isupper():
                line.append("%s:%s" % (key, str(fields[key])))

    return "\t".join(line)


cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
    if not hdr:
        raise ValueError('cannot create AlignmentHeader, received NULL pointer')

    # check: is AlignmentHeader.__cinit__ called?
    cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader)
    header.ptr = hdr

    return header

def read_failure_reason(code):
    if code == -2:
        return 'truncated file'
    else:
        return "error {} while reading file".format(code)


# the following should be class-method for VariantHeader, but cdef @classmethods
# are not implemented in cython.
cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
                                        reference_names,
                                        reference_lengths,
                                        add_sq_text=True,
                                        text=None) except -1:
    """build header from list of reference names and lengths.
    """

cdef class AlignmentHeader(object):
    """header information for a :class:`AlignmentFile` object

    Parameters
    ----------
    header_dict : dict
        build header from a multi-level dictionary. The
        first level are the four types ('HD', 'SQ', ...). The second
        level are a list of lines, with each line being a list of
        tag-value pairs. The header is constructed first from all the
        defined fields, followed by user tags in alphabetical
        order. Alternatively, an :class:`~pysam.AlignmentHeader`
        object can be passed directly.

    text : string
        use the string provided as the header

    reference_names : list
        see reference_lengths

    reference_lengths : list
        build header from list of chromosome names and lengths. By
        default, 'SQ' and 'LN' tags will be added to the header
        text. This option can be changed by unsetting the flag
        `add_sq_text`.

    add_sq_text : bool
        do not add 'SQ' and 'LN' tags to header. This option permits
        construction of :term:`SAM` formatted files without a header.

    """
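
    # Illustrative usage sketch (comments only, not part of the class
    # definition): a header is typically built via one of the factory
    # classmethods below. The reference names and lengths are made-up values.
    #
    #   header = AlignmentHeader.from_references(
    #       reference_names=["chr1", "chr2"],
    #       reference_lengths=[1575, 1584])
    #
    #   header = AlignmentHeader.from_dict(
    #       {"HD": {"VN": "1.6", "SO": "coordinate"},
    #        "SQ": [{"SN": "chr1", "LN": 1575},
    #               {"SN": "chr2", "LN": 1584}]})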

    # See makeVariantHeader for C constructor
    def __cinit__(self):
        self.ptr = NULL

    # Python constructor
    def __init__(self):
        self.ptr = bam_hdr_init()
        if self.ptr is NULL:
            raise MemoryError("could not create header")

    @classmethod
    def _from_text_and_lengths(cls, text, reference_names, reference_lengths):

        cdef AlignmentHeader self = AlignmentHeader()
        cdef char *ctext
        cdef int l_text
        cdef int n, x
        if text is not None:
            btext = force_bytes(text)
            ctext = btext
            l_text = len(btext)
            self.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
            if self.ptr.text == NULL:
                raise MemoryError("could not allocate {} bytes".format(l_text + 1), sizeof(char))
            self.ptr.l_text = l_text
            memcpy(self.ptr.text, ctext, l_text + 1)

        if reference_names and reference_lengths:
            reference_names = [force_bytes(ref) for ref in reference_names]

            self.ptr.n_targets = len(reference_names)

            n = sum([len(reference_names) + 1])
            self.ptr.target_name = <char**>calloc(n, sizeof(char*))
            if self.ptr.target_name == NULL:
                raise MemoryError("could not allocate {} bytes".format(n, sizeof(char *)))

            self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
            if self.ptr.target_len == NULL:
                raise MemoryError("could not allocate {} bytes".format(n, sizeof(uint32_t)))

            for x from 0 <= x < self.ptr.n_targets:
                self.ptr.target_len[x] = reference_lengths[x]
                name = reference_names[x]
                self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
                if self.ptr.target_name[x] == NULL:
                    raise MemoryError("could not allocate {} bytes".format(len(name) + 1, sizeof(char)))
                strncpy(self.ptr.target_name[x], name, len(name))

        return self

    @classmethod
    def from_text(cls, text):

        reference_names, reference_lengths = [], []
        for line in text.splitlines():
            if line.startswith("@SQ"):
                fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(int(fields["LN"]))
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))
                except ValueError:
                    raise ValueError("wrong sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_dict(cls, header_dict):

        cdef list lines = []
        # first: defined tags
        for record in VALID_HEADERS:
            if record in header_dict:
                data = header_dict[record]
                if not isinstance(data, VALID_HEADER_TYPES[record]):
                    raise ValueError(
                        "invalid type for record %s: %s, expected %s" %
                        (record, type(data), VALID_HEADER_TYPES[record]))
                if isinstance(data, Mapping):
                    lines.append(build_header_line(data, record))
                else:
                    for fields in header_dict[record]:
                        lines.append(build_header_line(fields, record))

        # then: user tags (lower case), sorted alphabetically
        for record, data in sorted(header_dict.items()):
            if record in VALID_HEADERS:
                continue
            if isinstance(data, Mapping):
                lines.append(build_header_line(data, record))
            else:
                for fields in header_dict[record]:
                    lines.append(build_header_line(fields, record))

        text = "\n".join(lines) + "\n"

        reference_names, reference_lengths = [], []
        if "SQ" in header_dict:
            for fields in header_dict["SQ"]:
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(fields["LN"])
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):

        if len(reference_names) != len(reference_lengths):
            raise ValueError("number of reference names and lengths do not match")

        # optionally, if there is no text, add a SAM compatible header to output file.
        if text is None and add_sq_text:
            text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
                reference_names, reference_lengths)])

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    def __dealloc__(self):
        bam_hdr_destroy(self.ptr)
        self.ptr = NULL

    def __bool__(self):
        return self.ptr != NULL

    def copy(self):
        return makeAlignmentHeader(bam_hdr_dup(self.ptr))

    property nreferences:
        """int with the number of :term:`reference` sequences in the file.

        This is a read-only attribute."""
        def __get__(self):
            return self.ptr.n_targets

    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(charptr_to_str(self.ptr.target_name[x]))
            return tuple(t)

    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`
        """
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(self.ptr.target_len[x])
            return tuple(t)

    def _build_sequence_section(self):
        """return sequence section of header.

        The sequence section is built from the list of reference names and
        lengths stored in the BAM-file and not from any @SQ entries that
        are part of the header's text section.
        """
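        # Illustrative return value (made-up names and lengths):
        #   "@SQ\tSN:chr1\tLN:1575\n@SQ\tSN:chr2\tLN:1584\n"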

        cdef int x
        text = []
        for x in range(self.ptr.n_targets):
            text.append("@SQ\tSN:{}\tLN:{}\n".format(
                force_str(self.ptr.target_name[x]),
                self.ptr.target_len[x]))
        return "".join(text)

    def to_dict(self):
        """return two-level dictionary with header information from the file.

        The first level contains the record (``HD``, ``SQ``, etc) and
        the second level contains the fields (``VN``, ``LN``, etc).

        The parser is validating and will raise an AssertionError if
        it encounters any record or field tags that are not part of
        the SAM specification. Use the
        :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
        header.

        The parsing follows the SAM format specification with the
        exception of the ``CL`` field, which consumes the rest of a
        header line irrespective of any additional fields.
        This behaviour has been added to accommodate command line
        options that contain characters that are not valid field
        separators.

        If no @SQ entries are within the text section of the header,
        they will be automatically added from the reference names and
        lengths stored in the binary part of the header.
        """
        result = collections.OrderedDict()

        # convert to python string
        t = self.__str__()
        for line in t.split("\n"):
            line = line.strip(' \0')
            if not line:
                continue
            assert line.startswith("@"), \
                "header line without '@': '%s'" % line
            fields = line[1:].split("\t")
            record = fields[0]
            assert record in VALID_HEADER_TYPES, \
                "header line with invalid type '%s': '%s'" % (record, line)

            # treat comments
            if record == "CO":
                if record not in result:
                    result[record] = []
                result[record].append("\t".join(fields[1:]))
                continue
            # the following is clumsy as generators do not work?
            x = {}

            for idx, field in enumerate(fields[1:]):
                if ":" not in field:
                    raise ValueError("malformatted header: no ':' in field")
                key, value = field.split(":", 1)
                if key in ("CL",):
                    # special treatment for command line
                    # statements (CL). These might contain
                    # characters that are non-conformant with
                    # the valid field separators in the SAM
                    # header. Thus, in contravention to the
                    # SAM API, consume the rest of the line.
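                    # For example (illustrative values): given the
                    # tab-separated header line
                    #   @PG  ID:bwa  PN:bwa  CL:bwa mem -t 8 ref.fa reads.fq
                    # everything after "CL:" becomes the value, even if it
                    # contains tabs or further "TAG:value"-like text, because
                    # the remaining tab-split fields are re-joined before the
                    # single split on ":" in the statement below.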
                    key, value = "\t".join(fields[idx+1:]).split(":", 1)
                    x[key] = KNOWN_HEADER_FIELDS[record][key](value)
                    break

                # interpret type of known header record tags, default to str
                x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)

            if VALID_HEADER_TYPES[record] == Mapping:
                if record in result:
                    raise ValueError(
                        "multiple '%s' lines are not permitted" % record)

                result[record] = x
            elif VALID_HEADER_TYPES[record] == Sequence:
                if record not in result: result[record] = []
                result[record].append(x)

        # if there are no SQ lines in the header, add the
        # reference names from the information in the bam
        # file.
        #
        # Background: c-samtools keeps the textual part of the
        # header separate from the list of reference names and
        # lengths. Thus, if a header contains only SQ lines,
        # the SQ information is not part of the textual header
        # and thus is missing from the output. See issue 84.
        if "SQ" not in result:
            sq = []
            for ref, length in zip(self.references, self.lengths):
                sq.append({'LN': length, 'SN': ref })
            result["SQ"] = sq

        return result

    def as_dict(self):
        """deprecated, use :meth:`to_dict()` instead"""
        return self.to_dict()

    def get_reference_name(self, tid):
        if tid == -1:
            return None
        if not 0 <= tid < self.ptr.n_targets:
            raise ValueError("reference_id %i out of range 0<=tid<%i" %
                             (tid, self.ptr.n_targets))
        return charptr_to_str(self.ptr.target_name[tid])

    def get_reference_length(self, reference):
        cdef int tid = self.get_tid(reference)
        if tid < 0:
            raise KeyError("unknown reference {}".format(reference))
        else:
            return self.ptr.target_len[tid]

    def is_valid_tid(self, int tid):
        """
        return True if the numerical :term:`tid` is valid; False otherwise.

        Note that the unmapped tid code (-1) counts as invalid.
        """
        return 0 <= tid < self.ptr.n_targets

    def get_tid(self, reference):
        """
        return the numerical :term:`tid` corresponding to
        :term:`reference`

        returns -1 if reference is not known.
        """
        reference = force_bytes(reference)
        tid = bam_name2id(self.ptr, reference)
        if tid < -1:
            raise ValueError('could not parse header')
        return tid

    def __str__(self):
        '''string with the full contents of the :term:`sam file` header as a
        string.

        If no @SQ entries are within the text section of the header,
        they will be automatically added from the reference names and
        lengths stored in the binary part of the header.

        See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
        representation of the header.
        '''
        text = from_string_and_size(self.ptr.text, self.ptr.l_text)
        if "@SQ" not in text:
            text += "\n" + self._build_sequence_section()
        return text

    # dictionary access methods, for backwards compatibility.
    def __setitem__(self, key, value):
        raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict())")

    def __getitem__(self, key):
        return self.to_dict().__getitem__(key)

    def items(self):
        return self.to_dict().items()

    # PY2 compatibility
    def iteritems(self):
        return self.to_dict().items()

    def keys(self):
        return self.to_dict().keys()

    def values(self):
        return self.to_dict().values()

    def get(self, *args):
        return self.to_dict().get(*args)

    def __len__(self):
        return self.to_dict().__len__()

    def __contains__(self, key):
        return self.to_dict().__contains__(key)


cdef class AlignmentFile(HTSFile):
    """AlignmentFile(filepath_or_object, mode=None, template=None,
    reference_names=None, reference_lengths=None, text=NULL,
    header=None, add_sq_text=False, check_header=True, check_sq=True,
    reference_filename=None, filename=None, index_filename=None,
    filepath_index=None, require_index=False, duplicate_filehandle=True,
    ignore_truncation=False, threads=1)

    A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.

    If `filepath_or_object` is a string, the file is automatically
    opened. If `filepath_or_object` is a python File object, the
    already opened file will be used.

    If the file is opened for reading and an index exists (if file is BAM, a
    .bai file or if CRAM a .crai file), it will be opened automatically.
    `index_filename` may be specified explicitly if the index is not named
    in the standard manner, is not located in the same directory as the
    BAM/CRAM file, or is remote. Without an index, random access via
    :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup`
    is disabled.

    For writing, the header of a :term:`SAM` file/:term:`BAM` file can
    be constituted from several sources (see also the samtools format
    specification):

        1. If `template` is given, the header is copied from another
           `AlignmentFile` (`template` must be a
           :class:`~pysam.AlignmentFile`).

        2. If `header` is given, the header is built from a
           multi-level dictionary.

        3. If `text` is given, new header text is copied from raw
           text.

        4. The names (`reference_names`) and lengths
           (`reference_lengths`) are supplied directly as lists.

    When reading or writing a CRAM file, the filename of a FASTA-formatted
    reference can be specified with `reference_filename`.

    By default, if a file is opened in mode 'r', it is checked
    for a valid header (`check_header` = True) and a definition of
    chromosome names (`check_sq` = True).

    Parameters
    ----------
    mode : string
        `mode` should be ``r`` for reading or ``w`` for writing. The
        default is text mode (:term:`SAM`). For binary (:term:`BAM`)
        I/O you should append ``b`` for compressed or ``u`` for
        uncompressed :term:`BAM` output. Use ``h`` to output header
        information in text (:term:`TAM`) mode. Use ``c`` for
        :term:`CRAM` formatted files.

        If ``b`` is present, it must immediately follow ``r`` or
        ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
        ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
        :term:`BAM` formatted file for reading, type::

            f = pysam.AlignmentFile('ex1.bam','rb')

        If mode is not specified, the method will try to auto-detect
        in the order 'rb', 'r', thus both the following should work::

            f1 = pysam.AlignmentFile('ex1.bam')
            f2 = pysam.AlignmentFile('ex1.sam')

    template : AlignmentFile
        when writing, copy header from file `template`.

    header : dict or AlignmentHeader
        when writing, build header from a multi-level dictionary. The
        first level are the four types ('HD', 'SQ', ...). The second
        level are a list of lines, with each line being a list of
        tag-value pairs. The header is constructed first from all the
        defined fields, followed by user tags in alphabetical
        order. Alternatively, an :class:`~pysam.AlignmentHeader`
        object can be passed directly.

    text : string
        when writing, use the string provided as the header

    reference_names : list
        see reference_lengths

    reference_lengths : list
        when writing or opening a SAM file without a header, build the
        header from a list of chromosome names and lengths. By default,
        'SQ' and 'LN' tags will be added to the header text. This option
        can be changed by unsetting the flag `add_sq_text`.

    add_sq_text : bool
        do not add 'SQ' and 'LN' tags to header. This option permits
        construction of :term:`SAM` formatted files without a header.

    add_sam_header : bool
        when outputting SAM the default is to output a header. This is
        equivalent to opening the file in 'wh' mode. If this option is
        set to False, no header will be output. To read such a file,
        set `check_header=False`.

    check_header : bool
        obsolete: when reading a SAM file, check if header is present
        (default=True)

    check_sq : bool
        when reading, check if SQ entries are present in header
        (default=True)

    reference_filename : string
        Path to a FASTA-formatted reference file. Valid only for CRAM files.
        When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
        specified in the header (``UR`` tag), which are normally used to find
        the reference.

    index_filename : string
        Explicit path to the index file. Only needed if the index is not
        named in the standard manner, not located in the same directory as
        the BAM/CRAM file, or is remote.
        An IOError is raised if the index cannot be found or is invalid.

    filepath_index : string
        Alias for `index_filename`.

    require_index : bool
        When reading, require that an index file is present and is valid or
        raise an IOError. (default=False)

    filename : string
        Alternative to filepath_or_object. Filename of the file
        to be opened.

    duplicate_filehandle: bool
        By default, file handles passed either directly or through
        File-like objects will be duplicated before passing them to
        htslib. The duplication prevents issues where the same stream
        will be closed by htslib and through destruction of the
        high-level python object. Set to False to turn off
        duplication.

    ignore_truncation: bool
        Issue a warning, instead of raising an error if the current file
        appears to be truncated due to a missing EOF marker. Only applies
        to bgzipped formats. (Default=False)

    format_options: list
        A list of key=value strings, as accepted by --input-fmt-option and
        --output-fmt-option in samtools.
    threads: integer
        Number of threads to use for compressing/decompressing BAM/CRAM files.
        Setting threads to > 1 cannot be combined with `ignore_truncation`.
        (Default=1)
    """

    def __cinit__(self, *args, **kwargs):
        self.htsfile = NULL
        self.filename = None
        self.mode = None
        self.threads = 1
        self.is_stream = False
        self.is_remote = False
        self.index = NULL

        if "filename" in kwargs:
            args = [kwargs["filename"]]
            del kwargs["filename"]

        self._open(*args, **kwargs)

        # allocate memory for iterator
        self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
        if self.b == NULL:
            raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t)))

    def has_index(self):
        """return true if htsfile has an existing (and opened) index.
        """
        return self.index != NULL

    def check_index(self):
        """return True if index is present.

        Raises
        ------

        AttributeError
            if htsfile is :term:`SAM` formatted and thus has no index.

        ValueError
            if htsfile is closed or index could not be opened.
        """

        if not self.is_open:
            raise ValueError("I/O operation on closed file")
        if not self.is_bam and not self.is_cram:
            raise AttributeError(
                "AlignmentFile.mapped only available in bam files")
        if self.index == NULL:
            raise ValueError(
                "mapping information not recorded in index "
                "or index not available")
        return True

    def _open(self,
              filepath_or_object,
              mode=None,
              AlignmentFile template=None,
              reference_names=None,
              reference_lengths=None,
              reference_filename=None,
              text=None,
              header=None,
              port=None,
              add_sq_text=True,
              add_sam_header=True,
              check_header=True,
              check_sq=True,
              index_filename=None,
              filepath_index=None,
              require_index=False,
              referencenames=None,
              referencelengths=None,
              duplicate_filehandle=True,
              ignore_truncation=False,
              format_options=None,
              threads=1):
        '''open a sam, bam or cram formatted file.

        If _open is called on an existing file, the current file
        will be closed and a new file will be opened.

        '''
        cdef char *cfilename = NULL
        cdef char *creference_filename = NULL
        cdef char *cindexname = NULL
        cdef char *cmode = NULL
        cdef bam_hdr_t * hdr = NULL

        if threads > 1 and ignore_truncation:
            # This won't raise errors if reaching a truncated alignment,
            # because bgzf_mt_reader in htslib does not deal with
            # bgzf_mt_read_block returning non-zero values, contrary
            # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
            # Better to avoid this (for now) than to produce seemingly correct results.
            raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
        self.threads = threads

        # for backwards compatibility:
        if referencenames is not None:
            reference_names = referencenames
        if referencelengths is not None:
            reference_lengths = referencelengths

        # close a previously opened file
        if self.is_open:
            self.close()

        # autodetection for read
        if mode is None:
            mode = "r"

        if add_sam_header and mode == "w":
            mode = "wh"

        assert mode in ("r", "w", "rb", "wb", "wh",
                        "wbu", "rU", "wb0",
                        "rc", "wc"), \
            "invalid file opening mode `%s`" % mode

        self.duplicate_filehandle = duplicate_filehandle

        # StringIO not supported
        if isinstance(filepath_or_object, StringIO):
            raise NotImplementedError(
                "access from StringIO objects not supported")
        # reading from a file descriptor
        elif isinstance(filepath_or_object, int):
            self.filename = filepath_or_object
            filename = None
            self.is_remote = False
            self.is_stream = True
        # reading from a File object or other object with fileno
        elif hasattr(filepath_or_object, "fileno"):
            if filepath_or_object.closed:
                raise ValueError('I/O operation on closed file')
            self.filename = filepath_or_object
            # .name can be TextIOWrapper
            try:
                filename = encode_filename(str(filepath_or_object.name))
                cfilename = filename
            except AttributeError:
                filename = None
            self.is_remote = False
            self.is_stream = True
        # what remains is a filename
        else:
            self.filename = filename = encode_filename(filepath_or_object)
            cfilename = filename
            self.is_remote = hisremote(cfilename)
            self.is_stream = self.filename == b'-'

        # for htslib, wbu seems to not work
        if mode == "wbu":
            mode = "wb0"

        self.mode = force_bytes(mode)
        self.reference_filename = reference_filename = encode_filename(
            reference_filename)

        if mode[0] == 'w':
            # open file for writing

            if not (template or header or text or (reference_names and reference_lengths)):
                raise ValueError(
                    "either supply options `template`, `header`, `text` or both `reference_names` "
                    "and `reference_lengths` for writing")

            if template:
                # header is copied, though at the moment not strictly
                # necessary as AlignmentHeader is immutable.
                self.header = template.header.copy()
            elif isinstance(header, AlignmentHeader):
                self.header = header.copy()
            elif isinstance(header, Mapping):
                self.header = AlignmentHeader.from_dict(header)
            elif reference_names and reference_lengths:
                self.header = AlignmentHeader.from_references(
                    reference_names,
                    reference_lengths,
                    add_sq_text=add_sq_text,
                    text=text)
            elif text:
                self.header = AlignmentHeader.from_text(text)
            else:
                raise ValueError("not enough information to construct header. Please provide template, "
                                 "header, text or reference_names/reference_lengths")

            self.htsfile = self._open_htsfile()

            if self.htsfile == NULL:
                if errno:
                    raise IOError(errno, "could not open alignment file `{}`: {}".format(
                        force_str(filename),
                        force_str(strerror(errno))))
                else:
                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
            if format_options and len(format_options):
                self.add_hts_options(format_options)
            # set filename with reference sequences. If no filename
            # is given, the CRAM reference arrays will be built from
            # the @SQ lines in the header
            if "c" in mode and reference_filename:
                if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0):
                    raise ValueError("failure when setting reference filename")

            # write header to htsfile
            if "b" in mode or "c" in mode or "h" in mode:
                hdr = self.header.ptr
                with nogil:
                    sam_hdr_write(self.htsfile, hdr)

        elif mode[0] == "r":
            # open file for reading
            self.htsfile = self._open_htsfile()

            if self.htsfile == NULL:
                if errno:
                    raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
                                                                                         force_str(strerror(errno))))
                else:
                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))

            if self.htsfile.format.category != sequence_data:
                raise ValueError("file does not contain alignment data")

            if format_options and len(format_options):
                self.add_hts_options(format_options)

            self.check_truncation(ignore_truncation)

            # bam/cram files require a valid header
            if self.is_bam or self.is_cram:
                with nogil:
                    hdr = sam_hdr_read(self.htsfile)
                if hdr == NULL:
                    raise ValueError(
                        "file does not have a valid header (mode='%s') "
                        "- is it BAM/CRAM format?" % mode)
                self.header = makeAlignmentHeader(hdr)
            else:
                # in sam files a header is optional. If not given,
                # user may provide reference names and lengths to build
                # an on-the-fly header.
                if reference_names and reference_lengths:
                    # build header from target names and lengths
                    self.header = AlignmentHeader.from_references(
                        reference_names=reference_names,
                        reference_lengths=reference_lengths,
                        add_sq_text=add_sq_text,
                        text=text)
                else:
                    with nogil:
                        hdr = sam_hdr_read(self.htsfile)
                    if hdr == NULL:
                        raise ValueError(
                            "SAM? file does not have a valid header (mode='%s'), "
                            "please provide reference_names and reference_lengths")
                    self.header = makeAlignmentHeader(hdr)

            # set filename with reference sequences
            if self.is_cram and reference_filename:
                creference_filename = self.reference_filename
                hts_set_opt(self.htsfile,
                            CRAM_OPT_REFERENCE,
                            creference_filename)

            if check_sq and self.header.nreferences == 0:
                raise ValueError(
                    ("file has no sequences defined (mode='%s') - "
                     "is it SAM/BAM format? Consider opening with "
                     "check_sq=False") % mode)

            if self.is_bam or self.is_cram:
                self.index_filename = index_filename or filepath_index
                if self.index_filename:
                    cindexname = bfile_name = encode_filename(self.index_filename)

                if cfilename or cindexname:
                    with nogil:
                        self.index = sam_index_load3(self.htsfile, cfilename, cindexname,
                                                     HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL)

                    if not self.index and (cindexname or require_index):
                        if errno:
                            raise IOError(errno, force_str(strerror(errno)))
                        else:
                            raise IOError('unable to open index file `%s`' % self.index_filename)

                elif require_index:
                    raise IOError('unable to open index file')

        # save start of data section
        if not self.is_stream:
            self.start_offset = self.tell()

    def fetch(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              tid=None,
              until_eof=False,
              multiple_iterators=False,
              reference=None,
              end=None):
        """fetch reads aligned in a :term:`region`.

        See :meth:`~pysam.HTSFile.parse_region` for more information
        on how genomic regions can be specified. :term:`reference` and
        `end` are also accepted for backward compatibility as synonyms
        for :term:`contig` and `stop`, respectively.

        Without a `contig` or `region` all mapped reads in the file
        will be fetched. The reads will be returned ordered by reference
        sequence, which will not necessarily be the order within the
        file. This mode of iteration still requires an index. If there is
        no index, use `until_eof=True`.

        If only `contig` is set, all reads aligned to `contig`
        will be fetched.

        A :term:`SAM` file does not allow random access. If `region`
        or `contig` are given, an exception is raised.

        Parameters
        ----------

        until_eof : bool

            If `until_eof` is True, all reads from the current file
            position will be returned in order as they are within the
            file. Using this option will also fetch unmapped reads.

        multiple_iterators : bool

            If `multiple_iterators` is True, multiple
            iterators on the same file can be used at the same time. The
            iterator returned will receive its own copy of a filehandle to
            the file effectively re-opening the file. Re-opening a file
            creates some overhead, so beware.

        Returns
        -------

        An iterator over a collection of reads. : IteratorRow

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid or the
            file does not permit random access to genomic coordinates.

        """
        cdef int rtid, rstart, rstop, has_coord

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, tid,
            end=end, reference=reference)

        # Turn off re-opening if htsfile is a stream
        if self.is_stream:
            multiple_iterators = False

        if self.is_bam or self.is_cram:
            if not until_eof and not self.is_remote:
                if not self.has_index():
                    raise ValueError(
                        "fetch called on bamfile without index")

            if has_coord:
                return IteratorRowRegion(
                    self, rtid, rstart, rstop,
                    multiple_iterators=multiple_iterators)
            else:
                if until_eof:
                    return IteratorRowAll(
                        self,
                        multiple_iterators=multiple_iterators)
                else:
                    # AH: check - reason why no multiple_iterators for
                    # AllRefs?
                    return IteratorRowAllRefs(
                        self,
                        multiple_iterators=multiple_iterators)
        else:
            if has_coord:
                raise ValueError(
                    "fetching by region is not available for SAM files")

            if multiple_iterators == True:
                raise ValueError(
                    "multiple iterators not implemented for SAM files")

            return IteratorRowAll(self,
                                  multiple_iterators=multiple_iterators)

    def head(self, n, multiple_iterators=True):
        '''return an iterator over the first n alignments.

        This iterator is useful for inspecting the bam-file.

        Parameters
        ----------

        multiple_iterators : bool

            is set to True by default in order to
            avoid changing the current file position.

        Returns
        -------

        an iterator over a collection of reads : IteratorRowHead

        '''
        return IteratorRowHead(self, n,
                               multiple_iterators=multiple_iterators)

    def mate(self, AlignedSegment read):
        '''return the mate of :class:`pysam.AlignedSegment` `read`.

        .. note::

            Calling this method will change the file position.
            This might interfere with any iterators that have
            not re-opened the file.

        .. note::

            This method is too slow for high-throughput processing.
            If a read needs to be processed with its mate, work
            from a read name sorted file or, better, cache reads.

        Returns
        -------

        the mate : AlignedSegment

        Raises
        ------

        ValueError
            if the read is unpaired or the mate is unmapped

        '''
        cdef uint32_t flag = read._delegate.core.flag

        if flag & BAM_FPAIRED == 0:
            raise ValueError("read %s: is unpaired" %
                             (read.query_name))
        if flag & BAM_FMUNMAP != 0:
            raise ValueError("mate %s: is unmapped" %
                             (read.query_name))

        # xor flags to get the other mate
        cdef int x = BAM_FREAD1 + BAM_FREAD2
        flag = (flag ^ x) & x

        # Make sure to use a separate file to jump around
        # to mate as otherwise the original file position
        # will be lost
        # The following code is not using the C API and
        # could thus be made much quicker, for example
        # by using tell and seek.
        for mate in self.fetch(
                read._delegate.core.mpos,
                read._delegate.core.mpos + 1,
                tid=read._delegate.core.mtid,
                multiple_iterators=True):
            if mate.flag & flag != 0 and \
               mate.query_name == read.query_name:
                break
        else:
            raise ValueError("mate not found")

        return mate

    def pileup(self,
               contig=None,
               start=None,
               stop=None,
               region=None,
               reference=None,
               end=None,
               **kwargs):
        """perform a :term:`pileup` within a :term:`region`. The region is
        specified by :term:`contig`, `start` and `stop` (using
        0-based indexing). :term:`reference` and `end` are also accepted for
        backward compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a samtools 'region' string
        can be supplied.

        Without 'contig' or 'region' all reads will be used for the
        pileup. The reads will be returned ordered by
        :term:`contig` sequence, which will not necessarily be the
        order within the file.

        Note that :term:`SAM` formatted files do not allow random
        access. In these files, if a 'region' or 'contig' are
        given, an exception is raised.

        .. note::

            'all' reads which overlap the region are returned. The
            first base returned will be the first base of the first
            read 'not' necessarily the first base of the region used
            in the query.

        Parameters
        ----------

        truncate : bool

            By default, the samtools pileup engine outputs all reads
            overlapping a region. If truncate is True and a region is
            given, only columns in the exact region specified are
            returned.

        max_depth : int
            Maximum read depth permitted. The default limit is '8000'.

        stepper : string
            The stepper controls how the iterator advances.
            Possible options for the stepper are

            ``all``
                skip reads in which any of the following flags are set:
                BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP

            ``nofilter``
                uses every single read turning off any filtering.

            ``samtools``
                same filter and read processing as in samtools
                pileup. For full compatibility, this requires a
                'fastafile' to be given. The following options all pertain
                to filtering of the ``samtools`` stepper.

        fastafile : :class:`~pysam.FastaFile` object.

            This is required for some of the steppers.

        ignore_overlaps: bool

            If set to True, detect if read pairs overlap and only take
            the higher quality base. This is the default.

        flag_filter : int

            ignore reads where any of the bits in the flag are set. The default is
            BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP.

        flag_require : int

            only use reads where certain flags are set. The default is 0.

        ignore_orphans: bool

            ignore orphans (paired reads that are not in a proper pair).
            The default is to ignore orphans.

        min_base_quality: int

            Minimum base quality. Bases below the minimum quality will
            not be output. The default is 13.

        adjust_capq_threshold: int

            adjust mapping quality. The default is 0 for no
            adjustment. The recommended value for adjustment is 50.

        min_mapping_quality : int

            only use reads above a minimum mapping quality. The default is 0.

        compute_baq: bool

            re-alignment computing per-Base Alignment Qualities (BAQ). The
            default is to do re-alignment. Realignment requires a reference
            sequence. If none is present, no realignment will be performed.

        redo_baq: bool

            recompute per-Base Alignment Quality on the fly ignoring
            existing base qualities. The default is False (use existing
            base qualities).

        Returns
        -------

        an iterator over genomic positions. : IteratorColumn

        """
        cdef int rtid, has_coord
        cdef int32_t rstart, rstop

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        has_coord, rtid, rstart, rstop = self.parse_region(
            contig, start, stop, region, reference=reference, end=end)

        if has_coord:
            if not self.has_index():
                raise ValueError("no index available for pileup")

            return IteratorColumnRegion(self,
                                        tid=rtid,
                                        start=rstart,
                                        stop=rstop,
                                        **kwargs)
        else:
            if self.has_index():
                return IteratorColumnAllRefs(self, **kwargs)
            else:
                return IteratorColumnAll(self, **kwargs)

    def count(self,
              contig=None,
              start=None,
              stop=None,
              region=None,
              until_eof=False,
              read_callback="nofilter",
              reference=None,
              end=None):
        '''count the number of reads in :term:`region`

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively.
        Alternatively, a `samtools`_ :term:`region`
        string can be supplied.

        A :term:`SAM` file does not allow random access and if
        `region` or `contig` are given, an exception is raised.

        Parameters
        ----------

        contig : string
            reference_name of the genomic region (chromosome)

        start : int
            start of the genomic region (0-based inclusive)

        stop : int
            end of the genomic region (0-based exclusive)

        region : string
            a region string in samtools format.

        until_eof : bool
            count until the end of the file, possibly including
            unmapped reads as well.

        read_callback: string or function

            select a call-back to ignore reads when counting. It can
            be either a string with the following values:

            ``all``
                skip reads in which any of the following
                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
                BAM_FDUP

            ``nofilter``
                uses every single read

            Alternatively, `read_callback` can be a function
            ``check_read(read)`` that should return True only for
            those reads that shall be included in the counting.

        reference : string
            backward compatible synonym for `contig`

        end : int
            backward compatible synonym for `stop`

        Raises
        ------

        ValueError
            if the genomic coordinates are out of range or invalid.

        '''
        cdef AlignedSegment read
        cdef long counter = 0

        if not self.is_open:
            raise ValueError("I/O operation on closed file")

        cdef int filter_method = 0
        if read_callback == "all":
            filter_method = 1
        elif read_callback == "nofilter":
            filter_method = 2

        for read in self.fetch(contig=contig,
                               start=start,
                               stop=stop,
                               reference=reference,
                               end=end,
                               region=region,
                               until_eof=until_eof):
            # apply filter
            if filter_method == 1:
                # filter = "all"
                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                    continue
            elif filter_method == 2:
                # filter = "nofilter"
                pass
            else:
                if not read_callback(read):
                    continue
            counter += 1

        return counter

    @cython.boundscheck(False)  # we do manual bounds checking
    def count_coverage(self,
                       contig,
                       start=None,
                       stop=None,
                       region=None,
                       quality_threshold=15,
                       read_callback='all',
                       reference=None,
                       end=None):
        """count the coverage of genomic positions by reads in :term:`region`.

        The region is specified by :term:`contig`, `start` and `stop`.
        :term:`reference` and `end` are also accepted for backward
        compatibility as synonyms for :term:`contig` and `stop`,
        respectively. Alternatively, a `samtools`_ :term:`region`
        string can be supplied. The coverage is computed per-base [ACGT].
jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: jpayne@68: contig : string jpayne@68: reference_name of the genomic region (chromosome) jpayne@68: jpayne@68: start : int jpayne@68: start of the genomic region (0-based inclusive). If not jpayne@68: given, count from the start of the chromosome. jpayne@68: jpayne@68: stop : int jpayne@68: end of the genomic region (0-based exclusive). If not given, jpayne@68: count to the end of the chromosome. jpayne@68: jpayne@68: region : string jpayne@68: a region string. jpayne@68: jpayne@68: quality_threshold : int jpayne@68: quality_threshold is the minimum quality score (in phred) a jpayne@68: base has to reach to be counted. jpayne@68: jpayne@68: read_callback: string or function jpayne@68: jpayne@68: select a call-back to ignore reads when counting. It can jpayne@68: be either a string with the following values: jpayne@68: jpayne@68: ``all`` jpayne@68: skip reads in which any of the following jpayne@68: flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, jpayne@68: BAM_FDUP jpayne@68: jpayne@68: ``nofilter`` jpayne@68: uses every single read jpayne@68: jpayne@68: Alternatively, `read_callback` can be a function jpayne@68: ``check_read(read)`` that should return True only for jpayne@68: those reads that shall be included in the counting. jpayne@68: jpayne@68: reference : string jpayne@68: backward compatible synonym for `contig` jpayne@68: jpayne@68: end : int jpayne@68: backward compatible synonym for `stop` jpayne@68: jpayne@68: Raises jpayne@68: ------ jpayne@68: jpayne@68: ValueError jpayne@68: if the genomic coordinates are out of range or invalid. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: jpayne@68: four array.arrays of the same length in order A C G T : tuple jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: cdef uint32_t contig_length = self.get_reference_length(contig) jpayne@68: cdef int _start = start if start is not None else 0 jpayne@68: cdef int _stop = stop if stop is not None else contig_length jpayne@68: _stop = _stop if _stop < contig_length else contig_length jpayne@68: jpayne@68: if _stop == _start: jpayne@68: raise ValueError("interval of size 0") jpayne@68: if _stop < _start: jpayne@68: raise ValueError("interval of size less than 0") jpayne@68: jpayne@68: cdef int length = _stop - _start jpayne@68: cdef c_array.array int_array_template = array.array('L', []) jpayne@68: cdef c_array.array count_a jpayne@68: cdef c_array.array count_c jpayne@68: cdef c_array.array count_g jpayne@68: cdef c_array.array count_t jpayne@68: count_a = c_array.clone(int_array_template, length, zero=True) jpayne@68: count_c = c_array.clone(int_array_template, length, zero=True) jpayne@68: count_g = c_array.clone(int_array_template, length, zero=True) jpayne@68: count_t = c_array.clone(int_array_template, length, zero=True) jpayne@68: jpayne@68: cdef AlignedSegment read jpayne@68: cdef cython.str seq jpayne@68: cdef c_array.array quality jpayne@68: cdef int qpos jpayne@68: cdef int refpos jpayne@68: cdef int c = 0 jpayne@68: cdef int filter_method = 0 jpayne@68: jpayne@68: jpayne@68: if read_callback == "all": jpayne@68: filter_method = 1 jpayne@68: elif read_callback == "nofilter": jpayne@68: filter_method = 2 jpayne@68: jpayne@68: cdef int _threshold = quality_threshold or 0 jpayne@68: for read in self.fetch(contig=contig, jpayne@68: reference=reference, jpayne@68: start=start, jpayne@68: stop=stop, jpayne@68: end=end, jpayne@68: region=region): jpayne@68: # apply filter jpayne@68: if filter_method == 1: 
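jpayne@68:                 # flag bits tested below: 0x4 = BAM_FUNMAP,
jpayne@68:                 # 0x100 = BAM_FSECONDARY, 0x200 = BAM_FQCFAIL, 0x400 = BAM_FDUP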
jpayne@68: # filter = "all" jpayne@68: if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)): jpayne@68: continue jpayne@68: elif filter_method == 2: jpayne@68: # filter = "nofilter" jpayne@68: pass jpayne@68: else: jpayne@68: if not read_callback(read): jpayne@68: continue jpayne@68: jpayne@68: # count jpayne@68: seq = read.seq jpayne@68: if seq is None: jpayne@68: continue jpayne@68: quality = read.query_qualities jpayne@68: jpayne@68: for qpos, refpos in read.get_aligned_pairs(True): jpayne@68: if qpos is not None and refpos is not None and \ jpayne@68: _start <= refpos < _stop: jpayne@68: jpayne@68: # only check base quality if _threshold > 0 jpayne@68: if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold: jpayne@68: if seq[qpos] == 'A': jpayne@68: count_a.data.as_ulongs[refpos - _start] += 1 jpayne@68: if seq[qpos] == 'C': jpayne@68: count_c.data.as_ulongs[refpos - _start] += 1 jpayne@68: if seq[qpos] == 'G': jpayne@68: count_g.data.as_ulongs[refpos - _start] += 1 jpayne@68: if seq[qpos] == 'T': jpayne@68: count_t.data.as_ulongs[refpos - _start] += 1 jpayne@68: jpayne@68: return count_a, count_c, count_g, count_t jpayne@68: jpayne@68: def find_introns_slow(self, read_iterator): jpayne@68: """Return a dictionary {(start, stop): count} jpayne@68: Listing the intronic sites in the reads (identified by 'N' in the cigar strings), jpayne@68: and their support ( = number of reads ). jpayne@68: jpayne@68: read_iterator can be the result of a .fetch(...) call. jpayne@68: Or it can be a generator filtering such reads. Example jpayne@68: samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) jpayne@68: """ jpayne@68: res = collections.Counter() jpayne@68: for r in read_iterator: jpayne@68: if 'N' in r.cigarstring: jpayne@68: last_read_pos = False jpayne@68: for read_loc, genome_loc in r.get_aligned_pairs(): jpayne@68: if read_loc is None and last_read_pos: jpayne@68: start = genome_loc jpayne@68: elif read_loc and last_read_pos is None: jpayne@68: stop = genome_loc # we are right exclusive ,so this is correct jpayne@68: res[(start, stop)] += 1 jpayne@68: del start jpayne@68: del stop jpayne@68: last_read_pos = read_loc jpayne@68: return res jpayne@68: jpayne@68: def find_introns(self, read_iterator): jpayne@68: """Return a dictionary {(start, stop): count} jpayne@68: Listing the intronic sites in the reads (identified by 'N' in the cigar strings), jpayne@68: and their support ( = number of reads ). jpayne@68: jpayne@68: read_iterator can be the result of a .fetch(...) call. jpayne@68: Or it can be a generator filtering such reads. Example jpayne@68: samfile.find_introns((read for read in samfile.fetch(...) 
if read.is_reverse) jpayne@68: """ jpayne@68: cdef: jpayne@68: uint32_t base_position, junc_start, nt jpayne@68: int op jpayne@68: AlignedSegment r jpayne@68: int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP jpayne@68: jpayne@68: res = collections.Counter() jpayne@68: jpayne@68: match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position jpayne@68: for r in read_iterator: jpayne@68: base_position = r.pos jpayne@68: cigar = r.cigartuples jpayne@68: if cigar is None: jpayne@68: continue jpayne@68: jpayne@68: for op, nt in cigar: jpayne@68: if op in match_or_deletion: jpayne@68: base_position += nt jpayne@68: elif op == BAM_CREF_SKIP: jpayne@68: junc_start = base_position jpayne@68: base_position += nt jpayne@68: res[(junc_start, base_position)] += 1 jpayne@68: return res jpayne@68: jpayne@68: jpayne@68: def close(self): jpayne@68: '''closes the :class:`pysam.AlignmentFile`.''' jpayne@68: jpayne@68: if self.htsfile == NULL: jpayne@68: return jpayne@68: jpayne@68: if self.index != NULL: jpayne@68: hts_idx_destroy(self.index) jpayne@68: self.index = NULL jpayne@68: jpayne@68: cdef int ret = hts_close(self.htsfile) jpayne@68: self.htsfile = NULL jpayne@68: jpayne@68: self.header = None jpayne@68: jpayne@68: if ret < 0: jpayne@68: global errno jpayne@68: if errno == EPIPE: jpayne@68: errno = 0 jpayne@68: else: jpayne@68: raise IOError(errno, force_str(strerror(errno))) jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: cdef int ret = 0 jpayne@68: jpayne@68: if self.index != NULL: jpayne@68: hts_idx_destroy(self.index) jpayne@68: self.index = NULL jpayne@68: jpayne@68: if self.htsfile != NULL: jpayne@68: ret = hts_close(self.htsfile) jpayne@68: self.htsfile = NULL jpayne@68: jpayne@68: self.header = None jpayne@68: jpayne@68: if self.b: jpayne@68: bam_destroy1(self.b) jpayne@68: self.b = NULL jpayne@68: jpayne@68: if ret < 0: jpayne@68: global errno jpayne@68: if errno == EPIPE: jpayne@68: errno = 0 jpayne@68: else: jpayne@68: raise IOError(errno, force_str(strerror(errno))) jpayne@68: jpayne@68: cpdef int write(self, AlignedSegment read) except -1: jpayne@68: ''' jpayne@68: write a single :class:`pysam.AlignedSegment` to disk. jpayne@68: jpayne@68: Raises: jpayne@68: ValueError jpayne@68: if the writing failed jpayne@68: jpayne@68: Returns: jpayne@68: int : jpayne@68: the number of bytes written. If the file is closed, jpayne@68: this will be 0. jpayne@68: ''' jpayne@68: if not self.is_open: jpayne@68: return 0 jpayne@68: jpayne@68: if self.header.ptr.n_targets <= read._delegate.core.tid: jpayne@68: raise ValueError( jpayne@68: "AlignedSegment refers to reference number {} that " jpayne@68: "is larger than the number of references ({}) in the header".format( jpayne@68: read._delegate.core.tid, self.header.ptr.n_targets)) jpayne@68: jpayne@68: cdef int ret jpayne@68: with nogil: jpayne@68: ret = sam_write1(self.htsfile, jpayne@68: self.header.ptr, jpayne@68: read._delegate) jpayne@68: jpayne@68: # kbj: Still need to raise an exception with except -1. Otherwise jpayne@68: # when ret == -1 we get a "SystemError: error return without jpayne@68: # exception set". 
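jpayne@68:         # As documented above, a non-negative return value from sam_write1
jpayne@68:         # is the number of bytes written; a negative value signals an error.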
jpayne@68: if ret < 0: jpayne@68: raise IOError( jpayne@68: "sam_write1 failed with error code {}".format(ret)) jpayne@68: jpayne@68: return ret jpayne@68: jpayne@68: # context manager interface jpayne@68: def __enter__(self): jpayne@68: return self jpayne@68: jpayne@68: def __exit__(self, exc_type, exc_value, traceback): jpayne@68: self.close() jpayne@68: return False jpayne@68: jpayne@68: ############################################################### jpayne@68: ############################################################### jpayne@68: ############################################################### jpayne@68: ## properties jpayne@68: ############################################################### jpayne@68: property mapped: jpayne@68: """int with total number of mapped alignments according to the jpayne@68: statistics recorded in the index. This is a read-only jpayne@68: attribute. jpayne@68: (This will be 0 for a CRAM file indexed by a .crai index, as that jpayne@68: index format does not record these statistics.) jpayne@68: """ jpayne@68: def __get__(self): jpayne@68: self.check_index() jpayne@68: cdef int tid jpayne@68: cdef uint64_t total = 0 jpayne@68: cdef uint64_t mapped, unmapped jpayne@68: for tid from 0 <= tid < self.header.nreferences: jpayne@68: with nogil: jpayne@68: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) jpayne@68: total += mapped jpayne@68: return total jpayne@68: jpayne@68: property unmapped: jpayne@68: """int with total number of unmapped reads according to the statistics jpayne@68: recorded in the index. This number of reads includes the number of reads jpayne@68: without coordinates. This is a read-only attribute. jpayne@68: (This will be 0 for a CRAM file indexed by a .crai index, as that jpayne@68: index format does not record these statistics.) jpayne@68: """ jpayne@68: def __get__(self): jpayne@68: self.check_index() jpayne@68: cdef int tid jpayne@68: cdef uint64_t total = hts_idx_get_n_no_coor(self.index) jpayne@68: cdef uint64_t mapped, unmapped jpayne@68: for tid from 0 <= tid < self.header.nreferences: jpayne@68: with nogil: jpayne@68: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) jpayne@68: total += unmapped jpayne@68: return total jpayne@68: jpayne@68: property nocoordinate: jpayne@68: """int with total number of reads without coordinates according to the jpayne@68: statistics recorded in the index, i.e., the statistic printed for "*" jpayne@68: by the ``samtools idxstats`` command. This is a read-only attribute. jpayne@68: (This will be 0 for a CRAM file indexed by a .crai index, as that jpayne@68: index format does not record these statistics.) jpayne@68: """ jpayne@68: def __get__(self): jpayne@68: self.check_index() jpayne@68: cdef uint64_t n jpayne@68: with nogil: jpayne@68: n = hts_idx_get_n_no_coor(self.index) jpayne@68: return n jpayne@68: jpayne@68: def get_index_statistics(self): jpayne@68: """return statistics about mapped/unmapped reads per chromosome as jpayne@68: they are stored in the index, similarly to the statistics printed jpayne@68: by the ``samtools idxstats`` command. jpayne@68: jpayne@68: CRAI indexes do not record these statistics, so for a CRAM file jpayne@68: with a .crai index the returned statistics will all be 0. jpayne@68: jpayne@68: Returns: jpayne@68: list : jpayne@68: a list of records for each chromosome. Each record has the jpayne@68: attributes 'contig', 'mapped', 'unmapped' and 'total'. 
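jpayne@68:
jpayne@68:         For example (the file name is illustrative only)::
jpayne@68:
jpayne@68:             with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
jpayne@68:                 for stat in samfile.get_index_statistics():
jpayne@68:                     print(stat.contig, stat.mapped, stat.unmapped, stat.total)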
jpayne@68: """ jpayne@68: jpayne@68: self.check_index() jpayne@68: cdef int tid jpayne@68: cdef uint64_t mapped, unmapped jpayne@68: results = [] jpayne@68: # TODO: use header jpayne@68: for tid from 0 <= tid < self.nreferences: jpayne@68: with nogil: jpayne@68: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) jpayne@68: results.append( jpayne@68: IndexStats._make(( jpayne@68: self.get_reference_name(tid), jpayne@68: mapped, jpayne@68: unmapped, jpayne@68: mapped + unmapped))) jpayne@68: jpayne@68: return results jpayne@68: jpayne@68: ############################################################### jpayne@68: ## file-object like iterator access jpayne@68: ## note: concurrent access will cause errors (see IteratorRow jpayne@68: ## and multiple_iterators) jpayne@68: ## Possible solutions: deprecate or open new file handle jpayne@68: def __iter__(self): jpayne@68: if not self.is_open: jpayne@68: raise ValueError("I/O operation on closed file") jpayne@68: jpayne@68: if not self.is_bam and self.header.nreferences == 0: jpayne@68: raise NotImplementedError( jpayne@68: "can not iterate over samfile without header") jpayne@68: return self jpayne@68: jpayne@68: cdef bam1_t * getCurrent(self): jpayne@68: return self.b jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: ''' jpayne@68: cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`. jpayne@68: ''' jpayne@68: cdef int ret jpayne@68: cdef bam_hdr_t * hdr = self.header.ptr jpayne@68: with nogil: jpayne@68: ret = sam_read1(self.htsfile, jpayne@68: hdr, jpayne@68: self.b) jpayne@68: return ret jpayne@68: jpayne@68: def __next__(self): jpayne@68: cdef int ret = self.cnext() jpayne@68: if ret >= 0: jpayne@68: return makeAlignedSegment(self.b, self.header) jpayne@68: elif ret == -1: jpayne@68: raise StopIteration jpayne@68: else: jpayne@68: raise IOError(read_failure_reason(ret)) jpayne@68: jpayne@68: ########################################### jpayne@68: # methods/properties referencing the header jpayne@68: def is_valid_tid(self, int tid): jpayne@68: """ jpayne@68: return True if the numerical :term:`tid` is valid; False otherwise. jpayne@68: jpayne@68: Note that the unmapped tid code (-1) counts as an invalid. jpayne@68: """ jpayne@68: if self.header is None: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: return self.header.is_valid_tid(tid) jpayne@68: jpayne@68: def get_tid(self, reference): jpayne@68: """ jpayne@68: return the numerical :term:`tid` corresponding to jpayne@68: :term:`reference` jpayne@68: jpayne@68: returns -1 if reference is not known. jpayne@68: """ jpayne@68: if self.header is None: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: return self.header.get_tid(reference) jpayne@68: jpayne@68: def get_reference_name(self, tid): jpayne@68: """ jpayne@68: return :term:`reference` name corresponding to numerical :term:`tid` jpayne@68: """ jpayne@68: if self.header is None: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: return self.header.get_reference_name(tid) jpayne@68: jpayne@68: def get_reference_length(self, reference): jpayne@68: """ jpayne@68: return :term:`reference` length corresponding to numerical :term:`tid` jpayne@68: """ jpayne@68: if self.header is None: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: return self.header.get_reference_length(reference) jpayne@68: jpayne@68: property nreferences: jpayne@68: """int with the number of :term:`reference` sequences in the file. 
jpayne@68: This is a read-only attribute.""" jpayne@68: def __get__(self): jpayne@68: if self.header: jpayne@68: return self.header.nreferences jpayne@68: else: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: jpayne@68: property references: jpayne@68: """tuple with the names of :term:`reference` sequences. This is a jpayne@68: read-only attribute""" jpayne@68: def __get__(self): jpayne@68: if self.header: jpayne@68: return self.header.references jpayne@68: else: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: jpayne@68: property lengths: jpayne@68: """tuple of the lengths of the :term:`reference` sequences. This is a jpayne@68: read-only attribute. The lengths are in the same order as jpayne@68: :attr:`pysam.AlignmentFile.references` jpayne@68: jpayne@68: """ jpayne@68: def __get__(self): jpayne@68: if self.header: jpayne@68: return self.header.lengths jpayne@68: else: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: jpayne@68: # Compatibility functions for pysam < 0.14 jpayne@68: property text: jpayne@68: """deprecated, use :attr:`references` and :attr:`lengths` instead""" jpayne@68: def __get__(self): jpayne@68: if self.header: jpayne@68: return self.header.__str__() jpayne@68: else: jpayne@68: raise ValueError("header not available in closed files") jpayne@68: jpayne@68: # Compatibility functions for pysam < 0.8.3 jpayne@68: def gettid(self, reference): jpayne@68: """deprecated, use :meth:`get_tid` instead""" jpayne@68: return self.get_tid(reference) jpayne@68: jpayne@68: def getrname(self, tid): jpayne@68: """deprecated, use :meth:`get_reference_name` instead""" jpayne@68: return self.get_reference_name(tid) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRow: jpayne@68: '''abstract base class for iterators over mapped reads. jpayne@68: jpayne@68: Various iterators implement different behaviours for wrapping around jpayne@68: contig boundaries. Examples include: jpayne@68: jpayne@68: :class:`pysam.IteratorRowRegion` jpayne@68: iterate within a single contig and a defined region. jpayne@68: jpayne@68: :class:`pysam.IteratorRowAll` jpayne@68: iterate until EOF. This iterator will also include unmapped reads. jpayne@68: jpayne@68: :class:`pysam.IteratorRowAllRefs` jpayne@68: iterate over all reads in all reference sequences. jpayne@68: jpayne@68: The method :meth:`AlignmentFile.fetch` returns an IteratorRow. jpayne@68: jpayne@68: .. note:: jpayne@68: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a jpayne@68: :meth:`AlignmentFile.fetch`. jpayne@68: jpayne@68: ''' jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, int multiple_iterators=False): jpayne@68: cdef char *cfilename jpayne@68: cdef char *creference_filename jpayne@68: cdef char *cindexname = NULL jpayne@68: jpayne@68: if not samfile.is_open: jpayne@68: raise ValueError("I/O operation on closed file") jpayne@68: jpayne@68: # makes sure that samfile stays alive as long as the jpayne@68: # iterator is alive jpayne@68: self.samfile = samfile jpayne@68: jpayne@68: # reopen the file - note that this makes the iterator jpayne@68: # slow and causes pileup to slow down significantly. 
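jpayne@68:         # Re-opening gives this iterator its own htsfile handle and file
jpayne@68:         # position, so that it does not interfere with other iterators or
jpayne@68:         # reads on the same AlignmentFile (see multiple_iterators).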
jpayne@68: if multiple_iterators: jpayne@68: jpayne@68: cfilename = samfile.filename jpayne@68: with nogil: jpayne@68: self.htsfile = hts_open(cfilename, 'r') jpayne@68: assert self.htsfile != NULL jpayne@68: jpayne@68: if samfile.has_index(): jpayne@68: if samfile.index_filename: jpayne@68: cindexname = bindex_filename = encode_filename(samfile.index_filename) jpayne@68: with nogil: jpayne@68: self.index = sam_index_load2(self.htsfile, cfilename, cindexname) jpayne@68: else: jpayne@68: self.index = NULL jpayne@68: jpayne@68: # need to advance in newly opened file to position after header jpayne@68: # better: use seek/tell? jpayne@68: with nogil: jpayne@68: hdr = sam_hdr_read(self.htsfile) jpayne@68: if hdr is NULL: jpayne@68: raise IOError("unable to read header information") jpayne@68: self.header = makeAlignmentHeader(hdr) jpayne@68: jpayne@68: self.owns_samfile = True jpayne@68: jpayne@68: # options specific to CRAM files jpayne@68: if samfile.is_cram and samfile.reference_filename: jpayne@68: creference_filename = samfile.reference_filename jpayne@68: hts_set_opt(self.htsfile, jpayne@68: CRAM_OPT_REFERENCE, jpayne@68: creference_filename) jpayne@68: jpayne@68: else: jpayne@68: self.htsfile = samfile.htsfile jpayne@68: self.index = samfile.index jpayne@68: self.owns_samfile = False jpayne@68: self.header = samfile.header jpayne@68: jpayne@68: self.retval = 0 jpayne@68: jpayne@68: self.b = bam_init1() jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: bam_destroy1(self.b) jpayne@68: if self.owns_samfile: jpayne@68: hts_idx_destroy(self.index) jpayne@68: hts_close(self.htsfile) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRowRegion(IteratorRow): jpayne@68: """*(AlignmentFile samfile, int tid, int beg, int stop, jpayne@68: int multiple_iterators=False)* jpayne@68: jpayne@68: iterate over mapped reads in a region. jpayne@68: jpayne@68: .. note:: jpayne@68: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a jpayne@68: :meth:`AlignmentFile.fetch`. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, jpayne@68: int tid, int beg, int stop, jpayne@68: int multiple_iterators=False): jpayne@68: jpayne@68: if not samfile.has_index(): jpayne@68: raise ValueError("no index available for iteration") jpayne@68: jpayne@68: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@68: with nogil: jpayne@68: self.iter = sam_itr_queryi( jpayne@68: self.index, jpayne@68: tid, jpayne@68: beg, jpayne@68: stop) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: cdef bam1_t * getCurrent(self): jpayne@68: return self.b jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: '''cversion of iterator. Used by IteratorColumn''' jpayne@68: with nogil: jpayne@68: self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile), jpayne@68: self.iter, jpayne@68: self.b, jpayne@68: self.htsfile) jpayne@68: jpayne@68: def __next__(self): jpayne@68: self.cnext() jpayne@68: if self.retval >= 0: jpayne@68: return makeAlignedSegment(self.b, self.header) jpayne@68: elif self.retval == -1: jpayne@68: raise StopIteration jpayne@68: elif self.retval == -2: jpayne@68: # Note: it is currently not the case that hts_iter_next jpayne@68: # returns -2 for a truncated file. 
jpayne@68: # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625 jpayne@68: raise IOError('truncated file') jpayne@68: else: jpayne@68: raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval)) jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: hts_itr_destroy(self.iter) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRowHead(IteratorRow): jpayne@68: """*(AlignmentFile samfile, n, int multiple_iterators=False)* jpayne@68: jpayne@68: iterate over first n reads in `samfile` jpayne@68: jpayne@68: .. note:: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a jpayne@68: :meth:`AlignmentFile.head`. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, jpayne@68: AlignmentFile samfile, jpayne@68: int n, jpayne@68: int multiple_iterators=False): jpayne@68: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@68: jpayne@68: self.max_rows = n jpayne@68: self.current_row = 0 jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: cdef bam1_t * getCurrent(self): jpayne@68: return self.b jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: '''cversion of iterator. Used by IteratorColumn''' jpayne@68: cdef int ret jpayne@68: cdef bam_hdr_t * hdr = self.header.ptr jpayne@68: with nogil: jpayne@68: ret = sam_read1(self.htsfile, jpayne@68: hdr, jpayne@68: self.b) jpayne@68: return ret jpayne@68: jpayne@68: def __next__(self): jpayne@68: if self.current_row >= self.max_rows: jpayne@68: raise StopIteration jpayne@68: jpayne@68: cdef int ret = self.cnext() jpayne@68: if ret >= 0: jpayne@68: self.current_row += 1 jpayne@68: return makeAlignedSegment(self.b, self.header) jpayne@68: elif ret == -1: jpayne@68: raise StopIteration jpayne@68: else: jpayne@68: raise IOError(read_failure_reason(ret)) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRowAll(IteratorRow): jpayne@68: """*(AlignmentFile samfile, int multiple_iterators=False)* jpayne@68: jpayne@68: iterate over all reads in `samfile` jpayne@68: jpayne@68: .. note:: jpayne@68: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a jpayne@68: :meth:`AlignmentFile.fetch`. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, int multiple_iterators=False): jpayne@68: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: cdef bam1_t * getCurrent(self): jpayne@68: return self.b jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: '''cversion of iterator. Used by IteratorColumn''' jpayne@68: cdef int ret jpayne@68: cdef bam_hdr_t * hdr = self.header.ptr jpayne@68: with nogil: jpayne@68: ret = sam_read1(self.htsfile, jpayne@68: hdr, jpayne@68: self.b) jpayne@68: return ret jpayne@68: jpayne@68: def __next__(self): jpayne@68: cdef int ret = self.cnext() jpayne@68: if ret >= 0: jpayne@68: return makeAlignedSegment(self.b, self.header) jpayne@68: elif ret == -1: jpayne@68: raise StopIteration jpayne@68: else: jpayne@68: raise IOError(read_failure_reason(ret)) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRowAllRefs(IteratorRow): jpayne@68: """iterates over all mapped reads by chaining iterators over each jpayne@68: reference jpayne@68: jpayne@68: .. 
note:: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a jpayne@68: :meth:`AlignmentFile.fetch`. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, multiple_iterators=False): jpayne@68: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@68: jpayne@68: if not samfile.has_index(): jpayne@68: raise ValueError("no index available for fetch") jpayne@68: jpayne@68: self.tid = -1 jpayne@68: jpayne@68: def nextiter(self): jpayne@68: # get a new iterator for a chromosome. The file jpayne@68: # will not be re-opened. jpayne@68: self.rowiter = IteratorRowRegion(self.samfile, jpayne@68: self.tid, jpayne@68: 0, jpayne@68: MAX_POS) jpayne@68: # set htsfile and header of the rowiter jpayne@68: # to the values in this iterator to reflect multiple_iterators jpayne@68: self.rowiter.htsfile = self.htsfile jpayne@68: self.rowiter.header = self.header jpayne@68: jpayne@68: # make sure the iterator understand that IteratorRowAllRefs jpayne@68: # has ownership jpayne@68: self.rowiter.owns_samfile = False jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: def __next__(self): jpayne@68: # Create an initial iterator jpayne@68: if self.tid == -1: jpayne@68: if not self.samfile.nreferences: jpayne@68: raise StopIteration jpayne@68: self.tid = 0 jpayne@68: self.nextiter() jpayne@68: jpayne@68: while 1: jpayne@68: self.rowiter.cnext() jpayne@68: jpayne@68: # If current iterator is not exhausted, return aligned read jpayne@68: if self.rowiter.retval > 0: jpayne@68: return makeAlignedSegment(self.rowiter.b, self.header) jpayne@68: jpayne@68: self.tid += 1 jpayne@68: jpayne@68: # Otherwise, proceed to next reference or stop jpayne@68: if self.tid < self.samfile.nreferences: jpayne@68: self.nextiter() jpayne@68: else: jpayne@68: raise StopIteration jpayne@68: jpayne@68: jpayne@68: cdef class IteratorRowSelection(IteratorRow): jpayne@68: """*(AlignmentFile samfile)* jpayne@68: jpayne@68: iterate over reads in `samfile` at a given list of file positions. jpayne@68: jpayne@68: .. note:: jpayne@68: It is usually not necessary to create an object of this class jpayne@68: explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`. 
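jpayne@68:
jpayne@68:     It is also the iterator type returned by :meth:`IndexedReads.find`.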
jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True): jpayne@68: super().__init__(samfile, multiple_iterators=multiple_iterators) jpayne@68: jpayne@68: self.positions = positions jpayne@68: self.current_pos = 0 jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: cdef bam1_t * getCurrent(self): jpayne@68: return self.b jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: '''cversion of iterator''' jpayne@68: # end iteration if out of positions jpayne@68: if self.current_pos >= len(self.positions): return -1 jpayne@68: jpayne@68: cdef uint64_t pos = self.positions[self.current_pos] jpayne@68: with nogil: jpayne@68: bgzf_seek(hts_get_bgzfp(self.htsfile), jpayne@68: pos, jpayne@68: 0) jpayne@68: self.current_pos += 1 jpayne@68: jpayne@68: cdef int ret jpayne@68: cdef bam_hdr_t * hdr = self.header.ptr jpayne@68: with nogil: jpayne@68: ret = sam_read1(self.htsfile, jpayne@68: hdr, jpayne@68: self.b) jpayne@68: return ret jpayne@68: jpayne@68: def __next__(self): jpayne@68: cdef int ret = self.cnext() jpayne@68: if ret >= 0: jpayne@68: return makeAlignedSegment(self.b, self.header) jpayne@68: elif ret == -1: jpayne@68: raise StopIteration jpayne@68: else: jpayne@68: raise IOError(read_failure_reason(ret)) jpayne@68: jpayne@68: jpayne@68: cdef int __advance_nofilter(void *data, bam1_t *b): jpayne@68: '''advance without any read filtering. jpayne@68: ''' jpayne@68: cdef __iterdata * d = <__iterdata*>data jpayne@68: cdef int ret jpayne@68: with nogil: jpayne@68: ret = sam_itr_next(d.htsfile, d.iter, b) jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef int __advance_raw_nofilter(void *data, bam1_t *b): jpayne@68: '''advance (without iterator) without any read filtering. jpayne@68: ''' jpayne@68: cdef __iterdata * d = <__iterdata*>data jpayne@68: cdef int ret jpayne@68: with nogil: jpayne@68: ret = sam_read1(d.htsfile, d.header, b) jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef int __advance_all(void *data, bam1_t *b): jpayne@68: '''only use reads for pileup passing basic filters such as jpayne@68: jpayne@68: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP jpayne@68: ''' jpayne@68: jpayne@68: cdef __iterdata * d = <__iterdata*>data jpayne@68: cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP jpayne@68: cdef int ret jpayne@68: while 1: jpayne@68: with nogil: jpayne@68: ret = sam_itr_next(d.htsfile, d.iter, b) jpayne@68: if ret < 0: jpayne@68: break jpayne@68: if b.core.flag & d.flag_filter: jpayne@68: continue jpayne@68: break jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef int __advance_raw_all(void *data, bam1_t *b): jpayne@68: '''only use reads for pileup passing basic filters such as jpayne@68: jpayne@68: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP jpayne@68: ''' jpayne@68: jpayne@68: cdef __iterdata * d = <__iterdata*>data jpayne@68: cdef int ret jpayne@68: while 1: jpayne@68: with nogil: jpayne@68: ret = sam_read1(d.htsfile, d.header, b) jpayne@68: if ret < 0: jpayne@68: break jpayne@68: if b.core.flag & d.flag_filter: jpayne@68: continue jpayne@68: break jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef int __advance_samtools(void * data, bam1_t * b): jpayne@68: '''advance using same filter and read processing as in jpayne@68: the samtools pileup. 
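jpayne@68:
jpayne@68:     In outline: reads failing flag_filter/flag_require are skipped; if a
jpayne@68:     reference sequence is available, reads may be BAQ-realigned and their
jpayne@68:     mapping quality capped (adjust_capq_threshold); reads below
jpayne@68:     min_mapping_quality and, if ignore_orphans is set, orphan reads are
jpayne@68:     skipped as well.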
jpayne@68: ''' jpayne@68: cdef __iterdata * d = <__iterdata*>data jpayne@68: cdef int ret jpayne@68: cdef int q jpayne@68: jpayne@68: while 1: jpayne@68: with nogil: jpayne@68: ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b) jpayne@68: if ret < 0: jpayne@68: break jpayne@68: if b.core.flag & d.flag_filter: jpayne@68: continue jpayne@68: if d.flag_require and not (b.core.flag & d.flag_require): jpayne@68: continue jpayne@68: jpayne@68: # reload sequence jpayne@68: if d.fastafile != NULL and b.core.tid != d.tid: jpayne@68: if d.seq != NULL: jpayne@68: free(d.seq) jpayne@68: d.tid = b.core.tid jpayne@68: with nogil: jpayne@68: d.seq = faidx_fetch_seq( jpayne@68: d.fastafile, jpayne@68: d.header.target_name[d.tid], jpayne@68: 0, MAX_POS, jpayne@68: &d.seq_len) jpayne@68: jpayne@68: if d.seq == NULL: jpayne@68: raise ValueError( jpayne@68: "reference sequence for '{}' (tid={}) not found".format( jpayne@68: d.header.target_name[d.tid], d.tid)) jpayne@68: jpayne@68: # realign read - changes base qualities jpayne@68: if d.seq != NULL and d.compute_baq: jpayne@68: # 4th option to realign is flag: jpayne@68: # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4 jpayne@68: if d.redo_baq: jpayne@68: sam_prob_realn(b, d.seq, d.seq_len, 7) jpayne@68: else: jpayne@68: sam_prob_realn(b, d.seq, d.seq_len, 3) jpayne@68: jpayne@68: if d.seq != NULL and d.adjust_capq_threshold > 10: jpayne@68: q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold) jpayne@68: if q < 0: jpayne@68: continue jpayne@68: elif b.core.qual > q: jpayne@68: b.core.qual = q jpayne@68: jpayne@68: if b.core.qual < d.min_mapping_quality: jpayne@68: continue jpayne@68: if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR): jpayne@68: continue jpayne@68: jpayne@68: break jpayne@68: jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef class IteratorColumn: jpayne@68: '''abstract base class for iterators over columns. jpayne@68: jpayne@68: IteratorColumn objects wrap the pileup functionality of samtools. jpayne@68: jpayne@68: For reasons of efficiency, the iterator points to the current jpayne@68: pileup buffer. The pileup buffer is updated at every iteration. jpayne@68: This might cause some unexpected behaviour. For example, jpayne@68: consider the conversion to a list:: jpayne@68: jpayne@68: f = AlignmentFile("file.bam", "rb") jpayne@68: result = list(f.pileup()) jpayne@68: jpayne@68: Here, ``result`` will contain ``n`` objects of type jpayne@68: :class:`~pysam.PileupColumn` for ``n`` columns, but each object in jpayne@68: ``result`` will contain the same information. jpayne@68: jpayne@68: The desired behaviour can be achieved by list comprehension:: jpayne@68: jpayne@68: result = [x.pileups() for x in f.pileup()] jpayne@68: jpayne@68: ``result`` will be a list of ``n`` lists of objects of type jpayne@68: :class:`~pysam.PileupRead`. jpayne@68: jpayne@68: If the iterator is associated with a :class:`~pysam.Fastafile` jpayne@68: using the :meth:`add_reference` method, then the iterator will jpayne@68: export the current sequence via the methods :meth:`get_sequence` jpayne@68: and :meth:`seq_len`. jpayne@68: jpayne@68: See :class:`~AlignmentFile.pileup` for kwargs to the iterator. 
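jpayne@68:
jpayne@68:     A typical iteration over the columns of a region, using the file ``f``
jpayne@68:     opened above (contig name and coordinates are illustrative only)::
jpayne@68:
jpayne@68:         for column in f.pileup("chr1", 100, 200):
jpayne@68:             print(column.reference_pos, column.nsegments)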
jpayne@68: ''' jpayne@68: jpayne@68: def __cinit__( self, AlignmentFile samfile, **kwargs): jpayne@68: self.samfile = samfile jpayne@68: self.fastafile = kwargs.get("fastafile", None) jpayne@68: self.stepper = kwargs.get("stepper", "samtools") jpayne@68: self.max_depth = kwargs.get("max_depth", 8000) jpayne@68: self.ignore_overlaps = kwargs.get("ignore_overlaps", True) jpayne@68: self.min_base_quality = kwargs.get("min_base_quality", 13) jpayne@68: self.iterdata.seq = NULL jpayne@68: self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0) jpayne@68: self.iterdata.flag_require = kwargs.get("flag_require", 0) jpayne@68: self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) jpayne@68: self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0) jpayne@68: self.iterdata.compute_baq = kwargs.get("compute_baq", True) jpayne@68: self.iterdata.redo_baq = kwargs.get("redo_baq", False) jpayne@68: self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True) jpayne@68: jpayne@68: self.tid = 0 jpayne@68: self.pos = 0 jpayne@68: self.n_plp = 0 jpayne@68: self.plp = NULL jpayne@68: self.pileup_iter = NULL jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: cdef int cnext(self): jpayne@68: '''perform next iteration. jpayne@68: ''' jpayne@68: # do not release gil here because of call-backs jpayne@68: cdef int ret = bam_mplp_auto(self.pileup_iter, jpayne@68: &self.tid, jpayne@68: &self.pos, jpayne@68: &self.n_plp, jpayne@68: &self.plp) jpayne@68: return ret jpayne@68: jpayne@68: cdef char * get_sequence(self): jpayne@68: '''return current reference sequence underlying the iterator. jpayne@68: ''' jpayne@68: return self.iterdata.seq jpayne@68: jpayne@68: property seq_len: jpayne@68: '''current sequence length.''' jpayne@68: def __get__(self): jpayne@68: return self.iterdata.seq_len jpayne@68: jpayne@68: def add_reference(self, FastaFile fastafile): jpayne@68: ''' jpayne@68: add reference sequences in `fastafile` to iterator.''' jpayne@68: self.fastafile = fastafile jpayne@68: if self.iterdata.seq != NULL: jpayne@68: free(self.iterdata.seq) jpayne@68: self.iterdata.tid = -1 jpayne@68: self.iterdata.fastafile = self.fastafile.fastafile jpayne@68: jpayne@68: def has_reference(self): jpayne@68: ''' jpayne@68: return true if iterator is associated with a reference''' jpayne@68: return self.fastafile jpayne@68: jpayne@68: cdef _setup_iterator(self, jpayne@68: int tid, jpayne@68: int start, jpayne@68: int stop, jpayne@68: int multiple_iterators=0): jpayne@68: '''setup the iterator structure''' jpayne@68: jpayne@68: self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators) jpayne@68: self.iterdata.htsfile = self.samfile.htsfile jpayne@68: self.iterdata.iter = self.iter.iter jpayne@68: self.iterdata.seq = NULL jpayne@68: self.iterdata.tid = -1 jpayne@68: self.iterdata.header = self.samfile.header.ptr jpayne@68: jpayne@68: if self.fastafile is not None: jpayne@68: self.iterdata.fastafile = self.fastafile.fastafile jpayne@68: else: jpayne@68: self.iterdata.fastafile = NULL jpayne@68: jpayne@68: # Free any previously allocated memory before reassigning jpayne@68: # pileup_iter jpayne@68: self._free_pileup_iter() jpayne@68: jpayne@68: cdef void * data[1] jpayne@68: data[0] = &self.iterdata jpayne@68: jpayne@68: if self.stepper is None or self.stepper == "all": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_all, jpayne@68: data) 
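jpayne@68:         # The remaining steppers: "nofilter" passes every read straight
jpayne@68:         # through (__advance_nofilter), while "samtools" applies the full
jpayne@68:         # samtools-style read processing (__advance_samtools).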
jpayne@68: elif self.stepper == "nofilter": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_nofilter, jpayne@68: data) jpayne@68: elif self.stepper == "samtools": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_samtools, jpayne@68: data) jpayne@68: else: jpayne@68: raise ValueError( jpayne@68: "unknown stepper option `%s` in IteratorColumn" % self.stepper) jpayne@68: jpayne@68: if self.max_depth: jpayne@68: with nogil: jpayne@68: bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth) jpayne@68: jpayne@68: if self.ignore_overlaps: jpayne@68: with nogil: jpayne@68: bam_mplp_init_overlaps(self.pileup_iter) jpayne@68: jpayne@68: cdef _setup_raw_rest_iterator(self): jpayne@68: '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST''' jpayne@68: jpayne@68: self.iter = None jpayne@68: self.iterdata.iter = NULL jpayne@68: self.iterdata.htsfile = self.samfile.htsfile jpayne@68: self.iterdata.seq = NULL jpayne@68: self.iterdata.tid = -1 jpayne@68: self.iterdata.header = self.samfile.header.ptr jpayne@68: jpayne@68: if self.fastafile is not None: jpayne@68: self.iterdata.fastafile = self.fastafile.fastafile jpayne@68: else: jpayne@68: self.iterdata.fastafile = NULL jpayne@68: jpayne@68: # Free any previously allocated memory before reassigning jpayne@68: # pileup_iter jpayne@68: self._free_pileup_iter() jpayne@68: jpayne@68: cdef void * data[1] jpayne@68: data[0] = &self.iterdata jpayne@68: jpayne@68: if self.stepper is None or self.stepper == "all": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_raw_all, jpayne@68: data) jpayne@68: elif self.stepper == "nofilter": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_raw_nofilter, jpayne@68: data) jpayne@68: elif self.stepper == "samtools": jpayne@68: with nogil: jpayne@68: self.pileup_iter = bam_mplp_init(1, jpayne@68: &__advance_samtools, jpayne@68: data) jpayne@68: else: jpayne@68: raise ValueError( jpayne@68: "unknown stepper option `%s` in IteratorColumn" % self.stepper) jpayne@68: jpayne@68: if self.max_depth: jpayne@68: with nogil: jpayne@68: bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth) jpayne@68: jpayne@68: if self.ignore_overlaps: jpayne@68: with nogil: jpayne@68: bam_mplp_init_overlaps(self.pileup_iter) jpayne@68: jpayne@68: cdef reset(self, tid, start, stop): jpayne@68: '''reset iterator position. jpayne@68: jpayne@68: This permits using the iterator multiple times without jpayne@68: having to incur the full set-up costs. jpayne@68: ''' jpayne@68: if self.iter is None: jpayne@68: raise TypeError("Raw iterator set up without region cannot be reset") jpayne@68: jpayne@68: self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0) jpayne@68: self.iterdata.iter = self.iter.iter jpayne@68: jpayne@68: # invalidate sequence if different tid jpayne@68: if self.tid != tid: jpayne@68: if self.iterdata.seq != NULL: jpayne@68: free(self.iterdata.seq) jpayne@68: self.iterdata.seq = NULL jpayne@68: self.iterdata.tid = -1 jpayne@68: jpayne@68: # self.pileup_iter = bam_mplp_init(1 jpayne@68: # &__advancepileup, jpayne@68: # &self.iterdata) jpayne@68: with nogil: jpayne@68: bam_mplp_reset(self.pileup_iter) jpayne@68: jpayne@68: cdef _free_pileup_iter(self): jpayne@68: '''free the memory alloc'd by bam_plp_init. 
jpayne@68: jpayne@68: This is needed before setup_iterator allocates another jpayne@68: pileup_iter, or else memory will be lost. ''' jpayne@68: if self.pileup_iter != NULL: jpayne@68: with nogil: jpayne@68: bam_mplp_reset(self.pileup_iter) jpayne@68: bam_mplp_destroy(self.pileup_iter) jpayne@68: self.pileup_iter = NULL jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: # reset in order to avoid memory leak messages for iterators jpayne@68: # that have not been fully consumed jpayne@68: self._free_pileup_iter() jpayne@68: self.plp = NULL jpayne@68: jpayne@68: if self.iterdata.seq != NULL: jpayne@68: free(self.iterdata.seq) jpayne@68: self.iterdata.seq = NULL jpayne@68: jpayne@68: # backwards compatibility jpayne@68: jpayne@68: def hasReference(self): jpayne@68: return self.has_reference() jpayne@68: cdef char * getSequence(self): jpayne@68: return self.get_sequence() jpayne@68: def addReference(self, FastaFile fastafile): jpayne@68: return self.add_reference(fastafile) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorColumnRegion(IteratorColumn): jpayne@68: '''iterates over a region only. jpayne@68: ''' jpayne@68: def __cinit__(self, jpayne@68: AlignmentFile samfile, jpayne@68: int tid = 0, jpayne@68: int start = 0, jpayne@68: int stop = MAX_POS, jpayne@68: int truncate = False, jpayne@68: int multiple_iterators = True, jpayne@68: **kwargs ): jpayne@68: jpayne@68: # initialize iterator. Multiple iterators not available jpayne@68: # for CRAM. jpayne@68: if multiple_iterators and samfile.is_cram: jpayne@68: warnings.warn("multiple_iterators not implemented for CRAM") jpayne@68: multiple_iterators = False jpayne@68: jpayne@68: self._setup_iterator(tid, start, stop, multiple_iterators) jpayne@68: self.start = start jpayne@68: self.stop = stop jpayne@68: self.truncate = truncate jpayne@68: jpayne@68: def __next__(self): jpayne@68: jpayne@68: cdef int n jpayne@68: jpayne@68: while 1: jpayne@68: n = self.cnext() jpayne@68: if n < 0: jpayne@68: raise ValueError("error during iteration" ) jpayne@68: jpayne@68: if n == 0: jpayne@68: raise StopIteration jpayne@68: jpayne@68: if self.truncate: jpayne@68: if self.start > self.pos: jpayne@68: continue jpayne@68: if self.pos >= self.stop: jpayne@68: raise StopIteration jpayne@68: jpayne@68: return makePileupColumn(&self.plp, jpayne@68: self.tid, jpayne@68: self.pos, jpayne@68: self.n_plp, jpayne@68: self.min_base_quality, jpayne@68: self.iterdata.seq, jpayne@68: self.samfile.header) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorColumnAllRefs(IteratorColumn): jpayne@68: """iterates over all columns by chaining iterators over each reference jpayne@68: """ jpayne@68: jpayne@68: def __cinit__(self, jpayne@68: AlignmentFile samfile, jpayne@68: **kwargs): jpayne@68: jpayne@68: # no iteration over empty files jpayne@68: if not samfile.nreferences: jpayne@68: raise StopIteration jpayne@68: jpayne@68: # initialize iterator jpayne@68: self._setup_iterator(self.tid, 0, MAX_POS, 1) jpayne@68: jpayne@68: def __next__(self): jpayne@68: jpayne@68: cdef int n jpayne@68: while 1: jpayne@68: n = self.cnext() jpayne@68: if n < 0: jpayne@68: raise ValueError("error during iteration") jpayne@68: jpayne@68: # proceed to next reference or stop jpayne@68: if n == 0: jpayne@68: self.tid += 1 jpayne@68: if self.tid < self.samfile.nreferences: jpayne@68: self._setup_iterator(self.tid, 0, MAX_POS, 0) jpayne@68: else: jpayne@68: raise StopIteration jpayne@68: continue jpayne@68: jpayne@68: # return result, if within same reference jpayne@68: return 
makePileupColumn(&self.plp, jpayne@68: self.tid, jpayne@68: self.pos, jpayne@68: self.n_plp, jpayne@68: self.min_base_quality, jpayne@68: self.iterdata.seq, jpayne@68: self.samfile.header) jpayne@68: jpayne@68: jpayne@68: cdef class IteratorColumnAll(IteratorColumn): jpayne@68: """iterates over all columns, without using an index jpayne@68: """ jpayne@68: jpayne@68: def __cinit__(self, jpayne@68: AlignmentFile samfile, jpayne@68: **kwargs): jpayne@68: jpayne@68: self._setup_raw_rest_iterator() jpayne@68: jpayne@68: def __next__(self): jpayne@68: jpayne@68: cdef int n jpayne@68: n = self.cnext() jpayne@68: if n < 0: jpayne@68: raise ValueError("error during iteration") jpayne@68: jpayne@68: if n == 0: jpayne@68: raise StopIteration jpayne@68: jpayne@68: return makePileupColumn(&self.plp, jpayne@68: self.tid, jpayne@68: self.pos, jpayne@68: self.n_plp, jpayne@68: self.min_base_quality, jpayne@68: self.iterdata.seq, jpayne@68: self.samfile.header) jpayne@68: jpayne@68: jpayne@68: cdef class SNPCall: jpayne@68: '''the results of a SNP call.''' jpayne@68: cdef int _tid jpayne@68: cdef int _pos jpayne@68: cdef char _reference_base jpayne@68: cdef char _genotype jpayne@68: cdef int _consensus_quality jpayne@68: cdef int _snp_quality jpayne@68: cdef int _rms_mapping_quality jpayne@68: cdef int _coverage jpayne@68: jpayne@68: property tid: jpayne@68: '''the chromosome ID as is defined in the header''' jpayne@68: def __get__(self): jpayne@68: return self._tid jpayne@68: jpayne@68: property pos: jpayne@68: '''nucleotide position of SNP.''' jpayne@68: def __get__(self): return self._pos jpayne@68: jpayne@68: property reference_base: jpayne@68: '''reference base at pos. ``N`` if no reference sequence supplied.''' jpayne@68: def __get__(self): return from_string_and_size( &self._reference_base, 1 ) jpayne@68: jpayne@68: property genotype: jpayne@68: '''the genotype called.''' jpayne@68: def __get__(self): return from_string_and_size( &self._genotype, 1 ) jpayne@68: jpayne@68: property consensus_quality: jpayne@68: '''the genotype quality (Phred-scaled).''' jpayne@68: def __get__(self): return self._consensus_quality jpayne@68: jpayne@68: property snp_quality: jpayne@68: '''the snp quality (Phred scaled) - probability of consensus being jpayne@68: identical to reference sequence.''' jpayne@68: def __get__(self): return self._snp_quality jpayne@68: jpayne@68: property mapping_quality: jpayne@68: '''the root mean square (rms) of the mapping quality of all reads jpayne@68: involved in the call.''' jpayne@68: def __get__(self): return self._rms_mapping_quality jpayne@68: jpayne@68: property coverage: jpayne@68: '''coverage or read depth - the number of reads involved in the call.''' jpayne@68: def __get__(self): return self._coverage jpayne@68: jpayne@68: def __str__(self): jpayne@68: jpayne@68: return "\t".join( map(str, ( jpayne@68: self.tid, jpayne@68: self.pos, jpayne@68: self.reference_base, jpayne@68: self.genotype, jpayne@68: self.consensus_quality, jpayne@68: self.snp_quality, jpayne@68: self.mapping_quality, jpayne@68: self.coverage ) ) ) jpayne@68: jpayne@68: jpayne@68: cdef class IndexedReads: jpayne@68: """Index a Sam/BAM-file by query name while keeping the jpayne@68: original sort order intact. jpayne@68: jpayne@68: The index is kept in memory and can be substantial. jpayne@68: jpayne@68: By default, the file is re-opened to avoid conflicts if multiple jpayne@68: operators work on the same file. Set `multiple_iterators` = False jpayne@68: to not re-open `samfile`. 
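jpayne@68:
jpayne@68:     A minimal usage sketch (file and query names are illustrative only)::
jpayne@68:
jpayne@68:         samfile = pysam.AlignmentFile("ex1.bam", "rb")
jpayne@68:         name_index = pysam.IndexedReads(samfile)
jpayne@68:         name_index.build()
jpayne@68:         # find() raises KeyError if the query name is absent
jpayne@68:         for read in name_index.find("read_001"):
jpayne@68:             print(read.query_name)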
jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: jpayne@68: samfile : AlignmentFile jpayne@68: File to be indexed. jpayne@68: jpayne@68: multiple_iterators : bool jpayne@68: Flag indicating whether the file should be reopened. Reopening prevents jpayne@68: existing iterators being affected by the indexing. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, AlignmentFile samfile, int multiple_iterators=True): jpayne@68: cdef char *cfilename jpayne@68: jpayne@68: # makes sure that samfile stays alive as long as this jpayne@68: # object is alive. jpayne@68: self.samfile = samfile jpayne@68: cdef bam_hdr_t * hdr = NULL jpayne@68: assert samfile.is_bam, "can only apply IndexReads on bam files" jpayne@68: jpayne@68: # multiple_iterators the file - note that this makes the iterator jpayne@68: # slow and causes pileup to slow down significantly. jpayne@68: if multiple_iterators: jpayne@68: cfilename = samfile.filename jpayne@68: with nogil: jpayne@68: self.htsfile = hts_open(cfilename, 'r') jpayne@68: if self.htsfile == NULL: jpayne@68: raise OSError("unable to reopen htsfile") jpayne@68: jpayne@68: # need to advance in newly opened file to position after header jpayne@68: # better: use seek/tell? jpayne@68: with nogil: jpayne@68: hdr = sam_hdr_read(self.htsfile) jpayne@68: if hdr == NULL: jpayne@68: raise OSError("unable to read header information") jpayne@68: self.header = makeAlignmentHeader(hdr) jpayne@68: self.owns_samfile = True jpayne@68: else: jpayne@68: self.htsfile = self.samfile.htsfile jpayne@68: self.header = samfile.header jpayne@68: self.owns_samfile = False jpayne@68: jpayne@68: def build(self): jpayne@68: '''build the index.''' jpayne@68: jpayne@68: self.index = collections.defaultdict(list) jpayne@68: jpayne@68: # this method will start indexing from the current file position jpayne@68: cdef int ret = 1 jpayne@68: cdef bam1_t * b = calloc(1, sizeof( bam1_t)) jpayne@68: if b == NULL: jpayne@68: raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t))) jpayne@68: jpayne@68: cdef uint64_t pos jpayne@68: cdef bam_hdr_t * hdr = self.header.ptr jpayne@68: jpayne@68: while ret > 0: jpayne@68: with nogil: jpayne@68: pos = bgzf_tell(hts_get_bgzfp(self.htsfile)) jpayne@68: ret = sam_read1(self.htsfile, jpayne@68: hdr, jpayne@68: b) jpayne@68: jpayne@68: if ret > 0: jpayne@68: qname = charptr_to_str(pysam_bam_get_qname(b)) jpayne@68: self.index[qname].append(pos) jpayne@68: jpayne@68: bam_destroy1(b) jpayne@68: jpayne@68: def find(self, query_name): jpayne@68: '''find `query_name` in index. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: jpayne@68: IteratorRowSelection jpayne@68: Returns an iterator over all reads with query_name. jpayne@68: jpayne@68: Raises jpayne@68: ------ jpayne@68: jpayne@68: KeyError jpayne@68: if the `query_name` is not in the index. jpayne@68: jpayne@68: ''' jpayne@68: if query_name in self.index: jpayne@68: return IteratorRowSelection( jpayne@68: self.samfile, jpayne@68: self.index[query_name], jpayne@68: multiple_iterators = False) jpayne@68: else: jpayne@68: raise KeyError("read %s not found" % query_name) jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: if self.owns_samfile: jpayne@68: hts_close(self.htsfile)