csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children

rev	line source
jpayne@68	1 # cython: embedsignature=True
jpayne@68	2 # cython: profile=True
jpayne@68	3 ########################################################
jpayne@68	4 ########################################################
jpayne@68	5 # Cython wrapper for SAM/BAM/CRAM files based on htslib
jpayne@68	6 ########################################################
jpayne@68	7 # The principal classes defined in this module are:
jpayne@68	8 #
jpayne@68	9 # class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
jpayne@68	10 #
jpayne@68	11 # class AlignmentHeader manage SAM/BAM/CRAM header data
jpayne@68	12 #
jpayne@68	13 # class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
jpayne@68	14 # the original sort order intact
jpayne@68	15 #
jpayne@68	16 # Additionally this module defines numerous additional classes that
jpayne@68	17 # are part of the internal API. These are:
jpayne@68	18 #
jpayne@68	19 # Various iterator classes to iterate over alignments in sequential
jpayne@68	20 # (IteratorRow) or in a stacked fashion (IteratorColumn):
jpayne@68	21 #
jpayne@68	22 # class IteratorRow
jpayne@68	23 # class IteratorRowRegion
jpayne@68	24 # class IteratorRowHead
jpayne@68	25 # class IteratorRowAll
jpayne@68	26 # class IteratorRowAllRefs
jpayne@68	27 # class IteratorRowSelection
jpayne@68	28 # class IteratorColumn
jpayne@68	29 # class IteratorColumnRegion
jpayne@68	30 # class IteratorColumnAll
jpayne@68	31 # class IteratorColumnAllRefs
jpayne@68	32 #
jpayne@68	33 ########################################################
jpayne@68	34 #
jpayne@68	35 # The MIT License
jpayne@68	36 #
jpayne@68	37 # Copyright (c) 2015 Andreas Heger
jpayne@68	38 #
jpayne@68	39 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@68	40 # copy of this software and associated documentation files (the "Software"),
jpayne@68	41 # to deal in the Software without restriction, including without limitation
jpayne@68	42 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@68	43 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@68	44 # Software is furnished to do so, subject to the following conditions:
jpayne@68	45 #
jpayne@68	46 # The above copyright notice and this permission notice shall be included in
jpayne@68	47 # all copies or substantial portions of the Software.
jpayne@68	48 #
jpayne@68	49 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@68	50 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@68	51 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@68	52 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@68	53 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@68	54 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@68	55 # DEALINGS IN THE SOFTWARE.
jpayne@68	56 #
jpayne@68	57 ########################################################
jpayne@68	58 import os
jpayne@68	59 import collections
jpayne@68	60 try:
jpayne@68	61 from collections.abc import Sequence, Mapping # noqa
jpayne@68	62 except ImportError:
jpayne@68	63 from collections import Sequence, Mapping # noqa
jpayne@68	64 import re
jpayne@68	65 import warnings
jpayne@68	66 import array
jpayne@68	67 from libc.errno cimport errno, EPIPE
jpayne@68	68 from libc.string cimport strcmp, strpbrk, strerror
jpayne@68	69 from libc.stdint cimport INT32_MAX
jpayne@68	70
jpayne@68	71 from cpython cimport array as c_array
jpayne@68	72
jpayne@68	73 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
jpayne@68	74 from pysam.libcutils cimport encode_filename, from_string_and_size
jpayne@68	75 from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
jpayne@68	76 from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
jpayne@68	77 HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL
jpayne@68	78
jpayne@68	79 from io import StringIO
jpayne@68	80
jpayne@68	81 cimport cython
jpayne@68	82
jpayne@68	83
jpayne@68	84 __all__ = [
jpayne@68	85 "AlignmentFile",
jpayne@68	86 "AlignmentHeader",
jpayne@68	87 "IteratorRow",
jpayne@68	88 "IteratorColumn",
jpayne@68	89 "IndexedReads"]
jpayne@68	90
jpayne@68	91 IndexStats = collections.namedtuple("IndexStats",
jpayne@68	92 ("contig",
jpayne@68	93 "mapped",
jpayne@68	94 "unmapped",
jpayne@68	95 "total"))
jpayne@68	96
jpayne@68	97 ########################################################
jpayne@68	98 ## global variables
jpayne@68	99 # maximum genomic coordinace
jpayne@68	100 # for some reason, using 'int' causes overflow
jpayne@68	101 cdef int MAX_POS = (1 << 31) - 1
jpayne@68	102
jpayne@68	103 # valid types for SAM headers
jpayne@68	104 VALID_HEADER_TYPES = {"HD" : Mapping,
jpayne@68	105 "SQ" : Sequence,
jpayne@68	106 "RG" : Sequence,
jpayne@68	107 "PG" : Sequence,
jpayne@68	108 "CO" : Sequence}
jpayne@68	109
jpayne@68	110 # order of records within SAM headers
jpayne@68	111 VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
jpayne@68	112
jpayne@68	113 # default type conversions within SAM header records
jpayne@68	114 KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str,
jpayne@68	115 "SS" : str,},
jpayne@68	116 "SQ" : {"SN" : str, "LN" : int, "AS" : str,
jpayne@68	117 "M5" : str, "SP" : str, "UR" : str,
jpayne@68	118 "AH" : str, "TP" : str, "DS" : str,
jpayne@68	119 "AN" : str,},
jpayne@68	120 "RG" : {"ID" : str, "CN" : str, "DS" : str,
jpayne@68	121 "DT" : str, "FO" : str, "KS" : str,
jpayne@68	122 "LB" : str, "PG" : str, "PI" : str,
jpayne@68	123 "PL" : str, "PM" : str, "PU" : str,
jpayne@68	124 "SM" : str, "BC" : str,},
jpayne@68	125 "PG" : {"ID" : str, "PN" : str, "CL" : str,
jpayne@68	126 "PP" : str, "DS" : str, "VN" : str,},}
jpayne@68	127
jpayne@68	128 # output order of fields within records. Ensure that CL is at
jpayne@68	129 # the end as parsing a CL will ignore any subsequent records.
jpayne@68	130 VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"),
jpayne@68	131 "SQ" : ("SN", "LN", "AS", "M5",
jpayne@68	132 "UR", "SP", "AH", "TP",
jpayne@68	133 "DS", "AN"),
jpayne@68	134 "RG" : ("ID", "CN", "SM", "LB",
jpayne@68	135 "PU", "PI", "DT", "DS",
jpayne@68	136 "PL", "FO", "KS", "PG",
jpayne@68	137 "PM", "BC"),
jpayne@68	138 "PG" : ("PN", "ID", "VN", "PP",
jpayne@68	139 "DS", "CL"),}
jpayne@68	140
jpayne@68	141
jpayne@68	142 def build_header_line(fields, record):
jpayne@68	143 '''build a header line from `fields` dictionary for `record`'''
jpayne@68	144
jpayne@68	145 # TODO: add checking for field and sort order
jpayne@68	146 line = ["@%s" % record]
jpayne@68	147 # comment
jpayne@68	148 if record == "CO":
jpayne@68	149 line.append(fields)
jpayne@68	150 # user tags
jpayne@68	151 elif record.islower():
jpayne@68	152 for key in sorted(fields):
jpayne@68	153 line.append("%s:%s" % (key, str(fields[key])))
jpayne@68	154 # defined tags
jpayne@68	155 else:
jpayne@68	156 # write fields of the specification
jpayne@68	157 for key in VALID_HEADER_ORDER[record]:
jpayne@68	158 if key in fields:
jpayne@68	159 line.append("%s:%s" % (key, str(fields[key])))
jpayne@68	160 # write user fields
jpayne@68	161 for key in fields:
jpayne@68	162 if not key.isupper():
jpayne@68	163 line.append("%s:%s" % (key, str(fields[key])))
jpayne@68	164
jpayne@68	165 return "\t".join(line)
jpayne@68	166
jpayne@68	167
jpayne@68	168 cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
jpayne@68	169 if not hdr:
jpayne@68	170 raise ValueError('cannot create AlignmentHeader, received NULL pointer')
jpayne@68	171
jpayne@68	172 # check: is AlignmetHeader.__cinit__ called?
jpayne@68	173 cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader)
jpayne@68	174 header.ptr = hdr
jpayne@68	175
jpayne@68	176 return header
jpayne@68	177
jpayne@68	178 def read_failure_reason(code):
jpayne@68	179 if code == -2:
jpayne@68	180 return 'truncated file'
jpayne@68	181 else:
jpayne@68	182 return "error {} while reading file".format(code)
jpayne@68	183
jpayne@68	184
jpayne@68	185 # the following should be class-method for VariantHeader, but cdef @classmethods
jpayne@68	186 # are not implemented in cython.
jpayne@68	187 cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
jpayne@68	188 reference_names,
jpayne@68	189 reference_lengths,
jpayne@68	190 add_sq_text=True,
jpayne@68	191 text=None) except -1:
jpayne@68	192 """build header from list of reference names and lengths.
jpayne@68	193 """
jpayne@68	194
jpayne@68	195 cdef class AlignmentHeader(object):
jpayne@68	196 """header information for a :class:`AlignmentFile` object
jpayne@68	197
jpayne@68	198 Parameters
jpayne@68	199 ----------
jpayne@68	200 header_dict : dict
jpayne@68	201 build header from a multi-level dictionary. The
jpayne@68	202 first level are the four types ('HD', 'SQ', ...). The second
jpayne@68	203 level are a list of lines, with each line being a list of
jpayne@68	204 tag-value pairs. The header is constructed first from all the
jpayne@68	205 defined fields, followed by user tags in alphabetical
jpayne@68	206 order. Alternatively, an :class:`~pysam.AlignmentHeader`
jpayne@68	207 object can be passed directly.
jpayne@68	208
jpayne@68	209 text : string
jpayne@68	210 use the string provided as the header
jpayne@68	211
jpayne@68	212 reference_names : list
jpayne@68	213 see reference_lengths
jpayne@68	214
jpayne@68	215 reference_lengths : list
jpayne@68	216 build header from list of chromosome names and lengths. By
jpayne@68	217 default, 'SQ' and 'LN' tags will be added to the header
jpayne@68	218 text. This option can be changed by unsetting the flag
jpayne@68	219 `add_sq_text`.
jpayne@68	220
jpayne@68	221 add_sq_text : bool
jpayne@68	222 do not add 'SQ' and 'LN' tags to header. This option permits
jpayne@68	223 construction :term:`SAM` formatted files without a header.
jpayne@68	224
jpayne@68	225 """
jpayne@68	226
jpayne@68	227 # See makeVariantHeader for C constructor
jpayne@68	228 def __cinit__(self):
jpayne@68	229 self.ptr = NULL
jpayne@68	230
jpayne@68	231 # Python constructor
jpayne@68	232 def __init__(self):
jpayne@68	233 self.ptr = bam_hdr_init()
jpayne@68	234 if self.ptr is NULL:
jpayne@68	235 raise MemoryError("could not create header")
jpayne@68	236
jpayne@68	237 @classmethod
jpayne@68	238 def _from_text_and_lengths(cls, text, reference_names, reference_lengths):
jpayne@68	239
jpayne@68	240 cdef AlignmentHeader self = AlignmentHeader()
jpayne@68	241 cdef char *ctext
jpayne@68	242 cdef int l_text
jpayne@68	243 cdef int n, x
jpayne@68	244 if text is not None:
jpayne@68	245 btext = force_bytes(text)
jpayne@68	246 ctext = btext
jpayne@68	247 l_text = len(btext)
jpayne@68	248 self.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
jpayne@68	249 if self.ptr.text == NULL:
jpayne@68	250 raise MemoryError("could not allocate {} bytes".format(l_text + 1), sizeof(char))
jpayne@68	251 self.ptr.l_text = l_text
jpayne@68	252 memcpy(self.ptr.text, ctext, l_text + 1)
jpayne@68	253
jpayne@68	254 if reference_names and reference_lengths:
jpayne@68	255 reference_names = [force_bytes(ref) for ref in reference_names]
jpayne@68	256
jpayne@68	257 self.ptr.n_targets = len(reference_names)
jpayne@68	258
jpayne@68	259 n = sum([len(reference_names) + 1])
jpayne@68	260 self.ptr.target_name = <char*>calloc(n, sizeof(char))
jpayne@68	261 if self.ptr.target_name == NULL:
jpayne@68	262 raise MemoryError("could not allocate {} bytes".format(n, sizeof(char *)))
jpayne@68	263
jpayne@68	264 self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
jpayne@68	265 if self.ptr.target_len == NULL:
jpayne@68	266 raise MemoryError("could not allocate {} bytes".format(n, sizeof(uint32_t)))
jpayne@68	267
jpayne@68	268 for x from 0 <= x < self.ptr.n_targets:
jpayne@68	269 self.ptr.target_len[x] = reference_lengths[x]
jpayne@68	270 name = reference_names[x]
jpayne@68	271 self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
jpayne@68	272 if self.ptr.target_name[x] == NULL:
jpayne@68	273 raise MemoryError("could not allocate {} bytes".format(len(name) + 1, sizeof(char)))
jpayne@68	274 strncpy(self.ptr.target_name[x], name, len(name))
jpayne@68	275
jpayne@68	276 return self
jpayne@68	277
jpayne@68	278 @classmethod
jpayne@68	279 def from_text(cls, text):
jpayne@68	280
jpayne@68	281 reference_names, reference_lengths = [], []
jpayne@68	282 for line in text.splitlines():
jpayne@68	283 if line.startswith("@SQ"):
jpayne@68	284 fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
jpayne@68	285 try:
jpayne@68	286 reference_names.append(fields["SN"])
jpayne@68	287 reference_lengths.append(int(fields["LN"]))
jpayne@68	288 except KeyError:
jpayne@68	289 raise KeyError("incomplete sequence information in '%s'" % str(fields))
jpayne@68	290 except ValueError:
jpayne@68	291 raise ValueError("wrong sequence information in '%s'" % str(fields))
jpayne@68	292
jpayne@68	293 return cls._from_text_and_lengths(text, reference_names, reference_lengths)
jpayne@68	294
jpayne@68	295 @classmethod
jpayne@68	296 def from_dict(cls, header_dict):
jpayne@68	297
jpayne@68	298 cdef list lines = []
jpayne@68	299 # first: defined tags
jpayne@68	300 for record in VALID_HEADERS:
jpayne@68	301 if record in header_dict:
jpayne@68	302 data = header_dict[record]
jpayne@68	303 if not isinstance(data, VALID_HEADER_TYPES[record]):
jpayne@68	304 raise ValueError(
jpayne@68	305 "invalid type for record %s: %s, expected %s".format(
jpayne@68	306 record, type(data), VALID_HEADER_TYPES[record]))
jpayne@68	307 if isinstance(data, Mapping):
jpayne@68	308 lines.append(build_header_line(data, record))
jpayne@68	309 else:
jpayne@68	310 for fields in header_dict[record]:
jpayne@68	311 lines.append(build_header_line(fields, record))
jpayne@68	312
jpayne@68	313 # then: user tags (lower case), sorted alphabetically
jpayne@68	314 for record, data in sorted(header_dict.items()):
jpayne@68	315 if record in VALID_HEADERS:
jpayne@68	316 continue
jpayne@68	317 if isinstance(data, Mapping):
jpayne@68	318 lines.append(build_header_line(data, record))
jpayne@68	319 else:
jpayne@68	320 for fields in header_dict[record]:
jpayne@68	321 lines.append(build_header_line(fields, record))
jpayne@68	322
jpayne@68	323 text = "\n".join(lines) + "\n"
jpayne@68	324
jpayne@68	325 reference_names, reference_lengths = [], []
jpayne@68	326 if "SQ" in header_dict:
jpayne@68	327 for fields in header_dict["SQ"]:
jpayne@68	328 try:
jpayne@68	329 reference_names.append(fields["SN"])
jpayne@68	330 reference_lengths.append(fields["LN"])
jpayne@68	331 except KeyError:
jpayne@68	332 raise KeyError("incomplete sequence information in '%s'" % str(fields))
jpayne@68	333
jpayne@68	334 return cls._from_text_and_lengths(text, reference_names, reference_lengths)
jpayne@68	335
jpayne@68	336 @classmethod
jpayne@68	337 def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):
jpayne@68	338
jpayne@68	339 if len(reference_names) != len(reference_lengths):
jpayne@68	340 raise ValueError("number of reference names and lengths do not match")
jpayne@68	341
jpayne@68	342 # optionally, if there is no text, add a SAM compatible header to output file.
jpayne@68	343 if text is None and add_sq_text:
jpayne@68	344 text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
jpayne@68	345 reference_names, reference_lengths)])
jpayne@68	346
jpayne@68	347 return cls._from_text_and_lengths(text, reference_names, reference_lengths)
jpayne@68	348
jpayne@68	349 def __dealloc__(self):
jpayne@68	350 bam_hdr_destroy(self.ptr)
jpayne@68	351 self.ptr = NULL
jpayne@68	352
jpayne@68	353 def __bool__(self):
jpayne@68	354 return self.ptr != NULL
jpayne@68	355
jpayne@68	356 def copy(self):
jpayne@68	357 return makeAlignmentHeader(bam_hdr_dup(self.ptr))
jpayne@68	358
jpayne@68	359 property nreferences:
jpayne@68	360 """int with the number of :term:`reference` sequences in the file.
jpayne@68	361
jpayne@68	362 This is a read-only attribute."""
jpayne@68	363 def __get__(self):
jpayne@68	364 return self.ptr.n_targets
jpayne@68	365
jpayne@68	366 property references:
jpayne@68	367 """tuple with the names of :term:`reference` sequences. This is a
jpayne@68	368 read-only attribute"""
jpayne@68	369 def __get__(self):
jpayne@68	370 t = []
jpayne@68	371 cdef int x
jpayne@68	372 for x in range(self.ptr.n_targets):
jpayne@68	373 t.append(charptr_to_str(self.ptr.target_name[x]))
jpayne@68	374 return tuple(t)
jpayne@68	375
jpayne@68	376 property lengths:
jpayne@68	377 """tuple of the lengths of the :term:`reference` sequences. This is a
jpayne@68	378 read-only attribute. The lengths are in the same order as
jpayne@68	379 :attr:`pysam.AlignmentFile.references`
jpayne@68	380 """
jpayne@68	381 def __get__(self):
jpayne@68	382 t = []
jpayne@68	383 cdef int x
jpayne@68	384 for x in range(self.ptr.n_targets):
jpayne@68	385 t.append(self.ptr.target_len[x])
jpayne@68	386 return tuple(t)
jpayne@68	387
jpayne@68	388 def _build_sequence_section(self):
jpayne@68	389 """return sequence section of header.
jpayne@68	390
jpayne@68	391 The sequence section is built from the list of reference names and
jpayne@68	392 lengths stored in the BAM-file and not from any @SQ entries that
jpayne@68	393 are part of the header's text section.
jpayne@68	394 """
jpayne@68	395
jpayne@68	396 cdef int x
jpayne@68	397 text = []
jpayne@68	398 for x in range(self.ptr.n_targets):
jpayne@68	399 text.append("@SQ\tSN:{}\tLN:{}\n".format(
jpayne@68	400 force_str(self.ptr.target_name[x]),
jpayne@68	401 self.ptr.target_len[x]))
jpayne@68	402 return "".join(text)
jpayne@68	403
jpayne@68	404 def to_dict(self):
jpayne@68	405 """return two-level dictionary with header information from the file.
jpayne@68	406
jpayne@68	407 The first level contains the record (``HD``, ``SQ``, etc) and
jpayne@68	408 the second level contains the fields (``VN``, ``LN``, etc).
jpayne@68	409
jpayne@68	410 The parser is validating and will raise an AssertionError if
jpayne@68	411 if encounters any record or field tags that are not part of
jpayne@68	412 the SAM specification. Use the
jpayne@68	413 :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
jpayne@68	414 header.
jpayne@68	415
jpayne@68	416 The parsing follows the SAM format specification with the
jpayne@68	417 exception of the ``CL`` field. This option will consume the
jpayne@68	418 rest of a header line irrespective of any additional fields.
jpayne@68	419 This behaviour has been added to accommodate command line
jpayne@68	420 options that contain characters that are not valid field
jpayne@68	421 separators.
jpayne@68	422
jpayne@68	423 If no @SQ entries are within the text section of the header,
jpayne@68	424 this will be automatically added from the reference names and
jpayne@68	425 lengths stored in the binary part of the header.
jpayne@68	426 """
jpayne@68	427 result = collections.OrderedDict()
jpayne@68	428
jpayne@68	429 # convert to python string
jpayne@68	430 t = self.__str__()
jpayne@68	431 for line in t.split("\n"):
jpayne@68	432 line = line.strip(' \0')
jpayne@68	433 if not line:
jpayne@68	434 continue
jpayne@68	435 assert line.startswith("@"), \
jpayne@68	436 "header line without '@': '%s'" % line
jpayne@68	437 fields = line[1:].split("\t")
jpayne@68	438 record = fields[0]
jpayne@68	439 assert record in VALID_HEADER_TYPES, \
jpayne@68	440 "header line with invalid type '%s': '%s'" % (record, line)
jpayne@68	441
jpayne@68	442 # treat comments
jpayne@68	443 if record == "CO":
jpayne@68	444 if record not in result:
jpayne@68	445 result[record] = []
jpayne@68	446 result[record].append("\t".join( fields[1:]))
jpayne@68	447 continue
jpayne@68	448 # the following is clumsy as generators do not work?
jpayne@68	449 x = {}
jpayne@68	450
jpayne@68	451 for idx, field in enumerate(fields[1:]):
jpayne@68	452 if ":" not in field:
jpayne@68	453 raise ValueError("malformatted header: no ':' in field" )
jpayne@68	454 key, value = field.split(":", 1)
jpayne@68	455 if key in ("CL",):
jpayne@68	456 # special treatment for command line
jpayne@68	457 # statements (CL). These might contain
jpayne@68	458 # characters that are non-conformant with
jpayne@68	459 # the valid field separators in the SAM
jpayne@68	460 # header. Thus, in contravention to the
jpayne@68	461 # SAM API, consume the rest of the line.
jpayne@68	462 key, value = "\t".join(fields[idx+1:]).split(":", 1)
jpayne@68	463 x[key] = KNOWN_HEADER_FIELDS[record][key](value)
jpayne@68	464 break
jpayne@68	465
jpayne@68	466 # interpret type of known header record tags, default to str
jpayne@68	467 x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
jpayne@68	468
jpayne@68	469 if VALID_HEADER_TYPES[record] == Mapping:
jpayne@68	470 if record in result:
jpayne@68	471 raise ValueError(
jpayne@68	472 "multiple '%s' lines are not permitted" % record)
jpayne@68	473
jpayne@68	474 result[record] = x
jpayne@68	475 elif VALID_HEADER_TYPES[record] == Sequence:
jpayne@68	476 if record not in result: result[record] = []
jpayne@68	477 result[record].append(x)
jpayne@68	478
jpayne@68	479 # if there are no SQ lines in the header, add the
jpayne@68	480 # reference names from the information in the bam
jpayne@68	481 # file.
jpayne@68	482 #
jpayne@68	483 # Background: c-samtools keeps the textual part of the
jpayne@68	484 # header separate from the list of reference names and
jpayne@68	485 # lengths. Thus, if a header contains only SQ lines,
jpayne@68	486 # the SQ information is not part of the textual header
jpayne@68	487 # and thus are missing from the output. See issue 84.
jpayne@68	488 if "SQ" not in result:
jpayne@68	489 sq = []
jpayne@68	490 for ref, length in zip(self.references, self.lengths):
jpayne@68	491 sq.append({'LN': length, 'SN': ref })
jpayne@68	492 result["SQ"] = sq
jpayne@68	493
jpayne@68	494 return result
jpayne@68	495
jpayne@68	496 def as_dict(self):
jpayne@68	497 """deprecated, use :meth:`to_dict()` instead"""
jpayne@68	498 return self.to_dict()
jpayne@68	499
jpayne@68	500 def get_reference_name(self, tid):
jpayne@68	501 if tid == -1:
jpayne@68	502 return None
jpayne@68	503 if not 0 <= tid < self.ptr.n_targets:
jpayne@68	504 raise ValueError("reference_id %i out of range 0<=tid<%i" %
jpayne@68	505 (tid, self.ptr.n_targets))
jpayne@68	506 return charptr_to_str(self.ptr.target_name[tid])
jpayne@68	507
jpayne@68	508 def get_reference_length(self, reference):
jpayne@68	509 cdef int tid = self.get_tid(reference)
jpayne@68	510 if tid < 0:
jpayne@68	511 raise KeyError("unknown reference {}".format(reference))
jpayne@68	512 else:
jpayne@68	513 return self.ptr.target_len[tid]
jpayne@68	514
jpayne@68	515 def is_valid_tid(self, int tid):
jpayne@68	516 """
jpayne@68	517 return True if the numerical :term:`tid` is valid; False otherwise.
jpayne@68	518
jpayne@68	519 Note that the unmapped tid code (-1) counts as an invalid.
jpayne@68	520 """
jpayne@68	521 return 0 <= tid < self.ptr.n_targets
jpayne@68	522
jpayne@68	523 def get_tid(self, reference):
jpayne@68	524 """
jpayne@68	525 return the numerical :term:`tid` corresponding to
jpayne@68	526 :term:`reference`
jpayne@68	527
jpayne@68	528 returns -1 if reference is not known.
jpayne@68	529 """
jpayne@68	530 reference = force_bytes(reference)
jpayne@68	531 tid = bam_name2id(self.ptr, reference)
jpayne@68	532 if tid < -1:
jpayne@68	533 raise ValueError('could not parse header')
jpayne@68	534 return tid
jpayne@68	535
jpayne@68	536 def __str__(self):
jpayne@68	537 '''string with the full contents of the :term:`sam file` header as a
jpayne@68	538 string.
jpayne@68	539
jpayne@68	540 If no @SQ entries are within the text section of the header,
jpayne@68	541 this will be automatically added from the reference names and
jpayne@68	542 lengths stored in the binary part of the header.
jpayne@68	543
jpayne@68	544 See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
jpayne@68	545 representation of the header.
jpayne@68	546 '''
jpayne@68	547 text = from_string_and_size(self.ptr.text, self.ptr.l_text)
jpayne@68	548 if "@SQ" not in text:
jpayne@68	549 text += "\n" + self._build_sequence_section()
jpayne@68	550 return text
jpayne@68	551
jpayne@68	552 # dictionary access methods, for backwards compatibility.
jpayne@68	553 def __setitem__(self, key, value):
jpayne@68	554 raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict()")
jpayne@68	555
jpayne@68	556 def __getitem__(self, key):
jpayne@68	557 return self.to_dict().__getitem__(key)
jpayne@68	558
jpayne@68	559 def items(self):
jpayne@68	560 return self.to_dict().items()
jpayne@68	561
jpayne@68	562 # PY2 compatibility
jpayne@68	563 def iteritems(self):
jpayne@68	564 return self.to_dict().items()
jpayne@68	565
jpayne@68	566 def keys(self):
jpayne@68	567 return self.to_dict().keys()
jpayne@68	568
jpayne@68	569 def values(self):
jpayne@68	570 return self.to_dict().values()
jpayne@68	571
jpayne@68	572 def get(self, *args):
jpayne@68	573 return self.to_dict().get(*args)
jpayne@68	574
jpayne@68	575 def __len__(self):
jpayne@68	576 return self.to_dict().__len__()
jpayne@68	577
jpayne@68	578 def __contains__(self, key):
jpayne@68	579 return self.to_dict().__contains__(key)
jpayne@68	580
jpayne@68	581
jpayne@68	582 cdef class AlignmentFile(HTSFile):
jpayne@68	583 """AlignmentFile(filepath_or_object, mode=None, template=None,
jpayne@68	584 reference_names=None, reference_lengths=None, text=NULL,
jpayne@68	585 header=None, add_sq_text=False, check_header=True, check_sq=True,
jpayne@68	586 reference_filename=None, filename=None, index_filename=None,
jpayne@68	587 filepath_index=None, require_index=False, duplicate_filehandle=True,
jpayne@68	588 ignore_truncation=False, threads=1)
jpayne@68	589
jpayne@68	590 A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.
jpayne@68	591
jpayne@68	592 If `filepath_or_object` is a string, the file is automatically
jpayne@68	593 opened. If `filepath_or_object` is a python File object, the
jpayne@68	594 already opened file will be used.
jpayne@68	595
jpayne@68	596 If the file is opened for reading and an index exists (if file is BAM, a
jpayne@68	597 .bai file or if CRAM a .crai file), it will be opened automatically.
jpayne@68	598 `index_filename` may be specified explicitly. If the index is not named
jpayne@68	599 in the standard manner, not located in the same directory as the
jpayne@68	600 BAM/CRAM file, or is remote. Without an index, random access via
jpayne@68	601 :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup`
jpayne@68	602 is disabled.
jpayne@68	603
jpayne@68	604 For writing, the header of a :term:`SAM` file/:term:`BAM` file can
jpayne@68	605 be constituted from several sources (see also the samtools format
jpayne@68	606 specification):
jpayne@68	607
jpayne@68	608 1. If `template` is given, the header is copied from another
jpayne@68	609 `AlignmentFile` (`template` must be a
jpayne@68	610 :class:`~pysam.AlignmentFile`).
jpayne@68	611
jpayne@68	612 2. If `header` is given, the header is built from a
jpayne@68	613 multi-level dictionary.
jpayne@68	614
jpayne@68	615 3. If `text` is given, new header text is copied from raw
jpayne@68	616 text.
jpayne@68	617
jpayne@68	618 4. The names (`reference_names`) and lengths
jpayne@68	619 (`reference_lengths`) are supplied directly as lists.
jpayne@68	620
jpayne@68	621 When reading or writing a CRAM file, the filename of a FASTA-formatted
jpayne@68	622 reference can be specified with `reference_filename`.
jpayne@68	623
jpayne@68	624 By default, if a file is opened in mode 'r', it is checked
jpayne@68	625 for a valid header (`check_header` = True) and a definition of
jpayne@68	626 chromosome names (`check_sq` = True).
jpayne@68	627
jpayne@68	628 Parameters
jpayne@68	629 ----------
jpayne@68	630 mode : string
jpayne@68	631 `mode` should be ``r`` for reading or ``w`` for writing. The
jpayne@68	632 default is text mode (:term:`SAM`). For binary (:term:`BAM`)
jpayne@68	633 I/O you should append ``b`` for compressed or ``u`` for
jpayne@68	634 uncompressed :term:`BAM` output. Use ``h`` to output header
jpayne@68	635 information in text (:term:`TAM`) mode. Use ``c`` for
jpayne@68	636 :term:`CRAM` formatted files.
jpayne@68	637
jpayne@68	638 If ``b`` is present, it must immediately follow ``r`` or
jpayne@68	639 ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
jpayne@68	640 ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
jpayne@68	641 :term:`BAM` formatted file for reading, type::
jpayne@68	642
jpayne@68	643 f = pysam.AlignmentFile('ex1.bam','rb')
jpayne@68	644
jpayne@68	645 If mode is not specified, the method will try to auto-detect
jpayne@68	646 in the order 'rb', 'r', thus both the following should work::
jpayne@68	647
jpayne@68	648 f1 = pysam.AlignmentFile('ex1.bam')
jpayne@68	649 f2 = pysam.AlignmentFile('ex1.sam')
jpayne@68	650
jpayne@68	651 template : AlignmentFile
jpayne@68	652 when writing, copy header from file `template`.
jpayne@68	653
jpayne@68	654 header : dict or AlignmentHeader
jpayne@68	655 when writing, build header from a multi-level dictionary. The
jpayne@68	656 first level are the four types ('HD', 'SQ', ...). The second
jpayne@68	657 level are a list of lines, with each line being a list of
jpayne@68	658 tag-value pairs. The header is constructed first from all the
jpayne@68	659 defined fields, followed by user tags in alphabetical
jpayne@68	660 order. Alternatively, an :class:`~pysam.AlignmentHeader`
jpayne@68	661 object can be passed directly.
jpayne@68	662
jpayne@68	663 text : string
jpayne@68	664 when writing, use the string provided as the header
jpayne@68	665
jpayne@68	666 reference_names : list
jpayne@68	667 see reference_lengths
jpayne@68	668
jpayne@68	669 reference_lengths : list
jpayne@68	670 when writing or opening a SAM file without header build header
jpayne@68	671 from list of chromosome names and lengths. By default, 'SQ'
jpayne@68	672 and 'LN' tags will be added to the header text. This option
jpayne@68	673 can be changed by unsetting the flag `add_sq_text`.
jpayne@68	674
jpayne@68	675 add_sq_text : bool
jpayne@68	676 do not add 'SQ' and 'LN' tags to header. This option permits
jpayne@68	677 construction :term:`SAM` formatted files without a header.
jpayne@68	678
jpayne@68	679 add_sam_header : bool
jpayne@68	680 when outputting SAM the default is to output a header. This is
jpayne@68	681 equivalent to opening the file in 'wh' mode. If this option is
jpayne@68	682 set to False, no header will be output. To read such a file,
jpayne@68	683 set `check_header=False`.
jpayne@68	684
jpayne@68	685 check_header : bool
jpayne@68	686 obsolete: when reading a SAM file, check if header is present
jpayne@68	687 (default=True)
jpayne@68	688
jpayne@68	689 check_sq : bool
jpayne@68	690 when reading, check if SQ entries are present in header
jpayne@68	691 (default=True)
jpayne@68	692
jpayne@68	693 reference_filename : string
jpayne@68	694 Path to a FASTA-formatted reference file. Valid only for CRAM files.
jpayne@68	695 When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
jpayne@68	696 specified in the header (``UR`` tag), which are normally used to find
jpayne@68	697 the reference.
jpayne@68	698
jpayne@68	699 index_filename : string
jpayne@68	700 Explicit path to the index file. Only needed if the index is not
jpayne@68	701 named in the standard manner, not located in the same directory as
jpayne@68	702 the BAM/CRAM file, or is remote. An IOError is raised if the index
jpayne@68	703 cannot be found or is invalid.
jpayne@68	704
jpayne@68	705 filepath_index : string
jpayne@68	706 Alias for `index_filename`.
jpayne@68	707
jpayne@68	708 require_index : bool
jpayne@68	709 When reading, require that an index file is present and is valid or
jpayne@68	710 raise an IOError. (default=False)
jpayne@68	711
jpayne@68	712 filename : string
jpayne@68	713 Alternative to filepath_or_object. Filename of the file
jpayne@68	714 to be opened.
jpayne@68	715
jpayne@68	716 duplicate_filehandle: bool
jpayne@68	717 By default, file handles passed either directly or through
jpayne@68	718 File-like objects will be duplicated before passing them to
jpayne@68	719 htslib. The duplication prevents issues where the same stream
jpayne@68	720 will be closed by htslib and through destruction of the
jpayne@68	721 high-level python object. Set to False to turn off
jpayne@68	722 duplication.
jpayne@68	723
jpayne@68	724 ignore_truncation: bool
jpayne@68	725 Issue a warning, instead of raising an error if the current file
jpayne@68	726 appears to be truncated due to a missing EOF marker. Only applies
jpayne@68	727 to bgzipped formats. (Default=False)
jpayne@68	728
jpayne@68	729 format_options: list
jpayne@68	730 A list of key=value strings, as accepted by --input-fmt-option and
jpayne@68	731 --output-fmt-option in samtools.
jpayne@68	732 threads: integer
jpayne@68	733 Number of threads to use for compressing/decompressing BAM/CRAM files.
jpayne@68	734 Setting threads to > 1 cannot be combined with `ignore_truncation`.
jpayne@68	735 (Default=1)
jpayne@68	736 """
jpayne@68	737
jpayne@68	738 def __cinit__(self, args, *kwargs):
jpayne@68	739 self.htsfile = NULL
jpayne@68	740 self.filename = None
jpayne@68	741 self.mode = None
jpayne@68	742 self.threads = 1
jpayne@68	743 self.is_stream = False
jpayne@68	744 self.is_remote = False
jpayne@68	745 self.index = NULL
jpayne@68	746
jpayne@68	747 if "filename" in kwargs:
jpayne@68	748 args = [kwargs["filename"]]
jpayne@68	749 del kwargs["filename"]
jpayne@68	750
jpayne@68	751 self._open(args, *kwargs)
jpayne@68	752
jpayne@68	753 # allocate memory for iterator
jpayne@68	754 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
jpayne@68	755 if self.b == NULL:
jpayne@68	756 raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t)))
jpayne@68	757
jpayne@68	758 def has_index(self):
jpayne@68	759 """return true if htsfile has an existing (and opened) index.
jpayne@68	760 """
jpayne@68	761 return self.index != NULL
jpayne@68	762
jpayne@68	763 def check_index(self):
jpayne@68	764 """return True if index is present.
jpayne@68	765
jpayne@68	766 Raises
jpayne@68	767 ------
jpayne@68	768
jpayne@68	769 AttributeError
jpayne@68	770 if htsfile is :term:`SAM` formatted and thus has no index.
jpayne@68	771
jpayne@68	772 ValueError
jpayne@68	773 if htsfile is closed or index could not be opened.
jpayne@68	774 """
jpayne@68	775
jpayne@68	776 if not self.is_open:
jpayne@68	777 raise ValueError("I/O operation on closed file")
jpayne@68	778 if not self.is_bam and not self.is_cram:
jpayne@68	779 raise AttributeError(
jpayne@68	780 "AlignmentFile.mapped only available in bam files")
jpayne@68	781 if self.index == NULL:
jpayne@68	782 raise ValueError(
jpayne@68	783 "mapping information not recorded in index "
jpayne@68	784 "or index not available")
jpayne@68	785 return True
jpayne@68	786
jpayne@68	787 def _open(self,
jpayne@68	788 filepath_or_object,
jpayne@68	789 mode=None,
jpayne@68	790 AlignmentFile template=None,
jpayne@68	791 reference_names=None,
jpayne@68	792 reference_lengths=None,
jpayne@68	793 reference_filename=None,
jpayne@68	794 text=None,
jpayne@68	795 header=None,
jpayne@68	796 port=None,
jpayne@68	797 add_sq_text=True,
jpayne@68	798 add_sam_header=True,
jpayne@68	799 check_header=True,
jpayne@68	800 check_sq=True,
jpayne@68	801 index_filename=None,
jpayne@68	802 filepath_index=None,
jpayne@68	803 require_index=False,
jpayne@68	804 referencenames=None,
jpayne@68	805 referencelengths=None,
jpayne@68	806 duplicate_filehandle=True,
jpayne@68	807 ignore_truncation=False,
jpayne@68	808 format_options=None,
jpayne@68	809 threads=1):
jpayne@68	810 '''open a sam, bam or cram formatted file.
jpayne@68	811
jpayne@68	812 If _open is called on an existing file, the current file
jpayne@68	813 will be closed and a new file will be opened.
jpayne@68	814
jpayne@68	815 '''
jpayne@68	816 cdef char *cfilename = NULL
jpayne@68	817 cdef char *creference_filename = NULL
jpayne@68	818 cdef char *cindexname = NULL
jpayne@68	819 cdef char *cmode = NULL
jpayne@68	820 cdef bam_hdr_t * hdr = NULL
jpayne@68	821
jpayne@68	822 if threads > 1 and ignore_truncation:
jpayne@68	823 # This won't raise errors if reaching a truncated alignment,
jpayne@68	824 # because bgzf_mt_reader in htslib does not deal with
jpayne@68	825 # bgzf_mt_read_block returning non-zero values, contrary
jpayne@68	826 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
jpayne@68	827 # Better to avoid this (for now) than to produce seemingly correct results.
jpayne@68	828 raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
jpayne@68	829 self.threads = threads
jpayne@68	830
jpayne@68	831 # for backwards compatibility:
jpayne@68	832 if referencenames is not None:
jpayne@68	833 reference_names = referencenames
jpayne@68	834 if referencelengths is not None:
jpayne@68	835 reference_lengths = referencelengths
jpayne@68	836
jpayne@68	837 # close a previously opened file
jpayne@68	838 if self.is_open:
jpayne@68	839 self.close()
jpayne@68	840
jpayne@68	841 # autodetection for read
jpayne@68	842 if mode is None:
jpayne@68	843 mode = "r"
jpayne@68	844
jpayne@68	845 if add_sam_header and mode == "w":
jpayne@68	846 mode = "wh"
jpayne@68	847
jpayne@68	848 assert mode in ("r", "w", "rb", "wb", "wh",
jpayne@68	849 "wbu", "rU", "wb0",
jpayne@68	850 "rc", "wc"), \
jpayne@68	851 "invalid file opening mode `%s`" % mode
jpayne@68	852
jpayne@68	853 self.duplicate_filehandle = duplicate_filehandle
jpayne@68	854
jpayne@68	855 # StringIO not supported
jpayne@68	856 if isinstance(filepath_or_object, StringIO):
jpayne@68	857 raise NotImplementedError(
jpayne@68	858 "access from StringIO objects not supported")
jpayne@68	859 # reading from a file descriptor
jpayne@68	860 elif isinstance(filepath_or_object, int):
jpayne@68	861 self.filename = filepath_or_object
jpayne@68	862 filename = None
jpayne@68	863 self.is_remote = False
jpayne@68	864 self.is_stream = True
jpayne@68	865 # reading from a File object or other object with fileno
jpayne@68	866 elif hasattr(filepath_or_object, "fileno"):
jpayne@68	867 if filepath_or_object.closed:
jpayne@68	868 raise ValueError('I/O operation on closed file')
jpayne@68	869 self.filename = filepath_or_object
jpayne@68	870 # .name can be TextIOWrapper
jpayne@68	871 try:
jpayne@68	872 filename = encode_filename(str(filepath_or_object.name))
jpayne@68	873 cfilename = filename
jpayne@68	874 except AttributeError:
jpayne@68	875 filename = None
jpayne@68	876 self.is_remote = False
jpayne@68	877 self.is_stream = True
jpayne@68	878 # what remains is a filename
jpayne@68	879 else:
jpayne@68	880 self.filename = filename = encode_filename(filepath_or_object)
jpayne@68	881 cfilename = filename
jpayne@68	882 self.is_remote = hisremote(cfilename)
jpayne@68	883 self.is_stream = self.filename == b'-'
jpayne@68	884
jpayne@68	885 # for htslib, wbu seems to not work
jpayne@68	886 if mode == "wbu":
jpayne@68	887 mode = "wb0"
jpayne@68	888
jpayne@68	889 self.mode = force_bytes(mode)
jpayne@68	890 self.reference_filename = reference_filename = encode_filename(
jpayne@68	891 reference_filename)
jpayne@68	892
jpayne@68	893 if mode[0] == 'w':
jpayne@68	894 # open file for writing
jpayne@68	895
jpayne@68	896 if not (template or header or text or (reference_names and reference_lengths)):
jpayne@68	897 raise ValueError(
jpayne@68	898 "either supply options `template`, `header`, `text` or both `reference_names` "
jpayne@68	899 "and `reference_lengths` for writing")
jpayne@68	900
jpayne@68	901 if template:
jpayne@68	902 # header is copied, though at the moment not strictly
jpayne@68	903 # necessary as AlignmentHeader is immutable.
jpayne@68	904 self.header = template.header.copy()
jpayne@68	905 elif isinstance(header, AlignmentHeader):
jpayne@68	906 self.header = header.copy()
jpayne@68	907 elif isinstance(header, Mapping):
jpayne@68	908 self.header = AlignmentHeader.from_dict(header)
jpayne@68	909 elif reference_names and reference_lengths:
jpayne@68	910 self.header = AlignmentHeader.from_references(
jpayne@68	911 reference_names,
jpayne@68	912 reference_lengths,
jpayne@68	913 add_sq_text=add_sq_text,
jpayne@68	914 text=text)
jpayne@68	915 elif text:
jpayne@68	916 self.header = AlignmentHeader.from_text(text)
jpayne@68	917 else:
jpayne@68	918 raise ValueError("not enough information to construct header. Please provide template, "
jpayne@68	919 "header, text or reference_names/reference_lengths")
jpayne@68	920 self.htsfile = self._open_htsfile()
jpayne@68	921
jpayne@68	922 if self.htsfile == NULL:
jpayne@68	923 if errno:
jpayne@68	924 raise IOError(errno, "could not open alignment file `{}`: {}".format(
jpayne@68	925 force_str(filename),
jpayne@68	926 force_str(strerror(errno))))
jpayne@68	927 else:
jpayne@68	928 raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
jpayne@68	929 if format_options and len(format_options):
jpayne@68	930 self.add_hts_options(format_options)
jpayne@68	931 # set filename with reference sequences. If no filename
jpayne@68	932 # is given, the CRAM reference arrays will be built from
jpayne@68	933 # the @SQ header in the header
jpayne@68	934 if "c" in mode and reference_filename:
jpayne@68	935 if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0):
jpayne@68	936 raise ValueError("failure when setting reference filename")
jpayne@68	937
jpayne@68	938 # write header to htsfile
jpayne@68	939 if "b" in mode or "c" in mode or "h" in mode:
jpayne@68	940 hdr = self.header.ptr
jpayne@68	941 with nogil:
jpayne@68	942 sam_hdr_write(self.htsfile, hdr)
jpayne@68	943
jpayne@68	944 elif mode[0] == "r":
jpayne@68	945 # open file for reading
jpayne@68	946 self.htsfile = self._open_htsfile()
jpayne@68	947
jpayne@68	948 if self.htsfile == NULL:
jpayne@68	949 if errno:
jpayne@68	950 raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
jpayne@68	951 force_str(strerror(errno))))
jpayne@68	952 else:
jpayne@68	953 raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
jpayne@68	954
jpayne@68	955 if self.htsfile.format.category != sequence_data:
jpayne@68	956 raise ValueError("file does not contain alignment data")
jpayne@68	957
jpayne@68	958 if format_options and len(format_options):
jpayne@68	959 self.add_hts_options(format_options)
jpayne@68	960
jpayne@68	961 self.check_truncation(ignore_truncation)
jpayne@68	962
jpayne@68	963 # bam/cram files require a valid header
jpayne@68	964 if self.is_bam or self.is_cram:
jpayne@68	965 with nogil:
jpayne@68	966 hdr = sam_hdr_read(self.htsfile)
jpayne@68	967 if hdr == NULL:
jpayne@68	968 raise ValueError(
jpayne@68	969 "file does not have a valid header (mode='%s') "
jpayne@68	970 "- is it BAM/CRAM format?" % mode)
jpayne@68	971 self.header = makeAlignmentHeader(hdr)
jpayne@68	972 else:
jpayne@68	973 # in sam files a header is optional. If not given,
jpayne@68	974 # user may provide reference names and lengths to built
jpayne@68	975 # an on-the-fly header.
jpayne@68	976 if reference_names and reference_lengths:
jpayne@68	977 # build header from a target names and lengths
jpayne@68	978 self.header = AlignmentHeader.from_references(
jpayne@68	979 reference_names=reference_names,
jpayne@68	980 reference_lengths=reference_lengths,
jpayne@68	981 add_sq_text=add_sq_text,
jpayne@68	982 text=text)
jpayne@68	983 else:
jpayne@68	984 with nogil:
jpayne@68	985 hdr = sam_hdr_read(self.htsfile)
jpayne@68	986 if hdr == NULL:
jpayne@68	987 raise ValueError(
jpayne@68	988 "SAM? file does not have a valid header (mode='%s'), "
jpayne@68	989 "please provide reference_names and reference_lengths")
jpayne@68	990 self.header = makeAlignmentHeader(hdr)
jpayne@68	991
jpayne@68	992 # set filename with reference sequences
jpayne@68	993 if self.is_cram and reference_filename:
jpayne@68	994 creference_filename = self.reference_filename
jpayne@68	995 hts_set_opt(self.htsfile,
jpayne@68	996 CRAM_OPT_REFERENCE,
jpayne@68	997 creference_filename)
jpayne@68	998
jpayne@68	999 if check_sq and self.header.nreferences == 0:
jpayne@68	1000 raise ValueError(
jpayne@68	1001 ("file has no sequences defined (mode='%s') - "
jpayne@68	1002 "is it SAM/BAM format? Consider opening with "
jpayne@68	1003 "check_sq=False") % mode)
jpayne@68	1004
jpayne@68	1005 if self.is_bam or self.is_cram:
jpayne@68	1006 self.index_filename = index_filename or filepath_index
jpayne@68	1007 if self.index_filename:
jpayne@68	1008 cindexname = bfile_name = encode_filename(self.index_filename)
jpayne@68	1009
jpayne@68	1010 if cfilename or cindexname:
jpayne@68	1011 with nogil:
jpayne@68	1012 self.index = sam_index_load3(self.htsfile, cfilename, cindexname,
jpayne@68	1013 HTS_IDX_SAVE_REMOTE\|HTS_IDX_SILENT_FAIL)
jpayne@68	1014
jpayne@68	1015 if not self.index and (cindexname or require_index):
jpayne@68	1016 if errno:
jpayne@68	1017 raise IOError(errno, force_str(strerror(errno)))
jpayne@68	1018 else:
jpayne@68	1019 raise IOError('unable to open index file `%s`' % self.index_filename)
jpayne@68	1020
jpayne@68	1021 elif require_index:
jpayne@68	1022 raise IOError('unable to open index file')
jpayne@68	1023
jpayne@68	1024 # save start of data section
jpayne@68	1025 if not self.is_stream:
jpayne@68	1026 self.start_offset = self.tell()
jpayne@68	1027
jpayne@68	1028 def fetch(self,
jpayne@68	1029 contig=None,
jpayne@68	1030 start=None,
jpayne@68	1031 stop=None,
jpayne@68	1032 region=None,
jpayne@68	1033 tid=None,
jpayne@68	1034 until_eof=False,
jpayne@68	1035 multiple_iterators=False,
jpayne@68	1036 reference=None,
jpayne@68	1037 end=None):
jpayne@68	1038 """fetch reads aligned in a :term:`region`.
jpayne@68	1039
jpayne@68	1040 See :meth:`~pysam.HTSFile.parse_region` for more information
jpayne@68	1041 on how genomic regions can be specified. :term:`reference` and
jpayne@68	1042 `end` are also accepted for backward compatibility as synonyms
jpayne@68	1043 for :term:`contig` and `stop`, respectively.
jpayne@68	1044
jpayne@68	1045 Without a `contig` or `region` all mapped reads in the file
jpayne@68	1046 will be fetched. The reads will be returned ordered by reference
jpayne@68	1047 sequence, which will not necessarily be the order within the
jpayne@68	1048 file. This mode of iteration still requires an index. If there is
jpayne@68	1049 no index, use `until_eof=True`.
jpayne@68	1050
jpayne@68	1051 If only `contig` is set, all reads aligned to `contig`
jpayne@68	1052 will be fetched.
jpayne@68	1053
jpayne@68	1054 A :term:`SAM` file does not allow random access. If `region`
jpayne@68	1055 or `contig` are given, an exception is raised.
jpayne@68	1056
jpayne@68	1057 Parameters
jpayne@68	1058 ----------
jpayne@68	1059
jpayne@68	1060 until_eof : bool
jpayne@68	1061
jpayne@68	1062 If `until_eof` is True, all reads from the current file
jpayne@68	1063 position will be returned in order as they are within the
jpayne@68	1064 file. Using this option will also fetch unmapped reads.
jpayne@68	1065
jpayne@68	1066 multiple_iterators : bool
jpayne@68	1067
jpayne@68	1068 If `multiple_iterators` is True, multiple
jpayne@68	1069 iterators on the same file can be used at the same time. The
jpayne@68	1070 iterator returned will receive its own copy of a filehandle to
jpayne@68	1071 the file effectively re-opening the file. Re-opening a file
jpayne@68	1072 creates some overhead, so beware.
jpayne@68	1073
jpayne@68	1074 Returns
jpayne@68	1075 -------
jpayne@68	1076
jpayne@68	1077 An iterator over a collection of reads. : IteratorRow
jpayne@68	1078
jpayne@68	1079 Raises
jpayne@68	1080 ------
jpayne@68	1081
jpayne@68	1082 ValueError
jpayne@68	1083 if the genomic coordinates are out of range or invalid or the
jpayne@68	1084 file does not permit random access to genomic coordinates.
jpayne@68	1085
jpayne@68	1086 """
jpayne@68	1087 cdef int rtid, rstart, rstop, has_coord
jpayne@68	1088
jpayne@68	1089 if not self.is_open:
jpayne@68	1090 raise ValueError( "I/O operation on closed file" )
jpayne@68	1091
jpayne@68	1092 has_coord, rtid, rstart, rstop = self.parse_region(
jpayne@68	1093 contig, start, stop, region, tid,
jpayne@68	1094 end=end, reference=reference)
jpayne@68	1095
jpayne@68	1096 # Turn of re-opening if htsfile is a stream
jpayne@68	1097 if self.is_stream:
jpayne@68	1098 multiple_iterators = False
jpayne@68	1099
jpayne@68	1100 if self.is_bam or self.is_cram:
jpayne@68	1101 if not until_eof and not self.is_remote:
jpayne@68	1102 if not self.has_index():
jpayne@68	1103 raise ValueError(
jpayne@68	1104 "fetch called on bamfile without index")
jpayne@68	1105
jpayne@68	1106 if has_coord:
jpayne@68	1107 return IteratorRowRegion(
jpayne@68	1108 self, rtid, rstart, rstop,
jpayne@68	1109 multiple_iterators=multiple_iterators)
jpayne@68	1110 else:
jpayne@68	1111 if until_eof:
jpayne@68	1112 return IteratorRowAll(
jpayne@68	1113 self,
jpayne@68	1114 multiple_iterators=multiple_iterators)
jpayne@68	1115 else:
jpayne@68	1116 # AH: check - reason why no multiple_iterators for
jpayne@68	1117 # AllRefs?
jpayne@68	1118 return IteratorRowAllRefs(
jpayne@68	1119 self,
jpayne@68	1120 multiple_iterators=multiple_iterators)
jpayne@68	1121 else:
jpayne@68	1122 if has_coord:
jpayne@68	1123 raise ValueError(
jpayne@68	1124 "fetching by region is not available for SAM files")
jpayne@68	1125
jpayne@68	1126 if multiple_iterators == True:
jpayne@68	1127 raise ValueError(
jpayne@68	1128 "multiple iterators not implemented for SAM files")
jpayne@68	1129
jpayne@68	1130 return IteratorRowAll(self,
jpayne@68	1131 multiple_iterators=multiple_iterators)
jpayne@68	1132
jpayne@68	1133 def head(self, n, multiple_iterators=True):
jpayne@68	1134 '''return an iterator over the first n alignments.
jpayne@68	1135
jpayne@68	1136 This iterator is is useful for inspecting the bam-file.
jpayne@68	1137
jpayne@68	1138 Parameters
jpayne@68	1139 ----------
jpayne@68	1140
jpayne@68	1141 multiple_iterators : bool
jpayne@68	1142
jpayne@68	1143 is set to True by default in order to
jpayne@68	1144 avoid changing the current file position.
jpayne@68	1145
jpayne@68	1146 Returns
jpayne@68	1147 -------
jpayne@68	1148
jpayne@68	1149 an iterator over a collection of reads : IteratorRowHead
jpayne@68	1150
jpayne@68	1151 '''
jpayne@68	1152 return IteratorRowHead(self, n,
jpayne@68	1153 multiple_iterators=multiple_iterators)
jpayne@68	1154
jpayne@68	1155 def mate(self, AlignedSegment read):
jpayne@68	1156 '''return the mate of :class:`pysam.AlignedSegment` `read`.
jpayne@68	1157
jpayne@68	1158 .. note::
jpayne@68	1159
jpayne@68	1160 Calling this method will change the file position.
jpayne@68	1161 This might interfere with any iterators that have
jpayne@68	1162 not re-opened the file.
jpayne@68	1163
jpayne@68	1164 .. note::
jpayne@68	1165
jpayne@68	1166 This method is too slow for high-throughput processing.
jpayne@68	1167 If a read needs to be processed with its mate, work
jpayne@68	1168 from a read name sorted file or, better, cache reads.
jpayne@68	1169
jpayne@68	1170 Returns
jpayne@68	1171 -------
jpayne@68	1172
jpayne@68	1173 the mate : AlignedSegment
jpayne@68	1174
jpayne@68	1175 Raises
jpayne@68	1176 ------
jpayne@68	1177
jpayne@68	1178 ValueError
jpayne@68	1179 if the read is unpaired or the mate is unmapped
jpayne@68	1180
jpayne@68	1181 '''
jpayne@68	1182 cdef uint32_t flag = read._delegate.core.flag
jpayne@68	1183
jpayne@68	1184 if flag & BAM_FPAIRED == 0:
jpayne@68	1185 raise ValueError("read %s: is unpaired" %
jpayne@68	1186 (read.query_name))
jpayne@68	1187 if flag & BAM_FMUNMAP != 0:
jpayne@68	1188 raise ValueError("mate %s: is unmapped" %
jpayne@68	1189 (read.query_name))
jpayne@68	1190
jpayne@68	1191 # xor flags to get the other mate
jpayne@68	1192 cdef int x = BAM_FREAD1 + BAM_FREAD2
jpayne@68	1193 flag = (flag ^ x) & x
jpayne@68	1194
jpayne@68	1195 # Make sure to use a separate file to jump around
jpayne@68	1196 # to mate as otherwise the original file position
jpayne@68	1197 # will be lost
jpayne@68	1198 # The following code is not using the C API and
jpayne@68	1199 # could thus be made much quicker, for example
jpayne@68	1200 # by using tell and seek.
jpayne@68	1201 for mate in self.fetch(
jpayne@68	1202 read._delegate.core.mpos,
jpayne@68	1203 read._delegate.core.mpos + 1,
jpayne@68	1204 tid=read._delegate.core.mtid,
jpayne@68	1205 multiple_iterators=True):
jpayne@68	1206 if mate.flag & flag != 0 and \
jpayne@68	1207 mate.query_name == read.query_name:
jpayne@68	1208 break
jpayne@68	1209 else:
jpayne@68	1210 raise ValueError("mate not found")
jpayne@68	1211
jpayne@68	1212 return mate
jpayne@68	1213
jpayne@68	1214 def pileup(self,
jpayne@68	1215 contig=None,
jpayne@68	1216 start=None,
jpayne@68	1217 stop=None,
jpayne@68	1218 region=None,
jpayne@68	1219 reference=None,
jpayne@68	1220 end=None,
jpayne@68	1221 **kwargs):
jpayne@68	1222 """perform a :term:`pileup` within a :term:`region`. The region is
jpayne@68	1223 specified by :term:`contig`, `start` and `stop` (using
jpayne@68	1224 0-based indexing). :term:`reference` and `end` are also accepted for
jpayne@68	1225 backward compatibility as synonyms for :term:`contig` and `stop`,
jpayne@68	1226 respectively. Alternatively, a samtools 'region' string
jpayne@68	1227 can be supplied.
jpayne@68	1228
jpayne@68	1229 Without 'contig' or 'region' all reads will be used for the
jpayne@68	1230 pileup. The reads will be returned ordered by
jpayne@68	1231 :term:`contig` sequence, which will not necessarily be the
jpayne@68	1232 order within the file.
jpayne@68	1233
jpayne@68	1234 Note that :term:`SAM` formatted files do not allow random
jpayne@68	1235 access. In these files, if a 'region' or 'contig' are
jpayne@68	1236 given an exception is raised.
jpayne@68	1237
jpayne@68	1238 .. note::
jpayne@68	1239
jpayne@68	1240 'all' reads which overlap the region are returned. The
jpayne@68	1241 first base returned will be the first base of the first
jpayne@68	1242 read 'not' necessarily the first base of the region used
jpayne@68	1243 in the query.
jpayne@68	1244
jpayne@68	1245 Parameters
jpayne@68	1246 ----------
jpayne@68	1247
jpayne@68	1248 truncate : bool
jpayne@68	1249
jpayne@68	1250 By default, the samtools pileup engine outputs all reads
jpayne@68	1251 overlapping a region. If truncate is True and a region is
jpayne@68	1252 given, only columns in the exact region specified are
jpayne@68	1253 returned.
jpayne@68	1254
jpayne@68	1255 max_depth : int
jpayne@68	1256 Maximum read depth permitted. The default limit is '8000'.
jpayne@68	1257
jpayne@68	1258 stepper : string
jpayne@68	1259 The stepper controls how the iterator advances.
jpayne@68	1260 Possible options for the stepper are
jpayne@68	1261
jpayne@68	1262 ``all``
jpayne@68	1263 skip reads in which any of the following flags are set:
jpayne@68	1264 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
jpayne@68	1265
jpayne@68	1266 ``nofilter``
jpayne@68	1267 uses every single read turning off any filtering.
jpayne@68	1268
jpayne@68	1269 ``samtools``
jpayne@68	1270 same filter and read processing as in samtools
jpayne@68	1271 pileup. For full compatibility, this requires a
jpayne@68	1272 'fastafile' to be given. The following options all pertain
jpayne@68	1273 to filtering of the ``samtools`` stepper.
jpayne@68	1274
jpayne@68	1275 fastafile : :class:`~pysam.FastaFile` object.
jpayne@68	1276
jpayne@68	1277 This is required for some of the steppers.
jpayne@68	1278
jpayne@68	1279 ignore_overlaps: bool
jpayne@68	1280
jpayne@68	1281 If set to True, detect if read pairs overlap and only take
jpayne@68	1282 the higher quality base. This is the default.
jpayne@68	1283
jpayne@68	1284 flag_filter : int
jpayne@68	1285
jpayne@68	1286 ignore reads where any of the bits in the flag are set. The default is
jpayne@68	1287 BAM_FUNMAP \| BAM_FSECONDARY \| BAM_FQCFAIL \| BAM_FDUP.
jpayne@68	1288
jpayne@68	1289 flag_require : int
jpayne@68	1290
jpayne@68	1291 only use reads where certain flags are set. The default is 0.
jpayne@68	1292
jpayne@68	1293 ignore_orphans: bool
jpayne@68	1294
jpayne@68	1295 ignore orphans (paired reads that are not in a proper pair).
jpayne@68	1296 The default is to ignore orphans.
jpayne@68	1297
jpayne@68	1298 min_base_quality: int
jpayne@68	1299
jpayne@68	1300 Minimum base quality. Bases below the minimum quality will
jpayne@68	1301 not be output. The default is 13.
jpayne@68	1302
jpayne@68	1303 adjust_capq_threshold: int
jpayne@68	1304
jpayne@68	1305 adjust mapping quality. The default is 0 for no
jpayne@68	1306 adjustment. The recommended value for adjustment is 50.
jpayne@68	1307
jpayne@68	1308 min_mapping_quality : int
jpayne@68	1309
jpayne@68	1310 only use reads above a minimum mapping quality. The default is 0.
jpayne@68	1311
jpayne@68	1312 compute_baq: bool
jpayne@68	1313
jpayne@68	1314 re-alignment computing per-Base Alignment Qualities (BAQ). The
jpayne@68	1315 default is to do re-alignment. Realignment requires a reference
jpayne@68	1316 sequence. If none is present, no realignment will be performed.
jpayne@68	1317
jpayne@68	1318 redo_baq: bool
jpayne@68	1319
jpayne@68	1320 recompute per-Base Alignment Quality on the fly ignoring
jpayne@68	1321 existing base qualities. The default is False (use existing
jpayne@68	1322 base qualities).
jpayne@68	1323
jpayne@68	1324 Returns
jpayne@68	1325 -------
jpayne@68	1326
jpayne@68	1327 an iterator over genomic positions. : IteratorColumn
jpayne@68	1328
jpayne@68	1329 """
jpayne@68	1330 cdef int rtid, has_coord
jpayne@68	1331 cdef int32_t rstart, rstop
jpayne@68	1332
jpayne@68	1333 if not self.is_open:
jpayne@68	1334 raise ValueError("I/O operation on closed file")
jpayne@68	1335
jpayne@68	1336 has_coord, rtid, rstart, rstop = self.parse_region(
jpayne@68	1337 contig, start, stop, region, reference=reference, end=end)
jpayne@68	1338
jpayne@68	1339 if has_coord:
jpayne@68	1340 if not self.has_index():
jpayne@68	1341 raise ValueError("no index available for pileup")
jpayne@68	1342
jpayne@68	1343 return IteratorColumnRegion(self,
jpayne@68	1344 tid=rtid,
jpayne@68	1345 start=rstart,
jpayne@68	1346 stop=rstop,
jpayne@68	1347 **kwargs)
jpayne@68	1348 else:
jpayne@68	1349 if self.has_index():
jpayne@68	1350 return IteratorColumnAllRefs(self, **kwargs)
jpayne@68	1351 else:
jpayne@68	1352 return IteratorColumnAll(self, **kwargs)
jpayne@68	1353
jpayne@68	1354 def count(self,
jpayne@68	1355 contig=None,
jpayne@68	1356 start=None,
jpayne@68	1357 stop=None,
jpayne@68	1358 region=None,
jpayne@68	1359 until_eof=False,
jpayne@68	1360 read_callback="nofilter",
jpayne@68	1361 reference=None,
jpayne@68	1362 end=None):
jpayne@68	1363 '''count the number of reads in :term:`region`
jpayne@68	1364
jpayne@68	1365 The region is specified by :term:`contig`, `start` and `stop`.
jpayne@68	1366 :term:`reference` and `end` are also accepted for backward
jpayne@68	1367 compatibility as synonyms for :term:`contig` and `stop`,
jpayne@68	1368 respectively. Alternatively, a `samtools`_ :term:`region`
jpayne@68	1369 string can be supplied.
jpayne@68	1370
jpayne@68	1371 A :term:`SAM` file does not allow random access and if
jpayne@68	1372 `region` or `contig` are given, an exception is raised.
jpayne@68	1373
jpayne@68	1374 Parameters
jpayne@68	1375 ----------
jpayne@68	1376
jpayne@68	1377 contig : string
jpayne@68	1378 reference_name of the genomic region (chromosome)
jpayne@68	1379
jpayne@68	1380 start : int
jpayne@68	1381 start of the genomic region (0-based inclusive)
jpayne@68	1382
jpayne@68	1383 stop : int
jpayne@68	1384 end of the genomic region (0-based exclusive)
jpayne@68	1385
jpayne@68	1386 region : string
jpayne@68	1387 a region string in samtools format.
jpayne@68	1388
jpayne@68	1389 until_eof : bool
jpayne@68	1390 count until the end of the file, possibly including
jpayne@68	1391 unmapped reads as well.
jpayne@68	1392
jpayne@68	1393 read_callback: string or function
jpayne@68	1394
jpayne@68	1395 select a call-back to ignore reads when counting. It can
jpayne@68	1396 be either a string with the following values:
jpayne@68	1397
jpayne@68	1398 ``all``
jpayne@68	1399 skip reads in which any of the following
jpayne@68	1400 flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
jpayne@68	1401 BAM_FDUP
jpayne@68	1402
jpayne@68	1403 ``nofilter``
jpayne@68	1404 uses every single read
jpayne@68	1405
jpayne@68	1406 Alternatively, `read_callback` can be a function
jpayne@68	1407 ``check_read(read)`` that should return True only for
jpayne@68	1408 those reads that shall be included in the counting.
jpayne@68	1409
jpayne@68	1410 reference : string
jpayne@68	1411 backward compatible synonym for `contig`
jpayne@68	1412
jpayne@68	1413 end : int
jpayne@68	1414 backward compatible synonym for `stop`
jpayne@68	1415
jpayne@68	1416 Raises
jpayne@68	1417 ------
jpayne@68	1418
jpayne@68	1419 ValueError
jpayne@68	1420 if the genomic coordinates are out of range or invalid.
jpayne@68	1421
jpayne@68	1422 '''
jpayne@68	1423 cdef AlignedSegment read
jpayne@68	1424 cdef long counter = 0
jpayne@68	1425
jpayne@68	1426 if not self.is_open:
jpayne@68	1427 raise ValueError("I/O operation on closed file")
jpayne@68	1428
jpayne@68	1429 cdef int filter_method = 0
jpayne@68	1430 if read_callback == "all":
jpayne@68	1431 filter_method = 1
jpayne@68	1432 elif read_callback == "nofilter":
jpayne@68	1433 filter_method = 2
jpayne@68	1434
jpayne@68	1435 for read in self.fetch(contig=contig,
jpayne@68	1436 start=start,
jpayne@68	1437 stop=stop,
jpayne@68	1438 reference=reference,
jpayne@68	1439 end=end,
jpayne@68	1440 region=region,
jpayne@68	1441 until_eof=until_eof):
jpayne@68	1442 # apply filter
jpayne@68	1443 if filter_method == 1:
jpayne@68	1444 # filter = "all"
jpayne@68	1445 if (read.flag & (0x4 \| 0x100 \| 0x200 \| 0x400)):
jpayne@68	1446 continue
jpayne@68	1447 elif filter_method == 2:
jpayne@68	1448 # filter = "nofilter"
jpayne@68	1449 pass
jpayne@68	1450 else:
jpayne@68	1451 if not read_callback(read):
jpayne@68	1452 continue
jpayne@68	1453 counter += 1
jpayne@68	1454
jpayne@68	1455 return counter
jpayne@68	1456
jpayne@68	1457 @cython.boundscheck(False) # we do manual bounds checking
jpayne@68	1458 def count_coverage(self,
jpayne@68	1459 contig,
jpayne@68	1460 start=None,
jpayne@68	1461 stop=None,
jpayne@68	1462 region=None,
jpayne@68	1463 quality_threshold=15,
jpayne@68	1464 read_callback='all',
jpayne@68	1465 reference=None,
jpayne@68	1466 end=None):
jpayne@68	1467 """count the coverage of genomic positions by reads in :term:`region`.
jpayne@68	1468
jpayne@68	1469 The region is specified by :term:`contig`, `start` and `stop`.
jpayne@68	1470 :term:`reference` and `end` are also accepted for backward
jpayne@68	1471 compatibility as synonyms for :term:`contig` and `stop`,
jpayne@68	1472 respectively. Alternatively, a `samtools`_ :term:`region`
jpayne@68	1473 string can be supplied. The coverage is computed per-base [ACGT].
jpayne@68	1474
jpayne@68	1475 Parameters
jpayne@68	1476 ----------
jpayne@68	1477
jpayne@68	1478 contig : string
jpayne@68	1479 reference_name of the genomic region (chromosome)
jpayne@68	1480
jpayne@68	1481 start : int
jpayne@68	1482 start of the genomic region (0-based inclusive). If not
jpayne@68	1483 given, count from the start of the chromosome.
jpayne@68	1484
jpayne@68	1485 stop : int
jpayne@68	1486 end of the genomic region (0-based exclusive). If not given,
jpayne@68	1487 count to the end of the chromosome.
jpayne@68	1488
jpayne@68	1489 region : string
jpayne@68	1490 a region string.
jpayne@68	1491
jpayne@68	1492 quality_threshold : int
jpayne@68	1493 quality_threshold is the minimum quality score (in phred) a
jpayne@68	1494 base has to reach to be counted.
jpayne@68	1495
jpayne@68	1496 read_callback: string or function
jpayne@68	1497
jpayne@68	1498 select a call-back to ignore reads when counting. It can
jpayne@68	1499 be either a string with the following values:
jpayne@68	1500
jpayne@68	1501 ``all``
jpayne@68	1502 skip reads in which any of the following
jpayne@68	1503 flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
jpayne@68	1504 BAM_FDUP
jpayne@68	1505
jpayne@68	1506 ``nofilter``
jpayne@68	1507 uses every single read
jpayne@68	1508
jpayne@68	1509 Alternatively, `read_callback` can be a function
jpayne@68	1510 ``check_read(read)`` that should return True only for
jpayne@68	1511 those reads that shall be included in the counting.
jpayne@68	1512
jpayne@68	1513 reference : string
jpayne@68	1514 backward compatible synonym for `contig`
jpayne@68	1515
jpayne@68	1516 end : int
jpayne@68	1517 backward compatible synonym for `stop`
jpayne@68	1518
jpayne@68	1519 Raises
jpayne@68	1520 ------
jpayne@68	1521
jpayne@68	1522 ValueError
jpayne@68	1523 if the genomic coordinates are out of range or invalid.
jpayne@68	1524
jpayne@68	1525 Returns
jpayne@68	1526 -------
jpayne@68	1527
jpayne@68	1528 four array.arrays of the same length in order A C G T : tuple
jpayne@68	1529
jpayne@68	1530 """
jpayne@68	1531
jpayne@68	1532 cdef uint32_t contig_length = self.get_reference_length(contig)
jpayne@68	1533 cdef int _start = start if start is not None else 0
jpayne@68	1534 cdef int _stop = stop if stop is not None else contig_length
jpayne@68	1535 _stop = _stop if _stop < contig_length else contig_length
jpayne@68	1536
jpayne@68	1537 if _stop == _start:
jpayne@68	1538 raise ValueError("interval of size 0")
jpayne@68	1539 if _stop < _start:
jpayne@68	1540 raise ValueError("interval of size less than 0")
jpayne@68	1541
jpayne@68	1542 cdef int length = _stop - _start
jpayne@68	1543 cdef c_array.array int_array_template = array.array('L', [])
jpayne@68	1544 cdef c_array.array count_a
jpayne@68	1545 cdef c_array.array count_c
jpayne@68	1546 cdef c_array.array count_g
jpayne@68	1547 cdef c_array.array count_t
jpayne@68	1548 count_a = c_array.clone(int_array_template, length, zero=True)
jpayne@68	1549 count_c = c_array.clone(int_array_template, length, zero=True)
jpayne@68	1550 count_g = c_array.clone(int_array_template, length, zero=True)
jpayne@68	1551 count_t = c_array.clone(int_array_template, length, zero=True)
jpayne@68	1552
jpayne@68	1553 cdef AlignedSegment read
jpayne@68	1554 cdef cython.str seq
jpayne@68	1555 cdef c_array.array quality
jpayne@68	1556 cdef int qpos
jpayne@68	1557 cdef int refpos
jpayne@68	1558 cdef int c = 0
jpayne@68	1559 cdef int filter_method = 0
jpayne@68	1560
jpayne@68	1561
jpayne@68	1562 if read_callback == "all":
jpayne@68	1563 filter_method = 1
jpayne@68	1564 elif read_callback == "nofilter":
jpayne@68	1565 filter_method = 2
jpayne@68	1566
jpayne@68	1567 cdef int _threshold = quality_threshold or 0
jpayne@68	1568 for read in self.fetch(contig=contig,
jpayne@68	1569 reference=reference,
jpayne@68	1570 start=start,
jpayne@68	1571 stop=stop,
jpayne@68	1572 end=end,
jpayne@68	1573 region=region):
jpayne@68	1574 # apply filter
jpayne@68	1575 if filter_method == 1:
jpayne@68	1576 # filter = "all"
jpayne@68	1577 if (read.flag & (0x4 \| 0x100 \| 0x200 \| 0x400)):
jpayne@68	1578 continue
jpayne@68	1579 elif filter_method == 2:
jpayne@68	1580 # filter = "nofilter"
jpayne@68	1581 pass
jpayne@68	1582 else:
jpayne@68	1583 if not read_callback(read):
jpayne@68	1584 continue
jpayne@68	1585
jpayne@68	1586 # count
jpayne@68	1587 seq = read.seq
jpayne@68	1588 if seq is None:
jpayne@68	1589 continue
jpayne@68	1590 quality = read.query_qualities
jpayne@68	1591
jpayne@68	1592 for qpos, refpos in read.get_aligned_pairs(True):
jpayne@68	1593 if qpos is not None and refpos is not None and \
jpayne@68	1594 _start <= refpos < _stop:
jpayne@68	1595
jpayne@68	1596 # only check base quality if _threshold > 0
jpayne@68	1597 if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
jpayne@68	1598 if seq[qpos] == 'A':
jpayne@68	1599 count_a.data.as_ulongs[refpos - _start] += 1
jpayne@68	1600 if seq[qpos] == 'C':
jpayne@68	1601 count_c.data.as_ulongs[refpos - _start] += 1
jpayne@68	1602 if seq[qpos] == 'G':
jpayne@68	1603 count_g.data.as_ulongs[refpos - _start] += 1
jpayne@68	1604 if seq[qpos] == 'T':
jpayne@68	1605 count_t.data.as_ulongs[refpos - _start] += 1
jpayne@68	1606
jpayne@68	1607 return count_a, count_c, count_g, count_t
jpayne@68	1608
jpayne@68	1609 def find_introns_slow(self, read_iterator):
jpayne@68	1610 """Return a dictionary {(start, stop): count}
jpayne@68	1611 Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
jpayne@68	1612 and their support ( = number of reads ).
jpayne@68	1613
jpayne@68	1614 read_iterator can be the result of a .fetch(...) call.
jpayne@68	1615 Or it can be a generator filtering such reads. Example
jpayne@68	1616 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
jpayne@68	1617 """
jpayne@68	1618 res = collections.Counter()
jpayne@68	1619 for r in read_iterator:
jpayne@68	1620 if 'N' in r.cigarstring:
jpayne@68	1621 last_read_pos = False
jpayne@68	1622 for read_loc, genome_loc in r.get_aligned_pairs():
jpayne@68	1623 if read_loc is None and last_read_pos:
jpayne@68	1624 start = genome_loc
jpayne@68	1625 elif read_loc and last_read_pos is None:
jpayne@68	1626 stop = genome_loc # we are right exclusive ,so this is correct
jpayne@68	1627 res[(start, stop)] += 1
jpayne@68	1628 del start
jpayne@68	1629 del stop
jpayne@68	1630 last_read_pos = read_loc
jpayne@68	1631 return res
jpayne@68	1632
jpayne@68	1633 def find_introns(self, read_iterator):
jpayne@68	1634 """Return a dictionary {(start, stop): count}
jpayne@68	1635 Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
jpayne@68	1636 and their support ( = number of reads ).
jpayne@68	1637
jpayne@68	1638 read_iterator can be the result of a .fetch(...) call.
jpayne@68	1639 Or it can be a generator filtering such reads. Example
jpayne@68	1640 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
jpayne@68	1641 """
jpayne@68	1642 cdef:
jpayne@68	1643 uint32_t base_position, junc_start, nt
jpayne@68	1644 int op
jpayne@68	1645 AlignedSegment r
jpayne@68	1646 int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP
jpayne@68	1647
jpayne@68	1648 res = collections.Counter()
jpayne@68	1649
jpayne@68	1650 match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
jpayne@68	1651 for r in read_iterator:
jpayne@68	1652 base_position = r.pos
jpayne@68	1653 cigar = r.cigartuples
jpayne@68	1654 if cigar is None:
jpayne@68	1655 continue
jpayne@68	1656
jpayne@68	1657 for op, nt in cigar:
jpayne@68	1658 if op in match_or_deletion:
jpayne@68	1659 base_position += nt
jpayne@68	1660 elif op == BAM_CREF_SKIP:
jpayne@68	1661 junc_start = base_position
jpayne@68	1662 base_position += nt
jpayne@68	1663 res[(junc_start, base_position)] += 1
jpayne@68	1664 return res
jpayne@68	1665
jpayne@68	1666
jpayne@68	1667 def close(self):
jpayne@68	1668 '''closes the :class:`pysam.AlignmentFile`.'''
jpayne@68	1669
jpayne@68	1670 if self.htsfile == NULL:
jpayne@68	1671 return
jpayne@68	1672
jpayne@68	1673 if self.index != NULL:
jpayne@68	1674 hts_idx_destroy(self.index)
jpayne@68	1675 self.index = NULL
jpayne@68	1676
jpayne@68	1677 cdef int ret = hts_close(self.htsfile)
jpayne@68	1678 self.htsfile = NULL
jpayne@68	1679
jpayne@68	1680 self.header = None
jpayne@68	1681
jpayne@68	1682 if ret < 0:
jpayne@68	1683 global errno
jpayne@68	1684 if errno == EPIPE:
jpayne@68	1685 errno = 0
jpayne@68	1686 else:
jpayne@68	1687 raise IOError(errno, force_str(strerror(errno)))
jpayne@68	1688
jpayne@68	1689 def __dealloc__(self):
jpayne@68	1690 cdef int ret = 0
jpayne@68	1691
jpayne@68	1692 if self.index != NULL:
jpayne@68	1693 hts_idx_destroy(self.index)
jpayne@68	1694 self.index = NULL
jpayne@68	1695
jpayne@68	1696 if self.htsfile != NULL:
jpayne@68	1697 ret = hts_close(self.htsfile)
jpayne@68	1698 self.htsfile = NULL
jpayne@68	1699
jpayne@68	1700 self.header = None
jpayne@68	1701
jpayne@68	1702 if self.b:
jpayne@68	1703 bam_destroy1(self.b)
jpayne@68	1704 self.b = NULL
jpayne@68	1705
jpayne@68	1706 if ret < 0:
jpayne@68	1707 global errno
jpayne@68	1708 if errno == EPIPE:
jpayne@68	1709 errno = 0
jpayne@68	1710 else:
jpayne@68	1711 raise IOError(errno, force_str(strerror(errno)))
jpayne@68	1712
jpayne@68	1713 cpdef int write(self, AlignedSegment read) except -1:
jpayne@68	1714 '''
jpayne@68	1715 write a single :class:`pysam.AlignedSegment` to disk.
jpayne@68	1716
jpayne@68	1717 Raises:
jpayne@68	1718 ValueError
jpayne@68	1719 if the writing failed
jpayne@68	1720
jpayne@68	1721 Returns:
jpayne@68	1722 int :
jpayne@68	1723 the number of bytes written. If the file is closed,
jpayne@68	1724 this will be 0.
jpayne@68	1725 '''
jpayne@68	1726 if not self.is_open:
jpayne@68	1727 return 0
jpayne@68	1728
jpayne@68	1729 if self.header.ptr.n_targets <= read._delegate.core.tid:
jpayne@68	1730 raise ValueError(
jpayne@68	1731 "AlignedSegment refers to reference number {} that "
jpayne@68	1732 "is larger than the number of references ({}) in the header".format(
jpayne@68	1733 read._delegate.core.tid, self.header.ptr.n_targets))
jpayne@68	1734
jpayne@68	1735 cdef int ret
jpayne@68	1736 with nogil:
jpayne@68	1737 ret = sam_write1(self.htsfile,
jpayne@68	1738 self.header.ptr,
jpayne@68	1739 read._delegate)
jpayne@68	1740
jpayne@68	1741 # kbj: Still need to raise an exception with except -1. Otherwise
jpayne@68	1742 # when ret == -1 we get a "SystemError: error return without
jpayne@68	1743 # exception set".
jpayne@68	1744 if ret < 0:
jpayne@68	1745 raise IOError(
jpayne@68	1746 "sam_write1 failed with error code {}".format(ret))
jpayne@68	1747
jpayne@68	1748 return ret
jpayne@68	1749
jpayne@68	1750 # context manager interface
jpayne@68	1751 def __enter__(self):
jpayne@68	1752 return self
jpayne@68	1753
jpayne@68	1754 def __exit__(self, exc_type, exc_value, traceback):
jpayne@68	1755 self.close()
jpayne@68	1756 return False
jpayne@68	1757
jpayne@68	1758 ###############################################################
jpayne@68	1759 ###############################################################
jpayne@68	1760 ###############################################################
jpayne@68	1761 ## properties
jpayne@68	1762 ###############################################################
jpayne@68	1763 property mapped:
jpayne@68	1764 """int with total number of mapped alignments according to the
jpayne@68	1765 statistics recorded in the index. This is a read-only
jpayne@68	1766 attribute.
jpayne@68	1767 (This will be 0 for a CRAM file indexed by a .crai index, as that
jpayne@68	1768 index format does not record these statistics.)
jpayne@68	1769 """
jpayne@68	1770 def __get__(self):
jpayne@68	1771 self.check_index()
jpayne@68	1772 cdef int tid
jpayne@68	1773 cdef uint64_t total = 0
jpayne@68	1774 cdef uint64_t mapped, unmapped
jpayne@68	1775 for tid from 0 <= tid < self.header.nreferences:
jpayne@68	1776 with nogil:
jpayne@68	1777 hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
jpayne@68	1778 total += mapped
jpayne@68	1779 return total
jpayne@68	1780
jpayne@68	1781 property unmapped:
jpayne@68	1782 """int with total number of unmapped reads according to the statistics
jpayne@68	1783 recorded in the index. This number of reads includes the number of reads
jpayne@68	1784 without coordinates. This is a read-only attribute.
jpayne@68	1785 (This will be 0 for a CRAM file indexed by a .crai index, as that
jpayne@68	1786 index format does not record these statistics.)
jpayne@68	1787 """
jpayne@68	1788 def __get__(self):
jpayne@68	1789 self.check_index()
jpayne@68	1790 cdef int tid
jpayne@68	1791 cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
jpayne@68	1792 cdef uint64_t mapped, unmapped
jpayne@68	1793 for tid from 0 <= tid < self.header.nreferences:
jpayne@68	1794 with nogil:
jpayne@68	1795 hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
jpayne@68	1796 total += unmapped
jpayne@68	1797 return total
jpayne@68	1798
jpayne@68	1799 property nocoordinate:
jpayne@68	1800 """int with total number of reads without coordinates according to the
jpayne@68	1801 statistics recorded in the index, i.e., the statistic printed for "*"
jpayne@68	1802 by the ``samtools idxstats`` command. This is a read-only attribute.
jpayne@68	1803 (This will be 0 for a CRAM file indexed by a .crai index, as that
jpayne@68	1804 index format does not record these statistics.)
jpayne@68	1805 """
jpayne@68	1806 def __get__(self):
jpayne@68	1807 self.check_index()
jpayne@68	1808 cdef uint64_t n
jpayne@68	1809 with nogil:
jpayne@68	1810 n = hts_idx_get_n_no_coor(self.index)
jpayne@68	1811 return n
jpayne@68	1812
jpayne@68	1813 def get_index_statistics(self):
jpayne@68	1814 """return statistics about mapped/unmapped reads per chromosome as
jpayne@68	1815 they are stored in the index, similarly to the statistics printed
jpayne@68	1816 by the ``samtools idxstats`` command.
jpayne@68	1817
jpayne@68	1818 CRAI indexes do not record these statistics, so for a CRAM file
jpayne@68	1819 with a .crai index the returned statistics will all be 0.
jpayne@68	1820
jpayne@68	1821 Returns:
jpayne@68	1822 list :
jpayne@68	1823 a list of records for each chromosome. Each record has the
jpayne@68	1824 attributes 'contig', 'mapped', 'unmapped' and 'total'.
jpayne@68	1825 """
jpayne@68	1826
jpayne@68	1827 self.check_index()
jpayne@68	1828 cdef int tid
jpayne@68	1829 cdef uint64_t mapped, unmapped
jpayne@68	1830 results = []
jpayne@68	1831 # TODO: use header
jpayne@68	1832 for tid from 0 <= tid < self.nreferences:
jpayne@68	1833 with nogil:
jpayne@68	1834 hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
jpayne@68	1835 results.append(
jpayne@68	1836 IndexStats._make((
jpayne@68	1837 self.get_reference_name(tid),
jpayne@68	1838 mapped,
jpayne@68	1839 unmapped,
jpayne@68	1840 mapped + unmapped)))
jpayne@68	1841
jpayne@68	1842 return results
jpayne@68	1843
jpayne@68	1844 ###############################################################
jpayne@68	1845 ## file-object like iterator access
jpayne@68	1846 ## note: concurrent access will cause errors (see IteratorRow
jpayne@68	1847 ## and multiple_iterators)
jpayne@68	1848 ## Possible solutions: deprecate or open new file handle
jpayne@68	1849 def __iter__(self):
jpayne@68	1850 if not self.is_open:
jpayne@68	1851 raise ValueError("I/O operation on closed file")
jpayne@68	1852
jpayne@68	1853 if not self.is_bam and self.header.nreferences == 0:
jpayne@68	1854 raise NotImplementedError(
jpayne@68	1855 "can not iterate over samfile without header")
jpayne@68	1856 return self
jpayne@68	1857
jpayne@68	1858 cdef bam1_t * getCurrent(self):
jpayne@68	1859 return self.b
jpayne@68	1860
jpayne@68	1861 cdef int cnext(self):
jpayne@68	1862 '''
jpayne@68	1863 cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
jpayne@68	1864 '''
jpayne@68	1865 cdef int ret
jpayne@68	1866 cdef bam_hdr_t * hdr = self.header.ptr
jpayne@68	1867 with nogil:
jpayne@68	1868 ret = sam_read1(self.htsfile,
jpayne@68	1869 hdr,
jpayne@68	1870 self.b)
jpayne@68	1871 return ret
jpayne@68	1872
jpayne@68	1873 def __next__(self):
jpayne@68	1874 cdef int ret = self.cnext()
jpayne@68	1875 if ret >= 0:
jpayne@68	1876 return makeAlignedSegment(self.b, self.header)
jpayne@68	1877 elif ret == -1:
jpayne@68	1878 raise StopIteration
jpayne@68	1879 else:
jpayne@68	1880 raise IOError(read_failure_reason(ret))
jpayne@68	1881
jpayne@68	1882 ###########################################
jpayne@68	1883 # methods/properties referencing the header
jpayne@68	1884 def is_valid_tid(self, int tid):
jpayne@68	1885 """
jpayne@68	1886 return True if the numerical :term:`tid` is valid; False otherwise.
jpayne@68	1887
jpayne@68	1888 Note that the unmapped tid code (-1) counts as an invalid.
jpayne@68	1889 """
jpayne@68	1890 if self.header is None:
jpayne@68	1891 raise ValueError("header not available in closed files")
jpayne@68	1892 return self.header.is_valid_tid(tid)
jpayne@68	1893
jpayne@68	1894 def get_tid(self, reference):
jpayne@68	1895 """
jpayne@68	1896 return the numerical :term:`tid` corresponding to
jpayne@68	1897 :term:`reference`
jpayne@68	1898
jpayne@68	1899 returns -1 if reference is not known.
jpayne@68	1900 """
jpayne@68	1901 if self.header is None:
jpayne@68	1902 raise ValueError("header not available in closed files")
jpayne@68	1903 return self.header.get_tid(reference)
jpayne@68	1904
jpayne@68	1905 def get_reference_name(self, tid):
jpayne@68	1906 """
jpayne@68	1907 return :term:`reference` name corresponding to numerical :term:`tid`
jpayne@68	1908 """
jpayne@68	1909 if self.header is None:
jpayne@68	1910 raise ValueError("header not available in closed files")
jpayne@68	1911 return self.header.get_reference_name(tid)
jpayne@68	1912
jpayne@68	1913 def get_reference_length(self, reference):
jpayne@68	1914 """
jpayne@68	1915 return :term:`reference` length corresponding to numerical :term:`tid`
jpayne@68	1916 """
jpayne@68	1917 if self.header is None:
jpayne@68	1918 raise ValueError("header not available in closed files")
jpayne@68	1919 return self.header.get_reference_length(reference)
jpayne@68	1920
jpayne@68	1921 property nreferences:
jpayne@68	1922 """int with the number of :term:`reference` sequences in the file.
jpayne@68	1923 This is a read-only attribute."""
jpayne@68	1924 def __get__(self):
jpayne@68	1925 if self.header:
jpayne@68	1926 return self.header.nreferences
jpayne@68	1927 else:
jpayne@68	1928 raise ValueError("header not available in closed files")
jpayne@68	1929
jpayne@68	1930 property references:
jpayne@68	1931 """tuple with the names of :term:`reference` sequences. This is a
jpayne@68	1932 read-only attribute"""
jpayne@68	1933 def __get__(self):
jpayne@68	1934 if self.header:
jpayne@68	1935 return self.header.references
jpayne@68	1936 else:
jpayne@68	1937 raise ValueError("header not available in closed files")
jpayne@68	1938
jpayne@68	1939 property lengths:
jpayne@68	1940 """tuple of the lengths of the :term:`reference` sequences. This is a
jpayne@68	1941 read-only attribute. The lengths are in the same order as
jpayne@68	1942 :attr:`pysam.AlignmentFile.references`
jpayne@68	1943
jpayne@68	1944 """
jpayne@68	1945 def __get__(self):
jpayne@68	1946 if self.header:
jpayne@68	1947 return self.header.lengths
jpayne@68	1948 else:
jpayne@68	1949 raise ValueError("header not available in closed files")
jpayne@68	1950
jpayne@68	1951 # Compatibility functions for pysam < 0.14
jpayne@68	1952 property text:
jpayne@68	1953 """deprecated, use :attr:`references` and :attr:`lengths` instead"""
jpayne@68	1954 def __get__(self):
jpayne@68	1955 if self.header:
jpayne@68	1956 return self.header.__str__()
jpayne@68	1957 else:
jpayne@68	1958 raise ValueError("header not available in closed files")
jpayne@68	1959
jpayne@68	1960 # Compatibility functions for pysam < 0.8.3
jpayne@68	1961 def gettid(self, reference):
jpayne@68	1962 """deprecated, use :meth:`get_tid` instead"""
jpayne@68	1963 return self.get_tid(reference)
jpayne@68	1964
jpayne@68	1965 def getrname(self, tid):
jpayne@68	1966 """deprecated, use :meth:`get_reference_name` instead"""
jpayne@68	1967 return self.get_reference_name(tid)
jpayne@68	1968
jpayne@68	1969
jpayne@68	1970 cdef class IteratorRow:
jpayne@68	1971 '''abstract base class for iterators over mapped reads.
jpayne@68	1972
jpayne@68	1973 Various iterators implement different behaviours for wrapping around
jpayne@68	1974 contig boundaries. Examples include:
jpayne@68	1975
jpayne@68	1976 :class:`pysam.IteratorRowRegion`
jpayne@68	1977 iterate within a single contig and a defined region.
jpayne@68	1978
jpayne@68	1979 :class:`pysam.IteratorRowAll`
jpayne@68	1980 iterate until EOF. This iterator will also include unmapped reads.
jpayne@68	1981
jpayne@68	1982 :class:`pysam.IteratorRowAllRefs`
jpayne@68	1983 iterate over all reads in all reference sequences.
jpayne@68	1984
jpayne@68	1985 The method :meth:`AlignmentFile.fetch` returns an IteratorRow.
jpayne@68	1986
jpayne@68	1987 .. note::
jpayne@68	1988
jpayne@68	1989 It is usually not necessary to create an object of this class
jpayne@68	1990 explicitly. It is returned as a result of call to a
jpayne@68	1991 :meth:`AlignmentFile.fetch`.
jpayne@68	1992
jpayne@68	1993 '''
jpayne@68	1994
jpayne@68	1995 def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
jpayne@68	1996 cdef char *cfilename
jpayne@68	1997 cdef char *creference_filename
jpayne@68	1998 cdef char *cindexname = NULL
jpayne@68	1999
jpayne@68	2000 if not samfile.is_open:
jpayne@68	2001 raise ValueError("I/O operation on closed file")
jpayne@68	2002
jpayne@68	2003 # makes sure that samfile stays alive as long as the
jpayne@68	2004 # iterator is alive
jpayne@68	2005 self.samfile = samfile
jpayne@68	2006
jpayne@68	2007 # reopen the file - note that this makes the iterator
jpayne@68	2008 # slow and causes pileup to slow down significantly.
jpayne@68	2009 if multiple_iterators:
jpayne@68	2010
jpayne@68	2011 cfilename = samfile.filename
jpayne@68	2012 with nogil:
jpayne@68	2013 self.htsfile = hts_open(cfilename, 'r')
jpayne@68	2014 assert self.htsfile != NULL
jpayne@68	2015
jpayne@68	2016 if samfile.has_index():
jpayne@68	2017 if samfile.index_filename:
jpayne@68	2018 cindexname = bindex_filename = encode_filename(samfile.index_filename)
jpayne@68	2019 with nogil:
jpayne@68	2020 self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
jpayne@68	2021 else:
jpayne@68	2022 self.index = NULL
jpayne@68	2023
jpayne@68	2024 # need to advance in newly opened file to position after header
jpayne@68	2025 # better: use seek/tell?
jpayne@68	2026 with nogil:
jpayne@68	2027 hdr = sam_hdr_read(self.htsfile)
jpayne@68	2028 if hdr is NULL:
jpayne@68	2029 raise IOError("unable to read header information")
jpayne@68	2030 self.header = makeAlignmentHeader(hdr)
jpayne@68	2031
jpayne@68	2032 self.owns_samfile = True
jpayne@68	2033
jpayne@68	2034 # options specific to CRAM files
jpayne@68	2035 if samfile.is_cram and samfile.reference_filename:
jpayne@68	2036 creference_filename = samfile.reference_filename
jpayne@68	2037 hts_set_opt(self.htsfile,
jpayne@68	2038 CRAM_OPT_REFERENCE,
jpayne@68	2039 creference_filename)
jpayne@68	2040
jpayne@68	2041 else:
jpayne@68	2042 self.htsfile = samfile.htsfile
jpayne@68	2043 self.index = samfile.index
jpayne@68	2044 self.owns_samfile = False
jpayne@68	2045 self.header = samfile.header
jpayne@68	2046
jpayne@68	2047 self.retval = 0
jpayne@68	2048
jpayne@68	2049 self.b = bam_init1()
jpayne@68	2050
jpayne@68	2051 def __dealloc__(self):
jpayne@68	2052 bam_destroy1(self.b)
jpayne@68	2053 if self.owns_samfile:
jpayne@68	2054 hts_idx_destroy(self.index)
jpayne@68	2055 hts_close(self.htsfile)
jpayne@68	2056
jpayne@68	2057
jpayne@68	2058 cdef class IteratorRowRegion(IteratorRow):
jpayne@68	2059 """*(AlignmentFile samfile, int tid, int beg, int stop,
jpayne@68	2060 int multiple_iterators=False)*
jpayne@68	2061
jpayne@68	2062 iterate over mapped reads in a region.
jpayne@68	2063
jpayne@68	2064 .. note::
jpayne@68	2065
jpayne@68	2066 It is usually not necessary to create an object of this class
jpayne@68	2067 explicitly. It is returned as a result of call to a
jpayne@68	2068 :meth:`AlignmentFile.fetch`.
jpayne@68	2069
jpayne@68	2070 """
jpayne@68	2071
jpayne@68	2072 def __init__(self, AlignmentFile samfile,
jpayne@68	2073 int tid, int beg, int stop,
jpayne@68	2074 int multiple_iterators=False):
jpayne@68	2075
jpayne@68	2076 if not samfile.has_index():
jpayne@68	2077 raise ValueError("no index available for iteration")
jpayne@68	2078
jpayne@68	2079 super().__init__(samfile, multiple_iterators=multiple_iterators)
jpayne@68	2080 with nogil:
jpayne@68	2081 self.iter = sam_itr_queryi(
jpayne@68	2082 self.index,
jpayne@68	2083 tid,
jpayne@68	2084 beg,
jpayne@68	2085 stop)
jpayne@68	2086
jpayne@68	2087 def __iter__(self):
jpayne@68	2088 return self
jpayne@68	2089
jpayne@68	2090 cdef bam1_t * getCurrent(self):
jpayne@68	2091 return self.b
jpayne@68	2092
jpayne@68	2093 cdef int cnext(self):
jpayne@68	2094 '''cversion of iterator. Used by IteratorColumn'''
jpayne@68	2095 with nogil:
jpayne@68	2096 self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
jpayne@68	2097 self.iter,
jpayne@68	2098 self.b,
jpayne@68	2099 self.htsfile)
jpayne@68	2100
jpayne@68	2101 def __next__(self):
jpayne@68	2102 self.cnext()
jpayne@68	2103 if self.retval >= 0:
jpayne@68	2104 return makeAlignedSegment(self.b, self.header)
jpayne@68	2105 elif self.retval == -1:
jpayne@68	2106 raise StopIteration
jpayne@68	2107 elif self.retval == -2:
jpayne@68	2108 # Note: it is currently not the case that hts_iter_next
jpayne@68	2109 # returns -2 for a truncated file.
jpayne@68	2110 # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
jpayne@68	2111 raise IOError('truncated file')
jpayne@68	2112 else:
jpayne@68	2113 raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval))
jpayne@68	2114
jpayne@68	2115 def __dealloc__(self):
jpayne@68	2116 hts_itr_destroy(self.iter)
jpayne@68	2117
jpayne@68	2118
jpayne@68	2119 cdef class IteratorRowHead(IteratorRow):
jpayne@68	2120 """(AlignmentFile samfile, n, int multiple_iterators=False)
jpayne@68	2121
jpayne@68	2122 iterate over first n reads in `samfile`
jpayne@68	2123
jpayne@68	2124 .. note::
jpayne@68	2125 It is usually not necessary to create an object of this class
jpayne@68	2126 explicitly. It is returned as a result of call to a
jpayne@68	2127 :meth:`AlignmentFile.head`.
jpayne@68	2128
jpayne@68	2129 """
jpayne@68	2130
jpayne@68	2131 def __init__(self,
jpayne@68	2132 AlignmentFile samfile,
jpayne@68	2133 int n,
jpayne@68	2134 int multiple_iterators=False):
jpayne@68	2135 super().__init__(samfile, multiple_iterators=multiple_iterators)
jpayne@68	2136
jpayne@68	2137 self.max_rows = n
jpayne@68	2138 self.current_row = 0
jpayne@68	2139
jpayne@68	2140 def __iter__(self):
jpayne@68	2141 return self
jpayne@68	2142
jpayne@68	2143 cdef bam1_t * getCurrent(self):
jpayne@68	2144 return self.b
jpayne@68	2145
jpayne@68	2146 cdef int cnext(self):
jpayne@68	2147 '''cversion of iterator. Used by IteratorColumn'''
jpayne@68	2148 cdef int ret
jpayne@68	2149 cdef bam_hdr_t * hdr = self.header.ptr
jpayne@68	2150 with nogil:
jpayne@68	2151 ret = sam_read1(self.htsfile,
jpayne@68	2152 hdr,
jpayne@68	2153 self.b)
jpayne@68	2154 return ret
jpayne@68	2155
jpayne@68	2156 def __next__(self):
jpayne@68	2157 if self.current_row >= self.max_rows:
jpayne@68	2158 raise StopIteration
jpayne@68	2159
jpayne@68	2160 cdef int ret = self.cnext()
jpayne@68	2161 if ret >= 0:
jpayne@68	2162 self.current_row += 1
jpayne@68	2163 return makeAlignedSegment(self.b, self.header)
jpayne@68	2164 elif ret == -1:
jpayne@68	2165 raise StopIteration
jpayne@68	2166 else:
jpayne@68	2167 raise IOError(read_failure_reason(ret))
jpayne@68	2168
jpayne@68	2169
jpayne@68	2170 cdef class IteratorRowAll(IteratorRow):
jpayne@68	2171 """(AlignmentFile samfile, int multiple_iterators=False)
jpayne@68	2172
jpayne@68	2173 iterate over all reads in `samfile`
jpayne@68	2174
jpayne@68	2175 .. note::
jpayne@68	2176
jpayne@68	2177 It is usually not necessary to create an object of this class
jpayne@68	2178 explicitly. It is returned as a result of call to a
jpayne@68	2179 :meth:`AlignmentFile.fetch`.
jpayne@68	2180
jpayne@68	2181 """
jpayne@68	2182
jpayne@68	2183 def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
jpayne@68	2184 super().__init__(samfile, multiple_iterators=multiple_iterators)
jpayne@68	2185
jpayne@68	2186 def __iter__(self):
jpayne@68	2187 return self
jpayne@68	2188
jpayne@68	2189 cdef bam1_t * getCurrent(self):
jpayne@68	2190 return self.b
jpayne@68	2191
jpayne@68	2192 cdef int cnext(self):
jpayne@68	2193 '''cversion of iterator. Used by IteratorColumn'''
jpayne@68	2194 cdef int ret
jpayne@68	2195 cdef bam_hdr_t * hdr = self.header.ptr
jpayne@68	2196 with nogil:
jpayne@68	2197 ret = sam_read1(self.htsfile,
jpayne@68	2198 hdr,
jpayne@68	2199 self.b)
jpayne@68	2200 return ret
jpayne@68	2201
jpayne@68	2202 def __next__(self):
jpayne@68	2203 cdef int ret = self.cnext()
jpayne@68	2204 if ret >= 0:
jpayne@68	2205 return makeAlignedSegment(self.b, self.header)
jpayne@68	2206 elif ret == -1:
jpayne@68	2207 raise StopIteration
jpayne@68	2208 else:
jpayne@68	2209 raise IOError(read_failure_reason(ret))
jpayne@68	2210
jpayne@68	2211
jpayne@68	2212 cdef class IteratorRowAllRefs(IteratorRow):
jpayne@68	2213 """iterates over all mapped reads by chaining iterators over each
jpayne@68	2214 reference
jpayne@68	2215
jpayne@68	2216 .. note::
jpayne@68	2217 It is usually not necessary to create an object of this class
jpayne@68	2218 explicitly. It is returned as a result of call to a
jpayne@68	2219 :meth:`AlignmentFile.fetch`.
jpayne@68	2220
jpayne@68	2221 """
jpayne@68	2222
jpayne@68	2223 def __init__(self, AlignmentFile samfile, multiple_iterators=False):
jpayne@68	2224 super().__init__(samfile, multiple_iterators=multiple_iterators)
jpayne@68	2225
jpayne@68	2226 if not samfile.has_index():
jpayne@68	2227 raise ValueError("no index available for fetch")
jpayne@68	2228
jpayne@68	2229 self.tid = -1
jpayne@68	2230
jpayne@68	2231 def nextiter(self):
jpayne@68	2232 # get a new iterator for a chromosome. The file
jpayne@68	2233 # will not be re-opened.
jpayne@68	2234 self.rowiter = IteratorRowRegion(self.samfile,
jpayne@68	2235 self.tid,
jpayne@68	2236 0,
jpayne@68	2237 MAX_POS)
jpayne@68	2238 # set htsfile and header of the rowiter
jpayne@68	2239 # to the values in this iterator to reflect multiple_iterators
jpayne@68	2240 self.rowiter.htsfile = self.htsfile
jpayne@68	2241 self.rowiter.header = self.header
jpayne@68	2242
jpayne@68	2243 # make sure the iterator understand that IteratorRowAllRefs
jpayne@68	2244 # has ownership
jpayne@68	2245 self.rowiter.owns_samfile = False
jpayne@68	2246
jpayne@68	2247 def __iter__(self):
jpayne@68	2248 return self
jpayne@68	2249
jpayne@68	2250 def __next__(self):
jpayne@68	2251 # Create an initial iterator
jpayne@68	2252 if self.tid == -1:
jpayne@68	2253 if not self.samfile.nreferences:
jpayne@68	2254 raise StopIteration
jpayne@68	2255 self.tid = 0
jpayne@68	2256 self.nextiter()
jpayne@68	2257
jpayne@68	2258 while 1:
jpayne@68	2259 self.rowiter.cnext()
jpayne@68	2260
jpayne@68	2261 # If current iterator is not exhausted, return aligned read
jpayne@68	2262 if self.rowiter.retval > 0:
jpayne@68	2263 return makeAlignedSegment(self.rowiter.b, self.header)
jpayne@68	2264
jpayne@68	2265 self.tid += 1
jpayne@68	2266
jpayne@68	2267 # Otherwise, proceed to next reference or stop
jpayne@68	2268 if self.tid < self.samfile.nreferences:
jpayne@68	2269 self.nextiter()
jpayne@68	2270 else:
jpayne@68	2271 raise StopIteration
jpayne@68	2272
jpayne@68	2273
jpayne@68	2274 cdef class IteratorRowSelection(IteratorRow):
jpayne@68	2275 """(AlignmentFile samfile)
jpayne@68	2276
jpayne@68	2277 iterate over reads in `samfile` at a given list of file positions.
jpayne@68	2278
jpayne@68	2279 .. note::
jpayne@68	2280 It is usually not necessary to create an object of this class
jpayne@68	2281 explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
jpayne@68	2282 """
jpayne@68	2283
jpayne@68	2284 def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
jpayne@68	2285 super().__init__(samfile, multiple_iterators=multiple_iterators)
jpayne@68	2286
jpayne@68	2287 self.positions = positions
jpayne@68	2288 self.current_pos = 0
jpayne@68	2289
jpayne@68	2290 def __iter__(self):
jpayne@68	2291 return self
jpayne@68	2292
jpayne@68	2293 cdef bam1_t * getCurrent(self):
jpayne@68	2294 return self.b
jpayne@68	2295
jpayne@68	2296 cdef int cnext(self):
jpayne@68	2297 '''cversion of iterator'''
jpayne@68	2298 # end iteration if out of positions
jpayne@68	2299 if self.current_pos >= len(self.positions): return -1
jpayne@68	2300
jpayne@68	2301 cdef uint64_t pos = self.positions[self.current_pos]
jpayne@68	2302 with nogil:
jpayne@68	2303 bgzf_seek(hts_get_bgzfp(self.htsfile),
jpayne@68	2304 pos,
jpayne@68	2305 0)
jpayne@68	2306 self.current_pos += 1
jpayne@68	2307
jpayne@68	2308 cdef int ret
jpayne@68	2309 cdef bam_hdr_t * hdr = self.header.ptr
jpayne@68	2310 with nogil:
jpayne@68	2311 ret = sam_read1(self.htsfile,
jpayne@68	2312 hdr,
jpayne@68	2313 self.b)
jpayne@68	2314 return ret
jpayne@68	2315
jpayne@68	2316 def __next__(self):
jpayne@68	2317 cdef int ret = self.cnext()
jpayne@68	2318 if ret >= 0:
jpayne@68	2319 return makeAlignedSegment(self.b, self.header)
jpayne@68	2320 elif ret == -1:
jpayne@68	2321 raise StopIteration
jpayne@68	2322 else:
jpayne@68	2323 raise IOError(read_failure_reason(ret))
jpayne@68	2324
jpayne@68	2325
jpayne@68	2326 cdef int __advance_nofilter(void data, bam1_t b):
jpayne@68	2327 '''advance without any read filtering.
jpayne@68	2328 '''
jpayne@68	2329 cdef __iterdata * d = <__iterdata*>data
jpayne@68	2330 cdef int ret
jpayne@68	2331 with nogil:
jpayne@68	2332 ret = sam_itr_next(d.htsfile, d.iter, b)
jpayne@68	2333 return ret
jpayne@68	2334
jpayne@68	2335
jpayne@68	2336 cdef int __advance_raw_nofilter(void data, bam1_t b):
jpayne@68	2337 '''advance (without iterator) without any read filtering.
jpayne@68	2338 '''
jpayne@68	2339 cdef __iterdata * d = <__iterdata*>data
jpayne@68	2340 cdef int ret
jpayne@68	2341 with nogil:
jpayne@68	2342 ret = sam_read1(d.htsfile, d.header, b)
jpayne@68	2343 return ret
jpayne@68	2344
jpayne@68	2345
jpayne@68	2346 cdef int __advance_all(void data, bam1_t b):
jpayne@68	2347 '''only use reads for pileup passing basic filters such as
jpayne@68	2348
jpayne@68	2349 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
jpayne@68	2350 '''
jpayne@68	2351
jpayne@68	2352 cdef __iterdata * d = <__iterdata*>data
jpayne@68	2353 cdef mask = BAM_FUNMAP \| BAM_FSECONDARY \| BAM_FQCFAIL \| BAM_FDUP
jpayne@68	2354 cdef int ret
jpayne@68	2355 while 1:
jpayne@68	2356 with nogil:
jpayne@68	2357 ret = sam_itr_next(d.htsfile, d.iter, b)
jpayne@68	2358 if ret < 0:
jpayne@68	2359 break
jpayne@68	2360 if b.core.flag & d.flag_filter:
jpayne@68	2361 continue
jpayne@68	2362 break
jpayne@68	2363 return ret
jpayne@68	2364
jpayne@68	2365
jpayne@68	2366 cdef int __advance_raw_all(void data, bam1_t b):
jpayne@68	2367 '''only use reads for pileup passing basic filters such as
jpayne@68	2368
jpayne@68	2369 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
jpayne@68	2370 '''
jpayne@68	2371
jpayne@68	2372 cdef __iterdata * d = <__iterdata*>data
jpayne@68	2373 cdef int ret
jpayne@68	2374 while 1:
jpayne@68	2375 with nogil:
jpayne@68	2376 ret = sam_read1(d.htsfile, d.header, b)
jpayne@68	2377 if ret < 0:
jpayne@68	2378 break
jpayne@68	2379 if b.core.flag & d.flag_filter:
jpayne@68	2380 continue
jpayne@68	2381 break
jpayne@68	2382 return ret
jpayne@68	2383
jpayne@68	2384
jpayne@68	2385 cdef int __advance_samtools(void * data, bam1_t * b):
jpayne@68	2386 '''advance using same filter and read processing as in
jpayne@68	2387 the samtools pileup.
jpayne@68	2388 '''
jpayne@68	2389 cdef __iterdata * d = <__iterdata*>data
jpayne@68	2390 cdef int ret
jpayne@68	2391 cdef int q
jpayne@68	2392
jpayne@68	2393 while 1:
jpayne@68	2394 with nogil:
jpayne@68	2395 ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
jpayne@68	2396 if ret < 0:
jpayne@68	2397 break
jpayne@68	2398 if b.core.flag & d.flag_filter:
jpayne@68	2399 continue
jpayne@68	2400 if d.flag_require and not (b.core.flag & d.flag_require):
jpayne@68	2401 continue
jpayne@68	2402
jpayne@68	2403 # reload sequence
jpayne@68	2404 if d.fastafile != NULL and b.core.tid != d.tid:
jpayne@68	2405 if d.seq != NULL:
jpayne@68	2406 free(d.seq)
jpayne@68	2407 d.tid = b.core.tid
jpayne@68	2408 with nogil:
jpayne@68	2409 d.seq = faidx_fetch_seq(
jpayne@68	2410 d.fastafile,
jpayne@68	2411 d.header.target_name[d.tid],
jpayne@68	2412 0, MAX_POS,
jpayne@68	2413 &d.seq_len)
jpayne@68	2414
jpayne@68	2415 if d.seq == NULL:
jpayne@68	2416 raise ValueError(
jpayne@68	2417 "reference sequence for '{}' (tid={}) not found".format(
jpayne@68	2418 d.header.target_name[d.tid], d.tid))
jpayne@68	2419
jpayne@68	2420 # realign read - changes base qualities
jpayne@68	2421 if d.seq != NULL and d.compute_baq:
jpayne@68	2422 # 4th option to realign is flag:
jpayne@68	2423 # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4
jpayne@68	2424 if d.redo_baq:
jpayne@68	2425 sam_prob_realn(b, d.seq, d.seq_len, 7)
jpayne@68	2426 else:
jpayne@68	2427 sam_prob_realn(b, d.seq, d.seq_len, 3)
jpayne@68	2428
jpayne@68	2429 if d.seq != NULL and d.adjust_capq_threshold > 10:
jpayne@68	2430 q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold)
jpayne@68	2431 if q < 0:
jpayne@68	2432 continue
jpayne@68	2433 elif b.core.qual > q:
jpayne@68	2434 b.core.qual = q
jpayne@68	2435
jpayne@68	2436 if b.core.qual < d.min_mapping_quality:
jpayne@68	2437 continue
jpayne@68	2438 if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR):
jpayne@68	2439 continue
jpayne@68	2440
jpayne@68	2441 break
jpayne@68	2442
jpayne@68	2443 return ret
jpayne@68	2444
jpayne@68	2445
jpayne@68	2446 cdef class IteratorColumn:
jpayne@68	2447 '''abstract base class for iterators over columns.
jpayne@68	2448
jpayne@68	2449 IteratorColumn objects wrap the pileup functionality of samtools.
jpayne@68	2450
jpayne@68	2451 For reasons of efficiency, the iterator points to the current
jpayne@68	2452 pileup buffer. The pileup buffer is updated at every iteration.
jpayne@68	2453 This might cause some unexpected behaviour. For example,
jpayne@68	2454 consider the conversion to a list::
jpayne@68	2455
jpayne@68	2456 f = AlignmentFile("file.bam", "rb")
jpayne@68	2457 result = list(f.pileup())
jpayne@68	2458
jpayne@68	2459 Here, ``result`` will contain ``n`` objects of type
jpayne@68	2460 :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
jpayne@68	2461 ``result`` will contain the same information.
jpayne@68	2462
jpayne@68	2463 The desired behaviour can be achieved by list comprehension::
jpayne@68	2464
jpayne@68	2465 result = [x.pileups() for x in f.pileup()]
jpayne@68	2466
jpayne@68	2467 ``result`` will be a list of ``n`` lists of objects of type
jpayne@68	2468 :class:`~pysam.PileupRead`.
jpayne@68	2469
jpayne@68	2470 If the iterator is associated with a :class:`~pysam.Fastafile`
jpayne@68	2471 using the :meth:`add_reference` method, then the iterator will
jpayne@68	2472 export the current sequence via the methods :meth:`get_sequence`
jpayne@68	2473 and :meth:`seq_len`.
jpayne@68	2474
jpayne@68	2475 See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
jpayne@68	2476 '''
jpayne@68	2477
jpayne@68	2478 def __cinit__( self, AlignmentFile samfile, **kwargs):
jpayne@68	2479 self.samfile = samfile
jpayne@68	2480 self.fastafile = kwargs.get("fastafile", None)
jpayne@68	2481 self.stepper = kwargs.get("stepper", "samtools")
jpayne@68	2482 self.max_depth = kwargs.get("max_depth", 8000)
jpayne@68	2483 self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
jpayne@68	2484 self.min_base_quality = kwargs.get("min_base_quality", 13)
jpayne@68	2485 self.iterdata.seq = NULL
jpayne@68	2486 self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
jpayne@68	2487 self.iterdata.flag_require = kwargs.get("flag_require", 0)
jpayne@68	2488 self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP \| BAM_FSECONDARY \| BAM_FQCFAIL \| BAM_FDUP)
jpayne@68	2489 self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
jpayne@68	2490 self.iterdata.compute_baq = kwargs.get("compute_baq", True)
jpayne@68	2491 self.iterdata.redo_baq = kwargs.get("redo_baq", False)
jpayne@68	2492 self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)
jpayne@68	2493
jpayne@68	2494 self.tid = 0
jpayne@68	2495 self.pos = 0
jpayne@68	2496 self.n_plp = 0
jpayne@68	2497 self.plp = NULL
jpayne@68	2498 self.pileup_iter = <bam_mplp_t>NULL
jpayne@68	2499
jpayne@68	2500 def __iter__(self):
jpayne@68	2501 return self
jpayne@68	2502
jpayne@68	2503 cdef int cnext(self):
jpayne@68	2504 '''perform next iteration.
jpayne@68	2505 '''
jpayne@68	2506 # do not release gil here because of call-backs
jpayne@68	2507 cdef int ret = bam_mplp_auto(self.pileup_iter,
jpayne@68	2508 &self.tid,
jpayne@68	2509 &self.pos,
jpayne@68	2510 &self.n_plp,
jpayne@68	2511 &self.plp)
jpayne@68	2512 return ret
jpayne@68	2513
jpayne@68	2514 cdef char * get_sequence(self):
jpayne@68	2515 '''return current reference sequence underlying the iterator.
jpayne@68	2516 '''
jpayne@68	2517 return self.iterdata.seq
jpayne@68	2518
jpayne@68	2519 property seq_len:
jpayne@68	2520 '''current sequence length.'''
jpayne@68	2521 def __get__(self):
jpayne@68	2522 return self.iterdata.seq_len
jpayne@68	2523
jpayne@68	2524 def add_reference(self, FastaFile fastafile):
jpayne@68	2525 '''
jpayne@68	2526 add reference sequences in `fastafile` to iterator.'''
jpayne@68	2527 self.fastafile = fastafile
jpayne@68	2528 if self.iterdata.seq != NULL:
jpayne@68	2529 free(self.iterdata.seq)
jpayne@68	2530 self.iterdata.tid = -1
jpayne@68	2531 self.iterdata.fastafile = self.fastafile.fastafile
jpayne@68	2532
jpayne@68	2533 def has_reference(self):
jpayne@68	2534 '''
jpayne@68	2535 return true if iterator is associated with a reference'''
jpayne@68	2536 return self.fastafile
jpayne@68	2537
jpayne@68	2538 cdef _setup_iterator(self,
jpayne@68	2539 int tid,
jpayne@68	2540 int start,
jpayne@68	2541 int stop,
jpayne@68	2542 int multiple_iterators=0):
jpayne@68	2543 '''setup the iterator structure'''
jpayne@68	2544
jpayne@68	2545 self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
jpayne@68	2546 self.iterdata.htsfile = self.samfile.htsfile
jpayne@68	2547 self.iterdata.iter = self.iter.iter
jpayne@68	2548 self.iterdata.seq = NULL
jpayne@68	2549 self.iterdata.tid = -1
jpayne@68	2550 self.iterdata.header = self.samfile.header.ptr
jpayne@68	2551
jpayne@68	2552 if self.fastafile is not None:
jpayne@68	2553 self.iterdata.fastafile = self.fastafile.fastafile
jpayne@68	2554 else:
jpayne@68	2555 self.iterdata.fastafile = NULL
jpayne@68	2556
jpayne@68	2557 # Free any previously allocated memory before reassigning
jpayne@68	2558 # pileup_iter
jpayne@68	2559 self._free_pileup_iter()
jpayne@68	2560
jpayne@68	2561 cdef void * data[1]
jpayne@68	2562 data[0] = <void*>&self.iterdata
jpayne@68	2563
jpayne@68	2564 if self.stepper is None or self.stepper == "all":
jpayne@68	2565 with nogil:
jpayne@68	2566 self.pileup_iter = bam_mplp_init(1,
jpayne@68	2567 <bam_plp_auto_f>&__advance_all,
jpayne@68	2568 data)
jpayne@68	2569 elif self.stepper == "nofilter":
jpayne@68	2570 with nogil:
jpayne@68	2571 self.pileup_iter = bam_mplp_init(1,
jpayne@68	2572 <bam_plp_auto_f>&__advance_nofilter,
jpayne@68	2573 data)
jpayne@68	2574 elif self.stepper == "samtools":
jpayne@68	2575 with nogil:
jpayne@68	2576 self.pileup_iter = bam_mplp_init(1,
jpayne@68	2577 <bam_plp_auto_f>&__advance_samtools,
jpayne@68	2578 data)
jpayne@68	2579 else:
jpayne@68	2580 raise ValueError(
jpayne@68	2581 "unknown stepper option `%s` in IteratorColumn" % self.stepper)
jpayne@68	2582
jpayne@68	2583 if self.max_depth:
jpayne@68	2584 with nogil:
jpayne@68	2585 bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)
jpayne@68	2586
jpayne@68	2587 if self.ignore_overlaps:
jpayne@68	2588 with nogil:
jpayne@68	2589 bam_mplp_init_overlaps(self.pileup_iter)
jpayne@68	2590
jpayne@68	2591 cdef _setup_raw_rest_iterator(self):
jpayne@68	2592 '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST'''
jpayne@68	2593
jpayne@68	2594 self.iter = None
jpayne@68	2595 self.iterdata.iter = NULL
jpayne@68	2596 self.iterdata.htsfile = self.samfile.htsfile
jpayne@68	2597 self.iterdata.seq = NULL
jpayne@68	2598 self.iterdata.tid = -1
jpayne@68	2599 self.iterdata.header = self.samfile.header.ptr
jpayne@68	2600
jpayne@68	2601 if self.fastafile is not None:
jpayne@68	2602 self.iterdata.fastafile = self.fastafile.fastafile
jpayne@68	2603 else:
jpayne@68	2604 self.iterdata.fastafile = NULL
jpayne@68	2605
jpayne@68	2606 # Free any previously allocated memory before reassigning
jpayne@68	2607 # pileup_iter
jpayne@68	2608 self._free_pileup_iter()
jpayne@68	2609
jpayne@68	2610 cdef void * data[1]
jpayne@68	2611 data[0] = <void*>&self.iterdata
jpayne@68	2612
jpayne@68	2613 if self.stepper is None or self.stepper == "all":
jpayne@68	2614 with nogil:
jpayne@68	2615 self.pileup_iter = bam_mplp_init(1,
jpayne@68	2616 <bam_plp_auto_f>&__advance_raw_all,
jpayne@68	2617 data)
jpayne@68	2618 elif self.stepper == "nofilter":
jpayne@68	2619 with nogil:
jpayne@68	2620 self.pileup_iter = bam_mplp_init(1,
jpayne@68	2621 <bam_plp_auto_f>&__advance_raw_nofilter,
jpayne@68	2622 data)
jpayne@68	2623 elif self.stepper == "samtools":
jpayne@68	2624 with nogil:
jpayne@68	2625 self.pileup_iter = bam_mplp_init(1,
jpayne@68	2626 <bam_plp_auto_f>&__advance_samtools,
jpayne@68	2627 data)
jpayne@68	2628 else:
jpayne@68	2629 raise ValueError(
jpayne@68	2630 "unknown stepper option `%s` in IteratorColumn" % self.stepper)
jpayne@68	2631
jpayne@68	2632 if self.max_depth:
jpayne@68	2633 with nogil:
jpayne@68	2634 bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)
jpayne@68	2635
jpayne@68	2636 if self.ignore_overlaps:
jpayne@68	2637 with nogil:
jpayne@68	2638 bam_mplp_init_overlaps(self.pileup_iter)
jpayne@68	2639
jpayne@68	2640 cdef reset(self, tid, start, stop):
jpayne@68	2641 '''reset iterator position.
jpayne@68	2642
jpayne@68	2643 This permits using the iterator multiple times without
jpayne@68	2644 having to incur the full set-up costs.
jpayne@68	2645 '''
jpayne@68	2646 if self.iter is None:
jpayne@68	2647 raise TypeError("Raw iterator set up without region cannot be reset")
jpayne@68	2648
jpayne@68	2649 self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
jpayne@68	2650 self.iterdata.iter = self.iter.iter
jpayne@68	2651
jpayne@68	2652 # invalidate sequence if different tid
jpayne@68	2653 if self.tid != tid:
jpayne@68	2654 if self.iterdata.seq != NULL:
jpayne@68	2655 free(self.iterdata.seq)
jpayne@68	2656 self.iterdata.seq = NULL
jpayne@68	2657 self.iterdata.tid = -1
jpayne@68	2658
jpayne@68	2659 # self.pileup_iter = bam_mplp_init(1
jpayne@68	2660 # &__advancepileup,
jpayne@68	2661 # &self.iterdata)
jpayne@68	2662 with nogil:
jpayne@68	2663 bam_mplp_reset(self.pileup_iter)
jpayne@68	2664
jpayne@68	2665 cdef _free_pileup_iter(self):
jpayne@68	2666 '''free the memory alloc'd by bam_plp_init.
jpayne@68	2667
jpayne@68	2668 This is needed before setup_iterator allocates another
jpayne@68	2669 pileup_iter, or else memory will be lost. '''
jpayne@68	2670 if self.pileup_iter != <bam_mplp_t>NULL:
jpayne@68	2671 with nogil:
jpayne@68	2672 bam_mplp_reset(self.pileup_iter)
jpayne@68	2673 bam_mplp_destroy(self.pileup_iter)
jpayne@68	2674 self.pileup_iter = <bam_mplp_t>NULL
jpayne@68	2675
jpayne@68	2676 def __dealloc__(self):
jpayne@68	2677 # reset in order to avoid memory leak messages for iterators
jpayne@68	2678 # that have not been fully consumed
jpayne@68	2679 self._free_pileup_iter()
jpayne@68	2680 self.plp = <const bam_pileup1_t*>NULL
jpayne@68	2681
jpayne@68	2682 if self.iterdata.seq != NULL:
jpayne@68	2683 free(self.iterdata.seq)
jpayne@68	2684 self.iterdata.seq = NULL
jpayne@68	2685
jpayne@68	2686 # backwards compatibility
jpayne@68	2687
jpayne@68	2688 def hasReference(self):
jpayne@68	2689 return self.has_reference()
jpayne@68	2690 cdef char * getSequence(self):
jpayne@68	2691 return self.get_sequence()
jpayne@68	2692 def addReference(self, FastaFile fastafile):
jpayne@68	2693 return self.add_reference(fastafile)
jpayne@68	2694
jpayne@68	2695
jpayne@68	2696 cdef class IteratorColumnRegion(IteratorColumn):
jpayne@68	2697 '''iterates over a region only.
jpayne@68	2698 '''
jpayne@68	2699 def __cinit__(self,
jpayne@68	2700 AlignmentFile samfile,
jpayne@68	2701 int tid = 0,
jpayne@68	2702 int start = 0,
jpayne@68	2703 int stop = MAX_POS,
jpayne@68	2704 int truncate = False,
jpayne@68	2705 int multiple_iterators = True,
jpayne@68	2706 **kwargs ):
jpayne@68	2707
jpayne@68	2708 # initialize iterator. Multiple iterators not available
jpayne@68	2709 # for CRAM.
jpayne@68	2710 if multiple_iterators and samfile.is_cram:
jpayne@68	2711 warnings.warn("multiple_iterators not implemented for CRAM")
jpayne@68	2712 multiple_iterators = False
jpayne@68	2713
jpayne@68	2714 self._setup_iterator(tid, start, stop, multiple_iterators)
jpayne@68	2715 self.start = start
jpayne@68	2716 self.stop = stop
jpayne@68	2717 self.truncate = truncate
jpayne@68	2718
jpayne@68	2719 def __next__(self):
jpayne@68	2720
jpayne@68	2721 cdef int n
jpayne@68	2722
jpayne@68	2723 while 1:
jpayne@68	2724 n = self.cnext()
jpayne@68	2725 if n < 0:
jpayne@68	2726 raise ValueError("error during iteration" )
jpayne@68	2727
jpayne@68	2728 if n == 0:
jpayne@68	2729 raise StopIteration
jpayne@68	2730
jpayne@68	2731 if self.truncate:
jpayne@68	2732 if self.start > self.pos:
jpayne@68	2733 continue
jpayne@68	2734 if self.pos >= self.stop:
jpayne@68	2735 raise StopIteration
jpayne@68	2736
jpayne@68	2737 return makePileupColumn(&self.plp,
jpayne@68	2738 self.tid,
jpayne@68	2739 self.pos,
jpayne@68	2740 self.n_plp,
jpayne@68	2741 self.min_base_quality,
jpayne@68	2742 self.iterdata.seq,
jpayne@68	2743 self.samfile.header)
jpayne@68	2744
jpayne@68	2745
jpayne@68	2746 cdef class IteratorColumnAllRefs(IteratorColumn):
jpayne@68	2747 """iterates over all columns by chaining iterators over each reference
jpayne@68	2748 """
jpayne@68	2749
jpayne@68	2750 def __cinit__(self,
jpayne@68	2751 AlignmentFile samfile,
jpayne@68	2752 **kwargs):
jpayne@68	2753
jpayne@68	2754 # no iteration over empty files
jpayne@68	2755 if not samfile.nreferences:
jpayne@68	2756 raise StopIteration
jpayne@68	2757
jpayne@68	2758 # initialize iterator
jpayne@68	2759 self._setup_iterator(self.tid, 0, MAX_POS, 1)
jpayne@68	2760
jpayne@68	2761 def __next__(self):
jpayne@68	2762
jpayne@68	2763 cdef int n
jpayne@68	2764 while 1:
jpayne@68	2765 n = self.cnext()
jpayne@68	2766 if n < 0:
jpayne@68	2767 raise ValueError("error during iteration")
jpayne@68	2768
jpayne@68	2769 # proceed to next reference or stop
jpayne@68	2770 if n == 0:
jpayne@68	2771 self.tid += 1
jpayne@68	2772 if self.tid < self.samfile.nreferences:
jpayne@68	2773 self._setup_iterator(self.tid, 0, MAX_POS, 0)
jpayne@68	2774 else:
jpayne@68	2775 raise StopIteration
jpayne@68	2776 continue
jpayne@68	2777
jpayne@68	2778 # return result, if within same reference
jpayne@68	2779 return makePileupColumn(&self.plp,
jpayne@68	2780 self.tid,
jpayne@68	2781 self.pos,
jpayne@68	2782 self.n_plp,
jpayne@68	2783 self.min_base_quality,
jpayne@68	2784 self.iterdata.seq,
jpayne@68	2785 self.samfile.header)
jpayne@68	2786
jpayne@68	2787
jpayne@68	2788 cdef class IteratorColumnAll(IteratorColumn):
jpayne@68	2789 """iterates over all columns, without using an index
jpayne@68	2790 """
jpayne@68	2791
jpayne@68	2792 def __cinit__(self,
jpayne@68	2793 AlignmentFile samfile,
jpayne@68	2794 **kwargs):
jpayne@68	2795
jpayne@68	2796 self._setup_raw_rest_iterator()
jpayne@68	2797
jpayne@68	2798 def __next__(self):
jpayne@68	2799
jpayne@68	2800 cdef int n
jpayne@68	2801 n = self.cnext()
jpayne@68	2802 if n < 0:
jpayne@68	2803 raise ValueError("error during iteration")
jpayne@68	2804
jpayne@68	2805 if n == 0:
jpayne@68	2806 raise StopIteration
jpayne@68	2807
jpayne@68	2808 return makePileupColumn(&self.plp,
jpayne@68	2809 self.tid,
jpayne@68	2810 self.pos,
jpayne@68	2811 self.n_plp,
jpayne@68	2812 self.min_base_quality,
jpayne@68	2813 self.iterdata.seq,
jpayne@68	2814 self.samfile.header)
jpayne@68	2815
jpayne@68	2816
jpayne@68	2817 cdef class SNPCall:
jpayne@68	2818 '''the results of a SNP call.'''
jpayne@68	2819 cdef int _tid
jpayne@68	2820 cdef int _pos
jpayne@68	2821 cdef char _reference_base
jpayne@68	2822 cdef char _genotype
jpayne@68	2823 cdef int _consensus_quality
jpayne@68	2824 cdef int _snp_quality
jpayne@68	2825 cdef int _rms_mapping_quality
jpayne@68	2826 cdef int _coverage
jpayne@68	2827
jpayne@68	2828 property tid:
jpayne@68	2829 '''the chromosome ID as is defined in the header'''
jpayne@68	2830 def __get__(self):
jpayne@68	2831 return self._tid
jpayne@68	2832
jpayne@68	2833 property pos:
jpayne@68	2834 '''nucleotide position of SNP.'''
jpayne@68	2835 def __get__(self): return self._pos
jpayne@68	2836
jpayne@68	2837 property reference_base:
jpayne@68	2838 '''reference base at pos. ``N`` if no reference sequence supplied.'''
jpayne@68	2839 def __get__(self): return from_string_and_size( &self._reference_base, 1 )
jpayne@68	2840
jpayne@68	2841 property genotype:
jpayne@68	2842 '''the genotype called.'''
jpayne@68	2843 def __get__(self): return from_string_and_size( &self._genotype, 1 )
jpayne@68	2844
jpayne@68	2845 property consensus_quality:
jpayne@68	2846 '''the genotype quality (Phred-scaled).'''
jpayne@68	2847 def __get__(self): return self._consensus_quality
jpayne@68	2848
jpayne@68	2849 property snp_quality:
jpayne@68	2850 '''the snp quality (Phred scaled) - probability of consensus being
jpayne@68	2851 identical to reference sequence.'''
jpayne@68	2852 def __get__(self): return self._snp_quality
jpayne@68	2853
jpayne@68	2854 property mapping_quality:
jpayne@68	2855 '''the root mean square (rms) of the mapping quality of all reads
jpayne@68	2856 involved in the call.'''
jpayne@68	2857 def __get__(self): return self._rms_mapping_quality
jpayne@68	2858
jpayne@68	2859 property coverage:
jpayne@68	2860 '''coverage or read depth - the number of reads involved in the call.'''
jpayne@68	2861 def __get__(self): return self._coverage
jpayne@68	2862
jpayne@68	2863 def __str__(self):
jpayne@68	2864
jpayne@68	2865 return "\t".join( map(str, (
jpayne@68	2866 self.tid,
jpayne@68	2867 self.pos,
jpayne@68	2868 self.reference_base,
jpayne@68	2869 self.genotype,
jpayne@68	2870 self.consensus_quality,
jpayne@68	2871 self.snp_quality,
jpayne@68	2872 self.mapping_quality,
jpayne@68	2873 self.coverage ) ) )
jpayne@68	2874
jpayne@68	2875
jpayne@68	2876 cdef class IndexedReads:
jpayne@68	2877 """Index a Sam/BAM-file by query name while keeping the
jpayne@68	2878 original sort order intact.
jpayne@68	2879
jpayne@68	2880 The index is kept in memory and can be substantial.
jpayne@68	2881
jpayne@68	2882 By default, the file is re-opened to avoid conflicts if multiple
jpayne@68	2883 operators work on the same file. Set `multiple_iterators` = False
jpayne@68	2884 to not re-open `samfile`.
jpayne@68	2885
jpayne@68	2886 Parameters
jpayne@68	2887 ----------
jpayne@68	2888
jpayne@68	2889 samfile : AlignmentFile
jpayne@68	2890 File to be indexed.
jpayne@68	2891
jpayne@68	2892 multiple_iterators : bool
jpayne@68	2893 Flag indicating whether the file should be reopened. Reopening prevents
jpayne@68	2894 existing iterators being affected by the indexing.
jpayne@68	2895
jpayne@68	2896 """
jpayne@68	2897
jpayne@68	2898 def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
jpayne@68	2899 cdef char *cfilename
jpayne@68	2900
jpayne@68	2901 # makes sure that samfile stays alive as long as this
jpayne@68	2902 # object is alive.
jpayne@68	2903 self.samfile = samfile
jpayne@68	2904 cdef bam_hdr_t * hdr = NULL
jpayne@68	2905 assert samfile.is_bam, "can only apply IndexReads on bam files"
jpayne@68	2906
jpayne@68	2907 # multiple_iterators the file - note that this makes the iterator
jpayne@68	2908 # slow and causes pileup to slow down significantly.
jpayne@68	2909 if multiple_iterators:
jpayne@68	2910 cfilename = samfile.filename
jpayne@68	2911 with nogil:
jpayne@68	2912 self.htsfile = hts_open(cfilename, 'r')
jpayne@68	2913 if self.htsfile == NULL:
jpayne@68	2914 raise OSError("unable to reopen htsfile")
jpayne@68	2915
jpayne@68	2916 # need to advance in newly opened file to position after header
jpayne@68	2917 # better: use seek/tell?
jpayne@68	2918 with nogil:
jpayne@68	2919 hdr = sam_hdr_read(self.htsfile)
jpayne@68	2920 if hdr == NULL:
jpayne@68	2921 raise OSError("unable to read header information")
jpayne@68	2922 self.header = makeAlignmentHeader(hdr)
jpayne@68	2923 self.owns_samfile = True
jpayne@68	2924 else:
jpayne@68	2925 self.htsfile = self.samfile.htsfile
jpayne@68	2926 self.header = samfile.header
jpayne@68	2927 self.owns_samfile = False
jpayne@68	2928
jpayne@68	2929 def build(self):
jpayne@68	2930 '''build the index.'''
jpayne@68	2931
jpayne@68	2932 self.index = collections.defaultdict(list)
jpayne@68	2933
jpayne@68	2934 # this method will start indexing from the current file position
jpayne@68	2935 cdef int ret = 1
jpayne@68	2936 cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
jpayne@68	2937 if b == NULL:
jpayne@68	2938 raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))
jpayne@68	2939
jpayne@68	2940 cdef uint64_t pos
jpayne@68	2941 cdef bam_hdr_t * hdr = self.header.ptr
jpayne@68	2942
jpayne@68	2943 while ret > 0:
jpayne@68	2944 with nogil:
jpayne@68	2945 pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
jpayne@68	2946 ret = sam_read1(self.htsfile,
jpayne@68	2947 hdr,
jpayne@68	2948 b)
jpayne@68	2949
jpayne@68	2950 if ret > 0:
jpayne@68	2951 qname = charptr_to_str(pysam_bam_get_qname(b))
jpayne@68	2952 self.index[qname].append(pos)
jpayne@68	2953
jpayne@68	2954 bam_destroy1(b)
jpayne@68	2955
jpayne@68	2956 def find(self, query_name):
jpayne@68	2957 '''find `query_name` in index.
jpayne@68	2958
jpayne@68	2959 Returns
jpayne@68	2960 -------
jpayne@68	2961
jpayne@68	2962 IteratorRowSelection
jpayne@68	2963 Returns an iterator over all reads with query_name.
jpayne@68	2964
jpayne@68	2965 Raises
jpayne@68	2966 ------
jpayne@68	2967
jpayne@68	2968 KeyError
jpayne@68	2969 if the `query_name` is not in the index.
jpayne@68	2970
jpayne@68	2971 '''
jpayne@68	2972 if query_name in self.index:
jpayne@68	2973 return IteratorRowSelection(
jpayne@68	2974 self.samfile,
jpayne@68	2975 self.index[query_name],
jpayne@68	2976 multiple_iterators = False)
jpayne@68	2977 else:
jpayne@68	2978 raise KeyError("read %s not found" % query_name)
jpayne@68	2979
jpayne@68	2980 def __dealloc__(self):
jpayne@68	2981 if self.owns_samfile:
jpayne@68	2982 hts_close(self.htsfile)

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx @ 68:5028fdace37b