annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcalignmentfile.pyx @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 # cython: embedsignature=True
jpayne@69 2 # cython: profile=True
jpayne@69 3 ########################################################
jpayne@69 4 ########################################################
jpayne@69 5 # Cython wrapper for SAM/BAM/CRAM files based on htslib
jpayne@69 6 ########################################################
jpayne@69 7 # The principal classes defined in this module are:
jpayne@69 8 #
jpayne@69 9 # class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
jpayne@69 10 #
jpayne@69 11 # class AlignmentHeader manage SAM/BAM/CRAM header data
jpayne@69 12 #
jpayne@69 13 # class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
jpayne@69 14 # the original sort order intact
jpayne@69 15 #
jpayne@69 16 # Additionally this module defines numerous additional classes that
jpayne@69 17 # are part of the internal API. These are:
jpayne@69 18 #
jpayne@69 19 # Various iterator classes to iterate over alignments in sequential
jpayne@69 20 # (IteratorRow) or in a stacked fashion (IteratorColumn):
jpayne@69 21 #
jpayne@69 22 # class IteratorRow
jpayne@69 23 # class IteratorRowRegion
jpayne@69 24 # class IteratorRowHead
jpayne@69 25 # class IteratorRowAll
jpayne@69 26 # class IteratorRowAllRefs
jpayne@69 27 # class IteratorRowSelection
jpayne@69 28 # class IteratorColumn
jpayne@69 29 # class IteratorColumnRegion
jpayne@69 30 # class IteratorColumnAll
jpayne@69 31 # class IteratorColumnAllRefs
jpayne@69 32 #
jpayne@69 33 ########################################################
jpayne@69 34 #
jpayne@69 35 # The MIT License
jpayne@69 36 #
jpayne@69 37 # Copyright (c) 2015 Andreas Heger
jpayne@69 38 #
jpayne@69 39 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@69 40 # copy of this software and associated documentation files (the "Software"),
jpayne@69 41 # to deal in the Software without restriction, including without limitation
jpayne@69 42 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@69 43 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@69 44 # Software is furnished to do so, subject to the following conditions:
jpayne@69 45 #
jpayne@69 46 # The above copyright notice and this permission notice shall be included in
jpayne@69 47 # all copies or substantial portions of the Software.
jpayne@69 48 #
jpayne@69 49 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@69 50 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@69 51 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@69 52 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@69 53 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@69 54 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@69 55 # DEALINGS IN THE SOFTWARE.
jpayne@69 56 #
jpayne@69 57 ########################################################
jpayne@69 58 import os
jpayne@69 59 import collections
jpayne@69 60 try:
jpayne@69 61 from collections.abc import Sequence, Mapping # noqa
jpayne@69 62 except ImportError:
jpayne@69 63 from collections import Sequence, Mapping # noqa
jpayne@69 64 import re
jpayne@69 65 import warnings
jpayne@69 66 import array
jpayne@69 67 from libc.errno cimport errno, EPIPE
jpayne@69 68 from libc.string cimport strcmp, strpbrk, strerror
jpayne@69 69 from libc.stdint cimport INT32_MAX
jpayne@69 70
jpayne@69 71 from cpython cimport array as c_array
jpayne@69 72
jpayne@69 73 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
jpayne@69 74 from pysam.libcutils cimport encode_filename, from_string_and_size
jpayne@69 75 from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
jpayne@69 76 from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
jpayne@69 77 HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL
jpayne@69 78
jpayne@69 79 from io import StringIO
jpayne@69 80
jpayne@69 81 cimport cython
jpayne@69 82
jpayne@69 83
jpayne@69 84 __all__ = [
jpayne@69 85 "AlignmentFile",
jpayne@69 86 "AlignmentHeader",
jpayne@69 87 "IteratorRow",
jpayne@69 88 "IteratorColumn",
jpayne@69 89 "IndexedReads"]
jpayne@69 90
jpayne@69 91 IndexStats = collections.namedtuple("IndexStats",
jpayne@69 92 ("contig",
jpayne@69 93 "mapped",
jpayne@69 94 "unmapped",
jpayne@69 95 "total"))
jpayne@69 96
jpayne@69 97 ########################################################
jpayne@69 98 ## global variables
jpayne@69 99 # maximum genomic coordinace
jpayne@69 100 # for some reason, using 'int' causes overflow
jpayne@69 101 cdef int MAX_POS = (1 << 31) - 1
jpayne@69 102
jpayne@69 103 # valid types for SAM headers
jpayne@69 104 VALID_HEADER_TYPES = {"HD" : Mapping,
jpayne@69 105 "SQ" : Sequence,
jpayne@69 106 "RG" : Sequence,
jpayne@69 107 "PG" : Sequence,
jpayne@69 108 "CO" : Sequence}
jpayne@69 109
jpayne@69 110 # order of records within SAM headers
jpayne@69 111 VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
jpayne@69 112
jpayne@69 113 # default type conversions within SAM header records
jpayne@69 114 KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str,
jpayne@69 115 "SS" : str,},
jpayne@69 116 "SQ" : {"SN" : str, "LN" : int, "AS" : str,
jpayne@69 117 "M5" : str, "SP" : str, "UR" : str,
jpayne@69 118 "AH" : str, "TP" : str, "DS" : str,
jpayne@69 119 "AN" : str,},
jpayne@69 120 "RG" : {"ID" : str, "CN" : str, "DS" : str,
jpayne@69 121 "DT" : str, "FO" : str, "KS" : str,
jpayne@69 122 "LB" : str, "PG" : str, "PI" : str,
jpayne@69 123 "PL" : str, "PM" : str, "PU" : str,
jpayne@69 124 "SM" : str, "BC" : str,},
jpayne@69 125 "PG" : {"ID" : str, "PN" : str, "CL" : str,
jpayne@69 126 "PP" : str, "DS" : str, "VN" : str,},}
jpayne@69 127
jpayne@69 128 # output order of fields within records. Ensure that CL is at
jpayne@69 129 # the end as parsing a CL will ignore any subsequent records.
jpayne@69 130 VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"),
jpayne@69 131 "SQ" : ("SN", "LN", "AS", "M5",
jpayne@69 132 "UR", "SP", "AH", "TP",
jpayne@69 133 "DS", "AN"),
jpayne@69 134 "RG" : ("ID", "CN", "SM", "LB",
jpayne@69 135 "PU", "PI", "DT", "DS",
jpayne@69 136 "PL", "FO", "KS", "PG",
jpayne@69 137 "PM", "BC"),
jpayne@69 138 "PG" : ("PN", "ID", "VN", "PP",
jpayne@69 139 "DS", "CL"),}
jpayne@69 140
jpayne@69 141
jpayne@69 142 def build_header_line(fields, record):
jpayne@69 143 '''build a header line from `fields` dictionary for `record`'''
jpayne@69 144
jpayne@69 145 # TODO: add checking for field and sort order
jpayne@69 146 line = ["@%s" % record]
jpayne@69 147 # comment
jpayne@69 148 if record == "CO":
jpayne@69 149 line.append(fields)
jpayne@69 150 # user tags
jpayne@69 151 elif record.islower():
jpayne@69 152 for key in sorted(fields):
jpayne@69 153 line.append("%s:%s" % (key, str(fields[key])))
jpayne@69 154 # defined tags
jpayne@69 155 else:
jpayne@69 156 # write fields of the specification
jpayne@69 157 for key in VALID_HEADER_ORDER[record]:
jpayne@69 158 if key in fields:
jpayne@69 159 line.append("%s:%s" % (key, str(fields[key])))
jpayne@69 160 # write user fields
jpayne@69 161 for key in fields:
jpayne@69 162 if not key.isupper():
jpayne@69 163 line.append("%s:%s" % (key, str(fields[key])))
jpayne@69 164
jpayne@69 165 return "\t".join(line)
jpayne@69 166
jpayne@69 167
jpayne@69 168 cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
jpayne@69 169 if not hdr:
jpayne@69 170 raise ValueError('cannot create AlignmentHeader, received NULL pointer')
jpayne@69 171
jpayne@69 172 # check: is AlignmetHeader.__cinit__ called?
jpayne@69 173 cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader)
jpayne@69 174 header.ptr = hdr
jpayne@69 175
jpayne@69 176 return header
jpayne@69 177
jpayne@69 178 def read_failure_reason(code):
jpayne@69 179 if code == -2:
jpayne@69 180 return 'truncated file'
jpayne@69 181 else:
jpayne@69 182 return "error {} while reading file".format(code)
jpayne@69 183
jpayne@69 184
jpayne@69 185 # the following should be class-method for VariantHeader, but cdef @classmethods
jpayne@69 186 # are not implemented in cython.
jpayne@69 187 cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
jpayne@69 188 reference_names,
jpayne@69 189 reference_lengths,
jpayne@69 190 add_sq_text=True,
jpayne@69 191 text=None) except -1:
jpayne@69 192 """build header from list of reference names and lengths.
jpayne@69 193 """
jpayne@69 194
jpayne@69 195 cdef class AlignmentHeader(object):
jpayne@69 196 """header information for a :class:`AlignmentFile` object
jpayne@69 197
jpayne@69 198 Parameters
jpayne@69 199 ----------
jpayne@69 200 header_dict : dict
jpayne@69 201 build header from a multi-level dictionary. The
jpayne@69 202 first level are the four types ('HD', 'SQ', ...). The second
jpayne@69 203 level are a list of lines, with each line being a list of
jpayne@69 204 tag-value pairs. The header is constructed first from all the
jpayne@69 205 defined fields, followed by user tags in alphabetical
jpayne@69 206 order. Alternatively, an :class:`~pysam.AlignmentHeader`
jpayne@69 207 object can be passed directly.
jpayne@69 208
jpayne@69 209 text : string
jpayne@69 210 use the string provided as the header
jpayne@69 211
jpayne@69 212 reference_names : list
jpayne@69 213 see reference_lengths
jpayne@69 214
jpayne@69 215 reference_lengths : list
jpayne@69 216 build header from list of chromosome names and lengths. By
jpayne@69 217 default, 'SQ' and 'LN' tags will be added to the header
jpayne@69 218 text. This option can be changed by unsetting the flag
jpayne@69 219 `add_sq_text`.
jpayne@69 220
jpayne@69 221 add_sq_text : bool
jpayne@69 222 do not add 'SQ' and 'LN' tags to header. This option permits
jpayne@69 223 construction :term:`SAM` formatted files without a header.
jpayne@69 224
jpayne@69 225 """
jpayne@69 226
jpayne@69 227 # See makeVariantHeader for C constructor
jpayne@69 228 def __cinit__(self):
jpayne@69 229 self.ptr = NULL
jpayne@69 230
jpayne@69 231 # Python constructor
jpayne@69 232 def __init__(self):
jpayne@69 233 self.ptr = bam_hdr_init()
jpayne@69 234 if self.ptr is NULL:
jpayne@69 235 raise MemoryError("could not create header")
jpayne@69 236
jpayne@69 237 @classmethod
jpayne@69 238 def _from_text_and_lengths(cls, text, reference_names, reference_lengths):
jpayne@69 239
jpayne@69 240 cdef AlignmentHeader self = AlignmentHeader()
jpayne@69 241 cdef char *ctext
jpayne@69 242 cdef int l_text
jpayne@69 243 cdef int n, x
jpayne@69 244 if text is not None:
jpayne@69 245 btext = force_bytes(text)
jpayne@69 246 ctext = btext
jpayne@69 247 l_text = len(btext)
jpayne@69 248 self.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
jpayne@69 249 if self.ptr.text == NULL:
jpayne@69 250 raise MemoryError("could not allocate {} bytes".format(l_text + 1), sizeof(char))
jpayne@69 251 self.ptr.l_text = l_text
jpayne@69 252 memcpy(self.ptr.text, ctext, l_text + 1)
jpayne@69 253
jpayne@69 254 if reference_names and reference_lengths:
jpayne@69 255 reference_names = [force_bytes(ref) for ref in reference_names]
jpayne@69 256
jpayne@69 257 self.ptr.n_targets = len(reference_names)
jpayne@69 258
jpayne@69 259 n = sum([len(reference_names) + 1])
jpayne@69 260 self.ptr.target_name = <char**>calloc(n, sizeof(char*))
jpayne@69 261 if self.ptr.target_name == NULL:
jpayne@69 262 raise MemoryError("could not allocate {} bytes".format(n, sizeof(char *)))
jpayne@69 263
jpayne@69 264 self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
jpayne@69 265 if self.ptr.target_len == NULL:
jpayne@69 266 raise MemoryError("could not allocate {} bytes".format(n, sizeof(uint32_t)))
jpayne@69 267
jpayne@69 268 for x from 0 <= x < self.ptr.n_targets:
jpayne@69 269 self.ptr.target_len[x] = reference_lengths[x]
jpayne@69 270 name = reference_names[x]
jpayne@69 271 self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
jpayne@69 272 if self.ptr.target_name[x] == NULL:
jpayne@69 273 raise MemoryError("could not allocate {} bytes".format(len(name) + 1, sizeof(char)))
jpayne@69 274 strncpy(self.ptr.target_name[x], name, len(name))
jpayne@69 275
jpayne@69 276 return self
jpayne@69 277
jpayne@69 278 @classmethod
jpayne@69 279 def from_text(cls, text):
jpayne@69 280
jpayne@69 281 reference_names, reference_lengths = [], []
jpayne@69 282 for line in text.splitlines():
jpayne@69 283 if line.startswith("@SQ"):
jpayne@69 284 fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
jpayne@69 285 try:
jpayne@69 286 reference_names.append(fields["SN"])
jpayne@69 287 reference_lengths.append(int(fields["LN"]))
jpayne@69 288 except KeyError:
jpayne@69 289 raise KeyError("incomplete sequence information in '%s'" % str(fields))
jpayne@69 290 except ValueError:
jpayne@69 291 raise ValueError("wrong sequence information in '%s'" % str(fields))
jpayne@69 292
jpayne@69 293 return cls._from_text_and_lengths(text, reference_names, reference_lengths)
jpayne@69 294
jpayne@69 295 @classmethod
jpayne@69 296 def from_dict(cls, header_dict):
jpayne@69 297
jpayne@69 298 cdef list lines = []
jpayne@69 299 # first: defined tags
jpayne@69 300 for record in VALID_HEADERS:
jpayne@69 301 if record in header_dict:
jpayne@69 302 data = header_dict[record]
jpayne@69 303 if not isinstance(data, VALID_HEADER_TYPES[record]):
jpayne@69 304 raise ValueError(
jpayne@69 305 "invalid type for record %s: %s, expected %s".format(
jpayne@69 306 record, type(data), VALID_HEADER_TYPES[record]))
jpayne@69 307 if isinstance(data, Mapping):
jpayne@69 308 lines.append(build_header_line(data, record))
jpayne@69 309 else:
jpayne@69 310 for fields in header_dict[record]:
jpayne@69 311 lines.append(build_header_line(fields, record))
jpayne@69 312
jpayne@69 313 # then: user tags (lower case), sorted alphabetically
jpayne@69 314 for record, data in sorted(header_dict.items()):
jpayne@69 315 if record in VALID_HEADERS:
jpayne@69 316 continue
jpayne@69 317 if isinstance(data, Mapping):
jpayne@69 318 lines.append(build_header_line(data, record))
jpayne@69 319 else:
jpayne@69 320 for fields in header_dict[record]:
jpayne@69 321 lines.append(build_header_line(fields, record))
jpayne@69 322
jpayne@69 323 text = "\n".join(lines) + "\n"
jpayne@69 324
jpayne@69 325 reference_names, reference_lengths = [], []
jpayne@69 326 if "SQ" in header_dict:
jpayne@69 327 for fields in header_dict["SQ"]:
jpayne@69 328 try:
jpayne@69 329 reference_names.append(fields["SN"])
jpayne@69 330 reference_lengths.append(fields["LN"])
jpayne@69 331 except KeyError:
jpayne@69 332 raise KeyError("incomplete sequence information in '%s'" % str(fields))
jpayne@69 333
jpayne@69 334 return cls._from_text_and_lengths(text, reference_names, reference_lengths)
jpayne@69 335
jpayne@69 336 @classmethod
jpayne@69 337 def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):
jpayne@69 338
jpayne@69 339 if len(reference_names) != len(reference_lengths):
jpayne@69 340 raise ValueError("number of reference names and lengths do not match")
jpayne@69 341
jpayne@69 342 # optionally, if there is no text, add a SAM compatible header to output file.
jpayne@69 343 if text is None and add_sq_text:
jpayne@69 344 text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
jpayne@69 345 reference_names, reference_lengths)])
jpayne@69 346
jpayne@69 347 return cls._from_text_and_lengths(text, reference_names, reference_lengths)
jpayne@69 348
jpayne@69 349 def __dealloc__(self):
jpayne@69 350 bam_hdr_destroy(self.ptr)
jpayne@69 351 self.ptr = NULL
jpayne@69 352
jpayne@69 353 def __bool__(self):
jpayne@69 354 return self.ptr != NULL
jpayne@69 355
jpayne@69 356 def copy(self):
jpayne@69 357 return makeAlignmentHeader(bam_hdr_dup(self.ptr))
jpayne@69 358
jpayne@69 359 property nreferences:
jpayne@69 360 """int with the number of :term:`reference` sequences in the file.
jpayne@69 361
jpayne@69 362 This is a read-only attribute."""
jpayne@69 363 def __get__(self):
jpayne@69 364 return self.ptr.n_targets
jpayne@69 365
jpayne@69 366 property references:
jpayne@69 367 """tuple with the names of :term:`reference` sequences. This is a
jpayne@69 368 read-only attribute"""
jpayne@69 369 def __get__(self):
jpayne@69 370 t = []
jpayne@69 371 cdef int x
jpayne@69 372 for x in range(self.ptr.n_targets):
jpayne@69 373 t.append(charptr_to_str(self.ptr.target_name[x]))
jpayne@69 374 return tuple(t)
jpayne@69 375
jpayne@69 376 property lengths:
jpayne@69 377 """tuple of the lengths of the :term:`reference` sequences. This is a
jpayne@69 378 read-only attribute. The lengths are in the same order as
jpayne@69 379 :attr:`pysam.AlignmentFile.references`
jpayne@69 380 """
jpayne@69 381 def __get__(self):
jpayne@69 382 t = []
jpayne@69 383 cdef int x
jpayne@69 384 for x in range(self.ptr.n_targets):
jpayne@69 385 t.append(self.ptr.target_len[x])
jpayne@69 386 return tuple(t)
jpayne@69 387
jpayne@69 388 def _build_sequence_section(self):
jpayne@69 389 """return sequence section of header.
jpayne@69 390
jpayne@69 391 The sequence section is built from the list of reference names and
jpayne@69 392 lengths stored in the BAM-file and not from any @SQ entries that
jpayne@69 393 are part of the header's text section.
jpayne@69 394 """
jpayne@69 395
jpayne@69 396 cdef int x
jpayne@69 397 text = []
jpayne@69 398 for x in range(self.ptr.n_targets):
jpayne@69 399 text.append("@SQ\tSN:{}\tLN:{}\n".format(
jpayne@69 400 force_str(self.ptr.target_name[x]),
jpayne@69 401 self.ptr.target_len[x]))
jpayne@69 402 return "".join(text)
jpayne@69 403
jpayne@69 404 def to_dict(self):
jpayne@69 405 """return two-level dictionary with header information from the file.
jpayne@69 406
jpayne@69 407 The first level contains the record (``HD``, ``SQ``, etc) and
jpayne@69 408 the second level contains the fields (``VN``, ``LN``, etc).
jpayne@69 409
jpayne@69 410 The parser is validating and will raise an AssertionError if
jpayne@69 411 if encounters any record or field tags that are not part of
jpayne@69 412 the SAM specification. Use the
jpayne@69 413 :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
jpayne@69 414 header.
jpayne@69 415
jpayne@69 416 The parsing follows the SAM format specification with the
jpayne@69 417 exception of the ``CL`` field. This option will consume the
jpayne@69 418 rest of a header line irrespective of any additional fields.
jpayne@69 419 This behaviour has been added to accommodate command line
jpayne@69 420 options that contain characters that are not valid field
jpayne@69 421 separators.
jpayne@69 422
jpayne@69 423 If no @SQ entries are within the text section of the header,
jpayne@69 424 this will be automatically added from the reference names and
jpayne@69 425 lengths stored in the binary part of the header.
jpayne@69 426 """
jpayne@69 427 result = collections.OrderedDict()
jpayne@69 428
jpayne@69 429 # convert to python string
jpayne@69 430 t = self.__str__()
jpayne@69 431 for line in t.split("\n"):
jpayne@69 432 line = line.strip(' \0')
jpayne@69 433 if not line:
jpayne@69 434 continue
jpayne@69 435 assert line.startswith("@"), \
jpayne@69 436 "header line without '@': '%s'" % line
jpayne@69 437 fields = line[1:].split("\t")
jpayne@69 438 record = fields[0]
jpayne@69 439 assert record in VALID_HEADER_TYPES, \
jpayne@69 440 "header line with invalid type '%s': '%s'" % (record, line)
jpayne@69 441
jpayne@69 442 # treat comments
jpayne@69 443 if record == "CO":
jpayne@69 444 if record not in result:
jpayne@69 445 result[record] = []
jpayne@69 446 result[record].append("\t".join( fields[1:]))
jpayne@69 447 continue
jpayne@69 448 # the following is clumsy as generators do not work?
jpayne@69 449 x = {}
jpayne@69 450
jpayne@69 451 for idx, field in enumerate(fields[1:]):
jpayne@69 452 if ":" not in field:
jpayne@69 453 raise ValueError("malformatted header: no ':' in field" )
jpayne@69 454 key, value = field.split(":", 1)
jpayne@69 455 if key in ("CL",):
jpayne@69 456 # special treatment for command line
jpayne@69 457 # statements (CL). These might contain
jpayne@69 458 # characters that are non-conformant with
jpayne@69 459 # the valid field separators in the SAM
jpayne@69 460 # header. Thus, in contravention to the
jpayne@69 461 # SAM API, consume the rest of the line.
jpayne@69 462 key, value = "\t".join(fields[idx+1:]).split(":", 1)
jpayne@69 463 x[key] = KNOWN_HEADER_FIELDS[record][key](value)
jpayne@69 464 break
jpayne@69 465
jpayne@69 466 # interpret type of known header record tags, default to str
jpayne@69 467 x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
jpayne@69 468
jpayne@69 469 if VALID_HEADER_TYPES[record] == Mapping:
jpayne@69 470 if record in result:
jpayne@69 471 raise ValueError(
jpayne@69 472 "multiple '%s' lines are not permitted" % record)
jpayne@69 473
jpayne@69 474 result[record] = x
jpayne@69 475 elif VALID_HEADER_TYPES[record] == Sequence:
jpayne@69 476 if record not in result: result[record] = []
jpayne@69 477 result[record].append(x)
jpayne@69 478
jpayne@69 479 # if there are no SQ lines in the header, add the
jpayne@69 480 # reference names from the information in the bam
jpayne@69 481 # file.
jpayne@69 482 #
jpayne@69 483 # Background: c-samtools keeps the textual part of the
jpayne@69 484 # header separate from the list of reference names and
jpayne@69 485 # lengths. Thus, if a header contains only SQ lines,
jpayne@69 486 # the SQ information is not part of the textual header
jpayne@69 487 # and thus are missing from the output. See issue 84.
jpayne@69 488 if "SQ" not in result:
jpayne@69 489 sq = []
jpayne@69 490 for ref, length in zip(self.references, self.lengths):
jpayne@69 491 sq.append({'LN': length, 'SN': ref })
jpayne@69 492 result["SQ"] = sq
jpayne@69 493
jpayne@69 494 return result
jpayne@69 495
jpayne@69 496 def as_dict(self):
jpayne@69 497 """deprecated, use :meth:`to_dict()` instead"""
jpayne@69 498 return self.to_dict()
jpayne@69 499
jpayne@69 500 def get_reference_name(self, tid):
jpayne@69 501 if tid == -1:
jpayne@69 502 return None
jpayne@69 503 if not 0 <= tid < self.ptr.n_targets:
jpayne@69 504 raise ValueError("reference_id %i out of range 0<=tid<%i" %
jpayne@69 505 (tid, self.ptr.n_targets))
jpayne@69 506 return charptr_to_str(self.ptr.target_name[tid])
jpayne@69 507
jpayne@69 508 def get_reference_length(self, reference):
jpayne@69 509 cdef int tid = self.get_tid(reference)
jpayne@69 510 if tid < 0:
jpayne@69 511 raise KeyError("unknown reference {}".format(reference))
jpayne@69 512 else:
jpayne@69 513 return self.ptr.target_len[tid]
jpayne@69 514
jpayne@69 515 def is_valid_tid(self, int tid):
jpayne@69 516 """
jpayne@69 517 return True if the numerical :term:`tid` is valid; False otherwise.
jpayne@69 518
jpayne@69 519 Note that the unmapped tid code (-1) counts as an invalid.
jpayne@69 520 """
jpayne@69 521 return 0 <= tid < self.ptr.n_targets
jpayne@69 522
jpayne@69 523 def get_tid(self, reference):
jpayne@69 524 """
jpayne@69 525 return the numerical :term:`tid` corresponding to
jpayne@69 526 :term:`reference`
jpayne@69 527
jpayne@69 528 returns -1 if reference is not known.
jpayne@69 529 """
jpayne@69 530 reference = force_bytes(reference)
jpayne@69 531 tid = bam_name2id(self.ptr, reference)
jpayne@69 532 if tid < -1:
jpayne@69 533 raise ValueError('could not parse header')
jpayne@69 534 return tid
jpayne@69 535
jpayne@69 536 def __str__(self):
jpayne@69 537 '''string with the full contents of the :term:`sam file` header as a
jpayne@69 538 string.
jpayne@69 539
jpayne@69 540 If no @SQ entries are within the text section of the header,
jpayne@69 541 this will be automatically added from the reference names and
jpayne@69 542 lengths stored in the binary part of the header.
jpayne@69 543
jpayne@69 544 See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
jpayne@69 545 representation of the header.
jpayne@69 546 '''
jpayne@69 547 text = from_string_and_size(self.ptr.text, self.ptr.l_text)
jpayne@69 548 if "@SQ" not in text:
jpayne@69 549 text += "\n" + self._build_sequence_section()
jpayne@69 550 return text
jpayne@69 551
jpayne@69 552 # dictionary access methods, for backwards compatibility.
jpayne@69 553 def __setitem__(self, key, value):
jpayne@69 554 raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict()")
jpayne@69 555
jpayne@69 556 def __getitem__(self, key):
jpayne@69 557 return self.to_dict().__getitem__(key)
jpayne@69 558
jpayne@69 559 def items(self):
jpayne@69 560 return self.to_dict().items()
jpayne@69 561
jpayne@69 562 # PY2 compatibility
jpayne@69 563 def iteritems(self):
jpayne@69 564 return self.to_dict().items()
jpayne@69 565
jpayne@69 566 def keys(self):
jpayne@69 567 return self.to_dict().keys()
jpayne@69 568
jpayne@69 569 def values(self):
jpayne@69 570 return self.to_dict().values()
jpayne@69 571
jpayne@69 572 def get(self, *args):
jpayne@69 573 return self.to_dict().get(*args)
jpayne@69 574
jpayne@69 575 def __len__(self):
jpayne@69 576 return self.to_dict().__len__()
jpayne@69 577
jpayne@69 578 def __contains__(self, key):
jpayne@69 579 return self.to_dict().__contains__(key)
jpayne@69 580
jpayne@69 581
jpayne@69 582 cdef class AlignmentFile(HTSFile):
jpayne@69 583 """AlignmentFile(filepath_or_object, mode=None, template=None,
jpayne@69 584 reference_names=None, reference_lengths=None, text=NULL,
jpayne@69 585 header=None, add_sq_text=False, check_header=True, check_sq=True,
jpayne@69 586 reference_filename=None, filename=None, index_filename=None,
jpayne@69 587 filepath_index=None, require_index=False, duplicate_filehandle=True,
jpayne@69 588 ignore_truncation=False, threads=1)
jpayne@69 589
jpayne@69 590 A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.
jpayne@69 591
jpayne@69 592 If `filepath_or_object` is a string, the file is automatically
jpayne@69 593 opened. If `filepath_or_object` is a python File object, the
jpayne@69 594 already opened file will be used.
jpayne@69 595
jpayne@69 596 If the file is opened for reading and an index exists (if file is BAM, a
jpayne@69 597 .bai file or if CRAM a .crai file), it will be opened automatically.
jpayne@69 598 `index_filename` may be specified explicitly. If the index is not named
jpayne@69 599 in the standard manner, not located in the same directory as the
jpayne@69 600 BAM/CRAM file, or is remote. Without an index, random access via
jpayne@69 601 :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup`
jpayne@69 602 is disabled.
jpayne@69 603
jpayne@69 604 For writing, the header of a :term:`SAM` file/:term:`BAM` file can
jpayne@69 605 be constituted from several sources (see also the samtools format
jpayne@69 606 specification):
jpayne@69 607
jpayne@69 608 1. If `template` is given, the header is copied from another
jpayne@69 609 `AlignmentFile` (`template` must be a
jpayne@69 610 :class:`~pysam.AlignmentFile`).
jpayne@69 611
jpayne@69 612 2. If `header` is given, the header is built from a
jpayne@69 613 multi-level dictionary.
jpayne@69 614
jpayne@69 615 3. If `text` is given, new header text is copied from raw
jpayne@69 616 text.
jpayne@69 617
jpayne@69 618 4. The names (`reference_names`) and lengths
jpayne@69 619 (`reference_lengths`) are supplied directly as lists.
jpayne@69 620
jpayne@69 621 When reading or writing a CRAM file, the filename of a FASTA-formatted
jpayne@69 622 reference can be specified with `reference_filename`.
jpayne@69 623
jpayne@69 624 By default, if a file is opened in mode 'r', it is checked
jpayne@69 625 for a valid header (`check_header` = True) and a definition of
jpayne@69 626 chromosome names (`check_sq` = True).
jpayne@69 627
jpayne@69 628 Parameters
jpayne@69 629 ----------
jpayne@69 630 mode : string
jpayne@69 631 `mode` should be ``r`` for reading or ``w`` for writing. The
jpayne@69 632 default is text mode (:term:`SAM`). For binary (:term:`BAM`)
jpayne@69 633 I/O you should append ``b`` for compressed or ``u`` for
jpayne@69 634 uncompressed :term:`BAM` output. Use ``h`` to output header
jpayne@69 635 information in text (:term:`TAM`) mode. Use ``c`` for
jpayne@69 636 :term:`CRAM` formatted files.
jpayne@69 637
jpayne@69 638 If ``b`` is present, it must immediately follow ``r`` or
jpayne@69 639 ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
jpayne@69 640 ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
jpayne@69 641 :term:`BAM` formatted file for reading, type::
jpayne@69 642
jpayne@69 643 f = pysam.AlignmentFile('ex1.bam','rb')
jpayne@69 644
jpayne@69 645 If mode is not specified, the method will try to auto-detect
jpayne@69 646 in the order 'rb', 'r', thus both the following should work::
jpayne@69 647
jpayne@69 648 f1 = pysam.AlignmentFile('ex1.bam')
jpayne@69 649 f2 = pysam.AlignmentFile('ex1.sam')
jpayne@69 650
jpayne@69 651 template : AlignmentFile
jpayne@69 652 when writing, copy header from file `template`.
jpayne@69 653
jpayne@69 654 header : dict or AlignmentHeader
jpayne@69 655 when writing, build header from a multi-level dictionary. The
jpayne@69 656 first level are the four types ('HD', 'SQ', ...). The second
jpayne@69 657 level are a list of lines, with each line being a list of
jpayne@69 658 tag-value pairs. The header is constructed first from all the
jpayne@69 659 defined fields, followed by user tags in alphabetical
jpayne@69 660 order. Alternatively, an :class:`~pysam.AlignmentHeader`
jpayne@69 661 object can be passed directly.
jpayne@69 662
jpayne@69 663 text : string
jpayne@69 664 when writing, use the string provided as the header
jpayne@69 665
jpayne@69 666 reference_names : list
jpayne@69 667 see reference_lengths
jpayne@69 668
jpayne@69 669 reference_lengths : list
jpayne@69 670 when writing or opening a SAM file without header build header
jpayne@69 671 from list of chromosome names and lengths. By default, 'SQ'
jpayne@69 672 and 'LN' tags will be added to the header text. This option
jpayne@69 673 can be changed by unsetting the flag `add_sq_text`.
jpayne@69 674
jpayne@69 675 add_sq_text : bool
jpayne@69 676 do not add 'SQ' and 'LN' tags to header. This option permits
jpayne@69 677 construction :term:`SAM` formatted files without a header.
jpayne@69 678
jpayne@69 679 add_sam_header : bool
jpayne@69 680 when outputting SAM the default is to output a header. This is
jpayne@69 681 equivalent to opening the file in 'wh' mode. If this option is
jpayne@69 682 set to False, no header will be output. To read such a file,
jpayne@69 683 set `check_header=False`.
jpayne@69 684
jpayne@69 685 check_header : bool
jpayne@69 686 obsolete: when reading a SAM file, check if header is present
jpayne@69 687 (default=True)
jpayne@69 688
jpayne@69 689 check_sq : bool
jpayne@69 690 when reading, check if SQ entries are present in header
jpayne@69 691 (default=True)
jpayne@69 692
jpayne@69 693 reference_filename : string
jpayne@69 694 Path to a FASTA-formatted reference file. Valid only for CRAM files.
jpayne@69 695 When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
jpayne@69 696 specified in the header (``UR`` tag), which are normally used to find
jpayne@69 697 the reference.
jpayne@69 698
jpayne@69 699 index_filename : string
jpayne@69 700 Explicit path to the index file. Only needed if the index is not
jpayne@69 701 named in the standard manner, not located in the same directory as
jpayne@69 702 the BAM/CRAM file, or is remote. An IOError is raised if the index
jpayne@69 703 cannot be found or is invalid.
jpayne@69 704
jpayne@69 705 filepath_index : string
jpayne@69 706 Alias for `index_filename`.
jpayne@69 707
jpayne@69 708 require_index : bool
jpayne@69 709 When reading, require that an index file is present and is valid or
jpayne@69 710 raise an IOError. (default=False)
jpayne@69 711
jpayne@69 712 filename : string
jpayne@69 713 Alternative to filepath_or_object. Filename of the file
jpayne@69 714 to be opened.
jpayne@69 715
jpayne@69 716 duplicate_filehandle: bool
jpayne@69 717 By default, file handles passed either directly or through
jpayne@69 718 File-like objects will be duplicated before passing them to
jpayne@69 719 htslib. The duplication prevents issues where the same stream
jpayne@69 720 will be closed by htslib and through destruction of the
jpayne@69 721 high-level python object. Set to False to turn off
jpayne@69 722 duplication.
jpayne@69 723
jpayne@69 724 ignore_truncation: bool
jpayne@69 725 Issue a warning, instead of raising an error if the current file
jpayne@69 726 appears to be truncated due to a missing EOF marker. Only applies
jpayne@69 727 to bgzipped formats. (Default=False)
jpayne@69 728
jpayne@69 729 format_options: list
jpayne@69 730 A list of key=value strings, as accepted by --input-fmt-option and
jpayne@69 731 --output-fmt-option in samtools.
jpayne@69 732 threads: integer
jpayne@69 733 Number of threads to use for compressing/decompressing BAM/CRAM files.
jpayne@69 734 Setting threads to > 1 cannot be combined with `ignore_truncation`.
jpayne@69 735 (Default=1)
jpayne@69 736 """
jpayne@69 737
jpayne@69 738 def __cinit__(self, *args, **kwargs):
jpayne@69 739 self.htsfile = NULL
jpayne@69 740 self.filename = None
jpayne@69 741 self.mode = None
jpayne@69 742 self.threads = 1
jpayne@69 743 self.is_stream = False
jpayne@69 744 self.is_remote = False
jpayne@69 745 self.index = NULL
jpayne@69 746
jpayne@69 747 if "filename" in kwargs:
jpayne@69 748 args = [kwargs["filename"]]
jpayne@69 749 del kwargs["filename"]
jpayne@69 750
jpayne@69 751 self._open(*args, **kwargs)
jpayne@69 752
jpayne@69 753 # allocate memory for iterator
jpayne@69 754 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
jpayne@69 755 if self.b == NULL:
jpayne@69 756 raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t)))
jpayne@69 757
jpayne@69 758 def has_index(self):
jpayne@69 759 """return true if htsfile has an existing (and opened) index.
jpayne@69 760 """
jpayne@69 761 return self.index != NULL
jpayne@69 762
jpayne@69 763 def check_index(self):
jpayne@69 764 """return True if index is present.
jpayne@69 765
jpayne@69 766 Raises
jpayne@69 767 ------
jpayne@69 768
jpayne@69 769 AttributeError
jpayne@69 770 if htsfile is :term:`SAM` formatted and thus has no index.
jpayne@69 771
jpayne@69 772 ValueError
jpayne@69 773 if htsfile is closed or index could not be opened.
jpayne@69 774 """
jpayne@69 775
jpayne@69 776 if not self.is_open:
jpayne@69 777 raise ValueError("I/O operation on closed file")
jpayne@69 778 if not self.is_bam and not self.is_cram:
jpayne@69 779 raise AttributeError(
jpayne@69 780 "AlignmentFile.mapped only available in bam files")
jpayne@69 781 if self.index == NULL:
jpayne@69 782 raise ValueError(
jpayne@69 783 "mapping information not recorded in index "
jpayne@69 784 "or index not available")
jpayne@69 785 return True
jpayne@69 786
jpayne@69 787 def _open(self,
jpayne@69 788 filepath_or_object,
jpayne@69 789 mode=None,
jpayne@69 790 AlignmentFile template=None,
jpayne@69 791 reference_names=None,
jpayne@69 792 reference_lengths=None,
jpayne@69 793 reference_filename=None,
jpayne@69 794 text=None,
jpayne@69 795 header=None,
jpayne@69 796 port=None,
jpayne@69 797 add_sq_text=True,
jpayne@69 798 add_sam_header=True,
jpayne@69 799 check_header=True,
jpayne@69 800 check_sq=True,
jpayne@69 801 index_filename=None,
jpayne@69 802 filepath_index=None,
jpayne@69 803 require_index=False,
jpayne@69 804 referencenames=None,
jpayne@69 805 referencelengths=None,
jpayne@69 806 duplicate_filehandle=True,
jpayne@69 807 ignore_truncation=False,
jpayne@69 808 format_options=None,
jpayne@69 809 threads=1):
jpayne@69 810 '''open a sam, bam or cram formatted file.
jpayne@69 811
jpayne@69 812 If _open is called on an existing file, the current file
jpayne@69 813 will be closed and a new file will be opened.
jpayne@69 814
jpayne@69 815 '''
jpayne@69 816 cdef char *cfilename = NULL
jpayne@69 817 cdef char *creference_filename = NULL
jpayne@69 818 cdef char *cindexname = NULL
jpayne@69 819 cdef char *cmode = NULL
jpayne@69 820 cdef bam_hdr_t * hdr = NULL
jpayne@69 821
jpayne@69 822 if threads > 1 and ignore_truncation:
jpayne@69 823 # This won't raise errors if reaching a truncated alignment,
jpayne@69 824 # because bgzf_mt_reader in htslib does not deal with
jpayne@69 825 # bgzf_mt_read_block returning non-zero values, contrary
jpayne@69 826 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
jpayne@69 827 # Better to avoid this (for now) than to produce seemingly correct results.
jpayne@69 828 raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
jpayne@69 829 self.threads = threads
jpayne@69 830
jpayne@69 831 # for backwards compatibility:
jpayne@69 832 if referencenames is not None:
jpayne@69 833 reference_names = referencenames
jpayne@69 834 if referencelengths is not None:
jpayne@69 835 reference_lengths = referencelengths
jpayne@69 836
jpayne@69 837 # close a previously opened file
jpayne@69 838 if self.is_open:
jpayne@69 839 self.close()
jpayne@69 840
jpayne@69 841 # autodetection for read
jpayne@69 842 if mode is None:
jpayne@69 843 mode = "r"
jpayne@69 844
jpayne@69 845 if add_sam_header and mode == "w":
jpayne@69 846 mode = "wh"
jpayne@69 847
jpayne@69 848 assert mode in ("r", "w", "rb", "wb", "wh",
jpayne@69 849 "wbu", "rU", "wb0",
jpayne@69 850 "rc", "wc"), \
jpayne@69 851 "invalid file opening mode `%s`" % mode
jpayne@69 852
jpayne@69 853 self.duplicate_filehandle = duplicate_filehandle
jpayne@69 854
jpayne@69 855 # StringIO not supported
jpayne@69 856 if isinstance(filepath_or_object, StringIO):
jpayne@69 857 raise NotImplementedError(
jpayne@69 858 "access from StringIO objects not supported")
jpayne@69 859 # reading from a file descriptor
jpayne@69 860 elif isinstance(filepath_or_object, int):
jpayne@69 861 self.filename = filepath_or_object
jpayne@69 862 filename = None
jpayne@69 863 self.is_remote = False
jpayne@69 864 self.is_stream = True
jpayne@69 865 # reading from a File object or other object with fileno
jpayne@69 866 elif hasattr(filepath_or_object, "fileno"):
jpayne@69 867 if filepath_or_object.closed:
jpayne@69 868 raise ValueError('I/O operation on closed file')
jpayne@69 869 self.filename = filepath_or_object
jpayne@69 870 # .name can be TextIOWrapper
jpayne@69 871 try:
jpayne@69 872 filename = encode_filename(str(filepath_or_object.name))
jpayne@69 873 cfilename = filename
jpayne@69 874 except AttributeError:
jpayne@69 875 filename = None
jpayne@69 876 self.is_remote = False
jpayne@69 877 self.is_stream = True
jpayne@69 878 # what remains is a filename
jpayne@69 879 else:
jpayne@69 880 self.filename = filename = encode_filename(filepath_or_object)
jpayne@69 881 cfilename = filename
jpayne@69 882 self.is_remote = hisremote(cfilename)
jpayne@69 883 self.is_stream = self.filename == b'-'
jpayne@69 884
jpayne@69 885 # for htslib, wbu seems to not work
jpayne@69 886 if mode == "wbu":
jpayne@69 887 mode = "wb0"
jpayne@69 888
jpayne@69 889 self.mode = force_bytes(mode)
jpayne@69 890 self.reference_filename = reference_filename = encode_filename(
jpayne@69 891 reference_filename)
jpayne@69 892
jpayne@69 893 if mode[0] == 'w':
jpayne@69 894 # open file for writing
jpayne@69 895
jpayne@69 896 if not (template or header or text or (reference_names and reference_lengths)):
jpayne@69 897 raise ValueError(
jpayne@69 898 "either supply options `template`, `header`, `text` or both `reference_names` "
jpayne@69 899 "and `reference_lengths` for writing")
jpayne@69 900
jpayne@69 901 if template:
jpayne@69 902 # header is copied, though at the moment not strictly
jpayne@69 903 # necessary as AlignmentHeader is immutable.
jpayne@69 904 self.header = template.header.copy()
jpayne@69 905 elif isinstance(header, AlignmentHeader):
jpayne@69 906 self.header = header.copy()
jpayne@69 907 elif isinstance(header, Mapping):
jpayne@69 908 self.header = AlignmentHeader.from_dict(header)
jpayne@69 909 elif reference_names and reference_lengths:
jpayne@69 910 self.header = AlignmentHeader.from_references(
jpayne@69 911 reference_names,
jpayne@69 912 reference_lengths,
jpayne@69 913 add_sq_text=add_sq_text,
jpayne@69 914 text=text)
jpayne@69 915 elif text:
jpayne@69 916 self.header = AlignmentHeader.from_text(text)
jpayne@69 917 else:
jpayne@69 918 raise ValueError("not enough information to construct header. Please provide template, "
jpayne@69 919 "header, text or reference_names/reference_lengths")
jpayne@69 920 self.htsfile = self._open_htsfile()
jpayne@69 921
jpayne@69 922 if self.htsfile == NULL:
jpayne@69 923 if errno:
jpayne@69 924 raise IOError(errno, "could not open alignment file `{}`: {}".format(
jpayne@69 925 force_str(filename),
jpayne@69 926 force_str(strerror(errno))))
jpayne@69 927 else:
jpayne@69 928 raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
jpayne@69 929 if format_options and len(format_options):
jpayne@69 930 self.add_hts_options(format_options)
jpayne@69 931 # set filename with reference sequences. If no filename
jpayne@69 932 # is given, the CRAM reference arrays will be built from
jpayne@69 933 # the @SQ header in the header
jpayne@69 934 if "c" in mode and reference_filename:
jpayne@69 935 if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0):
jpayne@69 936 raise ValueError("failure when setting reference filename")
jpayne@69 937
jpayne@69 938 # write header to htsfile
jpayne@69 939 if "b" in mode or "c" in mode or "h" in mode:
jpayne@69 940 hdr = self.header.ptr
jpayne@69 941 with nogil:
jpayne@69 942 sam_hdr_write(self.htsfile, hdr)
jpayne@69 943
jpayne@69 944 elif mode[0] == "r":
jpayne@69 945 # open file for reading
jpayne@69 946 self.htsfile = self._open_htsfile()
jpayne@69 947
jpayne@69 948 if self.htsfile == NULL:
jpayne@69 949 if errno:
jpayne@69 950 raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
jpayne@69 951 force_str(strerror(errno))))
jpayne@69 952 else:
jpayne@69 953 raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
jpayne@69 954
jpayne@69 955 if self.htsfile.format.category != sequence_data:
jpayne@69 956 raise ValueError("file does not contain alignment data")
jpayne@69 957
jpayne@69 958 if format_options and len(format_options):
jpayne@69 959 self.add_hts_options(format_options)
jpayne@69 960
jpayne@69 961 self.check_truncation(ignore_truncation)
jpayne@69 962
jpayne@69 963 # bam/cram files require a valid header
jpayne@69 964 if self.is_bam or self.is_cram:
jpayne@69 965 with nogil:
jpayne@69 966 hdr = sam_hdr_read(self.htsfile)
jpayne@69 967 if hdr == NULL:
jpayne@69 968 raise ValueError(
jpayne@69 969 "file does not have a valid header (mode='%s') "
jpayne@69 970 "- is it BAM/CRAM format?" % mode)
jpayne@69 971 self.header = makeAlignmentHeader(hdr)
jpayne@69 972 else:
jpayne@69 973 # in sam files a header is optional. If not given,
jpayne@69 974 # user may provide reference names and lengths to built
jpayne@69 975 # an on-the-fly header.
jpayne@69 976 if reference_names and reference_lengths:
jpayne@69 977 # build header from a target names and lengths
jpayne@69 978 self.header = AlignmentHeader.from_references(
jpayne@69 979 reference_names=reference_names,
jpayne@69 980 reference_lengths=reference_lengths,
jpayne@69 981 add_sq_text=add_sq_text,
jpayne@69 982 text=text)
jpayne@69 983 else:
jpayne@69 984 with nogil:
jpayne@69 985 hdr = sam_hdr_read(self.htsfile)
jpayne@69 986 if hdr == NULL:
jpayne@69 987 raise ValueError(
jpayne@69 988 "SAM? file does not have a valid header (mode='%s'), "
jpayne@69 989 "please provide reference_names and reference_lengths")
jpayne@69 990 self.header = makeAlignmentHeader(hdr)
jpayne@69 991
jpayne@69 992 # set filename with reference sequences
jpayne@69 993 if self.is_cram and reference_filename:
jpayne@69 994 creference_filename = self.reference_filename
jpayne@69 995 hts_set_opt(self.htsfile,
jpayne@69 996 CRAM_OPT_REFERENCE,
jpayne@69 997 creference_filename)
jpayne@69 998
jpayne@69 999 if check_sq and self.header.nreferences == 0:
jpayne@69 1000 raise ValueError(
jpayne@69 1001 ("file has no sequences defined (mode='%s') - "
jpayne@69 1002 "is it SAM/BAM format? Consider opening with "
jpayne@69 1003 "check_sq=False") % mode)
jpayne@69 1004
jpayne@69 1005 if self.is_bam or self.is_cram:
jpayne@69 1006 self.index_filename = index_filename or filepath_index
jpayne@69 1007 if self.index_filename:
jpayne@69 1008 cindexname = bfile_name = encode_filename(self.index_filename)
jpayne@69 1009
jpayne@69 1010 if cfilename or cindexname:
jpayne@69 1011 with nogil:
jpayne@69 1012 self.index = sam_index_load3(self.htsfile, cfilename, cindexname,
jpayne@69 1013 HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL)
jpayne@69 1014
jpayne@69 1015 if not self.index and (cindexname or require_index):
jpayne@69 1016 if errno:
jpayne@69 1017 raise IOError(errno, force_str(strerror(errno)))
jpayne@69 1018 else:
jpayne@69 1019 raise IOError('unable to open index file `%s`' % self.index_filename)
jpayne@69 1020
jpayne@69 1021 elif require_index:
jpayne@69 1022 raise IOError('unable to open index file')
jpayne@69 1023
jpayne@69 1024 # save start of data section
jpayne@69 1025 if not self.is_stream:
jpayne@69 1026 self.start_offset = self.tell()
jpayne@69 1027
jpayne@69 1028 def fetch(self,
jpayne@69 1029 contig=None,
jpayne@69 1030 start=None,
jpayne@69 1031 stop=None,
jpayne@69 1032 region=None,
jpayne@69 1033 tid=None,
jpayne@69 1034 until_eof=False,
jpayne@69 1035 multiple_iterators=False,
jpayne@69 1036 reference=None,
jpayne@69 1037 end=None):
jpayne@69 1038 """fetch reads aligned in a :term:`region`.
jpayne@69 1039
jpayne@69 1040 See :meth:`~pysam.HTSFile.parse_region` for more information
jpayne@69 1041 on how genomic regions can be specified. :term:`reference` and
jpayne@69 1042 `end` are also accepted for backward compatibility as synonyms
jpayne@69 1043 for :term:`contig` and `stop`, respectively.
jpayne@69 1044
jpayne@69 1045 Without a `contig` or `region` all mapped reads in the file
jpayne@69 1046 will be fetched. The reads will be returned ordered by reference
jpayne@69 1047 sequence, which will not necessarily be the order within the
jpayne@69 1048 file. This mode of iteration still requires an index. If there is
jpayne@69 1049 no index, use `until_eof=True`.
jpayne@69 1050
jpayne@69 1051 If only `contig` is set, all reads aligned to `contig`
jpayne@69 1052 will be fetched.
jpayne@69 1053
jpayne@69 1054 A :term:`SAM` file does not allow random access. If `region`
jpayne@69 1055 or `contig` are given, an exception is raised.
jpayne@69 1056
jpayne@69 1057 Parameters
jpayne@69 1058 ----------
jpayne@69 1059
jpayne@69 1060 until_eof : bool
jpayne@69 1061
jpayne@69 1062 If `until_eof` is True, all reads from the current file
jpayne@69 1063 position will be returned in order as they are within the
jpayne@69 1064 file. Using this option will also fetch unmapped reads.
jpayne@69 1065
jpayne@69 1066 multiple_iterators : bool
jpayne@69 1067
jpayne@69 1068 If `multiple_iterators` is True, multiple
jpayne@69 1069 iterators on the same file can be used at the same time. The
jpayne@69 1070 iterator returned will receive its own copy of a filehandle to
jpayne@69 1071 the file effectively re-opening the file. Re-opening a file
jpayne@69 1072 creates some overhead, so beware.
jpayne@69 1073
jpayne@69 1074 Returns
jpayne@69 1075 -------
jpayne@69 1076
jpayne@69 1077 An iterator over a collection of reads. : IteratorRow
jpayne@69 1078
jpayne@69 1079 Raises
jpayne@69 1080 ------
jpayne@69 1081
jpayne@69 1082 ValueError
jpayne@69 1083 if the genomic coordinates are out of range or invalid or the
jpayne@69 1084 file does not permit random access to genomic coordinates.
jpayne@69 1085
jpayne@69 1086 """
jpayne@69 1087 cdef int rtid, rstart, rstop, has_coord
jpayne@69 1088
jpayne@69 1089 if not self.is_open:
jpayne@69 1090 raise ValueError( "I/O operation on closed file" )
jpayne@69 1091
jpayne@69 1092 has_coord, rtid, rstart, rstop = self.parse_region(
jpayne@69 1093 contig, start, stop, region, tid,
jpayne@69 1094 end=end, reference=reference)
jpayne@69 1095
jpayne@69 1096 # Turn of re-opening if htsfile is a stream
jpayne@69 1097 if self.is_stream:
jpayne@69 1098 multiple_iterators = False
jpayne@69 1099
jpayne@69 1100 if self.is_bam or self.is_cram:
jpayne@69 1101 if not until_eof and not self.is_remote:
jpayne@69 1102 if not self.has_index():
jpayne@69 1103 raise ValueError(
jpayne@69 1104 "fetch called on bamfile without index")
jpayne@69 1105
jpayne@69 1106 if has_coord:
jpayne@69 1107 return IteratorRowRegion(
jpayne@69 1108 self, rtid, rstart, rstop,
jpayne@69 1109 multiple_iterators=multiple_iterators)
jpayne@69 1110 else:
jpayne@69 1111 if until_eof:
jpayne@69 1112 return IteratorRowAll(
jpayne@69 1113 self,
jpayne@69 1114 multiple_iterators=multiple_iterators)
jpayne@69 1115 else:
jpayne@69 1116 # AH: check - reason why no multiple_iterators for
jpayne@69 1117 # AllRefs?
jpayne@69 1118 return IteratorRowAllRefs(
jpayne@69 1119 self,
jpayne@69 1120 multiple_iterators=multiple_iterators)
jpayne@69 1121 else:
jpayne@69 1122 if has_coord:
jpayne@69 1123 raise ValueError(
jpayne@69 1124 "fetching by region is not available for SAM files")
jpayne@69 1125
jpayne@69 1126 if multiple_iterators == True:
jpayne@69 1127 raise ValueError(
jpayne@69 1128 "multiple iterators not implemented for SAM files")
jpayne@69 1129
jpayne@69 1130 return IteratorRowAll(self,
jpayne@69 1131 multiple_iterators=multiple_iterators)
jpayne@69 1132
jpayne@69 1133 def head(self, n, multiple_iterators=True):
jpayne@69 1134 '''return an iterator over the first n alignments.
jpayne@69 1135
jpayne@69 1136 This iterator is is useful for inspecting the bam-file.
jpayne@69 1137
jpayne@69 1138 Parameters
jpayne@69 1139 ----------
jpayne@69 1140
jpayne@69 1141 multiple_iterators : bool
jpayne@69 1142
jpayne@69 1143 is set to True by default in order to
jpayne@69 1144 avoid changing the current file position.
jpayne@69 1145
jpayne@69 1146 Returns
jpayne@69 1147 -------
jpayne@69 1148
jpayne@69 1149 an iterator over a collection of reads : IteratorRowHead
jpayne@69 1150
jpayne@69 1151 '''
jpayne@69 1152 return IteratorRowHead(self, n,
jpayne@69 1153 multiple_iterators=multiple_iterators)
jpayne@69 1154
jpayne@69 1155 def mate(self, AlignedSegment read):
jpayne@69 1156 '''return the mate of :class:`pysam.AlignedSegment` `read`.
jpayne@69 1157
jpayne@69 1158 .. note::
jpayne@69 1159
jpayne@69 1160 Calling this method will change the file position.
jpayne@69 1161 This might interfere with any iterators that have
jpayne@69 1162 not re-opened the file.
jpayne@69 1163
jpayne@69 1164 .. note::
jpayne@69 1165
jpayne@69 1166 This method is too slow for high-throughput processing.
jpayne@69 1167 If a read needs to be processed with its mate, work
jpayne@69 1168 from a read name sorted file or, better, cache reads.
jpayne@69 1169
jpayne@69 1170 Returns
jpayne@69 1171 -------
jpayne@69 1172
jpayne@69 1173 the mate : AlignedSegment
jpayne@69 1174
jpayne@69 1175 Raises
jpayne@69 1176 ------
jpayne@69 1177
jpayne@69 1178 ValueError
jpayne@69 1179 if the read is unpaired or the mate is unmapped
jpayne@69 1180
jpayne@69 1181 '''
jpayne@69 1182 cdef uint32_t flag = read._delegate.core.flag
jpayne@69 1183
jpayne@69 1184 if flag & BAM_FPAIRED == 0:
jpayne@69 1185 raise ValueError("read %s: is unpaired" %
jpayne@69 1186 (read.query_name))
jpayne@69 1187 if flag & BAM_FMUNMAP != 0:
jpayne@69 1188 raise ValueError("mate %s: is unmapped" %
jpayne@69 1189 (read.query_name))
jpayne@69 1190
jpayne@69 1191 # xor flags to get the other mate
jpayne@69 1192 cdef int x = BAM_FREAD1 + BAM_FREAD2
jpayne@69 1193 flag = (flag ^ x) & x
jpayne@69 1194
jpayne@69 1195 # Make sure to use a separate file to jump around
jpayne@69 1196 # to mate as otherwise the original file position
jpayne@69 1197 # will be lost
jpayne@69 1198 # The following code is not using the C API and
jpayne@69 1199 # could thus be made much quicker, for example
jpayne@69 1200 # by using tell and seek.
jpayne@69 1201 for mate in self.fetch(
jpayne@69 1202 read._delegate.core.mpos,
jpayne@69 1203 read._delegate.core.mpos + 1,
jpayne@69 1204 tid=read._delegate.core.mtid,
jpayne@69 1205 multiple_iterators=True):
jpayne@69 1206 if mate.flag & flag != 0 and \
jpayne@69 1207 mate.query_name == read.query_name:
jpayne@69 1208 break
jpayne@69 1209 else:
jpayne@69 1210 raise ValueError("mate not found")
jpayne@69 1211
jpayne@69 1212 return mate
jpayne@69 1213
jpayne@69 1214 def pileup(self,
jpayne@69 1215 contig=None,
jpayne@69 1216 start=None,
jpayne@69 1217 stop=None,
jpayne@69 1218 region=None,
jpayne@69 1219 reference=None,
jpayne@69 1220 end=None,
jpayne@69 1221 **kwargs):
jpayne@69 1222 """perform a :term:`pileup` within a :term:`region`. The region is
jpayne@69 1223 specified by :term:`contig`, `start` and `stop` (using
jpayne@69 1224 0-based indexing). :term:`reference` and `end` are also accepted for
jpayne@69 1225 backward compatibility as synonyms for :term:`contig` and `stop`,
jpayne@69 1226 respectively. Alternatively, a samtools 'region' string
jpayne@69 1227 can be supplied.
jpayne@69 1228
jpayne@69 1229 Without 'contig' or 'region' all reads will be used for the
jpayne@69 1230 pileup. The reads will be returned ordered by
jpayne@69 1231 :term:`contig` sequence, which will not necessarily be the
jpayne@69 1232 order within the file.
jpayne@69 1233
jpayne@69 1234 Note that :term:`SAM` formatted files do not allow random
jpayne@69 1235 access. In these files, if a 'region' or 'contig' are
jpayne@69 1236 given an exception is raised.
jpayne@69 1237
jpayne@69 1238 .. note::
jpayne@69 1239
jpayne@69 1240 'all' reads which overlap the region are returned. The
jpayne@69 1241 first base returned will be the first base of the first
jpayne@69 1242 read 'not' necessarily the first base of the region used
jpayne@69 1243 in the query.
jpayne@69 1244
jpayne@69 1245 Parameters
jpayne@69 1246 ----------
jpayne@69 1247
jpayne@69 1248 truncate : bool
jpayne@69 1249
jpayne@69 1250 By default, the samtools pileup engine outputs all reads
jpayne@69 1251 overlapping a region. If truncate is True and a region is
jpayne@69 1252 given, only columns in the exact region specified are
jpayne@69 1253 returned.
jpayne@69 1254
jpayne@69 1255 max_depth : int
jpayne@69 1256 Maximum read depth permitted. The default limit is '8000'.
jpayne@69 1257
jpayne@69 1258 stepper : string
jpayne@69 1259 The stepper controls how the iterator advances.
jpayne@69 1260 Possible options for the stepper are
jpayne@69 1261
jpayne@69 1262 ``all``
jpayne@69 1263 skip reads in which any of the following flags are set:
jpayne@69 1264 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
jpayne@69 1265
jpayne@69 1266 ``nofilter``
jpayne@69 1267 uses every single read turning off any filtering.
jpayne@69 1268
jpayne@69 1269 ``samtools``
jpayne@69 1270 same filter and read processing as in samtools
jpayne@69 1271 pileup. For full compatibility, this requires a
jpayne@69 1272 'fastafile' to be given. The following options all pertain
jpayne@69 1273 to filtering of the ``samtools`` stepper.
jpayne@69 1274
jpayne@69 1275 fastafile : :class:`~pysam.FastaFile` object.
jpayne@69 1276
jpayne@69 1277 This is required for some of the steppers.
jpayne@69 1278
jpayne@69 1279 ignore_overlaps: bool
jpayne@69 1280
jpayne@69 1281 If set to True, detect if read pairs overlap and only take
jpayne@69 1282 the higher quality base. This is the default.
jpayne@69 1283
jpayne@69 1284 flag_filter : int
jpayne@69 1285
jpayne@69 1286 ignore reads where any of the bits in the flag are set. The default is
jpayne@69 1287 BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP.
jpayne@69 1288
jpayne@69 1289 flag_require : int
jpayne@69 1290
jpayne@69 1291 only use reads where certain flags are set. The default is 0.
jpayne@69 1292
jpayne@69 1293 ignore_orphans: bool
jpayne@69 1294
jpayne@69 1295 ignore orphans (paired reads that are not in a proper pair).
jpayne@69 1296 The default is to ignore orphans.
jpayne@69 1297
jpayne@69 1298 min_base_quality: int
jpayne@69 1299
jpayne@69 1300 Minimum base quality. Bases below the minimum quality will
jpayne@69 1301 not be output. The default is 13.
jpayne@69 1302
jpayne@69 1303 adjust_capq_threshold: int
jpayne@69 1304
jpayne@69 1305 adjust mapping quality. The default is 0 for no
jpayne@69 1306 adjustment. The recommended value for adjustment is 50.
jpayne@69 1307
jpayne@69 1308 min_mapping_quality : int
jpayne@69 1309
jpayne@69 1310 only use reads above a minimum mapping quality. The default is 0.
jpayne@69 1311
jpayne@69 1312 compute_baq: bool
jpayne@69 1313
jpayne@69 1314 re-alignment computing per-Base Alignment Qualities (BAQ). The
jpayne@69 1315 default is to do re-alignment. Realignment requires a reference
jpayne@69 1316 sequence. If none is present, no realignment will be performed.
jpayne@69 1317
jpayne@69 1318 redo_baq: bool
jpayne@69 1319
jpayne@69 1320 recompute per-Base Alignment Quality on the fly ignoring
jpayne@69 1321 existing base qualities. The default is False (use existing
jpayne@69 1322 base qualities).
jpayne@69 1323
jpayne@69 1324 Returns
jpayne@69 1325 -------
jpayne@69 1326
jpayne@69 1327 an iterator over genomic positions. : IteratorColumn
jpayne@69 1328
jpayne@69 1329 """
jpayne@69 1330 cdef int rtid, has_coord
jpayne@69 1331 cdef int32_t rstart, rstop
jpayne@69 1332
jpayne@69 1333 if not self.is_open:
jpayne@69 1334 raise ValueError("I/O operation on closed file")
jpayne@69 1335
jpayne@69 1336 has_coord, rtid, rstart, rstop = self.parse_region(
jpayne@69 1337 contig, start, stop, region, reference=reference, end=end)
jpayne@69 1338
jpayne@69 1339 if has_coord:
jpayne@69 1340 if not self.has_index():
jpayne@69 1341 raise ValueError("no index available for pileup")
jpayne@69 1342
jpayne@69 1343 return IteratorColumnRegion(self,
jpayne@69 1344 tid=rtid,
jpayne@69 1345 start=rstart,
jpayne@69 1346 stop=rstop,
jpayne@69 1347 **kwargs)
jpayne@69 1348 else:
jpayne@69 1349 if self.has_index():
jpayne@69 1350 return IteratorColumnAllRefs(self, **kwargs)
jpayne@69 1351 else:
jpayne@69 1352 return IteratorColumnAll(self, **kwargs)
jpayne@69 1353
jpayne@69 1354 def count(self,
jpayne@69 1355 contig=None,
jpayne@69 1356 start=None,
jpayne@69 1357 stop=None,
jpayne@69 1358 region=None,
jpayne@69 1359 until_eof=False,
jpayne@69 1360 read_callback="nofilter",
jpayne@69 1361 reference=None,
jpayne@69 1362 end=None):
jpayne@69 1363 '''count the number of reads in :term:`region`
jpayne@69 1364
jpayne@69 1365 The region is specified by :term:`contig`, `start` and `stop`.
jpayne@69 1366 :term:`reference` and `end` are also accepted for backward
jpayne@69 1367 compatibility as synonyms for :term:`contig` and `stop`,
jpayne@69 1368 respectively. Alternatively, a `samtools`_ :term:`region`
jpayne@69 1369 string can be supplied.
jpayne@69 1370
jpayne@69 1371 A :term:`SAM` file does not allow random access and if
jpayne@69 1372 `region` or `contig` are given, an exception is raised.
jpayne@69 1373
jpayne@69 1374 Parameters
jpayne@69 1375 ----------
jpayne@69 1376
jpayne@69 1377 contig : string
jpayne@69 1378 reference_name of the genomic region (chromosome)
jpayne@69 1379
jpayne@69 1380 start : int
jpayne@69 1381 start of the genomic region (0-based inclusive)
jpayne@69 1382
jpayne@69 1383 stop : int
jpayne@69 1384 end of the genomic region (0-based exclusive)
jpayne@69 1385
jpayne@69 1386 region : string
jpayne@69 1387 a region string in samtools format.
jpayne@69 1388
jpayne@69 1389 until_eof : bool
jpayne@69 1390 count until the end of the file, possibly including
jpayne@69 1391 unmapped reads as well.
jpayne@69 1392
jpayne@69 1393 read_callback: string or function
jpayne@69 1394
jpayne@69 1395 select a call-back to ignore reads when counting. It can
jpayne@69 1396 be either a string with the following values:
jpayne@69 1397
jpayne@69 1398 ``all``
jpayne@69 1399 skip reads in which any of the following
jpayne@69 1400 flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
jpayne@69 1401 BAM_FDUP
jpayne@69 1402
jpayne@69 1403 ``nofilter``
jpayne@69 1404 uses every single read
jpayne@69 1405
jpayne@69 1406 Alternatively, `read_callback` can be a function
jpayne@69 1407 ``check_read(read)`` that should return True only for
jpayne@69 1408 those reads that shall be included in the counting.
jpayne@69 1409
jpayne@69 1410 reference : string
jpayne@69 1411 backward compatible synonym for `contig`
jpayne@69 1412
jpayne@69 1413 end : int
jpayne@69 1414 backward compatible synonym for `stop`
jpayne@69 1415
jpayne@69 1416 Raises
jpayne@69 1417 ------
jpayne@69 1418
jpayne@69 1419 ValueError
jpayne@69 1420 if the genomic coordinates are out of range or invalid.
jpayne@69 1421
jpayne@69 1422 '''
jpayne@69 1423 cdef AlignedSegment read
jpayne@69 1424 cdef long counter = 0
jpayne@69 1425
jpayne@69 1426 if not self.is_open:
jpayne@69 1427 raise ValueError("I/O operation on closed file")
jpayne@69 1428
jpayne@69 1429 cdef int filter_method = 0
jpayne@69 1430 if read_callback == "all":
jpayne@69 1431 filter_method = 1
jpayne@69 1432 elif read_callback == "nofilter":
jpayne@69 1433 filter_method = 2
jpayne@69 1434
jpayne@69 1435 for read in self.fetch(contig=contig,
jpayne@69 1436 start=start,
jpayne@69 1437 stop=stop,
jpayne@69 1438 reference=reference,
jpayne@69 1439 end=end,
jpayne@69 1440 region=region,
jpayne@69 1441 until_eof=until_eof):
jpayne@69 1442 # apply filter
jpayne@69 1443 if filter_method == 1:
jpayne@69 1444 # filter = "all"
jpayne@69 1445 if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
jpayne@69 1446 continue
jpayne@69 1447 elif filter_method == 2:
jpayne@69 1448 # filter = "nofilter"
jpayne@69 1449 pass
jpayne@69 1450 else:
jpayne@69 1451 if not read_callback(read):
jpayne@69 1452 continue
jpayne@69 1453 counter += 1
jpayne@69 1454
jpayne@69 1455 return counter
jpayne@69 1456
jpayne@69 1457 @cython.boundscheck(False) # we do manual bounds checking
jpayne@69 1458 def count_coverage(self,
jpayne@69 1459 contig,
jpayne@69 1460 start=None,
jpayne@69 1461 stop=None,
jpayne@69 1462 region=None,
jpayne@69 1463 quality_threshold=15,
jpayne@69 1464 read_callback='all',
jpayne@69 1465 reference=None,
jpayne@69 1466 end=None):
jpayne@69 1467 """count the coverage of genomic positions by reads in :term:`region`.
jpayne@69 1468
jpayne@69 1469 The region is specified by :term:`contig`, `start` and `stop`.
jpayne@69 1470 :term:`reference` and `end` are also accepted for backward
jpayne@69 1471 compatibility as synonyms for :term:`contig` and `stop`,
jpayne@69 1472 respectively. Alternatively, a `samtools`_ :term:`region`
jpayne@69 1473 string can be supplied. The coverage is computed per-base [ACGT].
jpayne@69 1474
jpayne@69 1475 Parameters
jpayne@69 1476 ----------
jpayne@69 1477
jpayne@69 1478 contig : string
jpayne@69 1479 reference_name of the genomic region (chromosome)
jpayne@69 1480
jpayne@69 1481 start : int
jpayne@69 1482 start of the genomic region (0-based inclusive). If not
jpayne@69 1483 given, count from the start of the chromosome.
jpayne@69 1484
jpayne@69 1485 stop : int
jpayne@69 1486 end of the genomic region (0-based exclusive). If not given,
jpayne@69 1487 count to the end of the chromosome.
jpayne@69 1488
jpayne@69 1489 region : string
jpayne@69 1490 a region string.
jpayne@69 1491
jpayne@69 1492 quality_threshold : int
jpayne@69 1493 quality_threshold is the minimum quality score (in phred) a
jpayne@69 1494 base has to reach to be counted.
jpayne@69 1495
jpayne@69 1496 read_callback: string or function
jpayne@69 1497
jpayne@69 1498 select a call-back to ignore reads when counting. It can
jpayne@69 1499 be either a string with the following values:
jpayne@69 1500
jpayne@69 1501 ``all``
jpayne@69 1502 skip reads in which any of the following
jpayne@69 1503 flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
jpayne@69 1504 BAM_FDUP
jpayne@69 1505
jpayne@69 1506 ``nofilter``
jpayne@69 1507 uses every single read
jpayne@69 1508
jpayne@69 1509 Alternatively, `read_callback` can be a function
jpayne@69 1510 ``check_read(read)`` that should return True only for
jpayne@69 1511 those reads that shall be included in the counting.
jpayne@69 1512
jpayne@69 1513 reference : string
jpayne@69 1514 backward compatible synonym for `contig`
jpayne@69 1515
jpayne@69 1516 end : int
jpayne@69 1517 backward compatible synonym for `stop`
jpayne@69 1518
jpayne@69 1519 Raises
jpayne@69 1520 ------
jpayne@69 1521
jpayne@69 1522 ValueError
jpayne@69 1523 if the genomic coordinates are out of range or invalid.
jpayne@69 1524
jpayne@69 1525 Returns
jpayne@69 1526 -------
jpayne@69 1527
jpayne@69 1528 four array.arrays of the same length in order A C G T : tuple
jpayne@69 1529
jpayne@69 1530 """
jpayne@69 1531
jpayne@69 1532 cdef uint32_t contig_length = self.get_reference_length(contig)
jpayne@69 1533 cdef int _start = start if start is not None else 0
jpayne@69 1534 cdef int _stop = stop if stop is not None else contig_length
jpayne@69 1535 _stop = _stop if _stop < contig_length else contig_length
jpayne@69 1536
jpayne@69 1537 if _stop == _start:
jpayne@69 1538 raise ValueError("interval of size 0")
jpayne@69 1539 if _stop < _start:
jpayne@69 1540 raise ValueError("interval of size less than 0")
jpayne@69 1541
jpayne@69 1542 cdef int length = _stop - _start
jpayne@69 1543 cdef c_array.array int_array_template = array.array('L', [])
jpayne@69 1544 cdef c_array.array count_a
jpayne@69 1545 cdef c_array.array count_c
jpayne@69 1546 cdef c_array.array count_g
jpayne@69 1547 cdef c_array.array count_t
jpayne@69 1548 count_a = c_array.clone(int_array_template, length, zero=True)
jpayne@69 1549 count_c = c_array.clone(int_array_template, length, zero=True)
jpayne@69 1550 count_g = c_array.clone(int_array_template, length, zero=True)
jpayne@69 1551 count_t = c_array.clone(int_array_template, length, zero=True)
jpayne@69 1552
jpayne@69 1553 cdef AlignedSegment read
jpayne@69 1554 cdef cython.str seq
jpayne@69 1555 cdef c_array.array quality
jpayne@69 1556 cdef int qpos
jpayne@69 1557 cdef int refpos
jpayne@69 1558 cdef int c = 0
jpayne@69 1559 cdef int filter_method = 0
jpayne@69 1560
jpayne@69 1561
jpayne@69 1562 if read_callback == "all":
jpayne@69 1563 filter_method = 1
jpayne@69 1564 elif read_callback == "nofilter":
jpayne@69 1565 filter_method = 2
jpayne@69 1566
jpayne@69 1567 cdef int _threshold = quality_threshold or 0
jpayne@69 1568 for read in self.fetch(contig=contig,
jpayne@69 1569 reference=reference,
jpayne@69 1570 start=start,
jpayne@69 1571 stop=stop,
jpayne@69 1572 end=end,
jpayne@69 1573 region=region):
jpayne@69 1574 # apply filter
jpayne@69 1575 if filter_method == 1:
jpayne@69 1576 # filter = "all"
jpayne@69 1577 if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
jpayne@69 1578 continue
jpayne@69 1579 elif filter_method == 2:
jpayne@69 1580 # filter = "nofilter"
jpayne@69 1581 pass
jpayne@69 1582 else:
jpayne@69 1583 if not read_callback(read):
jpayne@69 1584 continue
jpayne@69 1585
jpayne@69 1586 # count
jpayne@69 1587 seq = read.seq
jpayne@69 1588 if seq is None:
jpayne@69 1589 continue
jpayne@69 1590 quality = read.query_qualities
jpayne@69 1591
jpayne@69 1592 for qpos, refpos in read.get_aligned_pairs(True):
jpayne@69 1593 if qpos is not None and refpos is not None and \
jpayne@69 1594 _start <= refpos < _stop:
jpayne@69 1595
jpayne@69 1596 # only check base quality if _threshold > 0
jpayne@69 1597 if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
jpayne@69 1598 if seq[qpos] == 'A':
jpayne@69 1599 count_a.data.as_ulongs[refpos - _start] += 1
jpayne@69 1600 if seq[qpos] == 'C':
jpayne@69 1601 count_c.data.as_ulongs[refpos - _start] += 1
jpayne@69 1602 if seq[qpos] == 'G':
jpayne@69 1603 count_g.data.as_ulongs[refpos - _start] += 1
jpayne@69 1604 if seq[qpos] == 'T':
jpayne@69 1605 count_t.data.as_ulongs[refpos - _start] += 1
jpayne@69 1606
jpayne@69 1607 return count_a, count_c, count_g, count_t
jpayne@69 1608
jpayne@69 1609 def find_introns_slow(self, read_iterator):
jpayne@69 1610 """Return a dictionary {(start, stop): count}
jpayne@69 1611 Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
jpayne@69 1612 and their support ( = number of reads ).
jpayne@69 1613
jpayne@69 1614 read_iterator can be the result of a .fetch(...) call.
jpayne@69 1615 Or it can be a generator filtering such reads. Example
jpayne@69 1616 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
jpayne@69 1617 """
jpayne@69 1618 res = collections.Counter()
jpayne@69 1619 for r in read_iterator:
jpayne@69 1620 if 'N' in r.cigarstring:
jpayne@69 1621 last_read_pos = False
jpayne@69 1622 for read_loc, genome_loc in r.get_aligned_pairs():
jpayne@69 1623 if read_loc is None and last_read_pos:
jpayne@69 1624 start = genome_loc
jpayne@69 1625 elif read_loc and last_read_pos is None:
jpayne@69 1626 stop = genome_loc # we are right exclusive ,so this is correct
jpayne@69 1627 res[(start, stop)] += 1
jpayne@69 1628 del start
jpayne@69 1629 del stop
jpayne@69 1630 last_read_pos = read_loc
jpayne@69 1631 return res
jpayne@69 1632
jpayne@69 1633 def find_introns(self, read_iterator):
jpayne@69 1634 """Return a dictionary {(start, stop): count}
jpayne@69 1635 Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
jpayne@69 1636 and their support ( = number of reads ).
jpayne@69 1637
jpayne@69 1638 read_iterator can be the result of a .fetch(...) call.
jpayne@69 1639 Or it can be a generator filtering such reads. Example
jpayne@69 1640 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
jpayne@69 1641 """
jpayne@69 1642 cdef:
jpayne@69 1643 uint32_t base_position, junc_start, nt
jpayne@69 1644 int op
jpayne@69 1645 AlignedSegment r
jpayne@69 1646 int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP
jpayne@69 1647
jpayne@69 1648 res = collections.Counter()
jpayne@69 1649
jpayne@69 1650 match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
jpayne@69 1651 for r in read_iterator:
jpayne@69 1652 base_position = r.pos
jpayne@69 1653 cigar = r.cigartuples
jpayne@69 1654 if cigar is None:
jpayne@69 1655 continue
jpayne@69 1656
jpayne@69 1657 for op, nt in cigar:
jpayne@69 1658 if op in match_or_deletion:
jpayne@69 1659 base_position += nt
jpayne@69 1660 elif op == BAM_CREF_SKIP:
jpayne@69 1661 junc_start = base_position
jpayne@69 1662 base_position += nt
jpayne@69 1663 res[(junc_start, base_position)] += 1
jpayne@69 1664 return res
jpayne@69 1665
jpayne@69 1666
jpayne@69 1667 def close(self):
jpayne@69 1668 '''closes the :class:`pysam.AlignmentFile`.'''
jpayne@69 1669
jpayne@69 1670 if self.htsfile == NULL:
jpayne@69 1671 return
jpayne@69 1672
jpayne@69 1673 if self.index != NULL:
jpayne@69 1674 hts_idx_destroy(self.index)
jpayne@69 1675 self.index = NULL
jpayne@69 1676
jpayne@69 1677 cdef int ret = hts_close(self.htsfile)
jpayne@69 1678 self.htsfile = NULL
jpayne@69 1679
jpayne@69 1680 self.header = None
jpayne@69 1681
jpayne@69 1682 if ret < 0:
jpayne@69 1683 global errno
jpayne@69 1684 if errno == EPIPE:
jpayne@69 1685 errno = 0
jpayne@69 1686 else:
jpayne@69 1687 raise IOError(errno, force_str(strerror(errno)))
jpayne@69 1688
jpayne@69 1689 def __dealloc__(self):
jpayne@69 1690 cdef int ret = 0
jpayne@69 1691
jpayne@69 1692 if self.index != NULL:
jpayne@69 1693 hts_idx_destroy(self.index)
jpayne@69 1694 self.index = NULL
jpayne@69 1695
jpayne@69 1696 if self.htsfile != NULL:
jpayne@69 1697 ret = hts_close(self.htsfile)
jpayne@69 1698 self.htsfile = NULL
jpayne@69 1699
jpayne@69 1700 self.header = None
jpayne@69 1701
jpayne@69 1702 if self.b:
jpayne@69 1703 bam_destroy1(self.b)
jpayne@69 1704 self.b = NULL
jpayne@69 1705
jpayne@69 1706 if ret < 0:
jpayne@69 1707 global errno
jpayne@69 1708 if errno == EPIPE:
jpayne@69 1709 errno = 0
jpayne@69 1710 else:
jpayne@69 1711 raise IOError(errno, force_str(strerror(errno)))
jpayne@69 1712
jpayne@69 1713 cpdef int write(self, AlignedSegment read) except -1:
jpayne@69 1714 '''
jpayne@69 1715 write a single :class:`pysam.AlignedSegment` to disk.
jpayne@69 1716
jpayne@69 1717 Raises:
jpayne@69 1718 ValueError
jpayne@69 1719 if the writing failed
jpayne@69 1720
jpayne@69 1721 Returns:
jpayne@69 1722 int :
jpayne@69 1723 the number of bytes written. If the file is closed,
jpayne@69 1724 this will be 0.
jpayne@69 1725 '''
jpayne@69 1726 if not self.is_open:
jpayne@69 1727 return 0
jpayne@69 1728
jpayne@69 1729 if self.header.ptr.n_targets <= read._delegate.core.tid:
jpayne@69 1730 raise ValueError(
jpayne@69 1731 "AlignedSegment refers to reference number {} that "
jpayne@69 1732 "is larger than the number of references ({}) in the header".format(
jpayne@69 1733 read._delegate.core.tid, self.header.ptr.n_targets))
jpayne@69 1734
jpayne@69 1735 cdef int ret
jpayne@69 1736 with nogil:
jpayne@69 1737 ret = sam_write1(self.htsfile,
jpayne@69 1738 self.header.ptr,
jpayne@69 1739 read._delegate)
jpayne@69 1740
jpayne@69 1741 # kbj: Still need to raise an exception with except -1. Otherwise
jpayne@69 1742 # when ret == -1 we get a "SystemError: error return without
jpayne@69 1743 # exception set".
jpayne@69 1744 if ret < 0:
jpayne@69 1745 raise IOError(
jpayne@69 1746 "sam_write1 failed with error code {}".format(ret))
jpayne@69 1747
jpayne@69 1748 return ret
jpayne@69 1749
jpayne@69 1750 # context manager interface
jpayne@69 1751 def __enter__(self):
jpayne@69 1752 return self
jpayne@69 1753
jpayne@69 1754 def __exit__(self, exc_type, exc_value, traceback):
jpayne@69 1755 self.close()
jpayne@69 1756 return False
jpayne@69 1757
jpayne@69 1758 ###############################################################
jpayne@69 1759 ###############################################################
jpayne@69 1760 ###############################################################
jpayne@69 1761 ## properties
jpayne@69 1762 ###############################################################
jpayne@69 1763 property mapped:
jpayne@69 1764 """int with total number of mapped alignments according to the
jpayne@69 1765 statistics recorded in the index. This is a read-only
jpayne@69 1766 attribute.
jpayne@69 1767 (This will be 0 for a CRAM file indexed by a .crai index, as that
jpayne@69 1768 index format does not record these statistics.)
jpayne@69 1769 """
jpayne@69 1770 def __get__(self):
jpayne@69 1771 self.check_index()
jpayne@69 1772 cdef int tid
jpayne@69 1773 cdef uint64_t total = 0
jpayne@69 1774 cdef uint64_t mapped, unmapped
jpayne@69 1775 for tid from 0 <= tid < self.header.nreferences:
jpayne@69 1776 with nogil:
jpayne@69 1777 hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
jpayne@69 1778 total += mapped
jpayne@69 1779 return total
jpayne@69 1780
jpayne@69 1781 property unmapped:
jpayne@69 1782 """int with total number of unmapped reads according to the statistics
jpayne@69 1783 recorded in the index. This number of reads includes the number of reads
jpayne@69 1784 without coordinates. This is a read-only attribute.
jpayne@69 1785 (This will be 0 for a CRAM file indexed by a .crai index, as that
jpayne@69 1786 index format does not record these statistics.)
jpayne@69 1787 """
jpayne@69 1788 def __get__(self):
jpayne@69 1789 self.check_index()
jpayne@69 1790 cdef int tid
jpayne@69 1791 cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
jpayne@69 1792 cdef uint64_t mapped, unmapped
jpayne@69 1793 for tid from 0 <= tid < self.header.nreferences:
jpayne@69 1794 with nogil:
jpayne@69 1795 hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
jpayne@69 1796 total += unmapped
jpayne@69 1797 return total
jpayne@69 1798
jpayne@69 1799 property nocoordinate:
jpayne@69 1800 """int with total number of reads without coordinates according to the
jpayne@69 1801 statistics recorded in the index, i.e., the statistic printed for "*"
jpayne@69 1802 by the ``samtools idxstats`` command. This is a read-only attribute.
jpayne@69 1803 (This will be 0 for a CRAM file indexed by a .crai index, as that
jpayne@69 1804 index format does not record these statistics.)
jpayne@69 1805 """
jpayne@69 1806 def __get__(self):
jpayne@69 1807 self.check_index()
jpayne@69 1808 cdef uint64_t n
jpayne@69 1809 with nogil:
jpayne@69 1810 n = hts_idx_get_n_no_coor(self.index)
jpayne@69 1811 return n
jpayne@69 1812
jpayne@69 1813 def get_index_statistics(self):
jpayne@69 1814 """return statistics about mapped/unmapped reads per chromosome as
jpayne@69 1815 they are stored in the index, similarly to the statistics printed
jpayne@69 1816 by the ``samtools idxstats`` command.
jpayne@69 1817
jpayne@69 1818 CRAI indexes do not record these statistics, so for a CRAM file
jpayne@69 1819 with a .crai index the returned statistics will all be 0.
jpayne@69 1820
jpayne@69 1821 Returns:
jpayne@69 1822 list :
jpayne@69 1823 a list of records for each chromosome. Each record has the
jpayne@69 1824 attributes 'contig', 'mapped', 'unmapped' and 'total'.
jpayne@69 1825 """
jpayne@69 1826
jpayne@69 1827 self.check_index()
jpayne@69 1828 cdef int tid
jpayne@69 1829 cdef uint64_t mapped, unmapped
jpayne@69 1830 results = []
jpayne@69 1831 # TODO: use header
jpayne@69 1832 for tid from 0 <= tid < self.nreferences:
jpayne@69 1833 with nogil:
jpayne@69 1834 hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
jpayne@69 1835 results.append(
jpayne@69 1836 IndexStats._make((
jpayne@69 1837 self.get_reference_name(tid),
jpayne@69 1838 mapped,
jpayne@69 1839 unmapped,
jpayne@69 1840 mapped + unmapped)))
jpayne@69 1841
jpayne@69 1842 return results
jpayne@69 1843
jpayne@69 1844 ###############################################################
jpayne@69 1845 ## file-object like iterator access
jpayne@69 1846 ## note: concurrent access will cause errors (see IteratorRow
jpayne@69 1847 ## and multiple_iterators)
jpayne@69 1848 ## Possible solutions: deprecate or open new file handle
jpayne@69 1849 def __iter__(self):
jpayne@69 1850 if not self.is_open:
jpayne@69 1851 raise ValueError("I/O operation on closed file")
jpayne@69 1852
jpayne@69 1853 if not self.is_bam and self.header.nreferences == 0:
jpayne@69 1854 raise NotImplementedError(
jpayne@69 1855 "can not iterate over samfile without header")
jpayne@69 1856 return self
jpayne@69 1857
jpayne@69 1858 cdef bam1_t * getCurrent(self):
jpayne@69 1859 return self.b
jpayne@69 1860
jpayne@69 1861 cdef int cnext(self):
jpayne@69 1862 '''
jpayne@69 1863 cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
jpayne@69 1864 '''
jpayne@69 1865 cdef int ret
jpayne@69 1866 cdef bam_hdr_t * hdr = self.header.ptr
jpayne@69 1867 with nogil:
jpayne@69 1868 ret = sam_read1(self.htsfile,
jpayne@69 1869 hdr,
jpayne@69 1870 self.b)
jpayne@69 1871 return ret
jpayne@69 1872
jpayne@69 1873 def __next__(self):
jpayne@69 1874 cdef int ret = self.cnext()
jpayne@69 1875 if ret >= 0:
jpayne@69 1876 return makeAlignedSegment(self.b, self.header)
jpayne@69 1877 elif ret == -1:
jpayne@69 1878 raise StopIteration
jpayne@69 1879 else:
jpayne@69 1880 raise IOError(read_failure_reason(ret))
jpayne@69 1881
jpayne@69 1882 ###########################################
jpayne@69 1883 # methods/properties referencing the header
jpayne@69 1884 def is_valid_tid(self, int tid):
jpayne@69 1885 """
jpayne@69 1886 return True if the numerical :term:`tid` is valid; False otherwise.
jpayne@69 1887
jpayne@69 1888 Note that the unmapped tid code (-1) counts as an invalid.
jpayne@69 1889 """
jpayne@69 1890 if self.header is None:
jpayne@69 1891 raise ValueError("header not available in closed files")
jpayne@69 1892 return self.header.is_valid_tid(tid)
jpayne@69 1893
jpayne@69 1894 def get_tid(self, reference):
jpayne@69 1895 """
jpayne@69 1896 return the numerical :term:`tid` corresponding to
jpayne@69 1897 :term:`reference`
jpayne@69 1898
jpayne@69 1899 returns -1 if reference is not known.
jpayne@69 1900 """
jpayne@69 1901 if self.header is None:
jpayne@69 1902 raise ValueError("header not available in closed files")
jpayne@69 1903 return self.header.get_tid(reference)
jpayne@69 1904
jpayne@69 1905 def get_reference_name(self, tid):
jpayne@69 1906 """
jpayne@69 1907 return :term:`reference` name corresponding to numerical :term:`tid`
jpayne@69 1908 """
jpayne@69 1909 if self.header is None:
jpayne@69 1910 raise ValueError("header not available in closed files")
jpayne@69 1911 return self.header.get_reference_name(tid)
jpayne@69 1912
jpayne@69 1913 def get_reference_length(self, reference):
jpayne@69 1914 """
jpayne@69 1915 return :term:`reference` length corresponding to numerical :term:`tid`
jpayne@69 1916 """
jpayne@69 1917 if self.header is None:
jpayne@69 1918 raise ValueError("header not available in closed files")
jpayne@69 1919 return self.header.get_reference_length(reference)
jpayne@69 1920
jpayne@69 1921 property nreferences:
jpayne@69 1922 """int with the number of :term:`reference` sequences in the file.
jpayne@69 1923 This is a read-only attribute."""
jpayne@69 1924 def __get__(self):
jpayne@69 1925 if self.header:
jpayne@69 1926 return self.header.nreferences
jpayne@69 1927 else:
jpayne@69 1928 raise ValueError("header not available in closed files")
jpayne@69 1929
jpayne@69 1930 property references:
jpayne@69 1931 """tuple with the names of :term:`reference` sequences. This is a
jpayne@69 1932 read-only attribute"""
jpayne@69 1933 def __get__(self):
jpayne@69 1934 if self.header:
jpayne@69 1935 return self.header.references
jpayne@69 1936 else:
jpayne@69 1937 raise ValueError("header not available in closed files")
jpayne@69 1938
jpayne@69 1939 property lengths:
jpayne@69 1940 """tuple of the lengths of the :term:`reference` sequences. This is a
jpayne@69 1941 read-only attribute. The lengths are in the same order as
jpayne@69 1942 :attr:`pysam.AlignmentFile.references`
jpayne@69 1943
jpayne@69 1944 """
jpayne@69 1945 def __get__(self):
jpayne@69 1946 if self.header:
jpayne@69 1947 return self.header.lengths
jpayne@69 1948 else:
jpayne@69 1949 raise ValueError("header not available in closed files")
jpayne@69 1950
jpayne@69 1951 # Compatibility functions for pysam < 0.14
jpayne@69 1952 property text:
jpayne@69 1953 """deprecated, use :attr:`references` and :attr:`lengths` instead"""
jpayne@69 1954 def __get__(self):
jpayne@69 1955 if self.header:
jpayne@69 1956 return self.header.__str__()
jpayne@69 1957 else:
jpayne@69 1958 raise ValueError("header not available in closed files")
jpayne@69 1959
jpayne@69 1960 # Compatibility functions for pysam < 0.8.3
jpayne@69 1961 def gettid(self, reference):
jpayne@69 1962 """deprecated, use :meth:`get_tid` instead"""
jpayne@69 1963 return self.get_tid(reference)
jpayne@69 1964
jpayne@69 1965 def getrname(self, tid):
jpayne@69 1966 """deprecated, use :meth:`get_reference_name` instead"""
jpayne@69 1967 return self.get_reference_name(tid)
jpayne@69 1968
jpayne@69 1969
jpayne@69 1970 cdef class IteratorRow:
jpayne@69 1971 '''abstract base class for iterators over mapped reads.
jpayne@69 1972
jpayne@69 1973 Various iterators implement different behaviours for wrapping around
jpayne@69 1974 contig boundaries. Examples include:
jpayne@69 1975
jpayne@69 1976 :class:`pysam.IteratorRowRegion`
jpayne@69 1977 iterate within a single contig and a defined region.
jpayne@69 1978
jpayne@69 1979 :class:`pysam.IteratorRowAll`
jpayne@69 1980 iterate until EOF. This iterator will also include unmapped reads.
jpayne@69 1981
jpayne@69 1982 :class:`pysam.IteratorRowAllRefs`
jpayne@69 1983 iterate over all reads in all reference sequences.
jpayne@69 1984
jpayne@69 1985 The method :meth:`AlignmentFile.fetch` returns an IteratorRow.
jpayne@69 1986
jpayne@69 1987 .. note::
jpayne@69 1988
jpayne@69 1989 It is usually not necessary to create an object of this class
jpayne@69 1990 explicitly. It is returned as a result of call to a
jpayne@69 1991 :meth:`AlignmentFile.fetch`.
jpayne@69 1992
jpayne@69 1993 '''
jpayne@69 1994
jpayne@69 1995 def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
jpayne@69 1996 cdef char *cfilename
jpayne@69 1997 cdef char *creference_filename
jpayne@69 1998 cdef char *cindexname = NULL
jpayne@69 1999
jpayne@69 2000 if not samfile.is_open:
jpayne@69 2001 raise ValueError("I/O operation on closed file")
jpayne@69 2002
jpayne@69 2003 # makes sure that samfile stays alive as long as the
jpayne@69 2004 # iterator is alive
jpayne@69 2005 self.samfile = samfile
jpayne@69 2006
jpayne@69 2007 # reopen the file - note that this makes the iterator
jpayne@69 2008 # slow and causes pileup to slow down significantly.
jpayne@69 2009 if multiple_iterators:
jpayne@69 2010
jpayne@69 2011 cfilename = samfile.filename
jpayne@69 2012 with nogil:
jpayne@69 2013 self.htsfile = hts_open(cfilename, 'r')
jpayne@69 2014 assert self.htsfile != NULL
jpayne@69 2015
jpayne@69 2016 if samfile.has_index():
jpayne@69 2017 if samfile.index_filename:
jpayne@69 2018 cindexname = bindex_filename = encode_filename(samfile.index_filename)
jpayne@69 2019 with nogil:
jpayne@69 2020 self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
jpayne@69 2021 else:
jpayne@69 2022 self.index = NULL
jpayne@69 2023
jpayne@69 2024 # need to advance in newly opened file to position after header
jpayne@69 2025 # better: use seek/tell?
jpayne@69 2026 with nogil:
jpayne@69 2027 hdr = sam_hdr_read(self.htsfile)
jpayne@69 2028 if hdr is NULL:
jpayne@69 2029 raise IOError("unable to read header information")
jpayne@69 2030 self.header = makeAlignmentHeader(hdr)
jpayne@69 2031
jpayne@69 2032 self.owns_samfile = True
jpayne@69 2033
jpayne@69 2034 # options specific to CRAM files
jpayne@69 2035 if samfile.is_cram and samfile.reference_filename:
jpayne@69 2036 creference_filename = samfile.reference_filename
jpayne@69 2037 hts_set_opt(self.htsfile,
jpayne@69 2038 CRAM_OPT_REFERENCE,
jpayne@69 2039 creference_filename)
jpayne@69 2040
jpayne@69 2041 else:
jpayne@69 2042 self.htsfile = samfile.htsfile
jpayne@69 2043 self.index = samfile.index
jpayne@69 2044 self.owns_samfile = False
jpayne@69 2045 self.header = samfile.header
jpayne@69 2046
jpayne@69 2047 self.retval = 0
jpayne@69 2048
jpayne@69 2049 self.b = bam_init1()
jpayne@69 2050
jpayne@69 2051 def __dealloc__(self):
jpayne@69 2052 bam_destroy1(self.b)
jpayne@69 2053 if self.owns_samfile:
jpayne@69 2054 hts_idx_destroy(self.index)
jpayne@69 2055 hts_close(self.htsfile)
jpayne@69 2056
jpayne@69 2057
jpayne@69 2058 cdef class IteratorRowRegion(IteratorRow):
jpayne@69 2059 """*(AlignmentFile samfile, int tid, int beg, int stop,
jpayne@69 2060 int multiple_iterators=False)*
jpayne@69 2061
jpayne@69 2062 iterate over mapped reads in a region.
jpayne@69 2063
jpayne@69 2064 .. note::
jpayne@69 2065
jpayne@69 2066 It is usually not necessary to create an object of this class
jpayne@69 2067 explicitly. It is returned as a result of call to a
jpayne@69 2068 :meth:`AlignmentFile.fetch`.
jpayne@69 2069
jpayne@69 2070 """
jpayne@69 2071
jpayne@69 2072 def __init__(self, AlignmentFile samfile,
jpayne@69 2073 int tid, int beg, int stop,
jpayne@69 2074 int multiple_iterators=False):
jpayne@69 2075
jpayne@69 2076 if not samfile.has_index():
jpayne@69 2077 raise ValueError("no index available for iteration")
jpayne@69 2078
jpayne@69 2079 super().__init__(samfile, multiple_iterators=multiple_iterators)
jpayne@69 2080 with nogil:
jpayne@69 2081 self.iter = sam_itr_queryi(
jpayne@69 2082 self.index,
jpayne@69 2083 tid,
jpayne@69 2084 beg,
jpayne@69 2085 stop)
jpayne@69 2086
jpayne@69 2087 def __iter__(self):
jpayne@69 2088 return self
jpayne@69 2089
jpayne@69 2090 cdef bam1_t * getCurrent(self):
jpayne@69 2091 return self.b
jpayne@69 2092
jpayne@69 2093 cdef int cnext(self):
jpayne@69 2094 '''cversion of iterator. Used by IteratorColumn'''
jpayne@69 2095 with nogil:
jpayne@69 2096 self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
jpayne@69 2097 self.iter,
jpayne@69 2098 self.b,
jpayne@69 2099 self.htsfile)
jpayne@69 2100
jpayne@69 2101 def __next__(self):
jpayne@69 2102 self.cnext()
jpayne@69 2103 if self.retval >= 0:
jpayne@69 2104 return makeAlignedSegment(self.b, self.header)
jpayne@69 2105 elif self.retval == -1:
jpayne@69 2106 raise StopIteration
jpayne@69 2107 elif self.retval == -2:
jpayne@69 2108 # Note: it is currently not the case that hts_iter_next
jpayne@69 2109 # returns -2 for a truncated file.
jpayne@69 2110 # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
jpayne@69 2111 raise IOError('truncated file')
jpayne@69 2112 else:
jpayne@69 2113 raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval))
jpayne@69 2114
jpayne@69 2115 def __dealloc__(self):
jpayne@69 2116 hts_itr_destroy(self.iter)
jpayne@69 2117
jpayne@69 2118
jpayne@69 2119 cdef class IteratorRowHead(IteratorRow):
jpayne@69 2120 """*(AlignmentFile samfile, n, int multiple_iterators=False)*
jpayne@69 2121
jpayne@69 2122 iterate over first n reads in `samfile`
jpayne@69 2123
jpayne@69 2124 .. note::
jpayne@69 2125 It is usually not necessary to create an object of this class
jpayne@69 2126 explicitly. It is returned as a result of call to a
jpayne@69 2127 :meth:`AlignmentFile.head`.
jpayne@69 2128
jpayne@69 2129 """
jpayne@69 2130
jpayne@69 2131 def __init__(self,
jpayne@69 2132 AlignmentFile samfile,
jpayne@69 2133 int n,
jpayne@69 2134 int multiple_iterators=False):
jpayne@69 2135 super().__init__(samfile, multiple_iterators=multiple_iterators)
jpayne@69 2136
jpayne@69 2137 self.max_rows = n
jpayne@69 2138 self.current_row = 0
jpayne@69 2139
jpayne@69 2140 def __iter__(self):
jpayne@69 2141 return self
jpayne@69 2142
jpayne@69 2143 cdef bam1_t * getCurrent(self):
jpayne@69 2144 return self.b
jpayne@69 2145
jpayne@69 2146 cdef int cnext(self):
jpayne@69 2147 '''cversion of iterator. Used by IteratorColumn'''
jpayne@69 2148 cdef int ret
jpayne@69 2149 cdef bam_hdr_t * hdr = self.header.ptr
jpayne@69 2150 with nogil:
jpayne@69 2151 ret = sam_read1(self.htsfile,
jpayne@69 2152 hdr,
jpayne@69 2153 self.b)
jpayne@69 2154 return ret
jpayne@69 2155
jpayne@69 2156 def __next__(self):
jpayne@69 2157 if self.current_row >= self.max_rows:
jpayne@69 2158 raise StopIteration
jpayne@69 2159
jpayne@69 2160 cdef int ret = self.cnext()
jpayne@69 2161 if ret >= 0:
jpayne@69 2162 self.current_row += 1
jpayne@69 2163 return makeAlignedSegment(self.b, self.header)
jpayne@69 2164 elif ret == -1:
jpayne@69 2165 raise StopIteration
jpayne@69 2166 else:
jpayne@69 2167 raise IOError(read_failure_reason(ret))
jpayne@69 2168
jpayne@69 2169
jpayne@69 2170 cdef class IteratorRowAll(IteratorRow):
jpayne@69 2171 """*(AlignmentFile samfile, int multiple_iterators=False)*
jpayne@69 2172
jpayne@69 2173 iterate over all reads in `samfile`
jpayne@69 2174
jpayne@69 2175 .. note::
jpayne@69 2176
jpayne@69 2177 It is usually not necessary to create an object of this class
jpayne@69 2178 explicitly. It is returned as a result of call to a
jpayne@69 2179 :meth:`AlignmentFile.fetch`.
jpayne@69 2180
jpayne@69 2181 """
jpayne@69 2182
jpayne@69 2183 def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
jpayne@69 2184 super().__init__(samfile, multiple_iterators=multiple_iterators)
jpayne@69 2185
jpayne@69 2186 def __iter__(self):
jpayne@69 2187 return self
jpayne@69 2188
jpayne@69 2189 cdef bam1_t * getCurrent(self):
jpayne@69 2190 return self.b
jpayne@69 2191
jpayne@69 2192 cdef int cnext(self):
jpayne@69 2193 '''cversion of iterator. Used by IteratorColumn'''
jpayne@69 2194 cdef int ret
jpayne@69 2195 cdef bam_hdr_t * hdr = self.header.ptr
jpayne@69 2196 with nogil:
jpayne@69 2197 ret = sam_read1(self.htsfile,
jpayne@69 2198 hdr,
jpayne@69 2199 self.b)
jpayne@69 2200 return ret
jpayne@69 2201
jpayne@69 2202 def __next__(self):
jpayne@69 2203 cdef int ret = self.cnext()
jpayne@69 2204 if ret >= 0:
jpayne@69 2205 return makeAlignedSegment(self.b, self.header)
jpayne@69 2206 elif ret == -1:
jpayne@69 2207 raise StopIteration
jpayne@69 2208 else:
jpayne@69 2209 raise IOError(read_failure_reason(ret))
jpayne@69 2210
jpayne@69 2211
jpayne@69 2212 cdef class IteratorRowAllRefs(IteratorRow):
jpayne@69 2213 """iterates over all mapped reads by chaining iterators over each
jpayne@69 2214 reference
jpayne@69 2215
jpayne@69 2216 .. note::
jpayne@69 2217 It is usually not necessary to create an object of this class
jpayne@69 2218 explicitly. It is returned as a result of call to a
jpayne@69 2219 :meth:`AlignmentFile.fetch`.
jpayne@69 2220
jpayne@69 2221 """
jpayne@69 2222
jpayne@69 2223 def __init__(self, AlignmentFile samfile, multiple_iterators=False):
jpayne@69 2224 super().__init__(samfile, multiple_iterators=multiple_iterators)
jpayne@69 2225
jpayne@69 2226 if not samfile.has_index():
jpayne@69 2227 raise ValueError("no index available for fetch")
jpayne@69 2228
jpayne@69 2229 self.tid = -1
jpayne@69 2230
jpayne@69 2231 def nextiter(self):
jpayne@69 2232 # get a new iterator for a chromosome. The file
jpayne@69 2233 # will not be re-opened.
jpayne@69 2234 self.rowiter = IteratorRowRegion(self.samfile,
jpayne@69 2235 self.tid,
jpayne@69 2236 0,
jpayne@69 2237 MAX_POS)
jpayne@69 2238 # set htsfile and header of the rowiter
jpayne@69 2239 # to the values in this iterator to reflect multiple_iterators
jpayne@69 2240 self.rowiter.htsfile = self.htsfile
jpayne@69 2241 self.rowiter.header = self.header
jpayne@69 2242
jpayne@69 2243 # make sure the iterator understand that IteratorRowAllRefs
jpayne@69 2244 # has ownership
jpayne@69 2245 self.rowiter.owns_samfile = False
jpayne@69 2246
jpayne@69 2247 def __iter__(self):
jpayne@69 2248 return self
jpayne@69 2249
jpayne@69 2250 def __next__(self):
jpayne@69 2251 # Create an initial iterator
jpayne@69 2252 if self.tid == -1:
jpayne@69 2253 if not self.samfile.nreferences:
jpayne@69 2254 raise StopIteration
jpayne@69 2255 self.tid = 0
jpayne@69 2256 self.nextiter()
jpayne@69 2257
jpayne@69 2258 while 1:
jpayne@69 2259 self.rowiter.cnext()
jpayne@69 2260
jpayne@69 2261 # If current iterator is not exhausted, return aligned read
jpayne@69 2262 if self.rowiter.retval > 0:
jpayne@69 2263 return makeAlignedSegment(self.rowiter.b, self.header)
jpayne@69 2264
jpayne@69 2265 self.tid += 1
jpayne@69 2266
jpayne@69 2267 # Otherwise, proceed to next reference or stop
jpayne@69 2268 if self.tid < self.samfile.nreferences:
jpayne@69 2269 self.nextiter()
jpayne@69 2270 else:
jpayne@69 2271 raise StopIteration
jpayne@69 2272
jpayne@69 2273
jpayne@69 2274 cdef class IteratorRowSelection(IteratorRow):
jpayne@69 2275 """*(AlignmentFile samfile)*
jpayne@69 2276
jpayne@69 2277 iterate over reads in `samfile` at a given list of file positions.
jpayne@69 2278
jpayne@69 2279 .. note::
jpayne@69 2280 It is usually not necessary to create an object of this class
jpayne@69 2281 explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
jpayne@69 2282 """
jpayne@69 2283
jpayne@69 2284 def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
jpayne@69 2285 super().__init__(samfile, multiple_iterators=multiple_iterators)
jpayne@69 2286
jpayne@69 2287 self.positions = positions
jpayne@69 2288 self.current_pos = 0
jpayne@69 2289
jpayne@69 2290 def __iter__(self):
jpayne@69 2291 return self
jpayne@69 2292
jpayne@69 2293 cdef bam1_t * getCurrent(self):
jpayne@69 2294 return self.b
jpayne@69 2295
jpayne@69 2296 cdef int cnext(self):
jpayne@69 2297 '''cversion of iterator'''
jpayne@69 2298 # end iteration if out of positions
jpayne@69 2299 if self.current_pos >= len(self.positions): return -1
jpayne@69 2300
jpayne@69 2301 cdef uint64_t pos = self.positions[self.current_pos]
jpayne@69 2302 with nogil:
jpayne@69 2303 bgzf_seek(hts_get_bgzfp(self.htsfile),
jpayne@69 2304 pos,
jpayne@69 2305 0)
jpayne@69 2306 self.current_pos += 1
jpayne@69 2307
jpayne@69 2308 cdef int ret
jpayne@69 2309 cdef bam_hdr_t * hdr = self.header.ptr
jpayne@69 2310 with nogil:
jpayne@69 2311 ret = sam_read1(self.htsfile,
jpayne@69 2312 hdr,
jpayne@69 2313 self.b)
jpayne@69 2314 return ret
jpayne@69 2315
jpayne@69 2316 def __next__(self):
jpayne@69 2317 cdef int ret = self.cnext()
jpayne@69 2318 if ret >= 0:
jpayne@69 2319 return makeAlignedSegment(self.b, self.header)
jpayne@69 2320 elif ret == -1:
jpayne@69 2321 raise StopIteration
jpayne@69 2322 else:
jpayne@69 2323 raise IOError(read_failure_reason(ret))
jpayne@69 2324
jpayne@69 2325
jpayne@69 2326 cdef int __advance_nofilter(void *data, bam1_t *b):
jpayne@69 2327 '''advance without any read filtering.
jpayne@69 2328 '''
jpayne@69 2329 cdef __iterdata * d = <__iterdata*>data
jpayne@69 2330 cdef int ret
jpayne@69 2331 with nogil:
jpayne@69 2332 ret = sam_itr_next(d.htsfile, d.iter, b)
jpayne@69 2333 return ret
jpayne@69 2334
jpayne@69 2335
jpayne@69 2336 cdef int __advance_raw_nofilter(void *data, bam1_t *b):
jpayne@69 2337 '''advance (without iterator) without any read filtering.
jpayne@69 2338 '''
jpayne@69 2339 cdef __iterdata * d = <__iterdata*>data
jpayne@69 2340 cdef int ret
jpayne@69 2341 with nogil:
jpayne@69 2342 ret = sam_read1(d.htsfile, d.header, b)
jpayne@69 2343 return ret
jpayne@69 2344
jpayne@69 2345
jpayne@69 2346 cdef int __advance_all(void *data, bam1_t *b):
jpayne@69 2347 '''only use reads for pileup passing basic filters such as
jpayne@69 2348
jpayne@69 2349 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
jpayne@69 2350 '''
jpayne@69 2351
jpayne@69 2352 cdef __iterdata * d = <__iterdata*>data
jpayne@69 2353 cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP
jpayne@69 2354 cdef int ret
jpayne@69 2355 while 1:
jpayne@69 2356 with nogil:
jpayne@69 2357 ret = sam_itr_next(d.htsfile, d.iter, b)
jpayne@69 2358 if ret < 0:
jpayne@69 2359 break
jpayne@69 2360 if b.core.flag & d.flag_filter:
jpayne@69 2361 continue
jpayne@69 2362 break
jpayne@69 2363 return ret
jpayne@69 2364
jpayne@69 2365
jpayne@69 2366 cdef int __advance_raw_all(void *data, bam1_t *b):
jpayne@69 2367 '''only use reads for pileup passing basic filters such as
jpayne@69 2368
jpayne@69 2369 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
jpayne@69 2370 '''
jpayne@69 2371
jpayne@69 2372 cdef __iterdata * d = <__iterdata*>data
jpayne@69 2373 cdef int ret
jpayne@69 2374 while 1:
jpayne@69 2375 with nogil:
jpayne@69 2376 ret = sam_read1(d.htsfile, d.header, b)
jpayne@69 2377 if ret < 0:
jpayne@69 2378 break
jpayne@69 2379 if b.core.flag & d.flag_filter:
jpayne@69 2380 continue
jpayne@69 2381 break
jpayne@69 2382 return ret
jpayne@69 2383
jpayne@69 2384
jpayne@69 2385 cdef int __advance_samtools(void * data, bam1_t * b):
jpayne@69 2386 '''advance using same filter and read processing as in
jpayne@69 2387 the samtools pileup.
jpayne@69 2388 '''
jpayne@69 2389 cdef __iterdata * d = <__iterdata*>data
jpayne@69 2390 cdef int ret
jpayne@69 2391 cdef int q
jpayne@69 2392
jpayne@69 2393 while 1:
jpayne@69 2394 with nogil:
jpayne@69 2395 ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
jpayne@69 2396 if ret < 0:
jpayne@69 2397 break
jpayne@69 2398 if b.core.flag & d.flag_filter:
jpayne@69 2399 continue
jpayne@69 2400 if d.flag_require and not (b.core.flag & d.flag_require):
jpayne@69 2401 continue
jpayne@69 2402
jpayne@69 2403 # reload sequence
jpayne@69 2404 if d.fastafile != NULL and b.core.tid != d.tid:
jpayne@69 2405 if d.seq != NULL:
jpayne@69 2406 free(d.seq)
jpayne@69 2407 d.tid = b.core.tid
jpayne@69 2408 with nogil:
jpayne@69 2409 d.seq = faidx_fetch_seq(
jpayne@69 2410 d.fastafile,
jpayne@69 2411 d.header.target_name[d.tid],
jpayne@69 2412 0, MAX_POS,
jpayne@69 2413 &d.seq_len)
jpayne@69 2414
jpayne@69 2415 if d.seq == NULL:
jpayne@69 2416 raise ValueError(
jpayne@69 2417 "reference sequence for '{}' (tid={}) not found".format(
jpayne@69 2418 d.header.target_name[d.tid], d.tid))
jpayne@69 2419
jpayne@69 2420 # realign read - changes base qualities
jpayne@69 2421 if d.seq != NULL and d.compute_baq:
jpayne@69 2422 # 4th option to realign is flag:
jpayne@69 2423 # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4
jpayne@69 2424 if d.redo_baq:
jpayne@69 2425 sam_prob_realn(b, d.seq, d.seq_len, 7)
jpayne@69 2426 else:
jpayne@69 2427 sam_prob_realn(b, d.seq, d.seq_len, 3)
jpayne@69 2428
jpayne@69 2429 if d.seq != NULL and d.adjust_capq_threshold > 10:
jpayne@69 2430 q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold)
jpayne@69 2431 if q < 0:
jpayne@69 2432 continue
jpayne@69 2433 elif b.core.qual > q:
jpayne@69 2434 b.core.qual = q
jpayne@69 2435
jpayne@69 2436 if b.core.qual < d.min_mapping_quality:
jpayne@69 2437 continue
jpayne@69 2438 if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR):
jpayne@69 2439 continue
jpayne@69 2440
jpayne@69 2441 break
jpayne@69 2442
jpayne@69 2443 return ret
jpayne@69 2444
jpayne@69 2445
jpayne@69 2446 cdef class IteratorColumn:
jpayne@69 2447 '''abstract base class for iterators over columns.
jpayne@69 2448
jpayne@69 2449 IteratorColumn objects wrap the pileup functionality of samtools.
jpayne@69 2450
jpayne@69 2451 For reasons of efficiency, the iterator points to the current
jpayne@69 2452 pileup buffer. The pileup buffer is updated at every iteration.
jpayne@69 2453 This might cause some unexpected behaviour. For example,
jpayne@69 2454 consider the conversion to a list::
jpayne@69 2455
jpayne@69 2456 f = AlignmentFile("file.bam", "rb")
jpayne@69 2457 result = list(f.pileup())
jpayne@69 2458
jpayne@69 2459 Here, ``result`` will contain ``n`` objects of type
jpayne@69 2460 :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
jpayne@69 2461 ``result`` will contain the same information.
jpayne@69 2462
jpayne@69 2463 The desired behaviour can be achieved by list comprehension::
jpayne@69 2464
jpayne@69 2465 result = [x.pileups() for x in f.pileup()]
jpayne@69 2466
jpayne@69 2467 ``result`` will be a list of ``n`` lists of objects of type
jpayne@69 2468 :class:`~pysam.PileupRead`.
jpayne@69 2469
jpayne@69 2470 If the iterator is associated with a :class:`~pysam.Fastafile`
jpayne@69 2471 using the :meth:`add_reference` method, then the iterator will
jpayne@69 2472 export the current sequence via the methods :meth:`get_sequence`
jpayne@69 2473 and :meth:`seq_len`.
jpayne@69 2474
jpayne@69 2475 See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
jpayne@69 2476 '''
jpayne@69 2477
jpayne@69 2478 def __cinit__( self, AlignmentFile samfile, **kwargs):
jpayne@69 2479 self.samfile = samfile
jpayne@69 2480 self.fastafile = kwargs.get("fastafile", None)
jpayne@69 2481 self.stepper = kwargs.get("stepper", "samtools")
jpayne@69 2482 self.max_depth = kwargs.get("max_depth", 8000)
jpayne@69 2483 self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
jpayne@69 2484 self.min_base_quality = kwargs.get("min_base_quality", 13)
jpayne@69 2485 self.iterdata.seq = NULL
jpayne@69 2486 self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
jpayne@69 2487 self.iterdata.flag_require = kwargs.get("flag_require", 0)
jpayne@69 2488 self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
jpayne@69 2489 self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
jpayne@69 2490 self.iterdata.compute_baq = kwargs.get("compute_baq", True)
jpayne@69 2491 self.iterdata.redo_baq = kwargs.get("redo_baq", False)
jpayne@69 2492 self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)
jpayne@69 2493
jpayne@69 2494 self.tid = 0
jpayne@69 2495 self.pos = 0
jpayne@69 2496 self.n_plp = 0
jpayne@69 2497 self.plp = NULL
jpayne@69 2498 self.pileup_iter = <bam_mplp_t>NULL
jpayne@69 2499
jpayne@69 2500 def __iter__(self):
jpayne@69 2501 return self
jpayne@69 2502
jpayne@69 2503 cdef int cnext(self):
jpayne@69 2504 '''perform next iteration.
jpayne@69 2505 '''
jpayne@69 2506 # do not release gil here because of call-backs
jpayne@69 2507 cdef int ret = bam_mplp_auto(self.pileup_iter,
jpayne@69 2508 &self.tid,
jpayne@69 2509 &self.pos,
jpayne@69 2510 &self.n_plp,
jpayne@69 2511 &self.plp)
jpayne@69 2512 return ret
jpayne@69 2513
jpayne@69 2514 cdef char * get_sequence(self):
jpayne@69 2515 '''return current reference sequence underlying the iterator.
jpayne@69 2516 '''
jpayne@69 2517 return self.iterdata.seq
jpayne@69 2518
jpayne@69 2519 property seq_len:
jpayne@69 2520 '''current sequence length.'''
jpayne@69 2521 def __get__(self):
jpayne@69 2522 return self.iterdata.seq_len
jpayne@69 2523
jpayne@69 2524 def add_reference(self, FastaFile fastafile):
jpayne@69 2525 '''
jpayne@69 2526 add reference sequences in `fastafile` to iterator.'''
jpayne@69 2527 self.fastafile = fastafile
jpayne@69 2528 if self.iterdata.seq != NULL:
jpayne@69 2529 free(self.iterdata.seq)
jpayne@69 2530 self.iterdata.tid = -1
jpayne@69 2531 self.iterdata.fastafile = self.fastafile.fastafile
jpayne@69 2532
jpayne@69 2533 def has_reference(self):
jpayne@69 2534 '''
jpayne@69 2535 return true if iterator is associated with a reference'''
jpayne@69 2536 return self.fastafile
jpayne@69 2537
jpayne@69 2538 cdef _setup_iterator(self,
jpayne@69 2539 int tid,
jpayne@69 2540 int start,
jpayne@69 2541 int stop,
jpayne@69 2542 int multiple_iterators=0):
jpayne@69 2543 '''setup the iterator structure'''
jpayne@69 2544
jpayne@69 2545 self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
jpayne@69 2546 self.iterdata.htsfile = self.samfile.htsfile
jpayne@69 2547 self.iterdata.iter = self.iter.iter
jpayne@69 2548 self.iterdata.seq = NULL
jpayne@69 2549 self.iterdata.tid = -1
jpayne@69 2550 self.iterdata.header = self.samfile.header.ptr
jpayne@69 2551
jpayne@69 2552 if self.fastafile is not None:
jpayne@69 2553 self.iterdata.fastafile = self.fastafile.fastafile
jpayne@69 2554 else:
jpayne@69 2555 self.iterdata.fastafile = NULL
jpayne@69 2556
jpayne@69 2557 # Free any previously allocated memory before reassigning
jpayne@69 2558 # pileup_iter
jpayne@69 2559 self._free_pileup_iter()
jpayne@69 2560
jpayne@69 2561 cdef void * data[1]
jpayne@69 2562 data[0] = <void*>&self.iterdata
jpayne@69 2563
jpayne@69 2564 if self.stepper is None or self.stepper == "all":
jpayne@69 2565 with nogil:
jpayne@69 2566 self.pileup_iter = bam_mplp_init(1,
jpayne@69 2567 <bam_plp_auto_f>&__advance_all,
jpayne@69 2568 data)
jpayne@69 2569 elif self.stepper == "nofilter":
jpayne@69 2570 with nogil:
jpayne@69 2571 self.pileup_iter = bam_mplp_init(1,
jpayne@69 2572 <bam_plp_auto_f>&__advance_nofilter,
jpayne@69 2573 data)
jpayne@69 2574 elif self.stepper == "samtools":
jpayne@69 2575 with nogil:
jpayne@69 2576 self.pileup_iter = bam_mplp_init(1,
jpayne@69 2577 <bam_plp_auto_f>&__advance_samtools,
jpayne@69 2578 data)
jpayne@69 2579 else:
jpayne@69 2580 raise ValueError(
jpayne@69 2581 "unknown stepper option `%s` in IteratorColumn" % self.stepper)
jpayne@69 2582
jpayne@69 2583 if self.max_depth:
jpayne@69 2584 with nogil:
jpayne@69 2585 bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)
jpayne@69 2586
jpayne@69 2587 if self.ignore_overlaps:
jpayne@69 2588 with nogil:
jpayne@69 2589 bam_mplp_init_overlaps(self.pileup_iter)
jpayne@69 2590
jpayne@69 2591 cdef _setup_raw_rest_iterator(self):
jpayne@69 2592 '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST'''
jpayne@69 2593
jpayne@69 2594 self.iter = None
jpayne@69 2595 self.iterdata.iter = NULL
jpayne@69 2596 self.iterdata.htsfile = self.samfile.htsfile
jpayne@69 2597 self.iterdata.seq = NULL
jpayne@69 2598 self.iterdata.tid = -1
jpayne@69 2599 self.iterdata.header = self.samfile.header.ptr
jpayne@69 2600
jpayne@69 2601 if self.fastafile is not None:
jpayne@69 2602 self.iterdata.fastafile = self.fastafile.fastafile
jpayne@69 2603 else:
jpayne@69 2604 self.iterdata.fastafile = NULL
jpayne@69 2605
jpayne@69 2606 # Free any previously allocated memory before reassigning
jpayne@69 2607 # pileup_iter
jpayne@69 2608 self._free_pileup_iter()
jpayne@69 2609
jpayne@69 2610 cdef void * data[1]
jpayne@69 2611 data[0] = <void*>&self.iterdata
jpayne@69 2612
jpayne@69 2613 if self.stepper is None or self.stepper == "all":
jpayne@69 2614 with nogil:
jpayne@69 2615 self.pileup_iter = bam_mplp_init(1,
jpayne@69 2616 <bam_plp_auto_f>&__advance_raw_all,
jpayne@69 2617 data)
jpayne@69 2618 elif self.stepper == "nofilter":
jpayne@69 2619 with nogil:
jpayne@69 2620 self.pileup_iter = bam_mplp_init(1,
jpayne@69 2621 <bam_plp_auto_f>&__advance_raw_nofilter,
jpayne@69 2622 data)
jpayne@69 2623 elif self.stepper == "samtools":
jpayne@69 2624 with nogil:
jpayne@69 2625 self.pileup_iter = bam_mplp_init(1,
jpayne@69 2626 <bam_plp_auto_f>&__advance_samtools,
jpayne@69 2627 data)
jpayne@69 2628 else:
jpayne@69 2629 raise ValueError(
jpayne@69 2630 "unknown stepper option `%s` in IteratorColumn" % self.stepper)
jpayne@69 2631
jpayne@69 2632 if self.max_depth:
jpayne@69 2633 with nogil:
jpayne@69 2634 bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)
jpayne@69 2635
jpayne@69 2636 if self.ignore_overlaps:
jpayne@69 2637 with nogil:
jpayne@69 2638 bam_mplp_init_overlaps(self.pileup_iter)
jpayne@69 2639
jpayne@69 2640 cdef reset(self, tid, start, stop):
jpayne@69 2641 '''reset iterator position.
jpayne@69 2642
jpayne@69 2643 This permits using the iterator multiple times without
jpayne@69 2644 having to incur the full set-up costs.
jpayne@69 2645 '''
jpayne@69 2646 if self.iter is None:
jpayne@69 2647 raise TypeError("Raw iterator set up without region cannot be reset")
jpayne@69 2648
jpayne@69 2649 self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
jpayne@69 2650 self.iterdata.iter = self.iter.iter
jpayne@69 2651
jpayne@69 2652 # invalidate sequence if different tid
jpayne@69 2653 if self.tid != tid:
jpayne@69 2654 if self.iterdata.seq != NULL:
jpayne@69 2655 free(self.iterdata.seq)
jpayne@69 2656 self.iterdata.seq = NULL
jpayne@69 2657 self.iterdata.tid = -1
jpayne@69 2658
jpayne@69 2659 # self.pileup_iter = bam_mplp_init(1
jpayne@69 2660 # &__advancepileup,
jpayne@69 2661 # &self.iterdata)
jpayne@69 2662 with nogil:
jpayne@69 2663 bam_mplp_reset(self.pileup_iter)
jpayne@69 2664
jpayne@69 2665 cdef _free_pileup_iter(self):
jpayne@69 2666 '''free the memory alloc'd by bam_plp_init.
jpayne@69 2667
jpayne@69 2668 This is needed before setup_iterator allocates another
jpayne@69 2669 pileup_iter, or else memory will be lost. '''
jpayne@69 2670 if self.pileup_iter != <bam_mplp_t>NULL:
jpayne@69 2671 with nogil:
jpayne@69 2672 bam_mplp_reset(self.pileup_iter)
jpayne@69 2673 bam_mplp_destroy(self.pileup_iter)
jpayne@69 2674 self.pileup_iter = <bam_mplp_t>NULL
jpayne@69 2675
jpayne@69 2676 def __dealloc__(self):
jpayne@69 2677 # reset in order to avoid memory leak messages for iterators
jpayne@69 2678 # that have not been fully consumed
jpayne@69 2679 self._free_pileup_iter()
jpayne@69 2680 self.plp = <const bam_pileup1_t*>NULL
jpayne@69 2681
jpayne@69 2682 if self.iterdata.seq != NULL:
jpayne@69 2683 free(self.iterdata.seq)
jpayne@69 2684 self.iterdata.seq = NULL
jpayne@69 2685
jpayne@69 2686 # backwards compatibility
jpayne@69 2687
jpayne@69 2688 def hasReference(self):
jpayne@69 2689 return self.has_reference()
jpayne@69 2690 cdef char * getSequence(self):
jpayne@69 2691 return self.get_sequence()
jpayne@69 2692 def addReference(self, FastaFile fastafile):
jpayne@69 2693 return self.add_reference(fastafile)
jpayne@69 2694
jpayne@69 2695
jpayne@69 2696 cdef class IteratorColumnRegion(IteratorColumn):
jpayne@69 2697 '''iterates over a region only.
jpayne@69 2698 '''
jpayne@69 2699 def __cinit__(self,
jpayne@69 2700 AlignmentFile samfile,
jpayne@69 2701 int tid = 0,
jpayne@69 2702 int start = 0,
jpayne@69 2703 int stop = MAX_POS,
jpayne@69 2704 int truncate = False,
jpayne@69 2705 int multiple_iterators = True,
jpayne@69 2706 **kwargs ):
jpayne@69 2707
jpayne@69 2708 # initialize iterator. Multiple iterators not available
jpayne@69 2709 # for CRAM.
jpayne@69 2710 if multiple_iterators and samfile.is_cram:
jpayne@69 2711 warnings.warn("multiple_iterators not implemented for CRAM")
jpayne@69 2712 multiple_iterators = False
jpayne@69 2713
jpayne@69 2714 self._setup_iterator(tid, start, stop, multiple_iterators)
jpayne@69 2715 self.start = start
jpayne@69 2716 self.stop = stop
jpayne@69 2717 self.truncate = truncate
jpayne@69 2718
jpayne@69 2719 def __next__(self):
jpayne@69 2720
jpayne@69 2721 cdef int n
jpayne@69 2722
jpayne@69 2723 while 1:
jpayne@69 2724 n = self.cnext()
jpayne@69 2725 if n < 0:
jpayne@69 2726 raise ValueError("error during iteration" )
jpayne@69 2727
jpayne@69 2728 if n == 0:
jpayne@69 2729 raise StopIteration
jpayne@69 2730
jpayne@69 2731 if self.truncate:
jpayne@69 2732 if self.start > self.pos:
jpayne@69 2733 continue
jpayne@69 2734 if self.pos >= self.stop:
jpayne@69 2735 raise StopIteration
jpayne@69 2736
jpayne@69 2737 return makePileupColumn(&self.plp,
jpayne@69 2738 self.tid,
jpayne@69 2739 self.pos,
jpayne@69 2740 self.n_plp,
jpayne@69 2741 self.min_base_quality,
jpayne@69 2742 self.iterdata.seq,
jpayne@69 2743 self.samfile.header)
jpayne@69 2744
jpayne@69 2745
jpayne@69 2746 cdef class IteratorColumnAllRefs(IteratorColumn):
jpayne@69 2747 """iterates over all columns by chaining iterators over each reference
jpayne@69 2748 """
jpayne@69 2749
jpayne@69 2750 def __cinit__(self,
jpayne@69 2751 AlignmentFile samfile,
jpayne@69 2752 **kwargs):
jpayne@69 2753
jpayne@69 2754 # no iteration over empty files
jpayne@69 2755 if not samfile.nreferences:
jpayne@69 2756 raise StopIteration
jpayne@69 2757
jpayne@69 2758 # initialize iterator
jpayne@69 2759 self._setup_iterator(self.tid, 0, MAX_POS, 1)
jpayne@69 2760
jpayne@69 2761 def __next__(self):
jpayne@69 2762
jpayne@69 2763 cdef int n
jpayne@69 2764 while 1:
jpayne@69 2765 n = self.cnext()
jpayne@69 2766 if n < 0:
jpayne@69 2767 raise ValueError("error during iteration")
jpayne@69 2768
jpayne@69 2769 # proceed to next reference or stop
jpayne@69 2770 if n == 0:
jpayne@69 2771 self.tid += 1
jpayne@69 2772 if self.tid < self.samfile.nreferences:
jpayne@69 2773 self._setup_iterator(self.tid, 0, MAX_POS, 0)
jpayne@69 2774 else:
jpayne@69 2775 raise StopIteration
jpayne@69 2776 continue
jpayne@69 2777
jpayne@69 2778 # return result, if within same reference
jpayne@69 2779 return makePileupColumn(&self.plp,
jpayne@69 2780 self.tid,
jpayne@69 2781 self.pos,
jpayne@69 2782 self.n_plp,
jpayne@69 2783 self.min_base_quality,
jpayne@69 2784 self.iterdata.seq,
jpayne@69 2785 self.samfile.header)
jpayne@69 2786
jpayne@69 2787
jpayne@69 2788 cdef class IteratorColumnAll(IteratorColumn):
jpayne@69 2789 """iterates over all columns, without using an index
jpayne@69 2790 """
jpayne@69 2791
jpayne@69 2792 def __cinit__(self,
jpayne@69 2793 AlignmentFile samfile,
jpayne@69 2794 **kwargs):
jpayne@69 2795
jpayne@69 2796 self._setup_raw_rest_iterator()
jpayne@69 2797
jpayne@69 2798 def __next__(self):
jpayne@69 2799
jpayne@69 2800 cdef int n
jpayne@69 2801 n = self.cnext()
jpayne@69 2802 if n < 0:
jpayne@69 2803 raise ValueError("error during iteration")
jpayne@69 2804
jpayne@69 2805 if n == 0:
jpayne@69 2806 raise StopIteration
jpayne@69 2807
jpayne@69 2808 return makePileupColumn(&self.plp,
jpayne@69 2809 self.tid,
jpayne@69 2810 self.pos,
jpayne@69 2811 self.n_plp,
jpayne@69 2812 self.min_base_quality,
jpayne@69 2813 self.iterdata.seq,
jpayne@69 2814 self.samfile.header)
jpayne@69 2815
jpayne@69 2816
jpayne@69 2817 cdef class SNPCall:
jpayne@69 2818 '''the results of a SNP call.'''
jpayne@69 2819 cdef int _tid
jpayne@69 2820 cdef int _pos
jpayne@69 2821 cdef char _reference_base
jpayne@69 2822 cdef char _genotype
jpayne@69 2823 cdef int _consensus_quality
jpayne@69 2824 cdef int _snp_quality
jpayne@69 2825 cdef int _rms_mapping_quality
jpayne@69 2826 cdef int _coverage
jpayne@69 2827
jpayne@69 2828 property tid:
jpayne@69 2829 '''the chromosome ID as is defined in the header'''
jpayne@69 2830 def __get__(self):
jpayne@69 2831 return self._tid
jpayne@69 2832
jpayne@69 2833 property pos:
jpayne@69 2834 '''nucleotide position of SNP.'''
jpayne@69 2835 def __get__(self): return self._pos
jpayne@69 2836
jpayne@69 2837 property reference_base:
jpayne@69 2838 '''reference base at pos. ``N`` if no reference sequence supplied.'''
jpayne@69 2839 def __get__(self): return from_string_and_size( &self._reference_base, 1 )
jpayne@69 2840
jpayne@69 2841 property genotype:
jpayne@69 2842 '''the genotype called.'''
jpayne@69 2843 def __get__(self): return from_string_and_size( &self._genotype, 1 )
jpayne@69 2844
jpayne@69 2845 property consensus_quality:
jpayne@69 2846 '''the genotype quality (Phred-scaled).'''
jpayne@69 2847 def __get__(self): return self._consensus_quality
jpayne@69 2848
jpayne@69 2849 property snp_quality:
jpayne@69 2850 '''the snp quality (Phred scaled) - probability of consensus being
jpayne@69 2851 identical to reference sequence.'''
jpayne@69 2852 def __get__(self): return self._snp_quality
jpayne@69 2853
jpayne@69 2854 property mapping_quality:
jpayne@69 2855 '''the root mean square (rms) of the mapping quality of all reads
jpayne@69 2856 involved in the call.'''
jpayne@69 2857 def __get__(self): return self._rms_mapping_quality
jpayne@69 2858
jpayne@69 2859 property coverage:
jpayne@69 2860 '''coverage or read depth - the number of reads involved in the call.'''
jpayne@69 2861 def __get__(self): return self._coverage
jpayne@69 2862
jpayne@69 2863 def __str__(self):
jpayne@69 2864
jpayne@69 2865 return "\t".join( map(str, (
jpayne@69 2866 self.tid,
jpayne@69 2867 self.pos,
jpayne@69 2868 self.reference_base,
jpayne@69 2869 self.genotype,
jpayne@69 2870 self.consensus_quality,
jpayne@69 2871 self.snp_quality,
jpayne@69 2872 self.mapping_quality,
jpayne@69 2873 self.coverage ) ) )
jpayne@69 2874
jpayne@69 2875
jpayne@69 2876 cdef class IndexedReads:
jpayne@69 2877 """Index a Sam/BAM-file by query name while keeping the
jpayne@69 2878 original sort order intact.
jpayne@69 2879
jpayne@69 2880 The index is kept in memory and can be substantial.
jpayne@69 2881
jpayne@69 2882 By default, the file is re-opened to avoid conflicts if multiple
jpayne@69 2883 operators work on the same file. Set `multiple_iterators` = False
jpayne@69 2884 to not re-open `samfile`.
jpayne@69 2885
jpayne@69 2886 Parameters
jpayne@69 2887 ----------
jpayne@69 2888
jpayne@69 2889 samfile : AlignmentFile
jpayne@69 2890 File to be indexed.
jpayne@69 2891
jpayne@69 2892 multiple_iterators : bool
jpayne@69 2893 Flag indicating whether the file should be reopened. Reopening prevents
jpayne@69 2894 existing iterators being affected by the indexing.
jpayne@69 2895
jpayne@69 2896 """
jpayne@69 2897
jpayne@69 2898 def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
jpayne@69 2899 cdef char *cfilename
jpayne@69 2900
jpayne@69 2901 # makes sure that samfile stays alive as long as this
jpayne@69 2902 # object is alive.
jpayne@69 2903 self.samfile = samfile
jpayne@69 2904 cdef bam_hdr_t * hdr = NULL
jpayne@69 2905 assert samfile.is_bam, "can only apply IndexReads on bam files"
jpayne@69 2906
jpayne@69 2907 # multiple_iterators the file - note that this makes the iterator
jpayne@69 2908 # slow and causes pileup to slow down significantly.
jpayne@69 2909 if multiple_iterators:
jpayne@69 2910 cfilename = samfile.filename
jpayne@69 2911 with nogil:
jpayne@69 2912 self.htsfile = hts_open(cfilename, 'r')
jpayne@69 2913 if self.htsfile == NULL:
jpayne@69 2914 raise OSError("unable to reopen htsfile")
jpayne@69 2915
jpayne@69 2916 # need to advance in newly opened file to position after header
jpayne@69 2917 # better: use seek/tell?
jpayne@69 2918 with nogil:
jpayne@69 2919 hdr = sam_hdr_read(self.htsfile)
jpayne@69 2920 if hdr == NULL:
jpayne@69 2921 raise OSError("unable to read header information")
jpayne@69 2922 self.header = makeAlignmentHeader(hdr)
jpayne@69 2923 self.owns_samfile = True
jpayne@69 2924 else:
jpayne@69 2925 self.htsfile = self.samfile.htsfile
jpayne@69 2926 self.header = samfile.header
jpayne@69 2927 self.owns_samfile = False
jpayne@69 2928
jpayne@69 2929 def build(self):
jpayne@69 2930 '''build the index.'''
jpayne@69 2931
jpayne@69 2932 self.index = collections.defaultdict(list)
jpayne@69 2933
jpayne@69 2934 # this method will start indexing from the current file position
jpayne@69 2935 cdef int ret = 1
jpayne@69 2936 cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
jpayne@69 2937 if b == NULL:
jpayne@69 2938 raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))
jpayne@69 2939
jpayne@69 2940 cdef uint64_t pos
jpayne@69 2941 cdef bam_hdr_t * hdr = self.header.ptr
jpayne@69 2942
jpayne@69 2943 while ret > 0:
jpayne@69 2944 with nogil:
jpayne@69 2945 pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
jpayne@69 2946 ret = sam_read1(self.htsfile,
jpayne@69 2947 hdr,
jpayne@69 2948 b)
jpayne@69 2949
jpayne@69 2950 if ret > 0:
jpayne@69 2951 qname = charptr_to_str(pysam_bam_get_qname(b))
jpayne@69 2952 self.index[qname].append(pos)
jpayne@69 2953
jpayne@69 2954 bam_destroy1(b)
jpayne@69 2955
jpayne@69 2956 def find(self, query_name):
jpayne@69 2957 '''find `query_name` in index.
jpayne@69 2958
jpayne@69 2959 Returns
jpayne@69 2960 -------
jpayne@69 2961
jpayne@69 2962 IteratorRowSelection
jpayne@69 2963 Returns an iterator over all reads with query_name.
jpayne@69 2964
jpayne@69 2965 Raises
jpayne@69 2966 ------
jpayne@69 2967
jpayne@69 2968 KeyError
jpayne@69 2969 if the `query_name` is not in the index.
jpayne@69 2970
jpayne@69 2971 '''
jpayne@69 2972 if query_name in self.index:
jpayne@69 2973 return IteratorRowSelection(
jpayne@69 2974 self.samfile,
jpayne@69 2975 self.index[query_name],
jpayne@69 2976 multiple_iterators = False)
jpayne@69 2977 else:
jpayne@69 2978 raise KeyError("read %s not found" % query_name)
jpayne@69 2979
jpayne@69 2980 def __dealloc__(self):
jpayne@69 2981 if self.owns_samfile:
jpayne@69 2982 hts_close(self.htsfile)