jpayne@68
|
1 # cython: embedsignature=True
|
jpayne@68
|
2 # cython: profile=True
|
jpayne@68
|
3 ########################################################
|
jpayne@68
|
4 ########################################################
|
jpayne@68
|
5 # Cython wrapper for SAM/BAM/CRAM files based on htslib
|
jpayne@68
|
6 ########################################################
|
jpayne@68
|
7 # The principal classes defined in this module are:
|
jpayne@68
|
8 #
|
jpayne@68
|
9 # class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
|
jpayne@68
|
10 #
|
jpayne@68
|
11 # class AlignmentHeader manage SAM/BAM/CRAM header data
|
jpayne@68
|
12 #
|
jpayne@68
|
13 # class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
|
jpayne@68
|
14 # the original sort order intact
|
jpayne@68
|
15 #
|
jpayne@68
|
16 # Additionally this module defines numerous additional classes that
|
jpayne@68
|
17 # are part of the internal API. These are:
|
jpayne@68
|
18 #
|
jpayne@68
|
19 # Various iterator classes to iterate over alignments in sequential
|
jpayne@68
|
20 # (IteratorRow) or in a stacked fashion (IteratorColumn):
|
jpayne@68
|
21 #
|
jpayne@68
|
22 # class IteratorRow
|
jpayne@68
|
23 # class IteratorRowRegion
|
jpayne@68
|
24 # class IteratorRowHead
|
jpayne@68
|
25 # class IteratorRowAll
|
jpayne@68
|
26 # class IteratorRowAllRefs
|
jpayne@68
|
27 # class IteratorRowSelection
|
jpayne@68
|
28 # class IteratorColumn
|
jpayne@68
|
29 # class IteratorColumnRegion
|
jpayne@68
|
30 # class IteratorColumnAll
|
jpayne@68
|
31 # class IteratorColumnAllRefs
|
jpayne@68
|
32 #
|
jpayne@68
|
33 ########################################################
|
jpayne@68
|
34 #
|
jpayne@68
|
35 # The MIT License
|
jpayne@68
|
36 #
|
jpayne@68
|
37 # Copyright (c) 2015 Andreas Heger
|
jpayne@68
|
38 #
|
jpayne@68
|
39 # Permission is hereby granted, free of charge, to any person obtaining a
|
jpayne@68
|
40 # copy of this software and associated documentation files (the "Software"),
|
jpayne@68
|
41 # to deal in the Software without restriction, including without limitation
|
jpayne@68
|
42 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
jpayne@68
|
43 # and/or sell copies of the Software, and to permit persons to whom the
|
jpayne@68
|
44 # Software is furnished to do so, subject to the following conditions:
|
jpayne@68
|
45 #
|
jpayne@68
|
46 # The above copyright notice and this permission notice shall be included in
|
jpayne@68
|
47 # all copies or substantial portions of the Software.
|
jpayne@68
|
48 #
|
jpayne@68
|
49 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
jpayne@68
|
50 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
jpayne@68
|
51 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
jpayne@68
|
52 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
jpayne@68
|
53 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
jpayne@68
|
54 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
jpayne@68
|
55 # DEALINGS IN THE SOFTWARE.
|
jpayne@68
|
56 #
|
jpayne@68
|
57 ########################################################
|
jpayne@68
|
58 import os
|
jpayne@68
|
59 import collections
|
jpayne@68
|
60 try:
|
jpayne@68
|
61 from collections.abc import Sequence, Mapping # noqa
|
jpayne@68
|
62 except ImportError:
|
jpayne@68
|
63 from collections import Sequence, Mapping # noqa
|
jpayne@68
|
64 import re
|
jpayne@68
|
65 import warnings
|
jpayne@68
|
66 import array
|
jpayne@68
|
67 from libc.errno cimport errno, EPIPE
|
jpayne@68
|
68 from libc.string cimport strcmp, strpbrk, strerror
|
jpayne@68
|
69 from libc.stdint cimport INT32_MAX
|
jpayne@68
|
70
|
jpayne@68
|
71 from cpython cimport array as c_array
|
jpayne@68
|
72
|
jpayne@68
|
73 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
|
jpayne@68
|
74 from pysam.libcutils cimport encode_filename, from_string_and_size
|
jpayne@68
|
75 from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
|
jpayne@68
|
76 from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
|
jpayne@68
|
77 HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL
|
jpayne@68
|
78
|
jpayne@68
|
79 from io import StringIO
|
jpayne@68
|
80
|
jpayne@68
|
81 cimport cython
|
jpayne@68
|
82
|
jpayne@68
|
83
|
jpayne@68
|
84 __all__ = [
|
jpayne@68
|
85 "AlignmentFile",
|
jpayne@68
|
86 "AlignmentHeader",
|
jpayne@68
|
87 "IteratorRow",
|
jpayne@68
|
88 "IteratorColumn",
|
jpayne@68
|
89 "IndexedReads"]
|
jpayne@68
|
90
|
jpayne@68
|
91 IndexStats = collections.namedtuple("IndexStats",
|
jpayne@68
|
92 ("contig",
|
jpayne@68
|
93 "mapped",
|
jpayne@68
|
94 "unmapped",
|
jpayne@68
|
95 "total"))
|
jpayne@68
|
96
|
jpayne@68
|
97 ########################################################
|
jpayne@68
|
98 ## global variables
|
jpayne@68
|
99 # maximum genomic coordinace
|
jpayne@68
|
100 # for some reason, using 'int' causes overflow
|
jpayne@68
|
101 cdef int MAX_POS = (1 << 31) - 1
|
jpayne@68
|
102
|
jpayne@68
|
103 # valid types for SAM headers
|
jpayne@68
|
104 VALID_HEADER_TYPES = {"HD" : Mapping,
|
jpayne@68
|
105 "SQ" : Sequence,
|
jpayne@68
|
106 "RG" : Sequence,
|
jpayne@68
|
107 "PG" : Sequence,
|
jpayne@68
|
108 "CO" : Sequence}
|
jpayne@68
|
109
|
jpayne@68
|
110 # order of records within SAM headers
|
jpayne@68
|
111 VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
|
jpayne@68
|
112
|
jpayne@68
|
113 # default type conversions within SAM header records
|
jpayne@68
|
114 KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str,
|
jpayne@68
|
115 "SS" : str,},
|
jpayne@68
|
116 "SQ" : {"SN" : str, "LN" : int, "AS" : str,
|
jpayne@68
|
117 "M5" : str, "SP" : str, "UR" : str,
|
jpayne@68
|
118 "AH" : str, "TP" : str, "DS" : str,
|
jpayne@68
|
119 "AN" : str,},
|
jpayne@68
|
120 "RG" : {"ID" : str, "CN" : str, "DS" : str,
|
jpayne@68
|
121 "DT" : str, "FO" : str, "KS" : str,
|
jpayne@68
|
122 "LB" : str, "PG" : str, "PI" : str,
|
jpayne@68
|
123 "PL" : str, "PM" : str, "PU" : str,
|
jpayne@68
|
124 "SM" : str, "BC" : str,},
|
jpayne@68
|
125 "PG" : {"ID" : str, "PN" : str, "CL" : str,
|
jpayne@68
|
126 "PP" : str, "DS" : str, "VN" : str,},}
|
jpayne@68
|
127
|
jpayne@68
|
128 # output order of fields within records. Ensure that CL is at
|
jpayne@68
|
129 # the end as parsing a CL will ignore any subsequent records.
|
jpayne@68
|
130 VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "SS", "GO"),
|
jpayne@68
|
131 "SQ" : ("SN", "LN", "AS", "M5",
|
jpayne@68
|
132 "UR", "SP", "AH", "TP",
|
jpayne@68
|
133 "DS", "AN"),
|
jpayne@68
|
134 "RG" : ("ID", "CN", "SM", "LB",
|
jpayne@68
|
135 "PU", "PI", "DT", "DS",
|
jpayne@68
|
136 "PL", "FO", "KS", "PG",
|
jpayne@68
|
137 "PM", "BC"),
|
jpayne@68
|
138 "PG" : ("PN", "ID", "VN", "PP",
|
jpayne@68
|
139 "DS", "CL"),}
|
jpayne@68
|
140
|
jpayne@68
|
141
|
jpayne@68
|
142 def build_header_line(fields, record):
|
jpayne@68
|
143 '''build a header line from `fields` dictionary for `record`'''
|
jpayne@68
|
144
|
jpayne@68
|
145 # TODO: add checking for field and sort order
|
jpayne@68
|
146 line = ["@%s" % record]
|
jpayne@68
|
147 # comment
|
jpayne@68
|
148 if record == "CO":
|
jpayne@68
|
149 line.append(fields)
|
jpayne@68
|
150 # user tags
|
jpayne@68
|
151 elif record.islower():
|
jpayne@68
|
152 for key in sorted(fields):
|
jpayne@68
|
153 line.append("%s:%s" % (key, str(fields[key])))
|
jpayne@68
|
154 # defined tags
|
jpayne@68
|
155 else:
|
jpayne@68
|
156 # write fields of the specification
|
jpayne@68
|
157 for key in VALID_HEADER_ORDER[record]:
|
jpayne@68
|
158 if key in fields:
|
jpayne@68
|
159 line.append("%s:%s" % (key, str(fields[key])))
|
jpayne@68
|
160 # write user fields
|
jpayne@68
|
161 for key in fields:
|
jpayne@68
|
162 if not key.isupper():
|
jpayne@68
|
163 line.append("%s:%s" % (key, str(fields[key])))
|
jpayne@68
|
164
|
jpayne@68
|
165 return "\t".join(line)
|
jpayne@68
|
166
|
jpayne@68
|
167
|
jpayne@68
|
168 cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
|
jpayne@68
|
169 if not hdr:
|
jpayne@68
|
170 raise ValueError('cannot create AlignmentHeader, received NULL pointer')
|
jpayne@68
|
171
|
jpayne@68
|
172 # check: is AlignmetHeader.__cinit__ called?
|
jpayne@68
|
173 cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader)
|
jpayne@68
|
174 header.ptr = hdr
|
jpayne@68
|
175
|
jpayne@68
|
176 return header
|
jpayne@68
|
177
|
jpayne@68
|
178 def read_failure_reason(code):
|
jpayne@68
|
179 if code == -2:
|
jpayne@68
|
180 return 'truncated file'
|
jpayne@68
|
181 else:
|
jpayne@68
|
182 return "error {} while reading file".format(code)
|
jpayne@68
|
183
|
jpayne@68
|
184
|
jpayne@68
|
185 # the following should be class-method for VariantHeader, but cdef @classmethods
|
jpayne@68
|
186 # are not implemented in cython.
|
jpayne@68
|
187 cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
|
jpayne@68
|
188 reference_names,
|
jpayne@68
|
189 reference_lengths,
|
jpayne@68
|
190 add_sq_text=True,
|
jpayne@68
|
191 text=None) except -1:
|
jpayne@68
|
192 """build header from list of reference names and lengths.
|
jpayne@68
|
193 """
|
jpayne@68
|
194
|
jpayne@68
|
195 cdef class AlignmentHeader(object):
|
jpayne@68
|
196 """header information for a :class:`AlignmentFile` object
|
jpayne@68
|
197
|
jpayne@68
|
198 Parameters
|
jpayne@68
|
199 ----------
|
jpayne@68
|
200 header_dict : dict
|
jpayne@68
|
201 build header from a multi-level dictionary. The
|
jpayne@68
|
202 first level are the four types ('HD', 'SQ', ...). The second
|
jpayne@68
|
203 level are a list of lines, with each line being a list of
|
jpayne@68
|
204 tag-value pairs. The header is constructed first from all the
|
jpayne@68
|
205 defined fields, followed by user tags in alphabetical
|
jpayne@68
|
206 order. Alternatively, an :class:`~pysam.AlignmentHeader`
|
jpayne@68
|
207 object can be passed directly.
|
jpayne@68
|
208
|
jpayne@68
|
209 text : string
|
jpayne@68
|
210 use the string provided as the header
|
jpayne@68
|
211
|
jpayne@68
|
212 reference_names : list
|
jpayne@68
|
213 see reference_lengths
|
jpayne@68
|
214
|
jpayne@68
|
215 reference_lengths : list
|
jpayne@68
|
216 build header from list of chromosome names and lengths. By
|
jpayne@68
|
217 default, 'SQ' and 'LN' tags will be added to the header
|
jpayne@68
|
218 text. This option can be changed by unsetting the flag
|
jpayne@68
|
219 `add_sq_text`.
|
jpayne@68
|
220
|
jpayne@68
|
221 add_sq_text : bool
|
jpayne@68
|
222 do not add 'SQ' and 'LN' tags to header. This option permits
|
jpayne@68
|
223 construction :term:`SAM` formatted files without a header.
|
jpayne@68
|
224
|
jpayne@68
|
225 """
|
jpayne@68
|
226
|
jpayne@68
|
227 # See makeVariantHeader for C constructor
|
jpayne@68
|
228 def __cinit__(self):
|
jpayne@68
|
229 self.ptr = NULL
|
jpayne@68
|
230
|
jpayne@68
|
231 # Python constructor
|
jpayne@68
|
232 def __init__(self):
|
jpayne@68
|
233 self.ptr = bam_hdr_init()
|
jpayne@68
|
234 if self.ptr is NULL:
|
jpayne@68
|
235 raise MemoryError("could not create header")
|
jpayne@68
|
236
|
jpayne@68
|
237 @classmethod
|
jpayne@68
|
238 def _from_text_and_lengths(cls, text, reference_names, reference_lengths):
|
jpayne@68
|
239
|
jpayne@68
|
240 cdef AlignmentHeader self = AlignmentHeader()
|
jpayne@68
|
241 cdef char *ctext
|
jpayne@68
|
242 cdef int l_text
|
jpayne@68
|
243 cdef int n, x
|
jpayne@68
|
244 if text is not None:
|
jpayne@68
|
245 btext = force_bytes(text)
|
jpayne@68
|
246 ctext = btext
|
jpayne@68
|
247 l_text = len(btext)
|
jpayne@68
|
248 self.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
|
jpayne@68
|
249 if self.ptr.text == NULL:
|
jpayne@68
|
250 raise MemoryError("could not allocate {} bytes".format(l_text + 1), sizeof(char))
|
jpayne@68
|
251 self.ptr.l_text = l_text
|
jpayne@68
|
252 memcpy(self.ptr.text, ctext, l_text + 1)
|
jpayne@68
|
253
|
jpayne@68
|
254 if reference_names and reference_lengths:
|
jpayne@68
|
255 reference_names = [force_bytes(ref) for ref in reference_names]
|
jpayne@68
|
256
|
jpayne@68
|
257 self.ptr.n_targets = len(reference_names)
|
jpayne@68
|
258
|
jpayne@68
|
259 n = sum([len(reference_names) + 1])
|
jpayne@68
|
260 self.ptr.target_name = <char**>calloc(n, sizeof(char*))
|
jpayne@68
|
261 if self.ptr.target_name == NULL:
|
jpayne@68
|
262 raise MemoryError("could not allocate {} bytes".format(n, sizeof(char *)))
|
jpayne@68
|
263
|
jpayne@68
|
264 self.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
|
jpayne@68
|
265 if self.ptr.target_len == NULL:
|
jpayne@68
|
266 raise MemoryError("could not allocate {} bytes".format(n, sizeof(uint32_t)))
|
jpayne@68
|
267
|
jpayne@68
|
268 for x from 0 <= x < self.ptr.n_targets:
|
jpayne@68
|
269 self.ptr.target_len[x] = reference_lengths[x]
|
jpayne@68
|
270 name = reference_names[x]
|
jpayne@68
|
271 self.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
|
jpayne@68
|
272 if self.ptr.target_name[x] == NULL:
|
jpayne@68
|
273 raise MemoryError("could not allocate {} bytes".format(len(name) + 1, sizeof(char)))
|
jpayne@68
|
274 strncpy(self.ptr.target_name[x], name, len(name))
|
jpayne@68
|
275
|
jpayne@68
|
276 return self
|
jpayne@68
|
277
|
jpayne@68
|
278 @classmethod
|
jpayne@68
|
279 def from_text(cls, text):
|
jpayne@68
|
280
|
jpayne@68
|
281 reference_names, reference_lengths = [], []
|
jpayne@68
|
282 for line in text.splitlines():
|
jpayne@68
|
283 if line.startswith("@SQ"):
|
jpayne@68
|
284 fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
|
jpayne@68
|
285 try:
|
jpayne@68
|
286 reference_names.append(fields["SN"])
|
jpayne@68
|
287 reference_lengths.append(int(fields["LN"]))
|
jpayne@68
|
288 except KeyError:
|
jpayne@68
|
289 raise KeyError("incomplete sequence information in '%s'" % str(fields))
|
jpayne@68
|
290 except ValueError:
|
jpayne@68
|
291 raise ValueError("wrong sequence information in '%s'" % str(fields))
|
jpayne@68
|
292
|
jpayne@68
|
293 return cls._from_text_and_lengths(text, reference_names, reference_lengths)
|
jpayne@68
|
294
|
jpayne@68
|
295 @classmethod
|
jpayne@68
|
296 def from_dict(cls, header_dict):
|
jpayne@68
|
297
|
jpayne@68
|
298 cdef list lines = []
|
jpayne@68
|
299 # first: defined tags
|
jpayne@68
|
300 for record in VALID_HEADERS:
|
jpayne@68
|
301 if record in header_dict:
|
jpayne@68
|
302 data = header_dict[record]
|
jpayne@68
|
303 if not isinstance(data, VALID_HEADER_TYPES[record]):
|
jpayne@68
|
304 raise ValueError(
|
jpayne@68
|
305 "invalid type for record %s: %s, expected %s".format(
|
jpayne@68
|
306 record, type(data), VALID_HEADER_TYPES[record]))
|
jpayne@68
|
307 if isinstance(data, Mapping):
|
jpayne@68
|
308 lines.append(build_header_line(data, record))
|
jpayne@68
|
309 else:
|
jpayne@68
|
310 for fields in header_dict[record]:
|
jpayne@68
|
311 lines.append(build_header_line(fields, record))
|
jpayne@68
|
312
|
jpayne@68
|
313 # then: user tags (lower case), sorted alphabetically
|
jpayne@68
|
314 for record, data in sorted(header_dict.items()):
|
jpayne@68
|
315 if record in VALID_HEADERS:
|
jpayne@68
|
316 continue
|
jpayne@68
|
317 if isinstance(data, Mapping):
|
jpayne@68
|
318 lines.append(build_header_line(data, record))
|
jpayne@68
|
319 else:
|
jpayne@68
|
320 for fields in header_dict[record]:
|
jpayne@68
|
321 lines.append(build_header_line(fields, record))
|
jpayne@68
|
322
|
jpayne@68
|
323 text = "\n".join(lines) + "\n"
|
jpayne@68
|
324
|
jpayne@68
|
325 reference_names, reference_lengths = [], []
|
jpayne@68
|
326 if "SQ" in header_dict:
|
jpayne@68
|
327 for fields in header_dict["SQ"]:
|
jpayne@68
|
328 try:
|
jpayne@68
|
329 reference_names.append(fields["SN"])
|
jpayne@68
|
330 reference_lengths.append(fields["LN"])
|
jpayne@68
|
331 except KeyError:
|
jpayne@68
|
332 raise KeyError("incomplete sequence information in '%s'" % str(fields))
|
jpayne@68
|
333
|
jpayne@68
|
334 return cls._from_text_and_lengths(text, reference_names, reference_lengths)
|
jpayne@68
|
335
|
jpayne@68
|
336 @classmethod
|
jpayne@68
|
337 def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):
|
jpayne@68
|
338
|
jpayne@68
|
339 if len(reference_names) != len(reference_lengths):
|
jpayne@68
|
340 raise ValueError("number of reference names and lengths do not match")
|
jpayne@68
|
341
|
jpayne@68
|
342 # optionally, if there is no text, add a SAM compatible header to output file.
|
jpayne@68
|
343 if text is None and add_sq_text:
|
jpayne@68
|
344 text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
|
jpayne@68
|
345 reference_names, reference_lengths)])
|
jpayne@68
|
346
|
jpayne@68
|
347 return cls._from_text_and_lengths(text, reference_names, reference_lengths)
|
jpayne@68
|
348
|
jpayne@68
|
349 def __dealloc__(self):
|
jpayne@68
|
350 bam_hdr_destroy(self.ptr)
|
jpayne@68
|
351 self.ptr = NULL
|
jpayne@68
|
352
|
jpayne@68
|
353 def __bool__(self):
|
jpayne@68
|
354 return self.ptr != NULL
|
jpayne@68
|
355
|
jpayne@68
|
356 def copy(self):
|
jpayne@68
|
357 return makeAlignmentHeader(bam_hdr_dup(self.ptr))
|
jpayne@68
|
358
|
jpayne@68
|
359 property nreferences:
|
jpayne@68
|
360 """int with the number of :term:`reference` sequences in the file.
|
jpayne@68
|
361
|
jpayne@68
|
362 This is a read-only attribute."""
|
jpayne@68
|
363 def __get__(self):
|
jpayne@68
|
364 return self.ptr.n_targets
|
jpayne@68
|
365
|
jpayne@68
|
366 property references:
|
jpayne@68
|
367 """tuple with the names of :term:`reference` sequences. This is a
|
jpayne@68
|
368 read-only attribute"""
|
jpayne@68
|
369 def __get__(self):
|
jpayne@68
|
370 t = []
|
jpayne@68
|
371 cdef int x
|
jpayne@68
|
372 for x in range(self.ptr.n_targets):
|
jpayne@68
|
373 t.append(charptr_to_str(self.ptr.target_name[x]))
|
jpayne@68
|
374 return tuple(t)
|
jpayne@68
|
375
|
jpayne@68
|
376 property lengths:
|
jpayne@68
|
377 """tuple of the lengths of the :term:`reference` sequences. This is a
|
jpayne@68
|
378 read-only attribute. The lengths are in the same order as
|
jpayne@68
|
379 :attr:`pysam.AlignmentFile.references`
|
jpayne@68
|
380 """
|
jpayne@68
|
381 def __get__(self):
|
jpayne@68
|
382 t = []
|
jpayne@68
|
383 cdef int x
|
jpayne@68
|
384 for x in range(self.ptr.n_targets):
|
jpayne@68
|
385 t.append(self.ptr.target_len[x])
|
jpayne@68
|
386 return tuple(t)
|
jpayne@68
|
387
|
jpayne@68
|
388 def _build_sequence_section(self):
|
jpayne@68
|
389 """return sequence section of header.
|
jpayne@68
|
390
|
jpayne@68
|
391 The sequence section is built from the list of reference names and
|
jpayne@68
|
392 lengths stored in the BAM-file and not from any @SQ entries that
|
jpayne@68
|
393 are part of the header's text section.
|
jpayne@68
|
394 """
|
jpayne@68
|
395
|
jpayne@68
|
396 cdef int x
|
jpayne@68
|
397 text = []
|
jpayne@68
|
398 for x in range(self.ptr.n_targets):
|
jpayne@68
|
399 text.append("@SQ\tSN:{}\tLN:{}\n".format(
|
jpayne@68
|
400 force_str(self.ptr.target_name[x]),
|
jpayne@68
|
401 self.ptr.target_len[x]))
|
jpayne@68
|
402 return "".join(text)
|
jpayne@68
|
403
|
jpayne@68
|
404 def to_dict(self):
|
jpayne@68
|
405 """return two-level dictionary with header information from the file.
|
jpayne@68
|
406
|
jpayne@68
|
407 The first level contains the record (``HD``, ``SQ``, etc) and
|
jpayne@68
|
408 the second level contains the fields (``VN``, ``LN``, etc).
|
jpayne@68
|
409
|
jpayne@68
|
410 The parser is validating and will raise an AssertionError if
|
jpayne@68
|
411 if encounters any record or field tags that are not part of
|
jpayne@68
|
412 the SAM specification. Use the
|
jpayne@68
|
413 :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
|
jpayne@68
|
414 header.
|
jpayne@68
|
415
|
jpayne@68
|
416 The parsing follows the SAM format specification with the
|
jpayne@68
|
417 exception of the ``CL`` field. This option will consume the
|
jpayne@68
|
418 rest of a header line irrespective of any additional fields.
|
jpayne@68
|
419 This behaviour has been added to accommodate command line
|
jpayne@68
|
420 options that contain characters that are not valid field
|
jpayne@68
|
421 separators.
|
jpayne@68
|
422
|
jpayne@68
|
423 If no @SQ entries are within the text section of the header,
|
jpayne@68
|
424 this will be automatically added from the reference names and
|
jpayne@68
|
425 lengths stored in the binary part of the header.
|
jpayne@68
|
426 """
|
jpayne@68
|
427 result = collections.OrderedDict()
|
jpayne@68
|
428
|
jpayne@68
|
429 # convert to python string
|
jpayne@68
|
430 t = self.__str__()
|
jpayne@68
|
431 for line in t.split("\n"):
|
jpayne@68
|
432 line = line.strip(' \0')
|
jpayne@68
|
433 if not line:
|
jpayne@68
|
434 continue
|
jpayne@68
|
435 assert line.startswith("@"), \
|
jpayne@68
|
436 "header line without '@': '%s'" % line
|
jpayne@68
|
437 fields = line[1:].split("\t")
|
jpayne@68
|
438 record = fields[0]
|
jpayne@68
|
439 assert record in VALID_HEADER_TYPES, \
|
jpayne@68
|
440 "header line with invalid type '%s': '%s'" % (record, line)
|
jpayne@68
|
441
|
jpayne@68
|
442 # treat comments
|
jpayne@68
|
443 if record == "CO":
|
jpayne@68
|
444 if record not in result:
|
jpayne@68
|
445 result[record] = []
|
jpayne@68
|
446 result[record].append("\t".join( fields[1:]))
|
jpayne@68
|
447 continue
|
jpayne@68
|
448 # the following is clumsy as generators do not work?
|
jpayne@68
|
449 x = {}
|
jpayne@68
|
450
|
jpayne@68
|
451 for idx, field in enumerate(fields[1:]):
|
jpayne@68
|
452 if ":" not in field:
|
jpayne@68
|
453 raise ValueError("malformatted header: no ':' in field" )
|
jpayne@68
|
454 key, value = field.split(":", 1)
|
jpayne@68
|
455 if key in ("CL",):
|
jpayne@68
|
456 # special treatment for command line
|
jpayne@68
|
457 # statements (CL). These might contain
|
jpayne@68
|
458 # characters that are non-conformant with
|
jpayne@68
|
459 # the valid field separators in the SAM
|
jpayne@68
|
460 # header. Thus, in contravention to the
|
jpayne@68
|
461 # SAM API, consume the rest of the line.
|
jpayne@68
|
462 key, value = "\t".join(fields[idx+1:]).split(":", 1)
|
jpayne@68
|
463 x[key] = KNOWN_HEADER_FIELDS[record][key](value)
|
jpayne@68
|
464 break
|
jpayne@68
|
465
|
jpayne@68
|
466 # interpret type of known header record tags, default to str
|
jpayne@68
|
467 x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
|
jpayne@68
|
468
|
jpayne@68
|
469 if VALID_HEADER_TYPES[record] == Mapping:
|
jpayne@68
|
470 if record in result:
|
jpayne@68
|
471 raise ValueError(
|
jpayne@68
|
472 "multiple '%s' lines are not permitted" % record)
|
jpayne@68
|
473
|
jpayne@68
|
474 result[record] = x
|
jpayne@68
|
475 elif VALID_HEADER_TYPES[record] == Sequence:
|
jpayne@68
|
476 if record not in result: result[record] = []
|
jpayne@68
|
477 result[record].append(x)
|
jpayne@68
|
478
|
jpayne@68
|
479 # if there are no SQ lines in the header, add the
|
jpayne@68
|
480 # reference names from the information in the bam
|
jpayne@68
|
481 # file.
|
jpayne@68
|
482 #
|
jpayne@68
|
483 # Background: c-samtools keeps the textual part of the
|
jpayne@68
|
484 # header separate from the list of reference names and
|
jpayne@68
|
485 # lengths. Thus, if a header contains only SQ lines,
|
jpayne@68
|
486 # the SQ information is not part of the textual header
|
jpayne@68
|
487 # and thus are missing from the output. See issue 84.
|
jpayne@68
|
488 if "SQ" not in result:
|
jpayne@68
|
489 sq = []
|
jpayne@68
|
490 for ref, length in zip(self.references, self.lengths):
|
jpayne@68
|
491 sq.append({'LN': length, 'SN': ref })
|
jpayne@68
|
492 result["SQ"] = sq
|
jpayne@68
|
493
|
jpayne@68
|
494 return result
|
jpayne@68
|
495
|
jpayne@68
|
496 def as_dict(self):
|
jpayne@68
|
497 """deprecated, use :meth:`to_dict()` instead"""
|
jpayne@68
|
498 return self.to_dict()
|
jpayne@68
|
499
|
jpayne@68
|
500 def get_reference_name(self, tid):
|
jpayne@68
|
501 if tid == -1:
|
jpayne@68
|
502 return None
|
jpayne@68
|
503 if not 0 <= tid < self.ptr.n_targets:
|
jpayne@68
|
504 raise ValueError("reference_id %i out of range 0<=tid<%i" %
|
jpayne@68
|
505 (tid, self.ptr.n_targets))
|
jpayne@68
|
506 return charptr_to_str(self.ptr.target_name[tid])
|
jpayne@68
|
507
|
jpayne@68
|
508 def get_reference_length(self, reference):
|
jpayne@68
|
509 cdef int tid = self.get_tid(reference)
|
jpayne@68
|
510 if tid < 0:
|
jpayne@68
|
511 raise KeyError("unknown reference {}".format(reference))
|
jpayne@68
|
512 else:
|
jpayne@68
|
513 return self.ptr.target_len[tid]
|
jpayne@68
|
514
|
jpayne@68
|
515 def is_valid_tid(self, int tid):
|
jpayne@68
|
516 """
|
jpayne@68
|
517 return True if the numerical :term:`tid` is valid; False otherwise.
|
jpayne@68
|
518
|
jpayne@68
|
519 Note that the unmapped tid code (-1) counts as an invalid.
|
jpayne@68
|
520 """
|
jpayne@68
|
521 return 0 <= tid < self.ptr.n_targets
|
jpayne@68
|
522
|
jpayne@68
|
523 def get_tid(self, reference):
|
jpayne@68
|
524 """
|
jpayne@68
|
525 return the numerical :term:`tid` corresponding to
|
jpayne@68
|
526 :term:`reference`
|
jpayne@68
|
527
|
jpayne@68
|
528 returns -1 if reference is not known.
|
jpayne@68
|
529 """
|
jpayne@68
|
530 reference = force_bytes(reference)
|
jpayne@68
|
531 tid = bam_name2id(self.ptr, reference)
|
jpayne@68
|
532 if tid < -1:
|
jpayne@68
|
533 raise ValueError('could not parse header')
|
jpayne@68
|
534 return tid
|
jpayne@68
|
535
|
jpayne@68
|
536 def __str__(self):
|
jpayne@68
|
537 '''string with the full contents of the :term:`sam file` header as a
|
jpayne@68
|
538 string.
|
jpayne@68
|
539
|
jpayne@68
|
540 If no @SQ entries are within the text section of the header,
|
jpayne@68
|
541 this will be automatically added from the reference names and
|
jpayne@68
|
542 lengths stored in the binary part of the header.
|
jpayne@68
|
543
|
jpayne@68
|
544 See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
|
jpayne@68
|
545 representation of the header.
|
jpayne@68
|
546 '''
|
jpayne@68
|
547 text = from_string_and_size(self.ptr.text, self.ptr.l_text)
|
jpayne@68
|
548 if "@SQ" not in text:
|
jpayne@68
|
549 text += "\n" + self._build_sequence_section()
|
jpayne@68
|
550 return text
|
jpayne@68
|
551
|
jpayne@68
|
552 # dictionary access methods, for backwards compatibility.
|
jpayne@68
|
553 def __setitem__(self, key, value):
|
jpayne@68
|
554 raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict()")
|
jpayne@68
|
555
|
jpayne@68
|
556 def __getitem__(self, key):
|
jpayne@68
|
557 return self.to_dict().__getitem__(key)
|
jpayne@68
|
558
|
jpayne@68
|
559 def items(self):
|
jpayne@68
|
560 return self.to_dict().items()
|
jpayne@68
|
561
|
jpayne@68
|
562 # PY2 compatibility
|
jpayne@68
|
563 def iteritems(self):
|
jpayne@68
|
564 return self.to_dict().items()
|
jpayne@68
|
565
|
jpayne@68
|
566 def keys(self):
|
jpayne@68
|
567 return self.to_dict().keys()
|
jpayne@68
|
568
|
jpayne@68
|
569 def values(self):
|
jpayne@68
|
570 return self.to_dict().values()
|
jpayne@68
|
571
|
jpayne@68
|
572 def get(self, *args):
|
jpayne@68
|
573 return self.to_dict().get(*args)
|
jpayne@68
|
574
|
jpayne@68
|
575 def __len__(self):
|
jpayne@68
|
576 return self.to_dict().__len__()
|
jpayne@68
|
577
|
jpayne@68
|
578 def __contains__(self, key):
|
jpayne@68
|
579 return self.to_dict().__contains__(key)
|
jpayne@68
|
580
|
jpayne@68
|
581
|
jpayne@68
|
582 cdef class AlignmentFile(HTSFile):
|
jpayne@68
|
583 """AlignmentFile(filepath_or_object, mode=None, template=None,
|
jpayne@68
|
584 reference_names=None, reference_lengths=None, text=NULL,
|
jpayne@68
|
585 header=None, add_sq_text=False, check_header=True, check_sq=True,
|
jpayne@68
|
586 reference_filename=None, filename=None, index_filename=None,
|
jpayne@68
|
587 filepath_index=None, require_index=False, duplicate_filehandle=True,
|
jpayne@68
|
588 ignore_truncation=False, threads=1)
|
jpayne@68
|
589
|
jpayne@68
|
590 A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.
|
jpayne@68
|
591
|
jpayne@68
|
592 If `filepath_or_object` is a string, the file is automatically
|
jpayne@68
|
593 opened. If `filepath_or_object` is a python File object, the
|
jpayne@68
|
594 already opened file will be used.
|
jpayne@68
|
595
|
jpayne@68
|
596 If the file is opened for reading and an index exists (if file is BAM, a
|
jpayne@68
|
597 .bai file or if CRAM a .crai file), it will be opened automatically.
|
jpayne@68
|
598 `index_filename` may be specified explicitly. If the index is not named
|
jpayne@68
|
599 in the standard manner, not located in the same directory as the
|
jpayne@68
|
600 BAM/CRAM file, or is remote. Without an index, random access via
|
jpayne@68
|
601 :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup`
|
jpayne@68
|
602 is disabled.
|
jpayne@68
|
603
|
jpayne@68
|
604 For writing, the header of a :term:`SAM` file/:term:`BAM` file can
|
jpayne@68
|
605 be constituted from several sources (see also the samtools format
|
jpayne@68
|
606 specification):
|
jpayne@68
|
607
|
jpayne@68
|
608 1. If `template` is given, the header is copied from another
|
jpayne@68
|
609 `AlignmentFile` (`template` must be a
|
jpayne@68
|
610 :class:`~pysam.AlignmentFile`).
|
jpayne@68
|
611
|
jpayne@68
|
612 2. If `header` is given, the header is built from a
|
jpayne@68
|
613 multi-level dictionary.
|
jpayne@68
|
614
|
jpayne@68
|
615 3. If `text` is given, new header text is copied from raw
|
jpayne@68
|
616 text.
|
jpayne@68
|
617
|
jpayne@68
|
618 4. The names (`reference_names`) and lengths
|
jpayne@68
|
619 (`reference_lengths`) are supplied directly as lists.
|
jpayne@68
|
620
|
jpayne@68
|
621 When reading or writing a CRAM file, the filename of a FASTA-formatted
|
jpayne@68
|
622 reference can be specified with `reference_filename`.
|
jpayne@68
|
623
|
jpayne@68
|
624 By default, if a file is opened in mode 'r', it is checked
|
jpayne@68
|
625 for a valid header (`check_header` = True) and a definition of
|
jpayne@68
|
626 chromosome names (`check_sq` = True).
|
jpayne@68
|
627
|
jpayne@68
|
628 Parameters
|
jpayne@68
|
629 ----------
|
jpayne@68
|
630 mode : string
|
jpayne@68
|
631 `mode` should be ``r`` for reading or ``w`` for writing. The
|
jpayne@68
|
632 default is text mode (:term:`SAM`). For binary (:term:`BAM`)
|
jpayne@68
|
633 I/O you should append ``b`` for compressed or ``u`` for
|
jpayne@68
|
634 uncompressed :term:`BAM` output. Use ``h`` to output header
|
jpayne@68
|
635 information in text (:term:`TAM`) mode. Use ``c`` for
|
jpayne@68
|
636 :term:`CRAM` formatted files.
|
jpayne@68
|
637
|
jpayne@68
|
638 If ``b`` is present, it must immediately follow ``r`` or
|
jpayne@68
|
639 ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
|
jpayne@68
|
640 ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
|
jpayne@68
|
641 :term:`BAM` formatted file for reading, type::
|
jpayne@68
|
642
|
jpayne@68
|
643 f = pysam.AlignmentFile('ex1.bam','rb')
|
jpayne@68
|
644
|
jpayne@68
|
645 If mode is not specified, the method will try to auto-detect
|
jpayne@68
|
646 in the order 'rb', 'r', thus both the following should work::
|
jpayne@68
|
647
|
jpayne@68
|
648 f1 = pysam.AlignmentFile('ex1.bam')
|
jpayne@68
|
649 f2 = pysam.AlignmentFile('ex1.sam')
|
jpayne@68
|
650
|
jpayne@68
|
651 template : AlignmentFile
|
jpayne@68
|
652 when writing, copy header from file `template`.
|
jpayne@68
|
653
|
jpayne@68
|
654 header : dict or AlignmentHeader
|
jpayne@68
|
655 when writing, build header from a multi-level dictionary. The
|
jpayne@68
|
656 first level are the four types ('HD', 'SQ', ...). The second
|
jpayne@68
|
657 level are a list of lines, with each line being a list of
|
jpayne@68
|
658 tag-value pairs. The header is constructed first from all the
|
jpayne@68
|
659 defined fields, followed by user tags in alphabetical
|
jpayne@68
|
660 order. Alternatively, an :class:`~pysam.AlignmentHeader`
|
jpayne@68
|
661 object can be passed directly.
|
jpayne@68
|
662
|
jpayne@68
|
663 text : string
|
jpayne@68
|
664 when writing, use the string provided as the header
|
jpayne@68
|
665
|
jpayne@68
|
666 reference_names : list
|
jpayne@68
|
667 see reference_lengths
|
jpayne@68
|
668
|
jpayne@68
|
669 reference_lengths : list
|
jpayne@68
|
670 when writing or opening a SAM file without header build header
|
jpayne@68
|
671 from list of chromosome names and lengths. By default, 'SQ'
|
jpayne@68
|
672 and 'LN' tags will be added to the header text. This option
|
jpayne@68
|
673 can be changed by unsetting the flag `add_sq_text`.
|
jpayne@68
|
674
|
jpayne@68
|
675 add_sq_text : bool
|
jpayne@68
|
676 do not add 'SQ' and 'LN' tags to header. This option permits
|
jpayne@68
|
677 construction :term:`SAM` formatted files without a header.
|
jpayne@68
|
678
|
jpayne@68
|
679 add_sam_header : bool
|
jpayne@68
|
680 when outputting SAM the default is to output a header. This is
|
jpayne@68
|
681 equivalent to opening the file in 'wh' mode. If this option is
|
jpayne@68
|
682 set to False, no header will be output. To read such a file,
|
jpayne@68
|
683 set `check_header=False`.
|
jpayne@68
|
684
|
jpayne@68
|
685 check_header : bool
|
jpayne@68
|
686 obsolete: when reading a SAM file, check if header is present
|
jpayne@68
|
687 (default=True)
|
jpayne@68
|
688
|
jpayne@68
|
689 check_sq : bool
|
jpayne@68
|
690 when reading, check if SQ entries are present in header
|
jpayne@68
|
691 (default=True)
|
jpayne@68
|
692
|
jpayne@68
|
693 reference_filename : string
|
jpayne@68
|
694 Path to a FASTA-formatted reference file. Valid only for CRAM files.
|
jpayne@68
|
695 When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
|
jpayne@68
|
696 specified in the header (``UR`` tag), which are normally used to find
|
jpayne@68
|
697 the reference.
|
jpayne@68
|
698
|
jpayne@68
|
699 index_filename : string
|
jpayne@68
|
700 Explicit path to the index file. Only needed if the index is not
|
jpayne@68
|
701 named in the standard manner, not located in the same directory as
|
jpayne@68
|
702 the BAM/CRAM file, or is remote. An IOError is raised if the index
|
jpayne@68
|
703 cannot be found or is invalid.
|
jpayne@68
|
704
|
jpayne@68
|
705 filepath_index : string
|
jpayne@68
|
706 Alias for `index_filename`.
|
jpayne@68
|
707
|
jpayne@68
|
708 require_index : bool
|
jpayne@68
|
709 When reading, require that an index file is present and is valid or
|
jpayne@68
|
710 raise an IOError. (default=False)
|
jpayne@68
|
711
|
jpayne@68
|
712 filename : string
|
jpayne@68
|
713 Alternative to filepath_or_object. Filename of the file
|
jpayne@68
|
714 to be opened.
|
jpayne@68
|
715
|
jpayne@68
|
716 duplicate_filehandle: bool
|
jpayne@68
|
717 By default, file handles passed either directly or through
|
jpayne@68
|
718 File-like objects will be duplicated before passing them to
|
jpayne@68
|
719 htslib. The duplication prevents issues where the same stream
|
jpayne@68
|
720 will be closed by htslib and through destruction of the
|
jpayne@68
|
721 high-level python object. Set to False to turn off
|
jpayne@68
|
722 duplication.
|
jpayne@68
|
723
|
jpayne@68
|
724 ignore_truncation: bool
|
jpayne@68
|
725 Issue a warning, instead of raising an error if the current file
|
jpayne@68
|
726 appears to be truncated due to a missing EOF marker. Only applies
|
jpayne@68
|
727 to bgzipped formats. (Default=False)
|
jpayne@68
|
728
|
jpayne@68
|
729 format_options: list
|
jpayne@68
|
730 A list of key=value strings, as accepted by --input-fmt-option and
|
jpayne@68
|
731 --output-fmt-option in samtools.
|
jpayne@68
|
732 threads: integer
|
jpayne@68
|
733 Number of threads to use for compressing/decompressing BAM/CRAM files.
|
jpayne@68
|
734 Setting threads to > 1 cannot be combined with `ignore_truncation`.
|
jpayne@68
|
735 (Default=1)
|
jpayne@68
|
736 """
|
jpayne@68
|
737
|
jpayne@68
|
738 def __cinit__(self, *args, **kwargs):
|
jpayne@68
|
739 self.htsfile = NULL
|
jpayne@68
|
740 self.filename = None
|
jpayne@68
|
741 self.mode = None
|
jpayne@68
|
742 self.threads = 1
|
jpayne@68
|
743 self.is_stream = False
|
jpayne@68
|
744 self.is_remote = False
|
jpayne@68
|
745 self.index = NULL
|
jpayne@68
|
746
|
jpayne@68
|
747 if "filename" in kwargs:
|
jpayne@68
|
748 args = [kwargs["filename"]]
|
jpayne@68
|
749 del kwargs["filename"]
|
jpayne@68
|
750
|
jpayne@68
|
751 self._open(*args, **kwargs)
|
jpayne@68
|
752
|
jpayne@68
|
753 # allocate memory for iterator
|
jpayne@68
|
754 self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
|
jpayne@68
|
755 if self.b == NULL:
|
jpayne@68
|
756 raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t)))
|
jpayne@68
|
757
|
jpayne@68
|
758 def has_index(self):
|
jpayne@68
|
759 """return true if htsfile has an existing (and opened) index.
|
jpayne@68
|
760 """
|
jpayne@68
|
761 return self.index != NULL
|
jpayne@68
|
762
|
jpayne@68
|
763 def check_index(self):
|
jpayne@68
|
764 """return True if index is present.
|
jpayne@68
|
765
|
jpayne@68
|
766 Raises
|
jpayne@68
|
767 ------
|
jpayne@68
|
768
|
jpayne@68
|
769 AttributeError
|
jpayne@68
|
770 if htsfile is :term:`SAM` formatted and thus has no index.
|
jpayne@68
|
771
|
jpayne@68
|
772 ValueError
|
jpayne@68
|
773 if htsfile is closed or index could not be opened.
|
jpayne@68
|
774 """
|
jpayne@68
|
775
|
jpayne@68
|
776 if not self.is_open:
|
jpayne@68
|
777 raise ValueError("I/O operation on closed file")
|
jpayne@68
|
778 if not self.is_bam and not self.is_cram:
|
jpayne@68
|
779 raise AttributeError(
|
jpayne@68
|
780 "AlignmentFile.mapped only available in bam files")
|
jpayne@68
|
781 if self.index == NULL:
|
jpayne@68
|
782 raise ValueError(
|
jpayne@68
|
783 "mapping information not recorded in index "
|
jpayne@68
|
784 "or index not available")
|
jpayne@68
|
785 return True
|
jpayne@68
|
786
|
jpayne@68
|
787 def _open(self,
|
jpayne@68
|
788 filepath_or_object,
|
jpayne@68
|
789 mode=None,
|
jpayne@68
|
790 AlignmentFile template=None,
|
jpayne@68
|
791 reference_names=None,
|
jpayne@68
|
792 reference_lengths=None,
|
jpayne@68
|
793 reference_filename=None,
|
jpayne@68
|
794 text=None,
|
jpayne@68
|
795 header=None,
|
jpayne@68
|
796 port=None,
|
jpayne@68
|
797 add_sq_text=True,
|
jpayne@68
|
798 add_sam_header=True,
|
jpayne@68
|
799 check_header=True,
|
jpayne@68
|
800 check_sq=True,
|
jpayne@68
|
801 index_filename=None,
|
jpayne@68
|
802 filepath_index=None,
|
jpayne@68
|
803 require_index=False,
|
jpayne@68
|
804 referencenames=None,
|
jpayne@68
|
805 referencelengths=None,
|
jpayne@68
|
806 duplicate_filehandle=True,
|
jpayne@68
|
807 ignore_truncation=False,
|
jpayne@68
|
808 format_options=None,
|
jpayne@68
|
809 threads=1):
|
jpayne@68
|
810 '''open a sam, bam or cram formatted file.
|
jpayne@68
|
811
|
jpayne@68
|
812 If _open is called on an existing file, the current file
|
jpayne@68
|
813 will be closed and a new file will be opened.
|
jpayne@68
|
814
|
jpayne@68
|
815 '''
|
jpayne@68
|
816 cdef char *cfilename = NULL
|
jpayne@68
|
817 cdef char *creference_filename = NULL
|
jpayne@68
|
818 cdef char *cindexname = NULL
|
jpayne@68
|
819 cdef char *cmode = NULL
|
jpayne@68
|
820 cdef bam_hdr_t * hdr = NULL
|
jpayne@68
|
821
|
jpayne@68
|
822 if threads > 1 and ignore_truncation:
|
jpayne@68
|
823 # This won't raise errors if reaching a truncated alignment,
|
jpayne@68
|
824 # because bgzf_mt_reader in htslib does not deal with
|
jpayne@68
|
825 # bgzf_mt_read_block returning non-zero values, contrary
|
jpayne@68
|
826 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
|
jpayne@68
|
827 # Better to avoid this (for now) than to produce seemingly correct results.
|
jpayne@68
|
828 raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
|
jpayne@68
|
829 self.threads = threads
|
jpayne@68
|
830
|
jpayne@68
|
831 # for backwards compatibility:
|
jpayne@68
|
832 if referencenames is not None:
|
jpayne@68
|
833 reference_names = referencenames
|
jpayne@68
|
834 if referencelengths is not None:
|
jpayne@68
|
835 reference_lengths = referencelengths
|
jpayne@68
|
836
|
jpayne@68
|
837 # close a previously opened file
|
jpayne@68
|
838 if self.is_open:
|
jpayne@68
|
839 self.close()
|
jpayne@68
|
840
|
jpayne@68
|
841 # autodetection for read
|
jpayne@68
|
842 if mode is None:
|
jpayne@68
|
843 mode = "r"
|
jpayne@68
|
844
|
jpayne@68
|
845 if add_sam_header and mode == "w":
|
jpayne@68
|
846 mode = "wh"
|
jpayne@68
|
847
|
jpayne@68
|
848 assert mode in ("r", "w", "rb", "wb", "wh",
|
jpayne@68
|
849 "wbu", "rU", "wb0",
|
jpayne@68
|
850 "rc", "wc"), \
|
jpayne@68
|
851 "invalid file opening mode `%s`" % mode
|
jpayne@68
|
852
|
jpayne@68
|
853 self.duplicate_filehandle = duplicate_filehandle
|
jpayne@68
|
854
|
jpayne@68
|
855 # StringIO not supported
|
jpayne@68
|
856 if isinstance(filepath_or_object, StringIO):
|
jpayne@68
|
857 raise NotImplementedError(
|
jpayne@68
|
858 "access from StringIO objects not supported")
|
jpayne@68
|
859 # reading from a file descriptor
|
jpayne@68
|
860 elif isinstance(filepath_or_object, int):
|
jpayne@68
|
861 self.filename = filepath_or_object
|
jpayne@68
|
862 filename = None
|
jpayne@68
|
863 self.is_remote = False
|
jpayne@68
|
864 self.is_stream = True
|
jpayne@68
|
865 # reading from a File object or other object with fileno
|
jpayne@68
|
866 elif hasattr(filepath_or_object, "fileno"):
|
jpayne@68
|
867 if filepath_or_object.closed:
|
jpayne@68
|
868 raise ValueError('I/O operation on closed file')
|
jpayne@68
|
869 self.filename = filepath_or_object
|
jpayne@68
|
870 # .name can be TextIOWrapper
|
jpayne@68
|
871 try:
|
jpayne@68
|
872 filename = encode_filename(str(filepath_or_object.name))
|
jpayne@68
|
873 cfilename = filename
|
jpayne@68
|
874 except AttributeError:
|
jpayne@68
|
875 filename = None
|
jpayne@68
|
876 self.is_remote = False
|
jpayne@68
|
877 self.is_stream = True
|
jpayne@68
|
878 # what remains is a filename
|
jpayne@68
|
879 else:
|
jpayne@68
|
880 self.filename = filename = encode_filename(filepath_or_object)
|
jpayne@68
|
881 cfilename = filename
|
jpayne@68
|
882 self.is_remote = hisremote(cfilename)
|
jpayne@68
|
883 self.is_stream = self.filename == b'-'
|
jpayne@68
|
884
|
jpayne@68
|
885 # for htslib, wbu seems to not work
|
jpayne@68
|
886 if mode == "wbu":
|
jpayne@68
|
887 mode = "wb0"
|
jpayne@68
|
888
|
jpayne@68
|
889 self.mode = force_bytes(mode)
|
jpayne@68
|
890 self.reference_filename = reference_filename = encode_filename(
|
jpayne@68
|
891 reference_filename)
|
jpayne@68
|
892
|
jpayne@68
|
893 if mode[0] == 'w':
|
jpayne@68
|
894 # open file for writing
|
jpayne@68
|
895
|
jpayne@68
|
896 if not (template or header or text or (reference_names and reference_lengths)):
|
jpayne@68
|
897 raise ValueError(
|
jpayne@68
|
898 "either supply options `template`, `header`, `text` or both `reference_names` "
|
jpayne@68
|
899 "and `reference_lengths` for writing")
|
jpayne@68
|
900
|
jpayne@68
|
901 if template:
|
jpayne@68
|
902 # header is copied, though at the moment not strictly
|
jpayne@68
|
903 # necessary as AlignmentHeader is immutable.
|
jpayne@68
|
904 self.header = template.header.copy()
|
jpayne@68
|
905 elif isinstance(header, AlignmentHeader):
|
jpayne@68
|
906 self.header = header.copy()
|
jpayne@68
|
907 elif isinstance(header, Mapping):
|
jpayne@68
|
908 self.header = AlignmentHeader.from_dict(header)
|
jpayne@68
|
909 elif reference_names and reference_lengths:
|
jpayne@68
|
910 self.header = AlignmentHeader.from_references(
|
jpayne@68
|
911 reference_names,
|
jpayne@68
|
912 reference_lengths,
|
jpayne@68
|
913 add_sq_text=add_sq_text,
|
jpayne@68
|
914 text=text)
|
jpayne@68
|
915 elif text:
|
jpayne@68
|
916 self.header = AlignmentHeader.from_text(text)
|
jpayne@68
|
917 else:
|
jpayne@68
|
918 raise ValueError("not enough information to construct header. Please provide template, "
|
jpayne@68
|
919 "header, text or reference_names/reference_lengths")
|
jpayne@68
|
920 self.htsfile = self._open_htsfile()
|
jpayne@68
|
921
|
jpayne@68
|
922 if self.htsfile == NULL:
|
jpayne@68
|
923 if errno:
|
jpayne@68
|
924 raise IOError(errno, "could not open alignment file `{}`: {}".format(
|
jpayne@68
|
925 force_str(filename),
|
jpayne@68
|
926 force_str(strerror(errno))))
|
jpayne@68
|
927 else:
|
jpayne@68
|
928 raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
|
jpayne@68
|
929 if format_options and len(format_options):
|
jpayne@68
|
930 self.add_hts_options(format_options)
|
jpayne@68
|
931 # set filename with reference sequences. If no filename
|
jpayne@68
|
932 # is given, the CRAM reference arrays will be built from
|
jpayne@68
|
933 # the @SQ header in the header
|
jpayne@68
|
934 if "c" in mode and reference_filename:
|
jpayne@68
|
935 if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0):
|
jpayne@68
|
936 raise ValueError("failure when setting reference filename")
|
jpayne@68
|
937
|
jpayne@68
|
938 # write header to htsfile
|
jpayne@68
|
939 if "b" in mode or "c" in mode or "h" in mode:
|
jpayne@68
|
940 hdr = self.header.ptr
|
jpayne@68
|
941 with nogil:
|
jpayne@68
|
942 sam_hdr_write(self.htsfile, hdr)
|
jpayne@68
|
943
|
jpayne@68
|
944 elif mode[0] == "r":
|
jpayne@68
|
945 # open file for reading
|
jpayne@68
|
946 self.htsfile = self._open_htsfile()
|
jpayne@68
|
947
|
jpayne@68
|
948 if self.htsfile == NULL:
|
jpayne@68
|
949 if errno:
|
jpayne@68
|
950 raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
|
jpayne@68
|
951 force_str(strerror(errno))))
|
jpayne@68
|
952 else:
|
jpayne@68
|
953 raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
|
jpayne@68
|
954
|
jpayne@68
|
955 if self.htsfile.format.category != sequence_data:
|
jpayne@68
|
956 raise ValueError("file does not contain alignment data")
|
jpayne@68
|
957
|
jpayne@68
|
958 if format_options and len(format_options):
|
jpayne@68
|
959 self.add_hts_options(format_options)
|
jpayne@68
|
960
|
jpayne@68
|
961 self.check_truncation(ignore_truncation)
|
jpayne@68
|
962
|
jpayne@68
|
963 # bam/cram files require a valid header
|
jpayne@68
|
964 if self.is_bam or self.is_cram:
|
jpayne@68
|
965 with nogil:
|
jpayne@68
|
966 hdr = sam_hdr_read(self.htsfile)
|
jpayne@68
|
967 if hdr == NULL:
|
jpayne@68
|
968 raise ValueError(
|
jpayne@68
|
969 "file does not have a valid header (mode='%s') "
|
jpayne@68
|
970 "- is it BAM/CRAM format?" % mode)
|
jpayne@68
|
971 self.header = makeAlignmentHeader(hdr)
|
jpayne@68
|
972 else:
|
jpayne@68
|
973 # in sam files a header is optional. If not given,
|
jpayne@68
|
974 # user may provide reference names and lengths to built
|
jpayne@68
|
975 # an on-the-fly header.
|
jpayne@68
|
976 if reference_names and reference_lengths:
|
jpayne@68
|
977 # build header from a target names and lengths
|
jpayne@68
|
978 self.header = AlignmentHeader.from_references(
|
jpayne@68
|
979 reference_names=reference_names,
|
jpayne@68
|
980 reference_lengths=reference_lengths,
|
jpayne@68
|
981 add_sq_text=add_sq_text,
|
jpayne@68
|
982 text=text)
|
jpayne@68
|
983 else:
|
jpayne@68
|
984 with nogil:
|
jpayne@68
|
985 hdr = sam_hdr_read(self.htsfile)
|
jpayne@68
|
986 if hdr == NULL:
|
jpayne@68
|
987 raise ValueError(
|
jpayne@68
|
988 "SAM? file does not have a valid header (mode='%s'), "
|
jpayne@68
|
989 "please provide reference_names and reference_lengths")
|
jpayne@68
|
990 self.header = makeAlignmentHeader(hdr)
|
jpayne@68
|
991
|
jpayne@68
|
992 # set filename with reference sequences
|
jpayne@68
|
993 if self.is_cram and reference_filename:
|
jpayne@68
|
994 creference_filename = self.reference_filename
|
jpayne@68
|
995 hts_set_opt(self.htsfile,
|
jpayne@68
|
996 CRAM_OPT_REFERENCE,
|
jpayne@68
|
997 creference_filename)
|
jpayne@68
|
998
|
jpayne@68
|
999 if check_sq and self.header.nreferences == 0:
|
jpayne@68
|
1000 raise ValueError(
|
jpayne@68
|
1001 ("file has no sequences defined (mode='%s') - "
|
jpayne@68
|
1002 "is it SAM/BAM format? Consider opening with "
|
jpayne@68
|
1003 "check_sq=False") % mode)
|
jpayne@68
|
1004
|
jpayne@68
|
1005 if self.is_bam or self.is_cram:
|
jpayne@68
|
1006 self.index_filename = index_filename or filepath_index
|
jpayne@68
|
1007 if self.index_filename:
|
jpayne@68
|
1008 cindexname = bfile_name = encode_filename(self.index_filename)
|
jpayne@68
|
1009
|
jpayne@68
|
1010 if cfilename or cindexname:
|
jpayne@68
|
1011 with nogil:
|
jpayne@68
|
1012 self.index = sam_index_load3(self.htsfile, cfilename, cindexname,
|
jpayne@68
|
1013 HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL)
|
jpayne@68
|
1014
|
jpayne@68
|
1015 if not self.index and (cindexname or require_index):
|
jpayne@68
|
1016 if errno:
|
jpayne@68
|
1017 raise IOError(errno, force_str(strerror(errno)))
|
jpayne@68
|
1018 else:
|
jpayne@68
|
1019 raise IOError('unable to open index file `%s`' % self.index_filename)
|
jpayne@68
|
1020
|
jpayne@68
|
1021 elif require_index:
|
jpayne@68
|
1022 raise IOError('unable to open index file')
|
jpayne@68
|
1023
|
jpayne@68
|
1024 # save start of data section
|
jpayne@68
|
1025 if not self.is_stream:
|
jpayne@68
|
1026 self.start_offset = self.tell()
|
jpayne@68
|
1027
|
jpayne@68
|
1028 def fetch(self,
|
jpayne@68
|
1029 contig=None,
|
jpayne@68
|
1030 start=None,
|
jpayne@68
|
1031 stop=None,
|
jpayne@68
|
1032 region=None,
|
jpayne@68
|
1033 tid=None,
|
jpayne@68
|
1034 until_eof=False,
|
jpayne@68
|
1035 multiple_iterators=False,
|
jpayne@68
|
1036 reference=None,
|
jpayne@68
|
1037 end=None):
|
jpayne@68
|
1038 """fetch reads aligned in a :term:`region`.
|
jpayne@68
|
1039
|
jpayne@68
|
1040 See :meth:`~pysam.HTSFile.parse_region` for more information
|
jpayne@68
|
1041 on how genomic regions can be specified. :term:`reference` and
|
jpayne@68
|
1042 `end` are also accepted for backward compatibility as synonyms
|
jpayne@68
|
1043 for :term:`contig` and `stop`, respectively.
|
jpayne@68
|
1044
|
jpayne@68
|
1045 Without a `contig` or `region` all mapped reads in the file
|
jpayne@68
|
1046 will be fetched. The reads will be returned ordered by reference
|
jpayne@68
|
1047 sequence, which will not necessarily be the order within the
|
jpayne@68
|
1048 file. This mode of iteration still requires an index. If there is
|
jpayne@68
|
1049 no index, use `until_eof=True`.
|
jpayne@68
|
1050
|
jpayne@68
|
1051 If only `contig` is set, all reads aligned to `contig`
|
jpayne@68
|
1052 will be fetched.
|
jpayne@68
|
1053
|
jpayne@68
|
1054 A :term:`SAM` file does not allow random access. If `region`
|
jpayne@68
|
1055 or `contig` are given, an exception is raised.
|
jpayne@68
|
1056
|
jpayne@68
|
1057 Parameters
|
jpayne@68
|
1058 ----------
|
jpayne@68
|
1059
|
jpayne@68
|
1060 until_eof : bool
|
jpayne@68
|
1061
|
jpayne@68
|
1062 If `until_eof` is True, all reads from the current file
|
jpayne@68
|
1063 position will be returned in order as they are within the
|
jpayne@68
|
1064 file. Using this option will also fetch unmapped reads.
|
jpayne@68
|
1065
|
jpayne@68
|
1066 multiple_iterators : bool
|
jpayne@68
|
1067
|
jpayne@68
|
1068 If `multiple_iterators` is True, multiple
|
jpayne@68
|
1069 iterators on the same file can be used at the same time. The
|
jpayne@68
|
1070 iterator returned will receive its own copy of a filehandle to
|
jpayne@68
|
1071 the file effectively re-opening the file. Re-opening a file
|
jpayne@68
|
1072 creates some overhead, so beware.
|
jpayne@68
|
1073
|
jpayne@68
|
1074 Returns
|
jpayne@68
|
1075 -------
|
jpayne@68
|
1076
|
jpayne@68
|
1077 An iterator over a collection of reads. : IteratorRow
|
jpayne@68
|
1078
|
jpayne@68
|
1079 Raises
|
jpayne@68
|
1080 ------
|
jpayne@68
|
1081
|
jpayne@68
|
1082 ValueError
|
jpayne@68
|
1083 if the genomic coordinates are out of range or invalid or the
|
jpayne@68
|
1084 file does not permit random access to genomic coordinates.
|
jpayne@68
|
1085
|
jpayne@68
|
1086 """
|
jpayne@68
|
1087 cdef int rtid, rstart, rstop, has_coord
|
jpayne@68
|
1088
|
jpayne@68
|
1089 if not self.is_open:
|
jpayne@68
|
1090 raise ValueError( "I/O operation on closed file" )
|
jpayne@68
|
1091
|
jpayne@68
|
1092 has_coord, rtid, rstart, rstop = self.parse_region(
|
jpayne@68
|
1093 contig, start, stop, region, tid,
|
jpayne@68
|
1094 end=end, reference=reference)
|
jpayne@68
|
1095
|
jpayne@68
|
1096 # Turn of re-opening if htsfile is a stream
|
jpayne@68
|
1097 if self.is_stream:
|
jpayne@68
|
1098 multiple_iterators = False
|
jpayne@68
|
1099
|
jpayne@68
|
1100 if self.is_bam or self.is_cram:
|
jpayne@68
|
1101 if not until_eof and not self.is_remote:
|
jpayne@68
|
1102 if not self.has_index():
|
jpayne@68
|
1103 raise ValueError(
|
jpayne@68
|
1104 "fetch called on bamfile without index")
|
jpayne@68
|
1105
|
jpayne@68
|
1106 if has_coord:
|
jpayne@68
|
1107 return IteratorRowRegion(
|
jpayne@68
|
1108 self, rtid, rstart, rstop,
|
jpayne@68
|
1109 multiple_iterators=multiple_iterators)
|
jpayne@68
|
1110 else:
|
jpayne@68
|
1111 if until_eof:
|
jpayne@68
|
1112 return IteratorRowAll(
|
jpayne@68
|
1113 self,
|
jpayne@68
|
1114 multiple_iterators=multiple_iterators)
|
jpayne@68
|
1115 else:
|
jpayne@68
|
1116 # AH: check - reason why no multiple_iterators for
|
jpayne@68
|
1117 # AllRefs?
|
jpayne@68
|
1118 return IteratorRowAllRefs(
|
jpayne@68
|
1119 self,
|
jpayne@68
|
1120 multiple_iterators=multiple_iterators)
|
jpayne@68
|
1121 else:
|
jpayne@68
|
1122 if has_coord:
|
jpayne@68
|
1123 raise ValueError(
|
jpayne@68
|
1124 "fetching by region is not available for SAM files")
|
jpayne@68
|
1125
|
jpayne@68
|
1126 if multiple_iterators == True:
|
jpayne@68
|
1127 raise ValueError(
|
jpayne@68
|
1128 "multiple iterators not implemented for SAM files")
|
jpayne@68
|
1129
|
jpayne@68
|
1130 return IteratorRowAll(self,
|
jpayne@68
|
1131 multiple_iterators=multiple_iterators)
|
jpayne@68
|
1132
|
jpayne@68
|
1133 def head(self, n, multiple_iterators=True):
|
jpayne@68
|
1134 '''return an iterator over the first n alignments.
|
jpayne@68
|
1135
|
jpayne@68
|
1136 This iterator is is useful for inspecting the bam-file.
|
jpayne@68
|
1137
|
jpayne@68
|
1138 Parameters
|
jpayne@68
|
1139 ----------
|
jpayne@68
|
1140
|
jpayne@68
|
1141 multiple_iterators : bool
|
jpayne@68
|
1142
|
jpayne@68
|
1143 is set to True by default in order to
|
jpayne@68
|
1144 avoid changing the current file position.
|
jpayne@68
|
1145
|
jpayne@68
|
1146 Returns
|
jpayne@68
|
1147 -------
|
jpayne@68
|
1148
|
jpayne@68
|
1149 an iterator over a collection of reads : IteratorRowHead
|
jpayne@68
|
1150
|
jpayne@68
|
1151 '''
|
jpayne@68
|
1152 return IteratorRowHead(self, n,
|
jpayne@68
|
1153 multiple_iterators=multiple_iterators)
|
jpayne@68
|
1154
|
jpayne@68
|
1155 def mate(self, AlignedSegment read):
|
jpayne@68
|
1156 '''return the mate of :class:`pysam.AlignedSegment` `read`.
|
jpayne@68
|
1157
|
jpayne@68
|
1158 .. note::
|
jpayne@68
|
1159
|
jpayne@68
|
1160 Calling this method will change the file position.
|
jpayne@68
|
1161 This might interfere with any iterators that have
|
jpayne@68
|
1162 not re-opened the file.
|
jpayne@68
|
1163
|
jpayne@68
|
1164 .. note::
|
jpayne@68
|
1165
|
jpayne@68
|
1166 This method is too slow for high-throughput processing.
|
jpayne@68
|
1167 If a read needs to be processed with its mate, work
|
jpayne@68
|
1168 from a read name sorted file or, better, cache reads.
|
jpayne@68
|
1169
|
jpayne@68
|
1170 Returns
|
jpayne@68
|
1171 -------
|
jpayne@68
|
1172
|
jpayne@68
|
1173 the mate : AlignedSegment
|
jpayne@68
|
1174
|
jpayne@68
|
1175 Raises
|
jpayne@68
|
1176 ------
|
jpayne@68
|
1177
|
jpayne@68
|
1178 ValueError
|
jpayne@68
|
1179 if the read is unpaired or the mate is unmapped
|
jpayne@68
|
1180
|
jpayne@68
|
1181 '''
|
jpayne@68
|
1182 cdef uint32_t flag = read._delegate.core.flag
|
jpayne@68
|
1183
|
jpayne@68
|
1184 if flag & BAM_FPAIRED == 0:
|
jpayne@68
|
1185 raise ValueError("read %s: is unpaired" %
|
jpayne@68
|
1186 (read.query_name))
|
jpayne@68
|
1187 if flag & BAM_FMUNMAP != 0:
|
jpayne@68
|
1188 raise ValueError("mate %s: is unmapped" %
|
jpayne@68
|
1189 (read.query_name))
|
jpayne@68
|
1190
|
jpayne@68
|
1191 # xor flags to get the other mate
|
jpayne@68
|
1192 cdef int x = BAM_FREAD1 + BAM_FREAD2
|
jpayne@68
|
1193 flag = (flag ^ x) & x
|
jpayne@68
|
1194
|
jpayne@68
|
1195 # Make sure to use a separate file to jump around
|
jpayne@68
|
1196 # to mate as otherwise the original file position
|
jpayne@68
|
1197 # will be lost
|
jpayne@68
|
1198 # The following code is not using the C API and
|
jpayne@68
|
1199 # could thus be made much quicker, for example
|
jpayne@68
|
1200 # by using tell and seek.
|
jpayne@68
|
1201 for mate in self.fetch(
|
jpayne@68
|
1202 read._delegate.core.mpos,
|
jpayne@68
|
1203 read._delegate.core.mpos + 1,
|
jpayne@68
|
1204 tid=read._delegate.core.mtid,
|
jpayne@68
|
1205 multiple_iterators=True):
|
jpayne@68
|
1206 if mate.flag & flag != 0 and \
|
jpayne@68
|
1207 mate.query_name == read.query_name:
|
jpayne@68
|
1208 break
|
jpayne@68
|
1209 else:
|
jpayne@68
|
1210 raise ValueError("mate not found")
|
jpayne@68
|
1211
|
jpayne@68
|
1212 return mate
|
jpayne@68
|
1213
|
jpayne@68
|
1214 def pileup(self,
|
jpayne@68
|
1215 contig=None,
|
jpayne@68
|
1216 start=None,
|
jpayne@68
|
1217 stop=None,
|
jpayne@68
|
1218 region=None,
|
jpayne@68
|
1219 reference=None,
|
jpayne@68
|
1220 end=None,
|
jpayne@68
|
1221 **kwargs):
|
jpayne@68
|
1222 """perform a :term:`pileup` within a :term:`region`. The region is
|
jpayne@68
|
1223 specified by :term:`contig`, `start` and `stop` (using
|
jpayne@68
|
1224 0-based indexing). :term:`reference` and `end` are also accepted for
|
jpayne@68
|
1225 backward compatibility as synonyms for :term:`contig` and `stop`,
|
jpayne@68
|
1226 respectively. Alternatively, a samtools 'region' string
|
jpayne@68
|
1227 can be supplied.
|
jpayne@68
|
1228
|
jpayne@68
|
1229 Without 'contig' or 'region' all reads will be used for the
|
jpayne@68
|
1230 pileup. The reads will be returned ordered by
|
jpayne@68
|
1231 :term:`contig` sequence, which will not necessarily be the
|
jpayne@68
|
1232 order within the file.
|
jpayne@68
|
1233
|
jpayne@68
|
1234 Note that :term:`SAM` formatted files do not allow random
|
jpayne@68
|
1235 access. In these files, if a 'region' or 'contig' are
|
jpayne@68
|
1236 given an exception is raised.
|
jpayne@68
|
1237
|
jpayne@68
|
1238 .. note::
|
jpayne@68
|
1239
|
jpayne@68
|
1240 'all' reads which overlap the region are returned. The
|
jpayne@68
|
1241 first base returned will be the first base of the first
|
jpayne@68
|
1242 read 'not' necessarily the first base of the region used
|
jpayne@68
|
1243 in the query.
|
jpayne@68
|
1244
|
jpayne@68
|
1245 Parameters
|
jpayne@68
|
1246 ----------
|
jpayne@68
|
1247
|
jpayne@68
|
1248 truncate : bool
|
jpayne@68
|
1249
|
jpayne@68
|
1250 By default, the samtools pileup engine outputs all reads
|
jpayne@68
|
1251 overlapping a region. If truncate is True and a region is
|
jpayne@68
|
1252 given, only columns in the exact region specified are
|
jpayne@68
|
1253 returned.
|
jpayne@68
|
1254
|
jpayne@68
|
1255 max_depth : int
|
jpayne@68
|
1256 Maximum read depth permitted. The default limit is '8000'.
|
jpayne@68
|
1257
|
jpayne@68
|
1258 stepper : string
|
jpayne@68
|
1259 The stepper controls how the iterator advances.
|
jpayne@68
|
1260 Possible options for the stepper are
|
jpayne@68
|
1261
|
jpayne@68
|
1262 ``all``
|
jpayne@68
|
1263 skip reads in which any of the following flags are set:
|
jpayne@68
|
1264 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
|
jpayne@68
|
1265
|
jpayne@68
|
1266 ``nofilter``
|
jpayne@68
|
1267 uses every single read turning off any filtering.
|
jpayne@68
|
1268
|
jpayne@68
|
1269 ``samtools``
|
jpayne@68
|
1270 same filter and read processing as in samtools
|
jpayne@68
|
1271 pileup. For full compatibility, this requires a
|
jpayne@68
|
1272 'fastafile' to be given. The following options all pertain
|
jpayne@68
|
1273 to filtering of the ``samtools`` stepper.
|
jpayne@68
|
1274
|
jpayne@68
|
1275 fastafile : :class:`~pysam.FastaFile` object.
|
jpayne@68
|
1276
|
jpayne@68
|
1277 This is required for some of the steppers.
|
jpayne@68
|
1278
|
jpayne@68
|
1279 ignore_overlaps: bool
|
jpayne@68
|
1280
|
jpayne@68
|
1281 If set to True, detect if read pairs overlap and only take
|
jpayne@68
|
1282 the higher quality base. This is the default.
|
jpayne@68
|
1283
|
jpayne@68
|
1284 flag_filter : int
|
jpayne@68
|
1285
|
jpayne@68
|
1286 ignore reads where any of the bits in the flag are set. The default is
|
jpayne@68
|
1287 BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP.
|
jpayne@68
|
1288
|
jpayne@68
|
1289 flag_require : int
|
jpayne@68
|
1290
|
jpayne@68
|
1291 only use reads where certain flags are set. The default is 0.
|
jpayne@68
|
1292
|
jpayne@68
|
1293 ignore_orphans: bool
|
jpayne@68
|
1294
|
jpayne@68
|
1295 ignore orphans (paired reads that are not in a proper pair).
|
jpayne@68
|
1296 The default is to ignore orphans.
|
jpayne@68
|
1297
|
jpayne@68
|
1298 min_base_quality: int
|
jpayne@68
|
1299
|
jpayne@68
|
1300 Minimum base quality. Bases below the minimum quality will
|
jpayne@68
|
1301 not be output. The default is 13.
|
jpayne@68
|
1302
|
jpayne@68
|
1303 adjust_capq_threshold: int
|
jpayne@68
|
1304
|
jpayne@68
|
1305 adjust mapping quality. The default is 0 for no
|
jpayne@68
|
1306 adjustment. The recommended value for adjustment is 50.
|
jpayne@68
|
1307
|
jpayne@68
|
1308 min_mapping_quality : int
|
jpayne@68
|
1309
|
jpayne@68
|
1310 only use reads above a minimum mapping quality. The default is 0.
|
jpayne@68
|
1311
|
jpayne@68
|
1312 compute_baq: bool
|
jpayne@68
|
1313
|
jpayne@68
|
1314 re-alignment computing per-Base Alignment Qualities (BAQ). The
|
jpayne@68
|
1315 default is to do re-alignment. Realignment requires a reference
|
jpayne@68
|
1316 sequence. If none is present, no realignment will be performed.
|
jpayne@68
|
1317
|
jpayne@68
|
1318 redo_baq: bool
|
jpayne@68
|
1319
|
jpayne@68
|
1320 recompute per-Base Alignment Quality on the fly ignoring
|
jpayne@68
|
1321 existing base qualities. The default is False (use existing
|
jpayne@68
|
1322 base qualities).
|
jpayne@68
|
1323
|
jpayne@68
|
1324 Returns
|
jpayne@68
|
1325 -------
|
jpayne@68
|
1326
|
jpayne@68
|
1327 an iterator over genomic positions. : IteratorColumn
|
jpayne@68
|
1328
|
jpayne@68
|
1329 """
|
jpayne@68
|
1330 cdef int rtid, has_coord
|
jpayne@68
|
1331 cdef int32_t rstart, rstop
|
jpayne@68
|
1332
|
jpayne@68
|
1333 if not self.is_open:
|
jpayne@68
|
1334 raise ValueError("I/O operation on closed file")
|
jpayne@68
|
1335
|
jpayne@68
|
1336 has_coord, rtid, rstart, rstop = self.parse_region(
|
jpayne@68
|
1337 contig, start, stop, region, reference=reference, end=end)
|
jpayne@68
|
1338
|
jpayne@68
|
1339 if has_coord:
|
jpayne@68
|
1340 if not self.has_index():
|
jpayne@68
|
1341 raise ValueError("no index available for pileup")
|
jpayne@68
|
1342
|
jpayne@68
|
1343 return IteratorColumnRegion(self,
|
jpayne@68
|
1344 tid=rtid,
|
jpayne@68
|
1345 start=rstart,
|
jpayne@68
|
1346 stop=rstop,
|
jpayne@68
|
1347 **kwargs)
|
jpayne@68
|
1348 else:
|
jpayne@68
|
1349 if self.has_index():
|
jpayne@68
|
1350 return IteratorColumnAllRefs(self, **kwargs)
|
jpayne@68
|
1351 else:
|
jpayne@68
|
1352 return IteratorColumnAll(self, **kwargs)
|
jpayne@68
|
1353
|
jpayne@68
|
1354 def count(self,
|
jpayne@68
|
1355 contig=None,
|
jpayne@68
|
1356 start=None,
|
jpayne@68
|
1357 stop=None,
|
jpayne@68
|
1358 region=None,
|
jpayne@68
|
1359 until_eof=False,
|
jpayne@68
|
1360 read_callback="nofilter",
|
jpayne@68
|
1361 reference=None,
|
jpayne@68
|
1362 end=None):
|
jpayne@68
|
1363 '''count the number of reads in :term:`region`
|
jpayne@68
|
1364
|
jpayne@68
|
1365 The region is specified by :term:`contig`, `start` and `stop`.
|
jpayne@68
|
1366 :term:`reference` and `end` are also accepted for backward
|
jpayne@68
|
1367 compatibility as synonyms for :term:`contig` and `stop`,
|
jpayne@68
|
1368 respectively. Alternatively, a `samtools`_ :term:`region`
|
jpayne@68
|
1369 string can be supplied.
|
jpayne@68
|
1370
|
jpayne@68
|
1371 A :term:`SAM` file does not allow random access and if
|
jpayne@68
|
1372 `region` or `contig` are given, an exception is raised.
|
jpayne@68
|
1373
|
jpayne@68
|
1374 Parameters
|
jpayne@68
|
1375 ----------
|
jpayne@68
|
1376
|
jpayne@68
|
1377 contig : string
|
jpayne@68
|
1378 reference_name of the genomic region (chromosome)
|
jpayne@68
|
1379
|
jpayne@68
|
1380 start : int
|
jpayne@68
|
1381 start of the genomic region (0-based inclusive)
|
jpayne@68
|
1382
|
jpayne@68
|
1383 stop : int
|
jpayne@68
|
1384 end of the genomic region (0-based exclusive)
|
jpayne@68
|
1385
|
jpayne@68
|
1386 region : string
|
jpayne@68
|
1387 a region string in samtools format.
|
jpayne@68
|
1388
|
jpayne@68
|
1389 until_eof : bool
|
jpayne@68
|
1390 count until the end of the file, possibly including
|
jpayne@68
|
1391 unmapped reads as well.
|
jpayne@68
|
1392
|
jpayne@68
|
1393 read_callback: string or function
|
jpayne@68
|
1394
|
jpayne@68
|
1395 select a call-back to ignore reads when counting. It can
|
jpayne@68
|
1396 be either a string with the following values:
|
jpayne@68
|
1397
|
jpayne@68
|
1398 ``all``
|
jpayne@68
|
1399 skip reads in which any of the following
|
jpayne@68
|
1400 flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
|
jpayne@68
|
1401 BAM_FDUP
|
jpayne@68
|
1402
|
jpayne@68
|
1403 ``nofilter``
|
jpayne@68
|
1404 uses every single read
|
jpayne@68
|
1405
|
jpayne@68
|
1406 Alternatively, `read_callback` can be a function
|
jpayne@68
|
1407 ``check_read(read)`` that should return True only for
|
jpayne@68
|
1408 those reads that shall be included in the counting.
|
jpayne@68
|
1409
|
jpayne@68
|
1410 reference : string
|
jpayne@68
|
1411 backward compatible synonym for `contig`
|
jpayne@68
|
1412
|
jpayne@68
|
1413 end : int
|
jpayne@68
|
1414 backward compatible synonym for `stop`
|
jpayne@68
|
1415
|
jpayne@68
|
1416 Raises
|
jpayne@68
|
1417 ------
|
jpayne@68
|
1418
|
jpayne@68
|
1419 ValueError
|
jpayne@68
|
1420 if the genomic coordinates are out of range or invalid.
|
jpayne@68
|
1421
|
jpayne@68
|
1422 '''
|
jpayne@68
|
1423 cdef AlignedSegment read
|
jpayne@68
|
1424 cdef long counter = 0
|
jpayne@68
|
1425
|
jpayne@68
|
1426 if not self.is_open:
|
jpayne@68
|
1427 raise ValueError("I/O operation on closed file")
|
jpayne@68
|
1428
|
jpayne@68
|
1429 cdef int filter_method = 0
|
jpayne@68
|
1430 if read_callback == "all":
|
jpayne@68
|
1431 filter_method = 1
|
jpayne@68
|
1432 elif read_callback == "nofilter":
|
jpayne@68
|
1433 filter_method = 2
|
jpayne@68
|
1434
|
jpayne@68
|
1435 for read in self.fetch(contig=contig,
|
jpayne@68
|
1436 start=start,
|
jpayne@68
|
1437 stop=stop,
|
jpayne@68
|
1438 reference=reference,
|
jpayne@68
|
1439 end=end,
|
jpayne@68
|
1440 region=region,
|
jpayne@68
|
1441 until_eof=until_eof):
|
jpayne@68
|
1442 # apply filter
|
jpayne@68
|
1443 if filter_method == 1:
|
jpayne@68
|
1444 # filter = "all"
|
jpayne@68
|
1445 if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
|
jpayne@68
|
1446 continue
|
jpayne@68
|
1447 elif filter_method == 2:
|
jpayne@68
|
1448 # filter = "nofilter"
|
jpayne@68
|
1449 pass
|
jpayne@68
|
1450 else:
|
jpayne@68
|
1451 if not read_callback(read):
|
jpayne@68
|
1452 continue
|
jpayne@68
|
1453 counter += 1
|
jpayne@68
|
1454
|
jpayne@68
|
1455 return counter
|
jpayne@68
|
1456
|
jpayne@68
|
1457 @cython.boundscheck(False) # we do manual bounds checking
|
jpayne@68
|
1458 def count_coverage(self,
|
jpayne@68
|
1459 contig,
|
jpayne@68
|
1460 start=None,
|
jpayne@68
|
1461 stop=None,
|
jpayne@68
|
1462 region=None,
|
jpayne@68
|
1463 quality_threshold=15,
|
jpayne@68
|
1464 read_callback='all',
|
jpayne@68
|
1465 reference=None,
|
jpayne@68
|
1466 end=None):
|
jpayne@68
|
1467 """count the coverage of genomic positions by reads in :term:`region`.
|
jpayne@68
|
1468
|
jpayne@68
|
1469 The region is specified by :term:`contig`, `start` and `stop`.
|
jpayne@68
|
1470 :term:`reference` and `end` are also accepted for backward
|
jpayne@68
|
1471 compatibility as synonyms for :term:`contig` and `stop`,
|
jpayne@68
|
1472 respectively. Alternatively, a `samtools`_ :term:`region`
|
jpayne@68
|
1473 string can be supplied. The coverage is computed per-base [ACGT].
|
jpayne@68
|
1474
|
jpayne@68
|
1475 Parameters
|
jpayne@68
|
1476 ----------
|
jpayne@68
|
1477
|
jpayne@68
|
1478 contig : string
|
jpayne@68
|
1479 reference_name of the genomic region (chromosome)
|
jpayne@68
|
1480
|
jpayne@68
|
1481 start : int
|
jpayne@68
|
1482 start of the genomic region (0-based inclusive). If not
|
jpayne@68
|
1483 given, count from the start of the chromosome.
|
jpayne@68
|
1484
|
jpayne@68
|
1485 stop : int
|
jpayne@68
|
1486 end of the genomic region (0-based exclusive). If not given,
|
jpayne@68
|
1487 count to the end of the chromosome.
|
jpayne@68
|
1488
|
jpayne@68
|
1489 region : string
|
jpayne@68
|
1490 a region string.
|
jpayne@68
|
1491
|
jpayne@68
|
1492 quality_threshold : int
|
jpayne@68
|
1493 quality_threshold is the minimum quality score (in phred) a
|
jpayne@68
|
1494 base has to reach to be counted.
|
jpayne@68
|
1495
|
jpayne@68
|
1496 read_callback: string or function
|
jpayne@68
|
1497
|
jpayne@68
|
1498 select a call-back to ignore reads when counting. It can
|
jpayne@68
|
1499 be either a string with the following values:
|
jpayne@68
|
1500
|
jpayne@68
|
1501 ``all``
|
jpayne@68
|
1502 skip reads in which any of the following
|
jpayne@68
|
1503 flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
|
jpayne@68
|
1504 BAM_FDUP
|
jpayne@68
|
1505
|
jpayne@68
|
1506 ``nofilter``
|
jpayne@68
|
1507 uses every single read
|
jpayne@68
|
1508
|
jpayne@68
|
1509 Alternatively, `read_callback` can be a function
|
jpayne@68
|
1510 ``check_read(read)`` that should return True only for
|
jpayne@68
|
1511 those reads that shall be included in the counting.
|
jpayne@68
|
1512
|
jpayne@68
|
1513 reference : string
|
jpayne@68
|
1514 backward compatible synonym for `contig`
|
jpayne@68
|
1515
|
jpayne@68
|
1516 end : int
|
jpayne@68
|
1517 backward compatible synonym for `stop`
|
jpayne@68
|
1518
|
jpayne@68
|
1519 Raises
|
jpayne@68
|
1520 ------
|
jpayne@68
|
1521
|
jpayne@68
|
1522 ValueError
|
jpayne@68
|
1523 if the genomic coordinates are out of range or invalid.
|
jpayne@68
|
1524
|
jpayne@68
|
1525 Returns
|
jpayne@68
|
1526 -------
|
jpayne@68
|
1527
|
jpayne@68
|
1528 four array.arrays of the same length in order A C G T : tuple
|
jpayne@68
|
1529
|
jpayne@68
|
1530 """
|
jpayne@68
|
1531
|
jpayne@68
|
1532 cdef uint32_t contig_length = self.get_reference_length(contig)
|
jpayne@68
|
1533 cdef int _start = start if start is not None else 0
|
jpayne@68
|
1534 cdef int _stop = stop if stop is not None else contig_length
|
jpayne@68
|
1535 _stop = _stop if _stop < contig_length else contig_length
|
jpayne@68
|
1536
|
jpayne@68
|
1537 if _stop == _start:
|
jpayne@68
|
1538 raise ValueError("interval of size 0")
|
jpayne@68
|
1539 if _stop < _start:
|
jpayne@68
|
1540 raise ValueError("interval of size less than 0")
|
jpayne@68
|
1541
|
jpayne@68
|
1542 cdef int length = _stop - _start
|
jpayne@68
|
1543 cdef c_array.array int_array_template = array.array('L', [])
|
jpayne@68
|
1544 cdef c_array.array count_a
|
jpayne@68
|
1545 cdef c_array.array count_c
|
jpayne@68
|
1546 cdef c_array.array count_g
|
jpayne@68
|
1547 cdef c_array.array count_t
|
jpayne@68
|
1548 count_a = c_array.clone(int_array_template, length, zero=True)
|
jpayne@68
|
1549 count_c = c_array.clone(int_array_template, length, zero=True)
|
jpayne@68
|
1550 count_g = c_array.clone(int_array_template, length, zero=True)
|
jpayne@68
|
1551 count_t = c_array.clone(int_array_template, length, zero=True)
|
jpayne@68
|
1552
|
jpayne@68
|
1553 cdef AlignedSegment read
|
jpayne@68
|
1554 cdef cython.str seq
|
jpayne@68
|
1555 cdef c_array.array quality
|
jpayne@68
|
1556 cdef int qpos
|
jpayne@68
|
1557 cdef int refpos
|
jpayne@68
|
1558 cdef int c = 0
|
jpayne@68
|
1559 cdef int filter_method = 0
|
jpayne@68
|
1560
|
jpayne@68
|
1561
|
jpayne@68
|
1562 if read_callback == "all":
|
jpayne@68
|
1563 filter_method = 1
|
jpayne@68
|
1564 elif read_callback == "nofilter":
|
jpayne@68
|
1565 filter_method = 2
|
jpayne@68
|
1566
|
jpayne@68
|
1567 cdef int _threshold = quality_threshold or 0
|
jpayne@68
|
1568 for read in self.fetch(contig=contig,
|
jpayne@68
|
1569 reference=reference,
|
jpayne@68
|
1570 start=start,
|
jpayne@68
|
1571 stop=stop,
|
jpayne@68
|
1572 end=end,
|
jpayne@68
|
1573 region=region):
|
jpayne@68
|
1574 # apply filter
|
jpayne@68
|
1575 if filter_method == 1:
|
jpayne@68
|
1576 # filter = "all"
|
jpayne@68
|
1577 if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
|
jpayne@68
|
1578 continue
|
jpayne@68
|
1579 elif filter_method == 2:
|
jpayne@68
|
1580 # filter = "nofilter"
|
jpayne@68
|
1581 pass
|
jpayne@68
|
1582 else:
|
jpayne@68
|
1583 if not read_callback(read):
|
jpayne@68
|
1584 continue
|
jpayne@68
|
1585
|
jpayne@68
|
1586 # count
|
jpayne@68
|
1587 seq = read.seq
|
jpayne@68
|
1588 if seq is None:
|
jpayne@68
|
1589 continue
|
jpayne@68
|
1590 quality = read.query_qualities
|
jpayne@68
|
1591
|
jpayne@68
|
1592 for qpos, refpos in read.get_aligned_pairs(True):
|
jpayne@68
|
1593 if qpos is not None and refpos is not None and \
|
jpayne@68
|
1594 _start <= refpos < _stop:
|
jpayne@68
|
1595
|
jpayne@68
|
1596 # only check base quality if _threshold > 0
|
jpayne@68
|
1597 if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
|
jpayne@68
|
1598 if seq[qpos] == 'A':
|
jpayne@68
|
1599 count_a.data.as_ulongs[refpos - _start] += 1
|
jpayne@68
|
1600 if seq[qpos] == 'C':
|
jpayne@68
|
1601 count_c.data.as_ulongs[refpos - _start] += 1
|
jpayne@68
|
1602 if seq[qpos] == 'G':
|
jpayne@68
|
1603 count_g.data.as_ulongs[refpos - _start] += 1
|
jpayne@68
|
1604 if seq[qpos] == 'T':
|
jpayne@68
|
1605 count_t.data.as_ulongs[refpos - _start] += 1
|
jpayne@68
|
1606
|
jpayne@68
|
1607 return count_a, count_c, count_g, count_t
|
jpayne@68
|
1608
|
jpayne@68
|
1609 def find_introns_slow(self, read_iterator):
|
jpayne@68
|
1610 """Return a dictionary {(start, stop): count}
|
jpayne@68
|
1611 Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
|
jpayne@68
|
1612 and their support ( = number of reads ).
|
jpayne@68
|
1613
|
jpayne@68
|
1614 read_iterator can be the result of a .fetch(...) call.
|
jpayne@68
|
1615 Or it can be a generator filtering such reads. Example
|
jpayne@68
|
1616 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
|
jpayne@68
|
1617 """
|
jpayne@68
|
1618 res = collections.Counter()
|
jpayne@68
|
1619 for r in read_iterator:
|
jpayne@68
|
1620 if 'N' in r.cigarstring:
|
jpayne@68
|
1621 last_read_pos = False
|
jpayne@68
|
1622 for read_loc, genome_loc in r.get_aligned_pairs():
|
jpayne@68
|
1623 if read_loc is None and last_read_pos:
|
jpayne@68
|
1624 start = genome_loc
|
jpayne@68
|
1625 elif read_loc and last_read_pos is None:
|
jpayne@68
|
1626 stop = genome_loc # we are right exclusive ,so this is correct
|
jpayne@68
|
1627 res[(start, stop)] += 1
|
jpayne@68
|
1628 del start
|
jpayne@68
|
1629 del stop
|
jpayne@68
|
1630 last_read_pos = read_loc
|
jpayne@68
|
1631 return res
|
jpayne@68
|
1632
|
jpayne@68
|
1633 def find_introns(self, read_iterator):
|
jpayne@68
|
1634 """Return a dictionary {(start, stop): count}
|
jpayne@68
|
1635 Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
|
jpayne@68
|
1636 and their support ( = number of reads ).
|
jpayne@68
|
1637
|
jpayne@68
|
1638 read_iterator can be the result of a .fetch(...) call.
|
jpayne@68
|
1639 Or it can be a generator filtering such reads. Example
|
jpayne@68
|
1640 samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
|
jpayne@68
|
1641 """
|
jpayne@68
|
1642 cdef:
|
jpayne@68
|
1643 uint32_t base_position, junc_start, nt
|
jpayne@68
|
1644 int op
|
jpayne@68
|
1645 AlignedSegment r
|
jpayne@68
|
1646 int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP
|
jpayne@68
|
1647
|
jpayne@68
|
1648 res = collections.Counter()
|
jpayne@68
|
1649
|
jpayne@68
|
1650 match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
|
jpayne@68
|
1651 for r in read_iterator:
|
jpayne@68
|
1652 base_position = r.pos
|
jpayne@68
|
1653 cigar = r.cigartuples
|
jpayne@68
|
1654 if cigar is None:
|
jpayne@68
|
1655 continue
|
jpayne@68
|
1656
|
jpayne@68
|
1657 for op, nt in cigar:
|
jpayne@68
|
1658 if op in match_or_deletion:
|
jpayne@68
|
1659 base_position += nt
|
jpayne@68
|
1660 elif op == BAM_CREF_SKIP:
|
jpayne@68
|
1661 junc_start = base_position
|
jpayne@68
|
1662 base_position += nt
|
jpayne@68
|
1663 res[(junc_start, base_position)] += 1
|
jpayne@68
|
1664 return res
|
jpayne@68
|
1665
|
jpayne@68
|
1666
|
jpayne@68
|
1667 def close(self):
|
jpayne@68
|
1668 '''closes the :class:`pysam.AlignmentFile`.'''
|
jpayne@68
|
1669
|
jpayne@68
|
1670 if self.htsfile == NULL:
|
jpayne@68
|
1671 return
|
jpayne@68
|
1672
|
jpayne@68
|
1673 if self.index != NULL:
|
jpayne@68
|
1674 hts_idx_destroy(self.index)
|
jpayne@68
|
1675 self.index = NULL
|
jpayne@68
|
1676
|
jpayne@68
|
1677 cdef int ret = hts_close(self.htsfile)
|
jpayne@68
|
1678 self.htsfile = NULL
|
jpayne@68
|
1679
|
jpayne@68
|
1680 self.header = None
|
jpayne@68
|
1681
|
jpayne@68
|
1682 if ret < 0:
|
jpayne@68
|
1683 global errno
|
jpayne@68
|
1684 if errno == EPIPE:
|
jpayne@68
|
1685 errno = 0
|
jpayne@68
|
1686 else:
|
jpayne@68
|
1687 raise IOError(errno, force_str(strerror(errno)))
|
jpayne@68
|
1688
|
jpayne@68
|
1689 def __dealloc__(self):
|
jpayne@68
|
1690 cdef int ret = 0
|
jpayne@68
|
1691
|
jpayne@68
|
1692 if self.index != NULL:
|
jpayne@68
|
1693 hts_idx_destroy(self.index)
|
jpayne@68
|
1694 self.index = NULL
|
jpayne@68
|
1695
|
jpayne@68
|
1696 if self.htsfile != NULL:
|
jpayne@68
|
1697 ret = hts_close(self.htsfile)
|
jpayne@68
|
1698 self.htsfile = NULL
|
jpayne@68
|
1699
|
jpayne@68
|
1700 self.header = None
|
jpayne@68
|
1701
|
jpayne@68
|
1702 if self.b:
|
jpayne@68
|
1703 bam_destroy1(self.b)
|
jpayne@68
|
1704 self.b = NULL
|
jpayne@68
|
1705
|
jpayne@68
|
1706 if ret < 0:
|
jpayne@68
|
1707 global errno
|
jpayne@68
|
1708 if errno == EPIPE:
|
jpayne@68
|
1709 errno = 0
|
jpayne@68
|
1710 else:
|
jpayne@68
|
1711 raise IOError(errno, force_str(strerror(errno)))
|
jpayne@68
|
1712
|
jpayne@68
|
1713 cpdef int write(self, AlignedSegment read) except -1:
|
jpayne@68
|
1714 '''
|
jpayne@68
|
1715 write a single :class:`pysam.AlignedSegment` to disk.
|
jpayne@68
|
1716
|
jpayne@68
|
1717 Raises:
|
jpayne@68
|
1718 ValueError
|
jpayne@68
|
1719 if the writing failed
|
jpayne@68
|
1720
|
jpayne@68
|
1721 Returns:
|
jpayne@68
|
1722 int :
|
jpayne@68
|
1723 the number of bytes written. If the file is closed,
|
jpayne@68
|
1724 this will be 0.
|
jpayne@68
|
1725 '''
|
jpayne@68
|
1726 if not self.is_open:
|
jpayne@68
|
1727 return 0
|
jpayne@68
|
1728
|
jpayne@68
|
1729 if self.header.ptr.n_targets <= read._delegate.core.tid:
|
jpayne@68
|
1730 raise ValueError(
|
jpayne@68
|
1731 "AlignedSegment refers to reference number {} that "
|
jpayne@68
|
1732 "is larger than the number of references ({}) in the header".format(
|
jpayne@68
|
1733 read._delegate.core.tid, self.header.ptr.n_targets))
|
jpayne@68
|
1734
|
jpayne@68
|
1735 cdef int ret
|
jpayne@68
|
1736 with nogil:
|
jpayne@68
|
1737 ret = sam_write1(self.htsfile,
|
jpayne@68
|
1738 self.header.ptr,
|
jpayne@68
|
1739 read._delegate)
|
jpayne@68
|
1740
|
jpayne@68
|
1741 # kbj: Still need to raise an exception with except -1. Otherwise
|
jpayne@68
|
1742 # when ret == -1 we get a "SystemError: error return without
|
jpayne@68
|
1743 # exception set".
|
jpayne@68
|
1744 if ret < 0:
|
jpayne@68
|
1745 raise IOError(
|
jpayne@68
|
1746 "sam_write1 failed with error code {}".format(ret))
|
jpayne@68
|
1747
|
jpayne@68
|
1748 return ret
|
jpayne@68
|
1749
|
jpayne@68
|
1750 # context manager interface
|
jpayne@68
|
1751 def __enter__(self):
|
jpayne@68
|
1752 return self
|
jpayne@68
|
1753
|
jpayne@68
|
1754 def __exit__(self, exc_type, exc_value, traceback):
|
jpayne@68
|
1755 self.close()
|
jpayne@68
|
1756 return False
|
jpayne@68
|
1757
|
jpayne@68
|
1758 ###############################################################
|
jpayne@68
|
1759 ###############################################################
|
jpayne@68
|
1760 ###############################################################
|
jpayne@68
|
1761 ## properties
|
jpayne@68
|
1762 ###############################################################
|
jpayne@68
|
1763 property mapped:
|
jpayne@68
|
1764 """int with total number of mapped alignments according to the
|
jpayne@68
|
1765 statistics recorded in the index. This is a read-only
|
jpayne@68
|
1766 attribute.
|
jpayne@68
|
1767 (This will be 0 for a CRAM file indexed by a .crai index, as that
|
jpayne@68
|
1768 index format does not record these statistics.)
|
jpayne@68
|
1769 """
|
jpayne@68
|
1770 def __get__(self):
|
jpayne@68
|
1771 self.check_index()
|
jpayne@68
|
1772 cdef int tid
|
jpayne@68
|
1773 cdef uint64_t total = 0
|
jpayne@68
|
1774 cdef uint64_t mapped, unmapped
|
jpayne@68
|
1775 for tid from 0 <= tid < self.header.nreferences:
|
jpayne@68
|
1776 with nogil:
|
jpayne@68
|
1777 hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
|
jpayne@68
|
1778 total += mapped
|
jpayne@68
|
1779 return total
|
jpayne@68
|
1780
|
jpayne@68
|
1781 property unmapped:
|
jpayne@68
|
1782 """int with total number of unmapped reads according to the statistics
|
jpayne@68
|
1783 recorded in the index. This number of reads includes the number of reads
|
jpayne@68
|
1784 without coordinates. This is a read-only attribute.
|
jpayne@68
|
1785 (This will be 0 for a CRAM file indexed by a .crai index, as that
|
jpayne@68
|
1786 index format does not record these statistics.)
|
jpayne@68
|
1787 """
|
jpayne@68
|
1788 def __get__(self):
|
jpayne@68
|
1789 self.check_index()
|
jpayne@68
|
1790 cdef int tid
|
jpayne@68
|
1791 cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
|
jpayne@68
|
1792 cdef uint64_t mapped, unmapped
|
jpayne@68
|
1793 for tid from 0 <= tid < self.header.nreferences:
|
jpayne@68
|
1794 with nogil:
|
jpayne@68
|
1795 hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
|
jpayne@68
|
1796 total += unmapped
|
jpayne@68
|
1797 return total
|
jpayne@68
|
1798
|
jpayne@68
|
1799 property nocoordinate:
|
jpayne@68
|
1800 """int with total number of reads without coordinates according to the
|
jpayne@68
|
1801 statistics recorded in the index, i.e., the statistic printed for "*"
|
jpayne@68
|
1802 by the ``samtools idxstats`` command. This is a read-only attribute.
|
jpayne@68
|
1803 (This will be 0 for a CRAM file indexed by a .crai index, as that
|
jpayne@68
|
1804 index format does not record these statistics.)
|
jpayne@68
|
1805 """
|
jpayne@68
|
1806 def __get__(self):
|
jpayne@68
|
1807 self.check_index()
|
jpayne@68
|
1808 cdef uint64_t n
|
jpayne@68
|
1809 with nogil:
|
jpayne@68
|
1810 n = hts_idx_get_n_no_coor(self.index)
|
jpayne@68
|
1811 return n
|
jpayne@68
|
1812
|
jpayne@68
|
1813 def get_index_statistics(self):
|
jpayne@68
|
1814 """return statistics about mapped/unmapped reads per chromosome as
|
jpayne@68
|
1815 they are stored in the index, similarly to the statistics printed
|
jpayne@68
|
1816 by the ``samtools idxstats`` command.
|
jpayne@68
|
1817
|
jpayne@68
|
1818 CRAI indexes do not record these statistics, so for a CRAM file
|
jpayne@68
|
1819 with a .crai index the returned statistics will all be 0.
|
jpayne@68
|
1820
|
jpayne@68
|
1821 Returns:
|
jpayne@68
|
1822 list :
|
jpayne@68
|
1823 a list of records for each chromosome. Each record has the
|
jpayne@68
|
1824 attributes 'contig', 'mapped', 'unmapped' and 'total'.
|
jpayne@68
|
1825 """
|
jpayne@68
|
1826
|
jpayne@68
|
1827 self.check_index()
|
jpayne@68
|
1828 cdef int tid
|
jpayne@68
|
1829 cdef uint64_t mapped, unmapped
|
jpayne@68
|
1830 results = []
|
jpayne@68
|
1831 # TODO: use header
|
jpayne@68
|
1832 for tid from 0 <= tid < self.nreferences:
|
jpayne@68
|
1833 with nogil:
|
jpayne@68
|
1834 hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
|
jpayne@68
|
1835 results.append(
|
jpayne@68
|
1836 IndexStats._make((
|
jpayne@68
|
1837 self.get_reference_name(tid),
|
jpayne@68
|
1838 mapped,
|
jpayne@68
|
1839 unmapped,
|
jpayne@68
|
1840 mapped + unmapped)))
|
jpayne@68
|
1841
|
jpayne@68
|
1842 return results
|
jpayne@68
|
1843
|
jpayne@68
|
1844 ###############################################################
|
jpayne@68
|
1845 ## file-object like iterator access
|
jpayne@68
|
1846 ## note: concurrent access will cause errors (see IteratorRow
|
jpayne@68
|
1847 ## and multiple_iterators)
|
jpayne@68
|
1848 ## Possible solutions: deprecate or open new file handle
|
jpayne@68
|
1849 def __iter__(self):
|
jpayne@68
|
1850 if not self.is_open:
|
jpayne@68
|
1851 raise ValueError("I/O operation on closed file")
|
jpayne@68
|
1852
|
jpayne@68
|
1853 if not self.is_bam and self.header.nreferences == 0:
|
jpayne@68
|
1854 raise NotImplementedError(
|
jpayne@68
|
1855 "can not iterate over samfile without header")
|
jpayne@68
|
1856 return self
|
jpayne@68
|
1857
|
jpayne@68
|
1858 cdef bam1_t * getCurrent(self):
|
jpayne@68
|
1859 return self.b
|
jpayne@68
|
1860
|
jpayne@68
|
1861 cdef int cnext(self):
|
jpayne@68
|
1862 '''
|
jpayne@68
|
1863 cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
|
jpayne@68
|
1864 '''
|
jpayne@68
|
1865 cdef int ret
|
jpayne@68
|
1866 cdef bam_hdr_t * hdr = self.header.ptr
|
jpayne@68
|
1867 with nogil:
|
jpayne@68
|
1868 ret = sam_read1(self.htsfile,
|
jpayne@68
|
1869 hdr,
|
jpayne@68
|
1870 self.b)
|
jpayne@68
|
1871 return ret
|
jpayne@68
|
1872
|
jpayne@68
|
1873 def __next__(self):
|
jpayne@68
|
1874 cdef int ret = self.cnext()
|
jpayne@68
|
1875 if ret >= 0:
|
jpayne@68
|
1876 return makeAlignedSegment(self.b, self.header)
|
jpayne@68
|
1877 elif ret == -1:
|
jpayne@68
|
1878 raise StopIteration
|
jpayne@68
|
1879 else:
|
jpayne@68
|
1880 raise IOError(read_failure_reason(ret))
|
jpayne@68
|
1881
|
jpayne@68
|
1882 ###########################################
|
jpayne@68
|
1883 # methods/properties referencing the header
|
jpayne@68
|
1884 def is_valid_tid(self, int tid):
|
jpayne@68
|
1885 """
|
jpayne@68
|
1886 return True if the numerical :term:`tid` is valid; False otherwise.
|
jpayne@68
|
1887
|
jpayne@68
|
1888 Note that the unmapped tid code (-1) counts as an invalid.
|
jpayne@68
|
1889 """
|
jpayne@68
|
1890 if self.header is None:
|
jpayne@68
|
1891 raise ValueError("header not available in closed files")
|
jpayne@68
|
1892 return self.header.is_valid_tid(tid)
|
jpayne@68
|
1893
|
jpayne@68
|
1894 def get_tid(self, reference):
|
jpayne@68
|
1895 """
|
jpayne@68
|
1896 return the numerical :term:`tid` corresponding to
|
jpayne@68
|
1897 :term:`reference`
|
jpayne@68
|
1898
|
jpayne@68
|
1899 returns -1 if reference is not known.
|
jpayne@68
|
1900 """
|
jpayne@68
|
1901 if self.header is None:
|
jpayne@68
|
1902 raise ValueError("header not available in closed files")
|
jpayne@68
|
1903 return self.header.get_tid(reference)
|
jpayne@68
|
1904
|
jpayne@68
|
1905 def get_reference_name(self, tid):
|
jpayne@68
|
1906 """
|
jpayne@68
|
1907 return :term:`reference` name corresponding to numerical :term:`tid`
|
jpayne@68
|
1908 """
|
jpayne@68
|
1909 if self.header is None:
|
jpayne@68
|
1910 raise ValueError("header not available in closed files")
|
jpayne@68
|
1911 return self.header.get_reference_name(tid)
|
jpayne@68
|
1912
|
jpayne@68
|
1913 def get_reference_length(self, reference):
|
jpayne@68
|
1914 """
|
jpayne@68
|
1915 return :term:`reference` length corresponding to numerical :term:`tid`
|
jpayne@68
|
1916 """
|
jpayne@68
|
1917 if self.header is None:
|
jpayne@68
|
1918 raise ValueError("header not available in closed files")
|
jpayne@68
|
1919 return self.header.get_reference_length(reference)
|
jpayne@68
|
1920
|
jpayne@68
|
1921 property nreferences:
|
jpayne@68
|
1922 """int with the number of :term:`reference` sequences in the file.
|
jpayne@68
|
1923 This is a read-only attribute."""
|
jpayne@68
|
1924 def __get__(self):
|
jpayne@68
|
1925 if self.header:
|
jpayne@68
|
1926 return self.header.nreferences
|
jpayne@68
|
1927 else:
|
jpayne@68
|
1928 raise ValueError("header not available in closed files")
|
jpayne@68
|
1929
|
jpayne@68
|
1930 property references:
|
jpayne@68
|
1931 """tuple with the names of :term:`reference` sequences. This is a
|
jpayne@68
|
1932 read-only attribute"""
|
jpayne@68
|
1933 def __get__(self):
|
jpayne@68
|
1934 if self.header:
|
jpayne@68
|
1935 return self.header.references
|
jpayne@68
|
1936 else:
|
jpayne@68
|
1937 raise ValueError("header not available in closed files")
|
jpayne@68
|
1938
|
jpayne@68
|
1939 property lengths:
|
jpayne@68
|
1940 """tuple of the lengths of the :term:`reference` sequences. This is a
|
jpayne@68
|
1941 read-only attribute. The lengths are in the same order as
|
jpayne@68
|
1942 :attr:`pysam.AlignmentFile.references`
|
jpayne@68
|
1943
|
jpayne@68
|
1944 """
|
jpayne@68
|
1945 def __get__(self):
|
jpayne@68
|
1946 if self.header:
|
jpayne@68
|
1947 return self.header.lengths
|
jpayne@68
|
1948 else:
|
jpayne@68
|
1949 raise ValueError("header not available in closed files")
|
jpayne@68
|
1950
|
jpayne@68
|
1951 # Compatibility functions for pysam < 0.14
|
jpayne@68
|
1952 property text:
|
jpayne@68
|
1953 """deprecated, use :attr:`references` and :attr:`lengths` instead"""
|
jpayne@68
|
1954 def __get__(self):
|
jpayne@68
|
1955 if self.header:
|
jpayne@68
|
1956 return self.header.__str__()
|
jpayne@68
|
1957 else:
|
jpayne@68
|
1958 raise ValueError("header not available in closed files")
|
jpayne@68
|
1959
|
jpayne@68
|
1960 # Compatibility functions for pysam < 0.8.3
|
jpayne@68
|
1961 def gettid(self, reference):
|
jpayne@68
|
1962 """deprecated, use :meth:`get_tid` instead"""
|
jpayne@68
|
1963 return self.get_tid(reference)
|
jpayne@68
|
1964
|
jpayne@68
|
1965 def getrname(self, tid):
|
jpayne@68
|
1966 """deprecated, use :meth:`get_reference_name` instead"""
|
jpayne@68
|
1967 return self.get_reference_name(tid)
|
jpayne@68
|
1968
|
jpayne@68
|
1969
|
jpayne@68
|
1970 cdef class IteratorRow:
|
jpayne@68
|
1971 '''abstract base class for iterators over mapped reads.
|
jpayne@68
|
1972
|
jpayne@68
|
1973 Various iterators implement different behaviours for wrapping around
|
jpayne@68
|
1974 contig boundaries. Examples include:
|
jpayne@68
|
1975
|
jpayne@68
|
1976 :class:`pysam.IteratorRowRegion`
|
jpayne@68
|
1977 iterate within a single contig and a defined region.
|
jpayne@68
|
1978
|
jpayne@68
|
1979 :class:`pysam.IteratorRowAll`
|
jpayne@68
|
1980 iterate until EOF. This iterator will also include unmapped reads.
|
jpayne@68
|
1981
|
jpayne@68
|
1982 :class:`pysam.IteratorRowAllRefs`
|
jpayne@68
|
1983 iterate over all reads in all reference sequences.
|
jpayne@68
|
1984
|
jpayne@68
|
1985 The method :meth:`AlignmentFile.fetch` returns an IteratorRow.
|
jpayne@68
|
1986
|
jpayne@68
|
1987 .. note::
|
jpayne@68
|
1988
|
jpayne@68
|
1989 It is usually not necessary to create an object of this class
|
jpayne@68
|
1990 explicitly. It is returned as a result of call to a
|
jpayne@68
|
1991 :meth:`AlignmentFile.fetch`.
|
jpayne@68
|
1992
|
jpayne@68
|
1993 '''
|
jpayne@68
|
1994
|
jpayne@68
|
1995 def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
|
jpayne@68
|
1996 cdef char *cfilename
|
jpayne@68
|
1997 cdef char *creference_filename
|
jpayne@68
|
1998 cdef char *cindexname = NULL
|
jpayne@68
|
1999
|
jpayne@68
|
2000 if not samfile.is_open:
|
jpayne@68
|
2001 raise ValueError("I/O operation on closed file")
|
jpayne@68
|
2002
|
jpayne@68
|
2003 # makes sure that samfile stays alive as long as the
|
jpayne@68
|
2004 # iterator is alive
|
jpayne@68
|
2005 self.samfile = samfile
|
jpayne@68
|
2006
|
jpayne@68
|
2007 # reopen the file - note that this makes the iterator
|
jpayne@68
|
2008 # slow and causes pileup to slow down significantly.
|
jpayne@68
|
2009 if multiple_iterators:
|
jpayne@68
|
2010
|
jpayne@68
|
2011 cfilename = samfile.filename
|
jpayne@68
|
2012 with nogil:
|
jpayne@68
|
2013 self.htsfile = hts_open(cfilename, 'r')
|
jpayne@68
|
2014 assert self.htsfile != NULL
|
jpayne@68
|
2015
|
jpayne@68
|
2016 if samfile.has_index():
|
jpayne@68
|
2017 if samfile.index_filename:
|
jpayne@68
|
2018 cindexname = bindex_filename = encode_filename(samfile.index_filename)
|
jpayne@68
|
2019 with nogil:
|
jpayne@68
|
2020 self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
|
jpayne@68
|
2021 else:
|
jpayne@68
|
2022 self.index = NULL
|
jpayne@68
|
2023
|
jpayne@68
|
2024 # need to advance in newly opened file to position after header
|
jpayne@68
|
2025 # better: use seek/tell?
|
jpayne@68
|
2026 with nogil:
|
jpayne@68
|
2027 hdr = sam_hdr_read(self.htsfile)
|
jpayne@68
|
2028 if hdr is NULL:
|
jpayne@68
|
2029 raise IOError("unable to read header information")
|
jpayne@68
|
2030 self.header = makeAlignmentHeader(hdr)
|
jpayne@68
|
2031
|
jpayne@68
|
2032 self.owns_samfile = True
|
jpayne@68
|
2033
|
jpayne@68
|
2034 # options specific to CRAM files
|
jpayne@68
|
2035 if samfile.is_cram and samfile.reference_filename:
|
jpayne@68
|
2036 creference_filename = samfile.reference_filename
|
jpayne@68
|
2037 hts_set_opt(self.htsfile,
|
jpayne@68
|
2038 CRAM_OPT_REFERENCE,
|
jpayne@68
|
2039 creference_filename)
|
jpayne@68
|
2040
|
jpayne@68
|
2041 else:
|
jpayne@68
|
2042 self.htsfile = samfile.htsfile
|
jpayne@68
|
2043 self.index = samfile.index
|
jpayne@68
|
2044 self.owns_samfile = False
|
jpayne@68
|
2045 self.header = samfile.header
|
jpayne@68
|
2046
|
jpayne@68
|
2047 self.retval = 0
|
jpayne@68
|
2048
|
jpayne@68
|
2049 self.b = bam_init1()
|
jpayne@68
|
2050
|
jpayne@68
|
2051 def __dealloc__(self):
|
jpayne@68
|
2052 bam_destroy1(self.b)
|
jpayne@68
|
2053 if self.owns_samfile:
|
jpayne@68
|
2054 hts_idx_destroy(self.index)
|
jpayne@68
|
2055 hts_close(self.htsfile)
|
jpayne@68
|
2056
|
jpayne@68
|
2057
|
jpayne@68
|
2058 cdef class IteratorRowRegion(IteratorRow):
|
jpayne@68
|
2059 """*(AlignmentFile samfile, int tid, int beg, int stop,
|
jpayne@68
|
2060 int multiple_iterators=False)*
|
jpayne@68
|
2061
|
jpayne@68
|
2062 iterate over mapped reads in a region.
|
jpayne@68
|
2063
|
jpayne@68
|
2064 .. note::
|
jpayne@68
|
2065
|
jpayne@68
|
2066 It is usually not necessary to create an object of this class
|
jpayne@68
|
2067 explicitly. It is returned as a result of call to a
|
jpayne@68
|
2068 :meth:`AlignmentFile.fetch`.
|
jpayne@68
|
2069
|
jpayne@68
|
2070 """
|
jpayne@68
|
2071
|
jpayne@68
|
2072 def __init__(self, AlignmentFile samfile,
|
jpayne@68
|
2073 int tid, int beg, int stop,
|
jpayne@68
|
2074 int multiple_iterators=False):
|
jpayne@68
|
2075
|
jpayne@68
|
2076 if not samfile.has_index():
|
jpayne@68
|
2077 raise ValueError("no index available for iteration")
|
jpayne@68
|
2078
|
jpayne@68
|
2079 super().__init__(samfile, multiple_iterators=multiple_iterators)
|
jpayne@68
|
2080 with nogil:
|
jpayne@68
|
2081 self.iter = sam_itr_queryi(
|
jpayne@68
|
2082 self.index,
|
jpayne@68
|
2083 tid,
|
jpayne@68
|
2084 beg,
|
jpayne@68
|
2085 stop)
|
jpayne@68
|
2086
|
jpayne@68
|
2087 def __iter__(self):
|
jpayne@68
|
2088 return self
|
jpayne@68
|
2089
|
jpayne@68
|
2090 cdef bam1_t * getCurrent(self):
|
jpayne@68
|
2091 return self.b
|
jpayne@68
|
2092
|
jpayne@68
|
2093 cdef int cnext(self):
|
jpayne@68
|
2094 '''cversion of iterator. Used by IteratorColumn'''
|
jpayne@68
|
2095 with nogil:
|
jpayne@68
|
2096 self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
|
jpayne@68
|
2097 self.iter,
|
jpayne@68
|
2098 self.b,
|
jpayne@68
|
2099 self.htsfile)
|
jpayne@68
|
2100
|
jpayne@68
|
2101 def __next__(self):
|
jpayne@68
|
2102 self.cnext()
|
jpayne@68
|
2103 if self.retval >= 0:
|
jpayne@68
|
2104 return makeAlignedSegment(self.b, self.header)
|
jpayne@68
|
2105 elif self.retval == -1:
|
jpayne@68
|
2106 raise StopIteration
|
jpayne@68
|
2107 elif self.retval == -2:
|
jpayne@68
|
2108 # Note: it is currently not the case that hts_iter_next
|
jpayne@68
|
2109 # returns -2 for a truncated file.
|
jpayne@68
|
2110 # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
|
jpayne@68
|
2111 raise IOError('truncated file')
|
jpayne@68
|
2112 else:
|
jpayne@68
|
2113 raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval))
|
jpayne@68
|
2114
|
jpayne@68
|
2115 def __dealloc__(self):
|
jpayne@68
|
2116 hts_itr_destroy(self.iter)
|
jpayne@68
|
2117
|
jpayne@68
|
2118
|
jpayne@68
|
2119 cdef class IteratorRowHead(IteratorRow):
|
jpayne@68
|
2120 """*(AlignmentFile samfile, n, int multiple_iterators=False)*
|
jpayne@68
|
2121
|
jpayne@68
|
2122 iterate over first n reads in `samfile`
|
jpayne@68
|
2123
|
jpayne@68
|
2124 .. note::
|
jpayne@68
|
2125 It is usually not necessary to create an object of this class
|
jpayne@68
|
2126 explicitly. It is returned as a result of call to a
|
jpayne@68
|
2127 :meth:`AlignmentFile.head`.
|
jpayne@68
|
2128
|
jpayne@68
|
2129 """
|
jpayne@68
|
2130
|
jpayne@68
|
2131 def __init__(self,
|
jpayne@68
|
2132 AlignmentFile samfile,
|
jpayne@68
|
2133 int n,
|
jpayne@68
|
2134 int multiple_iterators=False):
|
jpayne@68
|
2135 super().__init__(samfile, multiple_iterators=multiple_iterators)
|
jpayne@68
|
2136
|
jpayne@68
|
2137 self.max_rows = n
|
jpayne@68
|
2138 self.current_row = 0
|
jpayne@68
|
2139
|
jpayne@68
|
2140 def __iter__(self):
|
jpayne@68
|
2141 return self
|
jpayne@68
|
2142
|
jpayne@68
|
2143 cdef bam1_t * getCurrent(self):
|
jpayne@68
|
2144 return self.b
|
jpayne@68
|
2145
|
jpayne@68
|
2146 cdef int cnext(self):
|
jpayne@68
|
2147 '''cversion of iterator. Used by IteratorColumn'''
|
jpayne@68
|
2148 cdef int ret
|
jpayne@68
|
2149 cdef bam_hdr_t * hdr = self.header.ptr
|
jpayne@68
|
2150 with nogil:
|
jpayne@68
|
2151 ret = sam_read1(self.htsfile,
|
jpayne@68
|
2152 hdr,
|
jpayne@68
|
2153 self.b)
|
jpayne@68
|
2154 return ret
|
jpayne@68
|
2155
|
jpayne@68
|
2156 def __next__(self):
|
jpayne@68
|
2157 if self.current_row >= self.max_rows:
|
jpayne@68
|
2158 raise StopIteration
|
jpayne@68
|
2159
|
jpayne@68
|
2160 cdef int ret = self.cnext()
|
jpayne@68
|
2161 if ret >= 0:
|
jpayne@68
|
2162 self.current_row += 1
|
jpayne@68
|
2163 return makeAlignedSegment(self.b, self.header)
|
jpayne@68
|
2164 elif ret == -1:
|
jpayne@68
|
2165 raise StopIteration
|
jpayne@68
|
2166 else:
|
jpayne@68
|
2167 raise IOError(read_failure_reason(ret))
|
jpayne@68
|
2168
|
jpayne@68
|
2169
|
jpayne@68
|
2170 cdef class IteratorRowAll(IteratorRow):
|
jpayne@68
|
2171 """*(AlignmentFile samfile, int multiple_iterators=False)*
|
jpayne@68
|
2172
|
jpayne@68
|
2173 iterate over all reads in `samfile`
|
jpayne@68
|
2174
|
jpayne@68
|
2175 .. note::
|
jpayne@68
|
2176
|
jpayne@68
|
2177 It is usually not necessary to create an object of this class
|
jpayne@68
|
2178 explicitly. It is returned as a result of call to a
|
jpayne@68
|
2179 :meth:`AlignmentFile.fetch`.
|
jpayne@68
|
2180
|
jpayne@68
|
2181 """
|
jpayne@68
|
2182
|
jpayne@68
|
2183 def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
|
jpayne@68
|
2184 super().__init__(samfile, multiple_iterators=multiple_iterators)
|
jpayne@68
|
2185
|
jpayne@68
|
2186 def __iter__(self):
|
jpayne@68
|
2187 return self
|
jpayne@68
|
2188
|
jpayne@68
|
2189 cdef bam1_t * getCurrent(self):
|
jpayne@68
|
2190 return self.b
|
jpayne@68
|
2191
|
jpayne@68
|
2192 cdef int cnext(self):
|
jpayne@68
|
2193 '''cversion of iterator. Used by IteratorColumn'''
|
jpayne@68
|
2194 cdef int ret
|
jpayne@68
|
2195 cdef bam_hdr_t * hdr = self.header.ptr
|
jpayne@68
|
2196 with nogil:
|
jpayne@68
|
2197 ret = sam_read1(self.htsfile,
|
jpayne@68
|
2198 hdr,
|
jpayne@68
|
2199 self.b)
|
jpayne@68
|
2200 return ret
|
jpayne@68
|
2201
|
jpayne@68
|
2202 def __next__(self):
|
jpayne@68
|
2203 cdef int ret = self.cnext()
|
jpayne@68
|
2204 if ret >= 0:
|
jpayne@68
|
2205 return makeAlignedSegment(self.b, self.header)
|
jpayne@68
|
2206 elif ret == -1:
|
jpayne@68
|
2207 raise StopIteration
|
jpayne@68
|
2208 else:
|
jpayne@68
|
2209 raise IOError(read_failure_reason(ret))
|
jpayne@68
|
2210
|
jpayne@68
|
2211
|
jpayne@68
|
2212 cdef class IteratorRowAllRefs(IteratorRow):
|
jpayne@68
|
2213 """iterates over all mapped reads by chaining iterators over each
|
jpayne@68
|
2214 reference
|
jpayne@68
|
2215
|
jpayne@68
|
2216 .. note::
|
jpayne@68
|
2217 It is usually not necessary to create an object of this class
|
jpayne@68
|
2218 explicitly. It is returned as a result of call to a
|
jpayne@68
|
2219 :meth:`AlignmentFile.fetch`.
|
jpayne@68
|
2220
|
jpayne@68
|
2221 """
|
jpayne@68
|
2222
|
jpayne@68
|
2223 def __init__(self, AlignmentFile samfile, multiple_iterators=False):
|
jpayne@68
|
2224 super().__init__(samfile, multiple_iterators=multiple_iterators)
|
jpayne@68
|
2225
|
jpayne@68
|
2226 if not samfile.has_index():
|
jpayne@68
|
2227 raise ValueError("no index available for fetch")
|
jpayne@68
|
2228
|
jpayne@68
|
2229 self.tid = -1
|
jpayne@68
|
2230
|
jpayne@68
|
2231 def nextiter(self):
|
jpayne@68
|
2232 # get a new iterator for a chromosome. The file
|
jpayne@68
|
2233 # will not be re-opened.
|
jpayne@68
|
2234 self.rowiter = IteratorRowRegion(self.samfile,
|
jpayne@68
|
2235 self.tid,
|
jpayne@68
|
2236 0,
|
jpayne@68
|
2237 MAX_POS)
|
jpayne@68
|
2238 # set htsfile and header of the rowiter
|
jpayne@68
|
2239 # to the values in this iterator to reflect multiple_iterators
|
jpayne@68
|
2240 self.rowiter.htsfile = self.htsfile
|
jpayne@68
|
2241 self.rowiter.header = self.header
|
jpayne@68
|
2242
|
jpayne@68
|
2243 # make sure the iterator understand that IteratorRowAllRefs
|
jpayne@68
|
2244 # has ownership
|
jpayne@68
|
2245 self.rowiter.owns_samfile = False
|
jpayne@68
|
2246
|
jpayne@68
|
2247 def __iter__(self):
|
jpayne@68
|
2248 return self
|
jpayne@68
|
2249
|
jpayne@68
|
2250 def __next__(self):
|
jpayne@68
|
2251 # Create an initial iterator
|
jpayne@68
|
2252 if self.tid == -1:
|
jpayne@68
|
2253 if not self.samfile.nreferences:
|
jpayne@68
|
2254 raise StopIteration
|
jpayne@68
|
2255 self.tid = 0
|
jpayne@68
|
2256 self.nextiter()
|
jpayne@68
|
2257
|
jpayne@68
|
2258 while 1:
|
jpayne@68
|
2259 self.rowiter.cnext()
|
jpayne@68
|
2260
|
jpayne@68
|
2261 # If current iterator is not exhausted, return aligned read
|
jpayne@68
|
2262 if self.rowiter.retval > 0:
|
jpayne@68
|
2263 return makeAlignedSegment(self.rowiter.b, self.header)
|
jpayne@68
|
2264
|
jpayne@68
|
2265 self.tid += 1
|
jpayne@68
|
2266
|
jpayne@68
|
2267 # Otherwise, proceed to next reference or stop
|
jpayne@68
|
2268 if self.tid < self.samfile.nreferences:
|
jpayne@68
|
2269 self.nextiter()
|
jpayne@68
|
2270 else:
|
jpayne@68
|
2271 raise StopIteration
|
jpayne@68
|
2272
|
jpayne@68
|
2273
|
jpayne@68
|
2274 cdef class IteratorRowSelection(IteratorRow):
|
jpayne@68
|
2275 """*(AlignmentFile samfile)*
|
jpayne@68
|
2276
|
jpayne@68
|
2277 iterate over reads in `samfile` at a given list of file positions.
|
jpayne@68
|
2278
|
jpayne@68
|
2279 .. note::
|
jpayne@68
|
2280 It is usually not necessary to create an object of this class
|
jpayne@68
|
2281 explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
|
jpayne@68
|
2282 """
|
jpayne@68
|
2283
|
jpayne@68
|
2284 def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
|
jpayne@68
|
2285 super().__init__(samfile, multiple_iterators=multiple_iterators)
|
jpayne@68
|
2286
|
jpayne@68
|
2287 self.positions = positions
|
jpayne@68
|
2288 self.current_pos = 0
|
jpayne@68
|
2289
|
jpayne@68
|
2290 def __iter__(self):
|
jpayne@68
|
2291 return self
|
jpayne@68
|
2292
|
jpayne@68
|
2293 cdef bam1_t * getCurrent(self):
|
jpayne@68
|
2294 return self.b
|
jpayne@68
|
2295
|
jpayne@68
|
2296 cdef int cnext(self):
|
jpayne@68
|
2297 '''cversion of iterator'''
|
jpayne@68
|
2298 # end iteration if out of positions
|
jpayne@68
|
2299 if self.current_pos >= len(self.positions): return -1
|
jpayne@68
|
2300
|
jpayne@68
|
2301 cdef uint64_t pos = self.positions[self.current_pos]
|
jpayne@68
|
2302 with nogil:
|
jpayne@68
|
2303 bgzf_seek(hts_get_bgzfp(self.htsfile),
|
jpayne@68
|
2304 pos,
|
jpayne@68
|
2305 0)
|
jpayne@68
|
2306 self.current_pos += 1
|
jpayne@68
|
2307
|
jpayne@68
|
2308 cdef int ret
|
jpayne@68
|
2309 cdef bam_hdr_t * hdr = self.header.ptr
|
jpayne@68
|
2310 with nogil:
|
jpayne@68
|
2311 ret = sam_read1(self.htsfile,
|
jpayne@68
|
2312 hdr,
|
jpayne@68
|
2313 self.b)
|
jpayne@68
|
2314 return ret
|
jpayne@68
|
2315
|
jpayne@68
|
2316 def __next__(self):
|
jpayne@68
|
2317 cdef int ret = self.cnext()
|
jpayne@68
|
2318 if ret >= 0:
|
jpayne@68
|
2319 return makeAlignedSegment(self.b, self.header)
|
jpayne@68
|
2320 elif ret == -1:
|
jpayne@68
|
2321 raise StopIteration
|
jpayne@68
|
2322 else:
|
jpayne@68
|
2323 raise IOError(read_failure_reason(ret))
|
jpayne@68
|
2324
|
jpayne@68
|
2325
|
jpayne@68
|
2326 cdef int __advance_nofilter(void *data, bam1_t *b):
|
jpayne@68
|
2327 '''advance without any read filtering.
|
jpayne@68
|
2328 '''
|
jpayne@68
|
2329 cdef __iterdata * d = <__iterdata*>data
|
jpayne@68
|
2330 cdef int ret
|
jpayne@68
|
2331 with nogil:
|
jpayne@68
|
2332 ret = sam_itr_next(d.htsfile, d.iter, b)
|
jpayne@68
|
2333 return ret
|
jpayne@68
|
2334
|
jpayne@68
|
2335
|
jpayne@68
|
2336 cdef int __advance_raw_nofilter(void *data, bam1_t *b):
|
jpayne@68
|
2337 '''advance (without iterator) without any read filtering.
|
jpayne@68
|
2338 '''
|
jpayne@68
|
2339 cdef __iterdata * d = <__iterdata*>data
|
jpayne@68
|
2340 cdef int ret
|
jpayne@68
|
2341 with nogil:
|
jpayne@68
|
2342 ret = sam_read1(d.htsfile, d.header, b)
|
jpayne@68
|
2343 return ret
|
jpayne@68
|
2344
|
jpayne@68
|
2345
|
jpayne@68
|
2346 cdef int __advance_all(void *data, bam1_t *b):
|
jpayne@68
|
2347 '''only use reads for pileup passing basic filters such as
|
jpayne@68
|
2348
|
jpayne@68
|
2349 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
|
jpayne@68
|
2350 '''
|
jpayne@68
|
2351
|
jpayne@68
|
2352 cdef __iterdata * d = <__iterdata*>data
|
jpayne@68
|
2353 cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP
|
jpayne@68
|
2354 cdef int ret
|
jpayne@68
|
2355 while 1:
|
jpayne@68
|
2356 with nogil:
|
jpayne@68
|
2357 ret = sam_itr_next(d.htsfile, d.iter, b)
|
jpayne@68
|
2358 if ret < 0:
|
jpayne@68
|
2359 break
|
jpayne@68
|
2360 if b.core.flag & d.flag_filter:
|
jpayne@68
|
2361 continue
|
jpayne@68
|
2362 break
|
jpayne@68
|
2363 return ret
|
jpayne@68
|
2364
|
jpayne@68
|
2365
|
jpayne@68
|
2366 cdef int __advance_raw_all(void *data, bam1_t *b):
|
jpayne@68
|
2367 '''only use reads for pileup passing basic filters such as
|
jpayne@68
|
2368
|
jpayne@68
|
2369 BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
|
jpayne@68
|
2370 '''
|
jpayne@68
|
2371
|
jpayne@68
|
2372 cdef __iterdata * d = <__iterdata*>data
|
jpayne@68
|
2373 cdef int ret
|
jpayne@68
|
2374 while 1:
|
jpayne@68
|
2375 with nogil:
|
jpayne@68
|
2376 ret = sam_read1(d.htsfile, d.header, b)
|
jpayne@68
|
2377 if ret < 0:
|
jpayne@68
|
2378 break
|
jpayne@68
|
2379 if b.core.flag & d.flag_filter:
|
jpayne@68
|
2380 continue
|
jpayne@68
|
2381 break
|
jpayne@68
|
2382 return ret
|
jpayne@68
|
2383
|
jpayne@68
|
2384
|
jpayne@68
|
2385 cdef int __advance_samtools(void * data, bam1_t * b):
|
jpayne@68
|
2386 '''advance using same filter and read processing as in
|
jpayne@68
|
2387 the samtools pileup.
|
jpayne@68
|
2388 '''
|
jpayne@68
|
2389 cdef __iterdata * d = <__iterdata*>data
|
jpayne@68
|
2390 cdef int ret
|
jpayne@68
|
2391 cdef int q
|
jpayne@68
|
2392
|
jpayne@68
|
2393 while 1:
|
jpayne@68
|
2394 with nogil:
|
jpayne@68
|
2395 ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
|
jpayne@68
|
2396 if ret < 0:
|
jpayne@68
|
2397 break
|
jpayne@68
|
2398 if b.core.flag & d.flag_filter:
|
jpayne@68
|
2399 continue
|
jpayne@68
|
2400 if d.flag_require and not (b.core.flag & d.flag_require):
|
jpayne@68
|
2401 continue
|
jpayne@68
|
2402
|
jpayne@68
|
2403 # reload sequence
|
jpayne@68
|
2404 if d.fastafile != NULL and b.core.tid != d.tid:
|
jpayne@68
|
2405 if d.seq != NULL:
|
jpayne@68
|
2406 free(d.seq)
|
jpayne@68
|
2407 d.tid = b.core.tid
|
jpayne@68
|
2408 with nogil:
|
jpayne@68
|
2409 d.seq = faidx_fetch_seq(
|
jpayne@68
|
2410 d.fastafile,
|
jpayne@68
|
2411 d.header.target_name[d.tid],
|
jpayne@68
|
2412 0, MAX_POS,
|
jpayne@68
|
2413 &d.seq_len)
|
jpayne@68
|
2414
|
jpayne@68
|
2415 if d.seq == NULL:
|
jpayne@68
|
2416 raise ValueError(
|
jpayne@68
|
2417 "reference sequence for '{}' (tid={}) not found".format(
|
jpayne@68
|
2418 d.header.target_name[d.tid], d.tid))
|
jpayne@68
|
2419
|
jpayne@68
|
2420 # realign read - changes base qualities
|
jpayne@68
|
2421 if d.seq != NULL and d.compute_baq:
|
jpayne@68
|
2422 # 4th option to realign is flag:
|
jpayne@68
|
2423 # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4
|
jpayne@68
|
2424 if d.redo_baq:
|
jpayne@68
|
2425 sam_prob_realn(b, d.seq, d.seq_len, 7)
|
jpayne@68
|
2426 else:
|
jpayne@68
|
2427 sam_prob_realn(b, d.seq, d.seq_len, 3)
|
jpayne@68
|
2428
|
jpayne@68
|
2429 if d.seq != NULL and d.adjust_capq_threshold > 10:
|
jpayne@68
|
2430 q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold)
|
jpayne@68
|
2431 if q < 0:
|
jpayne@68
|
2432 continue
|
jpayne@68
|
2433 elif b.core.qual > q:
|
jpayne@68
|
2434 b.core.qual = q
|
jpayne@68
|
2435
|
jpayne@68
|
2436 if b.core.qual < d.min_mapping_quality:
|
jpayne@68
|
2437 continue
|
jpayne@68
|
2438 if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR):
|
jpayne@68
|
2439 continue
|
jpayne@68
|
2440
|
jpayne@68
|
2441 break
|
jpayne@68
|
2442
|
jpayne@68
|
2443 return ret
|
jpayne@68
|
2444
|
jpayne@68
|
2445
|
jpayne@68
|
2446 cdef class IteratorColumn:
|
jpayne@68
|
2447 '''abstract base class for iterators over columns.
|
jpayne@68
|
2448
|
jpayne@68
|
2449 IteratorColumn objects wrap the pileup functionality of samtools.
|
jpayne@68
|
2450
|
jpayne@68
|
2451 For reasons of efficiency, the iterator points to the current
|
jpayne@68
|
2452 pileup buffer. The pileup buffer is updated at every iteration.
|
jpayne@68
|
2453 This might cause some unexpected behaviour. For example,
|
jpayne@68
|
2454 consider the conversion to a list::
|
jpayne@68
|
2455
|
jpayne@68
|
2456 f = AlignmentFile("file.bam", "rb")
|
jpayne@68
|
2457 result = list(f.pileup())
|
jpayne@68
|
2458
|
jpayne@68
|
2459 Here, ``result`` will contain ``n`` objects of type
|
jpayne@68
|
2460 :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
|
jpayne@68
|
2461 ``result`` will contain the same information.
|
jpayne@68
|
2462
|
jpayne@68
|
2463 The desired behaviour can be achieved by list comprehension::
|
jpayne@68
|
2464
|
jpayne@68
|
2465 result = [x.pileups() for x in f.pileup()]
|
jpayne@68
|
2466
|
jpayne@68
|
2467 ``result`` will be a list of ``n`` lists of objects of type
|
jpayne@68
|
2468 :class:`~pysam.PileupRead`.
|
jpayne@68
|
2469
|
jpayne@68
|
2470 If the iterator is associated with a :class:`~pysam.Fastafile`
|
jpayne@68
|
2471 using the :meth:`add_reference` method, then the iterator will
|
jpayne@68
|
2472 export the current sequence via the methods :meth:`get_sequence`
|
jpayne@68
|
2473 and :meth:`seq_len`.
|
jpayne@68
|
2474
|
jpayne@68
|
2475 See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
|
jpayne@68
|
2476 '''
|
jpayne@68
|
2477
|
jpayne@68
|
2478 def __cinit__( self, AlignmentFile samfile, **kwargs):
|
jpayne@68
|
2479 self.samfile = samfile
|
jpayne@68
|
2480 self.fastafile = kwargs.get("fastafile", None)
|
jpayne@68
|
2481 self.stepper = kwargs.get("stepper", "samtools")
|
jpayne@68
|
2482 self.max_depth = kwargs.get("max_depth", 8000)
|
jpayne@68
|
2483 self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
|
jpayne@68
|
2484 self.min_base_quality = kwargs.get("min_base_quality", 13)
|
jpayne@68
|
2485 self.iterdata.seq = NULL
|
jpayne@68
|
2486 self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
|
jpayne@68
|
2487 self.iterdata.flag_require = kwargs.get("flag_require", 0)
|
jpayne@68
|
2488 self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
|
jpayne@68
|
2489 self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
|
jpayne@68
|
2490 self.iterdata.compute_baq = kwargs.get("compute_baq", True)
|
jpayne@68
|
2491 self.iterdata.redo_baq = kwargs.get("redo_baq", False)
|
jpayne@68
|
2492 self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)
|
jpayne@68
|
2493
|
jpayne@68
|
2494 self.tid = 0
|
jpayne@68
|
2495 self.pos = 0
|
jpayne@68
|
2496 self.n_plp = 0
|
jpayne@68
|
2497 self.plp = NULL
|
jpayne@68
|
2498 self.pileup_iter = <bam_mplp_t>NULL
|
jpayne@68
|
2499
|
jpayne@68
|
2500 def __iter__(self):
|
jpayne@68
|
2501 return self
|
jpayne@68
|
2502
|
jpayne@68
|
2503 cdef int cnext(self):
|
jpayne@68
|
2504 '''perform next iteration.
|
jpayne@68
|
2505 '''
|
jpayne@68
|
2506 # do not release gil here because of call-backs
|
jpayne@68
|
2507 cdef int ret = bam_mplp_auto(self.pileup_iter,
|
jpayne@68
|
2508 &self.tid,
|
jpayne@68
|
2509 &self.pos,
|
jpayne@68
|
2510 &self.n_plp,
|
jpayne@68
|
2511 &self.plp)
|
jpayne@68
|
2512 return ret
|
jpayne@68
|
2513
|
jpayne@68
|
2514 cdef char * get_sequence(self):
|
jpayne@68
|
2515 '''return current reference sequence underlying the iterator.
|
jpayne@68
|
2516 '''
|
jpayne@68
|
2517 return self.iterdata.seq
|
jpayne@68
|
2518
|
jpayne@68
|
2519 property seq_len:
|
jpayne@68
|
2520 '''current sequence length.'''
|
jpayne@68
|
2521 def __get__(self):
|
jpayne@68
|
2522 return self.iterdata.seq_len
|
jpayne@68
|
2523
|
jpayne@68
|
2524 def add_reference(self, FastaFile fastafile):
|
jpayne@68
|
2525 '''
|
jpayne@68
|
2526 add reference sequences in `fastafile` to iterator.'''
|
jpayne@68
|
2527 self.fastafile = fastafile
|
jpayne@68
|
2528 if self.iterdata.seq != NULL:
|
jpayne@68
|
2529 free(self.iterdata.seq)
|
jpayne@68
|
2530 self.iterdata.tid = -1
|
jpayne@68
|
2531 self.iterdata.fastafile = self.fastafile.fastafile
|
jpayne@68
|
2532
|
jpayne@68
|
2533 def has_reference(self):
|
jpayne@68
|
2534 '''
|
jpayne@68
|
2535 return true if iterator is associated with a reference'''
|
jpayne@68
|
2536 return self.fastafile
|
jpayne@68
|
2537
|
jpayne@68
|
2538 cdef _setup_iterator(self,
|
jpayne@68
|
2539 int tid,
|
jpayne@68
|
2540 int start,
|
jpayne@68
|
2541 int stop,
|
jpayne@68
|
2542 int multiple_iterators=0):
|
jpayne@68
|
2543 '''setup the iterator structure'''
|
jpayne@68
|
2544
|
jpayne@68
|
2545 self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
|
jpayne@68
|
2546 self.iterdata.htsfile = self.samfile.htsfile
|
jpayne@68
|
2547 self.iterdata.iter = self.iter.iter
|
jpayne@68
|
2548 self.iterdata.seq = NULL
|
jpayne@68
|
2549 self.iterdata.tid = -1
|
jpayne@68
|
2550 self.iterdata.header = self.samfile.header.ptr
|
jpayne@68
|
2551
|
jpayne@68
|
2552 if self.fastafile is not None:
|
jpayne@68
|
2553 self.iterdata.fastafile = self.fastafile.fastafile
|
jpayne@68
|
2554 else:
|
jpayne@68
|
2555 self.iterdata.fastafile = NULL
|
jpayne@68
|
2556
|
jpayne@68
|
2557 # Free any previously allocated memory before reassigning
|
jpayne@68
|
2558 # pileup_iter
|
jpayne@68
|
2559 self._free_pileup_iter()
|
jpayne@68
|
2560
|
jpayne@68
|
2561 cdef void * data[1]
|
jpayne@68
|
2562 data[0] = <void*>&self.iterdata
|
jpayne@68
|
2563
|
jpayne@68
|
2564 if self.stepper is None or self.stepper == "all":
|
jpayne@68
|
2565 with nogil:
|
jpayne@68
|
2566 self.pileup_iter = bam_mplp_init(1,
|
jpayne@68
|
2567 <bam_plp_auto_f>&__advance_all,
|
jpayne@68
|
2568 data)
|
jpayne@68
|
2569 elif self.stepper == "nofilter":
|
jpayne@68
|
2570 with nogil:
|
jpayne@68
|
2571 self.pileup_iter = bam_mplp_init(1,
|
jpayne@68
|
2572 <bam_plp_auto_f>&__advance_nofilter,
|
jpayne@68
|
2573 data)
|
jpayne@68
|
2574 elif self.stepper == "samtools":
|
jpayne@68
|
2575 with nogil:
|
jpayne@68
|
2576 self.pileup_iter = bam_mplp_init(1,
|
jpayne@68
|
2577 <bam_plp_auto_f>&__advance_samtools,
|
jpayne@68
|
2578 data)
|
jpayne@68
|
2579 else:
|
jpayne@68
|
2580 raise ValueError(
|
jpayne@68
|
2581 "unknown stepper option `%s` in IteratorColumn" % self.stepper)
|
jpayne@68
|
2582
|
jpayne@68
|
2583 if self.max_depth:
|
jpayne@68
|
2584 with nogil:
|
jpayne@68
|
2585 bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)
|
jpayne@68
|
2586
|
jpayne@68
|
2587 if self.ignore_overlaps:
|
jpayne@68
|
2588 with nogil:
|
jpayne@68
|
2589 bam_mplp_init_overlaps(self.pileup_iter)
|
jpayne@68
|
2590
|
jpayne@68
|
2591 cdef _setup_raw_rest_iterator(self):
|
jpayne@68
|
2592 '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST'''
|
jpayne@68
|
2593
|
jpayne@68
|
2594 self.iter = None
|
jpayne@68
|
2595 self.iterdata.iter = NULL
|
jpayne@68
|
2596 self.iterdata.htsfile = self.samfile.htsfile
|
jpayne@68
|
2597 self.iterdata.seq = NULL
|
jpayne@68
|
2598 self.iterdata.tid = -1
|
jpayne@68
|
2599 self.iterdata.header = self.samfile.header.ptr
|
jpayne@68
|
2600
|
jpayne@68
|
2601 if self.fastafile is not None:
|
jpayne@68
|
2602 self.iterdata.fastafile = self.fastafile.fastafile
|
jpayne@68
|
2603 else:
|
jpayne@68
|
2604 self.iterdata.fastafile = NULL
|
jpayne@68
|
2605
|
jpayne@68
|
2606 # Free any previously allocated memory before reassigning
|
jpayne@68
|
2607 # pileup_iter
|
jpayne@68
|
2608 self._free_pileup_iter()
|
jpayne@68
|
2609
|
jpayne@68
|
2610 cdef void * data[1]
|
jpayne@68
|
2611 data[0] = <void*>&self.iterdata
|
jpayne@68
|
2612
|
jpayne@68
|
2613 if self.stepper is None or self.stepper == "all":
|
jpayne@68
|
2614 with nogil:
|
jpayne@68
|
2615 self.pileup_iter = bam_mplp_init(1,
|
jpayne@68
|
2616 <bam_plp_auto_f>&__advance_raw_all,
|
jpayne@68
|
2617 data)
|
jpayne@68
|
2618 elif self.stepper == "nofilter":
|
jpayne@68
|
2619 with nogil:
|
jpayne@68
|
2620 self.pileup_iter = bam_mplp_init(1,
|
jpayne@68
|
2621 <bam_plp_auto_f>&__advance_raw_nofilter,
|
jpayne@68
|
2622 data)
|
jpayne@68
|
2623 elif self.stepper == "samtools":
|
jpayne@68
|
2624 with nogil:
|
jpayne@68
|
2625 self.pileup_iter = bam_mplp_init(1,
|
jpayne@68
|
2626 <bam_plp_auto_f>&__advance_samtools,
|
jpayne@68
|
2627 data)
|
jpayne@68
|
2628 else:
|
jpayne@68
|
2629 raise ValueError(
|
jpayne@68
|
2630 "unknown stepper option `%s` in IteratorColumn" % self.stepper)
|
jpayne@68
|
2631
|
jpayne@68
|
2632 if self.max_depth:
|
jpayne@68
|
2633 with nogil:
|
jpayne@68
|
2634 bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)
|
jpayne@68
|
2635
|
jpayne@68
|
2636 if self.ignore_overlaps:
|
jpayne@68
|
2637 with nogil:
|
jpayne@68
|
2638 bam_mplp_init_overlaps(self.pileup_iter)
|
jpayne@68
|
2639
|
jpayne@68
|
2640 cdef reset(self, tid, start, stop):
|
jpayne@68
|
2641 '''reset iterator position.
|
jpayne@68
|
2642
|
jpayne@68
|
2643 This permits using the iterator multiple times without
|
jpayne@68
|
2644 having to incur the full set-up costs.
|
jpayne@68
|
2645 '''
|
jpayne@68
|
2646 if self.iter is None:
|
jpayne@68
|
2647 raise TypeError("Raw iterator set up without region cannot be reset")
|
jpayne@68
|
2648
|
jpayne@68
|
2649 self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
|
jpayne@68
|
2650 self.iterdata.iter = self.iter.iter
|
jpayne@68
|
2651
|
jpayne@68
|
2652 # invalidate sequence if different tid
|
jpayne@68
|
2653 if self.tid != tid:
|
jpayne@68
|
2654 if self.iterdata.seq != NULL:
|
jpayne@68
|
2655 free(self.iterdata.seq)
|
jpayne@68
|
2656 self.iterdata.seq = NULL
|
jpayne@68
|
2657 self.iterdata.tid = -1
|
jpayne@68
|
2658
|
jpayne@68
|
2659 # self.pileup_iter = bam_mplp_init(1
|
jpayne@68
|
2660 # &__advancepileup,
|
jpayne@68
|
2661 # &self.iterdata)
|
jpayne@68
|
2662 with nogil:
|
jpayne@68
|
2663 bam_mplp_reset(self.pileup_iter)
|
jpayne@68
|
2664
|
jpayne@68
|
2665 cdef _free_pileup_iter(self):
|
jpayne@68
|
2666 '''free the memory alloc'd by bam_plp_init.
|
jpayne@68
|
2667
|
jpayne@68
|
2668 This is needed before setup_iterator allocates another
|
jpayne@68
|
2669 pileup_iter, or else memory will be lost. '''
|
jpayne@68
|
2670 if self.pileup_iter != <bam_mplp_t>NULL:
|
jpayne@68
|
2671 with nogil:
|
jpayne@68
|
2672 bam_mplp_reset(self.pileup_iter)
|
jpayne@68
|
2673 bam_mplp_destroy(self.pileup_iter)
|
jpayne@68
|
2674 self.pileup_iter = <bam_mplp_t>NULL
|
jpayne@68
|
2675
|
jpayne@68
|
2676 def __dealloc__(self):
|
jpayne@68
|
2677 # reset in order to avoid memory leak messages for iterators
|
jpayne@68
|
2678 # that have not been fully consumed
|
jpayne@68
|
2679 self._free_pileup_iter()
|
jpayne@68
|
2680 self.plp = <const bam_pileup1_t*>NULL
|
jpayne@68
|
2681
|
jpayne@68
|
2682 if self.iterdata.seq != NULL:
|
jpayne@68
|
2683 free(self.iterdata.seq)
|
jpayne@68
|
2684 self.iterdata.seq = NULL
|
jpayne@68
|
2685
|
jpayne@68
|
2686 # backwards compatibility
|
jpayne@68
|
2687
|
jpayne@68
|
2688 def hasReference(self):
|
jpayne@68
|
2689 return self.has_reference()
|
jpayne@68
|
2690 cdef char * getSequence(self):
|
jpayne@68
|
2691 return self.get_sequence()
|
jpayne@68
|
2692 def addReference(self, FastaFile fastafile):
|
jpayne@68
|
2693 return self.add_reference(fastafile)
|
jpayne@68
|
2694
|
jpayne@68
|
2695
|
jpayne@68
|
2696 cdef class IteratorColumnRegion(IteratorColumn):
|
jpayne@68
|
2697 '''iterates over a region only.
|
jpayne@68
|
2698 '''
|
jpayne@68
|
2699 def __cinit__(self,
|
jpayne@68
|
2700 AlignmentFile samfile,
|
jpayne@68
|
2701 int tid = 0,
|
jpayne@68
|
2702 int start = 0,
|
jpayne@68
|
2703 int stop = MAX_POS,
|
jpayne@68
|
2704 int truncate = False,
|
jpayne@68
|
2705 int multiple_iterators = True,
|
jpayne@68
|
2706 **kwargs ):
|
jpayne@68
|
2707
|
jpayne@68
|
2708 # initialize iterator. Multiple iterators not available
|
jpayne@68
|
2709 # for CRAM.
|
jpayne@68
|
2710 if multiple_iterators and samfile.is_cram:
|
jpayne@68
|
2711 warnings.warn("multiple_iterators not implemented for CRAM")
|
jpayne@68
|
2712 multiple_iterators = False
|
jpayne@68
|
2713
|
jpayne@68
|
2714 self._setup_iterator(tid, start, stop, multiple_iterators)
|
jpayne@68
|
2715 self.start = start
|
jpayne@68
|
2716 self.stop = stop
|
jpayne@68
|
2717 self.truncate = truncate
|
jpayne@68
|
2718
|
jpayne@68
|
2719 def __next__(self):
|
jpayne@68
|
2720
|
jpayne@68
|
2721 cdef int n
|
jpayne@68
|
2722
|
jpayne@68
|
2723 while 1:
|
jpayne@68
|
2724 n = self.cnext()
|
jpayne@68
|
2725 if n < 0:
|
jpayne@68
|
2726 raise ValueError("error during iteration" )
|
jpayne@68
|
2727
|
jpayne@68
|
2728 if n == 0:
|
jpayne@68
|
2729 raise StopIteration
|
jpayne@68
|
2730
|
jpayne@68
|
2731 if self.truncate:
|
jpayne@68
|
2732 if self.start > self.pos:
|
jpayne@68
|
2733 continue
|
jpayne@68
|
2734 if self.pos >= self.stop:
|
jpayne@68
|
2735 raise StopIteration
|
jpayne@68
|
2736
|
jpayne@68
|
2737 return makePileupColumn(&self.plp,
|
jpayne@68
|
2738 self.tid,
|
jpayne@68
|
2739 self.pos,
|
jpayne@68
|
2740 self.n_plp,
|
jpayne@68
|
2741 self.min_base_quality,
|
jpayne@68
|
2742 self.iterdata.seq,
|
jpayne@68
|
2743 self.samfile.header)
|
jpayne@68
|
2744
|
jpayne@68
|
2745
|
jpayne@68
|
2746 cdef class IteratorColumnAllRefs(IteratorColumn):
|
jpayne@68
|
2747 """iterates over all columns by chaining iterators over each reference
|
jpayne@68
|
2748 """
|
jpayne@68
|
2749
|
jpayne@68
|
2750 def __cinit__(self,
|
jpayne@68
|
2751 AlignmentFile samfile,
|
jpayne@68
|
2752 **kwargs):
|
jpayne@68
|
2753
|
jpayne@68
|
2754 # no iteration over empty files
|
jpayne@68
|
2755 if not samfile.nreferences:
|
jpayne@68
|
2756 raise StopIteration
|
jpayne@68
|
2757
|
jpayne@68
|
2758 # initialize iterator
|
jpayne@68
|
2759 self._setup_iterator(self.tid, 0, MAX_POS, 1)
|
jpayne@68
|
2760
|
jpayne@68
|
2761 def __next__(self):
|
jpayne@68
|
2762
|
jpayne@68
|
2763 cdef int n
|
jpayne@68
|
2764 while 1:
|
jpayne@68
|
2765 n = self.cnext()
|
jpayne@68
|
2766 if n < 0:
|
jpayne@68
|
2767 raise ValueError("error during iteration")
|
jpayne@68
|
2768
|
jpayne@68
|
2769 # proceed to next reference or stop
|
jpayne@68
|
2770 if n == 0:
|
jpayne@68
|
2771 self.tid += 1
|
jpayne@68
|
2772 if self.tid < self.samfile.nreferences:
|
jpayne@68
|
2773 self._setup_iterator(self.tid, 0, MAX_POS, 0)
|
jpayne@68
|
2774 else:
|
jpayne@68
|
2775 raise StopIteration
|
jpayne@68
|
2776 continue
|
jpayne@68
|
2777
|
jpayne@68
|
2778 # return result, if within same reference
|
jpayne@68
|
2779 return makePileupColumn(&self.plp,
|
jpayne@68
|
2780 self.tid,
|
jpayne@68
|
2781 self.pos,
|
jpayne@68
|
2782 self.n_plp,
|
jpayne@68
|
2783 self.min_base_quality,
|
jpayne@68
|
2784 self.iterdata.seq,
|
jpayne@68
|
2785 self.samfile.header)
|
jpayne@68
|
2786
|
jpayne@68
|
2787
|
jpayne@68
|
2788 cdef class IteratorColumnAll(IteratorColumn):
|
jpayne@68
|
2789 """iterates over all columns, without using an index
|
jpayne@68
|
2790 """
|
jpayne@68
|
2791
|
jpayne@68
|
2792 def __cinit__(self,
|
jpayne@68
|
2793 AlignmentFile samfile,
|
jpayne@68
|
2794 **kwargs):
|
jpayne@68
|
2795
|
jpayne@68
|
2796 self._setup_raw_rest_iterator()
|
jpayne@68
|
2797
|
jpayne@68
|
2798 def __next__(self):
|
jpayne@68
|
2799
|
jpayne@68
|
2800 cdef int n
|
jpayne@68
|
2801 n = self.cnext()
|
jpayne@68
|
2802 if n < 0:
|
jpayne@68
|
2803 raise ValueError("error during iteration")
|
jpayne@68
|
2804
|
jpayne@68
|
2805 if n == 0:
|
jpayne@68
|
2806 raise StopIteration
|
jpayne@68
|
2807
|
jpayne@68
|
2808 return makePileupColumn(&self.plp,
|
jpayne@68
|
2809 self.tid,
|
jpayne@68
|
2810 self.pos,
|
jpayne@68
|
2811 self.n_plp,
|
jpayne@68
|
2812 self.min_base_quality,
|
jpayne@68
|
2813 self.iterdata.seq,
|
jpayne@68
|
2814 self.samfile.header)
|
jpayne@68
|
2815
|
jpayne@68
|
2816
|
jpayne@68
|
2817 cdef class SNPCall:
|
jpayne@68
|
2818 '''the results of a SNP call.'''
|
jpayne@68
|
2819 cdef int _tid
|
jpayne@68
|
2820 cdef int _pos
|
jpayne@68
|
2821 cdef char _reference_base
|
jpayne@68
|
2822 cdef char _genotype
|
jpayne@68
|
2823 cdef int _consensus_quality
|
jpayne@68
|
2824 cdef int _snp_quality
|
jpayne@68
|
2825 cdef int _rms_mapping_quality
|
jpayne@68
|
2826 cdef int _coverage
|
jpayne@68
|
2827
|
jpayne@68
|
2828 property tid:
|
jpayne@68
|
2829 '''the chromosome ID as is defined in the header'''
|
jpayne@68
|
2830 def __get__(self):
|
jpayne@68
|
2831 return self._tid
|
jpayne@68
|
2832
|
jpayne@68
|
2833 property pos:
|
jpayne@68
|
2834 '''nucleotide position of SNP.'''
|
jpayne@68
|
2835 def __get__(self): return self._pos
|
jpayne@68
|
2836
|
jpayne@68
|
2837 property reference_base:
|
jpayne@68
|
2838 '''reference base at pos. ``N`` if no reference sequence supplied.'''
|
jpayne@68
|
2839 def __get__(self): return from_string_and_size( &self._reference_base, 1 )
|
jpayne@68
|
2840
|
jpayne@68
|
2841 property genotype:
|
jpayne@68
|
2842 '''the genotype called.'''
|
jpayne@68
|
2843 def __get__(self): return from_string_and_size( &self._genotype, 1 )
|
jpayne@68
|
2844
|
jpayne@68
|
2845 property consensus_quality:
|
jpayne@68
|
2846 '''the genotype quality (Phred-scaled).'''
|
jpayne@68
|
2847 def __get__(self): return self._consensus_quality
|
jpayne@68
|
2848
|
jpayne@68
|
2849 property snp_quality:
|
jpayne@68
|
2850 '''the snp quality (Phred scaled) - probability of consensus being
|
jpayne@68
|
2851 identical to reference sequence.'''
|
jpayne@68
|
2852 def __get__(self): return self._snp_quality
|
jpayne@68
|
2853
|
jpayne@68
|
2854 property mapping_quality:
|
jpayne@68
|
2855 '''the root mean square (rms) of the mapping quality of all reads
|
jpayne@68
|
2856 involved in the call.'''
|
jpayne@68
|
2857 def __get__(self): return self._rms_mapping_quality
|
jpayne@68
|
2858
|
jpayne@68
|
2859 property coverage:
|
jpayne@68
|
2860 '''coverage or read depth - the number of reads involved in the call.'''
|
jpayne@68
|
2861 def __get__(self): return self._coverage
|
jpayne@68
|
2862
|
jpayne@68
|
2863 def __str__(self):
|
jpayne@68
|
2864
|
jpayne@68
|
2865 return "\t".join( map(str, (
|
jpayne@68
|
2866 self.tid,
|
jpayne@68
|
2867 self.pos,
|
jpayne@68
|
2868 self.reference_base,
|
jpayne@68
|
2869 self.genotype,
|
jpayne@68
|
2870 self.consensus_quality,
|
jpayne@68
|
2871 self.snp_quality,
|
jpayne@68
|
2872 self.mapping_quality,
|
jpayne@68
|
2873 self.coverage ) ) )
|
jpayne@68
|
2874
|
jpayne@68
|
2875
|
jpayne@68
|
2876 cdef class IndexedReads:
|
jpayne@68
|
2877 """Index a Sam/BAM-file by query name while keeping the
|
jpayne@68
|
2878 original sort order intact.
|
jpayne@68
|
2879
|
jpayne@68
|
2880 The index is kept in memory and can be substantial.
|
jpayne@68
|
2881
|
jpayne@68
|
2882 By default, the file is re-opened to avoid conflicts if multiple
|
jpayne@68
|
2883 operators work on the same file. Set `multiple_iterators` = False
|
jpayne@68
|
2884 to not re-open `samfile`.
|
jpayne@68
|
2885
|
jpayne@68
|
2886 Parameters
|
jpayne@68
|
2887 ----------
|
jpayne@68
|
2888
|
jpayne@68
|
2889 samfile : AlignmentFile
|
jpayne@68
|
2890 File to be indexed.
|
jpayne@68
|
2891
|
jpayne@68
|
2892 multiple_iterators : bool
|
jpayne@68
|
2893 Flag indicating whether the file should be reopened. Reopening prevents
|
jpayne@68
|
2894 existing iterators being affected by the indexing.
|
jpayne@68
|
2895
|
jpayne@68
|
2896 """
|
jpayne@68
|
2897
|
jpayne@68
|
2898 def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
|
jpayne@68
|
2899 cdef char *cfilename
|
jpayne@68
|
2900
|
jpayne@68
|
2901 # makes sure that samfile stays alive as long as this
|
jpayne@68
|
2902 # object is alive.
|
jpayne@68
|
2903 self.samfile = samfile
|
jpayne@68
|
2904 cdef bam_hdr_t * hdr = NULL
|
jpayne@68
|
2905 assert samfile.is_bam, "can only apply IndexReads on bam files"
|
jpayne@68
|
2906
|
jpayne@68
|
2907 # multiple_iterators the file - note that this makes the iterator
|
jpayne@68
|
2908 # slow and causes pileup to slow down significantly.
|
jpayne@68
|
2909 if multiple_iterators:
|
jpayne@68
|
2910 cfilename = samfile.filename
|
jpayne@68
|
2911 with nogil:
|
jpayne@68
|
2912 self.htsfile = hts_open(cfilename, 'r')
|
jpayne@68
|
2913 if self.htsfile == NULL:
|
jpayne@68
|
2914 raise OSError("unable to reopen htsfile")
|
jpayne@68
|
2915
|
jpayne@68
|
2916 # need to advance in newly opened file to position after header
|
jpayne@68
|
2917 # better: use seek/tell?
|
jpayne@68
|
2918 with nogil:
|
jpayne@68
|
2919 hdr = sam_hdr_read(self.htsfile)
|
jpayne@68
|
2920 if hdr == NULL:
|
jpayne@68
|
2921 raise OSError("unable to read header information")
|
jpayne@68
|
2922 self.header = makeAlignmentHeader(hdr)
|
jpayne@68
|
2923 self.owns_samfile = True
|
jpayne@68
|
2924 else:
|
jpayne@68
|
2925 self.htsfile = self.samfile.htsfile
|
jpayne@68
|
2926 self.header = samfile.header
|
jpayne@68
|
2927 self.owns_samfile = False
|
jpayne@68
|
2928
|
jpayne@68
|
2929 def build(self):
|
jpayne@68
|
2930 '''build the index.'''
|
jpayne@68
|
2931
|
jpayne@68
|
2932 self.index = collections.defaultdict(list)
|
jpayne@68
|
2933
|
jpayne@68
|
2934 # this method will start indexing from the current file position
|
jpayne@68
|
2935 cdef int ret = 1
|
jpayne@68
|
2936 cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
|
jpayne@68
|
2937 if b == NULL:
|
jpayne@68
|
2938 raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))
|
jpayne@68
|
2939
|
jpayne@68
|
2940 cdef uint64_t pos
|
jpayne@68
|
2941 cdef bam_hdr_t * hdr = self.header.ptr
|
jpayne@68
|
2942
|
jpayne@68
|
2943 while ret > 0:
|
jpayne@68
|
2944 with nogil:
|
jpayne@68
|
2945 pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
|
jpayne@68
|
2946 ret = sam_read1(self.htsfile,
|
jpayne@68
|
2947 hdr,
|
jpayne@68
|
2948 b)
|
jpayne@68
|
2949
|
jpayne@68
|
2950 if ret > 0:
|
jpayne@68
|
2951 qname = charptr_to_str(pysam_bam_get_qname(b))
|
jpayne@68
|
2952 self.index[qname].append(pos)
|
jpayne@68
|
2953
|
jpayne@68
|
2954 bam_destroy1(b)
|
jpayne@68
|
2955
|
jpayne@68
|
2956 def find(self, query_name):
|
jpayne@68
|
2957 '''find `query_name` in index.
|
jpayne@68
|
2958
|
jpayne@68
|
2959 Returns
|
jpayne@68
|
2960 -------
|
jpayne@68
|
2961
|
jpayne@68
|
2962 IteratorRowSelection
|
jpayne@68
|
2963 Returns an iterator over all reads with query_name.
|
jpayne@68
|
2964
|
jpayne@68
|
2965 Raises
|
jpayne@68
|
2966 ------
|
jpayne@68
|
2967
|
jpayne@68
|
2968 KeyError
|
jpayne@68
|
2969 if the `query_name` is not in the index.
|
jpayne@68
|
2970
|
jpayne@68
|
2971 '''
|
jpayne@68
|
2972 if query_name in self.index:
|
jpayne@68
|
2973 return IteratorRowSelection(
|
jpayne@68
|
2974 self.samfile,
|
jpayne@68
|
2975 self.index[query_name],
|
jpayne@68
|
2976 multiple_iterators = False)
|
jpayne@68
|
2977 else:
|
jpayne@68
|
2978 raise KeyError("read %s not found" % query_name)
|
jpayne@68
|
2979
|
jpayne@68
|
2980 def __dealloc__(self):
|
jpayne@68
|
2981 if self.owns_samfile:
|
jpayne@68
|
2982 hts_close(self.htsfile)
|