jpayne@69
|
1 # cython: language_level=3
|
jpayne@69
|
2 # cython: embedsignature=True
|
jpayne@69
|
3 # cython: profile=True
|
jpayne@69
|
4 # adds doc-strings for sphinx
|
jpayne@69
|
5
|
jpayne@69
|
6 ########################################################################
|
jpayne@69
|
7 ########################################################################
|
jpayne@69
|
8 ## Cython cimports
|
jpayne@69
|
9 ########################################################################
|
jpayne@69
|
10
|
jpayne@69
|
11 from posix.unistd cimport dup
|
jpayne@69
|
12 from libc.errno cimport errno
|
jpayne@69
|
13 from libc.stdint cimport INT32_MAX
|
jpayne@69
|
14 from cpython cimport PyBytes_FromStringAndSize
|
jpayne@69
|
15 from pysam.libchtslib cimport *
|
jpayne@69
|
16 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
|
jpayne@69
|
17 from pysam.libcutils cimport encode_filename, from_string_and_size, libc_whence_from_io
|
jpayne@69
|
18
|
jpayne@69
|
19
|
jpayne@69
|
20 ########################################################################
|
jpayne@69
|
21 ########################################################################
|
jpayne@69
|
22 ## Python imports
|
jpayne@69
|
23 ########################################################################
|
jpayne@69
|
24
|
jpayne@69
|
25 import os
|
jpayne@69
|
26 import io
|
jpayne@69
|
27 import re
|
jpayne@69
|
28 from warnings import warn
|
jpayne@69
|
29
|
jpayne@69
|
30
|
jpayne@69
|
31 ########################################################################
|
jpayne@69
|
32 ########################################################################
|
jpayne@69
|
33 ## Constants
|
jpayne@69
|
34 ########################################################################
|
jpayne@69
|
35
|
jpayne@69
|
36 __all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile']
|
jpayne@69
|
37
|
jpayne@69
|
38 # maximum genomic coordinace
|
jpayne@69
|
39 cdef int MAX_POS = (1 << 31) - 1
|
jpayne@69
|
40
|
jpayne@69
|
41 cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
|
jpayne@69
|
42 cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
|
jpayne@69
|
43 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
|
jpayne@69
|
44 cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
|
jpayne@69
|
45
|
jpayne@69
|
46
|
jpayne@69
|
47 ########################################################################
|
jpayne@69
|
48 ########################################################################
|
jpayne@69
|
49 ## Verbosity functions
|
jpayne@69
|
50 ########################################################################
|
jpayne@69
|
51
|
jpayne@69
|
52
|
jpayne@69
|
53 cpdef set_verbosity(int verbosity):
|
jpayne@69
|
54 """Set htslib's hts_verbose global variable to the specified value."""
|
jpayne@69
|
55 return hts_set_verbosity(verbosity)
|
jpayne@69
|
56
|
jpayne@69
|
57 cpdef get_verbosity():
|
jpayne@69
|
58 """Return the value of htslib's hts_verbose global variable."""
|
jpayne@69
|
59 return hts_get_verbosity()
|
jpayne@69
|
60
|
jpayne@69
|
61
|
jpayne@69
|
62 ########################################################################
|
jpayne@69
|
63 ########################################################################
|
jpayne@69
|
64 ## HFile wrapper class
|
jpayne@69
|
65 ########################################################################
|
jpayne@69
|
66
|
jpayne@69
|
67 cdef class HFile(object):
|
jpayne@69
|
68 cdef hFILE *fp
|
jpayne@69
|
69 cdef readonly object name, mode
|
jpayne@69
|
70
|
jpayne@69
|
71 def __init__(self, name, mode='r', closefd=True):
|
jpayne@69
|
72 self._open(name, mode, closefd=True)
|
jpayne@69
|
73
|
jpayne@69
|
74 def __dealloc__(self):
|
jpayne@69
|
75 self.close()
|
jpayne@69
|
76
|
jpayne@69
|
77 @property
|
jpayne@69
|
78 def closed(self):
|
jpayne@69
|
79 return self.fp == NULL
|
jpayne@69
|
80
|
jpayne@69
|
81 cdef _open(self, name, mode, closefd=True):
|
jpayne@69
|
82 self.name = name
|
jpayne@69
|
83 self.mode = mode
|
jpayne@69
|
84
|
jpayne@69
|
85 mode = force_bytes(mode)
|
jpayne@69
|
86
|
jpayne@69
|
87 if isinstance(name, int):
|
jpayne@69
|
88 if self.fp != NULL:
|
jpayne@69
|
89 name = dup(name)
|
jpayne@69
|
90 self.fp = hdopen(name, mode)
|
jpayne@69
|
91 else:
|
jpayne@69
|
92 name = encode_filename(name)
|
jpayne@69
|
93 self.fp = hopen(name, mode)
|
jpayne@69
|
94
|
jpayne@69
|
95 if not self.fp:
|
jpayne@69
|
96 raise IOError(errno, 'failed to open HFile', self.name)
|
jpayne@69
|
97
|
jpayne@69
|
98 def close(self):
|
jpayne@69
|
99 if self.fp == NULL:
|
jpayne@69
|
100 return
|
jpayne@69
|
101
|
jpayne@69
|
102 cdef hFILE *fp = self.fp
|
jpayne@69
|
103 self.fp = NULL
|
jpayne@69
|
104
|
jpayne@69
|
105 if hclose(fp) != 0:
|
jpayne@69
|
106 raise IOError(errno, 'failed to close HFile', self.name)
|
jpayne@69
|
107
|
jpayne@69
|
108 def fileno(self):
|
jpayne@69
|
109 if self.fp == NULL:
|
jpayne@69
|
110 raise IOError('operation on closed HFile')
|
jpayne@69
|
111 if isinstance(self.name, int):
|
jpayne@69
|
112 return self.name
|
jpayne@69
|
113 else:
|
jpayne@69
|
114 raise AttributeError('fileno not available')
|
jpayne@69
|
115
|
jpayne@69
|
116 def __enter__(self):
|
jpayne@69
|
117 return self
|
jpayne@69
|
118
|
jpayne@69
|
119 def __exit__(self, type, value, tb):
|
jpayne@69
|
120 self.close()
|
jpayne@69
|
121
|
jpayne@69
|
122 def __iter__(self):
|
jpayne@69
|
123 return self
|
jpayne@69
|
124
|
jpayne@69
|
125 def __next__(self):
|
jpayne@69
|
126 line = self.readline()
|
jpayne@69
|
127 if not line:
|
jpayne@69
|
128 raise StopIteration()
|
jpayne@69
|
129 return line
|
jpayne@69
|
130
|
jpayne@69
|
131 def flush(self):
|
jpayne@69
|
132 if self.fp == NULL:
|
jpayne@69
|
133 raise IOError('operation on closed HFile')
|
jpayne@69
|
134 if hflush(self.fp) != 0:
|
jpayne@69
|
135 raise IOError(herrno(self.fp), 'failed to flush HFile', self.name)
|
jpayne@69
|
136
|
jpayne@69
|
137 def isatty(self):
|
jpayne@69
|
138 if self.fp == NULL:
|
jpayne@69
|
139 raise IOError('operation on closed HFile')
|
jpayne@69
|
140 return False
|
jpayne@69
|
141
|
jpayne@69
|
142 def readable(self):
|
jpayne@69
|
143 return self.fp != NULL and 'r' in self.mode
|
jpayne@69
|
144
|
jpayne@69
|
145 def read(self, Py_ssize_t size=-1):
|
jpayne@69
|
146 if self.fp == NULL:
|
jpayne@69
|
147 raise IOError('operation on closed HFile')
|
jpayne@69
|
148
|
jpayne@69
|
149 if size == 0:
|
jpayne@69
|
150 return b''
|
jpayne@69
|
151
|
jpayne@69
|
152 cdef list parts = []
|
jpayne@69
|
153 cdef bytes part
|
jpayne@69
|
154 cdef Py_ssize_t chunk_size, ret, bytes_read = 0
|
jpayne@69
|
155 cdef char *cpart
|
jpayne@69
|
156
|
jpayne@69
|
157 while size == -1 or bytes_read < size:
|
jpayne@69
|
158 chunk_size = 4096
|
jpayne@69
|
159 if size != -1:
|
jpayne@69
|
160 chunk_size = min(chunk_size, size - bytes_read)
|
jpayne@69
|
161
|
jpayne@69
|
162 part = PyBytes_FromStringAndSize(NULL, chunk_size)
|
jpayne@69
|
163 cpart = <char *>part
|
jpayne@69
|
164 ret = hread(self.fp, <void *>cpart, chunk_size)
|
jpayne@69
|
165
|
jpayne@69
|
166 if ret < 0:
|
jpayne@69
|
167 IOError(herrno(self.fp), 'failed to read HFile', self.name)
|
jpayne@69
|
168 elif not ret:
|
jpayne@69
|
169 break
|
jpayne@69
|
170
|
jpayne@69
|
171 bytes_read += ret
|
jpayne@69
|
172
|
jpayne@69
|
173 if ret < chunk_size:
|
jpayne@69
|
174 part = cpart[:ret]
|
jpayne@69
|
175
|
jpayne@69
|
176 parts.append(part)
|
jpayne@69
|
177
|
jpayne@69
|
178 return b''.join(parts)
|
jpayne@69
|
179
|
jpayne@69
|
180 def readall(self):
|
jpayne@69
|
181 return self.read()
|
jpayne@69
|
182
|
jpayne@69
|
183 def readinto(self, buf):
|
jpayne@69
|
184 if self.fp == NULL:
|
jpayne@69
|
185 raise IOError('operation on closed HFile')
|
jpayne@69
|
186
|
jpayne@69
|
187 size = len(buf)
|
jpayne@69
|
188
|
jpayne@69
|
189 if size == 0:
|
jpayne@69
|
190 return size
|
jpayne@69
|
191
|
jpayne@69
|
192 mv = memoryview(buf)
|
jpayne@69
|
193 ret = hread(self.fp, <void *>mv, size)
|
jpayne@69
|
194
|
jpayne@69
|
195 if ret < 0:
|
jpayne@69
|
196 IOError(herrno(self.fp), 'failed to read HFile', self.name)
|
jpayne@69
|
197
|
jpayne@69
|
198 return ret
|
jpayne@69
|
199
|
jpayne@69
|
200 def readline(self, Py_ssize_t size=-1):
|
jpayne@69
|
201 if self.fp == NULL:
|
jpayne@69
|
202 raise IOError('operation on closed HFile')
|
jpayne@69
|
203
|
jpayne@69
|
204 if size == 0:
|
jpayne@69
|
205 return b''
|
jpayne@69
|
206
|
jpayne@69
|
207 cdef list parts = []
|
jpayne@69
|
208 cdef bytes part
|
jpayne@69
|
209 cdef Py_ssize_t chunk_size, ret, bytes_read = 0
|
jpayne@69
|
210 cdef char *cpart
|
jpayne@69
|
211
|
jpayne@69
|
212 while size == -1 or bytes_read < size:
|
jpayne@69
|
213 chunk_size = 4096
|
jpayne@69
|
214 if size != -1:
|
jpayne@69
|
215 chunk_size = min(chunk_size, size - bytes_read)
|
jpayne@69
|
216
|
jpayne@69
|
217 part = PyBytes_FromStringAndSize(NULL, chunk_size)
|
jpayne@69
|
218 cpart = <char *>part
|
jpayne@69
|
219
|
jpayne@69
|
220 # Python bytes objects allocate an extra byte for a null terminator
|
jpayne@69
|
221 ret = hgetln(cpart, chunk_size+1, self.fp)
|
jpayne@69
|
222
|
jpayne@69
|
223 if ret < 0:
|
jpayne@69
|
224 IOError(herrno(self.fp), 'failed to read HFile', self.name)
|
jpayne@69
|
225 elif not ret:
|
jpayne@69
|
226 break
|
jpayne@69
|
227
|
jpayne@69
|
228 bytes_read += ret
|
jpayne@69
|
229
|
jpayne@69
|
230 if ret < chunk_size:
|
jpayne@69
|
231 part = cpart[:ret]
|
jpayne@69
|
232 cpart = <char *>part
|
jpayne@69
|
233
|
jpayne@69
|
234 parts.append(part)
|
jpayne@69
|
235
|
jpayne@69
|
236 if cpart[ret-1] == b'\n':
|
jpayne@69
|
237 break
|
jpayne@69
|
238
|
jpayne@69
|
239 return b''.join(parts)
|
jpayne@69
|
240
|
jpayne@69
|
241 def readlines(self):
|
jpayne@69
|
242 return list(self)
|
jpayne@69
|
243
|
jpayne@69
|
244 def seek(self, Py_ssize_t offset, int whence=io.SEEK_SET):
|
jpayne@69
|
245 if self.fp == NULL:
|
jpayne@69
|
246 raise IOError('operation on closed HFile')
|
jpayne@69
|
247
|
jpayne@69
|
248 cdef Py_ssize_t off = hseek(self.fp, offset, libc_whence_from_io(whence))
|
jpayne@69
|
249
|
jpayne@69
|
250 if off < 0:
|
jpayne@69
|
251 raise IOError(herrno(self.fp), 'seek failed on HFile', self.name)
|
jpayne@69
|
252
|
jpayne@69
|
253 return off
|
jpayne@69
|
254
|
jpayne@69
|
255 def tell(self):
|
jpayne@69
|
256 if self.fp == NULL:
|
jpayne@69
|
257 raise IOError('operation on closed HFile')
|
jpayne@69
|
258
|
jpayne@69
|
259 ret = htell(self.fp)
|
jpayne@69
|
260
|
jpayne@69
|
261 if ret < 0:
|
jpayne@69
|
262 raise IOError(herrno(self.fp), 'tell failed on HFile', self.name)
|
jpayne@69
|
263
|
jpayne@69
|
264 return ret
|
jpayne@69
|
265
|
jpayne@69
|
266 def seekable(self):
|
jpayne@69
|
267 return self.fp != NULL
|
jpayne@69
|
268
|
jpayne@69
|
269 def truncate(self, size=None):
|
jpayne@69
|
270 raise NotImplementedError()
|
jpayne@69
|
271
|
jpayne@69
|
272 def writable(self):
|
jpayne@69
|
273 return self.fp != NULL and 'w' in self.mode
|
jpayne@69
|
274
|
jpayne@69
|
275 def write(self, bytes b):
|
jpayne@69
|
276 if self.fp == NULL:
|
jpayne@69
|
277 raise IOError('operation on closed HFile')
|
jpayne@69
|
278
|
jpayne@69
|
279 got = hwrite(self.fp, <void *>b, len(b))
|
jpayne@69
|
280
|
jpayne@69
|
281 if got < 0:
|
jpayne@69
|
282 raise IOError(herrno(self.fp), 'write failed on HFile', self.name)
|
jpayne@69
|
283
|
jpayne@69
|
284 return got
|
jpayne@69
|
285
|
jpayne@69
|
286 def writelines(self, lines):
|
jpayne@69
|
287 for line in lines:
|
jpayne@69
|
288 self.write(line)
|
jpayne@69
|
289
|
jpayne@69
|
290
|
jpayne@69
|
291 ########################################################################
|
jpayne@69
|
292 ########################################################################
|
jpayne@69
|
293 ## Helpers for backward compatibility to hide the difference between
|
jpayne@69
|
294 ## boolean properties and methods
|
jpayne@69
|
295 ########################################################################
|
jpayne@69
|
296
|
jpayne@69
|
297 class CallableValue(object):
|
jpayne@69
|
298 def __init__(self, value):
|
jpayne@69
|
299 self.value = value
|
jpayne@69
|
300 def __call__(self):
|
jpayne@69
|
301 return self.value
|
jpayne@69
|
302 def __bool__(self):
|
jpayne@69
|
303 return self.value
|
jpayne@69
|
304 def __nonzero__(self):
|
jpayne@69
|
305 return self.value
|
jpayne@69
|
306 def __eq__(self, other):
|
jpayne@69
|
307 return self.value == other
|
jpayne@69
|
308 def __ne__(self, other):
|
jpayne@69
|
309 return self.value != other
|
jpayne@69
|
310
|
jpayne@69
|
311
|
jpayne@69
|
312 CTrue = CallableValue(True)
|
jpayne@69
|
313 CFalse = CallableValue(False)
|
jpayne@69
|
314
|
jpayne@69
|
315
|
jpayne@69
|
316 ########################################################################
|
jpayne@69
|
317 ########################################################################
|
jpayne@69
|
318 ## HTSFile wrapper class (base class for AlignmentFile and VariantFile)
|
jpayne@69
|
319 ########################################################################
|
jpayne@69
|
320
|
jpayne@69
|
321 cdef class HTSFile(object):
|
jpayne@69
|
322 """
|
jpayne@69
|
323 Base class for HTS file types
|
jpayne@69
|
324 """
|
jpayne@69
|
325
|
jpayne@69
|
326 def __cinit__(self, *args, **kwargs):
|
jpayne@69
|
327 self.htsfile = NULL
|
jpayne@69
|
328 self.threads = 1
|
jpayne@69
|
329 self.duplicate_filehandle = True
|
jpayne@69
|
330
|
jpayne@69
|
331 def close(self):
|
jpayne@69
|
332 if self.htsfile:
|
jpayne@69
|
333 hts_close(self.htsfile)
|
jpayne@69
|
334 self.htsfile = NULL
|
jpayne@69
|
335
|
jpayne@69
|
336 def __dealloc__(self):
|
jpayne@69
|
337 if self.htsfile:
|
jpayne@69
|
338 hts_close(self.htsfile)
|
jpayne@69
|
339 self.htsfile = NULL
|
jpayne@69
|
340
|
jpayne@69
|
341 def flush(self):
|
jpayne@69
|
342 """Flush any buffered data to the underlying output stream."""
|
jpayne@69
|
343 if self.htsfile:
|
jpayne@69
|
344 if hts_flush(self.htsfile) < 0:
|
jpayne@69
|
345 raise OSError(errno, f'Flushing {type(self).__name__} failed', force_str(self.filename))
|
jpayne@69
|
346
|
jpayne@69
|
347 def check_truncation(self, ignore_truncation=False):
|
jpayne@69
|
348 """Check if file is truncated."""
|
jpayne@69
|
349 if not self.htsfile:
|
jpayne@69
|
350 return
|
jpayne@69
|
351
|
jpayne@69
|
352 if self.htsfile.format.compression != bgzf:
|
jpayne@69
|
353 return
|
jpayne@69
|
354
|
jpayne@69
|
355 cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile)
|
jpayne@69
|
356 if not bgzfp:
|
jpayne@69
|
357 return
|
jpayne@69
|
358
|
jpayne@69
|
359 cdef int ret = bgzf_check_EOF(bgzfp)
|
jpayne@69
|
360 if ret < 0:
|
jpayne@69
|
361 raise IOError(errno, 'error checking for EOF marker')
|
jpayne@69
|
362 elif ret == 0:
|
jpayne@69
|
363 msg = 'no BGZF EOF marker; file may be truncated'.format(self.filename)
|
jpayne@69
|
364 if ignore_truncation:
|
jpayne@69
|
365 warn(msg)
|
jpayne@69
|
366 else:
|
jpayne@69
|
367 raise IOError(msg)
|
jpayne@69
|
368
|
jpayne@69
|
369 def __enter__(self):
|
jpayne@69
|
370 return self
|
jpayne@69
|
371
|
jpayne@69
|
372 def __exit__(self, exc_type, exc_value, traceback):
|
jpayne@69
|
373 self.close()
|
jpayne@69
|
374 return False
|
jpayne@69
|
375
|
jpayne@69
|
376 @property
|
jpayne@69
|
377 def category(self):
|
jpayne@69
|
378 """General file format category. One of UNKNOWN, ALIGNMENTS,
|
jpayne@69
|
379 VARIANTS, INDEX, REGIONS"""
|
jpayne@69
|
380 if not self.htsfile:
|
jpayne@69
|
381 raise ValueError('metadata not available on closed file')
|
jpayne@69
|
382 return FORMAT_CATEGORIES[self.htsfile.format.category]
|
jpayne@69
|
383
|
jpayne@69
|
384 @property
|
jpayne@69
|
385 def format(self):
|
jpayne@69
|
386 """File format.
|
jpayne@69
|
387
|
jpayne@69
|
388 One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
|
jpayne@69
|
389 BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
|
jpayne@69
|
390 """
|
jpayne@69
|
391 if not self.htsfile:
|
jpayne@69
|
392 raise ValueError('metadata not available on closed file')
|
jpayne@69
|
393 return FORMATS[self.htsfile.format.format]
|
jpayne@69
|
394
|
jpayne@69
|
395 @property
|
jpayne@69
|
396 def version(self):
|
jpayne@69
|
397 """Tuple of file format version numbers (major, minor)"""
|
jpayne@69
|
398 if not self.htsfile:
|
jpayne@69
|
399 raise ValueError('metadata not available on closed file')
|
jpayne@69
|
400 return self.htsfile.format.version.major, self.htsfile.format.version.minor
|
jpayne@69
|
401
|
jpayne@69
|
402 @property
|
jpayne@69
|
403 def compression(self):
|
jpayne@69
|
404 """File compression.
|
jpayne@69
|
405
|
jpayne@69
|
406 One of NONE, GZIP, BGZF, CUSTOM."""
|
jpayne@69
|
407 if not self.htsfile:
|
jpayne@69
|
408 raise ValueError('metadata not available on closed file')
|
jpayne@69
|
409 return COMPRESSION[self.htsfile.format.compression]
|
jpayne@69
|
410
|
jpayne@69
|
411 @property
|
jpayne@69
|
412 def description(self):
|
jpayne@69
|
413 """Vaguely human readable description of the file format"""
|
jpayne@69
|
414 if not self.htsfile:
|
jpayne@69
|
415 raise ValueError('metadata not available on closed file')
|
jpayne@69
|
416 cdef char *desc = hts_format_description(&self.htsfile.format)
|
jpayne@69
|
417 try:
|
jpayne@69
|
418 return charptr_to_str(desc)
|
jpayne@69
|
419 finally:
|
jpayne@69
|
420 free(desc)
|
jpayne@69
|
421
|
jpayne@69
|
422 @property
|
jpayne@69
|
423 def is_open(self):
|
jpayne@69
|
424 """return True if HTSFile is open and in a valid state."""
|
jpayne@69
|
425 return CTrue if self.htsfile != NULL else CFalse
|
jpayne@69
|
426
|
jpayne@69
|
427 @property
|
jpayne@69
|
428 def is_closed(self):
|
jpayne@69
|
429 """return True if HTSFile is closed."""
|
jpayne@69
|
430 return self.htsfile == NULL
|
jpayne@69
|
431
|
jpayne@69
|
432 @property
|
jpayne@69
|
433 def closed(self):
|
jpayne@69
|
434 """return True if HTSFile is closed."""
|
jpayne@69
|
435 return self.htsfile == NULL
|
jpayne@69
|
436
|
jpayne@69
|
437 @property
|
jpayne@69
|
438 def is_write(self):
|
jpayne@69
|
439 """return True if HTSFile is open for writing"""
|
jpayne@69
|
440 return self.htsfile != NULL and self.htsfile.is_write != 0
|
jpayne@69
|
441
|
jpayne@69
|
442 @property
|
jpayne@69
|
443 def is_read(self):
|
jpayne@69
|
444 """return True if HTSFile is open for reading"""
|
jpayne@69
|
445 return self.htsfile != NULL and self.htsfile.is_write == 0
|
jpayne@69
|
446
|
jpayne@69
|
447 @property
|
jpayne@69
|
448 def is_sam(self):
|
jpayne@69
|
449 """return True if HTSFile is reading or writing a SAM alignment file"""
|
jpayne@69
|
450 return self.htsfile != NULL and self.htsfile.format.format == sam
|
jpayne@69
|
451
|
jpayne@69
|
452 @property
|
jpayne@69
|
453 def is_bam(self):
|
jpayne@69
|
454 """return True if HTSFile is reading or writing a BAM alignment file"""
|
jpayne@69
|
455 return self.htsfile != NULL and self.htsfile.format.format == bam
|
jpayne@69
|
456
|
jpayne@69
|
457 @property
|
jpayne@69
|
458 def is_cram(self):
|
jpayne@69
|
459 """return True if HTSFile is reading or writing a BAM alignment file"""
|
jpayne@69
|
460 return self.htsfile != NULL and self.htsfile.format.format == cram
|
jpayne@69
|
461
|
jpayne@69
|
462 @property
|
jpayne@69
|
463 def is_vcf(self):
|
jpayne@69
|
464 """return True if HTSFile is reading or writing a VCF variant file"""
|
jpayne@69
|
465 return self.htsfile != NULL and self.htsfile.format.format == vcf
|
jpayne@69
|
466
|
jpayne@69
|
467 @property
|
jpayne@69
|
468 def is_bcf(self):
|
jpayne@69
|
469 """return True if HTSFile is reading or writing a BCF variant file"""
|
jpayne@69
|
470 return self.htsfile != NULL and self.htsfile.format.format == bcf
|
jpayne@69
|
471
|
jpayne@69
|
472 def reset(self):
|
jpayne@69
|
473 """reset file position to beginning of file just after the header.
|
jpayne@69
|
474
|
jpayne@69
|
475 Returns
|
jpayne@69
|
476 -------
|
jpayne@69
|
477
|
jpayne@69
|
478 The file position after moving the file pointer. : pointer
|
jpayne@69
|
479
|
jpayne@69
|
480 """
|
jpayne@69
|
481 return self.seek(self.start_offset)
|
jpayne@69
|
482
|
jpayne@69
|
483 def seek(self, uint64_t offset, int whence=io.SEEK_SET):
|
jpayne@69
|
484 """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`."""
|
jpayne@69
|
485 if not self.is_open:
|
jpayne@69
|
486 raise ValueError('I/O operation on closed file')
|
jpayne@69
|
487 if self.is_stream:
|
jpayne@69
|
488 raise IOError('seek not available in streams')
|
jpayne@69
|
489
|
jpayne@69
|
490 whence = libc_whence_from_io(whence)
|
jpayne@69
|
491
|
jpayne@69
|
492 cdef int64_t ret
|
jpayne@69
|
493 if self.htsfile.format.compression == bgzf:
|
jpayne@69
|
494 with nogil:
|
jpayne@69
|
495 ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence)
|
jpayne@69
|
496 elif self.htsfile.format.compression == no_compression:
|
jpayne@69
|
497 ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1
|
jpayne@69
|
498 else:
|
jpayne@69
|
499 raise NotImplementedError("seek not implemented in files compressed by method {}".format(
|
jpayne@69
|
500 self.htsfile.format.compression))
|
jpayne@69
|
501 return ret
|
jpayne@69
|
502
|
jpayne@69
|
503 def tell(self):
|
jpayne@69
|
504 """return current file position, see :meth:`pysam.HTSFile.seek`."""
|
jpayne@69
|
505 if not self.is_open:
|
jpayne@69
|
506 raise ValueError('I/O operation on closed file')
|
jpayne@69
|
507 if self.is_stream:
|
jpayne@69
|
508 raise IOError('tell not available in streams')
|
jpayne@69
|
509
|
jpayne@69
|
510 cdef int64_t ret
|
jpayne@69
|
511 if self.htsfile.format.compression == bgzf:
|
jpayne@69
|
512 with nogil:
|
jpayne@69
|
513 ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
|
jpayne@69
|
514 elif self.htsfile.format.compression == no_compression:
|
jpayne@69
|
515 ret = htell(self.htsfile.fp.hfile)
|
jpayne@69
|
516 elif self.htsfile.format.format == cram:
|
jpayne@69
|
517 with nogil:
|
jpayne@69
|
518 ret = htell(cram_fd_get_fp(self.htsfile.fp.cram))
|
jpayne@69
|
519 else:
|
jpayne@69
|
520 raise NotImplementedError("seek not implemented in files compressed by method {}".format(
|
jpayne@69
|
521 self.htsfile.format.compression))
|
jpayne@69
|
522
|
jpayne@69
|
523 return ret
|
jpayne@69
|
524
|
jpayne@69
|
525 cdef htsFile *_open_htsfile(self) except? NULL:
|
jpayne@69
|
526 cdef char *cfilename
|
jpayne@69
|
527 cdef char *cmode = self.mode
|
jpayne@69
|
528 cdef int fd, dup_fd, threads
|
jpayne@69
|
529
|
jpayne@69
|
530 threads = self.threads - 1
|
jpayne@69
|
531 if isinstance(self.filename, bytes):
|
jpayne@69
|
532 cfilename = self.filename
|
jpayne@69
|
533 with nogil:
|
jpayne@69
|
534 htsfile = hts_open(cfilename, cmode)
|
jpayne@69
|
535 if htsfile != NULL:
|
jpayne@69
|
536 hts_set_threads(htsfile, threads)
|
jpayne@69
|
537 return htsfile
|
jpayne@69
|
538 else:
|
jpayne@69
|
539 if isinstance(self.filename, int):
|
jpayne@69
|
540 fd = self.filename
|
jpayne@69
|
541 else:
|
jpayne@69
|
542 fd = self.filename.fileno()
|
jpayne@69
|
543
|
jpayne@69
|
544 if self.duplicate_filehandle:
|
jpayne@69
|
545 dup_fd = dup(fd)
|
jpayne@69
|
546 else:
|
jpayne@69
|
547 dup_fd = fd
|
jpayne@69
|
548
|
jpayne@69
|
549 # Replicate mode normalization done in hts_open_format
|
jpayne@69
|
550 smode = self.mode.replace(b'b', b'').replace(b'c', b'')
|
jpayne@69
|
551 if b'b' in self.mode:
|
jpayne@69
|
552 smode += b'b'
|
jpayne@69
|
553 elif b'c' in self.mode:
|
jpayne@69
|
554 smode += b'c'
|
jpayne@69
|
555 cmode = smode
|
jpayne@69
|
556
|
jpayne@69
|
557 hfile = hdopen(dup_fd, cmode)
|
jpayne@69
|
558 if hfile == NULL:
|
jpayne@69
|
559 raise IOError('Cannot create hfile')
|
jpayne@69
|
560
|
jpayne@69
|
561 try:
|
jpayne@69
|
562 # filename.name can be an int
|
jpayne@69
|
563 filename = str(self.filename.name)
|
jpayne@69
|
564 except AttributeError:
|
jpayne@69
|
565 filename = '<fd:{}>'.format(fd)
|
jpayne@69
|
566
|
jpayne@69
|
567 filename = encode_filename(filename)
|
jpayne@69
|
568 cfilename = filename
|
jpayne@69
|
569 with nogil:
|
jpayne@69
|
570 htsfile = hts_hopen(hfile, cfilename, cmode)
|
jpayne@69
|
571 if htsfile != NULL:
|
jpayne@69
|
572 hts_set_threads(htsfile, threads)
|
jpayne@69
|
573 return htsfile
|
jpayne@69
|
574
|
jpayne@69
|
575 def add_hts_options(self, format_options=None):
|
jpayne@69
|
576 """Given a list of key=value format option strings, add them to an open htsFile
|
jpayne@69
|
577 """
|
jpayne@69
|
578 cdef int rval
|
jpayne@69
|
579 cdef hts_opt *opts = NULL
|
jpayne@69
|
580
|
jpayne@69
|
581 if format_options:
|
jpayne@69
|
582 for format_option in format_options:
|
jpayne@69
|
583 rval = hts_opt_add(&opts, format_option)
|
jpayne@69
|
584 if rval != 0:
|
jpayne@69
|
585 if opts != NULL:
|
jpayne@69
|
586 hts_opt_free(opts)
|
jpayne@69
|
587 raise RuntimeError('Invalid format option ({}) specified'.format(format_option))
|
jpayne@69
|
588 if opts != NULL:
|
jpayne@69
|
589 rval = hts_opt_apply(self.htsfile, opts)
|
jpayne@69
|
590 if rval != 0:
|
jpayne@69
|
591 hts_opt_free(opts)
|
jpayne@69
|
592 raise RuntimeError('An error occurred while applying the requested format options')
|
jpayne@69
|
593 hts_opt_free(opts)
|
jpayne@69
|
594
|
jpayne@69
|
595 def parse_region(self, contig=None, start=None, stop=None,
|
jpayne@69
|
596 region=None, tid=None,
|
jpayne@69
|
597 reference=None, end=None):
|
jpayne@69
|
598 """parse alternative ways to specify a genomic region. A region can
|
jpayne@69
|
599 either be specified by :term:`contig`, `start` and
|
jpayne@69
|
600 `stop`. `start` and `stop` denote 0-based, half-open
|
jpayne@69
|
601 intervals. :term:`reference` and `end` are also accepted for
|
jpayne@69
|
602 backward compatibility as synonyms for :term:`contig` and
|
jpayne@69
|
603 `stop`, respectively.
|
jpayne@69
|
604
|
jpayne@69
|
605 Alternatively, a samtools :term:`region` string can be
|
jpayne@69
|
606 supplied.
|
jpayne@69
|
607
|
jpayne@69
|
608 If any of the coordinates are missing they will be replaced by
|
jpayne@69
|
609 the minimum (`start`) or maximum (`stop`) coordinate.
|
jpayne@69
|
610
|
jpayne@69
|
611 Note that region strings are 1-based inclusive, while `start`
|
jpayne@69
|
612 and `stop` denote an interval in 0-based, half-open
|
jpayne@69
|
613 coordinates (like BED files and Python slices).
|
jpayne@69
|
614
|
jpayne@69
|
615 If `contig` or `region` or are ``*``, unmapped reads at the end
|
jpayne@69
|
616 of a BAM file will be returned. Setting either to ``.`` will
|
jpayne@69
|
617 iterate from the beginning of the file.
|
jpayne@69
|
618
|
jpayne@69
|
619 Returns
|
jpayne@69
|
620 -------
|
jpayne@69
|
621
|
jpayne@69
|
622 tuple :
|
jpayne@69
|
623 a tuple of `flag`, :term:`tid`, `start` and
|
jpayne@69
|
624 `stop`. The flag indicates whether no coordinates were
|
jpayne@69
|
625 supplied and the genomic region is the complete genomic space.
|
jpayne@69
|
626
|
jpayne@69
|
627 Raises
|
jpayne@69
|
628 ------
|
jpayne@69
|
629
|
jpayne@69
|
630 ValueError
|
jpayne@69
|
631 for invalid or out of bounds regions.
|
jpayne@69
|
632
|
jpayne@69
|
633 """
|
jpayne@69
|
634 cdef int rtid
|
jpayne@69
|
635 cdef int32_t rstart
|
jpayne@69
|
636 cdef int32_t rstop
|
jpayne@69
|
637
|
jpayne@69
|
638 if reference is not None:
|
jpayne@69
|
639 if contig is not None:
|
jpayne@69
|
640 raise ValueError('contig and reference should not both be specified')
|
jpayne@69
|
641 contig = reference
|
jpayne@69
|
642
|
jpayne@69
|
643 if end is not None:
|
jpayne@69
|
644 if stop is not None:
|
jpayne@69
|
645 raise ValueError('stop and end should not both be specified')
|
jpayne@69
|
646 stop = end
|
jpayne@69
|
647
|
jpayne@69
|
648 if contig is None and tid is None and region is None:
|
jpayne@69
|
649 return 0, 0, 0, MAX_POS
|
jpayne@69
|
650
|
jpayne@69
|
651 rtid = -1
|
jpayne@69
|
652 rstart = 0
|
jpayne@69
|
653 rstop = MAX_POS
|
jpayne@69
|
654 if start is not None:
|
jpayne@69
|
655 try:
|
jpayne@69
|
656 rstart = start
|
jpayne@69
|
657 except OverflowError:
|
jpayne@69
|
658 raise ValueError('start out of range (%i)' % start)
|
jpayne@69
|
659
|
jpayne@69
|
660 if stop is not None:
|
jpayne@69
|
661 try:
|
jpayne@69
|
662 rstop = stop
|
jpayne@69
|
663 except OverflowError:
|
jpayne@69
|
664 raise ValueError('stop out of range (%i)' % stop)
|
jpayne@69
|
665
|
jpayne@69
|
666 if region:
|
jpayne@69
|
667 region = force_str(region)
|
jpayne@69
|
668 if ":" in region:
|
jpayne@69
|
669 contig, coord = region.split(":")
|
jpayne@69
|
670 parts = coord.split("-")
|
jpayne@69
|
671 rstart = int(parts[0]) - 1
|
jpayne@69
|
672 if len(parts) >= 1:
|
jpayne@69
|
673 rstop = int(parts[1])
|
jpayne@69
|
674 else:
|
jpayne@69
|
675 contig = region
|
jpayne@69
|
676
|
jpayne@69
|
677 if tid is not None:
|
jpayne@69
|
678 if not self.is_valid_tid(tid):
|
jpayne@69
|
679 raise IndexError('invalid tid')
|
jpayne@69
|
680 rtid = tid
|
jpayne@69
|
681 else:
|
jpayne@69
|
682 if contig == "*":
|
jpayne@69
|
683 rtid = HTS_IDX_NOCOOR
|
jpayne@69
|
684 elif contig == ".":
|
jpayne@69
|
685 rtid = HTS_IDX_START
|
jpayne@69
|
686 else:
|
jpayne@69
|
687 rtid = self.get_tid(contig)
|
jpayne@69
|
688 if rtid < 0:
|
jpayne@69
|
689 raise ValueError('invalid contig `%s`' % contig)
|
jpayne@69
|
690
|
jpayne@69
|
691 if rstart > rstop:
|
jpayne@69
|
692 raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop))
|
jpayne@69
|
693 if not 0 <= rstart < MAX_POS:
|
jpayne@69
|
694 raise ValueError('start out of range (%i)' % rstart)
|
jpayne@69
|
695 if not 0 <= rstop <= MAX_POS:
|
jpayne@69
|
696 raise ValueError('stop out of range (%i)' % rstop)
|
jpayne@69
|
697
|
jpayne@69
|
698 return 1, rtid, rstart, rstop
|
jpayne@69
|
699
|
jpayne@69
|
700 def is_valid_tid(self, tid):
|
jpayne@69
|
701 """
|
jpayne@69
|
702 return True if the numerical :term:`tid` is valid; False otherwise.
|
jpayne@69
|
703
|
jpayne@69
|
704 returns -1 if contig is not known.
|
jpayne@69
|
705 """
|
jpayne@69
|
706 raise NotImplementedError()
|
jpayne@69
|
707
|
jpayne@69
|
708 def is_valid_reference_name(self, contig):
|
jpayne@69
|
709 """
|
jpayne@69
|
710 return True if the contig name :term:`contig` is valid; False otherwise.
|
jpayne@69
|
711 """
|
jpayne@69
|
712 return self.get_tid(contig) != -1
|
jpayne@69
|
713
|
jpayne@69
|
714 def get_tid(self, contig):
|
jpayne@69
|
715 """
|
jpayne@69
|
716 return the numerical :term:`tid` corresponding to
|
jpayne@69
|
717 :term:`contig`
|
jpayne@69
|
718
|
jpayne@69
|
719 returns -1 if contig is not known.
|
jpayne@69
|
720 """
|
jpayne@69
|
721 raise NotImplementedError()
|
jpayne@69
|
722
|
jpayne@69
|
723 def get_reference_name(self, tid):
|
jpayne@69
|
724 """
|
jpayne@69
|
725 return :term:`contig` name corresponding to numerical :term:`tid`
|
jpayne@69
|
726 """
|
jpayne@69
|
727 raise NotImplementedError()
|