jpayne@69
|
1 # cython: language_level=3
|
jpayne@69
|
2 # cython: embedsignature=True
|
jpayne@69
|
3 # cython: profile=True
|
jpayne@69
|
4 ###############################################################################
|
jpayne@69
|
5 ###############################################################################
|
jpayne@69
|
6 ## Cython wrapper for htslib VCF/BCF reader/writer
|
jpayne@69
|
7 ###############################################################################
|
jpayne@69
|
8 #
|
jpayne@69
|
9 # NOTICE: This code is incomplete and preliminary. It offers a nearly
|
jpayne@69
|
10 # complete Pythonic interface to VCF/BCF metadata and data with
|
jpayne@69
|
11 # reading and writing capability. Documentation and a unit test suite
|
jpayne@69
|
12 # are in the works. The code is best tested under Python 2, but
|
jpayne@69
|
13 # should also work with Python 3. Please report any remaining
|
jpayne@69
|
14 # str/bytes issues on the github site when using Python 3 and I'll
|
jpayne@69
|
15 # fix them promptly.
|
jpayne@69
|
16 #
|
jpayne@69
|
17 # Here is a minimal example of how to use the API:
|
jpayne@69
|
18 #
|
jpayne@69
|
19 # $ cat bcfview.py
|
jpayne@69
|
20 # import sys
|
jpayne@69
|
21 # from pysam import VariantFile
|
jpayne@69
|
22 #
|
jpayne@69
|
23 # bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
|
jpayne@69
|
24 # bcf_out = VariantFile('-', 'w', header=bcf_in.header)
|
jpayne@69
|
25 #
|
jpayne@69
|
26 # for rec in bcf_in:
|
jpayne@69
|
27 # bcf_out.write(rec)
|
jpayne@69
|
28 #
|
jpayne@69
|
29 # Performance is fairly close to that of bcftools view. Here is an example
|
jpayne@69
|
30 # using some 1k Genomes data:
|
jpayne@69
|
31 #
|
jpayne@69
|
32 # $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
|
jpayne@69
|
33 # 1103799
|
jpayne@69
|
34 #
|
jpayne@69
|
35 # real 0m56.114s
|
jpayne@69
|
36 # user 1m4.489s
|
jpayne@69
|
37 # sys 0m3.102s
|
jpayne@69
|
38 #
|
jpayne@69
|
39 # $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
|
jpayne@69
|
40 # 1103800 # bcftools adds an extra header
|
jpayne@69
|
41 #
|
jpayne@69
|
42 # real 0m55.126s
|
jpayne@69
|
43 # user 1m3.502s
|
jpayne@69
|
44 # sys 0m3.459s
|
jpayne@69
|
45 #
|
jpayne@69
|
46 ###############################################################################
|
jpayne@69
|
47 #
|
jpayne@69
|
48 # TODO list:
|
jpayne@69
|
49 #
|
jpayne@69
|
50 # * more genotype methods
|
jpayne@69
|
51 # * unit test suite (perhaps py.test based)
|
jpayne@69
|
52 # * documentation
|
jpayne@69
|
53 # * pickle support
|
jpayne@69
|
54 # * left/right locus normalization
|
jpayne@69
|
55 # * fix reopen to re-use fd
|
jpayne@69
|
56 #
|
jpayne@69
|
57 ###############################################################################
|
jpayne@69
|
58 #
|
jpayne@69
|
59 # The MIT License
|
jpayne@69
|
60 #
|
jpayne@69
|
61 # Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com)
|
jpayne@69
|
62 #
|
jpayne@69
|
63 # Permission is hereby granted, free of charge, to any person obtaining a
|
jpayne@69
|
64 # copy of this software and associated documentation files (the "Software"),
|
jpayne@69
|
65 # to deal in the Software without restriction, including without limitation
|
jpayne@69
|
66 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
jpayne@69
|
67 # and/or sell copies of the Software, and to permit persons to whom the
|
jpayne@69
|
68 # Software is furnished to do so, subject to the following conditions:
|
jpayne@69
|
69 #
|
jpayne@69
|
70 # The above copyright notice and this permission notice shall be included in
|
jpayne@69
|
71 # all copies or substantial portions of the Software.
|
jpayne@69
|
72 #
|
jpayne@69
|
73 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
jpayne@69
|
74 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
jpayne@69
|
75 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
jpayne@69
|
76 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
jpayne@69
|
77 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
jpayne@69
|
78 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
jpayne@69
|
79 # DEALINGS IN THE SOFTWARE.
|
jpayne@69
|
80 #
|
jpayne@69
|
81 ###############################################################################
|
jpayne@69
|
82
|
jpayne@69
|
83 from __future__ import division, print_function
|
jpayne@69
|
84
|
jpayne@69
|
85 import os
|
jpayne@69
|
86 import sys
|
jpayne@69
|
87
|
jpayne@69
|
88 from libc.errno cimport errno, EPIPE
|
jpayne@69
|
89 from libc.string cimport strcmp, strpbrk, strerror
|
jpayne@69
|
90 from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
|
jpayne@69
|
91
|
jpayne@69
|
92 cimport cython
|
jpayne@69
|
93
|
jpayne@69
|
94 from cpython.object cimport PyObject
|
jpayne@69
|
95 from cpython.ref cimport Py_INCREF
|
jpayne@69
|
96 from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
|
jpayne@69
|
97 from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
|
jpayne@69
|
98 from cpython.bytes cimport PyBytes_FromStringAndSize
|
jpayne@69
|
99 from cpython.unicode cimport PyUnicode_DecodeUTF8
|
jpayne@69
|
100
|
jpayne@69
|
101 from pysam.libchtslib cimport HTSFile, hisremote
|
jpayne@69
|
102
|
jpayne@69
|
103 from pysam.utils import unquoted_str
|
jpayne@69
|
104
|
jpayne@69
|
105
|
jpayne@69
|
106 __all__ = ['VariantFile',
|
jpayne@69
|
107 'VariantHeader',
|
jpayne@69
|
108 'VariantHeaderRecord',
|
jpayne@69
|
109 'VariantHeaderRecords',
|
jpayne@69
|
110 'VariantMetadata',
|
jpayne@69
|
111 'VariantHeaderMetadata',
|
jpayne@69
|
112 'VariantContig',
|
jpayne@69
|
113 'VariantHeaderContigs',
|
jpayne@69
|
114 'VariantHeaderSamples',
|
jpayne@69
|
115 'VariantRecordFilter',
|
jpayne@69
|
116 'VariantRecordFormat',
|
jpayne@69
|
117 'VariantRecordInfo',
|
jpayne@69
|
118 'VariantRecordSamples',
|
jpayne@69
|
119 'VariantRecord',
|
jpayne@69
|
120 'VariantRecordSample',
|
jpayne@69
|
121 'BaseIndex',
|
jpayne@69
|
122 'BCFIndex',
|
jpayne@69
|
123 'TabixIndex',
|
jpayne@69
|
124 'BaseIterator',
|
jpayne@69
|
125 'BCFIterator',
|
jpayne@69
|
126 'TabixIterator',
|
jpayne@69
|
127 'VariantRecord']
|
jpayne@69
|
128
|
jpayne@69
|
129 ########################################################################
|
jpayne@69
|
130 ########################################################################
|
jpayne@69
|
131 ## Constants
|
jpayne@69
|
132 ########################################################################
|
jpayne@69
|
133
|
jpayne@69
|
134 cdef int MAX_POS = (1 << 31) - 1
|
jpayne@69
|
135 cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
|
jpayne@69
|
136 cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
|
jpayne@69
|
137 cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
|
jpayne@69
|
138
|
jpayne@69
|
139
|
jpayne@69
|
140 ########################################################################
|
jpayne@69
|
141 ########################################################################
|
jpayne@69
|
142 ## Python 3 compatibility functions
|
jpayne@69
|
143 ########################################################################
|
jpayne@69
|
144
|
jpayne@69
|
145 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
|
jpayne@69
|
146 from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes
|
jpayne@69
|
147
|
jpayne@69
|
148
|
jpayne@69
|
149 ########################################################################
|
jpayne@69
|
150 ########################################################################
|
jpayne@69
|
151 ## Sentinel object
|
jpayne@69
|
152 ########################################################################
|
jpayne@69
|
153
|
jpayne@69
|
154 cdef object _nothing = object()
|
jpayne@69
|
155
|
jpayne@69
|
156 ########################################################################
|
jpayne@69
|
157 ########################################################################
|
jpayne@69
|
158 ## VCF/BCF string intern system
|
jpayne@69
|
159 ########################################################################
|
jpayne@69
|
160
|
jpayne@69
|
161 cdef dict bcf_str_cache = {}
|
jpayne@69
|
162
|
jpayne@69
|
163 cdef inline bcf_str_cache_get_charptr(const char* s):
|
jpayne@69
|
164 if s == NULL:
|
jpayne@69
|
165 return None
|
jpayne@69
|
166
|
jpayne@69
|
167 cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
|
jpayne@69
|
168 if pystr:
|
jpayne@69
|
169 return <object>pystr
|
jpayne@69
|
170
|
jpayne@69
|
171 val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
|
jpayne@69
|
172
|
jpayne@69
|
173 PyDict_SetItemString(bcf_str_cache, s, val)
|
jpayne@69
|
174
|
jpayne@69
|
175 return val
|
jpayne@69
|
176
|
jpayne@69
|
177
|
jpayne@69
|
178 ########################################################################
|
jpayne@69
|
179 ########################################################################
|
jpayne@69
|
180 ## Genotype math
|
jpayne@69
|
181 ########################################################################
|
jpayne@69
|
182
|
jpayne@69
|
183 cdef int comb(int n, int k) except -1:
|
jpayne@69
|
184 """Return binomial coefficient: n choose k
|
jpayne@69
|
185
|
jpayne@69
|
186 >>> comb(5, 1)
|
jpayne@69
|
187 5
|
jpayne@69
|
188 >>> comb(5, 2)
|
jpayne@69
|
189 10
|
jpayne@69
|
190 >>> comb(2, 2)
|
jpayne@69
|
191 1
|
jpayne@69
|
192 >>> comb(100, 2)
|
jpayne@69
|
193 4950
|
jpayne@69
|
194 """
|
jpayne@69
|
195 if k > n:
|
jpayne@69
|
196 return 0
|
jpayne@69
|
197 elif k == n:
|
jpayne@69
|
198 return 1
|
jpayne@69
|
199 elif k > n // 2:
|
jpayne@69
|
200 k = n - k
|
jpayne@69
|
201
|
jpayne@69
|
202 cdef d, result
|
jpayne@69
|
203
|
jpayne@69
|
204 d = result = n - k + 1
|
jpayne@69
|
205 for i in range(2, k + 1):
|
jpayne@69
|
206 d += 1
|
jpayne@69
|
207 result *= d
|
jpayne@69
|
208 result //= i
|
jpayne@69
|
209 return result
|
jpayne@69
|
210
|
jpayne@69
|
211
|
jpayne@69
|
212 cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
|
jpayne@69
|
213 """Return the count of genotypes expected for the given ploidy and number of alleles.
|
jpayne@69
|
214
|
jpayne@69
|
215 >>> bcf_geno_combinations(1, 2)
|
jpayne@69
|
216 2
|
jpayne@69
|
217 >>> bcf_geno_combinations(2, 2)
|
jpayne@69
|
218 3
|
jpayne@69
|
219 >>> bcf_geno_combinations(2, 3)
|
jpayne@69
|
220 6
|
jpayne@69
|
221 >>> bcf_geno_combinations(3, 2)
|
jpayne@69
|
222 4
|
jpayne@69
|
223 """
|
jpayne@69
|
224 return comb(alleles + ploidy - 1, ploidy)
|
jpayne@69
|
225
|
jpayne@69
|
226
|
jpayne@69
|
227 ########################################################################
|
jpayne@69
|
228 ########################################################################
|
jpayne@69
|
229 ## Low level type conversion helpers
|
jpayne@69
|
230 ########################################################################
|
jpayne@69
|
231
|
jpayne@69
|
232
|
jpayne@69
|
233 cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
|
jpayne@69
|
234 return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)
|
jpayne@69
|
235
|
jpayne@69
|
236
|
jpayne@69
|
237 cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
|
jpayne@69
|
238 return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0
|
jpayne@69
|
239
|
jpayne@69
|
240
|
jpayne@69
|
241 cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:
|
jpayne@69
|
242
|
jpayne@69
|
243 if sample < 0:
|
jpayne@69
|
244 raise ValueError('genotype is only valid as a format field')
|
jpayne@69
|
245
|
jpayne@69
|
246 cdef int32_t *gt_arr = NULL
|
jpayne@69
|
247 cdef int ngt = 0
|
jpayne@69
|
248 ngt = bcf_get_genotypes(hdr, rec, >_arr, &ngt)
|
jpayne@69
|
249
|
jpayne@69
|
250 if ngt <= 0 or not gt_arr:
|
jpayne@69
|
251 return 0
|
jpayne@69
|
252
|
jpayne@69
|
253 assert ngt % rec.n_sample == 0
|
jpayne@69
|
254 cdef int max_ploidy = ngt // rec.n_sample
|
jpayne@69
|
255 cdef int32_t *gt = gt_arr + sample * max_ploidy
|
jpayne@69
|
256 cdef int ploidy = 0
|
jpayne@69
|
257
|
jpayne@69
|
258 while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
|
jpayne@69
|
259 gt += 1
|
jpayne@69
|
260 ploidy += 1
|
jpayne@69
|
261
|
jpayne@69
|
262 free(<void*>gt_arr)
|
jpayne@69
|
263
|
jpayne@69
|
264 return bcf_geno_combinations(ploidy, rec.n_allele)
|
jpayne@69
|
265
|
jpayne@69
|
266
|
jpayne@69
|
267 cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
|
jpayne@69
|
268 if not a:
|
jpayne@69
|
269 return None
|
jpayne@69
|
270 try:
|
jpayne@69
|
271 return tuple(charptr_to_str(a[i]) for i in range(n))
|
jpayne@69
|
272 finally:
|
jpayne@69
|
273 if free_after and a:
|
jpayne@69
|
274 free(a)
|
jpayne@69
|
275
|
jpayne@69
|
276
|
jpayne@69
|
277 cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
|
jpayne@69
|
278 cdef char *datac
|
jpayne@69
|
279 cdef int8_t *data8
|
jpayne@69
|
280 cdef int16_t *data16
|
jpayne@69
|
281 cdef int32_t *data32
|
jpayne@69
|
282 cdef float *dataf
|
jpayne@69
|
283 cdef int i
|
jpayne@69
|
284 cdef bytes b
|
jpayne@69
|
285
|
jpayne@69
|
286 if not data or n <= 0:
|
jpayne@69
|
287 return None
|
jpayne@69
|
288
|
jpayne@69
|
289 if type == BCF_BT_CHAR:
|
jpayne@69
|
290 datac = <char *>data
|
jpayne@69
|
291
|
jpayne@69
|
292 if not n:
|
jpayne@69
|
293 value = ()
|
jpayne@69
|
294 else:
|
jpayne@69
|
295 # Check if at least one null terminator is present
|
jpayne@69
|
296 if datac[n-1] == bcf_str_vector_end:
|
jpayne@69
|
297 # If so, create a string up to the first null terminator
|
jpayne@69
|
298 b = datac
|
jpayne@69
|
299 else:
|
jpayne@69
|
300 # Otherwise, copy the entire block
|
jpayne@69
|
301 b = datac[:n]
|
jpayne@69
|
302 value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
|
jpayne@69
|
303 else:
|
jpayne@69
|
304 value = []
|
jpayne@69
|
305 if type == BCF_BT_INT8:
|
jpayne@69
|
306 data8 = <int8_t *>data
|
jpayne@69
|
307 for i in range(n):
|
jpayne@69
|
308 if data8[i] == bcf_int8_vector_end:
|
jpayne@69
|
309 break
|
jpayne@69
|
310 value.append(data8[i] if data8[i] != bcf_int8_missing else None)
|
jpayne@69
|
311 elif type == BCF_BT_INT16:
|
jpayne@69
|
312 data16 = <int16_t *>data
|
jpayne@69
|
313 for i in range(n):
|
jpayne@69
|
314 if data16[i] == bcf_int16_vector_end:
|
jpayne@69
|
315 break
|
jpayne@69
|
316 value.append(data16[i] if data16[i] != bcf_int16_missing else None)
|
jpayne@69
|
317 elif type == BCF_BT_INT32:
|
jpayne@69
|
318 data32 = <int32_t *>data
|
jpayne@69
|
319 for i in range(n):
|
jpayne@69
|
320 if data32[i] == bcf_int32_vector_end:
|
jpayne@69
|
321 break
|
jpayne@69
|
322 value.append(data32[i] if data32[i] != bcf_int32_missing else None)
|
jpayne@69
|
323 elif type == BCF_BT_FLOAT:
|
jpayne@69
|
324 dataf = <float *>data
|
jpayne@69
|
325 for i in range(n):
|
jpayne@69
|
326 if bcf_float_is_vector_end(dataf[i]):
|
jpayne@69
|
327 break
|
jpayne@69
|
328 value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
|
jpayne@69
|
329 else:
|
jpayne@69
|
330 raise TypeError('unsupported info type code')
|
jpayne@69
|
331
|
jpayne@69
|
332 # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
|
jpayne@69
|
333 if not value:
|
jpayne@69
|
334 if scalar:
|
jpayne@69
|
335 value = None
|
jpayne@69
|
336 elif count <= 0:
|
jpayne@69
|
337 value = ()
|
jpayne@69
|
338 else:
|
jpayne@69
|
339 value = (None,)*count
|
jpayne@69
|
340 elif scalar and len(value) == 1:
|
jpayne@69
|
341 value = value[0]
|
jpayne@69
|
342 else:
|
jpayne@69
|
343 value = tuple(value)
|
jpayne@69
|
344
|
jpayne@69
|
345 return value
|
jpayne@69
|
346
|
jpayne@69
|
347
|
jpayne@69
|
348 cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
|
jpayne@69
|
349 cdef char *datac
|
jpayne@69
|
350 cdef int8_t *data8
|
jpayne@69
|
351 cdef int16_t *data16
|
jpayne@69
|
352 cdef int32_t *data32
|
jpayne@69
|
353 cdef float *dataf
|
jpayne@69
|
354 cdef ssize_t i, value_count = len(values)
|
jpayne@69
|
355
|
jpayne@69
|
356 assert value_count <= n
|
jpayne@69
|
357
|
jpayne@69
|
358 if bt_type == BCF_BT_CHAR:
|
jpayne@69
|
359 if not isinstance(values, (str, bytes)):
|
jpayne@69
|
360 values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
|
jpayne@69
|
361 value_count = len(values)
|
jpayne@69
|
362 assert value_count <= n
|
jpayne@69
|
363 datac = <char *>data
|
jpayne@69
|
364 memcpy(datac, <char *>values, value_count)
|
jpayne@69
|
365 for i in range(value_count, n):
|
jpayne@69
|
366 datac[i] = 0
|
jpayne@69
|
367 elif bt_type == BCF_BT_INT8:
|
jpayne@69
|
368 datai8 = <int8_t *>data
|
jpayne@69
|
369 for i in range(value_count):
|
jpayne@69
|
370 val = values[i]
|
jpayne@69
|
371 datai8[i] = val if val is not None else bcf_int8_missing
|
jpayne@69
|
372 for i in range(value_count, n):
|
jpayne@69
|
373 datai8[i] = bcf_int8_vector_end
|
jpayne@69
|
374 elif bt_type == BCF_BT_INT16:
|
jpayne@69
|
375 datai16 = <int16_t *>data
|
jpayne@69
|
376 for i in range(value_count):
|
jpayne@69
|
377 val = values[i]
|
jpayne@69
|
378 datai16[i] = val if val is not None else bcf_int16_missing
|
jpayne@69
|
379 for i in range(value_count, n):
|
jpayne@69
|
380 datai16[i] = bcf_int16_vector_end
|
jpayne@69
|
381 elif bt_type == BCF_BT_INT32:
|
jpayne@69
|
382 datai32 = <int32_t *>data
|
jpayne@69
|
383 for i in range(value_count):
|
jpayne@69
|
384 val = values[i]
|
jpayne@69
|
385 datai32[i] = val if val is not None else bcf_int32_missing
|
jpayne@69
|
386 for i in range(value_count, n):
|
jpayne@69
|
387 datai32[i] = bcf_int32_vector_end
|
jpayne@69
|
388 elif bt_type == BCF_BT_FLOAT:
|
jpayne@69
|
389 dataf = <float *>data
|
jpayne@69
|
390 for i in range(value_count):
|
jpayne@69
|
391 val = values[i]
|
jpayne@69
|
392 if val is None:
|
jpayne@69
|
393 bcf_float_set(dataf + i, bcf_float_missing)
|
jpayne@69
|
394 else:
|
jpayne@69
|
395 dataf[i] = val
|
jpayne@69
|
396 for i in range(value_count, n):
|
jpayne@69
|
397 bcf_float_set(dataf + i, bcf_float_vector_end)
|
jpayne@69
|
398 else:
|
jpayne@69
|
399 raise TypeError('unsupported type')
|
jpayne@69
|
400
|
jpayne@69
|
401
|
jpayne@69
|
402 cdef bcf_empty_array(int type, ssize_t n, int vlen):
|
jpayne@69
|
403 cdef char *datac
|
jpayne@69
|
404 cdef int32_t *data32
|
jpayne@69
|
405 cdef float *dataf
|
jpayne@69
|
406 cdef int i
|
jpayne@69
|
407
|
jpayne@69
|
408 if n <= 0:
|
jpayne@69
|
409 raise ValueError('Cannot create empty array')
|
jpayne@69
|
410
|
jpayne@69
|
411 if type == BCF_HT_STR:
|
jpayne@69
|
412 value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
|
jpayne@69
|
413 datac = <char *>value
|
jpayne@69
|
414 for i in range(n):
|
jpayne@69
|
415 datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
|
jpayne@69
|
416 elif type == BCF_HT_INT:
|
jpayne@69
|
417 value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
|
jpayne@69
|
418 data32 = <int32_t *><char *>value
|
jpayne@69
|
419 for i in range(n):
|
jpayne@69
|
420 data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
|
jpayne@69
|
421 elif type == BCF_HT_REAL:
|
jpayne@69
|
422 value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
|
jpayne@69
|
423 dataf = <float *><char *>value
|
jpayne@69
|
424 for i in range(n):
|
jpayne@69
|
425 bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
|
jpayne@69
|
426 else:
|
jpayne@69
|
427 raise TypeError('unsupported header type code')
|
jpayne@69
|
428
|
jpayne@69
|
429 return value
|
jpayne@69
|
430
|
jpayne@69
|
431
|
jpayne@69
|
432 cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
|
jpayne@69
|
433 void *dst_data, int dst_type, size_t dst_values,
|
jpayne@69
|
434 int vlen):
|
jpayne@69
|
435 """copy data from src to dest where the size of the elements (src_type/dst_type) differ
|
jpayne@69
|
436 as well as the number of elements (src_values/dst_values).
|
jpayne@69
|
437 """
|
jpayne@69
|
438
|
jpayne@69
|
439 cdef char *src_datac
|
jpayne@69
|
440 cdef char *dst_datac
|
jpayne@69
|
441 cdef int8_t *src_datai8
|
jpayne@69
|
442 cdef int16_t *src_datai16
|
jpayne@69
|
443 cdef int32_t *src_datai32
|
jpayne@69
|
444 cdef int32_t *dst_datai
|
jpayne@69
|
445 cdef float *src_dataf
|
jpayne@69
|
446 cdef float *dst_dataf
|
jpayne@69
|
447 cdef ssize_t src_size, dst_size, i, j
|
jpayne@69
|
448 cdef int val
|
jpayne@69
|
449
|
jpayne@69
|
450 if src_values > dst_values:
|
jpayne@69
|
451 raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
|
jpayne@69
|
452
|
jpayne@69
|
453 if src_type == dst_type == BCF_BT_CHAR:
|
jpayne@69
|
454 src_datac = <char *>src_data
|
jpayne@69
|
455 dst_datac = <char *>dst_data
|
jpayne@69
|
456 memcpy(dst_datac, src_datac, src_values)
|
jpayne@69
|
457 for i in range(src_values, dst_values):
|
jpayne@69
|
458 dst_datac[i] = 0
|
jpayne@69
|
459 elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
|
jpayne@69
|
460 src_datai8 = <int8_t *>src_data
|
jpayne@69
|
461 dst_datai = <int32_t *>dst_data
|
jpayne@69
|
462 for i in range(src_values):
|
jpayne@69
|
463 val = src_datai8[i]
|
jpayne@69
|
464 if val == bcf_int8_missing:
|
jpayne@69
|
465 val = bcf_int32_missing
|
jpayne@69
|
466 elif val == bcf_int8_vector_end:
|
jpayne@69
|
467 val = bcf_int32_vector_end
|
jpayne@69
|
468 dst_datai[i] = val
|
jpayne@69
|
469 for i in range(src_values, dst_values):
|
jpayne@69
|
470 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
|
jpayne@69
|
471 elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
|
jpayne@69
|
472 src_datai16 = <int16_t *>src_data
|
jpayne@69
|
473 dst_datai = <int32_t *>dst_data
|
jpayne@69
|
474 for i in range(src_values):
|
jpayne@69
|
475 val = src_datai16[i]
|
jpayne@69
|
476 if val == bcf_int16_missing:
|
jpayne@69
|
477 val = bcf_int32_missing
|
jpayne@69
|
478 elif val == bcf_int16_vector_end:
|
jpayne@69
|
479 val = bcf_int32_vector_end
|
jpayne@69
|
480 dst_datai[i] = val
|
jpayne@69
|
481 for i in range(src_values, dst_values):
|
jpayne@69
|
482 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
|
jpayne@69
|
483 elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
|
jpayne@69
|
484 src_datai32 = <int32_t *>src_data
|
jpayne@69
|
485 dst_datai = <int32_t *>dst_data
|
jpayne@69
|
486 for i in range(src_values):
|
jpayne@69
|
487 dst_datai[i] = src_datai32[i]
|
jpayne@69
|
488 for i in range(src_values, dst_values):
|
jpayne@69
|
489 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
|
jpayne@69
|
490 elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
|
jpayne@69
|
491 src_dataf = <float *>src_data
|
jpayne@69
|
492 dst_dataf = <float *>dst_data
|
jpayne@69
|
493 for i in range(src_values):
|
jpayne@69
|
494 dst_dataf[i] = src_dataf[i]
|
jpayne@69
|
495 for i in range(src_values, dst_values):
|
jpayne@69
|
496 bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
|
jpayne@69
|
497 else:
|
jpayne@69
|
498 raise TypeError('unsupported types')
|
jpayne@69
|
499
|
jpayne@69
|
500
|
jpayne@69
|
501 cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
|
jpayne@69
|
502 if record is None:
|
jpayne@69
|
503 raise ValueError('record must not be None')
|
jpayne@69
|
504
|
jpayne@69
|
505 cdef bcf_hdr_t *hdr = record.header.ptr
|
jpayne@69
|
506 cdef bcf1_t *r = record.ptr
|
jpayne@69
|
507
|
jpayne@69
|
508 if not check_header_id(hdr, hl_type, id):
|
jpayne@69
|
509 raise ValueError('Invalid header')
|
jpayne@69
|
510
|
jpayne@69
|
511 cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
|
jpayne@69
|
512 cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
|
jpayne@69
|
513
|
jpayne@69
|
514 scalar[0] = 0
|
jpayne@69
|
515
|
jpayne@69
|
516 if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
|
jpayne@69
|
517 count[0] = number
|
jpayne@69
|
518 elif length == BCF_VL_FIXED:
|
jpayne@69
|
519 if number == 1:
|
jpayne@69
|
520 scalar[0] = 1
|
jpayne@69
|
521 count[0] = number
|
jpayne@69
|
522 elif length == BCF_VL_R:
|
jpayne@69
|
523 count[0] = r.n_allele
|
jpayne@69
|
524 elif length == BCF_VL_A:
|
jpayne@69
|
525 count[0] = r.n_allele - 1
|
jpayne@69
|
526 elif length == BCF_VL_G:
|
jpayne@69
|
527 count[0] = bcf_genotype_count(hdr, r, sample)
|
jpayne@69
|
528 elif length == BCF_VL_VAR:
|
jpayne@69
|
529 count[0] = -1
|
jpayne@69
|
530 else:
|
jpayne@69
|
531 raise ValueError('Unknown format length')
|
jpayne@69
|
532
|
jpayne@69
|
533
|
jpayne@69
|
534 cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
|
jpayne@69
|
535 if record is None:
|
jpayne@69
|
536 raise ValueError('record must not be None')
|
jpayne@69
|
537
|
jpayne@69
|
538 cdef bcf_hdr_t *hdr = record.header.ptr
|
jpayne@69
|
539
|
jpayne@69
|
540 cdef char *s
|
jpayne@69
|
541 cdef ssize_t count
|
jpayne@69
|
542 cdef int scalar
|
jpayne@69
|
543
|
jpayne@69
|
544 bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)
|
jpayne@69
|
545
|
jpayne@69
|
546 if z.len == 0:
|
jpayne@69
|
547 if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
|
jpayne@69
|
548 value = True
|
jpayne@69
|
549 elif scalar:
|
jpayne@69
|
550 value = None
|
jpayne@69
|
551 else:
|
jpayne@69
|
552 value = ()
|
jpayne@69
|
553 elif z.len == 1:
|
jpayne@69
|
554 if z.type == BCF_BT_INT8:
|
jpayne@69
|
555 value = z.v1.i if z.v1.i != bcf_int8_missing else None
|
jpayne@69
|
556 elif z.type == BCF_BT_INT16:
|
jpayne@69
|
557 value = z.v1.i if z.v1.i != bcf_int16_missing else None
|
jpayne@69
|
558 elif z.type == BCF_BT_INT32:
|
jpayne@69
|
559 value = z.v1.i if z.v1.i != bcf_int32_missing else None
|
jpayne@69
|
560 elif z.type == BCF_BT_FLOAT:
|
jpayne@69
|
561 value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
|
jpayne@69
|
562 elif z.type == BCF_BT_CHAR:
|
jpayne@69
|
563 value = force_str(chr(z.v1.i))
|
jpayne@69
|
564 else:
|
jpayne@69
|
565 raise TypeError('unsupported info type code')
|
jpayne@69
|
566
|
jpayne@69
|
567 if not scalar and value != ():
|
jpayne@69
|
568 value = (value,)
|
jpayne@69
|
569 else:
|
jpayne@69
|
570 value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
|
jpayne@69
|
571
|
jpayne@69
|
572 return value
|
jpayne@69
|
573
|
jpayne@69
|
574
|
jpayne@69
|
575 cdef object bcf_check_values(VariantRecord record, value, int sample,
|
jpayne@69
|
576 int hl_type, int ht_type,
|
jpayne@69
|
577 int id, int bt_type, ssize_t bt_len,
|
jpayne@69
|
578 ssize_t *value_count, int *scalar, int *realloc):
|
jpayne@69
|
579
|
jpayne@69
|
580 if record is None:
|
jpayne@69
|
581 raise ValueError('record must not be None')
|
jpayne@69
|
582
|
jpayne@69
|
583 bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)
|
jpayne@69
|
584
|
jpayne@69
|
585 # Validate values now that we know the type and size
|
jpayne@69
|
586 values = (value,) if not isinstance(value, (list, tuple)) else value
|
jpayne@69
|
587
|
jpayne@69
|
588 # Validate values now that we know the type and size
|
jpayne@69
|
589 if ht_type == BCF_HT_FLAG:
|
jpayne@69
|
590 value_count[0] = 1
|
jpayne@69
|
591 elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
|
jpayne@69
|
592 # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
|
jpayne@69
|
593 value_count[0] = -1
|
jpayne@69
|
594
|
jpayne@69
|
595 cdef int given = len(values)
|
jpayne@69
|
596 if value_count[0] != -1 and value_count[0] != given:
|
jpayne@69
|
597 if scalar[0]:
|
jpayne@69
|
598 raise TypeError('value expected to be scalar, given len={}'.format(given))
|
jpayne@69
|
599 else:
|
jpayne@69
|
600 raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
|
jpayne@69
|
601
|
jpayne@69
|
602 if ht_type == BCF_HT_REAL:
|
jpayne@69
|
603 for v in values:
|
jpayne@69
|
604 if not(v is None or isinstance(v, (float, int))):
|
jpayne@69
|
605 raise TypeError('invalid value for Float format')
|
jpayne@69
|
606 elif ht_type == BCF_HT_INT:
|
jpayne@69
|
607 for v in values:
|
jpayne@69
|
608 if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
|
jpayne@69
|
609 raise TypeError('invalid value for Integer format')
|
jpayne@69
|
610 for v in values:
|
jpayne@69
|
611 if not(v is None or bcf_int32_missing < v <= INT32_MAX):
|
jpayne@69
|
612 raise ValueError('Integer value too small/large to store in VCF/BCF')
|
jpayne@69
|
613 elif ht_type == BCF_HT_STR:
|
jpayne@69
|
614 values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
|
jpayne@69
|
615 elif ht_type == BCF_HT_FLAG:
|
jpayne@69
|
616 if values[0] not in (True, False, None, 1, 0):
|
jpayne@69
|
617 raise ValueError('Flag values must be: True, False, None, 1, 0')
|
jpayne@69
|
618 else:
|
jpayne@69
|
619 raise TypeError('unsupported type')
|
jpayne@69
|
620
|
jpayne@69
|
621 realloc[0] = 0
|
jpayne@69
|
622 if len(values) <= 1 and hl_type == BCF_HL_INFO:
|
jpayne@69
|
623 realloc[0] = 0
|
jpayne@69
|
624 elif len(values) > bt_len:
|
jpayne@69
|
625 realloc[0] = 1
|
jpayne@69
|
626 elif bt_type == BCF_BT_INT8:
|
jpayne@69
|
627 for v in values:
|
jpayne@69
|
628 if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
|
jpayne@69
|
629 realloc[0] = 1
|
jpayne@69
|
630 break
|
jpayne@69
|
631 elif bt_type == BCF_BT_INT16:
|
jpayne@69
|
632 for v in values:
|
jpayne@69
|
633 if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
|
jpayne@69
|
634 realloc[0] = 1
|
jpayne@69
|
635 break
|
jpayne@69
|
636
|
jpayne@69
|
637 return values
|
jpayne@69
|
638
|
jpayne@69
|
639
|
jpayne@69
|
640 cdef bcf_encode_alleles(VariantRecord record, values):
|
jpayne@69
|
641 if record is None:
|
jpayne@69
|
642 raise ValueError('record must not be None')
|
jpayne@69
|
643
|
jpayne@69
|
644 cdef bcf1_t *r = record.ptr
|
jpayne@69
|
645 cdef int32_t nalleles = r.n_allele
|
jpayne@69
|
646 cdef list gt_values = []
|
jpayne@69
|
647 cdef char *s
|
jpayne@69
|
648 cdef int i
|
jpayne@69
|
649
|
jpayne@69
|
650 if values is None:
|
jpayne@69
|
651 return ()
|
jpayne@69
|
652
|
jpayne@69
|
653 if not isinstance(values, (list, tuple)):
|
jpayne@69
|
654 values = (values,)
|
jpayne@69
|
655
|
jpayne@69
|
656 for value in values:
|
jpayne@69
|
657 if value is None:
|
jpayne@69
|
658 gt_values.append(bcf_gt_missing)
|
jpayne@69
|
659 elif isinstance(value, (str, bytes)):
|
jpayne@69
|
660 bvalue = force_bytes(value)
|
jpayne@69
|
661 s = bvalue
|
jpayne@69
|
662 for i in range(r.n_allele):
|
jpayne@69
|
663 if strcmp(r.d.allele[i], s) != 0:
|
jpayne@69
|
664 gt_values.append(bcf_gt_unphased(i))
|
jpayne@69
|
665 break
|
jpayne@69
|
666 else:
|
jpayne@69
|
667 raise ValueError('Unknown allele')
|
jpayne@69
|
668 else:
|
jpayne@69
|
669 i = value
|
jpayne@69
|
670 if not (0 <= i < nalleles):
|
jpayne@69
|
671 raise ValueError('Invalid allele index')
|
jpayne@69
|
672 gt_values.append(bcf_gt_unphased(i))
|
jpayne@69
|
673
|
jpayne@69
|
674 return gt_values
|
jpayne@69
|
675
|
jpayne@69
|
676
|
jpayne@69
|
677 cdef bcf_info_set_value(VariantRecord record, key, value):
|
jpayne@69
|
678 if record is None:
|
jpayne@69
|
679 raise ValueError('record must not be None')
|
jpayne@69
|
680
|
jpayne@69
|
681 cdef bcf_hdr_t *hdr = record.header.ptr
|
jpayne@69
|
682 cdef bcf1_t *r = record.ptr
|
jpayne@69
|
683 cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
|
jpayne@69
|
684 cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
|
jpayne@69
|
685
|
jpayne@69
|
686 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
687 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
688
|
jpayne@69
|
689 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
690 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
|
jpayne@69
|
691
|
jpayne@69
|
692 if info:
|
jpayne@69
|
693 info_id = info.key
|
jpayne@69
|
694 else:
|
jpayne@69
|
695 info_id = bcf_header_get_info_id(hdr, bkey)
|
jpayne@69
|
696
|
jpayne@69
|
697 if info_id < 0:
|
jpayne@69
|
698 raise KeyError('unknown INFO: {}'.format(key))
|
jpayne@69
|
699
|
jpayne@69
|
700 if not check_header_id(hdr, BCF_HL_INFO, info_id):
|
jpayne@69
|
701 raise ValueError('Invalid header')
|
jpayne@69
|
702
|
jpayne@69
|
703 info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
|
jpayne@69
|
704 values = bcf_check_values(record, value, -1,
|
jpayne@69
|
705 BCF_HL_INFO, info_type, info_id,
|
jpayne@69
|
706 info.type if info else -1,
|
jpayne@69
|
707 info.len if info else -1,
|
jpayne@69
|
708 &value_count, &scalar, &realloc)
|
jpayne@69
|
709
|
jpayne@69
|
710 if info_type == BCF_HT_FLAG:
|
jpayne@69
|
711 if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
|
jpayne@69
|
712 raise ValueError('Unable to update INFO values')
|
jpayne@69
|
713 return
|
jpayne@69
|
714
|
jpayne@69
|
715 vlen = value_count < 0
|
jpayne@69
|
716 value_count = len(values)
|
jpayne@69
|
717
|
jpayne@69
|
718 # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
|
jpayne@69
|
719 # If we can, write updated values to existing allocated storage
|
jpayne@69
|
720 if 0 and info and not realloc:
|
jpayne@69
|
721 r.d.shared_dirty |= BCF1_DIRTY_INF
|
jpayne@69
|
722
|
jpayne@69
|
723 if value_count == 0:
|
jpayne@69
|
724 info.len = 0
|
jpayne@69
|
725 if not info.vptr:
|
jpayne@69
|
726 info.vptr = <uint8_t *>&info.v1.i
|
jpayne@69
|
727
|
jpayne@69
|
728 elif value_count == 1:
|
jpayne@69
|
729 # FIXME: Check if need to free vptr if info.len > 0?
|
jpayne@69
|
730 if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
|
jpayne@69
|
731 bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
|
jpayne@69
|
732 elif info.type == BCF_BT_FLOAT:
|
jpayne@69
|
733 bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
|
jpayne@69
|
734 else:
|
jpayne@69
|
735 raise TypeError('unsupported info type code')
|
jpayne@69
|
736
|
jpayne@69
|
737 info.len = 1
|
jpayne@69
|
738 if not info.vptr:
|
jpayne@69
|
739 info.vptr = <uint8_t *>&info.v1.i
|
jpayne@69
|
740 else:
|
jpayne@69
|
741 bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
|
jpayne@69
|
742
|
jpayne@69
|
743 return
|
jpayne@69
|
744
|
jpayne@69
|
745 alloc_len = max(1, value_count)
|
jpayne@69
|
746 if info and info.len > alloc_len:
|
jpayne@69
|
747 alloc_len = info.len
|
jpayne@69
|
748
|
jpayne@69
|
749 new_values = bcf_empty_array(info_type, alloc_len, vlen)
|
jpayne@69
|
750 cdef char *valp = <char *>new_values
|
jpayne@69
|
751
|
jpayne@69
|
752 if info_type == BCF_HT_INT:
|
jpayne@69
|
753 dst_type = BCF_BT_INT32
|
jpayne@69
|
754 elif info_type == BCF_HT_REAL:
|
jpayne@69
|
755 dst_type = BCF_BT_FLOAT
|
jpayne@69
|
756 elif info_type == BCF_HT_STR:
|
jpayne@69
|
757 dst_type = BCF_BT_CHAR
|
jpayne@69
|
758 else:
|
jpayne@69
|
759 raise ValueError('Unsupported INFO type')
|
jpayne@69
|
760
|
jpayne@69
|
761 bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
|
jpayne@69
|
762
|
jpayne@69
|
763 if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
|
jpayne@69
|
764 raise ValueError('Unable to update INFO values')
|
jpayne@69
|
765
|
jpayne@69
|
766
|
jpayne@69
|
767 cdef bcf_info_del_value(VariantRecord record, key):
|
jpayne@69
|
768 if record is None:
|
jpayne@69
|
769 raise ValueError('record must not be None')
|
jpayne@69
|
770
|
jpayne@69
|
771 cdef bcf_hdr_t *hdr = record.header.ptr
|
jpayne@69
|
772 cdef bcf1_t *r = record.ptr
|
jpayne@69
|
773 cdef ssize_t value_count
|
jpayne@69
|
774 cdef int scalar
|
jpayne@69
|
775
|
jpayne@69
|
776 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
777 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
778
|
jpayne@69
|
779 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
780 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
|
jpayne@69
|
781
|
jpayne@69
|
782 if not info:
|
jpayne@69
|
783 raise KeyError(key)
|
jpayne@69
|
784
|
jpayne@69
|
785 bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)
|
jpayne@69
|
786
|
jpayne@69
|
787 if value_count <= 0:
|
jpayne@69
|
788 null_value = ()
|
jpayne@69
|
789 elif scalar:
|
jpayne@69
|
790 null_value = None
|
jpayne@69
|
791 else:
|
jpayne@69
|
792 null_value = (None,)*value_count
|
jpayne@69
|
793
|
jpayne@69
|
794 bcf_info_set_value(record, bkey, null_value)
|
jpayne@69
|
795
|
jpayne@69
|
796
|
jpayne@69
|
797 cdef bcf_format_get_value(VariantRecordSample sample, key):
|
jpayne@69
|
798 if sample is None:
|
jpayne@69
|
799 raise ValueError('sample must not be None')
|
jpayne@69
|
800
|
jpayne@69
|
801 cdef bcf_hdr_t *hdr = sample.record.header.ptr
|
jpayne@69
|
802 cdef bcf1_t *r = sample.record.ptr
|
jpayne@69
|
803 cdef ssize_t count
|
jpayne@69
|
804 cdef int scalar
|
jpayne@69
|
805
|
jpayne@69
|
806 if bcf_unpack(r, BCF_UN_ALL) < 0:
|
jpayne@69
|
807 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
808
|
jpayne@69
|
809 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
810 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
|
jpayne@69
|
811
|
jpayne@69
|
812 if not fmt or not fmt.p:
|
jpayne@69
|
813 raise KeyError('invalid FORMAT: {}'.format(key))
|
jpayne@69
|
814
|
jpayne@69
|
815 if is_gt_fmt(hdr, fmt.id):
|
jpayne@69
|
816 return bcf_format_get_allele_indices(sample)
|
jpayne@69
|
817
|
jpayne@69
|
818 bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)
|
jpayne@69
|
819
|
jpayne@69
|
820 if fmt.p and fmt.n and fmt.size:
|
jpayne@69
|
821 return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
|
jpayne@69
|
822 elif scalar:
|
jpayne@69
|
823 return None
|
jpayne@69
|
824 elif count <= 0:
|
jpayne@69
|
825 return ()
|
jpayne@69
|
826 else:
|
jpayne@69
|
827 return (None,)*count
|
jpayne@69
|
828
|
jpayne@69
|
829
|
jpayne@69
|
830 cdef bcf_format_set_value(VariantRecordSample sample, key, value):
|
jpayne@69
|
831 if sample is None:
|
jpayne@69
|
832 raise ValueError('sample must not be None')
|
jpayne@69
|
833
|
jpayne@69
|
834 if key == 'phased':
|
jpayne@69
|
835 sample.phased = bool(value)
|
jpayne@69
|
836 return
|
jpayne@69
|
837
|
jpayne@69
|
838 cdef bcf_hdr_t *hdr = sample.record.header.ptr
|
jpayne@69
|
839 cdef bcf1_t *r = sample.record.ptr
|
jpayne@69
|
840 cdef int fmt_id
|
jpayne@69
|
841 cdef vdict_t *d
|
jpayne@69
|
842 cdef khiter_t k
|
jpayne@69
|
843 cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
|
jpayne@69
|
844 cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size
|
jpayne@69
|
845
|
jpayne@69
|
846 if bcf_unpack(r, BCF_UN_ALL) < 0:
|
jpayne@69
|
847 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
848
|
jpayne@69
|
849 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
850 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
|
jpayne@69
|
851
|
jpayne@69
|
852 if fmt:
|
jpayne@69
|
853 fmt_id = fmt.id
|
jpayne@69
|
854 else:
|
jpayne@69
|
855 d = <vdict_t *>hdr.dict[BCF_DT_ID]
|
jpayne@69
|
856 k = kh_get_vdict(d, bkey)
|
jpayne@69
|
857
|
jpayne@69
|
858 if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
|
jpayne@69
|
859 raise KeyError('unknown format: {}'.format(key))
|
jpayne@69
|
860
|
jpayne@69
|
861 fmt_id = kh_val_vdict(d, k).id
|
jpayne@69
|
862
|
jpayne@69
|
863 if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
|
jpayne@69
|
864 raise ValueError('Invalid header')
|
jpayne@69
|
865
|
jpayne@69
|
866 fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
|
jpayne@69
|
867
|
jpayne@69
|
868 if fmt_type == BCF_HT_FLAG:
|
jpayne@69
|
869 raise ValueError('Flag types are not allowed on FORMATs')
|
jpayne@69
|
870
|
jpayne@69
|
871 if is_gt_fmt(hdr, fmt_id):
|
jpayne@69
|
872 value = bcf_encode_alleles(sample.record, value)
|
jpayne@69
|
873 # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
|
jpayne@69
|
874 fmt_type = BCF_HT_INT
|
jpayne@69
|
875
|
jpayne@69
|
876 values = bcf_check_values(sample.record, value, sample.index,
|
jpayne@69
|
877 BCF_HL_FMT, fmt_type, fmt_id,
|
jpayne@69
|
878 fmt.type if fmt else -1,
|
jpayne@69
|
879 fmt.n if fmt else -1,
|
jpayne@69
|
880 &value_count, &scalar, &realloc)
|
jpayne@69
|
881 vlen = value_count < 0
|
jpayne@69
|
882 value_count = len(values)
|
jpayne@69
|
883
|
jpayne@69
|
884 # If we can, write updated values to existing allocated storage.
|
jpayne@69
|
885 if fmt and not realloc:
|
jpayne@69
|
886 r.d.indiv_dirty = 1
|
jpayne@69
|
887 bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
|
jpayne@69
|
888 return
|
jpayne@69
|
889
|
jpayne@69
|
890 alloc_len = max(1, value_count)
|
jpayne@69
|
891 if fmt and fmt.n > alloc_len:
|
jpayne@69
|
892 alloc_len = fmt.n
|
jpayne@69
|
893
|
jpayne@69
|
894 nsamples = r.n_sample
|
jpayne@69
|
895 new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
|
jpayne@69
|
896 cdef char *new_values_p = <char *>new_values
|
jpayne@69
|
897
|
jpayne@69
|
898 if fmt_type == BCF_HT_INT:
|
jpayne@69
|
899 dst_type = BCF_BT_INT32
|
jpayne@69
|
900 dst_size = sizeof(int32_t) * alloc_len
|
jpayne@69
|
901 elif fmt_type == BCF_HT_REAL:
|
jpayne@69
|
902 dst_type = BCF_BT_FLOAT
|
jpayne@69
|
903 dst_size = sizeof(float) * alloc_len
|
jpayne@69
|
904 elif fmt_type == BCF_HT_STR:
|
jpayne@69
|
905 dst_type = BCF_BT_CHAR
|
jpayne@69
|
906 dst_size = sizeof(char) * alloc_len
|
jpayne@69
|
907 else:
|
jpayne@69
|
908 raise ValueError('Unsupported FORMAT type')
|
jpayne@69
|
909
|
jpayne@69
|
910 if fmt and nsamples > 1:
|
jpayne@69
|
911 for i in range(nsamples):
|
jpayne@69
|
912 bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
|
jpayne@69
|
913 new_values_p + i * dst_size, dst_type, alloc_len,
|
jpayne@69
|
914 vlen)
|
jpayne@69
|
915
|
jpayne@69
|
916 bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)
|
jpayne@69
|
917
|
jpayne@69
|
918 if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
|
jpayne@69
|
919 raise ValueError('Unable to update format values')
|
jpayne@69
|
920
|
jpayne@69
|
921
|
jpayne@69
|
922 cdef bcf_format_del_value(VariantRecordSample sample, key):
|
jpayne@69
|
923 if sample is None:
|
jpayne@69
|
924 raise ValueError('sample must not be None')
|
jpayne@69
|
925
|
jpayne@69
|
926 cdef bcf_hdr_t *hdr = sample.record.header.ptr
|
jpayne@69
|
927 cdef bcf1_t *r = sample.record.ptr
|
jpayne@69
|
928 cdef ssize_t value_count
|
jpayne@69
|
929 cdef int scalar
|
jpayne@69
|
930
|
jpayne@69
|
931 if bcf_unpack(r, BCF_UN_ALL) < 0:
|
jpayne@69
|
932 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
933
|
jpayne@69
|
934 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
935 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
|
jpayne@69
|
936
|
jpayne@69
|
937 if not fmt or not fmt.p:
|
jpayne@69
|
938 raise KeyError(key)
|
jpayne@69
|
939
|
jpayne@69
|
940 bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index)
|
jpayne@69
|
941
|
jpayne@69
|
942 if value_count <= 0:
|
jpayne@69
|
943 null_value = ()
|
jpayne@69
|
944 elif scalar:
|
jpayne@69
|
945 null_value = None
|
jpayne@69
|
946 else:
|
jpayne@69
|
947 null_value = (None,)*value_count
|
jpayne@69
|
948
|
jpayne@69
|
949 bcf_format_set_value(sample, bkey, null_value)
|
jpayne@69
|
950
|
jpayne@69
|
951
|
jpayne@69
|
952 cdef bcf_format_get_allele_indices(VariantRecordSample sample):
|
jpayne@69
|
953 if sample is None:
|
jpayne@69
|
954 raise ValueError('sample must not be None')
|
jpayne@69
|
955
|
jpayne@69
|
956 cdef bcf_hdr_t *hdr = sample.record.header.ptr
|
jpayne@69
|
957 cdef bcf1_t *r = sample.record.ptr
|
jpayne@69
|
958 cdef int32_t n = r.n_sample
|
jpayne@69
|
959
|
jpayne@69
|
960 if bcf_unpack(r, BCF_UN_ALL) < 0:
|
jpayne@69
|
961 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
962
|
jpayne@69
|
963 if sample.index < 0 or sample.index >= n or not r.n_fmt:
|
jpayne@69
|
964 return ()
|
jpayne@69
|
965
|
jpayne@69
|
966 cdef bcf_fmt_t *fmt0 = r.d.fmt
|
jpayne@69
|
967 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
|
jpayne@69
|
968
|
jpayne@69
|
969 if not gt0 or not fmt0.n:
|
jpayne@69
|
970 return ()
|
jpayne@69
|
971
|
jpayne@69
|
972 cdef int8_t *data8
|
jpayne@69
|
973 cdef int16_t *data16
|
jpayne@69
|
974 cdef int32_t *data32
|
jpayne@69
|
975 cdef int32_t a, nalleles = r.n_allele
|
jpayne@69
|
976 cdef list alleles = []
|
jpayne@69
|
977
|
jpayne@69
|
978 if fmt0.type == BCF_BT_INT8:
|
jpayne@69
|
979 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
980 for i in range(fmt0.n):
|
jpayne@69
|
981 if data8[i] == bcf_int8_vector_end:
|
jpayne@69
|
982 break
|
jpayne@69
|
983 elif data8[i] == bcf_gt_missing:
|
jpayne@69
|
984 a = -1
|
jpayne@69
|
985 else:
|
jpayne@69
|
986 a = bcf_gt_allele(data8[i])
|
jpayne@69
|
987 alleles.append(a if 0 <= a < nalleles else None)
|
jpayne@69
|
988 elif fmt0.type == BCF_BT_INT16:
|
jpayne@69
|
989 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
990 for i in range(fmt0.n):
|
jpayne@69
|
991 if data16[i] == bcf_int16_vector_end:
|
jpayne@69
|
992 break
|
jpayne@69
|
993 elif data16[i] == bcf_gt_missing:
|
jpayne@69
|
994 a = -1
|
jpayne@69
|
995 else:
|
jpayne@69
|
996 a = bcf_gt_allele(data16[i])
|
jpayne@69
|
997 alleles.append(a if 0 <= a < nalleles else None)
|
jpayne@69
|
998 elif fmt0.type == BCF_BT_INT32:
|
jpayne@69
|
999 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
1000 for i in range(fmt0.n):
|
jpayne@69
|
1001 if data32[i] == bcf_int32_vector_end:
|
jpayne@69
|
1002 break
|
jpayne@69
|
1003 elif data32[i] == bcf_gt_missing:
|
jpayne@69
|
1004 a = -1
|
jpayne@69
|
1005 else:
|
jpayne@69
|
1006 a = bcf_gt_allele(data32[i])
|
jpayne@69
|
1007 alleles.append(a if 0 <= a < nalleles else None)
|
jpayne@69
|
1008
|
jpayne@69
|
1009 return tuple(alleles)
|
jpayne@69
|
1010
|
jpayne@69
|
1011
|
jpayne@69
|
1012 cdef bcf_format_get_alleles(VariantRecordSample sample):
|
jpayne@69
|
1013 if sample is None:
|
jpayne@69
|
1014 raise ValueError('sample must not be None')
|
jpayne@69
|
1015
|
jpayne@69
|
1016 cdef bcf_hdr_t *hdr = sample.record.header.ptr
|
jpayne@69
|
1017 cdef bcf1_t *r = sample.record.ptr
|
jpayne@69
|
1018 cdef int32_t nsamples = r.n_sample
|
jpayne@69
|
1019
|
jpayne@69
|
1020 if bcf_unpack(r, BCF_UN_ALL) < 0:
|
jpayne@69
|
1021 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
1022
|
jpayne@69
|
1023 cdef int32_t nalleles = r.n_allele
|
jpayne@69
|
1024
|
jpayne@69
|
1025 if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
|
jpayne@69
|
1026 return ()
|
jpayne@69
|
1027
|
jpayne@69
|
1028 cdef bcf_fmt_t *fmt0 = r.d.fmt
|
jpayne@69
|
1029 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
|
jpayne@69
|
1030
|
jpayne@69
|
1031 if not gt0 or not fmt0.n:
|
jpayne@69
|
1032 return ()
|
jpayne@69
|
1033
|
jpayne@69
|
1034 cdef int32_t a
|
jpayne@69
|
1035 cdef int8_t *data8
|
jpayne@69
|
1036 cdef int16_t *data16
|
jpayne@69
|
1037 cdef int32_t *data32
|
jpayne@69
|
1038 alleles = []
|
jpayne@69
|
1039 if fmt0.type == BCF_BT_INT8:
|
jpayne@69
|
1040 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
1041 for i in range(fmt0.n):
|
jpayne@69
|
1042 if data8[i] == bcf_int8_vector_end:
|
jpayne@69
|
1043 break
|
jpayne@69
|
1044 a = bcf_gt_allele(data8[i])
|
jpayne@69
|
1045 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
|
jpayne@69
|
1046 elif fmt0.type == BCF_BT_INT16:
|
jpayne@69
|
1047 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
1048 for i in range(fmt0.n):
|
jpayne@69
|
1049 if data16[i] == bcf_int16_vector_end:
|
jpayne@69
|
1050 break
|
jpayne@69
|
1051 a = bcf_gt_allele(data16[i])
|
jpayne@69
|
1052 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
|
jpayne@69
|
1053 elif fmt0.type == BCF_BT_INT32:
|
jpayne@69
|
1054 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
1055 for i in range(fmt0.n):
|
jpayne@69
|
1056 if data32[i] == bcf_int32_vector_end:
|
jpayne@69
|
1057 break
|
jpayne@69
|
1058 a = bcf_gt_allele(data32[i])
|
jpayne@69
|
1059 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
|
jpayne@69
|
1060 return tuple(alleles)
|
jpayne@69
|
1061
|
jpayne@69
|
1062
|
jpayne@69
|
1063 cdef bint bcf_sample_get_phased(VariantRecordSample sample):
|
jpayne@69
|
1064 if sample is None:
|
jpayne@69
|
1065 raise ValueError('sample must not be None')
|
jpayne@69
|
1066
|
jpayne@69
|
1067 cdef bcf_hdr_t *hdr = sample.record.header.ptr
|
jpayne@69
|
1068 cdef bcf1_t *r = sample.record.ptr
|
jpayne@69
|
1069 cdef int32_t n = r.n_sample
|
jpayne@69
|
1070
|
jpayne@69
|
1071 if bcf_unpack(r, BCF_UN_ALL) < 0:
|
jpayne@69
|
1072 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
1073
|
jpayne@69
|
1074 if sample.index < 0 or sample.index >= n or not r.n_fmt:
|
jpayne@69
|
1075 return False
|
jpayne@69
|
1076
|
jpayne@69
|
1077 cdef bcf_fmt_t *fmt0 = r.d.fmt
|
jpayne@69
|
1078 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
|
jpayne@69
|
1079
|
jpayne@69
|
1080 if not gt0 or not fmt0.n:
|
jpayne@69
|
1081 return False
|
jpayne@69
|
1082
|
jpayne@69
|
1083 cdef int8_t *data8
|
jpayne@69
|
1084 cdef int16_t *data16
|
jpayne@69
|
1085 cdef int32_t *data32
|
jpayne@69
|
1086
|
jpayne@69
|
1087 cdef bint phased = False
|
jpayne@69
|
1088
|
jpayne@69
|
1089 if fmt0.type == BCF_BT_INT8:
|
jpayne@69
|
1090 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
1091 for i in range(fmt0.n):
|
jpayne@69
|
1092 if data8[i] == bcf_int8_vector_end:
|
jpayne@69
|
1093 break
|
jpayne@69
|
1094 elif data8[i] == bcf_int8_missing:
|
jpayne@69
|
1095 continue
|
jpayne@69
|
1096 elif i and not bcf_gt_is_phased(data8[i]):
|
jpayne@69
|
1097 return False
|
jpayne@69
|
1098 else:
|
jpayne@69
|
1099 phased = True
|
jpayne@69
|
1100 elif fmt0.type == BCF_BT_INT16:
|
jpayne@69
|
1101 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
1102 for i in range(fmt0.n):
|
jpayne@69
|
1103 if data16[i] == bcf_int16_vector_end:
|
jpayne@69
|
1104 break
|
jpayne@69
|
1105 elif data16[i] == bcf_int16_missing:
|
jpayne@69
|
1106 continue
|
jpayne@69
|
1107 elif i and not bcf_gt_is_phased(data16[i]):
|
jpayne@69
|
1108 return False
|
jpayne@69
|
1109 else:
|
jpayne@69
|
1110 phased = True
|
jpayne@69
|
1111 elif fmt0.type == BCF_BT_INT32:
|
jpayne@69
|
1112 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
1113 for i in range(fmt0.n):
|
jpayne@69
|
1114 if data32[i] == bcf_int32_vector_end:
|
jpayne@69
|
1115 break
|
jpayne@69
|
1116 elif data32[i] == bcf_int32_missing:
|
jpayne@69
|
1117 continue
|
jpayne@69
|
1118 elif i and not bcf_gt_is_phased(data32[i]):
|
jpayne@69
|
1119 return False
|
jpayne@69
|
1120 else:
|
jpayne@69
|
1121 phased = True
|
jpayne@69
|
1122
|
jpayne@69
|
1123 return phased
|
jpayne@69
|
1124
|
jpayne@69
|
1125
|
jpayne@69
|
1126 cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
|
jpayne@69
|
1127 if sample is None:
|
jpayne@69
|
1128 raise ValueError('sample must not be None')
|
jpayne@69
|
1129
|
jpayne@69
|
1130 cdef bcf_hdr_t *hdr = sample.record.header.ptr
|
jpayne@69
|
1131 cdef bcf1_t *r = sample.record.ptr
|
jpayne@69
|
1132 cdef int32_t n = r.n_sample
|
jpayne@69
|
1133
|
jpayne@69
|
1134 if bcf_unpack(r, BCF_UN_ALL) < 0:
|
jpayne@69
|
1135 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
1136
|
jpayne@69
|
1137 if sample.index < 0 or sample.index >= n or not r.n_fmt:
|
jpayne@69
|
1138 return
|
jpayne@69
|
1139
|
jpayne@69
|
1140 cdef bcf_fmt_t *fmt0 = r.d.fmt
|
jpayne@69
|
1141 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
|
jpayne@69
|
1142
|
jpayne@69
|
1143 if not gt0 or not fmt0.n:
|
jpayne@69
|
1144 raise ValueError('Cannot set phased before genotype is set')
|
jpayne@69
|
1145
|
jpayne@69
|
1146 cdef int8_t *data8
|
jpayne@69
|
1147 cdef int16_t *data16
|
jpayne@69
|
1148 cdef int32_t *data32
|
jpayne@69
|
1149
|
jpayne@69
|
1150 if fmt0.type == BCF_BT_INT8:
|
jpayne@69
|
1151 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
1152 for i in range(fmt0.n):
|
jpayne@69
|
1153 if data8[i] == bcf_int8_vector_end:
|
jpayne@69
|
1154 break
|
jpayne@69
|
1155 elif data8[i] == bcf_int8_missing:
|
jpayne@69
|
1156 continue
|
jpayne@69
|
1157 elif i:
|
jpayne@69
|
1158 data8[i] = (data8[i] & 0xFE) | phased
|
jpayne@69
|
1159 elif fmt0.type == BCF_BT_INT16:
|
jpayne@69
|
1160 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
1161 for i in range(fmt0.n):
|
jpayne@69
|
1162 if data16[i] == bcf_int16_vector_end:
|
jpayne@69
|
1163 break
|
jpayne@69
|
1164 elif data16[i] == bcf_int16_missing:
|
jpayne@69
|
1165 continue
|
jpayne@69
|
1166 elif i:
|
jpayne@69
|
1167 data16[i] = (data16[i] & 0xFFFE) | phased
|
jpayne@69
|
1168 elif fmt0.type == BCF_BT_INT32:
|
jpayne@69
|
1169 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
|
jpayne@69
|
1170 for i in range(fmt0.n):
|
jpayne@69
|
1171 if data32[i] == bcf_int32_vector_end:
|
jpayne@69
|
1172 break
|
jpayne@69
|
1173 elif data32[i] == bcf_int32_missing:
|
jpayne@69
|
1174 continue
|
jpayne@69
|
1175 elif i:
|
jpayne@69
|
1176 data32[i] = (data32[i] & 0xFFFFFFFE) | phased
|
jpayne@69
|
1177
|
jpayne@69
|
1178
|
jpayne@69
|
1179 cdef inline bcf_sync_end(VariantRecord record):
|
jpayne@69
|
1180 cdef bcf_hdr_t *hdr = record.header.ptr
|
jpayne@69
|
1181 cdef bcf_info_t *info
|
jpayne@69
|
1182 cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
|
jpayne@69
|
1183 cdef int ref_len
|
jpayne@69
|
1184
|
jpayne@69
|
1185 # allow missing ref when instantiating a new record
|
jpayne@69
|
1186 if record.ref is not None:
|
jpayne@69
|
1187 ref_len = len(record.ref)
|
jpayne@69
|
1188 else:
|
jpayne@69
|
1189 ref_len = 0
|
jpayne@69
|
1190
|
jpayne@69
|
1191 # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
|
jpayne@69
|
1192 # Always keep END for symbolic alleles
|
jpayne@69
|
1193 if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len):
|
jpayne@69
|
1194 # If INFO/END is not defined in the header, it doesn't exist in the record
|
jpayne@69
|
1195 if end_id >= 0:
|
jpayne@69
|
1196 info = bcf_get_info(hdr, record.ptr, b'END')
|
jpayne@69
|
1197 if info and info.vptr:
|
jpayne@69
|
1198 if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
|
jpayne@69
|
1199 raise ValueError('Unable to delete END')
|
jpayne@69
|
1200 else:
|
jpayne@69
|
1201 # Create END header, if not present
|
jpayne@69
|
1202 if end_id < 0:
|
jpayne@69
|
1203 record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')
|
jpayne@69
|
1204
|
jpayne@69
|
1205 # Update to reflect stop position
|
jpayne@69
|
1206 bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)
|
jpayne@69
|
1207
|
jpayne@69
|
1208
|
jpayne@69
|
1209 cdef inline int has_symbolic_allele(VariantRecord record):
|
jpayne@69
|
1210 """Return index of first symbolic allele. 0 if no symbolic alleles."""
|
jpayne@69
|
1211
|
jpayne@69
|
1212 for i in range(1, record.ptr.n_allele):
|
jpayne@69
|
1213 alt = record.ptr.d.allele[i]
|
jpayne@69
|
1214 if alt[0] == b'<' and alt[len(alt) - 1] == b'>':
|
jpayne@69
|
1215 return i
|
jpayne@69
|
1216
|
jpayne@69
|
1217 return 0
|
jpayne@69
|
1218
|
jpayne@69
|
1219
|
jpayne@69
|
1220 ########################################################################
|
jpayne@69
|
1221 ########################################################################
|
jpayne@69
|
1222 ## Variant Header objects
|
jpayne@69
|
1223 ########################################################################
|
jpayne@69
|
1224
|
jpayne@69
|
1225
|
jpayne@69
|
1226 cdef bcf_header_remove_hrec(VariantHeader header, int i):
|
jpayne@69
|
1227 if header is None:
|
jpayne@69
|
1228 raise ValueError('header must not be None')
|
jpayne@69
|
1229
|
jpayne@69
|
1230 cdef bcf_hdr_t *hdr = header.ptr
|
jpayne@69
|
1231
|
jpayne@69
|
1232 if i < 0 or i >= hdr.nhrec:
|
jpayne@69
|
1233 raise ValueError('Invalid header record index')
|
jpayne@69
|
1234
|
jpayne@69
|
1235 cdef bcf_hrec_t *hrec = hdr.hrec[i]
|
jpayne@69
|
1236 hdr.nhrec -= 1
|
jpayne@69
|
1237
|
jpayne@69
|
1238 if i < hdr.nhrec:
|
jpayne@69
|
1239 memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))
|
jpayne@69
|
1240
|
jpayne@69
|
1241 bcf_hrec_destroy(hrec)
|
jpayne@69
|
1242 hdr.hrec[hdr.nhrec] = NULL
|
jpayne@69
|
1243 hdr.dirty = 1
|
jpayne@69
|
1244
|
jpayne@69
|
1245
|
jpayne@69
|
1246 #FIXME: implement a full mapping interface
|
jpayne@69
|
1247 #FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
|
jpayne@69
|
1248 # object lifetime.
|
jpayne@69
|
1249 cdef class VariantHeaderRecord(object):
|
jpayne@69
|
1250 """header record from a :class:`VariantHeader` object"""
|
jpayne@69
|
1251 def __init__(self, *args, **kwargs):
|
jpayne@69
|
1252 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
1253
|
jpayne@69
|
1254 @property
|
jpayne@69
|
1255 def type(self):
|
jpayne@69
|
1256 """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
|
jpayne@69
|
1257 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1258 if not r:
|
jpayne@69
|
1259 return None
|
jpayne@69
|
1260 return METADATA_TYPES[r.type]
|
jpayne@69
|
1261
|
jpayne@69
|
1262 @property
|
jpayne@69
|
1263 def key(self):
|
jpayne@69
|
1264 """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
|
jpayne@69
|
1265 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1266 return bcf_str_cache_get_charptr(r.key) if r and r.key else None
|
jpayne@69
|
1267
|
jpayne@69
|
1268 @property
|
jpayne@69
|
1269 def value(self):
|
jpayne@69
|
1270 """header value. Set only for generic lines, None for FILTER/INFO, etc."""
|
jpayne@69
|
1271 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1272 return charptr_to_str(r.value) if r and r.value else None
|
jpayne@69
|
1273
|
jpayne@69
|
1274 @property
|
jpayne@69
|
1275 def attrs(self):
|
jpayne@69
|
1276 """sequence of additional header attributes"""
|
jpayne@69
|
1277 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1278 if not r:
|
jpayne@69
|
1279 return ()
|
jpayne@69
|
1280 cdef int i
|
jpayne@69
|
1281 return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
|
jpayne@69
|
1282 charptr_to_str(r.vals[i]) if r.vals[i] else None)
|
jpayne@69
|
1283 for i in range(r.nkeys))
|
jpayne@69
|
1284
|
jpayne@69
|
1285 def __len__(self):
|
jpayne@69
|
1286 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1287 return r.nkeys if r else 0
|
jpayne@69
|
1288
|
jpayne@69
|
1289 def __bool__(self):
|
jpayne@69
|
1290 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1291 return r != NULL and r.nkeys != 0
|
jpayne@69
|
1292
|
jpayne@69
|
1293 def __getitem__(self, key):
|
jpayne@69
|
1294 """get attribute value"""
|
jpayne@69
|
1295 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1296 cdef int i
|
jpayne@69
|
1297 if r:
|
jpayne@69
|
1298 bkey = force_bytes(key)
|
jpayne@69
|
1299 for i in range(r.nkeys):
|
jpayne@69
|
1300 if r.keys[i] and r.keys[i] == bkey:
|
jpayne@69
|
1301 return charptr_to_str(r.vals[i]) if r.vals[i] else None
|
jpayne@69
|
1302 raise KeyError('cannot find metadata key')
|
jpayne@69
|
1303
|
jpayne@69
|
1304 def __iter__(self):
|
jpayne@69
|
1305 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1306 if not r:
|
jpayne@69
|
1307 return
|
jpayne@69
|
1308 cdef int i
|
jpayne@69
|
1309 for i in range(r.nkeys):
|
jpayne@69
|
1310 if r.keys[i]:
|
jpayne@69
|
1311 yield bcf_str_cache_get_charptr(r.keys[i])
|
jpayne@69
|
1312
|
jpayne@69
|
1313 def get(self, key, default=None):
|
jpayne@69
|
1314 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
|
jpayne@69
|
1315 try:
|
jpayne@69
|
1316 return self[key]
|
jpayne@69
|
1317 except KeyError:
|
jpayne@69
|
1318 return default
|
jpayne@69
|
1319
|
jpayne@69
|
1320 def __contains__(self, key):
|
jpayne@69
|
1321 try:
|
jpayne@69
|
1322 self[key]
|
jpayne@69
|
1323 except KeyError:
|
jpayne@69
|
1324 return False
|
jpayne@69
|
1325 else:
|
jpayne@69
|
1326 return True
|
jpayne@69
|
1327
|
jpayne@69
|
1328 def iterkeys(self):
|
jpayne@69
|
1329 """D.iterkeys() -> an iterator over the keys of D"""
|
jpayne@69
|
1330 return iter(self)
|
jpayne@69
|
1331
|
jpayne@69
|
1332 def itervalues(self):
|
jpayne@69
|
1333 """D.itervalues() -> an iterator over the values of D"""
|
jpayne@69
|
1334 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1335 if not r:
|
jpayne@69
|
1336 return
|
jpayne@69
|
1337 cdef int i
|
jpayne@69
|
1338 for i in range(r.nkeys):
|
jpayne@69
|
1339 if r.keys[i]:
|
jpayne@69
|
1340 yield charptr_to_str(r.vals[i]) if r.vals[i] else None
|
jpayne@69
|
1341
|
jpayne@69
|
1342 def iteritems(self):
|
jpayne@69
|
1343 """D.iteritems() -> an iterator over the (key, value) items of D"""
|
jpayne@69
|
1344 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1345 if not r:
|
jpayne@69
|
1346 return
|
jpayne@69
|
1347 cdef int i
|
jpayne@69
|
1348 for i in range(r.nkeys):
|
jpayne@69
|
1349 if r.keys[i]:
|
jpayne@69
|
1350 yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
|
jpayne@69
|
1351
|
jpayne@69
|
1352 def keys(self):
|
jpayne@69
|
1353 """D.keys() -> list of D's keys"""
|
jpayne@69
|
1354 return list(self)
|
jpayne@69
|
1355
|
jpayne@69
|
1356 def items(self):
|
jpayne@69
|
1357 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
|
jpayne@69
|
1358 return list(self.iteritems())
|
jpayne@69
|
1359
|
jpayne@69
|
1360 def values(self):
|
jpayne@69
|
1361 """D.values() -> list of D's values"""
|
jpayne@69
|
1362 return list(self.itervalues())
|
jpayne@69
|
1363
|
jpayne@69
|
1364 def update(self, items=None, **kwargs):
|
jpayne@69
|
1365 """D.update([E, ]**F) -> None.
|
jpayne@69
|
1366
|
jpayne@69
|
1367 Update D from dict/iterable E and F.
|
jpayne@69
|
1368 """
|
jpayne@69
|
1369 for k, v in items.items():
|
jpayne@69
|
1370 self[k] = v
|
jpayne@69
|
1371
|
jpayne@69
|
1372 if kwargs:
|
jpayne@69
|
1373 for k, v in kwargs.items():
|
jpayne@69
|
1374 self[k] = v
|
jpayne@69
|
1375
|
jpayne@69
|
1376 def pop(self, key, default=_nothing):
|
jpayne@69
|
1377 try:
|
jpayne@69
|
1378 value = self[key]
|
jpayne@69
|
1379 del self[key]
|
jpayne@69
|
1380 return value
|
jpayne@69
|
1381 except KeyError:
|
jpayne@69
|
1382 if default is not _nothing:
|
jpayne@69
|
1383 return default
|
jpayne@69
|
1384 raise
|
jpayne@69
|
1385
|
jpayne@69
|
1386 # Mappings are not hashable by default, but subclasses can change this
|
jpayne@69
|
1387 __hash__ = None
|
jpayne@69
|
1388
|
jpayne@69
|
1389 #TODO: implement __richcmp__
|
jpayne@69
|
1390
|
jpayne@69
|
1391 def __str__(self):
|
jpayne@69
|
1392 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1393
|
jpayne@69
|
1394 if not r:
|
jpayne@69
|
1395 raise ValueError('cannot convert deleted record to str')
|
jpayne@69
|
1396
|
jpayne@69
|
1397 cdef kstring_t hrec_str
|
jpayne@69
|
1398 hrec_str.l = hrec_str.m = 0
|
jpayne@69
|
1399 hrec_str.s = NULL
|
jpayne@69
|
1400
|
jpayne@69
|
1401 bcf_hrec_format(r, &hrec_str)
|
jpayne@69
|
1402
|
jpayne@69
|
1403 ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)
|
jpayne@69
|
1404
|
jpayne@69
|
1405 if hrec_str.m:
|
jpayne@69
|
1406 free(hrec_str.s)
|
jpayne@69
|
1407
|
jpayne@69
|
1408 return ret
|
jpayne@69
|
1409
|
jpayne@69
|
1410 # FIXME: Not safe -- causes trivial segfaults at the moment
|
jpayne@69
|
1411 def remove(self):
|
jpayne@69
|
1412 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1413 cdef bcf_hrec_t *r = self.ptr
|
jpayne@69
|
1414 if not r:
|
jpayne@69
|
1415 return
|
jpayne@69
|
1416 assert r.key
|
jpayne@69
|
1417 cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
|
jpayne@69
|
1418 bcf_hdr_remove(hdr, r.type, key)
|
jpayne@69
|
1419 self.ptr = NULL
|
jpayne@69
|
1420
|
jpayne@69
|
1421
|
jpayne@69
|
1422 cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
|
jpayne@69
|
1423 if not header:
|
jpayne@69
|
1424 raise ValueError('invalid VariantHeader')
|
jpayne@69
|
1425
|
jpayne@69
|
1426 if not hdr:
|
jpayne@69
|
1427 return None
|
jpayne@69
|
1428
|
jpayne@69
|
1429 cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
|
jpayne@69
|
1430 record.header = header
|
jpayne@69
|
1431 record.ptr = hdr
|
jpayne@69
|
1432
|
jpayne@69
|
1433 return record
|
jpayne@69
|
1434
|
jpayne@69
|
1435
|
jpayne@69
|
1436 cdef class VariantHeaderRecords(object):
|
jpayne@69
|
1437 """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
|
jpayne@69
|
1438 def __init__(self, *args, **kwargs):
|
jpayne@69
|
1439 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
1440
|
jpayne@69
|
1441 def __len__(self):
|
jpayne@69
|
1442 return self.header.ptr.nhrec
|
jpayne@69
|
1443
|
jpayne@69
|
1444 def __bool__(self):
|
jpayne@69
|
1445 return self.header.ptr.nhrec != 0
|
jpayne@69
|
1446
|
jpayne@69
|
1447 def __getitem__(self, index):
|
jpayne@69
|
1448 cdef int32_t i = index
|
jpayne@69
|
1449 if i < 0 or i >= self.header.ptr.nhrec:
|
jpayne@69
|
1450 raise IndexError('invalid header record index')
|
jpayne@69
|
1451 return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
|
jpayne@69
|
1452
|
jpayne@69
|
1453 def __iter__(self):
|
jpayne@69
|
1454 cdef int32_t i
|
jpayne@69
|
1455 for i in range(self.header.ptr.nhrec):
|
jpayne@69
|
1456 yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
|
jpayne@69
|
1457
|
jpayne@69
|
1458 __hash__ = None
|
jpayne@69
|
1459
|
jpayne@69
|
1460
|
jpayne@69
|
1461 cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
|
jpayne@69
|
1462 if not header:
|
jpayne@69
|
1463 raise ValueError('invalid VariantHeader')
|
jpayne@69
|
1464
|
jpayne@69
|
1465 cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
|
jpayne@69
|
1466 records.header = header
|
jpayne@69
|
1467 return records
|
jpayne@69
|
1468
|
jpayne@69
|
1469
|
jpayne@69
|
1470 cdef class VariantMetadata(object):
|
jpayne@69
|
1471 """filter, info or format metadata record from a :class:`VariantHeader` object"""
|
jpayne@69
|
1472 def __init__(self, *args, **kwargs):
|
jpayne@69
|
1473 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
1474
|
jpayne@69
|
1475 @property
|
jpayne@69
|
1476 def name(self):
|
jpayne@69
|
1477 """metadata name"""
|
jpayne@69
|
1478 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1479 return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
|
jpayne@69
|
1480
|
jpayne@69
|
1481 # Q: Should this be exposed?
|
jpayne@69
|
1482 @property
|
jpayne@69
|
1483 def id(self):
|
jpayne@69
|
1484 """metadata internal header id number"""
|
jpayne@69
|
1485 return self.id
|
jpayne@69
|
1486
|
jpayne@69
|
1487 @property
|
jpayne@69
|
1488 def number(self):
|
jpayne@69
|
1489 """metadata number (i.e. cardinality)"""
|
jpayne@69
|
1490 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1491
|
jpayne@69
|
1492 if not check_header_id(hdr, self.type, self.id):
|
jpayne@69
|
1493 raise ValueError('Invalid header id')
|
jpayne@69
|
1494
|
jpayne@69
|
1495 if self.type == BCF_HL_FLT:
|
jpayne@69
|
1496 return None
|
jpayne@69
|
1497
|
jpayne@69
|
1498 cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
|
jpayne@69
|
1499 if l == BCF_VL_FIXED:
|
jpayne@69
|
1500 return bcf_hdr_id2number(hdr, self.type, self.id)
|
jpayne@69
|
1501 elif l == BCF_VL_VAR:
|
jpayne@69
|
1502 return '.'
|
jpayne@69
|
1503 else:
|
jpayne@69
|
1504 return METADATA_LENGTHS[l]
|
jpayne@69
|
1505
|
jpayne@69
|
1506 @property
|
jpayne@69
|
1507 def type(self):
|
jpayne@69
|
1508 """metadata value type"""
|
jpayne@69
|
1509 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1510 if not check_header_id(hdr, self.type, self.id):
|
jpayne@69
|
1511 raise ValueError('Invalid header id')
|
jpayne@69
|
1512
|
jpayne@69
|
1513 if self.type == BCF_HL_FLT:
|
jpayne@69
|
1514 return None
|
jpayne@69
|
1515 return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
|
jpayne@69
|
1516
|
jpayne@69
|
1517 @property
|
jpayne@69
|
1518 def description(self):
|
jpayne@69
|
1519 """metadata description (or None if not set)"""
|
jpayne@69
|
1520 descr = self.record.get('Description')
|
jpayne@69
|
1521 if descr:
|
jpayne@69
|
1522 descr = descr.strip('"')
|
jpayne@69
|
1523 return force_str(descr)
|
jpayne@69
|
1524
|
jpayne@69
|
1525 @property
|
jpayne@69
|
1526 def record(self):
|
jpayne@69
|
1527 """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
|
jpayne@69
|
1528 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1529 if not check_header_id(hdr, self.type, self.id):
|
jpayne@69
|
1530 raise ValueError('Invalid header id')
|
jpayne@69
|
1531 cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
|
jpayne@69
|
1532 if not hrec:
|
jpayne@69
|
1533 return None
|
jpayne@69
|
1534 return makeVariantHeaderRecord(self.header, hrec)
|
jpayne@69
|
1535
|
jpayne@69
|
1536 def remove_header(self):
|
jpayne@69
|
1537 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1538 cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
|
jpayne@69
|
1539 bcf_hdr_remove(hdr, self.type, key)
|
jpayne@69
|
1540
|
jpayne@69
|
1541
|
jpayne@69
|
1542 cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
|
jpayne@69
|
1543 if not header:
|
jpayne@69
|
1544 raise ValueError('invalid VariantHeader')
|
jpayne@69
|
1545
|
jpayne@69
|
1546 if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
|
jpayne@69
|
1547 raise ValueError('invalid metadata type')
|
jpayne@69
|
1548
|
jpayne@69
|
1549 if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
|
jpayne@69
|
1550 raise ValueError('invalid metadata id')
|
jpayne@69
|
1551
|
jpayne@69
|
1552 cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
|
jpayne@69
|
1553 meta.header = header
|
jpayne@69
|
1554 meta.type = type
|
jpayne@69
|
1555 meta.id = id
|
jpayne@69
|
1556
|
jpayne@69
|
1557 return meta
|
jpayne@69
|
1558
|
jpayne@69
|
1559
|
jpayne@69
|
1560 cdef class VariantHeaderMetadata(object):
|
jpayne@69
|
1561 """mapping from filter, info or format name to :class:`VariantMetadata` object"""
|
jpayne@69
|
1562 def __init__(self, *args, **kwargs):
|
jpayne@69
|
1563 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
1564
|
jpayne@69
|
1565 def add(self, id, number, type, description, **kwargs):
|
jpayne@69
|
1566 """Add a new filter, info or format record"""
|
jpayne@69
|
1567 if id in self:
|
jpayne@69
|
1568 raise ValueError('Header already exists for id={}'.format(id))
|
jpayne@69
|
1569
|
jpayne@69
|
1570 if self.type == BCF_HL_FLT:
|
jpayne@69
|
1571 if number is not None:
|
jpayne@69
|
1572 raise ValueError('Number must be None when adding a filter')
|
jpayne@69
|
1573 if type is not None:
|
jpayne@69
|
1574 raise ValueError('Type must be None when adding a filter')
|
jpayne@69
|
1575
|
jpayne@69
|
1576 items = [('ID', unquoted_str(id)), ('Description', description)]
|
jpayne@69
|
1577 else:
|
jpayne@69
|
1578 if type not in VALUE_TYPES:
|
jpayne@69
|
1579 raise ValueError('unknown type specified: {}'.format(type))
|
jpayne@69
|
1580 if number is None:
|
jpayne@69
|
1581 number = '.'
|
jpayne@69
|
1582
|
jpayne@69
|
1583 items = [('ID', unquoted_str(id)),
|
jpayne@69
|
1584 ('Number', unquoted_str(number)),
|
jpayne@69
|
1585 ('Type', unquoted_str(type)),
|
jpayne@69
|
1586 ('Description', description)]
|
jpayne@69
|
1587
|
jpayne@69
|
1588 items += kwargs.items()
|
jpayne@69
|
1589 self.header.add_meta(METADATA_TYPES[self.type], items=items)
|
jpayne@69
|
1590
|
jpayne@69
|
1591 def __len__(self):
|
jpayne@69
|
1592 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1593 cdef bcf_idpair_t *idpair
|
jpayne@69
|
1594 cdef int32_t i, n = 0
|
jpayne@69
|
1595
|
jpayne@69
|
1596 for i in range(hdr.n[BCF_DT_ID]):
|
jpayne@69
|
1597 idpair = hdr.id[BCF_DT_ID] + i
|
jpayne@69
|
1598 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
|
jpayne@69
|
1599 n += 1
|
jpayne@69
|
1600 return n
|
jpayne@69
|
1601
|
jpayne@69
|
1602 def __bool__(self):
|
jpayne@69
|
1603 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1604 cdef bcf_idpair_t *idpair
|
jpayne@69
|
1605 cdef int32_t i
|
jpayne@69
|
1606
|
jpayne@69
|
1607 for i in range(hdr.n[BCF_DT_ID]):
|
jpayne@69
|
1608 idpair = hdr.id[BCF_DT_ID] + i
|
jpayne@69
|
1609 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
|
jpayne@69
|
1610 return True
|
jpayne@69
|
1611 return False
|
jpayne@69
|
1612
|
jpayne@69
|
1613 def __getitem__(self, key):
|
jpayne@69
|
1614 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1615 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
|
jpayne@69
|
1616
|
jpayne@69
|
1617 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
1618 cdef khiter_t k = kh_get_vdict(d, bkey)
|
jpayne@69
|
1619
|
jpayne@69
|
1620 if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
|
jpayne@69
|
1621 raise KeyError('invalid key: {}'.format(key))
|
jpayne@69
|
1622
|
jpayne@69
|
1623 return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
|
jpayne@69
|
1624
|
jpayne@69
|
1625 def remove_header(self, key):
|
jpayne@69
|
1626 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1627 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
|
jpayne@69
|
1628
|
jpayne@69
|
1629 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
1630 cdef khiter_t k = kh_get_vdict(d, bkey)
|
jpayne@69
|
1631
|
jpayne@69
|
1632 if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
|
jpayne@69
|
1633 raise KeyError('invalid key: {}'.format(key))
|
jpayne@69
|
1634
|
jpayne@69
|
1635 bcf_hdr_remove(hdr, self.type, bkey)
|
jpayne@69
|
1636 #bcf_hdr_sync(hdr)
|
jpayne@69
|
1637
|
jpayne@69
|
1638 def clear_header(self):
|
jpayne@69
|
1639 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1640 bcf_hdr_remove(hdr, self.type, NULL)
|
jpayne@69
|
1641 #bcf_hdr_sync(hdr)
|
jpayne@69
|
1642
|
jpayne@69
|
1643 def __iter__(self):
|
jpayne@69
|
1644 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1645 cdef bcf_idpair_t *idpair
|
jpayne@69
|
1646 cdef int32_t i
|
jpayne@69
|
1647
|
jpayne@69
|
1648 for i in range(hdr.n[BCF_DT_ID]):
|
jpayne@69
|
1649 idpair = hdr.id[BCF_DT_ID] + i
|
jpayne@69
|
1650 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
|
jpayne@69
|
1651 yield bcf_str_cache_get_charptr(idpair.key)
|
jpayne@69
|
1652
|
jpayne@69
|
1653 def get(self, key, default=None):
|
jpayne@69
|
1654 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
|
jpayne@69
|
1655 try:
|
jpayne@69
|
1656 return self[key]
|
jpayne@69
|
1657 except KeyError:
|
jpayne@69
|
1658 return default
|
jpayne@69
|
1659
|
jpayne@69
|
1660 def __contains__(self, key):
|
jpayne@69
|
1661 try:
|
jpayne@69
|
1662 self[key]
|
jpayne@69
|
1663 except KeyError:
|
jpayne@69
|
1664 return False
|
jpayne@69
|
1665 else:
|
jpayne@69
|
1666 return True
|
jpayne@69
|
1667
|
jpayne@69
|
1668 def iterkeys(self):
|
jpayne@69
|
1669 """D.iterkeys() -> an iterator over the keys of D"""
|
jpayne@69
|
1670 return iter(self)
|
jpayne@69
|
1671
|
jpayne@69
|
1672 def itervalues(self):
|
jpayne@69
|
1673 """D.itervalues() -> an iterator over the values of D"""
|
jpayne@69
|
1674 for key in self:
|
jpayne@69
|
1675 yield self[key]
|
jpayne@69
|
1676
|
jpayne@69
|
1677 def iteritems(self):
|
jpayne@69
|
1678 """D.iteritems() -> an iterator over the (key, value) items of D"""
|
jpayne@69
|
1679 for key in self:
|
jpayne@69
|
1680 yield (key, self[key])
|
jpayne@69
|
1681
|
jpayne@69
|
1682 def keys(self):
|
jpayne@69
|
1683 """D.keys() -> list of D's keys"""
|
jpayne@69
|
1684 return list(self)
|
jpayne@69
|
1685
|
jpayne@69
|
1686 def items(self):
|
jpayne@69
|
1687 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
|
jpayne@69
|
1688 return list(self.iteritems())
|
jpayne@69
|
1689
|
jpayne@69
|
1690 def values(self):
|
jpayne@69
|
1691 """D.values() -> list of D's values"""
|
jpayne@69
|
1692 return list(self.itervalues())
|
jpayne@69
|
1693
|
jpayne@69
|
1694 # Mappings are not hashable by default, but subclasses can change this
|
jpayne@69
|
1695 __hash__ = None
|
jpayne@69
|
1696
|
jpayne@69
|
1697 #TODO: implement __richcmp__
|
jpayne@69
|
1698
|
jpayne@69
|
1699
|
jpayne@69
|
1700 cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
|
jpayne@69
|
1701 if not header:
|
jpayne@69
|
1702 raise ValueError('invalid VariantHeader')
|
jpayne@69
|
1703
|
jpayne@69
|
1704 cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
|
jpayne@69
|
1705 meta.header = header
|
jpayne@69
|
1706 meta.type = type
|
jpayne@69
|
1707
|
jpayne@69
|
1708 return meta
|
jpayne@69
|
1709
|
jpayne@69
|
1710
|
jpayne@69
|
1711 cdef class VariantContig(object):
|
jpayne@69
|
1712 """contig metadata from a :class:`VariantHeader`"""
|
jpayne@69
|
1713 def __init__(self, *args, **kwargs):
|
jpayne@69
|
1714 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
1715
|
jpayne@69
|
1716 @property
|
jpayne@69
|
1717 def name(self):
|
jpayne@69
|
1718 """contig name"""
|
jpayne@69
|
1719 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1720 return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
|
jpayne@69
|
1721
|
jpayne@69
|
1722 @property
|
jpayne@69
|
1723 def id(self):
|
jpayne@69
|
1724 """contig internal id number"""
|
jpayne@69
|
1725 return self.id
|
jpayne@69
|
1726
|
jpayne@69
|
1727 @property
|
jpayne@69
|
1728 def length(self):
|
jpayne@69
|
1729 """contig length or None if not available"""
|
jpayne@69
|
1730 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1731 cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
|
jpayne@69
|
1732 return length if length else None
|
jpayne@69
|
1733
|
jpayne@69
|
1734 @property
|
jpayne@69
|
1735 def header_record(self):
|
jpayne@69
|
1736 """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
|
jpayne@69
|
1737 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1738 cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
|
jpayne@69
|
1739 return makeVariantHeaderRecord(self.header, hrec)
|
jpayne@69
|
1740
|
jpayne@69
|
1741 def remove_header(self):
|
jpayne@69
|
1742 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1743 cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key
|
jpayne@69
|
1744 bcf_hdr_remove(hdr, BCF_HL_CTG, key)
|
jpayne@69
|
1745
|
jpayne@69
|
1746
|
jpayne@69
|
1747 cdef VariantContig makeVariantContig(VariantHeader header, int id):
|
jpayne@69
|
1748 if not header:
|
jpayne@69
|
1749 raise ValueError('invalid VariantHeader')
|
jpayne@69
|
1750
|
jpayne@69
|
1751 if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
|
jpayne@69
|
1752 raise ValueError('invalid contig id')
|
jpayne@69
|
1753
|
jpayne@69
|
1754 cdef VariantContig contig = VariantContig.__new__(VariantContig)
|
jpayne@69
|
1755 contig.header = header
|
jpayne@69
|
1756 contig.id = id
|
jpayne@69
|
1757
|
jpayne@69
|
1758 return contig
|
jpayne@69
|
1759
|
jpayne@69
|
1760
|
jpayne@69
|
1761 cdef class VariantHeaderContigs(object):
|
jpayne@69
|
1762 """mapping from contig name or index to :class:`VariantContig` object."""
|
jpayne@69
|
1763 def __init__(self, *args, **kwargs):
|
jpayne@69
|
1764 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
1765
|
jpayne@69
|
1766 def __len__(self):
|
jpayne@69
|
1767 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1768 assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
|
jpayne@69
|
1769 return hdr.n[BCF_DT_CTG]
|
jpayne@69
|
1770
|
jpayne@69
|
1771 def __bool__(self):
|
jpayne@69
|
1772 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1773 assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
|
jpayne@69
|
1774 return hdr.n[BCF_DT_CTG] != 0
|
jpayne@69
|
1775
|
jpayne@69
|
1776 def __getitem__(self, key):
|
jpayne@69
|
1777 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1778 cdef int index
|
jpayne@69
|
1779
|
jpayne@69
|
1780 if isinstance(key, int):
|
jpayne@69
|
1781 index = key
|
jpayne@69
|
1782 if index < 0 or index >= hdr.n[BCF_DT_CTG]:
|
jpayne@69
|
1783 raise IndexError('invalid contig index')
|
jpayne@69
|
1784 return makeVariantContig(self.header, index)
|
jpayne@69
|
1785
|
jpayne@69
|
1786 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
|
jpayne@69
|
1787 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
1788 cdef khiter_t k = kh_get_vdict(d, bkey)
|
jpayne@69
|
1789
|
jpayne@69
|
1790 if k == kh_end(d):
|
jpayne@69
|
1791 raise KeyError('invalid contig: {}'.format(key))
|
jpayne@69
|
1792
|
jpayne@69
|
1793 cdef int id = kh_val_vdict(d, k).id
|
jpayne@69
|
1794
|
jpayne@69
|
1795 return makeVariantContig(self.header, id)
|
jpayne@69
|
1796
|
jpayne@69
|
1797 def remove_header(self, key):
|
jpayne@69
|
1798 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1799 cdef int index
|
jpayne@69
|
1800 cdef const char *ckey
|
jpayne@69
|
1801 cdef vdict_t *d
|
jpayne@69
|
1802 cdef khiter_t k
|
jpayne@69
|
1803
|
jpayne@69
|
1804 if isinstance(key, int):
|
jpayne@69
|
1805 index = key
|
jpayne@69
|
1806 if index < 0 or index >= hdr.n[BCF_DT_CTG]:
|
jpayne@69
|
1807 raise IndexError('invalid contig index')
|
jpayne@69
|
1808 ckey = hdr.id[BCF_DT_CTG][self.id].key
|
jpayne@69
|
1809 else:
|
jpayne@69
|
1810 d = <vdict_t *>hdr.dict[BCF_DT_CTG]
|
jpayne@69
|
1811 key = force_bytes(key)
|
jpayne@69
|
1812 if kh_get_vdict(d, key) == kh_end(d):
|
jpayne@69
|
1813 raise KeyError('invalid contig: {}'.format(key))
|
jpayne@69
|
1814 ckey = key
|
jpayne@69
|
1815
|
jpayne@69
|
1816 bcf_hdr_remove(hdr, BCF_HL_CTG, ckey)
|
jpayne@69
|
1817
|
jpayne@69
|
1818 def clear_header(self):
|
jpayne@69
|
1819 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1820 bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
|
jpayne@69
|
1821 #bcf_hdr_sync(hdr)
|
jpayne@69
|
1822
|
jpayne@69
|
1823 def __iter__(self):
|
jpayne@69
|
1824 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1825 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
|
jpayne@69
|
1826 cdef uint32_t n = kh_size(d)
|
jpayne@69
|
1827
|
jpayne@69
|
1828 assert n == hdr.n[BCF_DT_CTG]
|
jpayne@69
|
1829
|
jpayne@69
|
1830 for i in range(n):
|
jpayne@69
|
1831 yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
|
jpayne@69
|
1832
|
jpayne@69
|
1833 def get(self, key, default=None):
|
jpayne@69
|
1834 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
|
jpayne@69
|
1835 try:
|
jpayne@69
|
1836 return self[key]
|
jpayne@69
|
1837 except KeyError:
|
jpayne@69
|
1838 return default
|
jpayne@69
|
1839
|
jpayne@69
|
1840 def __contains__(self, key):
|
jpayne@69
|
1841 try:
|
jpayne@69
|
1842 self[key]
|
jpayne@69
|
1843 except KeyError:
|
jpayne@69
|
1844 return False
|
jpayne@69
|
1845 else:
|
jpayne@69
|
1846 return True
|
jpayne@69
|
1847
|
jpayne@69
|
1848 def iterkeys(self):
|
jpayne@69
|
1849 """D.iterkeys() -> an iterator over the keys of D"""
|
jpayne@69
|
1850 return iter(self)
|
jpayne@69
|
1851
|
jpayne@69
|
1852 def itervalues(self):
|
jpayne@69
|
1853 """D.itervalues() -> an iterator over the values of D"""
|
jpayne@69
|
1854 for key in self:
|
jpayne@69
|
1855 yield self[key]
|
jpayne@69
|
1856
|
jpayne@69
|
1857 def iteritems(self):
|
jpayne@69
|
1858 """D.iteritems() -> an iterator over the (key, value) items of D"""
|
jpayne@69
|
1859 for key in self:
|
jpayne@69
|
1860 yield (key, self[key])
|
jpayne@69
|
1861
|
jpayne@69
|
1862 def keys(self):
|
jpayne@69
|
1863 """D.keys() -> list of D's keys"""
|
jpayne@69
|
1864 return list(self)
|
jpayne@69
|
1865
|
jpayne@69
|
1866 def items(self):
|
jpayne@69
|
1867 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
|
jpayne@69
|
1868 return list(self.iteritems())
|
jpayne@69
|
1869
|
jpayne@69
|
1870 def values(self):
|
jpayne@69
|
1871 """D.values() -> list of D's values"""
|
jpayne@69
|
1872 return list(self.itervalues())
|
jpayne@69
|
1873
|
jpayne@69
|
1874 # Mappings are not hashable by default, but subclasses can change this
|
jpayne@69
|
1875 __hash__ = None
|
jpayne@69
|
1876
|
jpayne@69
|
1877 #TODO: implement __richcmp__
|
jpayne@69
|
1878
|
jpayne@69
|
1879 def add(self, id, length=None, **kwargs):
|
jpayne@69
|
1880 """Add a new contig record"""
|
jpayne@69
|
1881 if id in self:
|
jpayne@69
|
1882 raise ValueError('Header already exists for contig {}'.format(id))
|
jpayne@69
|
1883
|
jpayne@69
|
1884 items = [('ID', unquoted_str(id))]
|
jpayne@69
|
1885 if length is not None:
|
jpayne@69
|
1886 items.append(("length", unquoted_str(length)))
|
jpayne@69
|
1887 items += kwargs.items()
|
jpayne@69
|
1888 self.header.add_meta('contig', items=items)
|
jpayne@69
|
1889
|
jpayne@69
|
1890
|
jpayne@69
|
1891 cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
|
jpayne@69
|
1892 if not header:
|
jpayne@69
|
1893 raise ValueError('invalid VariantHeader')
|
jpayne@69
|
1894
|
jpayne@69
|
1895 cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
|
jpayne@69
|
1896 contigs.header = header
|
jpayne@69
|
1897
|
jpayne@69
|
1898 return contigs
|
jpayne@69
|
1899
|
jpayne@69
|
1900
|
jpayne@69
|
1901 cdef class VariantHeaderSamples(object):
|
jpayne@69
|
1902 """sequence of sample names from a :class:`VariantHeader` object"""
|
jpayne@69
|
1903 def __init__(self, *args, **kwargs):
|
jpayne@69
|
1904 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
1905
|
jpayne@69
|
1906 def __len__(self):
|
jpayne@69
|
1907 return bcf_hdr_nsamples(self.header.ptr)
|
jpayne@69
|
1908
|
jpayne@69
|
1909 def __bool__(self):
|
jpayne@69
|
1910 return bcf_hdr_nsamples(self.header.ptr) != 0
|
jpayne@69
|
1911
|
jpayne@69
|
1912 def __getitem__(self, index):
|
jpayne@69
|
1913 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1914 cdef int32_t n = bcf_hdr_nsamples(hdr)
|
jpayne@69
|
1915 cdef int32_t i = index
|
jpayne@69
|
1916
|
jpayne@69
|
1917 if i < 0 or i >= n:
|
jpayne@69
|
1918 raise IndexError('invalid sample index')
|
jpayne@69
|
1919
|
jpayne@69
|
1920 return charptr_to_str(hdr.samples[i])
|
jpayne@69
|
1921
|
jpayne@69
|
1922 def __iter__(self):
|
jpayne@69
|
1923 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1924 cdef int32_t i, n = bcf_hdr_nsamples(hdr)
|
jpayne@69
|
1925
|
jpayne@69
|
1926 for i in range(n):
|
jpayne@69
|
1927 yield charptr_to_str(hdr.samples[i])
|
jpayne@69
|
1928
|
jpayne@69
|
1929 def __contains__(self, key):
|
jpayne@69
|
1930 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
1931 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
|
jpayne@69
|
1932 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
1933 cdef khiter_t k = kh_get_vdict(d, bkey)
|
jpayne@69
|
1934
|
jpayne@69
|
1935 return k != kh_end(d)
|
jpayne@69
|
1936
|
jpayne@69
|
1937 # Mappings are not hashable by default, but subclasses can change this
|
jpayne@69
|
1938 __hash__ = None
|
jpayne@69
|
1939
|
jpayne@69
|
1940 #TODO: implement __richcmp__
|
jpayne@69
|
1941
|
jpayne@69
|
1942 def add(self, name):
|
jpayne@69
|
1943 """Add a new sample"""
|
jpayne@69
|
1944 self.header.add_sample(name)
|
jpayne@69
|
1945
|
jpayne@69
|
1946
|
jpayne@69
|
1947 cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
|
jpayne@69
|
1948 if not header:
|
jpayne@69
|
1949 raise ValueError('invalid VariantHeader')
|
jpayne@69
|
1950
|
jpayne@69
|
1951 cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
|
jpayne@69
|
1952 samples.header = header
|
jpayne@69
|
1953
|
jpayne@69
|
1954 return samples
|
jpayne@69
|
1955
|
jpayne@69
|
1956
|
jpayne@69
|
1957 cdef class VariantHeader(object):
|
jpayne@69
|
1958 """header information for a :class:`VariantFile` object"""
|
jpayne@69
|
1959 #FIXME: Add structured proxy
|
jpayne@69
|
1960 #FIXME: Add generic proxy
|
jpayne@69
|
1961 #FIXME: Add mutable methods
|
jpayne@69
|
1962
|
jpayne@69
|
1963 # See makeVariantHeader for C constructor
|
jpayne@69
|
1964 def __cinit__(self):
|
jpayne@69
|
1965 self.ptr = NULL
|
jpayne@69
|
1966
|
jpayne@69
|
1967 # Python constructor
|
jpayne@69
|
1968 def __init__(self):
|
jpayne@69
|
1969 self.ptr = bcf_hdr_init(b'w')
|
jpayne@69
|
1970 if not self.ptr:
|
jpayne@69
|
1971 raise ValueError('cannot create VariantHeader')
|
jpayne@69
|
1972
|
jpayne@69
|
1973 def __dealloc__(self):
|
jpayne@69
|
1974 if self.ptr:
|
jpayne@69
|
1975 bcf_hdr_destroy(self.ptr)
|
jpayne@69
|
1976 self.ptr = NULL
|
jpayne@69
|
1977
|
jpayne@69
|
1978 def __bool__(self):
|
jpayne@69
|
1979 return self.ptr != NULL
|
jpayne@69
|
1980
|
jpayne@69
|
1981 def copy(self):
|
jpayne@69
|
1982 return makeVariantHeader(bcf_hdr_dup(self.ptr))
|
jpayne@69
|
1983
|
jpayne@69
|
1984 def merge(self, VariantHeader header):
|
jpayne@69
|
1985 if header is None:
|
jpayne@69
|
1986 raise ValueError('header must not be None')
|
jpayne@69
|
1987 bcf_hdr_merge(self.ptr, header.ptr)
|
jpayne@69
|
1988
|
jpayne@69
|
1989 @property
|
jpayne@69
|
1990 def version(self):
|
jpayne@69
|
1991 """VCF version"""
|
jpayne@69
|
1992 return force_str(bcf_hdr_get_version(self.ptr))
|
jpayne@69
|
1993
|
jpayne@69
|
1994 @property
|
jpayne@69
|
1995 def samples(self):
|
jpayne@69
|
1996 """samples (:class:`VariantHeaderSamples`)"""
|
jpayne@69
|
1997 return makeVariantHeaderSamples(self)
|
jpayne@69
|
1998
|
jpayne@69
|
1999 @property
|
jpayne@69
|
2000 def records(self):
|
jpayne@69
|
2001 """header records (:class:`VariantHeaderRecords`)"""
|
jpayne@69
|
2002 return makeVariantHeaderRecords(self)
|
jpayne@69
|
2003
|
jpayne@69
|
2004 @property
|
jpayne@69
|
2005 def contigs(self):
|
jpayne@69
|
2006 """contig information (:class:`VariantHeaderContigs`)"""
|
jpayne@69
|
2007 return makeVariantHeaderContigs(self)
|
jpayne@69
|
2008
|
jpayne@69
|
2009 @property
|
jpayne@69
|
2010 def filters(self):
|
jpayne@69
|
2011 """filter metadata (:class:`VariantHeaderMetadata`)"""
|
jpayne@69
|
2012 return makeVariantHeaderMetadata(self, BCF_HL_FLT)
|
jpayne@69
|
2013
|
jpayne@69
|
2014 @property
|
jpayne@69
|
2015 def info(self):
|
jpayne@69
|
2016 """info metadata (:class:`VariantHeaderMetadata`)"""
|
jpayne@69
|
2017 return makeVariantHeaderMetadata(self, BCF_HL_INFO)
|
jpayne@69
|
2018
|
jpayne@69
|
2019 @property
|
jpayne@69
|
2020 def formats(self):
|
jpayne@69
|
2021 """format metadata (:class:`VariantHeaderMetadata`)"""
|
jpayne@69
|
2022 return makeVariantHeaderMetadata(self, BCF_HL_FMT)
|
jpayne@69
|
2023
|
jpayne@69
|
2024 @property
|
jpayne@69
|
2025 def alts(self):
|
jpayne@69
|
2026 """alt metadata (:class:`dict` ID->record).
|
jpayne@69
|
2027
|
jpayne@69
|
2028 The data returned just a snapshot of alt records, is created
|
jpayne@69
|
2029 every time the property is requested, and modifications will
|
jpayne@69
|
2030 not be reflected in the header metadata and vice versa.
|
jpayne@69
|
2031
|
jpayne@69
|
2032 i.e. it is just a dict that reflects the state of alt records
|
jpayne@69
|
2033 at the time it is created.
|
jpayne@69
|
2034 """
|
jpayne@69
|
2035 return {record['ID']:record for record in self.records
|
jpayne@69
|
2036 if record.key.upper() == 'ALT' }
|
jpayne@69
|
2037
|
jpayne@69
|
2038 # only safe to do when opening an htsfile
|
jpayne@69
|
2039 cdef _subset_samples(self, include_samples):
|
jpayne@69
|
2040 keep_samples = set(self.samples)
|
jpayne@69
|
2041 include_samples = set(include_samples)
|
jpayne@69
|
2042 missing_samples = include_samples - keep_samples
|
jpayne@69
|
2043 keep_samples &= include_samples
|
jpayne@69
|
2044
|
jpayne@69
|
2045 if missing_samples:
|
jpayne@69
|
2046 # FIXME: add specialized exception with payload
|
jpayne@69
|
2047 raise ValueError(
|
jpayne@69
|
2048 'missing {:d} requested samples'.format(
|
jpayne@69
|
2049 len(missing_samples)))
|
jpayne@69
|
2050
|
jpayne@69
|
2051 keep_samples = force_bytes(','.join(keep_samples))
|
jpayne@69
|
2052 cdef char *keep = <char *>keep_samples if keep_samples else NULL
|
jpayne@69
|
2053 cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
|
jpayne@69
|
2054
|
jpayne@69
|
2055 if ret != 0:
|
jpayne@69
|
2056 raise ValueError(
|
jpayne@69
|
2057 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
|
jpayne@69
|
2058
|
jpayne@69
|
2059 def __str__(self):
|
jpayne@69
|
2060 cdef int hlen
|
jpayne@69
|
2061 cdef kstring_t line
|
jpayne@69
|
2062 line.l = line.m = 0
|
jpayne@69
|
2063 line.s = NULL
|
jpayne@69
|
2064
|
jpayne@69
|
2065 if bcf_hdr_format(self.ptr, 0, &line) < 0:
|
jpayne@69
|
2066 if line.m:
|
jpayne@69
|
2067 free(line.s)
|
jpayne@69
|
2068 raise ValueError('bcf_hdr_format failed')
|
jpayne@69
|
2069
|
jpayne@69
|
2070 ret = charptr_to_str_w_len(line.s, line.l)
|
jpayne@69
|
2071
|
jpayne@69
|
2072 if line.m:
|
jpayne@69
|
2073 free(line.s)
|
jpayne@69
|
2074 return ret
|
jpayne@69
|
2075
|
jpayne@69
|
2076 def new_record(self, contig=None, start=0, stop=0, alleles=None,
|
jpayne@69
|
2077 id=None, qual=None, filter=None, info=None, samples=None,
|
jpayne@69
|
2078 **kwargs):
|
jpayne@69
|
2079 """Create a new empty VariantRecord.
|
jpayne@69
|
2080
|
jpayne@69
|
2081 Arguments are currently experimental. Use with caution and expect
|
jpayne@69
|
2082 changes in upcoming releases.
|
jpayne@69
|
2083
|
jpayne@69
|
2084 """
|
jpayne@69
|
2085 rec = makeVariantRecord(self, bcf_init())
|
jpayne@69
|
2086
|
jpayne@69
|
2087 if not rec:
|
jpayne@69
|
2088 raise MemoryError('unable to allocate BCF record')
|
jpayne@69
|
2089
|
jpayne@69
|
2090 rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
|
jpayne@69
|
2091
|
jpayne@69
|
2092 if contig is not None:
|
jpayne@69
|
2093 rec.contig = contig
|
jpayne@69
|
2094
|
jpayne@69
|
2095 rec.start = start
|
jpayne@69
|
2096 rec.stop = stop
|
jpayne@69
|
2097 rec.id = id
|
jpayne@69
|
2098 rec.qual = qual
|
jpayne@69
|
2099
|
jpayne@69
|
2100 if alleles is not None:
|
jpayne@69
|
2101 rec.alleles = alleles
|
jpayne@69
|
2102
|
jpayne@69
|
2103 if filter is not None:
|
jpayne@69
|
2104 if isinstance(filter, (list, tuple, VariantRecordFilter)):
|
jpayne@69
|
2105 for f in filter:
|
jpayne@69
|
2106 rec.filter.add(f)
|
jpayne@69
|
2107 else:
|
jpayne@69
|
2108 rec.filter.add(filter)
|
jpayne@69
|
2109
|
jpayne@69
|
2110 if info:
|
jpayne@69
|
2111 rec.info.update(info)
|
jpayne@69
|
2112
|
jpayne@69
|
2113 if kwargs:
|
jpayne@69
|
2114 rec.samples[0].update(kwargs)
|
jpayne@69
|
2115
|
jpayne@69
|
2116 if samples:
|
jpayne@69
|
2117 for i, sample in enumerate(samples):
|
jpayne@69
|
2118 rec.samples[i].update(sample)
|
jpayne@69
|
2119
|
jpayne@69
|
2120 return rec
|
jpayne@69
|
2121
|
jpayne@69
|
2122 def add_record(self, VariantHeaderRecord record):
|
jpayne@69
|
2123 """Add an existing :class:`VariantHeaderRecord` to this header"""
|
jpayne@69
|
2124 if record is None:
|
jpayne@69
|
2125 raise ValueError('record must not be None')
|
jpayne@69
|
2126
|
jpayne@69
|
2127 cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr)
|
jpayne@69
|
2128
|
jpayne@69
|
2129 bcf_hdr_add_hrec(self.ptr, hrec)
|
jpayne@69
|
2130
|
jpayne@69
|
2131 self._hdr_sync()
|
jpayne@69
|
2132
|
jpayne@69
|
2133 def add_line(self, line):
|
jpayne@69
|
2134 """Add a metadata line to this header"""
|
jpayne@69
|
2135 bline = force_bytes(line)
|
jpayne@69
|
2136 if bcf_hdr_append(self.ptr, bline) < 0:
|
jpayne@69
|
2137 raise ValueError('invalid header line')
|
jpayne@69
|
2138
|
jpayne@69
|
2139 self._hdr_sync()
|
jpayne@69
|
2140
|
jpayne@69
|
2141
|
jpayne@69
|
2142 def add_meta(self, key, value=None, items=None):
|
jpayne@69
|
2143 """Add metadata to this header"""
|
jpayne@69
|
2144 if not ((value is not None) ^ (items is not None)):
|
jpayne@69
|
2145 raise ValueError('either value or items must be specified')
|
jpayne@69
|
2146
|
jpayne@69
|
2147 cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
|
jpayne@69
|
2148 cdef int quoted
|
jpayne@69
|
2149
|
jpayne@69
|
2150 try:
|
jpayne@69
|
2151 key = force_bytes(key)
|
jpayne@69
|
2152 hrec.key = strdup(key)
|
jpayne@69
|
2153
|
jpayne@69
|
2154 if value is not None:
|
jpayne@69
|
2155 hrec.value = strdup(force_bytes(value))
|
jpayne@69
|
2156 else:
|
jpayne@69
|
2157 for key, value in items:
|
jpayne@69
|
2158 quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type")
|
jpayne@69
|
2159
|
jpayne@69
|
2160 key = force_bytes(key)
|
jpayne@69
|
2161 bcf_hrec_add_key(hrec, key, <int>len(key))
|
jpayne@69
|
2162
|
jpayne@69
|
2163 value = force_bytes(str(value))
|
jpayne@69
|
2164 bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
|
jpayne@69
|
2165 except:
|
jpayne@69
|
2166 bcf_hrec_destroy(hrec)
|
jpayne@69
|
2167 raise
|
jpayne@69
|
2168
|
jpayne@69
|
2169 bcf_hdr_add_hrec(self.ptr, hrec)
|
jpayne@69
|
2170
|
jpayne@69
|
2171 self._hdr_sync()
|
jpayne@69
|
2172
|
jpayne@69
|
2173 cdef _add_sample(self, name):
|
jpayne@69
|
2174 bname = force_bytes(name)
|
jpayne@69
|
2175 if bcf_hdr_add_sample(self.ptr, bname) < 0:
|
jpayne@69
|
2176 raise ValueError('Duplicated sample name: {}'.format(name))
|
jpayne@69
|
2177
|
jpayne@69
|
2178 cdef _hdr_sync(self):
|
jpayne@69
|
2179 cdef bcf_hdr_t *hdr = self.ptr
|
jpayne@69
|
2180 if hdr.dirty:
|
jpayne@69
|
2181 if bcf_hdr_sync(hdr) < 0:
|
jpayne@69
|
2182 raise MemoryError('unable to reallocate VariantHeader')
|
jpayne@69
|
2183
|
jpayne@69
|
2184 def add_sample(self, name):
|
jpayne@69
|
2185 """Add a new sample to this header"""
|
jpayne@69
|
2186 self._add_sample(name)
|
jpayne@69
|
2187 self._hdr_sync()
|
jpayne@69
|
2188
|
jpayne@69
|
2189 def add_samples(self, *args):
|
jpayne@69
|
2190 """Add several new samples to this header.
|
jpayne@69
|
2191 This function takes multiple arguments, each of which may
|
jpayne@69
|
2192 be either a sample name or an iterable returning sample names
|
jpayne@69
|
2193 (e.g., a list of sample names).
|
jpayne@69
|
2194 """
|
jpayne@69
|
2195 for arg in args:
|
jpayne@69
|
2196 if isinstance(arg, str):
|
jpayne@69
|
2197 self._add_sample(arg)
|
jpayne@69
|
2198 else:
|
jpayne@69
|
2199 for name in arg:
|
jpayne@69
|
2200 self._add_sample(name)
|
jpayne@69
|
2201 self._hdr_sync()
|
jpayne@69
|
2202
|
jpayne@69
|
2203
|
jpayne@69
|
2204 cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
|
jpayne@69
|
2205 if not hdr:
|
jpayne@69
|
2206 raise ValueError('cannot create VariantHeader')
|
jpayne@69
|
2207
|
jpayne@69
|
2208 cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
|
jpayne@69
|
2209 header.ptr = hdr
|
jpayne@69
|
2210
|
jpayne@69
|
2211 return header
|
jpayne@69
|
2212
|
jpayne@69
|
2213
|
jpayne@69
|
2214 cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2:
|
jpayne@69
|
2215 cdef vdict_t *d
|
jpayne@69
|
2216 cdef khiter_t k
|
jpayne@69
|
2217 cdef int info_id
|
jpayne@69
|
2218
|
jpayne@69
|
2219 if isinstance(key, str):
|
jpayne@69
|
2220 key = force_bytes(key)
|
jpayne@69
|
2221
|
jpayne@69
|
2222 d = <vdict_t *>hdr.dict[BCF_DT_ID]
|
jpayne@69
|
2223 k = kh_get_vdict(d, key)
|
jpayne@69
|
2224
|
jpayne@69
|
2225 if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
|
jpayne@69
|
2226 return -1
|
jpayne@69
|
2227
|
jpayne@69
|
2228 return kh_val_vdict(d, k).id
|
jpayne@69
|
2229
|
jpayne@69
|
2230
|
jpayne@69
|
2231 ########################################################################
|
jpayne@69
|
2232 ########################################################################
|
jpayne@69
|
2233 ## Variant Record objects
|
jpayne@69
|
2234 ########################################################################
|
jpayne@69
|
2235
|
jpayne@69
|
2236 cdef class VariantRecordFilter(object):
|
jpayne@69
|
2237 """Filters set on a :class:`VariantRecord` object, presented as a mapping from
|
jpayne@69
|
2238 filter index or name to :class:`VariantMetadata` object"""
|
jpayne@69
|
2239 def __init__(self, *args, **kwargs):
|
jpayne@69
|
2240 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
2241
|
jpayne@69
|
2242 def __len__(self):
|
jpayne@69
|
2243 return self.record.ptr.d.n_flt
|
jpayne@69
|
2244
|
jpayne@69
|
2245 def __bool__(self):
|
jpayne@69
|
2246 return self.record.ptr.d.n_flt != 0
|
jpayne@69
|
2247
|
jpayne@69
|
2248 def __getitem__(self, key):
|
jpayne@69
|
2249 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2250 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2251 cdef int index, id
|
jpayne@69
|
2252 cdef int n = r.d.n_flt
|
jpayne@69
|
2253
|
jpayne@69
|
2254 if isinstance(key, int):
|
jpayne@69
|
2255 index = key
|
jpayne@69
|
2256
|
jpayne@69
|
2257 if index < 0 or index >= n:
|
jpayne@69
|
2258 raise IndexError('invalid filter index')
|
jpayne@69
|
2259
|
jpayne@69
|
2260 id = r.d.flt[index]
|
jpayne@69
|
2261 else:
|
jpayne@69
|
2262 if key == '.':
|
jpayne@69
|
2263 key = 'PASS'
|
jpayne@69
|
2264
|
jpayne@69
|
2265 bkey = force_bytes(key)
|
jpayne@69
|
2266 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
|
jpayne@69
|
2267
|
jpayne@69
|
2268 if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
|
jpayne@69
|
2269 raise KeyError('Invalid filter: {}'.format(key))
|
jpayne@69
|
2270
|
jpayne@69
|
2271 return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
|
jpayne@69
|
2272
|
jpayne@69
|
2273 def add(self, key):
|
jpayne@69
|
2274 """Add a new filter"""
|
jpayne@69
|
2275 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2276 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2277 cdef int id
|
jpayne@69
|
2278
|
jpayne@69
|
2279 if key == '.':
|
jpayne@69
|
2280 key = 'PASS'
|
jpayne@69
|
2281
|
jpayne@69
|
2282 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2283 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
|
jpayne@69
|
2284
|
jpayne@69
|
2285 if not check_header_id(hdr, BCF_HL_FLT, id):
|
jpayne@69
|
2286 raise KeyError('Invalid filter: {}'.format(key))
|
jpayne@69
|
2287
|
jpayne@69
|
2288 bcf_add_filter(hdr, r, id)
|
jpayne@69
|
2289
|
jpayne@69
|
2290 def __delitem__(self, key):
|
jpayne@69
|
2291 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2292 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2293 cdef int index, id
|
jpayne@69
|
2294 cdef int n = r.d.n_flt
|
jpayne@69
|
2295
|
jpayne@69
|
2296 if isinstance(key, int):
|
jpayne@69
|
2297 index = key
|
jpayne@69
|
2298
|
jpayne@69
|
2299 if index < 0 or index >= n:
|
jpayne@69
|
2300 raise IndexError('invalid filter index')
|
jpayne@69
|
2301
|
jpayne@69
|
2302 id = r.d.flt[index]
|
jpayne@69
|
2303 else:
|
jpayne@69
|
2304 if key == '.':
|
jpayne@69
|
2305 key = 'PASS'
|
jpayne@69
|
2306
|
jpayne@69
|
2307 bkey = force_bytes(key)
|
jpayne@69
|
2308 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
|
jpayne@69
|
2309
|
jpayne@69
|
2310 if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
|
jpayne@69
|
2311 raise KeyError('Invalid filter: {}'.format(key))
|
jpayne@69
|
2312
|
jpayne@69
|
2313 bcf_remove_filter(hdr, r, id, 0)
|
jpayne@69
|
2314
|
jpayne@69
|
2315 def clear(self):
|
jpayne@69
|
2316 """Clear all filters"""
|
jpayne@69
|
2317 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2318 r.d.shared_dirty |= BCF1_DIRTY_FLT
|
jpayne@69
|
2319 r.d.n_flt = 0
|
jpayne@69
|
2320
|
jpayne@69
|
2321 def __iter__(self):
|
jpayne@69
|
2322 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2323 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2324 cdef int i
|
jpayne@69
|
2325
|
jpayne@69
|
2326 for i in range(r.d.n_flt):
|
jpayne@69
|
2327 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
|
jpayne@69
|
2328
|
jpayne@69
|
2329 def get(self, key, default=None):
|
jpayne@69
|
2330 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
|
jpayne@69
|
2331 try:
|
jpayne@69
|
2332 return self[key]
|
jpayne@69
|
2333 except KeyError:
|
jpayne@69
|
2334 return default
|
jpayne@69
|
2335
|
jpayne@69
|
2336 def __contains__(self, key):
|
jpayne@69
|
2337 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2338 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2339 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2340 return bcf_has_filter(hdr, r, bkey) == 1
|
jpayne@69
|
2341
|
jpayne@69
|
2342 def iterkeys(self):
|
jpayne@69
|
2343 """D.iterkeys() -> an iterator over the keys of D"""
|
jpayne@69
|
2344 return iter(self)
|
jpayne@69
|
2345
|
jpayne@69
|
2346 def itervalues(self):
|
jpayne@69
|
2347 """D.itervalues() -> an iterator over the values of D"""
|
jpayne@69
|
2348 for key in self:
|
jpayne@69
|
2349 yield self[key]
|
jpayne@69
|
2350
|
jpayne@69
|
2351 def iteritems(self):
|
jpayne@69
|
2352 """D.iteritems() -> an iterator over the (key, value) items of D"""
|
jpayne@69
|
2353 for key in self:
|
jpayne@69
|
2354 yield (key, self[key])
|
jpayne@69
|
2355
|
jpayne@69
|
2356 def keys(self):
|
jpayne@69
|
2357 """D.keys() -> list of D's keys"""
|
jpayne@69
|
2358 return list(self)
|
jpayne@69
|
2359
|
jpayne@69
|
2360 def items(self):
|
jpayne@69
|
2361 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
|
jpayne@69
|
2362 return list(self.iteritems())
|
jpayne@69
|
2363
|
jpayne@69
|
2364 def values(self):
|
jpayne@69
|
2365 """D.values() -> list of D's values"""
|
jpayne@69
|
2366 return list(self.itervalues())
|
jpayne@69
|
2367
|
jpayne@69
|
2368 def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op):
|
jpayne@69
|
2369 if op != 2 and op != 3:
|
jpayne@69
|
2370 return NotImplemented
|
jpayne@69
|
2371
|
jpayne@69
|
2372 cdef bcf1_t *s = self.record.ptr
|
jpayne@69
|
2373 cdef bcf1_t *o = other.record.ptr
|
jpayne@69
|
2374
|
jpayne@69
|
2375 cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other))
|
jpayne@69
|
2376
|
jpayne@69
|
2377 if op == 3:
|
jpayne@69
|
2378 cmp = not cmp
|
jpayne@69
|
2379
|
jpayne@69
|
2380 return cmp
|
jpayne@69
|
2381
|
jpayne@69
|
2382 # Mappings are not hashable by default, but subclasses can change this
|
jpayne@69
|
2383 __hash__ = None
|
jpayne@69
|
2384
|
jpayne@69
|
2385 #TODO: implement __richcmp__
|
jpayne@69
|
2386
|
jpayne@69
|
2387
|
jpayne@69
|
2388 cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
|
jpayne@69
|
2389 if not record:
|
jpayne@69
|
2390 raise ValueError('invalid VariantRecord')
|
jpayne@69
|
2391
|
jpayne@69
|
2392 cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
|
jpayne@69
|
2393 filter.record = record
|
jpayne@69
|
2394
|
jpayne@69
|
2395 return filter
|
jpayne@69
|
2396
|
jpayne@69
|
2397
|
jpayne@69
|
2398 cdef class VariantRecordFormat(object):
|
jpayne@69
|
2399 """Format data present for each sample in a :class:`VariantRecord` object,
|
jpayne@69
|
2400 presented as mapping from format name to :class:`VariantMetadata` object."""
|
jpayne@69
|
2401 def __init__(self, *args, **kwargs):
|
jpayne@69
|
2402 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
2403
|
jpayne@69
|
2404 def __len__(self):
|
jpayne@69
|
2405 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2406 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2407 cdef int i, n = 0
|
jpayne@69
|
2408
|
jpayne@69
|
2409 for i in range(r.n_fmt):
|
jpayne@69
|
2410 if r.d.fmt[i].p:
|
jpayne@69
|
2411 n += 1
|
jpayne@69
|
2412 return n
|
jpayne@69
|
2413
|
jpayne@69
|
2414 def __bool__(self):
|
jpayne@69
|
2415 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2416 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2417 cdef int i
|
jpayne@69
|
2418
|
jpayne@69
|
2419 for i in range(r.n_fmt):
|
jpayne@69
|
2420 if r.d.fmt[i].p:
|
jpayne@69
|
2421 return True
|
jpayne@69
|
2422 return False
|
jpayne@69
|
2423
|
jpayne@69
|
2424 def __getitem__(self, key):
|
jpayne@69
|
2425 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2426 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2427
|
jpayne@69
|
2428 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2429 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
|
jpayne@69
|
2430
|
jpayne@69
|
2431 if not fmt or not fmt.p:
|
jpayne@69
|
2432 raise KeyError('unknown format: {}'.format(key))
|
jpayne@69
|
2433
|
jpayne@69
|
2434 return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
|
jpayne@69
|
2435
|
jpayne@69
|
2436 def __delitem__(self, key):
|
jpayne@69
|
2437 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2438 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2439
|
jpayne@69
|
2440 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2441 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
|
jpayne@69
|
2442
|
jpayne@69
|
2443 if not fmt or not fmt.p:
|
jpayne@69
|
2444 raise KeyError('unknown format: {}'.format(key))
|
jpayne@69
|
2445
|
jpayne@69
|
2446 if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
|
jpayne@69
|
2447 raise ValueError('Unable to delete FORMAT')
|
jpayne@69
|
2448
|
jpayne@69
|
2449 def clear(self):
|
jpayne@69
|
2450 """Clear all formats for all samples within the associated
|
jpayne@69
|
2451 :class:`VariantRecord` instance"""
|
jpayne@69
|
2452 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2453 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2454 cdef bcf_fmt_t *fmt
|
jpayne@69
|
2455 cdef const char *key
|
jpayne@69
|
2456 cdef int i
|
jpayne@69
|
2457
|
jpayne@69
|
2458 for i in reversed(range(r.n_fmt)):
|
jpayne@69
|
2459 fmt = &r.d.fmt[i]
|
jpayne@69
|
2460 if fmt.p:
|
jpayne@69
|
2461 key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
|
jpayne@69
|
2462 if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
|
jpayne@69
|
2463 raise ValueError('Unable to delete FORMAT')
|
jpayne@69
|
2464
|
jpayne@69
|
2465 def __iter__(self):
|
jpayne@69
|
2466 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2467 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2468 cdef bcf_fmt_t *fmt
|
jpayne@69
|
2469 cdef int i
|
jpayne@69
|
2470
|
jpayne@69
|
2471 for i in range(r.n_fmt):
|
jpayne@69
|
2472 fmt = &r.d.fmt[i]
|
jpayne@69
|
2473 if fmt.p:
|
jpayne@69
|
2474 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
|
jpayne@69
|
2475
|
jpayne@69
|
2476 def get(self, key, default=None):
|
jpayne@69
|
2477 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
|
jpayne@69
|
2478 try:
|
jpayne@69
|
2479 return self[key]
|
jpayne@69
|
2480 except KeyError:
|
jpayne@69
|
2481 return default
|
jpayne@69
|
2482
|
jpayne@69
|
2483 def __contains__(self, key):
|
jpayne@69
|
2484 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2485 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2486 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2487 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
|
jpayne@69
|
2488 return fmt != NULL and fmt.p != NULL
|
jpayne@69
|
2489
|
jpayne@69
|
2490 def iterkeys(self):
|
jpayne@69
|
2491 """D.iterkeys() -> an iterator over the keys of D"""
|
jpayne@69
|
2492 return iter(self)
|
jpayne@69
|
2493
|
jpayne@69
|
2494 def itervalues(self):
|
jpayne@69
|
2495 """D.itervalues() -> an iterator over the values of D"""
|
jpayne@69
|
2496 for key in self:
|
jpayne@69
|
2497 yield self[key]
|
jpayne@69
|
2498
|
jpayne@69
|
2499 def iteritems(self):
|
jpayne@69
|
2500 """D.iteritems() -> an iterator over the (key, value) items of D"""
|
jpayne@69
|
2501 for key in self:
|
jpayne@69
|
2502 yield (key, self[key])
|
jpayne@69
|
2503
|
jpayne@69
|
2504 def keys(self):
|
jpayne@69
|
2505 """D.keys() -> list of D's keys"""
|
jpayne@69
|
2506 return list(self)
|
jpayne@69
|
2507
|
jpayne@69
|
2508 def items(self):
|
jpayne@69
|
2509 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
|
jpayne@69
|
2510 return list(self.iteritems())
|
jpayne@69
|
2511
|
jpayne@69
|
2512 def values(self):
|
jpayne@69
|
2513 """D.values() -> list of D's values"""
|
jpayne@69
|
2514 return list(self.itervalues())
|
jpayne@69
|
2515
|
jpayne@69
|
2516 # Mappings are not hashable by default, but subclasses can change this
|
jpayne@69
|
2517 __hash__ = None
|
jpayne@69
|
2518
|
jpayne@69
|
2519 #TODO: implement __richcmp__
|
jpayne@69
|
2520
|
jpayne@69
|
2521
|
jpayne@69
|
2522 cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
|
jpayne@69
|
2523 if not record:
|
jpayne@69
|
2524 raise ValueError('invalid VariantRecord')
|
jpayne@69
|
2525
|
jpayne@69
|
2526 cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
|
jpayne@69
|
2527 format.record = record
|
jpayne@69
|
2528
|
jpayne@69
|
2529 return format
|
jpayne@69
|
2530
|
jpayne@69
|
2531
|
jpayne@69
|
2532 #TODO: Add a getmeta method to return the corresponding VariantMetadata?
|
jpayne@69
|
2533 cdef class VariantRecordInfo(object):
|
jpayne@69
|
2534 """Info data stored in a :class:`VariantRecord` object, presented as a
|
jpayne@69
|
2535 mapping from info metadata name to value."""
|
jpayne@69
|
2536
|
jpayne@69
|
2537 def __init__(self, *args, **kwargs):
|
jpayne@69
|
2538 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
2539
|
jpayne@69
|
2540 def __len__(self):
|
jpayne@69
|
2541 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2542 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2543 cdef bcf_info_t *info
|
jpayne@69
|
2544 cdef const char *key
|
jpayne@69
|
2545 cdef int i, count = 0
|
jpayne@69
|
2546
|
jpayne@69
|
2547 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2548 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2549
|
jpayne@69
|
2550 for i in range(r.n_info):
|
jpayne@69
|
2551 info = &r.d.info[i]
|
jpayne@69
|
2552 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
|
jpayne@69
|
2553 if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
|
jpayne@69
|
2554 count += 1
|
jpayne@69
|
2555
|
jpayne@69
|
2556 return count
|
jpayne@69
|
2557
|
jpayne@69
|
2558 def __bool__(self):
|
jpayne@69
|
2559 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2560 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2561 cdef bcf_info_t *info
|
jpayne@69
|
2562 cdef const char *key
|
jpayne@69
|
2563 cdef int i
|
jpayne@69
|
2564
|
jpayne@69
|
2565 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2566 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2567
|
jpayne@69
|
2568 for i in range(r.n_info):
|
jpayne@69
|
2569 info = &r.d.info[i]
|
jpayne@69
|
2570 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
|
jpayne@69
|
2571 if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
|
jpayne@69
|
2572 return True
|
jpayne@69
|
2573
|
jpayne@69
|
2574 return False
|
jpayne@69
|
2575
|
jpayne@69
|
2576 def __getitem__(self, key):
|
jpayne@69
|
2577 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2578 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2579
|
jpayne@69
|
2580 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2581 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2582
|
jpayne@69
|
2583 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2584
|
jpayne@69
|
2585 if strcmp(bkey, b'END') == 0:
|
jpayne@69
|
2586 raise KeyError('END is a reserved attribute; access is via record.stop')
|
jpayne@69
|
2587
|
jpayne@69
|
2588 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
|
jpayne@69
|
2589
|
jpayne@69
|
2590 # Cannot stop here if info == NULL, since flags must return False
|
jpayne@69
|
2591 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
|
jpayne@69
|
2592
|
jpayne@69
|
2593 if info_id < 0:
|
jpayne@69
|
2594 raise KeyError('Unknown INFO field: {}'.format(key))
|
jpayne@69
|
2595
|
jpayne@69
|
2596 if not check_header_id(hdr, BCF_HL_INFO, info_id):
|
jpayne@69
|
2597 raise ValueError('Invalid header')
|
jpayne@69
|
2598
|
jpayne@69
|
2599 # Handle type=Flag values
|
jpayne@69
|
2600 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
|
jpayne@69
|
2601 return info != NULL and info.vptr != NULL
|
jpayne@69
|
2602
|
jpayne@69
|
2603 if not info or not info.vptr:
|
jpayne@69
|
2604 raise KeyError('Invalid INFO field: {}'.format(key))
|
jpayne@69
|
2605
|
jpayne@69
|
2606 return bcf_info_get_value(self.record, info)
|
jpayne@69
|
2607
|
jpayne@69
|
2608 def __setitem__(self, key, value):
|
jpayne@69
|
2609 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2610
|
jpayne@69
|
2611 if strcmp(bkey, b'END') == 0:
|
jpayne@69
|
2612 raise KeyError('END is a reserved attribute; access is via record.stop')
|
jpayne@69
|
2613
|
jpayne@69
|
2614 if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0:
|
jpayne@69
|
2615 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2616
|
jpayne@69
|
2617 bcf_info_set_value(self.record, key, value)
|
jpayne@69
|
2618
|
jpayne@69
|
2619 def __delitem__(self, key):
|
jpayne@69
|
2620 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2621 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2622
|
jpayne@69
|
2623 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2624 if strcmp(bkey, b'END') == 0:
|
jpayne@69
|
2625 raise KeyError('END is a reserved attribute; access is via record.stop')
|
jpayne@69
|
2626
|
jpayne@69
|
2627 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2628 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2629
|
jpayne@69
|
2630 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
|
jpayne@69
|
2631
|
jpayne@69
|
2632 # Cannot stop here if info == NULL, since flags must return False
|
jpayne@69
|
2633 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
|
jpayne@69
|
2634
|
jpayne@69
|
2635 if info_id < 0:
|
jpayne@69
|
2636 raise KeyError('Unknown INFO field: {}'.format(key))
|
jpayne@69
|
2637
|
jpayne@69
|
2638 if not check_header_id(hdr, BCF_HL_INFO, info_id):
|
jpayne@69
|
2639 raise ValueError('Invalid header')
|
jpayne@69
|
2640
|
jpayne@69
|
2641 # Handle flags
|
jpayne@69
|
2642 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
|
jpayne@69
|
2643 return
|
jpayne@69
|
2644
|
jpayne@69
|
2645 if not info or not info.vptr:
|
jpayne@69
|
2646 raise KeyError('Unknown INFO field: {}'.format(key))
|
jpayne@69
|
2647
|
jpayne@69
|
2648 if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
|
jpayne@69
|
2649 raise ValueError('Unable to delete INFO')
|
jpayne@69
|
2650
|
jpayne@69
|
2651 def clear(self):
|
jpayne@69
|
2652 """Clear all info data"""
|
jpayne@69
|
2653 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2654 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2655 cdef bcf_info_t *info
|
jpayne@69
|
2656 cdef const char *key
|
jpayne@69
|
2657 cdef int i
|
jpayne@69
|
2658
|
jpayne@69
|
2659 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2660 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2661
|
jpayne@69
|
2662 for i in range(r.n_info):
|
jpayne@69
|
2663 info = &r.d.info[i]
|
jpayne@69
|
2664 if info and info.vptr:
|
jpayne@69
|
2665 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
|
jpayne@69
|
2666 if strcmp(key, b'END') == 0:
|
jpayne@69
|
2667 continue
|
jpayne@69
|
2668 if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
|
jpayne@69
|
2669 raise ValueError('Unable to delete INFO')
|
jpayne@69
|
2670
|
jpayne@69
|
2671 def __iter__(self):
|
jpayne@69
|
2672 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2673 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2674 cdef bcf_info_t *info
|
jpayne@69
|
2675 cdef const char *key
|
jpayne@69
|
2676 cdef int i
|
jpayne@69
|
2677
|
jpayne@69
|
2678 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2679 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2680
|
jpayne@69
|
2681 for i in range(r.n_info):
|
jpayne@69
|
2682 info = &r.d.info[i]
|
jpayne@69
|
2683 if info and info.vptr:
|
jpayne@69
|
2684 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
|
jpayne@69
|
2685 if strcmp(key, b'END') != 0:
|
jpayne@69
|
2686 yield bcf_str_cache_get_charptr(key)
|
jpayne@69
|
2687
|
jpayne@69
|
2688 def get(self, key, default=None):
|
jpayne@69
|
2689 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
|
jpayne@69
|
2690 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2691 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2692
|
jpayne@69
|
2693 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2694 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2695
|
jpayne@69
|
2696 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2697
|
jpayne@69
|
2698 if strcmp(bkey, b'END') == 0:
|
jpayne@69
|
2699 return default
|
jpayne@69
|
2700
|
jpayne@69
|
2701 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
|
jpayne@69
|
2702
|
jpayne@69
|
2703 # Cannot stop here if info == NULL, since flags must return False
|
jpayne@69
|
2704 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
|
jpayne@69
|
2705
|
jpayne@69
|
2706 if not check_header_id(hdr, BCF_HL_INFO, info_id):
|
jpayne@69
|
2707 raise ValueError('Invalid header')
|
jpayne@69
|
2708
|
jpayne@69
|
2709 # Handle flags
|
jpayne@69
|
2710 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
|
jpayne@69
|
2711 return info != NULL and info.vptr != NULL
|
jpayne@69
|
2712
|
jpayne@69
|
2713 if not info or not info.vptr:
|
jpayne@69
|
2714 return default
|
jpayne@69
|
2715
|
jpayne@69
|
2716 return bcf_info_get_value(self.record, info)
|
jpayne@69
|
2717
|
jpayne@69
|
2718 def __contains__(self, key):
|
jpayne@69
|
2719 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2720 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2721
|
jpayne@69
|
2722 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2723 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2724
|
jpayne@69
|
2725 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2726
|
jpayne@69
|
2727 if strcmp(bkey, b'END') == 0:
|
jpayne@69
|
2728 return False
|
jpayne@69
|
2729
|
jpayne@69
|
2730 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
|
jpayne@69
|
2731
|
jpayne@69
|
2732 return info != NULL and info.vptr != NULL
|
jpayne@69
|
2733
|
jpayne@69
|
2734 def iterkeys(self):
|
jpayne@69
|
2735 """D.iterkeys() -> an iterator over the keys of D"""
|
jpayne@69
|
2736 return iter(self)
|
jpayne@69
|
2737
|
jpayne@69
|
2738 def itervalues(self):
|
jpayne@69
|
2739 """D.itervalues() -> an iterator over the values of D"""
|
jpayne@69
|
2740 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2741 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2742 cdef bcf_info_t *info
|
jpayne@69
|
2743 cdef const char *key
|
jpayne@69
|
2744 cdef int i
|
jpayne@69
|
2745
|
jpayne@69
|
2746 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2747 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2748
|
jpayne@69
|
2749 for i in range(r.n_info):
|
jpayne@69
|
2750 info = &r.d.info[i]
|
jpayne@69
|
2751 if info and info.vptr:
|
jpayne@69
|
2752 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
|
jpayne@69
|
2753 if strcmp(key, b'END') != 0:
|
jpayne@69
|
2754 yield bcf_info_get_value(self.record, info)
|
jpayne@69
|
2755
|
jpayne@69
|
2756 def iteritems(self):
|
jpayne@69
|
2757 """D.iteritems() -> an iterator over the (key, value) items of D"""
|
jpayne@69
|
2758 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2759 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2760 cdef bcf_info_t *info
|
jpayne@69
|
2761 cdef const char *key
|
jpayne@69
|
2762 cdef int i
|
jpayne@69
|
2763
|
jpayne@69
|
2764 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2765 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2766
|
jpayne@69
|
2767 for i in range(r.n_info):
|
jpayne@69
|
2768 info = &r.d.info[i]
|
jpayne@69
|
2769 if info and info.vptr:
|
jpayne@69
|
2770 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
|
jpayne@69
|
2771 if strcmp(key, b'END') != 0:
|
jpayne@69
|
2772 value = bcf_info_get_value(self.record, info)
|
jpayne@69
|
2773 yield bcf_str_cache_get_charptr(key), value
|
jpayne@69
|
2774
|
jpayne@69
|
2775 def keys(self):
|
jpayne@69
|
2776 """D.keys() -> list of D's keys"""
|
jpayne@69
|
2777 return list(self)
|
jpayne@69
|
2778
|
jpayne@69
|
2779 def items(self):
|
jpayne@69
|
2780 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
|
jpayne@69
|
2781 return list(self.iteritems())
|
jpayne@69
|
2782
|
jpayne@69
|
2783 def values(self):
|
jpayne@69
|
2784 """D.values() -> list of D's values"""
|
jpayne@69
|
2785 return list(self.itervalues())
|
jpayne@69
|
2786
|
jpayne@69
|
2787 def update(self, items=None, **kwargs):
|
jpayne@69
|
2788 """D.update([E, ]**F) -> None.
|
jpayne@69
|
2789
|
jpayne@69
|
2790 Update D from dict/iterable E and F.
|
jpayne@69
|
2791 """
|
jpayne@69
|
2792 for k, v in items.items():
|
jpayne@69
|
2793 if k != 'END':
|
jpayne@69
|
2794 self[k] = v
|
jpayne@69
|
2795
|
jpayne@69
|
2796 if kwargs:
|
jpayne@69
|
2797 kwargs.pop('END', None)
|
jpayne@69
|
2798 for k, v in kwargs.items():
|
jpayne@69
|
2799 self[k] = v
|
jpayne@69
|
2800
|
jpayne@69
|
2801 def pop(self, key, default=_nothing):
|
jpayne@69
|
2802 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2803 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2804
|
jpayne@69
|
2805 if bcf_unpack(r, BCF_UN_INFO) < 0:
|
jpayne@69
|
2806 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
2807
|
jpayne@69
|
2808 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
2809 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
|
jpayne@69
|
2810
|
jpayne@69
|
2811 # Cannot stop here if info == NULL, since flags must return False
|
jpayne@69
|
2812 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
|
jpayne@69
|
2813
|
jpayne@69
|
2814 if info_id < 0:
|
jpayne@69
|
2815 if default is _nothing:
|
jpayne@69
|
2816 raise KeyError('Unknown INFO field: {}'.format(key))
|
jpayne@69
|
2817 return default
|
jpayne@69
|
2818
|
jpayne@69
|
2819 if not check_header_id(hdr, BCF_HL_INFO, info_id):
|
jpayne@69
|
2820 raise ValueError('Invalid header')
|
jpayne@69
|
2821
|
jpayne@69
|
2822 # Handle flags
|
jpayne@69
|
2823 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
|
jpayne@69
|
2824 return
|
jpayne@69
|
2825
|
jpayne@69
|
2826 if not info or not info.vptr:
|
jpayne@69
|
2827 if default is _nothing:
|
jpayne@69
|
2828 raise KeyError('Unknown INFO field: {}'.format(key))
|
jpayne@69
|
2829 return default
|
jpayne@69
|
2830
|
jpayne@69
|
2831 value = bcf_info_get_value(self.record, info)
|
jpayne@69
|
2832
|
jpayne@69
|
2833 if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
|
jpayne@69
|
2834 raise ValueError('Unable to delete INFO')
|
jpayne@69
|
2835
|
jpayne@69
|
2836 return value
|
jpayne@69
|
2837
|
jpayne@69
|
2838 def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op):
|
jpayne@69
|
2839 if op != 2 and op != 3:
|
jpayne@69
|
2840 return NotImplemented
|
jpayne@69
|
2841
|
jpayne@69
|
2842 cdef bcf1_t *s = self.record.ptr
|
jpayne@69
|
2843 cdef bcf1_t *o = other.record.ptr
|
jpayne@69
|
2844
|
jpayne@69
|
2845 # Cannot use n_info as shortcut logic, since null values may remain
|
jpayne@69
|
2846 cdef bint cmp = dict(self) == dict(other)
|
jpayne@69
|
2847
|
jpayne@69
|
2848 if op == 3:
|
jpayne@69
|
2849 cmp = not cmp
|
jpayne@69
|
2850
|
jpayne@69
|
2851 return cmp
|
jpayne@69
|
2852
|
jpayne@69
|
2853 # Mappings are not hashable by default, but subclasses can change this
|
jpayne@69
|
2854 __hash__ = None
|
jpayne@69
|
2855
|
jpayne@69
|
2856
|
jpayne@69
|
2857 cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
|
jpayne@69
|
2858 if not record:
|
jpayne@69
|
2859 raise ValueError('invalid VariantRecord')
|
jpayne@69
|
2860
|
jpayne@69
|
2861 cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
|
jpayne@69
|
2862 info.record = record
|
jpayne@69
|
2863
|
jpayne@69
|
2864 return info
|
jpayne@69
|
2865
|
jpayne@69
|
2866
|
jpayne@69
|
2867 cdef class VariantRecordSamples(object):
|
jpayne@69
|
2868 """mapping from sample index or name to :class:`VariantRecordSample` object."""
|
jpayne@69
|
2869 def __init__(self, *args, **kwargs):
|
jpayne@69
|
2870 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
2871
|
jpayne@69
|
2872 def __len__(self):
|
jpayne@69
|
2873 return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr)
|
jpayne@69
|
2874
|
jpayne@69
|
2875 def __bool__(self):
|
jpayne@69
|
2876 return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0
|
jpayne@69
|
2877
|
jpayne@69
|
2878 def __getitem__(self, key):
|
jpayne@69
|
2879 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2880 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2881 cdef int n = self.record.ptr.n_sample
|
jpayne@69
|
2882 cdef int sample_index
|
jpayne@69
|
2883 cdef vdict_t *d
|
jpayne@69
|
2884 cdef khiter_t k
|
jpayne@69
|
2885
|
jpayne@69
|
2886 if isinstance(key, int):
|
jpayne@69
|
2887 sample_index = key
|
jpayne@69
|
2888 else:
|
jpayne@69
|
2889 bkey = force_bytes(key)
|
jpayne@69
|
2890 sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
|
jpayne@69
|
2891 if sample_index < 0:
|
jpayne@69
|
2892 raise KeyError('invalid sample name: {}'.format(key))
|
jpayne@69
|
2893
|
jpayne@69
|
2894 if sample_index < 0 or sample_index >= n:
|
jpayne@69
|
2895 raise IndexError('invalid sample index')
|
jpayne@69
|
2896
|
jpayne@69
|
2897 return makeVariantRecordSample(self.record, sample_index)
|
jpayne@69
|
2898
|
jpayne@69
|
2899 def __iter__(self):
|
jpayne@69
|
2900 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2901 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2902 cdef int32_t i, n = self.record.ptr.n_sample
|
jpayne@69
|
2903
|
jpayne@69
|
2904 for i in range(n):
|
jpayne@69
|
2905 yield charptr_to_str(hdr.samples[i])
|
jpayne@69
|
2906
|
jpayne@69
|
2907 def get(self, key, default=None):
|
jpayne@69
|
2908 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
|
jpayne@69
|
2909 try:
|
jpayne@69
|
2910 return self[key]
|
jpayne@69
|
2911 except KeyError:
|
jpayne@69
|
2912 return default
|
jpayne@69
|
2913
|
jpayne@69
|
2914 def __contains__(self, key):
|
jpayne@69
|
2915 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2916 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2917 cdef int n = self.record.ptr.n_sample
|
jpayne@69
|
2918 cdef int sample_index
|
jpayne@69
|
2919 cdef vdict_t *d
|
jpayne@69
|
2920 cdef khiter_t k
|
jpayne@69
|
2921
|
jpayne@69
|
2922 if isinstance(key, int):
|
jpayne@69
|
2923 sample_index = key
|
jpayne@69
|
2924 else:
|
jpayne@69
|
2925 bkey = force_bytes(key)
|
jpayne@69
|
2926 sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
|
jpayne@69
|
2927 if sample_index < 0:
|
jpayne@69
|
2928 raise KeyError('invalid sample name: {}'.format(key))
|
jpayne@69
|
2929
|
jpayne@69
|
2930 return 0 <= sample_index < n
|
jpayne@69
|
2931
|
jpayne@69
|
2932 def iterkeys(self):
|
jpayne@69
|
2933 """D.iterkeys() -> an iterator over the keys of D"""
|
jpayne@69
|
2934 return iter(self)
|
jpayne@69
|
2935
|
jpayne@69
|
2936 def itervalues(self):
|
jpayne@69
|
2937 """D.itervalues() -> an iterator over the values of D"""
|
jpayne@69
|
2938 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2939 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2940 cdef int32_t i, n = self.record.ptr.n_sample
|
jpayne@69
|
2941
|
jpayne@69
|
2942 for i in range(n):
|
jpayne@69
|
2943 yield makeVariantRecordSample(self.record, i)
|
jpayne@69
|
2944
|
jpayne@69
|
2945 def iteritems(self):
|
jpayne@69
|
2946 """D.iteritems() -> an iterator over the (key, value) items of D"""
|
jpayne@69
|
2947 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
2948 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
2949 cdef int32_t i, n = self.record.ptr.n_sample
|
jpayne@69
|
2950
|
jpayne@69
|
2951 for i in range(n):
|
jpayne@69
|
2952 yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
|
jpayne@69
|
2953
|
jpayne@69
|
2954 def keys(self):
|
jpayne@69
|
2955 """D.keys() -> list of D's keys"""
|
jpayne@69
|
2956 return list(self)
|
jpayne@69
|
2957
|
jpayne@69
|
2958 def items(self):
|
jpayne@69
|
2959 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
|
jpayne@69
|
2960 return list(self.iteritems())
|
jpayne@69
|
2961
|
jpayne@69
|
2962 def values(self):
|
jpayne@69
|
2963 """D.values() -> list of D's values"""
|
jpayne@69
|
2964 return list(self.itervalues())
|
jpayne@69
|
2965
|
jpayne@69
|
2966 def update(self, items=None, **kwargs):
|
jpayne@69
|
2967 """D.update([E, ]**F) -> None.
|
jpayne@69
|
2968
|
jpayne@69
|
2969 Update D from dict/iterable E and F.
|
jpayne@69
|
2970 """
|
jpayne@69
|
2971 for k, v in items.items():
|
jpayne@69
|
2972 self[k] = v
|
jpayne@69
|
2973
|
jpayne@69
|
2974 if kwargs:
|
jpayne@69
|
2975 for k, v in kwargs.items():
|
jpayne@69
|
2976 self[k] = v
|
jpayne@69
|
2977
|
jpayne@69
|
2978 def pop(self, key, default=_nothing):
|
jpayne@69
|
2979 try:
|
jpayne@69
|
2980 value = self[key]
|
jpayne@69
|
2981 del self[key]
|
jpayne@69
|
2982 return value
|
jpayne@69
|
2983 except KeyError:
|
jpayne@69
|
2984 if default is not _nothing:
|
jpayne@69
|
2985 return default
|
jpayne@69
|
2986 raise
|
jpayne@69
|
2987
|
jpayne@69
|
2988 def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op):
|
jpayne@69
|
2989 if op != 2 and op != 3:
|
jpayne@69
|
2990 return NotImplemented
|
jpayne@69
|
2991
|
jpayne@69
|
2992 cdef bcf1_t *s = self.record.ptr
|
jpayne@69
|
2993 cdef bcf1_t *o = other.record.ptr
|
jpayne@69
|
2994
|
jpayne@69
|
2995 cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values())
|
jpayne@69
|
2996
|
jpayne@69
|
2997 if op == 3:
|
jpayne@69
|
2998 cmp = not cmp
|
jpayne@69
|
2999
|
jpayne@69
|
3000 return cmp
|
jpayne@69
|
3001
|
jpayne@69
|
3002 # Mappings are not hashable by default, but subclasses can change this
|
jpayne@69
|
3003 __hash__ = None
|
jpayne@69
|
3004
|
jpayne@69
|
3005
|
jpayne@69
|
3006 cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
|
jpayne@69
|
3007 if not record:
|
jpayne@69
|
3008 raise ValueError('invalid VariantRecord')
|
jpayne@69
|
3009
|
jpayne@69
|
3010 cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
|
jpayne@69
|
3011 VariantRecordSamples)
|
jpayne@69
|
3012 samples.record = record
|
jpayne@69
|
3013
|
jpayne@69
|
3014 return samples
|
jpayne@69
|
3015
|
jpayne@69
|
3016
|
jpayne@69
|
3017 cdef class VariantRecord(object):
|
jpayne@69
|
3018 """Variant record"""
|
jpayne@69
|
3019 def __init__(self, *args, **kwargs):
|
jpayne@69
|
3020 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
3021
|
jpayne@69
|
3022 def __dealloc__(self):
|
jpayne@69
|
3023 if self.ptr:
|
jpayne@69
|
3024 bcf_destroy1(self.ptr)
|
jpayne@69
|
3025 self.ptr = NULL
|
jpayne@69
|
3026
|
jpayne@69
|
3027 def copy(self):
|
jpayne@69
|
3028 """return a copy of this VariantRecord object"""
|
jpayne@69
|
3029 return makeVariantRecord(self.header, bcf_dup(self.ptr))
|
jpayne@69
|
3030
|
jpayne@69
|
3031 def translate(self, VariantHeader dst_header):
|
jpayne@69
|
3032 if dst_header is None:
|
jpayne@69
|
3033 raise ValueError('dst_header must not be None')
|
jpayne@69
|
3034
|
jpayne@69
|
3035 cdef bcf_hdr_t *src_hdr = self.header.ptr
|
jpayne@69
|
3036 cdef bcf_hdr_t *dst_hdr = dst_header.ptr
|
jpayne@69
|
3037
|
jpayne@69
|
3038 if src_hdr != dst_hdr:
|
jpayne@69
|
3039 if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
|
jpayne@69
|
3040 msg = 'Cannot translate record. Number of samples does not match header ({} vs {})'
|
jpayne@69
|
3041 raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
|
jpayne@69
|
3042
|
jpayne@69
|
3043 bcf_translate(dst_hdr, src_hdr, self.ptr)
|
jpayne@69
|
3044 self.header = dst_header
|
jpayne@69
|
3045
|
jpayne@69
|
3046 @property
|
jpayne@69
|
3047 def rid(self):
|
jpayne@69
|
3048 """internal reference id number"""
|
jpayne@69
|
3049 return self.ptr.rid
|
jpayne@69
|
3050
|
jpayne@69
|
3051 @rid.setter
|
jpayne@69
|
3052 def rid(self, value):
|
jpayne@69
|
3053 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
3054 cdef int r = value
|
jpayne@69
|
3055 if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
|
jpayne@69
|
3056 raise ValueError('invalid reference id')
|
jpayne@69
|
3057 self.ptr.rid = r
|
jpayne@69
|
3058
|
jpayne@69
|
3059 @property
|
jpayne@69
|
3060 def chrom(self):
|
jpayne@69
|
3061 """chromosome/contig name"""
|
jpayne@69
|
3062 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
3063 cdef int rid = self.ptr.rid
|
jpayne@69
|
3064 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
|
jpayne@69
|
3065 raise ValueError('Invalid header')
|
jpayne@69
|
3066 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
|
jpayne@69
|
3067
|
jpayne@69
|
3068 @chrom.setter
|
jpayne@69
|
3069 def chrom(self, value):
|
jpayne@69
|
3070 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
|
jpayne@69
|
3071 bchrom = force_bytes(value)
|
jpayne@69
|
3072 cdef khint_t k = kh_get_vdict(d, bchrom)
|
jpayne@69
|
3073 if k == kh_end(d):
|
jpayne@69
|
3074 raise ValueError('Invalid chromosome/contig')
|
jpayne@69
|
3075 self.ptr.rid = kh_val_vdict(d, k).id
|
jpayne@69
|
3076
|
jpayne@69
|
3077 @property
|
jpayne@69
|
3078 def contig(self):
|
jpayne@69
|
3079 """chromosome/contig name"""
|
jpayne@69
|
3080 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
3081 cdef int rid = self.ptr.rid
|
jpayne@69
|
3082 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
|
jpayne@69
|
3083 raise ValueError('Invalid header')
|
jpayne@69
|
3084 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
|
jpayne@69
|
3085
|
jpayne@69
|
3086 @contig.setter
|
jpayne@69
|
3087 def contig(self, value):
|
jpayne@69
|
3088 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
|
jpayne@69
|
3089 bchrom = force_bytes(value)
|
jpayne@69
|
3090 cdef khint_t k = kh_get_vdict(d, bchrom)
|
jpayne@69
|
3091 if k == kh_end(d):
|
jpayne@69
|
3092 raise ValueError('Invalid chromosome/contig')
|
jpayne@69
|
3093 self.ptr.rid = kh_val_vdict(d, k).id
|
jpayne@69
|
3094
|
jpayne@69
|
3095 @property
|
jpayne@69
|
3096 def pos(self):
|
jpayne@69
|
3097 """record start position on chrom/contig (1-based inclusive)"""
|
jpayne@69
|
3098 return self.ptr.pos + 1
|
jpayne@69
|
3099
|
jpayne@69
|
3100 @pos.setter
|
jpayne@69
|
3101 def pos(self, value):
|
jpayne@69
|
3102 cdef int p = value
|
jpayne@69
|
3103 if p < 1:
|
jpayne@69
|
3104 raise ValueError('Position must be positive')
|
jpayne@69
|
3105 self.ptr.pos = p - 1
|
jpayne@69
|
3106 bcf_sync_end(self)
|
jpayne@69
|
3107
|
jpayne@69
|
3108 @property
|
jpayne@69
|
3109 def start(self):
|
jpayne@69
|
3110 """record start position on chrom/contig (0-based inclusive)"""
|
jpayne@69
|
3111 return self.ptr.pos
|
jpayne@69
|
3112
|
jpayne@69
|
3113 @start.setter
|
jpayne@69
|
3114 def start(self, value):
|
jpayne@69
|
3115 cdef int s = value
|
jpayne@69
|
3116 if s < 0:
|
jpayne@69
|
3117 raise ValueError('Start coordinate must be non-negative')
|
jpayne@69
|
3118 self.ptr.pos = s
|
jpayne@69
|
3119 bcf_sync_end(self)
|
jpayne@69
|
3120
|
jpayne@69
|
3121 @property
|
jpayne@69
|
3122 def stop(self):
|
jpayne@69
|
3123 """record stop position on chrom/contig (0-based exclusive)"""
|
jpayne@69
|
3124 return self.ptr.pos + self.ptr.rlen
|
jpayne@69
|
3125
|
jpayne@69
|
3126 @stop.setter
|
jpayne@69
|
3127 def stop(self, value):
|
jpayne@69
|
3128 cdef int s = value
|
jpayne@69
|
3129 if s < 0:
|
jpayne@69
|
3130 raise ValueError('Stop coordinate must be non-negative')
|
jpayne@69
|
3131 self.ptr.rlen = s - self.ptr.pos
|
jpayne@69
|
3132 bcf_sync_end(self)
|
jpayne@69
|
3133
|
jpayne@69
|
3134 @property
|
jpayne@69
|
3135 def rlen(self):
|
jpayne@69
|
3136 """record length on chrom/contig (aka rec.stop - rec.start)"""
|
jpayne@69
|
3137 return self.ptr.rlen
|
jpayne@69
|
3138
|
jpayne@69
|
3139 @rlen.setter
|
jpayne@69
|
3140 def rlen(self, value):
|
jpayne@69
|
3141 cdef int r = value
|
jpayne@69
|
3142 self.ptr.rlen = r
|
jpayne@69
|
3143 bcf_sync_end(self)
|
jpayne@69
|
3144
|
jpayne@69
|
3145 @property
|
jpayne@69
|
3146 def qual(self):
|
jpayne@69
|
3147 """phred scaled quality score or None if not available"""
|
jpayne@69
|
3148 return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
|
jpayne@69
|
3149
|
jpayne@69
|
3150 @qual.setter
|
jpayne@69
|
3151 def qual(self, value):
|
jpayne@69
|
3152 if value is not None:
|
jpayne@69
|
3153 self.ptr.qual = value
|
jpayne@69
|
3154 else:
|
jpayne@69
|
3155 bcf_float_set(&self.ptr.qual, bcf_float_missing)
|
jpayne@69
|
3156
|
jpayne@69
|
3157
|
jpayne@69
|
3158 # @property
|
jpayne@69
|
3159 # def n_allele(self):
|
jpayne@69
|
3160 # return self.ptr.n_allele
|
jpayne@69
|
3161
|
jpayne@69
|
3162 # @property
|
jpayne@69
|
3163 # def n_sample(self):
|
jpayne@69
|
3164 # return self.ptr.n_sample
|
jpayne@69
|
3165
|
jpayne@69
|
3166 @property
|
jpayne@69
|
3167 def id(self):
|
jpayne@69
|
3168 """record identifier or None if not available"""
|
jpayne@69
|
3169 cdef bcf1_t *r = self.ptr
|
jpayne@69
|
3170 if bcf_unpack(r, BCF_UN_STR) < 0:
|
jpayne@69
|
3171 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3172 # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
|
jpayne@69
|
3173 # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
|
jpayne@69
|
3174 if (r.d.m_id == 0):
|
jpayne@69
|
3175 raise ValueError('Error extracting ID')
|
jpayne@69
|
3176 return charptr_to_str(r.d.id) if r.d.id != b'.' else None
|
jpayne@69
|
3177
|
jpayne@69
|
3178 @id.setter
|
jpayne@69
|
3179 def id(self, value):
|
jpayne@69
|
3180 cdef bcf1_t *r = self.ptr
|
jpayne@69
|
3181 if bcf_unpack(r, BCF_UN_STR) < 0:
|
jpayne@69
|
3182 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3183 cdef char *idstr = NULL
|
jpayne@69
|
3184 if value is not None:
|
jpayne@69
|
3185 bid = force_bytes(value)
|
jpayne@69
|
3186 idstr = bid
|
jpayne@69
|
3187 if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
|
jpayne@69
|
3188 raise ValueError('Error updating id')
|
jpayne@69
|
3189
|
jpayne@69
|
3190 @property
|
jpayne@69
|
3191 def ref(self):
|
jpayne@69
|
3192 """reference allele"""
|
jpayne@69
|
3193 cdef bcf1_t *r = self.ptr
|
jpayne@69
|
3194 if bcf_unpack(r, BCF_UN_STR) < 0:
|
jpayne@69
|
3195 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3196 return charptr_to_str(r.d.allele[0]) if r.d.allele else None
|
jpayne@69
|
3197
|
jpayne@69
|
3198 @ref.setter
|
jpayne@69
|
3199 def ref(self, value):
|
jpayne@69
|
3200 cdef bcf1_t *r = self.ptr
|
jpayne@69
|
3201 if bcf_unpack(r, BCF_UN_STR) < 0:
|
jpayne@69
|
3202 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3203 #FIXME: Set alleles directly -- this is stupid
|
jpayne@69
|
3204 if not value:
|
jpayne@69
|
3205 raise ValueError('ref allele must not be null')
|
jpayne@69
|
3206 value = force_bytes(value)
|
jpayne@69
|
3207 if r.d.allele and r.n_allele:
|
jpayne@69
|
3208 alleles = [r.d.allele[i] for i in range(r.n_allele)]
|
jpayne@69
|
3209 alleles[0] = value
|
jpayne@69
|
3210 else:
|
jpayne@69
|
3211 alleles = [value, '<NON_REF>']
|
jpayne@69
|
3212 self.alleles = alleles
|
jpayne@69
|
3213 bcf_sync_end(self)
|
jpayne@69
|
3214
|
jpayne@69
|
3215 @property
|
jpayne@69
|
3216 def alleles(self):
|
jpayne@69
|
3217 """tuple of reference allele followed by alt alleles"""
|
jpayne@69
|
3218 cdef bcf1_t *r = self.ptr
|
jpayne@69
|
3219 if bcf_unpack(r, BCF_UN_STR) < 0:
|
jpayne@69
|
3220 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3221 if not r.d.allele:
|
jpayne@69
|
3222 return None
|
jpayne@69
|
3223 cdef tuple res = PyTuple_New(r.n_allele)
|
jpayne@69
|
3224 for i in range(r.n_allele):
|
jpayne@69
|
3225 a = charptr_to_str(r.d.allele[i])
|
jpayne@69
|
3226 PyTuple_SET_ITEM(res, i, a)
|
jpayne@69
|
3227 Py_INCREF(a)
|
jpayne@69
|
3228 return res
|
jpayne@69
|
3229
|
jpayne@69
|
3230 @alleles.setter
|
jpayne@69
|
3231 def alleles(self, values):
|
jpayne@69
|
3232 cdef bcf1_t *r = self.ptr
|
jpayne@69
|
3233
|
jpayne@69
|
3234 # Cache rlen of symbolic alleles before call to bcf_update_alleles_str
|
jpayne@69
|
3235 cdef int rlen = r.rlen
|
jpayne@69
|
3236
|
jpayne@69
|
3237 if bcf_unpack(r, BCF_UN_STR) < 0:
|
jpayne@69
|
3238 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3239
|
jpayne@69
|
3240 values = [force_bytes(v) for v in values]
|
jpayne@69
|
3241
|
jpayne@69
|
3242 if len(values) < 2:
|
jpayne@69
|
3243 raise ValueError('must set at least 2 alleles')
|
jpayne@69
|
3244
|
jpayne@69
|
3245 if b'' in values:
|
jpayne@69
|
3246 raise ValueError('cannot set null allele')
|
jpayne@69
|
3247
|
jpayne@69
|
3248 value = b','.join(values)
|
jpayne@69
|
3249
|
jpayne@69
|
3250 if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
|
jpayne@69
|
3251 raise ValueError('Error updating alleles')
|
jpayne@69
|
3252
|
jpayne@69
|
3253 # Reset rlen if alternate allele isn't symbolic, otherwise used cached
|
jpayne@69
|
3254 if has_symbolic_allele(self):
|
jpayne@69
|
3255 self.ptr.rlen = rlen
|
jpayne@69
|
3256 else:
|
jpayne@69
|
3257 self.ptr.rlen = len(values[0])
|
jpayne@69
|
3258 r.d.var_type = -1
|
jpayne@69
|
3259 bcf_sync_end(self)
|
jpayne@69
|
3260
|
jpayne@69
|
3261 @property
|
jpayne@69
|
3262 def alts(self):
|
jpayne@69
|
3263 """tuple of alt alleles"""
|
jpayne@69
|
3264 cdef bcf1_t *r = self.ptr
|
jpayne@69
|
3265 if bcf_unpack(r, BCF_UN_STR) < 0:
|
jpayne@69
|
3266 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3267 if r.n_allele < 2 or not r.d.allele:
|
jpayne@69
|
3268 return None
|
jpayne@69
|
3269 cdef tuple res = PyTuple_New(r.n_allele - 1)
|
jpayne@69
|
3270 for i in range(1, r.n_allele):
|
jpayne@69
|
3271 a = charptr_to_str(r.d.allele[i])
|
jpayne@69
|
3272 PyTuple_SET_ITEM(res, i - 1, a)
|
jpayne@69
|
3273 Py_INCREF(a)
|
jpayne@69
|
3274 return res
|
jpayne@69
|
3275
|
jpayne@69
|
3276 @alts.setter
|
jpayne@69
|
3277 def alts(self, value):
|
jpayne@69
|
3278 #FIXME: Set alleles directly -- this is stupid
|
jpayne@69
|
3279 cdef bcf1_t *r = self.ptr
|
jpayne@69
|
3280 if bcf_unpack(r, BCF_UN_STR) < 0:
|
jpayne@69
|
3281 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3282 value = [force_bytes(v) for v in value]
|
jpayne@69
|
3283 if b'' in value:
|
jpayne@69
|
3284 raise ValueError('cannot set null alt allele')
|
jpayne@69
|
3285 ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
|
jpayne@69
|
3286 self.alleles = ref + value
|
jpayne@69
|
3287 r.d.var_type = -1
|
jpayne@69
|
3288
|
jpayne@69
|
3289 @property
|
jpayne@69
|
3290 def filter(self):
|
jpayne@69
|
3291 """filter information (see :class:`VariantRecordFilter`)"""
|
jpayne@69
|
3292 if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
|
jpayne@69
|
3293 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3294 return makeVariantRecordFilter(self)
|
jpayne@69
|
3295
|
jpayne@69
|
3296 @property
|
jpayne@69
|
3297 def info(self):
|
jpayne@69
|
3298 """info data (see :class:`VariantRecordInfo`)"""
|
jpayne@69
|
3299 if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
|
jpayne@69
|
3300 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3301 return makeVariantRecordInfo(self)
|
jpayne@69
|
3302
|
jpayne@69
|
3303 @property
|
jpayne@69
|
3304 def format(self):
|
jpayne@69
|
3305 """sample format metadata (see :class:`VariantRecordFormat`)"""
|
jpayne@69
|
3306 if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
|
jpayne@69
|
3307 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3308 return makeVariantRecordFormat(self)
|
jpayne@69
|
3309
|
jpayne@69
|
3310 @property
|
jpayne@69
|
3311 def samples(self):
|
jpayne@69
|
3312 """sample data (see :class:`VariantRecordSamples`)"""
|
jpayne@69
|
3313 if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
|
jpayne@69
|
3314 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3315 return makeVariantRecordSamples(self)
|
jpayne@69
|
3316
|
jpayne@69
|
3317 property alleles_variant_types:
|
jpayne@69
|
3318 def __get__(self):
|
jpayne@69
|
3319 cdef bcf1_t *r = self.ptr
|
jpayne@69
|
3320 cdef tuple result = PyTuple_New(r.n_allele)
|
jpayne@69
|
3321
|
jpayne@69
|
3322 for i in range(r.n_allele):
|
jpayne@69
|
3323 tp = bcf_get_variant_type(r, i)
|
jpayne@69
|
3324
|
jpayne@69
|
3325 if tp == VCF_REF:
|
jpayne@69
|
3326 v_type = "REF"
|
jpayne@69
|
3327 elif tp == VCF_SNP:
|
jpayne@69
|
3328 v_type = "SNP"
|
jpayne@69
|
3329 elif tp == VCF_MNP:
|
jpayne@69
|
3330 v_type = "MNP"
|
jpayne@69
|
3331 elif tp == VCF_INDEL:
|
jpayne@69
|
3332 v_type = "INDEL"
|
jpayne@69
|
3333 elif tp == VCF_BND:
|
jpayne@69
|
3334 v_type = "BND"
|
jpayne@69
|
3335 elif tp == VCF_OVERLAP:
|
jpayne@69
|
3336 v_type = "OVERLAP"
|
jpayne@69
|
3337 else:
|
jpayne@69
|
3338 v_type = "OTHER"
|
jpayne@69
|
3339
|
jpayne@69
|
3340 PyTuple_SET_ITEM(result, i, v_type)
|
jpayne@69
|
3341 Py_INCREF(v_type)
|
jpayne@69
|
3342
|
jpayne@69
|
3343 return result
|
jpayne@69
|
3344
|
jpayne@69
|
3345 def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op):
|
jpayne@69
|
3346 if op != 2 and op != 3:
|
jpayne@69
|
3347 return NotImplemented
|
jpayne@69
|
3348
|
jpayne@69
|
3349 cdef bcf1_t *s = self.ptr
|
jpayne@69
|
3350 cdef bcf1_t *o = other.ptr
|
jpayne@69
|
3351
|
jpayne@69
|
3352 cdef bint cmp = self is other or (
|
jpayne@69
|
3353 s.pos == o.pos
|
jpayne@69
|
3354 and s.rlen == o.rlen
|
jpayne@69
|
3355 and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual))
|
jpayne@69
|
3356 or s.qual == o.qual)
|
jpayne@69
|
3357 and s.n_sample == o.n_sample
|
jpayne@69
|
3358 and s.n_allele == o.n_allele
|
jpayne@69
|
3359 and self.contig == other.contig
|
jpayne@69
|
3360 and self.alleles == other.alleles
|
jpayne@69
|
3361 and self.id == other.id
|
jpayne@69
|
3362 and self.info == other.info
|
jpayne@69
|
3363 and self.filter == other.filter
|
jpayne@69
|
3364 and self.samples == other.samples)
|
jpayne@69
|
3365
|
jpayne@69
|
3366 if op == 3:
|
jpayne@69
|
3367 cmp = not cmp
|
jpayne@69
|
3368
|
jpayne@69
|
3369 return cmp
|
jpayne@69
|
3370
|
jpayne@69
|
3371 def __str__(self):
|
jpayne@69
|
3372 cdef kstring_t line
|
jpayne@69
|
3373 cdef char c
|
jpayne@69
|
3374
|
jpayne@69
|
3375 line.l = line.m = 0
|
jpayne@69
|
3376 line.s = NULL
|
jpayne@69
|
3377
|
jpayne@69
|
3378 if vcf_format(self.header.ptr, self.ptr, &line) < 0:
|
jpayne@69
|
3379 if line.m:
|
jpayne@69
|
3380 free(line.s)
|
jpayne@69
|
3381 raise ValueError('vcf_format failed')
|
jpayne@69
|
3382
|
jpayne@69
|
3383 # Strip CR/LF?
|
jpayne@69
|
3384 #while line.l:
|
jpayne@69
|
3385 # c = line.s[line.l - 1]
|
jpayne@69
|
3386 # if c != b'\n' and c != b'\r':
|
jpayne@69
|
3387 # break
|
jpayne@69
|
3388 # line.l -= 1
|
jpayne@69
|
3389
|
jpayne@69
|
3390 ret = charptr_to_str_w_len(line.s, line.l)
|
jpayne@69
|
3391
|
jpayne@69
|
3392 if line.m:
|
jpayne@69
|
3393 free(line.s)
|
jpayne@69
|
3394
|
jpayne@69
|
3395 return ret
|
jpayne@69
|
3396
|
jpayne@69
|
3397
|
jpayne@69
|
3398 cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
|
jpayne@69
|
3399 if not header:
|
jpayne@69
|
3400 raise ValueError('invalid VariantHeader')
|
jpayne@69
|
3401
|
jpayne@69
|
3402 if not r:
|
jpayne@69
|
3403 raise ValueError('cannot create VariantRecord')
|
jpayne@69
|
3404
|
jpayne@69
|
3405 if r.errcode:
|
jpayne@69
|
3406 msg = []
|
jpayne@69
|
3407 #if r.errcode & BCF_ERR_CTG_UNDEF:
|
jpayne@69
|
3408 # msg.append('undefined contig')
|
jpayne@69
|
3409 #if r.errcode & BCF_ERR_TAG_UNDEF:
|
jpayne@69
|
3410 # msg.append('undefined tag')
|
jpayne@69
|
3411 if r.errcode & BCF_ERR_NCOLS:
|
jpayne@69
|
3412 msg.append('invalid number of columns')
|
jpayne@69
|
3413 if r.errcode & BCF_ERR_LIMITS:
|
jpayne@69
|
3414 msg.append('limits violated')
|
jpayne@69
|
3415 if r.errcode & BCF_ERR_CHAR:
|
jpayne@69
|
3416 msg.append('invalid character found')
|
jpayne@69
|
3417 if r.errcode & BCF_ERR_CTG_INVALID:
|
jpayne@69
|
3418 msg.append('invalid contig')
|
jpayne@69
|
3419 if r.errcode & BCF_ERR_TAG_INVALID:
|
jpayne@69
|
3420 msg.append('invalid tag')
|
jpayne@69
|
3421
|
jpayne@69
|
3422 if msg:
|
jpayne@69
|
3423 msg = ', '.join(msg)
|
jpayne@69
|
3424 raise ValueError('Error(s) reading record: {}'.format(msg))
|
jpayne@69
|
3425
|
jpayne@69
|
3426 cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
|
jpayne@69
|
3427 record.header = header
|
jpayne@69
|
3428 record.ptr = r
|
jpayne@69
|
3429
|
jpayne@69
|
3430 return record
|
jpayne@69
|
3431
|
jpayne@69
|
3432
|
jpayne@69
|
3433 ########################################################################
|
jpayne@69
|
3434 ########################################################################
|
jpayne@69
|
3435 ## Variant Sampletype object
|
jpayne@69
|
3436 ########################################################################
|
jpayne@69
|
3437
|
jpayne@69
|
3438
|
jpayne@69
|
3439 cdef class VariantRecordSample(object):
|
jpayne@69
|
3440 """Data for a single sample from a :class:`VariantRecord` object.
|
jpayne@69
|
3441 Provides data accessors for genotypes and a mapping interface
|
jpayne@69
|
3442 from format name to values.
|
jpayne@69
|
3443 """
|
jpayne@69
|
3444 def __init__(self, *args, **kwargs):
|
jpayne@69
|
3445 raise TypeError('this class cannot be instantiated from Python')
|
jpayne@69
|
3446
|
jpayne@69
|
3447 @property
|
jpayne@69
|
3448 def name(self):
|
jpayne@69
|
3449 """sample name"""
|
jpayne@69
|
3450 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
3451 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
3452 cdef int32_t n = r.n_sample
|
jpayne@69
|
3453
|
jpayne@69
|
3454 if self.index < 0 or self.index >= n:
|
jpayne@69
|
3455 raise ValueError('invalid sample index')
|
jpayne@69
|
3456
|
jpayne@69
|
3457 return charptr_to_str(hdr.samples[self.index])
|
jpayne@69
|
3458
|
jpayne@69
|
3459 @property
|
jpayne@69
|
3460 def allele_indices(self):
|
jpayne@69
|
3461 """allele indices for called genotype, if present. Otherwise None"""
|
jpayne@69
|
3462 return bcf_format_get_allele_indices(self)
|
jpayne@69
|
3463
|
jpayne@69
|
3464 @allele_indices.setter
|
jpayne@69
|
3465 def allele_indices(self, value):
|
jpayne@69
|
3466 self['GT'] = value
|
jpayne@69
|
3467
|
jpayne@69
|
3468 @allele_indices.deleter
|
jpayne@69
|
3469 def allele_indices(self):
|
jpayne@69
|
3470 self['GT'] = ()
|
jpayne@69
|
3471
|
jpayne@69
|
3472 @property
|
jpayne@69
|
3473 def alleles(self):
|
jpayne@69
|
3474 """alleles for called genotype, if present. Otherwise None"""
|
jpayne@69
|
3475 return bcf_format_get_alleles(self)
|
jpayne@69
|
3476
|
jpayne@69
|
3477 @alleles.setter
|
jpayne@69
|
3478 def alleles(self, value):
|
jpayne@69
|
3479 # Sets the genotype, supply a tuple of alleles to set.
|
jpayne@69
|
3480 # The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord
|
jpayne@69
|
3481 # The genotype is reset when an empty tuple, None or (None,) is supplied
|
jpayne@69
|
3482
|
jpayne@69
|
3483 if value==(None,) or value==tuple() or value is None:
|
jpayne@69
|
3484 self['GT'] = ()
|
jpayne@69
|
3485 return
|
jpayne@69
|
3486
|
jpayne@69
|
3487 if any((type(x) == int for x in value)):
|
jpayne@69
|
3488 raise ValueError('Use .allele_indices to set integer allele indices')
|
jpayne@69
|
3489
|
jpayne@69
|
3490 # determine and set allele indices:
|
jpayne@69
|
3491 try:
|
jpayne@69
|
3492 self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) )
|
jpayne@69
|
3493 except ValueError:
|
jpayne@69
|
3494 raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord."
|
jpayne@69
|
3495 "First set the .alleles of this record to define the alleles")
|
jpayne@69
|
3496
|
jpayne@69
|
3497 @alleles.deleter
|
jpayne@69
|
3498 def alleles(self):
|
jpayne@69
|
3499 self['GT'] = ()
|
jpayne@69
|
3500
|
jpayne@69
|
3501 @property
|
jpayne@69
|
3502 def phased(self):
|
jpayne@69
|
3503 """False if genotype is missing or any allele is unphased. Otherwise True."""
|
jpayne@69
|
3504 return bcf_sample_get_phased(self)
|
jpayne@69
|
3505
|
jpayne@69
|
3506 @phased.setter
|
jpayne@69
|
3507 def phased(self, value):
|
jpayne@69
|
3508 bcf_sample_set_phased(self, value)
|
jpayne@69
|
3509
|
jpayne@69
|
3510 def __len__(self):
|
jpayne@69
|
3511 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
3512 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
3513 cdef int i, n = 0
|
jpayne@69
|
3514
|
jpayne@69
|
3515 if bcf_unpack(r, BCF_UN_FMT) < 0:
|
jpayne@69
|
3516 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3517
|
jpayne@69
|
3518 for i in range(r.n_fmt):
|
jpayne@69
|
3519 if r.d.fmt[i].p:
|
jpayne@69
|
3520 n += 1
|
jpayne@69
|
3521 return n
|
jpayne@69
|
3522
|
jpayne@69
|
3523 def __bool__(self):
|
jpayne@69
|
3524 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
3525 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
3526 cdef int i
|
jpayne@69
|
3527
|
jpayne@69
|
3528 if bcf_unpack(r, BCF_UN_FMT) < 0:
|
jpayne@69
|
3529 raise ValueError('Error unpacking VariantRecord')
|
jpayne@69
|
3530
|
jpayne@69
|
3531 for i in range(r.n_fmt):
|
jpayne@69
|
3532 if r.d.fmt[i].p:
|
jpayne@69
|
3533 return True
|
jpayne@69
|
3534 return False
|
jpayne@69
|
3535
|
jpayne@69
|
3536 def __getitem__(self, key):
|
jpayne@69
|
3537 return bcf_format_get_value(self, key)
|
jpayne@69
|
3538
|
jpayne@69
|
3539 def __setitem__(self, key, value):
|
jpayne@69
|
3540 bcf_format_set_value(self, key, value)
|
jpayne@69
|
3541
|
jpayne@69
|
3542 def __delitem__(self, key):
|
jpayne@69
|
3543 bcf_format_del_value(self, key)
|
jpayne@69
|
3544
|
jpayne@69
|
3545 def clear(self):
|
jpayne@69
|
3546 """Clear all format data (including genotype) for this sample"""
|
jpayne@69
|
3547 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
3548 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
3549 cdef bcf_fmt_t *fmt
|
jpayne@69
|
3550 cdef int i
|
jpayne@69
|
3551
|
jpayne@69
|
3552 for i in range(r.n_fmt):
|
jpayne@69
|
3553 fmt = &r.d.fmt[i]
|
jpayne@69
|
3554 if fmt.p:
|
jpayne@69
|
3555 bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
|
jpayne@69
|
3556
|
jpayne@69
|
3557 def __iter__(self):
|
jpayne@69
|
3558 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
3559 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
3560 cdef bcf_fmt_t *fmt
|
jpayne@69
|
3561 cdef int i
|
jpayne@69
|
3562
|
jpayne@69
|
3563 for i in range(r.n_fmt):
|
jpayne@69
|
3564 fmt = &r.d.fmt[i]
|
jpayne@69
|
3565 if r.d.fmt[i].p:
|
jpayne@69
|
3566 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
|
jpayne@69
|
3567
|
jpayne@69
|
3568 def get(self, key, default=None):
|
jpayne@69
|
3569 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
|
jpayne@69
|
3570 try:
|
jpayne@69
|
3571 return self[key]
|
jpayne@69
|
3572 except KeyError:
|
jpayne@69
|
3573 return default
|
jpayne@69
|
3574
|
jpayne@69
|
3575 def __contains__(self, key):
|
jpayne@69
|
3576 cdef bcf_hdr_t *hdr = self.record.header.ptr
|
jpayne@69
|
3577 cdef bcf1_t *r = self.record.ptr
|
jpayne@69
|
3578 cdef bytes bkey = force_bytes(key)
|
jpayne@69
|
3579 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
|
jpayne@69
|
3580 return fmt != NULL and fmt.p != NULL
|
jpayne@69
|
3581
|
jpayne@69
|
3582 def iterkeys(self):
|
jpayne@69
|
3583 """D.iterkeys() -> an iterator over the keys of D"""
|
jpayne@69
|
3584 return iter(self)
|
jpayne@69
|
3585
|
jpayne@69
|
3586 def itervalues(self):
|
jpayne@69
|
3587 """D.itervalues() -> an iterator over the values of D"""
|
jpayne@69
|
3588 for key in self:
|
jpayne@69
|
3589 yield self[key]
|
jpayne@69
|
3590
|
jpayne@69
|
3591 def iteritems(self):
|
jpayne@69
|
3592 """D.iteritems() -> an iterator over the (key, value) items of D"""
|
jpayne@69
|
3593 for key in self:
|
jpayne@69
|
3594 yield (key, self[key])
|
jpayne@69
|
3595
|
jpayne@69
|
3596 def keys(self):
|
jpayne@69
|
3597 """D.keys() -> list of D's keys"""
|
jpayne@69
|
3598 return list(self)
|
jpayne@69
|
3599
|
jpayne@69
|
3600 def items(self):
|
jpayne@69
|
3601 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
|
jpayne@69
|
3602 return list(self.iteritems())
|
jpayne@69
|
3603
|
jpayne@69
|
3604 def values(self):
|
jpayne@69
|
3605 """D.values() -> list of D's values"""
|
jpayne@69
|
3606 return list(self.itervalues())
|
jpayne@69
|
3607
|
jpayne@69
|
3608 def update(self, items=None, **kwargs):
|
jpayne@69
|
3609 """D.update([E, ]**F) -> None.
|
jpayne@69
|
3610
|
jpayne@69
|
3611 Update D from dict/iterable E and F.
|
jpayne@69
|
3612 """
|
jpayne@69
|
3613 for k, v in items.items():
|
jpayne@69
|
3614 self[k] = v
|
jpayne@69
|
3615
|
jpayne@69
|
3616 if kwargs:
|
jpayne@69
|
3617 for k, v in kwargs.items():
|
jpayne@69
|
3618 self[k] = v
|
jpayne@69
|
3619
|
jpayne@69
|
3620 def pop(self, key, default=_nothing):
|
jpayne@69
|
3621 try:
|
jpayne@69
|
3622 value = self[key]
|
jpayne@69
|
3623 del self[key]
|
jpayne@69
|
3624 return value
|
jpayne@69
|
3625 except KeyError:
|
jpayne@69
|
3626 if default is not _nothing:
|
jpayne@69
|
3627 return default
|
jpayne@69
|
3628 raise
|
jpayne@69
|
3629
|
jpayne@69
|
3630 def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op):
|
jpayne@69
|
3631 if op != 2 and op != 3:
|
jpayne@69
|
3632 return NotImplemented
|
jpayne@69
|
3633
|
jpayne@69
|
3634 cdef bint cmp = dict(self) == dict(other)
|
jpayne@69
|
3635
|
jpayne@69
|
3636 if op == 3:
|
jpayne@69
|
3637 cmp = not cmp
|
jpayne@69
|
3638
|
jpayne@69
|
3639 return cmp
|
jpayne@69
|
3640
|
jpayne@69
|
3641 # Mappings are not hashable by default, but subclasses can change this
|
jpayne@69
|
3642 __hash__ = None
|
jpayne@69
|
3643
|
jpayne@69
|
3644
|
jpayne@69
|
3645 cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
|
jpayne@69
|
3646 if not record or sample_index < 0:
|
jpayne@69
|
3647 raise ValueError('cannot create VariantRecordSample')
|
jpayne@69
|
3648
|
jpayne@69
|
3649 cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
|
jpayne@69
|
3650 sample.record = record
|
jpayne@69
|
3651 sample.index = sample_index
|
jpayne@69
|
3652
|
jpayne@69
|
3653 return sample
|
jpayne@69
|
3654
|
jpayne@69
|
3655
|
jpayne@69
|
3656 ########################################################################
|
jpayne@69
|
3657 ########################################################################
|
jpayne@69
|
3658 ## Index objects
|
jpayne@69
|
3659 ########################################################################
|
jpayne@69
|
3660
|
jpayne@69
|
3661
|
jpayne@69
|
3662 cdef class BaseIndex(object):
|
jpayne@69
|
3663 def __init__(self):
|
jpayne@69
|
3664 self.refs = ()
|
jpayne@69
|
3665 self.remap = {}
|
jpayne@69
|
3666
|
jpayne@69
|
3667 def __len__(self):
|
jpayne@69
|
3668 return len(self.refs)
|
jpayne@69
|
3669
|
jpayne@69
|
3670 def __bool__(self):
|
jpayne@69
|
3671 return len(self.refs) != 0
|
jpayne@69
|
3672
|
jpayne@69
|
3673 def __getitem__(self, key):
|
jpayne@69
|
3674 if isinstance(key, int):
|
jpayne@69
|
3675 return self.refs[key]
|
jpayne@69
|
3676 else:
|
jpayne@69
|
3677 return self.refmap[key]
|
jpayne@69
|
3678
|
jpayne@69
|
3679 def __iter__(self):
|
jpayne@69
|
3680 return iter(self.refs)
|
jpayne@69
|
3681
|
jpayne@69
|
3682 def get(self, key, default=None):
|
jpayne@69
|
3683 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
|
jpayne@69
|
3684 try:
|
jpayne@69
|
3685 return self[key]
|
jpayne@69
|
3686 except KeyError:
|
jpayne@69
|
3687 return default
|
jpayne@69
|
3688
|
jpayne@69
|
3689 def __contains__(self, key):
|
jpayne@69
|
3690 try:
|
jpayne@69
|
3691 self[key]
|
jpayne@69
|
3692 except KeyError:
|
jpayne@69
|
3693 return False
|
jpayne@69
|
3694 else:
|
jpayne@69
|
3695 return True
|
jpayne@69
|
3696
|
jpayne@69
|
3697 def iterkeys(self):
|
jpayne@69
|
3698 """D.iterkeys() -> an iterator over the keys of D"""
|
jpayne@69
|
3699 return iter(self)
|
jpayne@69
|
3700
|
jpayne@69
|
3701 def itervalues(self):
|
jpayne@69
|
3702 """D.itervalues() -> an iterator over the values of D"""
|
jpayne@69
|
3703 for key in self:
|
jpayne@69
|
3704 yield self[key]
|
jpayne@69
|
3705
|
jpayne@69
|
3706 def iteritems(self):
|
jpayne@69
|
3707 """D.iteritems() -> an iterator over the (key, value) items of D"""
|
jpayne@69
|
3708 for key in self:
|
jpayne@69
|
3709 yield (key, self[key])
|
jpayne@69
|
3710
|
jpayne@69
|
3711 def keys(self):
|
jpayne@69
|
3712 """D.keys() -> list of D's keys"""
|
jpayne@69
|
3713 return list(self)
|
jpayne@69
|
3714
|
jpayne@69
|
3715 def items(self):
|
jpayne@69
|
3716 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
|
jpayne@69
|
3717 return list(self.iteritems())
|
jpayne@69
|
3718
|
jpayne@69
|
3719 def values(self):
|
jpayne@69
|
3720 """D.values() -> list of D's values"""
|
jpayne@69
|
3721 return list(self.itervalues())
|
jpayne@69
|
3722
|
jpayne@69
|
3723 def update(self, items=None, **kwargs):
|
jpayne@69
|
3724 """D.update([E, ]**F) -> None.
|
jpayne@69
|
3725
|
jpayne@69
|
3726 Update D from dict/iterable E and F.
|
jpayne@69
|
3727 """
|
jpayne@69
|
3728 for k, v in items.items():
|
jpayne@69
|
3729 self[k] = v
|
jpayne@69
|
3730
|
jpayne@69
|
3731 if kwargs:
|
jpayne@69
|
3732 for k, v in kwargs.items():
|
jpayne@69
|
3733 self[k] = v
|
jpayne@69
|
3734
|
jpayne@69
|
3735 def pop(self, key, default=_nothing):
|
jpayne@69
|
3736 try:
|
jpayne@69
|
3737 value = self[key]
|
jpayne@69
|
3738 del self[key]
|
jpayne@69
|
3739 return value
|
jpayne@69
|
3740 except KeyError:
|
jpayne@69
|
3741 if default is not _nothing:
|
jpayne@69
|
3742 return default
|
jpayne@69
|
3743 raise
|
jpayne@69
|
3744
|
jpayne@69
|
3745 # Mappings are not hashable by default, but subclasses can change this
|
jpayne@69
|
3746 __hash__ = None
|
jpayne@69
|
3747
|
jpayne@69
|
3748 #TODO: implement __richcmp__
|
jpayne@69
|
3749
|
jpayne@69
|
3750
|
jpayne@69
|
3751 cdef class BCFIndex(object):
|
jpayne@69
|
3752 """CSI index data structure for BCF files"""
|
jpayne@69
|
3753 def __init__(self):
|
jpayne@69
|
3754 self.refs = ()
|
jpayne@69
|
3755 self.refmap = {}
|
jpayne@69
|
3756
|
jpayne@69
|
3757 if not self.ptr:
|
jpayne@69
|
3758 raise ValueError('Invalid index object')
|
jpayne@69
|
3759
|
jpayne@69
|
3760 cdef int n
|
jpayne@69
|
3761 cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
|
jpayne@69
|
3762
|
jpayne@69
|
3763 self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
|
jpayne@69
|
3764 self.refmap = { r:i for i,r in enumerate(self.refs) }
|
jpayne@69
|
3765
|
jpayne@69
|
3766 def __dealloc__(self):
|
jpayne@69
|
3767 if self.ptr:
|
jpayne@69
|
3768 hts_idx_destroy(self.ptr)
|
jpayne@69
|
3769 self.ptr = NULL
|
jpayne@69
|
3770
|
jpayne@69
|
3771 def fetch(self, bcf, contig, start, stop, reopen):
|
jpayne@69
|
3772 return BCFIterator(bcf, contig, start, stop, reopen)
|
jpayne@69
|
3773
|
jpayne@69
|
3774
|
jpayne@69
|
3775 cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
|
jpayne@69
|
3776 if not idx:
|
jpayne@69
|
3777 return None
|
jpayne@69
|
3778
|
jpayne@69
|
3779 if not header:
|
jpayne@69
|
3780 raise ValueError('invalid VariantHeader')
|
jpayne@69
|
3781
|
jpayne@69
|
3782 cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
|
jpayne@69
|
3783 index.header = header
|
jpayne@69
|
3784 index.ptr = idx
|
jpayne@69
|
3785 index.__init__()
|
jpayne@69
|
3786
|
jpayne@69
|
3787 return index
|
jpayne@69
|
3788
|
jpayne@69
|
3789
|
jpayne@69
|
3790 cdef class TabixIndex(BaseIndex):
|
jpayne@69
|
3791 """Tabix index data structure for VCF files"""
|
jpayne@69
|
3792 def __init__(self):
|
jpayne@69
|
3793 self.refs = ()
|
jpayne@69
|
3794 self.refmap = {}
|
jpayne@69
|
3795
|
jpayne@69
|
3796 if not self.ptr:
|
jpayne@69
|
3797 raise ValueError('Invalid index object')
|
jpayne@69
|
3798
|
jpayne@69
|
3799 cdef int n
|
jpayne@69
|
3800 cdef const char **refs = tbx_seqnames(self.ptr, &n)
|
jpayne@69
|
3801
|
jpayne@69
|
3802 self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
|
jpayne@69
|
3803 self.refmap = { r:i for i,r in enumerate(self.refs) }
|
jpayne@69
|
3804
|
jpayne@69
|
3805 def __dealloc__(self):
|
jpayne@69
|
3806 if self.ptr:
|
jpayne@69
|
3807 tbx_destroy(self.ptr)
|
jpayne@69
|
3808 self.ptr = NULL
|
jpayne@69
|
3809
|
jpayne@69
|
3810 def fetch(self, bcf, contig, start, stop, reopen):
|
jpayne@69
|
3811 return TabixIterator(bcf, contig, start, stop, reopen)
|
jpayne@69
|
3812
|
jpayne@69
|
3813
|
jpayne@69
|
3814 cdef TabixIndex makeTabixIndex(tbx_t *idx):
|
jpayne@69
|
3815 if not idx:
|
jpayne@69
|
3816 return None
|
jpayne@69
|
3817
|
jpayne@69
|
3818 cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
|
jpayne@69
|
3819 index.ptr = idx
|
jpayne@69
|
3820 index.__init__()
|
jpayne@69
|
3821
|
jpayne@69
|
3822 return index
|
jpayne@69
|
3823
|
jpayne@69
|
3824
|
jpayne@69
|
3825 ########################################################################
|
jpayne@69
|
3826 ########################################################################
|
jpayne@69
|
3827 ## Iterators
|
jpayne@69
|
3828 ########################################################################
|
jpayne@69
|
3829
|
jpayne@69
|
3830
|
jpayne@69
|
3831 cdef class BaseIterator(object):
|
jpayne@69
|
3832 pass
|
jpayne@69
|
3833
|
jpayne@69
|
3834
|
jpayne@69
|
3835 # Internal function to clean up after iteration stop or failure.
|
jpayne@69
|
3836 # This would be a nested function if it weren't a cdef function.
|
jpayne@69
|
3837 cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
|
jpayne@69
|
3838 bcf_destroy1(record)
|
jpayne@69
|
3839
|
jpayne@69
|
3840 # destroy iter so future calls to __next__ raise StopIteration
|
jpayne@69
|
3841 bcf_itr_destroy(self.iter)
|
jpayne@69
|
3842 self.iter = NULL
|
jpayne@69
|
3843
|
jpayne@69
|
3844
|
jpayne@69
|
3845 cdef class BCFIterator(BaseIterator):
|
jpayne@69
|
3846 def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
|
jpayne@69
|
3847 if bcf is None:
|
jpayne@69
|
3848 raise ValueError('bcf must not be None')
|
jpayne@69
|
3849
|
jpayne@69
|
3850 if contig is None:
|
jpayne@69
|
3851 raise ValueError('contig must be specified')
|
jpayne@69
|
3852
|
jpayne@69
|
3853 if not isinstance(bcf.index, BCFIndex):
|
jpayne@69
|
3854 raise ValueError('bcf index required')
|
jpayne@69
|
3855
|
jpayne@69
|
3856 cdef BCFIndex index = bcf.index
|
jpayne@69
|
3857
|
jpayne@69
|
3858 self.bcf = bcf
|
jpayne@69
|
3859 self.index = index
|
jpayne@69
|
3860
|
jpayne@69
|
3861 cdef int rid, cstart, cstop
|
jpayne@69
|
3862
|
jpayne@69
|
3863 try:
|
jpayne@69
|
3864 rid = index.refmap[contig]
|
jpayne@69
|
3865 except KeyError:
|
jpayne@69
|
3866 # A query for a non-existent contig yields an empty iterator, does not raise an error
|
jpayne@69
|
3867 self.iter = NULL
|
jpayne@69
|
3868 return
|
jpayne@69
|
3869
|
jpayne@69
|
3870 if reopen:
|
jpayne@69
|
3871 self.bcf = self.bcf.copy()
|
jpayne@69
|
3872
|
jpayne@69
|
3873 cstart = start if start is not None else 0
|
jpayne@69
|
3874 cstop = stop if stop is not None else MAX_POS
|
jpayne@69
|
3875
|
jpayne@69
|
3876 with nogil:
|
jpayne@69
|
3877 self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
|
jpayne@69
|
3878
|
jpayne@69
|
3879 if not self.iter:
|
jpayne@69
|
3880 if errno:
|
jpayne@69
|
3881 raise IOError(errno, strerror(errno))
|
jpayne@69
|
3882 else:
|
jpayne@69
|
3883 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
|
jpayne@69
|
3884
|
jpayne@69
|
3885 def __dealloc__(self):
|
jpayne@69
|
3886 if self.iter:
|
jpayne@69
|
3887 bcf_itr_destroy(self.iter)
|
jpayne@69
|
3888 self.iter = NULL
|
jpayne@69
|
3889
|
jpayne@69
|
3890 def __iter__(self):
|
jpayne@69
|
3891 return self
|
jpayne@69
|
3892
|
jpayne@69
|
3893 def __next__(self):
|
jpayne@69
|
3894 if not self.iter:
|
jpayne@69
|
3895 raise StopIteration
|
jpayne@69
|
3896
|
jpayne@69
|
3897 cdef bcf1_t *record = bcf_init1()
|
jpayne@69
|
3898
|
jpayne@69
|
3899 if not record:
|
jpayne@69
|
3900 raise MemoryError('unable to allocate BCF record')
|
jpayne@69
|
3901
|
jpayne@69
|
3902 record.pos = -1
|
jpayne@69
|
3903 if self.bcf.drop_samples:
|
jpayne@69
|
3904 record.max_unpack = BCF_UN_SHR
|
jpayne@69
|
3905
|
jpayne@69
|
3906 cdef int ret
|
jpayne@69
|
3907
|
jpayne@69
|
3908 with nogil:
|
jpayne@69
|
3909 ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
|
jpayne@69
|
3910
|
jpayne@69
|
3911 if ret < 0:
|
jpayne@69
|
3912 _stop_BCFIterator(self, record)
|
jpayne@69
|
3913 if ret == -1:
|
jpayne@69
|
3914 raise StopIteration
|
jpayne@69
|
3915 elif ret == -2:
|
jpayne@69
|
3916 raise IOError('truncated file')
|
jpayne@69
|
3917 elif errno:
|
jpayne@69
|
3918 raise IOError(errno, strerror(errno))
|
jpayne@69
|
3919 else:
|
jpayne@69
|
3920 raise IOError('unable to fetch next record')
|
jpayne@69
|
3921
|
jpayne@69
|
3922 ret = bcf_subset_format(self.bcf.header.ptr, record)
|
jpayne@69
|
3923
|
jpayne@69
|
3924 if ret < 0:
|
jpayne@69
|
3925 _stop_BCFIterator(self, record)
|
jpayne@69
|
3926 raise ValueError('error in bcf_subset_format')
|
jpayne@69
|
3927
|
jpayne@69
|
3928 return makeVariantRecord(self.bcf.header, record)
|
jpayne@69
|
3929
|
jpayne@69
|
3930
|
jpayne@69
|
3931 cdef class TabixIterator(BaseIterator):
|
jpayne@69
|
3932 def __cinit__(self, *args, **kwargs):
|
jpayne@69
|
3933 self.line_buffer.l = 0
|
jpayne@69
|
3934 self.line_buffer.m = 0
|
jpayne@69
|
3935 self.line_buffer.s = NULL
|
jpayne@69
|
3936
|
jpayne@69
|
3937 def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
|
jpayne@69
|
3938 if bcf is None:
|
jpayne@69
|
3939 raise ValueError('bcf must not be None')
|
jpayne@69
|
3940
|
jpayne@69
|
3941 if not isinstance(bcf.index, TabixIndex):
|
jpayne@69
|
3942 raise ValueError('tabix index required')
|
jpayne@69
|
3943
|
jpayne@69
|
3944 cdef TabixIndex index = bcf.index
|
jpayne@69
|
3945
|
jpayne@69
|
3946 self.bcf = bcf
|
jpayne@69
|
3947 self.index = index
|
jpayne@69
|
3948
|
jpayne@69
|
3949 cdef int rid, cstart, cstop
|
jpayne@69
|
3950
|
jpayne@69
|
3951 try:
|
jpayne@69
|
3952 rid = index.refmap[contig]
|
jpayne@69
|
3953 except KeyError:
|
jpayne@69
|
3954 # A query for a non-existent contig yields an empty iterator, does not raise an error
|
jpayne@69
|
3955 self.iter = NULL
|
jpayne@69
|
3956 return
|
jpayne@69
|
3957
|
jpayne@69
|
3958 if reopen:
|
jpayne@69
|
3959 self.bcf = self.bcf.copy()
|
jpayne@69
|
3960
|
jpayne@69
|
3961 cstart = start if start is not None else 0
|
jpayne@69
|
3962 cstop = stop if stop is not None else MAX_POS
|
jpayne@69
|
3963
|
jpayne@69
|
3964 self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
|
jpayne@69
|
3965
|
jpayne@69
|
3966 if not self.iter:
|
jpayne@69
|
3967 if errno:
|
jpayne@69
|
3968 raise IOError(errno, strerror(errno))
|
jpayne@69
|
3969 else:
|
jpayne@69
|
3970 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
|
jpayne@69
|
3971
|
jpayne@69
|
3972 def __dealloc__(self):
|
jpayne@69
|
3973 if self.iter:
|
jpayne@69
|
3974 tbx_itr_destroy(self.iter)
|
jpayne@69
|
3975 self.iter = NULL
|
jpayne@69
|
3976
|
jpayne@69
|
3977 if self.line_buffer.m:
|
jpayne@69
|
3978 free(self.line_buffer.s)
|
jpayne@69
|
3979
|
jpayne@69
|
3980 self.line_buffer.l = 0
|
jpayne@69
|
3981 self.line_buffer.m = 0
|
jpayne@69
|
3982 self.line_buffer.s = NULL
|
jpayne@69
|
3983
|
jpayne@69
|
3984 def __iter__(self):
|
jpayne@69
|
3985 return self
|
jpayne@69
|
3986
|
jpayne@69
|
3987 def __next__(self):
|
jpayne@69
|
3988 if not self.iter:
|
jpayne@69
|
3989 raise StopIteration
|
jpayne@69
|
3990
|
jpayne@69
|
3991 cdef int ret
|
jpayne@69
|
3992
|
jpayne@69
|
3993 with nogil:
|
jpayne@69
|
3994 ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
|
jpayne@69
|
3995
|
jpayne@69
|
3996 if ret < 0:
|
jpayne@69
|
3997 tbx_itr_destroy(self.iter)
|
jpayne@69
|
3998 self.iter = NULL
|
jpayne@69
|
3999 if ret == -1:
|
jpayne@69
|
4000 raise StopIteration
|
jpayne@69
|
4001 elif ret == -2:
|
jpayne@69
|
4002 raise IOError('truncated file')
|
jpayne@69
|
4003 elif errno:
|
jpayne@69
|
4004 raise IOError(errno, strerror(errno))
|
jpayne@69
|
4005 else:
|
jpayne@69
|
4006 raise IOError('unable to fetch next record')
|
jpayne@69
|
4007
|
jpayne@69
|
4008 cdef bcf1_t *record = bcf_init1()
|
jpayne@69
|
4009
|
jpayne@69
|
4010 if not record:
|
jpayne@69
|
4011 raise MemoryError('unable to allocate BCF record')
|
jpayne@69
|
4012
|
jpayne@69
|
4013 record.pos = -1
|
jpayne@69
|
4014 if self.bcf.drop_samples:
|
jpayne@69
|
4015 record.max_unpack = BCF_UN_SHR
|
jpayne@69
|
4016
|
jpayne@69
|
4017 ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
|
jpayne@69
|
4018
|
jpayne@69
|
4019 # FIXME: stop iteration on parse failure?
|
jpayne@69
|
4020 if ret < 0:
|
jpayne@69
|
4021 bcf_destroy1(record)
|
jpayne@69
|
4022 raise ValueError('error in vcf_parse')
|
jpayne@69
|
4023
|
jpayne@69
|
4024 return makeVariantRecord(self.bcf.header, record)
|
jpayne@69
|
4025
|
jpayne@69
|
4026
|
jpayne@69
|
4027 ########################################################################
|
jpayne@69
|
4028 ########################################################################
|
jpayne@69
|
4029 ## Variant File
|
jpayne@69
|
4030 ########################################################################
|
jpayne@69
|
4031
|
jpayne@69
|
4032
|
jpayne@69
|
4033 cdef class VariantFile(HTSFile):
|
jpayne@69
|
4034 """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
|
jpayne@69
|
4035 duplicate_filehandle=True, ignore_truncation=False, threads=1)*
|
jpayne@69
|
4036
|
jpayne@69
|
4037 A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
|
jpayne@69
|
4038 opened.
|
jpayne@69
|
4039
|
jpayne@69
|
4040 If an index for a variant file exists (.csi or .tbi), it will be
|
jpayne@69
|
4041 opened automatically. Without an index random access to records
|
jpayne@69
|
4042 via :meth:`fetch` is disabled.
|
jpayne@69
|
4043
|
jpayne@69
|
4044 For writing, a :class:`VariantHeader` object must be provided,
|
jpayne@69
|
4045 typically obtained from another :term:`VCF` file/:term:`BCF`
|
jpayne@69
|
4046 file.
|
jpayne@69
|
4047
|
jpayne@69
|
4048 Parameters
|
jpayne@69
|
4049 ----------
|
jpayne@69
|
4050 mode : string
|
jpayne@69
|
4051 *mode* should be ``r`` for reading or ``w`` for writing. The default is
|
jpayne@69
|
4052 text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
|
jpayne@69
|
4053 ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
|
jpayne@69
|
4054
|
jpayne@69
|
4055 If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
|
jpayne@69
|
4056 modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
|
jpayne@69
|
4057 For instance, to open a :term:`BCF` formatted file for reading, type::
|
jpayne@69
|
4058
|
jpayne@69
|
4059 f = pysam.VariantFile('ex1.bcf','r')
|
jpayne@69
|
4060
|
jpayne@69
|
4061 If mode is not specified, we will try to auto-detect the file type. All
|
jpayne@69
|
4062 of the following should work::
|
jpayne@69
|
4063
|
jpayne@69
|
4064 f1 = pysam.VariantFile('ex1.bcf')
|
jpayne@69
|
4065 f2 = pysam.VariantFile('ex1.vcf')
|
jpayne@69
|
4066 f3 = pysam.VariantFile('ex1.vcf.gz')
|
jpayne@69
|
4067
|
jpayne@69
|
4068 index_filename : string
|
jpayne@69
|
4069 Explicit path to an index file.
|
jpayne@69
|
4070
|
jpayne@69
|
4071 header : VariantHeader
|
jpayne@69
|
4072 :class:`VariantHeader` object required for writing.
|
jpayne@69
|
4073
|
jpayne@69
|
4074 drop_samples: bool
|
jpayne@69
|
4075 Ignore sample information when reading.
|
jpayne@69
|
4076
|
jpayne@69
|
4077 duplicate_filehandle: bool
|
jpayne@69
|
4078 By default, file handles passed either directly or through
|
jpayne@69
|
4079 File-like objects will be duplicated before passing them to
|
jpayne@69
|
4080 htslib. The duplication prevents issues where the same stream
|
jpayne@69
|
4081 will be closed by htslib and through destruction of the
|
jpayne@69
|
4082 high-level python object. Set to False to turn off
|
jpayne@69
|
4083 duplication.
|
jpayne@69
|
4084
|
jpayne@69
|
4085 ignore_truncation: bool
|
jpayne@69
|
4086 Issue a warning, instead of raising an error if the current file
|
jpayne@69
|
4087 appears to be truncated due to a missing EOF marker. Only applies
|
jpayne@69
|
4088 to bgzipped formats. (Default=False)
|
jpayne@69
|
4089
|
jpayne@69
|
4090 threads: integer
|
jpayne@69
|
4091 Number of threads to use for compressing/decompressing VCF/BCF files.
|
jpayne@69
|
4092 Setting threads to > 1 cannot be combined with `ignore_truncation`.
|
jpayne@69
|
4093 (Default=1)
|
jpayne@69
|
4094
|
jpayne@69
|
4095 """
|
jpayne@69
|
4096 def __cinit__(self, *args, **kwargs):
|
jpayne@69
|
4097 self.htsfile = NULL
|
jpayne@69
|
4098
|
jpayne@69
|
4099 def __init__(self, *args, **kwargs):
|
jpayne@69
|
4100 self.header = None
|
jpayne@69
|
4101 self.index = None
|
jpayne@69
|
4102 self.filename = None
|
jpayne@69
|
4103 self.mode = None
|
jpayne@69
|
4104 self.threads = 1
|
jpayne@69
|
4105 self.index_filename = None
|
jpayne@69
|
4106 self.is_stream = False
|
jpayne@69
|
4107 self.is_remote = False
|
jpayne@69
|
4108 self.is_reading = False
|
jpayne@69
|
4109 self.drop_samples = False
|
jpayne@69
|
4110 self.header_written = False
|
jpayne@69
|
4111 self.start_offset = -1
|
jpayne@69
|
4112
|
jpayne@69
|
4113 self.open(*args, **kwargs)
|
jpayne@69
|
4114
|
jpayne@69
|
4115 def __dealloc__(self):
|
jpayne@69
|
4116 if not self.htsfile or not self.header:
|
jpayne@69
|
4117 return
|
jpayne@69
|
4118
|
jpayne@69
|
4119 # Write header if no records were written
|
jpayne@69
|
4120 if self.htsfile.is_write and not self.header_written:
|
jpayne@69
|
4121 with nogil:
|
jpayne@69
|
4122 bcf_hdr_write(self.htsfile, self.header.ptr)
|
jpayne@69
|
4123
|
jpayne@69
|
4124 cdef int ret = hts_close(self.htsfile)
|
jpayne@69
|
4125 self.htsfile = NULL
|
jpayne@69
|
4126 self.header = self.index = None
|
jpayne@69
|
4127
|
jpayne@69
|
4128 if ret < 0:
|
jpayne@69
|
4129 global errno
|
jpayne@69
|
4130 if errno == EPIPE:
|
jpayne@69
|
4131 errno = 0
|
jpayne@69
|
4132 else:
|
jpayne@69
|
4133 raise IOError(errno, force_str(strerror(errno)))
|
jpayne@69
|
4134
|
jpayne@69
|
4135 def close(self):
|
jpayne@69
|
4136 """closes the :class:`pysam.VariantFile`."""
|
jpayne@69
|
4137 if not self.htsfile:
|
jpayne@69
|
4138 return
|
jpayne@69
|
4139
|
jpayne@69
|
4140 # Write header if no records were written
|
jpayne@69
|
4141 if self.htsfile.is_write and not self.header_written:
|
jpayne@69
|
4142 with nogil:
|
jpayne@69
|
4143 bcf_hdr_write(self.htsfile, self.header.ptr)
|
jpayne@69
|
4144
|
jpayne@69
|
4145 cdef int ret = hts_close(self.htsfile)
|
jpayne@69
|
4146 self.htsfile = NULL
|
jpayne@69
|
4147 self.header = self.index = None
|
jpayne@69
|
4148
|
jpayne@69
|
4149 if ret < 0:
|
jpayne@69
|
4150 global errno
|
jpayne@69
|
4151 if errno == EPIPE:
|
jpayne@69
|
4152 errno = 0
|
jpayne@69
|
4153 else:
|
jpayne@69
|
4154 raise IOError(errno, force_str(strerror(errno)))
|
jpayne@69
|
4155
|
jpayne@69
|
4156 def __iter__(self):
|
jpayne@69
|
4157 if not self.is_open:
|
jpayne@69
|
4158 raise ValueError('I/O operation on closed file')
|
jpayne@69
|
4159
|
jpayne@69
|
4160 if self.htsfile.is_write:
|
jpayne@69
|
4161 raise ValueError('cannot iterate over Variantfile opened for writing')
|
jpayne@69
|
4162
|
jpayne@69
|
4163 self.is_reading = 1
|
jpayne@69
|
4164 return self
|
jpayne@69
|
4165
|
jpayne@69
|
4166 def __next__(self):
|
jpayne@69
|
4167 cdef int ret
|
jpayne@69
|
4168 cdef int errcode
|
jpayne@69
|
4169 cdef bcf1_t *record = bcf_init1()
|
jpayne@69
|
4170
|
jpayne@69
|
4171 if not record:
|
jpayne@69
|
4172 raise MemoryError('unable to allocate BCF record')
|
jpayne@69
|
4173
|
jpayne@69
|
4174 record.pos = -1
|
jpayne@69
|
4175 if self.drop_samples:
|
jpayne@69
|
4176 record.max_unpack = BCF_UN_SHR
|
jpayne@69
|
4177
|
jpayne@69
|
4178 with nogil:
|
jpayne@69
|
4179 ret = bcf_read1(self.htsfile, self.header.ptr, record)
|
jpayne@69
|
4180
|
jpayne@69
|
4181 if ret < 0:
|
jpayne@69
|
4182 errcode = record.errcode
|
jpayne@69
|
4183 bcf_destroy1(record)
|
jpayne@69
|
4184 if errcode:
|
jpayne@69
|
4185 raise IOError('unable to parse next record')
|
jpayne@69
|
4186 if ret == -1:
|
jpayne@69
|
4187 raise StopIteration
|
jpayne@69
|
4188 elif ret == -2:
|
jpayne@69
|
4189 raise IOError('truncated file')
|
jpayne@69
|
4190 elif errno:
|
jpayne@69
|
4191 raise IOError(errno, strerror(errno))
|
jpayne@69
|
4192 else:
|
jpayne@69
|
4193 raise IOError('unable to fetch next record')
|
jpayne@69
|
4194
|
jpayne@69
|
4195 return makeVariantRecord(self.header, record)
|
jpayne@69
|
4196
|
jpayne@69
|
4197 def copy(self):
|
jpayne@69
|
4198 if not self.is_open:
|
jpayne@69
|
4199 raise ValueError
|
jpayne@69
|
4200
|
jpayne@69
|
4201 cdef VariantFile vars = VariantFile.__new__(VariantFile)
|
jpayne@69
|
4202 cdef bcf_hdr_t *hdr
|
jpayne@69
|
4203
|
jpayne@69
|
4204 # FIXME: re-open using fd or else header and index could be invalid
|
jpayne@69
|
4205 vars.htsfile = self._open_htsfile()
|
jpayne@69
|
4206
|
jpayne@69
|
4207 if not vars.htsfile:
|
jpayne@69
|
4208 raise ValueError('Cannot re-open htsfile')
|
jpayne@69
|
4209
|
jpayne@69
|
4210 # minimize overhead by re-using header and index. This approach is
|
jpayne@69
|
4211 # currently risky, but see above for how this can be mitigated.
|
jpayne@69
|
4212 vars.header = self.header
|
jpayne@69
|
4213 vars.index = self.index
|
jpayne@69
|
4214
|
jpayne@69
|
4215 vars.filename = self.filename
|
jpayne@69
|
4216 vars.mode = self.mode
|
jpayne@69
|
4217 vars.threads = self.threads
|
jpayne@69
|
4218 vars.index_filename = self.index_filename
|
jpayne@69
|
4219 vars.drop_samples = self.drop_samples
|
jpayne@69
|
4220 vars.is_stream = self.is_stream
|
jpayne@69
|
4221 vars.is_remote = self.is_remote
|
jpayne@69
|
4222 vars.is_reading = self.is_reading
|
jpayne@69
|
4223 vars.start_offset = self.start_offset
|
jpayne@69
|
4224 vars.header_written = self.header_written
|
jpayne@69
|
4225
|
jpayne@69
|
4226 if self.htsfile.is_bin:
|
jpayne@69
|
4227 vars.seek(self.tell())
|
jpayne@69
|
4228 else:
|
jpayne@69
|
4229 with nogil:
|
jpayne@69
|
4230 hdr = bcf_hdr_read(vars.htsfile)
|
jpayne@69
|
4231 makeVariantHeader(hdr)
|
jpayne@69
|
4232
|
jpayne@69
|
4233 return vars
|
jpayne@69
|
4234
|
jpayne@69
|
4235 def open(self, filename, mode='r',
|
jpayne@69
|
4236 index_filename=None,
|
jpayne@69
|
4237 VariantHeader header=None,
|
jpayne@69
|
4238 drop_samples=False,
|
jpayne@69
|
4239 duplicate_filehandle=True,
|
jpayne@69
|
4240 ignore_truncation=False,
|
jpayne@69
|
4241 threads=1):
|
jpayne@69
|
4242 """open a vcf/bcf file.
|
jpayne@69
|
4243
|
jpayne@69
|
4244 If open is called on an existing VariantFile, the current file will be
|
jpayne@69
|
4245 closed and a new file will be opened.
|
jpayne@69
|
4246 """
|
jpayne@69
|
4247 cdef bcf_hdr_t *hdr
|
jpayne@69
|
4248 cdef BGZF *bgzfp
|
jpayne@69
|
4249 cdef hts_idx_t *idx
|
jpayne@69
|
4250 cdef tbx_t *tidx
|
jpayne@69
|
4251 cdef char *cfilename
|
jpayne@69
|
4252 cdef char *cindex_filename = NULL
|
jpayne@69
|
4253 cdef char *cmode
|
jpayne@69
|
4254
|
jpayne@69
|
4255 if threads > 1 and ignore_truncation:
|
jpayne@69
|
4256 # This won't raise errors if reaching a truncated alignment,
|
jpayne@69
|
4257 # because bgzf_mt_reader in htslib does not deal with
|
jpayne@69
|
4258 # bgzf_mt_read_block returning non-zero values, contrary
|
jpayne@69
|
4259 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
|
jpayne@69
|
4260 # Better to avoid this (for now) than to produce seemingly correct results.
|
jpayne@69
|
4261 raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
|
jpayne@69
|
4262 self.threads = threads
|
jpayne@69
|
4263
|
jpayne@69
|
4264 # close a previously opened file
|
jpayne@69
|
4265 if self.is_open:
|
jpayne@69
|
4266 self.close()
|
jpayne@69
|
4267
|
jpayne@69
|
4268 if not mode or mode[0] not in 'rwa':
|
jpayne@69
|
4269 raise ValueError('mode must begin with r, w or a')
|
jpayne@69
|
4270
|
jpayne@69
|
4271 self.duplicate_filehandle = duplicate_filehandle
|
jpayne@69
|
4272
|
jpayne@69
|
4273 format_modes = [m for m in mode[1:] if m in 'bcguz']
|
jpayne@69
|
4274 if len(format_modes) > 1:
|
jpayne@69
|
4275 raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
|
jpayne@69
|
4276
|
jpayne@69
|
4277 invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
|
jpayne@69
|
4278 if invalid_modes:
|
jpayne@69
|
4279 raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
|
jpayne@69
|
4280
|
jpayne@69
|
4281 # Autodetect mode from filename
|
jpayne@69
|
4282 if mode == 'w' and isinstance(filename, str):
|
jpayne@69
|
4283 if filename.endswith('.gz'):
|
jpayne@69
|
4284 mode = 'wz'
|
jpayne@69
|
4285 elif filename.endswith('.bcf'):
|
jpayne@69
|
4286 mode = 'wb'
|
jpayne@69
|
4287
|
jpayne@69
|
4288 # for htslib, wbu seems to not work
|
jpayne@69
|
4289 if mode == 'wbu':
|
jpayne@69
|
4290 mode = 'wb0'
|
jpayne@69
|
4291
|
jpayne@69
|
4292 self.mode = mode = force_bytes(mode)
|
jpayne@69
|
4293 try:
|
jpayne@69
|
4294 filename = encode_filename(filename)
|
jpayne@69
|
4295 self.is_remote = hisremote(filename)
|
jpayne@69
|
4296 self.is_stream = filename == b'-'
|
jpayne@69
|
4297 except TypeError:
|
jpayne@69
|
4298 filename = filename
|
jpayne@69
|
4299 self.is_remote = False
|
jpayne@69
|
4300 self.is_stream = True
|
jpayne@69
|
4301
|
jpayne@69
|
4302 self.filename = filename
|
jpayne@69
|
4303
|
jpayne@69
|
4304 if index_filename is not None:
|
jpayne@69
|
4305 self.index_filename = index_filename = encode_filename(index_filename)
|
jpayne@69
|
4306 else:
|
jpayne@69
|
4307 self.index_filename = None
|
jpayne@69
|
4308
|
jpayne@69
|
4309 self.drop_samples = bool(drop_samples)
|
jpayne@69
|
4310 self.header = None
|
jpayne@69
|
4311
|
jpayne@69
|
4312 self.header_written = False
|
jpayne@69
|
4313
|
jpayne@69
|
4314 if mode.startswith(b'w'):
|
jpayne@69
|
4315 # open file for writing
|
jpayne@69
|
4316 if index_filename is not None:
|
jpayne@69
|
4317 raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
|
jpayne@69
|
4318
|
jpayne@69
|
4319 # header structure (used for writing)
|
jpayne@69
|
4320 if header:
|
jpayne@69
|
4321 self.header = header.copy()
|
jpayne@69
|
4322 else:
|
jpayne@69
|
4323 self.header = VariantHeader()
|
jpayne@69
|
4324 #raise ValueError('a VariantHeader must be specified')
|
jpayne@69
|
4325
|
jpayne@69
|
4326 # Header is not written until the first write or on close
|
jpayne@69
|
4327 self.htsfile = self._open_htsfile()
|
jpayne@69
|
4328
|
jpayne@69
|
4329 if not self.htsfile:
|
jpayne@69
|
4330 raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
|
jpayne@69
|
4331
|
jpayne@69
|
4332 elif mode.startswith(b'r'):
|
jpayne@69
|
4333 # open file for reading
|
jpayne@69
|
4334 self.htsfile = self._open_htsfile()
|
jpayne@69
|
4335
|
jpayne@69
|
4336 if not self.htsfile:
|
jpayne@69
|
4337 if errno:
|
jpayne@69
|
4338 raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno))))
|
jpayne@69
|
4339 else:
|
jpayne@69
|
4340 raise ValueError('could not open variant file `{}`'.format(filename))
|
jpayne@69
|
4341
|
jpayne@69
|
4342 if self.htsfile.format.format not in (bcf, vcf):
|
jpayne@69
|
4343 raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
|
jpayne@69
|
4344
|
jpayne@69
|
4345 self.check_truncation(ignore_truncation)
|
jpayne@69
|
4346
|
jpayne@69
|
4347 with nogil:
|
jpayne@69
|
4348 hdr = bcf_hdr_read(self.htsfile)
|
jpayne@69
|
4349
|
jpayne@69
|
4350 try:
|
jpayne@69
|
4351 self.header = makeVariantHeader(hdr)
|
jpayne@69
|
4352 except ValueError:
|
jpayne@69
|
4353 raise ValueError('file `{}` does not have valid header (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
|
jpayne@69
|
4354
|
jpayne@69
|
4355 if isinstance(self.filename, bytes):
|
jpayne@69
|
4356 cfilename = self.filename
|
jpayne@69
|
4357 else:
|
jpayne@69
|
4358 cfilename = NULL
|
jpayne@69
|
4359
|
jpayne@69
|
4360 # check for index and open if present
|
jpayne@69
|
4361 if self.htsfile.format.format == bcf and cfilename:
|
jpayne@69
|
4362 if index_filename is not None:
|
jpayne@69
|
4363 cindex_filename = index_filename
|
jpayne@69
|
4364 with nogil:
|
jpayne@69
|
4365 idx = bcf_index_load2(cfilename, cindex_filename)
|
jpayne@69
|
4366 self.index = makeBCFIndex(self.header, idx)
|
jpayne@69
|
4367
|
jpayne@69
|
4368 elif self.htsfile.format.compression == bgzf and cfilename:
|
jpayne@69
|
4369 if index_filename is not None:
|
jpayne@69
|
4370 cindex_filename = index_filename
|
jpayne@69
|
4371 with nogil:
|
jpayne@69
|
4372 tidx = tbx_index_load2(cfilename, cindex_filename)
|
jpayne@69
|
4373 self.index = makeTabixIndex(tidx)
|
jpayne@69
|
4374
|
jpayne@69
|
4375 if not self.is_stream:
|
jpayne@69
|
4376 self.start_offset = self.tell()
|
jpayne@69
|
4377 else:
|
jpayne@69
|
4378 raise ValueError('unknown mode {}'.format(mode))
|
jpayne@69
|
4379
|
jpayne@69
|
4380 def reset(self):
|
jpayne@69
|
4381 """reset file position to beginning of file just after the header."""
|
jpayne@69
|
4382 return self.seek(self.start_offset)
|
jpayne@69
|
4383
|
jpayne@69
|
4384 def is_valid_tid(self, tid):
|
jpayne@69
|
4385 """
|
jpayne@69
|
4386 return True if the numerical :term:`tid` is valid; False otherwise.
|
jpayne@69
|
4387
|
jpayne@69
|
4388 returns -1 if reference is not known.
|
jpayne@69
|
4389 """
|
jpayne@69
|
4390 if not self.is_open:
|
jpayne@69
|
4391 raise ValueError('I/O operation on closed file')
|
jpayne@69
|
4392
|
jpayne@69
|
4393 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
4394 cdef int rid = tid
|
jpayne@69
|
4395 return 0 <= rid < hdr.n[BCF_DT_CTG]
|
jpayne@69
|
4396
|
jpayne@69
|
4397 def get_tid(self, reference):
|
jpayne@69
|
4398 """
|
jpayne@69
|
4399 return the numerical :term:`tid` corresponding to
|
jpayne@69
|
4400 :term:`reference`
|
jpayne@69
|
4401
|
jpayne@69
|
4402 returns -1 if reference is not known.
|
jpayne@69
|
4403 """
|
jpayne@69
|
4404 if not self.is_open:
|
jpayne@69
|
4405 raise ValueError('I/O operation on closed file')
|
jpayne@69
|
4406
|
jpayne@69
|
4407 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
|
jpayne@69
|
4408 reference = force_bytes(reference)
|
jpayne@69
|
4409 cdef khint_t k = kh_get_vdict(d, reference)
|
jpayne@69
|
4410 return kh_val_vdict(d, k).id if k != kh_end(d) else -1
|
jpayne@69
|
4411
|
jpayne@69
|
4412 def get_reference_name(self, tid):
|
jpayne@69
|
4413 """
|
jpayne@69
|
4414 return :term:`reference` name corresponding to numerical :term:`tid`
|
jpayne@69
|
4415 """
|
jpayne@69
|
4416 if not self.is_open:
|
jpayne@69
|
4417 raise ValueError('I/O operation on closed file')
|
jpayne@69
|
4418
|
jpayne@69
|
4419 cdef bcf_hdr_t *hdr = self.header.ptr
|
jpayne@69
|
4420 cdef int rid = tid
|
jpayne@69
|
4421 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
|
jpayne@69
|
4422 raise ValueError('Invalid tid')
|
jpayne@69
|
4423 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
|
jpayne@69
|
4424
|
jpayne@69
|
4425 def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
|
jpayne@69
|
4426 """fetch records in a :term:`region`, specified either by
|
jpayne@69
|
4427 :term:`contig`, *start*, and *end* (which are 0-based, half-open);
|
jpayne@69
|
4428 or alternatively by a samtools :term:`region` string (which is
|
jpayne@69
|
4429 1-based inclusive).
|
jpayne@69
|
4430
|
jpayne@69
|
4431 Without *contig* or *region* all mapped records will be fetched. The
|
jpayne@69
|
4432 records will be returned ordered by contig, which will not necessarily
|
jpayne@69
|
4433 be the order within the file.
|
jpayne@69
|
4434
|
jpayne@69
|
4435 Set *reopen* to true if you will be using multiple iterators on the
|
jpayne@69
|
4436 same file at the same time. The iterator returned will receive its
|
jpayne@69
|
4437 own copy of a filehandle to the file effectively re-opening the
|
jpayne@69
|
4438 file. Re-opening a file incurrs some overhead, so use with care.
|
jpayne@69
|
4439
|
jpayne@69
|
4440 If only *contig* is set, all records on *contig* will be fetched.
|
jpayne@69
|
4441 If both *region* and *contig* are given, an exception is raised.
|
jpayne@69
|
4442
|
jpayne@69
|
4443 Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
|
jpayne@69
|
4444 (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
|
jpayne@69
|
4445 read sequentially.
|
jpayne@69
|
4446 """
|
jpayne@69
|
4447 if not self.is_open:
|
jpayne@69
|
4448 raise ValueError('I/O operation on closed file')
|
jpayne@69
|
4449
|
jpayne@69
|
4450 if self.htsfile.is_write:
|
jpayne@69
|
4451 raise ValueError('cannot fetch from Variantfile opened for writing')
|
jpayne@69
|
4452
|
jpayne@69
|
4453 if contig is None and region is None:
|
jpayne@69
|
4454 self.is_reading = 1
|
jpayne@69
|
4455 bcf = self.copy() if reopen else self
|
jpayne@69
|
4456 bcf.seek(self.start_offset)
|
jpayne@69
|
4457 return iter(bcf)
|
jpayne@69
|
4458
|
jpayne@69
|
4459 if self.index is None:
|
jpayne@69
|
4460 raise ValueError('fetch requires an index')
|
jpayne@69
|
4461
|
jpayne@69
|
4462 _, tid, start, stop = self.parse_region(contig, start, stop, region,
|
jpayne@69
|
4463 None, end=end, reference=reference)
|
jpayne@69
|
4464
|
jpayne@69
|
4465 if contig is None:
|
jpayne@69
|
4466 contig = self.get_reference_name(tid)
|
jpayne@69
|
4467
|
jpayne@69
|
4468 self.is_reading = 1
|
jpayne@69
|
4469 return self.index.fetch(self, contig, start, stop, reopen)
|
jpayne@69
|
4470
|
jpayne@69
|
4471 def new_record(self, *args, **kwargs):
|
jpayne@69
|
4472 """Create a new empty :class:`VariantRecord`.
|
jpayne@69
|
4473
|
jpayne@69
|
4474 See :meth:`VariantHeader.new_record`
|
jpayne@69
|
4475 """
|
jpayne@69
|
4476 return self.header.new_record(*args, **kwargs)
|
jpayne@69
|
4477
|
jpayne@69
|
4478 cpdef int write(self, VariantRecord record) except -1:
|
jpayne@69
|
4479 """
|
jpayne@69
|
4480 write a single :class:`pysam.VariantRecord` to disk.
|
jpayne@69
|
4481
|
jpayne@69
|
4482 returns the number of bytes written.
|
jpayne@69
|
4483 """
|
jpayne@69
|
4484 if record is None:
|
jpayne@69
|
4485 raise ValueError('record must not be None')
|
jpayne@69
|
4486
|
jpayne@69
|
4487 if not self.is_open:
|
jpayne@69
|
4488 return ValueError('I/O operation on closed file')
|
jpayne@69
|
4489
|
jpayne@69
|
4490 if not self.htsfile.is_write:
|
jpayne@69
|
4491 raise ValueError('cannot write to a Variantfile opened for reading')
|
jpayne@69
|
4492
|
jpayne@69
|
4493 if not self.header_written:
|
jpayne@69
|
4494 self.header_written = True
|
jpayne@69
|
4495 with nogil:
|
jpayne@69
|
4496 bcf_hdr_write(self.htsfile, self.header.ptr)
|
jpayne@69
|
4497
|
jpayne@69
|
4498 #if record.header is not self.header:
|
jpayne@69
|
4499 # record.translate(self.header)
|
jpayne@69
|
4500 # raise ValueError('Writing records from a different VariantFile is not yet supported')
|
jpayne@69
|
4501
|
jpayne@69
|
4502 if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
|
jpayne@69
|
4503 msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
|
jpayne@69
|
4504 raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
|
jpayne@69
|
4505
|
jpayne@69
|
4506 # Sync END annotation before writing
|
jpayne@69
|
4507 bcf_sync_end(record)
|
jpayne@69
|
4508
|
jpayne@69
|
4509 cdef int ret
|
jpayne@69
|
4510
|
jpayne@69
|
4511 with nogil:
|
jpayne@69
|
4512 ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
|
jpayne@69
|
4513
|
jpayne@69
|
4514 if ret < 0:
|
jpayne@69
|
4515 raise IOError(errno, strerror(errno))
|
jpayne@69
|
4516
|
jpayne@69
|
4517 return ret
|
jpayne@69
|
4518
|
jpayne@69
|
4519 def subset_samples(self, include_samples):
|
jpayne@69
|
4520 """
|
jpayne@69
|
4521 Read only a subset of samples to reduce processing time and memory.
|
jpayne@69
|
4522 Must be called prior to retrieving records.
|
jpayne@69
|
4523 """
|
jpayne@69
|
4524 if not self.is_open:
|
jpayne@69
|
4525 raise ValueError('I/O operation on closed file')
|
jpayne@69
|
4526
|
jpayne@69
|
4527 if self.htsfile.is_write:
|
jpayne@69
|
4528 raise ValueError('cannot subset samples from Variantfile opened for writing')
|
jpayne@69
|
4529
|
jpayne@69
|
4530 if self.is_reading:
|
jpayne@69
|
4531 raise ValueError('cannot subset samples after fetching records')
|
jpayne@69
|
4532
|
jpayne@69
|
4533 self.header._subset_samples(include_samples)
|
jpayne@69
|
4534
|
jpayne@69
|
4535 # potentially unnecessary optimization that also sets max_unpack
|
jpayne@69
|
4536 if not include_samples:
|
jpayne@69
|
4537 self.drop_samples = True
|
jpayne@69
|
4538
|