comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcbcf.pyx @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 # cython: language_level=3
2 # cython: embedsignature=True
3 # cython: profile=True
4 ###############################################################################
5 ###############################################################################
6 ## Cython wrapper for htslib VCF/BCF reader/writer
7 ###############################################################################
8 #
9 # NOTICE: This code is incomplete and preliminary. It offers a nearly
10 # complete Pythonic interface to VCF/BCF metadata and data with
11 # reading and writing capability. Documentation and a unit test suite
12 # are in the works. The code is best tested under Python 2, but
13 # should also work with Python 3. Please report any remaining
14 # str/bytes issues on the github site when using Python 3 and I'll
15 # fix them promptly.
16 #
17 # Here is a minimal example of how to use the API:
18 #
19 # $ cat bcfview.py
20 # import sys
21 # from pysam import VariantFile
22 #
23 # bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
24 # bcf_out = VariantFile('-', 'w', header=bcf_in.header)
25 #
26 # for rec in bcf_in:
27 # bcf_out.write(rec)
28 #
29 # Performance is fairly close to that of bcftools view. Here is an example
30 # using some 1k Genomes data:
31 #
32 # $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
33 # 1103799
34 #
35 # real 0m56.114s
36 # user 1m4.489s
37 # sys 0m3.102s
38 #
39 # $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
40 # 1103800 # bcftools adds an extra header
41 #
42 # real 0m55.126s
43 # user 1m3.502s
44 # sys 0m3.459s
45 #
46 ###############################################################################
47 #
48 # TODO list:
49 #
50 # * more genotype methods
51 # * unit test suite (perhaps py.test based)
52 # * documentation
53 # * pickle support
54 # * left/right locus normalization
55 # * fix reopen to re-use fd
56 #
57 ###############################################################################
58 #
59 # The MIT License
60 #
61 # Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com)
62 #
63 # Permission is hereby granted, free of charge, to any person obtaining a
64 # copy of this software and associated documentation files (the "Software"),
65 # to deal in the Software without restriction, including without limitation
66 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
67 # and/or sell copies of the Software, and to permit persons to whom the
68 # Software is furnished to do so, subject to the following conditions:
69 #
70 # The above copyright notice and this permission notice shall be included in
71 # all copies or substantial portions of the Software.
72 #
73 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
74 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
75 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
76 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
77 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
78 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
79 # DEALINGS IN THE SOFTWARE.
80 #
81 ###############################################################################
82
83 from __future__ import division, print_function
84
85 import os
86 import sys
87
88 from libc.errno cimport errno, EPIPE
89 from libc.string cimport strcmp, strpbrk, strerror
90 from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
91
92 cimport cython
93
94 from cpython.object cimport PyObject
95 from cpython.ref cimport Py_INCREF
96 from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
97 from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
98 from cpython.bytes cimport PyBytes_FromStringAndSize
99 from cpython.unicode cimport PyUnicode_DecodeUTF8
100
101 from pysam.libchtslib cimport HTSFile, hisremote
102
103 from pysam.utils import unquoted_str
104
105
106 __all__ = ['VariantFile',
107 'VariantHeader',
108 'VariantHeaderRecord',
109 'VariantHeaderRecords',
110 'VariantMetadata',
111 'VariantHeaderMetadata',
112 'VariantContig',
113 'VariantHeaderContigs',
114 'VariantHeaderSamples',
115 'VariantRecordFilter',
116 'VariantRecordFormat',
117 'VariantRecordInfo',
118 'VariantRecordSamples',
119 'VariantRecord',
120 'VariantRecordSample',
121 'BaseIndex',
122 'BCFIndex',
123 'TabixIndex',
124 'BaseIterator',
125 'BCFIterator',
126 'TabixIterator',
127 'VariantRecord']
128
129 ########################################################################
130 ########################################################################
131 ## Constants
132 ########################################################################
133
134 cdef int MAX_POS = (1 << 31) - 1
135 cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
136 cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
137 cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
138
139
140 ########################################################################
141 ########################################################################
142 ## Python 3 compatibility functions
143 ########################################################################
144
145 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
146 from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes
147
148
149 ########################################################################
150 ########################################################################
151 ## Sentinel object
152 ########################################################################
153
154 cdef object _nothing = object()
155
156 ########################################################################
157 ########################################################################
158 ## VCF/BCF string intern system
159 ########################################################################
160
161 cdef dict bcf_str_cache = {}
162
163 cdef inline bcf_str_cache_get_charptr(const char* s):
164 if s == NULL:
165 return None
166
167 cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
168 if pystr:
169 return <object>pystr
170
171 val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
172
173 PyDict_SetItemString(bcf_str_cache, s, val)
174
175 return val
176
177
178 ########################################################################
179 ########################################################################
180 ## Genotype math
181 ########################################################################
182
183 cdef int comb(int n, int k) except -1:
184 """Return binomial coefficient: n choose k
185
186 >>> comb(5, 1)
187 5
188 >>> comb(5, 2)
189 10
190 >>> comb(2, 2)
191 1
192 >>> comb(100, 2)
193 4950
194 """
195 if k > n:
196 return 0
197 elif k == n:
198 return 1
199 elif k > n // 2:
200 k = n - k
201
202 cdef d, result
203
204 d = result = n - k + 1
205 for i in range(2, k + 1):
206 d += 1
207 result *= d
208 result //= i
209 return result
210
211
212 cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
213 """Return the count of genotypes expected for the given ploidy and number of alleles.
214
215 >>> bcf_geno_combinations(1, 2)
216 2
217 >>> bcf_geno_combinations(2, 2)
218 3
219 >>> bcf_geno_combinations(2, 3)
220 6
221 >>> bcf_geno_combinations(3, 2)
222 4
223 """
224 return comb(alleles + ploidy - 1, ploidy)
225
226
227 ########################################################################
228 ########################################################################
229 ## Low level type conversion helpers
230 ########################################################################
231
232
233 cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
234 return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)
235
236
237 cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
238 return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0
239
240
241 cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:
242
243 if sample < 0:
244 raise ValueError('genotype is only valid as a format field')
245
246 cdef int32_t *gt_arr = NULL
247 cdef int ngt = 0
248 ngt = bcf_get_genotypes(hdr, rec, &gt_arr, &ngt)
249
250 if ngt <= 0 or not gt_arr:
251 return 0
252
253 assert ngt % rec.n_sample == 0
254 cdef int max_ploidy = ngt // rec.n_sample
255 cdef int32_t *gt = gt_arr + sample * max_ploidy
256 cdef int ploidy = 0
257
258 while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
259 gt += 1
260 ploidy += 1
261
262 free(<void*>gt_arr)
263
264 return bcf_geno_combinations(ploidy, rec.n_allele)
265
266
267 cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
268 if not a:
269 return None
270 try:
271 return tuple(charptr_to_str(a[i]) for i in range(n))
272 finally:
273 if free_after and a:
274 free(a)
275
276
277 cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
278 cdef char *datac
279 cdef int8_t *data8
280 cdef int16_t *data16
281 cdef int32_t *data32
282 cdef float *dataf
283 cdef int i
284 cdef bytes b
285
286 if not data or n <= 0:
287 return None
288
289 if type == BCF_BT_CHAR:
290 datac = <char *>data
291
292 if not n:
293 value = ()
294 else:
295 # Check if at least one null terminator is present
296 if datac[n-1] == bcf_str_vector_end:
297 # If so, create a string up to the first null terminator
298 b = datac
299 else:
300 # Otherwise, copy the entire block
301 b = datac[:n]
302 value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
303 else:
304 value = []
305 if type == BCF_BT_INT8:
306 data8 = <int8_t *>data
307 for i in range(n):
308 if data8[i] == bcf_int8_vector_end:
309 break
310 value.append(data8[i] if data8[i] != bcf_int8_missing else None)
311 elif type == BCF_BT_INT16:
312 data16 = <int16_t *>data
313 for i in range(n):
314 if data16[i] == bcf_int16_vector_end:
315 break
316 value.append(data16[i] if data16[i] != bcf_int16_missing else None)
317 elif type == BCF_BT_INT32:
318 data32 = <int32_t *>data
319 for i in range(n):
320 if data32[i] == bcf_int32_vector_end:
321 break
322 value.append(data32[i] if data32[i] != bcf_int32_missing else None)
323 elif type == BCF_BT_FLOAT:
324 dataf = <float *>data
325 for i in range(n):
326 if bcf_float_is_vector_end(dataf[i]):
327 break
328 value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
329 else:
330 raise TypeError('unsupported info type code')
331
332 # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
333 if not value:
334 if scalar:
335 value = None
336 elif count <= 0:
337 value = ()
338 else:
339 value = (None,)*count
340 elif scalar and len(value) == 1:
341 value = value[0]
342 else:
343 value = tuple(value)
344
345 return value
346
347
348 cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
349 cdef char *datac
350 cdef int8_t *data8
351 cdef int16_t *data16
352 cdef int32_t *data32
353 cdef float *dataf
354 cdef ssize_t i, value_count = len(values)
355
356 assert value_count <= n
357
358 if bt_type == BCF_BT_CHAR:
359 if not isinstance(values, (str, bytes)):
360 values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
361 value_count = len(values)
362 assert value_count <= n
363 datac = <char *>data
364 memcpy(datac, <char *>values, value_count)
365 for i in range(value_count, n):
366 datac[i] = 0
367 elif bt_type == BCF_BT_INT8:
368 datai8 = <int8_t *>data
369 for i in range(value_count):
370 val = values[i]
371 datai8[i] = val if val is not None else bcf_int8_missing
372 for i in range(value_count, n):
373 datai8[i] = bcf_int8_vector_end
374 elif bt_type == BCF_BT_INT16:
375 datai16 = <int16_t *>data
376 for i in range(value_count):
377 val = values[i]
378 datai16[i] = val if val is not None else bcf_int16_missing
379 for i in range(value_count, n):
380 datai16[i] = bcf_int16_vector_end
381 elif bt_type == BCF_BT_INT32:
382 datai32 = <int32_t *>data
383 for i in range(value_count):
384 val = values[i]
385 datai32[i] = val if val is not None else bcf_int32_missing
386 for i in range(value_count, n):
387 datai32[i] = bcf_int32_vector_end
388 elif bt_type == BCF_BT_FLOAT:
389 dataf = <float *>data
390 for i in range(value_count):
391 val = values[i]
392 if val is None:
393 bcf_float_set(dataf + i, bcf_float_missing)
394 else:
395 dataf[i] = val
396 for i in range(value_count, n):
397 bcf_float_set(dataf + i, bcf_float_vector_end)
398 else:
399 raise TypeError('unsupported type')
400
401
402 cdef bcf_empty_array(int type, ssize_t n, int vlen):
403 cdef char *datac
404 cdef int32_t *data32
405 cdef float *dataf
406 cdef int i
407
408 if n <= 0:
409 raise ValueError('Cannot create empty array')
410
411 if type == BCF_HT_STR:
412 value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
413 datac = <char *>value
414 for i in range(n):
415 datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
416 elif type == BCF_HT_INT:
417 value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
418 data32 = <int32_t *><char *>value
419 for i in range(n):
420 data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
421 elif type == BCF_HT_REAL:
422 value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
423 dataf = <float *><char *>value
424 for i in range(n):
425 bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
426 else:
427 raise TypeError('unsupported header type code')
428
429 return value
430
431
432 cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
433 void *dst_data, int dst_type, size_t dst_values,
434 int vlen):
435 """copy data from src to dest where the size of the elements (src_type/dst_type) differ
436 as well as the number of elements (src_values/dst_values).
437 """
438
439 cdef char *src_datac
440 cdef char *dst_datac
441 cdef int8_t *src_datai8
442 cdef int16_t *src_datai16
443 cdef int32_t *src_datai32
444 cdef int32_t *dst_datai
445 cdef float *src_dataf
446 cdef float *dst_dataf
447 cdef ssize_t src_size, dst_size, i, j
448 cdef int val
449
450 if src_values > dst_values:
451 raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
452
453 if src_type == dst_type == BCF_BT_CHAR:
454 src_datac = <char *>src_data
455 dst_datac = <char *>dst_data
456 memcpy(dst_datac, src_datac, src_values)
457 for i in range(src_values, dst_values):
458 dst_datac[i] = 0
459 elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
460 src_datai8 = <int8_t *>src_data
461 dst_datai = <int32_t *>dst_data
462 for i in range(src_values):
463 val = src_datai8[i]
464 if val == bcf_int8_missing:
465 val = bcf_int32_missing
466 elif val == bcf_int8_vector_end:
467 val = bcf_int32_vector_end
468 dst_datai[i] = val
469 for i in range(src_values, dst_values):
470 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
471 elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
472 src_datai16 = <int16_t *>src_data
473 dst_datai = <int32_t *>dst_data
474 for i in range(src_values):
475 val = src_datai16[i]
476 if val == bcf_int16_missing:
477 val = bcf_int32_missing
478 elif val == bcf_int16_vector_end:
479 val = bcf_int32_vector_end
480 dst_datai[i] = val
481 for i in range(src_values, dst_values):
482 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
483 elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
484 src_datai32 = <int32_t *>src_data
485 dst_datai = <int32_t *>dst_data
486 for i in range(src_values):
487 dst_datai[i] = src_datai32[i]
488 for i in range(src_values, dst_values):
489 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
490 elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
491 src_dataf = <float *>src_data
492 dst_dataf = <float *>dst_data
493 for i in range(src_values):
494 dst_dataf[i] = src_dataf[i]
495 for i in range(src_values, dst_values):
496 bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
497 else:
498 raise TypeError('unsupported types')
499
500
501 cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
502 if record is None:
503 raise ValueError('record must not be None')
504
505 cdef bcf_hdr_t *hdr = record.header.ptr
506 cdef bcf1_t *r = record.ptr
507
508 if not check_header_id(hdr, hl_type, id):
509 raise ValueError('Invalid header')
510
511 cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
512 cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
513
514 scalar[0] = 0
515
516 if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
517 count[0] = number
518 elif length == BCF_VL_FIXED:
519 if number == 1:
520 scalar[0] = 1
521 count[0] = number
522 elif length == BCF_VL_R:
523 count[0] = r.n_allele
524 elif length == BCF_VL_A:
525 count[0] = r.n_allele - 1
526 elif length == BCF_VL_G:
527 count[0] = bcf_genotype_count(hdr, r, sample)
528 elif length == BCF_VL_VAR:
529 count[0] = -1
530 else:
531 raise ValueError('Unknown format length')
532
533
534 cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
535 if record is None:
536 raise ValueError('record must not be None')
537
538 cdef bcf_hdr_t *hdr = record.header.ptr
539
540 cdef char *s
541 cdef ssize_t count
542 cdef int scalar
543
544 bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)
545
546 if z.len == 0:
547 if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
548 value = True
549 elif scalar:
550 value = None
551 else:
552 value = ()
553 elif z.len == 1:
554 if z.type == BCF_BT_INT8:
555 value = z.v1.i if z.v1.i != bcf_int8_missing else None
556 elif z.type == BCF_BT_INT16:
557 value = z.v1.i if z.v1.i != bcf_int16_missing else None
558 elif z.type == BCF_BT_INT32:
559 value = z.v1.i if z.v1.i != bcf_int32_missing else None
560 elif z.type == BCF_BT_FLOAT:
561 value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
562 elif z.type == BCF_BT_CHAR:
563 value = force_str(chr(z.v1.i))
564 else:
565 raise TypeError('unsupported info type code')
566
567 if not scalar and value != ():
568 value = (value,)
569 else:
570 value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
571
572 return value
573
574
575 cdef object bcf_check_values(VariantRecord record, value, int sample,
576 int hl_type, int ht_type,
577 int id, int bt_type, ssize_t bt_len,
578 ssize_t *value_count, int *scalar, int *realloc):
579
580 if record is None:
581 raise ValueError('record must not be None')
582
583 bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)
584
585 # Validate values now that we know the type and size
586 values = (value,) if not isinstance(value, (list, tuple)) else value
587
588 # Validate values now that we know the type and size
589 if ht_type == BCF_HT_FLAG:
590 value_count[0] = 1
591 elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
592 # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
593 value_count[0] = -1
594
595 cdef int given = len(values)
596 if value_count[0] != -1 and value_count[0] != given:
597 if scalar[0]:
598 raise TypeError('value expected to be scalar, given len={}'.format(given))
599 else:
600 raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
601
602 if ht_type == BCF_HT_REAL:
603 for v in values:
604 if not(v is None or isinstance(v, (float, int))):
605 raise TypeError('invalid value for Float format')
606 elif ht_type == BCF_HT_INT:
607 for v in values:
608 if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
609 raise TypeError('invalid value for Integer format')
610 for v in values:
611 if not(v is None or bcf_int32_missing < v <= INT32_MAX):
612 raise ValueError('Integer value too small/large to store in VCF/BCF')
613 elif ht_type == BCF_HT_STR:
614 values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
615 elif ht_type == BCF_HT_FLAG:
616 if values[0] not in (True, False, None, 1, 0):
617 raise ValueError('Flag values must be: True, False, None, 1, 0')
618 else:
619 raise TypeError('unsupported type')
620
621 realloc[0] = 0
622 if len(values) <= 1 and hl_type == BCF_HL_INFO:
623 realloc[0] = 0
624 elif len(values) > bt_len:
625 realloc[0] = 1
626 elif bt_type == BCF_BT_INT8:
627 for v in values:
628 if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
629 realloc[0] = 1
630 break
631 elif bt_type == BCF_BT_INT16:
632 for v in values:
633 if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
634 realloc[0] = 1
635 break
636
637 return values
638
639
640 cdef bcf_encode_alleles(VariantRecord record, values):
641 if record is None:
642 raise ValueError('record must not be None')
643
644 cdef bcf1_t *r = record.ptr
645 cdef int32_t nalleles = r.n_allele
646 cdef list gt_values = []
647 cdef char *s
648 cdef int i
649
650 if values is None:
651 return ()
652
653 if not isinstance(values, (list, tuple)):
654 values = (values,)
655
656 for value in values:
657 if value is None:
658 gt_values.append(bcf_gt_missing)
659 elif isinstance(value, (str, bytes)):
660 bvalue = force_bytes(value)
661 s = bvalue
662 for i in range(r.n_allele):
663 if strcmp(r.d.allele[i], s) != 0:
664 gt_values.append(bcf_gt_unphased(i))
665 break
666 else:
667 raise ValueError('Unknown allele')
668 else:
669 i = value
670 if not (0 <= i < nalleles):
671 raise ValueError('Invalid allele index')
672 gt_values.append(bcf_gt_unphased(i))
673
674 return gt_values
675
676
677 cdef bcf_info_set_value(VariantRecord record, key, value):
678 if record is None:
679 raise ValueError('record must not be None')
680
681 cdef bcf_hdr_t *hdr = record.header.ptr
682 cdef bcf1_t *r = record.ptr
683 cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
684 cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
685
686 if bcf_unpack(r, BCF_UN_INFO) < 0:
687 raise ValueError('Error unpacking VariantRecord')
688
689 cdef bytes bkey = force_bytes(key)
690 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
691
692 if info:
693 info_id = info.key
694 else:
695 info_id = bcf_header_get_info_id(hdr, bkey)
696
697 if info_id < 0:
698 raise KeyError('unknown INFO: {}'.format(key))
699
700 if not check_header_id(hdr, BCF_HL_INFO, info_id):
701 raise ValueError('Invalid header')
702
703 info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
704 values = bcf_check_values(record, value, -1,
705 BCF_HL_INFO, info_type, info_id,
706 info.type if info else -1,
707 info.len if info else -1,
708 &value_count, &scalar, &realloc)
709
710 if info_type == BCF_HT_FLAG:
711 if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
712 raise ValueError('Unable to update INFO values')
713 return
714
715 vlen = value_count < 0
716 value_count = len(values)
717
718 # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
719 # If we can, write updated values to existing allocated storage
720 if 0 and info and not realloc:
721 r.d.shared_dirty |= BCF1_DIRTY_INF
722
723 if value_count == 0:
724 info.len = 0
725 if not info.vptr:
726 info.vptr = <uint8_t *>&info.v1.i
727
728 elif value_count == 1:
729 # FIXME: Check if need to free vptr if info.len > 0?
730 if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
731 bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
732 elif info.type == BCF_BT_FLOAT:
733 bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
734 else:
735 raise TypeError('unsupported info type code')
736
737 info.len = 1
738 if not info.vptr:
739 info.vptr = <uint8_t *>&info.v1.i
740 else:
741 bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
742
743 return
744
745 alloc_len = max(1, value_count)
746 if info and info.len > alloc_len:
747 alloc_len = info.len
748
749 new_values = bcf_empty_array(info_type, alloc_len, vlen)
750 cdef char *valp = <char *>new_values
751
752 if info_type == BCF_HT_INT:
753 dst_type = BCF_BT_INT32
754 elif info_type == BCF_HT_REAL:
755 dst_type = BCF_BT_FLOAT
756 elif info_type == BCF_HT_STR:
757 dst_type = BCF_BT_CHAR
758 else:
759 raise ValueError('Unsupported INFO type')
760
761 bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
762
763 if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
764 raise ValueError('Unable to update INFO values')
765
766
767 cdef bcf_info_del_value(VariantRecord record, key):
768 if record is None:
769 raise ValueError('record must not be None')
770
771 cdef bcf_hdr_t *hdr = record.header.ptr
772 cdef bcf1_t *r = record.ptr
773 cdef ssize_t value_count
774 cdef int scalar
775
776 if bcf_unpack(r, BCF_UN_INFO) < 0:
777 raise ValueError('Error unpacking VariantRecord')
778
779 cdef bytes bkey = force_bytes(key)
780 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
781
782 if not info:
783 raise KeyError(key)
784
785 bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)
786
787 if value_count <= 0:
788 null_value = ()
789 elif scalar:
790 null_value = None
791 else:
792 null_value = (None,)*value_count
793
794 bcf_info_set_value(record, bkey, null_value)
795
796
797 cdef bcf_format_get_value(VariantRecordSample sample, key):
798 if sample is None:
799 raise ValueError('sample must not be None')
800
801 cdef bcf_hdr_t *hdr = sample.record.header.ptr
802 cdef bcf1_t *r = sample.record.ptr
803 cdef ssize_t count
804 cdef int scalar
805
806 if bcf_unpack(r, BCF_UN_ALL) < 0:
807 raise ValueError('Error unpacking VariantRecord')
808
809 cdef bytes bkey = force_bytes(key)
810 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
811
812 if not fmt or not fmt.p:
813 raise KeyError('invalid FORMAT: {}'.format(key))
814
815 if is_gt_fmt(hdr, fmt.id):
816 return bcf_format_get_allele_indices(sample)
817
818 bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)
819
820 if fmt.p and fmt.n and fmt.size:
821 return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
822 elif scalar:
823 return None
824 elif count <= 0:
825 return ()
826 else:
827 return (None,)*count
828
829
830 cdef bcf_format_set_value(VariantRecordSample sample, key, value):
831 if sample is None:
832 raise ValueError('sample must not be None')
833
834 if key == 'phased':
835 sample.phased = bool(value)
836 return
837
838 cdef bcf_hdr_t *hdr = sample.record.header.ptr
839 cdef bcf1_t *r = sample.record.ptr
840 cdef int fmt_id
841 cdef vdict_t *d
842 cdef khiter_t k
843 cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
844 cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size
845
846 if bcf_unpack(r, BCF_UN_ALL) < 0:
847 raise ValueError('Error unpacking VariantRecord')
848
849 cdef bytes bkey = force_bytes(key)
850 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
851
852 if fmt:
853 fmt_id = fmt.id
854 else:
855 d = <vdict_t *>hdr.dict[BCF_DT_ID]
856 k = kh_get_vdict(d, bkey)
857
858 if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
859 raise KeyError('unknown format: {}'.format(key))
860
861 fmt_id = kh_val_vdict(d, k).id
862
863 if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
864 raise ValueError('Invalid header')
865
866 fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
867
868 if fmt_type == BCF_HT_FLAG:
869 raise ValueError('Flag types are not allowed on FORMATs')
870
871 if is_gt_fmt(hdr, fmt_id):
872 value = bcf_encode_alleles(sample.record, value)
873 # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
874 fmt_type = BCF_HT_INT
875
876 values = bcf_check_values(sample.record, value, sample.index,
877 BCF_HL_FMT, fmt_type, fmt_id,
878 fmt.type if fmt else -1,
879 fmt.n if fmt else -1,
880 &value_count, &scalar, &realloc)
881 vlen = value_count < 0
882 value_count = len(values)
883
884 # If we can, write updated values to existing allocated storage.
885 if fmt and not realloc:
886 r.d.indiv_dirty = 1
887 bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
888 return
889
890 alloc_len = max(1, value_count)
891 if fmt and fmt.n > alloc_len:
892 alloc_len = fmt.n
893
894 nsamples = r.n_sample
895 new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
896 cdef char *new_values_p = <char *>new_values
897
898 if fmt_type == BCF_HT_INT:
899 dst_type = BCF_BT_INT32
900 dst_size = sizeof(int32_t) * alloc_len
901 elif fmt_type == BCF_HT_REAL:
902 dst_type = BCF_BT_FLOAT
903 dst_size = sizeof(float) * alloc_len
904 elif fmt_type == BCF_HT_STR:
905 dst_type = BCF_BT_CHAR
906 dst_size = sizeof(char) * alloc_len
907 else:
908 raise ValueError('Unsupported FORMAT type')
909
910 if fmt and nsamples > 1:
911 for i in range(nsamples):
912 bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
913 new_values_p + i * dst_size, dst_type, alloc_len,
914 vlen)
915
916 bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)
917
918 if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
919 raise ValueError('Unable to update format values')
920
921
922 cdef bcf_format_del_value(VariantRecordSample sample, key):
923 if sample is None:
924 raise ValueError('sample must not be None')
925
926 cdef bcf_hdr_t *hdr = sample.record.header.ptr
927 cdef bcf1_t *r = sample.record.ptr
928 cdef ssize_t value_count
929 cdef int scalar
930
931 if bcf_unpack(r, BCF_UN_ALL) < 0:
932 raise ValueError('Error unpacking VariantRecord')
933
934 cdef bytes bkey = force_bytes(key)
935 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
936
937 if not fmt or not fmt.p:
938 raise KeyError(key)
939
940 bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index)
941
942 if value_count <= 0:
943 null_value = ()
944 elif scalar:
945 null_value = None
946 else:
947 null_value = (None,)*value_count
948
949 bcf_format_set_value(sample, bkey, null_value)
950
951
952 cdef bcf_format_get_allele_indices(VariantRecordSample sample):
953 if sample is None:
954 raise ValueError('sample must not be None')
955
956 cdef bcf_hdr_t *hdr = sample.record.header.ptr
957 cdef bcf1_t *r = sample.record.ptr
958 cdef int32_t n = r.n_sample
959
960 if bcf_unpack(r, BCF_UN_ALL) < 0:
961 raise ValueError('Error unpacking VariantRecord')
962
963 if sample.index < 0 or sample.index >= n or not r.n_fmt:
964 return ()
965
966 cdef bcf_fmt_t *fmt0 = r.d.fmt
967 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
968
969 if not gt0 or not fmt0.n:
970 return ()
971
972 cdef int8_t *data8
973 cdef int16_t *data16
974 cdef int32_t *data32
975 cdef int32_t a, nalleles = r.n_allele
976 cdef list alleles = []
977
978 if fmt0.type == BCF_BT_INT8:
979 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
980 for i in range(fmt0.n):
981 if data8[i] == bcf_int8_vector_end:
982 break
983 elif data8[i] == bcf_gt_missing:
984 a = -1
985 else:
986 a = bcf_gt_allele(data8[i])
987 alleles.append(a if 0 <= a < nalleles else None)
988 elif fmt0.type == BCF_BT_INT16:
989 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
990 for i in range(fmt0.n):
991 if data16[i] == bcf_int16_vector_end:
992 break
993 elif data16[i] == bcf_gt_missing:
994 a = -1
995 else:
996 a = bcf_gt_allele(data16[i])
997 alleles.append(a if 0 <= a < nalleles else None)
998 elif fmt0.type == BCF_BT_INT32:
999 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
1000 for i in range(fmt0.n):
1001 if data32[i] == bcf_int32_vector_end:
1002 break
1003 elif data32[i] == bcf_gt_missing:
1004 a = -1
1005 else:
1006 a = bcf_gt_allele(data32[i])
1007 alleles.append(a if 0 <= a < nalleles else None)
1008
1009 return tuple(alleles)
1010
1011
1012 cdef bcf_format_get_alleles(VariantRecordSample sample):
1013 if sample is None:
1014 raise ValueError('sample must not be None')
1015
1016 cdef bcf_hdr_t *hdr = sample.record.header.ptr
1017 cdef bcf1_t *r = sample.record.ptr
1018 cdef int32_t nsamples = r.n_sample
1019
1020 if bcf_unpack(r, BCF_UN_ALL) < 0:
1021 raise ValueError('Error unpacking VariantRecord')
1022
1023 cdef int32_t nalleles = r.n_allele
1024
1025 if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
1026 return ()
1027
1028 cdef bcf_fmt_t *fmt0 = r.d.fmt
1029 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
1030
1031 if not gt0 or not fmt0.n:
1032 return ()
1033
1034 cdef int32_t a
1035 cdef int8_t *data8
1036 cdef int16_t *data16
1037 cdef int32_t *data32
1038 alleles = []
1039 if fmt0.type == BCF_BT_INT8:
1040 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
1041 for i in range(fmt0.n):
1042 if data8[i] == bcf_int8_vector_end:
1043 break
1044 a = bcf_gt_allele(data8[i])
1045 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
1046 elif fmt0.type == BCF_BT_INT16:
1047 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
1048 for i in range(fmt0.n):
1049 if data16[i] == bcf_int16_vector_end:
1050 break
1051 a = bcf_gt_allele(data16[i])
1052 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
1053 elif fmt0.type == BCF_BT_INT32:
1054 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
1055 for i in range(fmt0.n):
1056 if data32[i] == bcf_int32_vector_end:
1057 break
1058 a = bcf_gt_allele(data32[i])
1059 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
1060 return tuple(alleles)
1061
1062
1063 cdef bint bcf_sample_get_phased(VariantRecordSample sample):
1064 if sample is None:
1065 raise ValueError('sample must not be None')
1066
1067 cdef bcf_hdr_t *hdr = sample.record.header.ptr
1068 cdef bcf1_t *r = sample.record.ptr
1069 cdef int32_t n = r.n_sample
1070
1071 if bcf_unpack(r, BCF_UN_ALL) < 0:
1072 raise ValueError('Error unpacking VariantRecord')
1073
1074 if sample.index < 0 or sample.index >= n or not r.n_fmt:
1075 return False
1076
1077 cdef bcf_fmt_t *fmt0 = r.d.fmt
1078 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
1079
1080 if not gt0 or not fmt0.n:
1081 return False
1082
1083 cdef int8_t *data8
1084 cdef int16_t *data16
1085 cdef int32_t *data32
1086
1087 cdef bint phased = False
1088
1089 if fmt0.type == BCF_BT_INT8:
1090 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
1091 for i in range(fmt0.n):
1092 if data8[i] == bcf_int8_vector_end:
1093 break
1094 elif data8[i] == bcf_int8_missing:
1095 continue
1096 elif i and not bcf_gt_is_phased(data8[i]):
1097 return False
1098 else:
1099 phased = True
1100 elif fmt0.type == BCF_BT_INT16:
1101 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
1102 for i in range(fmt0.n):
1103 if data16[i] == bcf_int16_vector_end:
1104 break
1105 elif data16[i] == bcf_int16_missing:
1106 continue
1107 elif i and not bcf_gt_is_phased(data16[i]):
1108 return False
1109 else:
1110 phased = True
1111 elif fmt0.type == BCF_BT_INT32:
1112 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
1113 for i in range(fmt0.n):
1114 if data32[i] == bcf_int32_vector_end:
1115 break
1116 elif data32[i] == bcf_int32_missing:
1117 continue
1118 elif i and not bcf_gt_is_phased(data32[i]):
1119 return False
1120 else:
1121 phased = True
1122
1123 return phased
1124
1125
1126 cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
1127 if sample is None:
1128 raise ValueError('sample must not be None')
1129
1130 cdef bcf_hdr_t *hdr = sample.record.header.ptr
1131 cdef bcf1_t *r = sample.record.ptr
1132 cdef int32_t n = r.n_sample
1133
1134 if bcf_unpack(r, BCF_UN_ALL) < 0:
1135 raise ValueError('Error unpacking VariantRecord')
1136
1137 if sample.index < 0 or sample.index >= n or not r.n_fmt:
1138 return
1139
1140 cdef bcf_fmt_t *fmt0 = r.d.fmt
1141 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
1142
1143 if not gt0 or not fmt0.n:
1144 raise ValueError('Cannot set phased before genotype is set')
1145
1146 cdef int8_t *data8
1147 cdef int16_t *data16
1148 cdef int32_t *data32
1149
1150 if fmt0.type == BCF_BT_INT8:
1151 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
1152 for i in range(fmt0.n):
1153 if data8[i] == bcf_int8_vector_end:
1154 break
1155 elif data8[i] == bcf_int8_missing:
1156 continue
1157 elif i:
1158 data8[i] = (data8[i] & 0xFE) | phased
1159 elif fmt0.type == BCF_BT_INT16:
1160 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
1161 for i in range(fmt0.n):
1162 if data16[i] == bcf_int16_vector_end:
1163 break
1164 elif data16[i] == bcf_int16_missing:
1165 continue
1166 elif i:
1167 data16[i] = (data16[i] & 0xFFFE) | phased
1168 elif fmt0.type == BCF_BT_INT32:
1169 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
1170 for i in range(fmt0.n):
1171 if data32[i] == bcf_int32_vector_end:
1172 break
1173 elif data32[i] == bcf_int32_missing:
1174 continue
1175 elif i:
1176 data32[i] = (data32[i] & 0xFFFFFFFE) | phased
1177
1178
1179 cdef inline bcf_sync_end(VariantRecord record):
1180 cdef bcf_hdr_t *hdr = record.header.ptr
1181 cdef bcf_info_t *info
1182 cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
1183 cdef int ref_len
1184
1185 # allow missing ref when instantiating a new record
1186 if record.ref is not None:
1187 ref_len = len(record.ref)
1188 else:
1189 ref_len = 0
1190
1191 # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
1192 # Always keep END for symbolic alleles
1193 if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len):
1194 # If INFO/END is not defined in the header, it doesn't exist in the record
1195 if end_id >= 0:
1196 info = bcf_get_info(hdr, record.ptr, b'END')
1197 if info and info.vptr:
1198 if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
1199 raise ValueError('Unable to delete END')
1200 else:
1201 # Create END header, if not present
1202 if end_id < 0:
1203 record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')
1204
1205 # Update to reflect stop position
1206 bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)
1207
1208
1209 cdef inline int has_symbolic_allele(VariantRecord record):
1210 """Return index of first symbolic allele. 0 if no symbolic alleles."""
1211
1212 for i in range(1, record.ptr.n_allele):
1213 alt = record.ptr.d.allele[i]
1214 if alt[0] == b'<' and alt[len(alt) - 1] == b'>':
1215 return i
1216
1217 return 0
1218
1219
1220 ########################################################################
1221 ########################################################################
1222 ## Variant Header objects
1223 ########################################################################
1224
1225
1226 cdef bcf_header_remove_hrec(VariantHeader header, int i):
1227 if header is None:
1228 raise ValueError('header must not be None')
1229
1230 cdef bcf_hdr_t *hdr = header.ptr
1231
1232 if i < 0 or i >= hdr.nhrec:
1233 raise ValueError('Invalid header record index')
1234
1235 cdef bcf_hrec_t *hrec = hdr.hrec[i]
1236 hdr.nhrec -= 1
1237
1238 if i < hdr.nhrec:
1239 memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))
1240
1241 bcf_hrec_destroy(hrec)
1242 hdr.hrec[hdr.nhrec] = NULL
1243 hdr.dirty = 1
1244
1245
1246 #FIXME: implement a full mapping interface
1247 #FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
1248 # object lifetime.
1249 cdef class VariantHeaderRecord(object):
1250 """header record from a :class:`VariantHeader` object"""
1251 def __init__(self, *args, **kwargs):
1252 raise TypeError('this class cannot be instantiated from Python')
1253
1254 @property
1255 def type(self):
1256 """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
1257 cdef bcf_hrec_t *r = self.ptr
1258 if not r:
1259 return None
1260 return METADATA_TYPES[r.type]
1261
1262 @property
1263 def key(self):
1264 """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
1265 cdef bcf_hrec_t *r = self.ptr
1266 return bcf_str_cache_get_charptr(r.key) if r and r.key else None
1267
1268 @property
1269 def value(self):
1270 """header value. Set only for generic lines, None for FILTER/INFO, etc."""
1271 cdef bcf_hrec_t *r = self.ptr
1272 return charptr_to_str(r.value) if r and r.value else None
1273
1274 @property
1275 def attrs(self):
1276 """sequence of additional header attributes"""
1277 cdef bcf_hrec_t *r = self.ptr
1278 if not r:
1279 return ()
1280 cdef int i
1281 return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
1282 charptr_to_str(r.vals[i]) if r.vals[i] else None)
1283 for i in range(r.nkeys))
1284
1285 def __len__(self):
1286 cdef bcf_hrec_t *r = self.ptr
1287 return r.nkeys if r else 0
1288
1289 def __bool__(self):
1290 cdef bcf_hrec_t *r = self.ptr
1291 return r != NULL and r.nkeys != 0
1292
1293 def __getitem__(self, key):
1294 """get attribute value"""
1295 cdef bcf_hrec_t *r = self.ptr
1296 cdef int i
1297 if r:
1298 bkey = force_bytes(key)
1299 for i in range(r.nkeys):
1300 if r.keys[i] and r.keys[i] == bkey:
1301 return charptr_to_str(r.vals[i]) if r.vals[i] else None
1302 raise KeyError('cannot find metadata key')
1303
1304 def __iter__(self):
1305 cdef bcf_hrec_t *r = self.ptr
1306 if not r:
1307 return
1308 cdef int i
1309 for i in range(r.nkeys):
1310 if r.keys[i]:
1311 yield bcf_str_cache_get_charptr(r.keys[i])
1312
1313 def get(self, key, default=None):
1314 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
1315 try:
1316 return self[key]
1317 except KeyError:
1318 return default
1319
1320 def __contains__(self, key):
1321 try:
1322 self[key]
1323 except KeyError:
1324 return False
1325 else:
1326 return True
1327
1328 def iterkeys(self):
1329 """D.iterkeys() -> an iterator over the keys of D"""
1330 return iter(self)
1331
1332 def itervalues(self):
1333 """D.itervalues() -> an iterator over the values of D"""
1334 cdef bcf_hrec_t *r = self.ptr
1335 if not r:
1336 return
1337 cdef int i
1338 for i in range(r.nkeys):
1339 if r.keys[i]:
1340 yield charptr_to_str(r.vals[i]) if r.vals[i] else None
1341
1342 def iteritems(self):
1343 """D.iteritems() -> an iterator over the (key, value) items of D"""
1344 cdef bcf_hrec_t *r = self.ptr
1345 if not r:
1346 return
1347 cdef int i
1348 for i in range(r.nkeys):
1349 if r.keys[i]:
1350 yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
1351
1352 def keys(self):
1353 """D.keys() -> list of D's keys"""
1354 return list(self)
1355
1356 def items(self):
1357 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
1358 return list(self.iteritems())
1359
1360 def values(self):
1361 """D.values() -> list of D's values"""
1362 return list(self.itervalues())
1363
1364 def update(self, items=None, **kwargs):
1365 """D.update([E, ]**F) -> None.
1366
1367 Update D from dict/iterable E and F.
1368 """
1369 for k, v in items.items():
1370 self[k] = v
1371
1372 if kwargs:
1373 for k, v in kwargs.items():
1374 self[k] = v
1375
1376 def pop(self, key, default=_nothing):
1377 try:
1378 value = self[key]
1379 del self[key]
1380 return value
1381 except KeyError:
1382 if default is not _nothing:
1383 return default
1384 raise
1385
1386 # Mappings are not hashable by default, but subclasses can change this
1387 __hash__ = None
1388
1389 #TODO: implement __richcmp__
1390
1391 def __str__(self):
1392 cdef bcf_hrec_t *r = self.ptr
1393
1394 if not r:
1395 raise ValueError('cannot convert deleted record to str')
1396
1397 cdef kstring_t hrec_str
1398 hrec_str.l = hrec_str.m = 0
1399 hrec_str.s = NULL
1400
1401 bcf_hrec_format(r, &hrec_str)
1402
1403 ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)
1404
1405 if hrec_str.m:
1406 free(hrec_str.s)
1407
1408 return ret
1409
1410 # FIXME: Not safe -- causes trivial segfaults at the moment
1411 def remove(self):
1412 cdef bcf_hdr_t *hdr = self.header.ptr
1413 cdef bcf_hrec_t *r = self.ptr
1414 if not r:
1415 return
1416 assert r.key
1417 cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
1418 bcf_hdr_remove(hdr, r.type, key)
1419 self.ptr = NULL
1420
1421
1422 cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
1423 if not header:
1424 raise ValueError('invalid VariantHeader')
1425
1426 if not hdr:
1427 return None
1428
1429 cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
1430 record.header = header
1431 record.ptr = hdr
1432
1433 return record
1434
1435
1436 cdef class VariantHeaderRecords(object):
1437 """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
1438 def __init__(self, *args, **kwargs):
1439 raise TypeError('this class cannot be instantiated from Python')
1440
1441 def __len__(self):
1442 return self.header.ptr.nhrec
1443
1444 def __bool__(self):
1445 return self.header.ptr.nhrec != 0
1446
1447 def __getitem__(self, index):
1448 cdef int32_t i = index
1449 if i < 0 or i >= self.header.ptr.nhrec:
1450 raise IndexError('invalid header record index')
1451 return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
1452
1453 def __iter__(self):
1454 cdef int32_t i
1455 for i in range(self.header.ptr.nhrec):
1456 yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
1457
1458 __hash__ = None
1459
1460
1461 cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
1462 if not header:
1463 raise ValueError('invalid VariantHeader')
1464
1465 cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
1466 records.header = header
1467 return records
1468
1469
1470 cdef class VariantMetadata(object):
1471 """filter, info or format metadata record from a :class:`VariantHeader` object"""
1472 def __init__(self, *args, **kwargs):
1473 raise TypeError('this class cannot be instantiated from Python')
1474
1475 @property
1476 def name(self):
1477 """metadata name"""
1478 cdef bcf_hdr_t *hdr = self.header.ptr
1479 return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
1480
1481 # Q: Should this be exposed?
1482 @property
1483 def id(self):
1484 """metadata internal header id number"""
1485 return self.id
1486
1487 @property
1488 def number(self):
1489 """metadata number (i.e. cardinality)"""
1490 cdef bcf_hdr_t *hdr = self.header.ptr
1491
1492 if not check_header_id(hdr, self.type, self.id):
1493 raise ValueError('Invalid header id')
1494
1495 if self.type == BCF_HL_FLT:
1496 return None
1497
1498 cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
1499 if l == BCF_VL_FIXED:
1500 return bcf_hdr_id2number(hdr, self.type, self.id)
1501 elif l == BCF_VL_VAR:
1502 return '.'
1503 else:
1504 return METADATA_LENGTHS[l]
1505
1506 @property
1507 def type(self):
1508 """metadata value type"""
1509 cdef bcf_hdr_t *hdr = self.header.ptr
1510 if not check_header_id(hdr, self.type, self.id):
1511 raise ValueError('Invalid header id')
1512
1513 if self.type == BCF_HL_FLT:
1514 return None
1515 return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
1516
1517 @property
1518 def description(self):
1519 """metadata description (or None if not set)"""
1520 descr = self.record.get('Description')
1521 if descr:
1522 descr = descr.strip('"')
1523 return force_str(descr)
1524
1525 @property
1526 def record(self):
1527 """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
1528 cdef bcf_hdr_t *hdr = self.header.ptr
1529 if not check_header_id(hdr, self.type, self.id):
1530 raise ValueError('Invalid header id')
1531 cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
1532 if not hrec:
1533 return None
1534 return makeVariantHeaderRecord(self.header, hrec)
1535
1536 def remove_header(self):
1537 cdef bcf_hdr_t *hdr = self.header.ptr
1538 cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
1539 bcf_hdr_remove(hdr, self.type, key)
1540
1541
1542 cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
1543 if not header:
1544 raise ValueError('invalid VariantHeader')
1545
1546 if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
1547 raise ValueError('invalid metadata type')
1548
1549 if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
1550 raise ValueError('invalid metadata id')
1551
1552 cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
1553 meta.header = header
1554 meta.type = type
1555 meta.id = id
1556
1557 return meta
1558
1559
1560 cdef class VariantHeaderMetadata(object):
1561 """mapping from filter, info or format name to :class:`VariantMetadata` object"""
1562 def __init__(self, *args, **kwargs):
1563 raise TypeError('this class cannot be instantiated from Python')
1564
1565 def add(self, id, number, type, description, **kwargs):
1566 """Add a new filter, info or format record"""
1567 if id in self:
1568 raise ValueError('Header already exists for id={}'.format(id))
1569
1570 if self.type == BCF_HL_FLT:
1571 if number is not None:
1572 raise ValueError('Number must be None when adding a filter')
1573 if type is not None:
1574 raise ValueError('Type must be None when adding a filter')
1575
1576 items = [('ID', unquoted_str(id)), ('Description', description)]
1577 else:
1578 if type not in VALUE_TYPES:
1579 raise ValueError('unknown type specified: {}'.format(type))
1580 if number is None:
1581 number = '.'
1582
1583 items = [('ID', unquoted_str(id)),
1584 ('Number', unquoted_str(number)),
1585 ('Type', unquoted_str(type)),
1586 ('Description', description)]
1587
1588 items += kwargs.items()
1589 self.header.add_meta(METADATA_TYPES[self.type], items=items)
1590
1591 def __len__(self):
1592 cdef bcf_hdr_t *hdr = self.header.ptr
1593 cdef bcf_idpair_t *idpair
1594 cdef int32_t i, n = 0
1595
1596 for i in range(hdr.n[BCF_DT_ID]):
1597 idpair = hdr.id[BCF_DT_ID] + i
1598 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
1599 n += 1
1600 return n
1601
1602 def __bool__(self):
1603 cdef bcf_hdr_t *hdr = self.header.ptr
1604 cdef bcf_idpair_t *idpair
1605 cdef int32_t i
1606
1607 for i in range(hdr.n[BCF_DT_ID]):
1608 idpair = hdr.id[BCF_DT_ID] + i
1609 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
1610 return True
1611 return False
1612
1613 def __getitem__(self, key):
1614 cdef bcf_hdr_t *hdr = self.header.ptr
1615 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
1616
1617 cdef bytes bkey = force_bytes(key)
1618 cdef khiter_t k = kh_get_vdict(d, bkey)
1619
1620 if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
1621 raise KeyError('invalid key: {}'.format(key))
1622
1623 return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
1624
1625 def remove_header(self, key):
1626 cdef bcf_hdr_t *hdr = self.header.ptr
1627 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
1628
1629 cdef bytes bkey = force_bytes(key)
1630 cdef khiter_t k = kh_get_vdict(d, bkey)
1631
1632 if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
1633 raise KeyError('invalid key: {}'.format(key))
1634
1635 bcf_hdr_remove(hdr, self.type, bkey)
1636 #bcf_hdr_sync(hdr)
1637
1638 def clear_header(self):
1639 cdef bcf_hdr_t *hdr = self.header.ptr
1640 bcf_hdr_remove(hdr, self.type, NULL)
1641 #bcf_hdr_sync(hdr)
1642
1643 def __iter__(self):
1644 cdef bcf_hdr_t *hdr = self.header.ptr
1645 cdef bcf_idpair_t *idpair
1646 cdef int32_t i
1647
1648 for i in range(hdr.n[BCF_DT_ID]):
1649 idpair = hdr.id[BCF_DT_ID] + i
1650 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
1651 yield bcf_str_cache_get_charptr(idpair.key)
1652
1653 def get(self, key, default=None):
1654 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
1655 try:
1656 return self[key]
1657 except KeyError:
1658 return default
1659
1660 def __contains__(self, key):
1661 try:
1662 self[key]
1663 except KeyError:
1664 return False
1665 else:
1666 return True
1667
1668 def iterkeys(self):
1669 """D.iterkeys() -> an iterator over the keys of D"""
1670 return iter(self)
1671
1672 def itervalues(self):
1673 """D.itervalues() -> an iterator over the values of D"""
1674 for key in self:
1675 yield self[key]
1676
1677 def iteritems(self):
1678 """D.iteritems() -> an iterator over the (key, value) items of D"""
1679 for key in self:
1680 yield (key, self[key])
1681
1682 def keys(self):
1683 """D.keys() -> list of D's keys"""
1684 return list(self)
1685
1686 def items(self):
1687 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
1688 return list(self.iteritems())
1689
1690 def values(self):
1691 """D.values() -> list of D's values"""
1692 return list(self.itervalues())
1693
1694 # Mappings are not hashable by default, but subclasses can change this
1695 __hash__ = None
1696
1697 #TODO: implement __richcmp__
1698
1699
1700 cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
1701 if not header:
1702 raise ValueError('invalid VariantHeader')
1703
1704 cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
1705 meta.header = header
1706 meta.type = type
1707
1708 return meta
1709
1710
1711 cdef class VariantContig(object):
1712 """contig metadata from a :class:`VariantHeader`"""
1713 def __init__(self, *args, **kwargs):
1714 raise TypeError('this class cannot be instantiated from Python')
1715
1716 @property
1717 def name(self):
1718 """contig name"""
1719 cdef bcf_hdr_t *hdr = self.header.ptr
1720 return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
1721
1722 @property
1723 def id(self):
1724 """contig internal id number"""
1725 return self.id
1726
1727 @property
1728 def length(self):
1729 """contig length or None if not available"""
1730 cdef bcf_hdr_t *hdr = self.header.ptr
1731 cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
1732 return length if length else None
1733
1734 @property
1735 def header_record(self):
1736 """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
1737 cdef bcf_hdr_t *hdr = self.header.ptr
1738 cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
1739 return makeVariantHeaderRecord(self.header, hrec)
1740
1741 def remove_header(self):
1742 cdef bcf_hdr_t *hdr = self.header.ptr
1743 cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key
1744 bcf_hdr_remove(hdr, BCF_HL_CTG, key)
1745
1746
1747 cdef VariantContig makeVariantContig(VariantHeader header, int id):
1748 if not header:
1749 raise ValueError('invalid VariantHeader')
1750
1751 if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
1752 raise ValueError('invalid contig id')
1753
1754 cdef VariantContig contig = VariantContig.__new__(VariantContig)
1755 contig.header = header
1756 contig.id = id
1757
1758 return contig
1759
1760
1761 cdef class VariantHeaderContigs(object):
1762 """mapping from contig name or index to :class:`VariantContig` object."""
1763 def __init__(self, *args, **kwargs):
1764 raise TypeError('this class cannot be instantiated from Python')
1765
1766 def __len__(self):
1767 cdef bcf_hdr_t *hdr = self.header.ptr
1768 assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
1769 return hdr.n[BCF_DT_CTG]
1770
1771 def __bool__(self):
1772 cdef bcf_hdr_t *hdr = self.header.ptr
1773 assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
1774 return hdr.n[BCF_DT_CTG] != 0
1775
1776 def __getitem__(self, key):
1777 cdef bcf_hdr_t *hdr = self.header.ptr
1778 cdef int index
1779
1780 if isinstance(key, int):
1781 index = key
1782 if index < 0 or index >= hdr.n[BCF_DT_CTG]:
1783 raise IndexError('invalid contig index')
1784 return makeVariantContig(self.header, index)
1785
1786 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
1787 cdef bytes bkey = force_bytes(key)
1788 cdef khiter_t k = kh_get_vdict(d, bkey)
1789
1790 if k == kh_end(d):
1791 raise KeyError('invalid contig: {}'.format(key))
1792
1793 cdef int id = kh_val_vdict(d, k).id
1794
1795 return makeVariantContig(self.header, id)
1796
1797 def remove_header(self, key):
1798 cdef bcf_hdr_t *hdr = self.header.ptr
1799 cdef int index
1800 cdef const char *ckey
1801 cdef vdict_t *d
1802 cdef khiter_t k
1803
1804 if isinstance(key, int):
1805 index = key
1806 if index < 0 or index >= hdr.n[BCF_DT_CTG]:
1807 raise IndexError('invalid contig index')
1808 ckey = hdr.id[BCF_DT_CTG][self.id].key
1809 else:
1810 d = <vdict_t *>hdr.dict[BCF_DT_CTG]
1811 key = force_bytes(key)
1812 if kh_get_vdict(d, key) == kh_end(d):
1813 raise KeyError('invalid contig: {}'.format(key))
1814 ckey = key
1815
1816 bcf_hdr_remove(hdr, BCF_HL_CTG, ckey)
1817
1818 def clear_header(self):
1819 cdef bcf_hdr_t *hdr = self.header.ptr
1820 bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
1821 #bcf_hdr_sync(hdr)
1822
1823 def __iter__(self):
1824 cdef bcf_hdr_t *hdr = self.header.ptr
1825 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
1826 cdef uint32_t n = kh_size(d)
1827
1828 assert n == hdr.n[BCF_DT_CTG]
1829
1830 for i in range(n):
1831 yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
1832
1833 def get(self, key, default=None):
1834 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
1835 try:
1836 return self[key]
1837 except KeyError:
1838 return default
1839
1840 def __contains__(self, key):
1841 try:
1842 self[key]
1843 except KeyError:
1844 return False
1845 else:
1846 return True
1847
1848 def iterkeys(self):
1849 """D.iterkeys() -> an iterator over the keys of D"""
1850 return iter(self)
1851
1852 def itervalues(self):
1853 """D.itervalues() -> an iterator over the values of D"""
1854 for key in self:
1855 yield self[key]
1856
1857 def iteritems(self):
1858 """D.iteritems() -> an iterator over the (key, value) items of D"""
1859 for key in self:
1860 yield (key, self[key])
1861
1862 def keys(self):
1863 """D.keys() -> list of D's keys"""
1864 return list(self)
1865
1866 def items(self):
1867 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
1868 return list(self.iteritems())
1869
1870 def values(self):
1871 """D.values() -> list of D's values"""
1872 return list(self.itervalues())
1873
1874 # Mappings are not hashable by default, but subclasses can change this
1875 __hash__ = None
1876
1877 #TODO: implement __richcmp__
1878
1879 def add(self, id, length=None, **kwargs):
1880 """Add a new contig record"""
1881 if id in self:
1882 raise ValueError('Header already exists for contig {}'.format(id))
1883
1884 items = [('ID', unquoted_str(id))]
1885 if length is not None:
1886 items.append(("length", unquoted_str(length)))
1887 items += kwargs.items()
1888 self.header.add_meta('contig', items=items)
1889
1890
1891 cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
1892 if not header:
1893 raise ValueError('invalid VariantHeader')
1894
1895 cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
1896 contigs.header = header
1897
1898 return contigs
1899
1900
1901 cdef class VariantHeaderSamples(object):
1902 """sequence of sample names from a :class:`VariantHeader` object"""
1903 def __init__(self, *args, **kwargs):
1904 raise TypeError('this class cannot be instantiated from Python')
1905
1906 def __len__(self):
1907 return bcf_hdr_nsamples(self.header.ptr)
1908
1909 def __bool__(self):
1910 return bcf_hdr_nsamples(self.header.ptr) != 0
1911
1912 def __getitem__(self, index):
1913 cdef bcf_hdr_t *hdr = self.header.ptr
1914 cdef int32_t n = bcf_hdr_nsamples(hdr)
1915 cdef int32_t i = index
1916
1917 if i < 0 or i >= n:
1918 raise IndexError('invalid sample index')
1919
1920 return charptr_to_str(hdr.samples[i])
1921
1922 def __iter__(self):
1923 cdef bcf_hdr_t *hdr = self.header.ptr
1924 cdef int32_t i, n = bcf_hdr_nsamples(hdr)
1925
1926 for i in range(n):
1927 yield charptr_to_str(hdr.samples[i])
1928
1929 def __contains__(self, key):
1930 cdef bcf_hdr_t *hdr = self.header.ptr
1931 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
1932 cdef bytes bkey = force_bytes(key)
1933 cdef khiter_t k = kh_get_vdict(d, bkey)
1934
1935 return k != kh_end(d)
1936
1937 # Mappings are not hashable by default, but subclasses can change this
1938 __hash__ = None
1939
1940 #TODO: implement __richcmp__
1941
1942 def add(self, name):
1943 """Add a new sample"""
1944 self.header.add_sample(name)
1945
1946
1947 cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
1948 if not header:
1949 raise ValueError('invalid VariantHeader')
1950
1951 cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
1952 samples.header = header
1953
1954 return samples
1955
1956
1957 cdef class VariantHeader(object):
1958 """header information for a :class:`VariantFile` object"""
1959 #FIXME: Add structured proxy
1960 #FIXME: Add generic proxy
1961 #FIXME: Add mutable methods
1962
1963 # See makeVariantHeader for C constructor
1964 def __cinit__(self):
1965 self.ptr = NULL
1966
1967 # Python constructor
1968 def __init__(self):
1969 self.ptr = bcf_hdr_init(b'w')
1970 if not self.ptr:
1971 raise ValueError('cannot create VariantHeader')
1972
1973 def __dealloc__(self):
1974 if self.ptr:
1975 bcf_hdr_destroy(self.ptr)
1976 self.ptr = NULL
1977
1978 def __bool__(self):
1979 return self.ptr != NULL
1980
1981 def copy(self):
1982 return makeVariantHeader(bcf_hdr_dup(self.ptr))
1983
1984 def merge(self, VariantHeader header):
1985 if header is None:
1986 raise ValueError('header must not be None')
1987 bcf_hdr_merge(self.ptr, header.ptr)
1988
1989 @property
1990 def version(self):
1991 """VCF version"""
1992 return force_str(bcf_hdr_get_version(self.ptr))
1993
1994 @property
1995 def samples(self):
1996 """samples (:class:`VariantHeaderSamples`)"""
1997 return makeVariantHeaderSamples(self)
1998
1999 @property
2000 def records(self):
2001 """header records (:class:`VariantHeaderRecords`)"""
2002 return makeVariantHeaderRecords(self)
2003
2004 @property
2005 def contigs(self):
2006 """contig information (:class:`VariantHeaderContigs`)"""
2007 return makeVariantHeaderContigs(self)
2008
2009 @property
2010 def filters(self):
2011 """filter metadata (:class:`VariantHeaderMetadata`)"""
2012 return makeVariantHeaderMetadata(self, BCF_HL_FLT)
2013
2014 @property
2015 def info(self):
2016 """info metadata (:class:`VariantHeaderMetadata`)"""
2017 return makeVariantHeaderMetadata(self, BCF_HL_INFO)
2018
2019 @property
2020 def formats(self):
2021 """format metadata (:class:`VariantHeaderMetadata`)"""
2022 return makeVariantHeaderMetadata(self, BCF_HL_FMT)
2023
2024 @property
2025 def alts(self):
2026 """alt metadata (:class:`dict` ID->record).
2027
2028 The data returned just a snapshot of alt records, is created
2029 every time the property is requested, and modifications will
2030 not be reflected in the header metadata and vice versa.
2031
2032 i.e. it is just a dict that reflects the state of alt records
2033 at the time it is created.
2034 """
2035 return {record['ID']:record for record in self.records
2036 if record.key.upper() == 'ALT' }
2037
2038 # only safe to do when opening an htsfile
2039 cdef _subset_samples(self, include_samples):
2040 keep_samples = set(self.samples)
2041 include_samples = set(include_samples)
2042 missing_samples = include_samples - keep_samples
2043 keep_samples &= include_samples
2044
2045 if missing_samples:
2046 # FIXME: add specialized exception with payload
2047 raise ValueError(
2048 'missing {:d} requested samples'.format(
2049 len(missing_samples)))
2050
2051 keep_samples = force_bytes(','.join(keep_samples))
2052 cdef char *keep = <char *>keep_samples if keep_samples else NULL
2053 cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
2054
2055 if ret != 0:
2056 raise ValueError(
2057 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
2058
2059 def __str__(self):
2060 cdef int hlen
2061 cdef kstring_t line
2062 line.l = line.m = 0
2063 line.s = NULL
2064
2065 if bcf_hdr_format(self.ptr, 0, &line) < 0:
2066 if line.m:
2067 free(line.s)
2068 raise ValueError('bcf_hdr_format failed')
2069
2070 ret = charptr_to_str_w_len(line.s, line.l)
2071
2072 if line.m:
2073 free(line.s)
2074 return ret
2075
2076 def new_record(self, contig=None, start=0, stop=0, alleles=None,
2077 id=None, qual=None, filter=None, info=None, samples=None,
2078 **kwargs):
2079 """Create a new empty VariantRecord.
2080
2081 Arguments are currently experimental. Use with caution and expect
2082 changes in upcoming releases.
2083
2084 """
2085 rec = makeVariantRecord(self, bcf_init())
2086
2087 if not rec:
2088 raise MemoryError('unable to allocate BCF record')
2089
2090 rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
2091
2092 if contig is not None:
2093 rec.contig = contig
2094
2095 rec.start = start
2096 rec.stop = stop
2097 rec.id = id
2098 rec.qual = qual
2099
2100 if alleles is not None:
2101 rec.alleles = alleles
2102
2103 if filter is not None:
2104 if isinstance(filter, (list, tuple, VariantRecordFilter)):
2105 for f in filter:
2106 rec.filter.add(f)
2107 else:
2108 rec.filter.add(filter)
2109
2110 if info:
2111 rec.info.update(info)
2112
2113 if kwargs:
2114 rec.samples[0].update(kwargs)
2115
2116 if samples:
2117 for i, sample in enumerate(samples):
2118 rec.samples[i].update(sample)
2119
2120 return rec
2121
2122 def add_record(self, VariantHeaderRecord record):
2123 """Add an existing :class:`VariantHeaderRecord` to this header"""
2124 if record is None:
2125 raise ValueError('record must not be None')
2126
2127 cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr)
2128
2129 bcf_hdr_add_hrec(self.ptr, hrec)
2130
2131 self._hdr_sync()
2132
2133 def add_line(self, line):
2134 """Add a metadata line to this header"""
2135 bline = force_bytes(line)
2136 if bcf_hdr_append(self.ptr, bline) < 0:
2137 raise ValueError('invalid header line')
2138
2139 self._hdr_sync()
2140
2141
2142 def add_meta(self, key, value=None, items=None):
2143 """Add metadata to this header"""
2144 if not ((value is not None) ^ (items is not None)):
2145 raise ValueError('either value or items must be specified')
2146
2147 cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
2148 cdef int quoted
2149
2150 try:
2151 key = force_bytes(key)
2152 hrec.key = strdup(key)
2153
2154 if value is not None:
2155 hrec.value = strdup(force_bytes(value))
2156 else:
2157 for key, value in items:
2158 quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type")
2159
2160 key = force_bytes(key)
2161 bcf_hrec_add_key(hrec, key, <int>len(key))
2162
2163 value = force_bytes(str(value))
2164 bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
2165 except:
2166 bcf_hrec_destroy(hrec)
2167 raise
2168
2169 bcf_hdr_add_hrec(self.ptr, hrec)
2170
2171 self._hdr_sync()
2172
2173 cdef _add_sample(self, name):
2174 bname = force_bytes(name)
2175 if bcf_hdr_add_sample(self.ptr, bname) < 0:
2176 raise ValueError('Duplicated sample name: {}'.format(name))
2177
2178 cdef _hdr_sync(self):
2179 cdef bcf_hdr_t *hdr = self.ptr
2180 if hdr.dirty:
2181 if bcf_hdr_sync(hdr) < 0:
2182 raise MemoryError('unable to reallocate VariantHeader')
2183
2184 def add_sample(self, name):
2185 """Add a new sample to this header"""
2186 self._add_sample(name)
2187 self._hdr_sync()
2188
2189 def add_samples(self, *args):
2190 """Add several new samples to this header.
2191 This function takes multiple arguments, each of which may
2192 be either a sample name or an iterable returning sample names
2193 (e.g., a list of sample names).
2194 """
2195 for arg in args:
2196 if isinstance(arg, str):
2197 self._add_sample(arg)
2198 else:
2199 for name in arg:
2200 self._add_sample(name)
2201 self._hdr_sync()
2202
2203
2204 cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
2205 if not hdr:
2206 raise ValueError('cannot create VariantHeader')
2207
2208 cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
2209 header.ptr = hdr
2210
2211 return header
2212
2213
2214 cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2:
2215 cdef vdict_t *d
2216 cdef khiter_t k
2217 cdef int info_id
2218
2219 if isinstance(key, str):
2220 key = force_bytes(key)
2221
2222 d = <vdict_t *>hdr.dict[BCF_DT_ID]
2223 k = kh_get_vdict(d, key)
2224
2225 if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
2226 return -1
2227
2228 return kh_val_vdict(d, k).id
2229
2230
2231 ########################################################################
2232 ########################################################################
2233 ## Variant Record objects
2234 ########################################################################
2235
2236 cdef class VariantRecordFilter(object):
2237 """Filters set on a :class:`VariantRecord` object, presented as a mapping from
2238 filter index or name to :class:`VariantMetadata` object"""
2239 def __init__(self, *args, **kwargs):
2240 raise TypeError('this class cannot be instantiated from Python')
2241
2242 def __len__(self):
2243 return self.record.ptr.d.n_flt
2244
2245 def __bool__(self):
2246 return self.record.ptr.d.n_flt != 0
2247
2248 def __getitem__(self, key):
2249 cdef bcf_hdr_t *hdr = self.record.header.ptr
2250 cdef bcf1_t *r = self.record.ptr
2251 cdef int index, id
2252 cdef int n = r.d.n_flt
2253
2254 if isinstance(key, int):
2255 index = key
2256
2257 if index < 0 or index >= n:
2258 raise IndexError('invalid filter index')
2259
2260 id = r.d.flt[index]
2261 else:
2262 if key == '.':
2263 key = 'PASS'
2264
2265 bkey = force_bytes(key)
2266 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
2267
2268 if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
2269 raise KeyError('Invalid filter: {}'.format(key))
2270
2271 return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
2272
2273 def add(self, key):
2274 """Add a new filter"""
2275 cdef bcf_hdr_t *hdr = self.record.header.ptr
2276 cdef bcf1_t *r = self.record.ptr
2277 cdef int id
2278
2279 if key == '.':
2280 key = 'PASS'
2281
2282 cdef bytes bkey = force_bytes(key)
2283 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
2284
2285 if not check_header_id(hdr, BCF_HL_FLT, id):
2286 raise KeyError('Invalid filter: {}'.format(key))
2287
2288 bcf_add_filter(hdr, r, id)
2289
2290 def __delitem__(self, key):
2291 cdef bcf_hdr_t *hdr = self.record.header.ptr
2292 cdef bcf1_t *r = self.record.ptr
2293 cdef int index, id
2294 cdef int n = r.d.n_flt
2295
2296 if isinstance(key, int):
2297 index = key
2298
2299 if index < 0 or index >= n:
2300 raise IndexError('invalid filter index')
2301
2302 id = r.d.flt[index]
2303 else:
2304 if key == '.':
2305 key = 'PASS'
2306
2307 bkey = force_bytes(key)
2308 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
2309
2310 if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
2311 raise KeyError('Invalid filter: {}'.format(key))
2312
2313 bcf_remove_filter(hdr, r, id, 0)
2314
2315 def clear(self):
2316 """Clear all filters"""
2317 cdef bcf1_t *r = self.record.ptr
2318 r.d.shared_dirty |= BCF1_DIRTY_FLT
2319 r.d.n_flt = 0
2320
2321 def __iter__(self):
2322 cdef bcf_hdr_t *hdr = self.record.header.ptr
2323 cdef bcf1_t *r = self.record.ptr
2324 cdef int i
2325
2326 for i in range(r.d.n_flt):
2327 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
2328
2329 def get(self, key, default=None):
2330 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
2331 try:
2332 return self[key]
2333 except KeyError:
2334 return default
2335
2336 def __contains__(self, key):
2337 cdef bcf_hdr_t *hdr = self.record.header.ptr
2338 cdef bcf1_t *r = self.record.ptr
2339 cdef bytes bkey = force_bytes(key)
2340 return bcf_has_filter(hdr, r, bkey) == 1
2341
2342 def iterkeys(self):
2343 """D.iterkeys() -> an iterator over the keys of D"""
2344 return iter(self)
2345
2346 def itervalues(self):
2347 """D.itervalues() -> an iterator over the values of D"""
2348 for key in self:
2349 yield self[key]
2350
2351 def iteritems(self):
2352 """D.iteritems() -> an iterator over the (key, value) items of D"""
2353 for key in self:
2354 yield (key, self[key])
2355
2356 def keys(self):
2357 """D.keys() -> list of D's keys"""
2358 return list(self)
2359
2360 def items(self):
2361 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
2362 return list(self.iteritems())
2363
2364 def values(self):
2365 """D.values() -> list of D's values"""
2366 return list(self.itervalues())
2367
2368 def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op):
2369 if op != 2 and op != 3:
2370 return NotImplemented
2371
2372 cdef bcf1_t *s = self.record.ptr
2373 cdef bcf1_t *o = other.record.ptr
2374
2375 cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other))
2376
2377 if op == 3:
2378 cmp = not cmp
2379
2380 return cmp
2381
2382 # Mappings are not hashable by default, but subclasses can change this
2383 __hash__ = None
2384
2385 #TODO: implement __richcmp__
2386
2387
2388 cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
2389 if not record:
2390 raise ValueError('invalid VariantRecord')
2391
2392 cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
2393 filter.record = record
2394
2395 return filter
2396
2397
2398 cdef class VariantRecordFormat(object):
2399 """Format data present for each sample in a :class:`VariantRecord` object,
2400 presented as mapping from format name to :class:`VariantMetadata` object."""
2401 def __init__(self, *args, **kwargs):
2402 raise TypeError('this class cannot be instantiated from Python')
2403
2404 def __len__(self):
2405 cdef bcf_hdr_t *hdr = self.record.header.ptr
2406 cdef bcf1_t *r = self.record.ptr
2407 cdef int i, n = 0
2408
2409 for i in range(r.n_fmt):
2410 if r.d.fmt[i].p:
2411 n += 1
2412 return n
2413
2414 def __bool__(self):
2415 cdef bcf_hdr_t *hdr = self.record.header.ptr
2416 cdef bcf1_t *r = self.record.ptr
2417 cdef int i
2418
2419 for i in range(r.n_fmt):
2420 if r.d.fmt[i].p:
2421 return True
2422 return False
2423
2424 def __getitem__(self, key):
2425 cdef bcf_hdr_t *hdr = self.record.header.ptr
2426 cdef bcf1_t *r = self.record.ptr
2427
2428 cdef bytes bkey = force_bytes(key)
2429 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
2430
2431 if not fmt or not fmt.p:
2432 raise KeyError('unknown format: {}'.format(key))
2433
2434 return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
2435
2436 def __delitem__(self, key):
2437 cdef bcf_hdr_t *hdr = self.record.header.ptr
2438 cdef bcf1_t *r = self.record.ptr
2439
2440 cdef bytes bkey = force_bytes(key)
2441 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
2442
2443 if not fmt or not fmt.p:
2444 raise KeyError('unknown format: {}'.format(key))
2445
2446 if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
2447 raise ValueError('Unable to delete FORMAT')
2448
2449 def clear(self):
2450 """Clear all formats for all samples within the associated
2451 :class:`VariantRecord` instance"""
2452 cdef bcf_hdr_t *hdr = self.record.header.ptr
2453 cdef bcf1_t *r = self.record.ptr
2454 cdef bcf_fmt_t *fmt
2455 cdef const char *key
2456 cdef int i
2457
2458 for i in reversed(range(r.n_fmt)):
2459 fmt = &r.d.fmt[i]
2460 if fmt.p:
2461 key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
2462 if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
2463 raise ValueError('Unable to delete FORMAT')
2464
2465 def __iter__(self):
2466 cdef bcf_hdr_t *hdr = self.record.header.ptr
2467 cdef bcf1_t *r = self.record.ptr
2468 cdef bcf_fmt_t *fmt
2469 cdef int i
2470
2471 for i in range(r.n_fmt):
2472 fmt = &r.d.fmt[i]
2473 if fmt.p:
2474 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
2475
2476 def get(self, key, default=None):
2477 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
2478 try:
2479 return self[key]
2480 except KeyError:
2481 return default
2482
2483 def __contains__(self, key):
2484 cdef bcf_hdr_t *hdr = self.record.header.ptr
2485 cdef bcf1_t *r = self.record.ptr
2486 cdef bytes bkey = force_bytes(key)
2487 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
2488 return fmt != NULL and fmt.p != NULL
2489
2490 def iterkeys(self):
2491 """D.iterkeys() -> an iterator over the keys of D"""
2492 return iter(self)
2493
2494 def itervalues(self):
2495 """D.itervalues() -> an iterator over the values of D"""
2496 for key in self:
2497 yield self[key]
2498
2499 def iteritems(self):
2500 """D.iteritems() -> an iterator over the (key, value) items of D"""
2501 for key in self:
2502 yield (key, self[key])
2503
2504 def keys(self):
2505 """D.keys() -> list of D's keys"""
2506 return list(self)
2507
2508 def items(self):
2509 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
2510 return list(self.iteritems())
2511
2512 def values(self):
2513 """D.values() -> list of D's values"""
2514 return list(self.itervalues())
2515
2516 # Mappings are not hashable by default, but subclasses can change this
2517 __hash__ = None
2518
2519 #TODO: implement __richcmp__
2520
2521
2522 cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
2523 if not record:
2524 raise ValueError('invalid VariantRecord')
2525
2526 cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
2527 format.record = record
2528
2529 return format
2530
2531
2532 #TODO: Add a getmeta method to return the corresponding VariantMetadata?
2533 cdef class VariantRecordInfo(object):
2534 """Info data stored in a :class:`VariantRecord` object, presented as a
2535 mapping from info metadata name to value."""
2536
2537 def __init__(self, *args, **kwargs):
2538 raise TypeError('this class cannot be instantiated from Python')
2539
2540 def __len__(self):
2541 cdef bcf_hdr_t *hdr = self.record.header.ptr
2542 cdef bcf1_t *r = self.record.ptr
2543 cdef bcf_info_t *info
2544 cdef const char *key
2545 cdef int i, count = 0
2546
2547 if bcf_unpack(r, BCF_UN_INFO) < 0:
2548 raise ValueError('Error unpacking VariantRecord')
2549
2550 for i in range(r.n_info):
2551 info = &r.d.info[i]
2552 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
2553 if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
2554 count += 1
2555
2556 return count
2557
2558 def __bool__(self):
2559 cdef bcf_hdr_t *hdr = self.record.header.ptr
2560 cdef bcf1_t *r = self.record.ptr
2561 cdef bcf_info_t *info
2562 cdef const char *key
2563 cdef int i
2564
2565 if bcf_unpack(r, BCF_UN_INFO) < 0:
2566 raise ValueError('Error unpacking VariantRecord')
2567
2568 for i in range(r.n_info):
2569 info = &r.d.info[i]
2570 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
2571 if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
2572 return True
2573
2574 return False
2575
2576 def __getitem__(self, key):
2577 cdef bcf_hdr_t *hdr = self.record.header.ptr
2578 cdef bcf1_t *r = self.record.ptr
2579
2580 if bcf_unpack(r, BCF_UN_INFO) < 0:
2581 raise ValueError('Error unpacking VariantRecord')
2582
2583 cdef bytes bkey = force_bytes(key)
2584
2585 if strcmp(bkey, b'END') == 0:
2586 raise KeyError('END is a reserved attribute; access is via record.stop')
2587
2588 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
2589
2590 # Cannot stop here if info == NULL, since flags must return False
2591 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
2592
2593 if info_id < 0:
2594 raise KeyError('Unknown INFO field: {}'.format(key))
2595
2596 if not check_header_id(hdr, BCF_HL_INFO, info_id):
2597 raise ValueError('Invalid header')
2598
2599 # Handle type=Flag values
2600 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
2601 return info != NULL and info.vptr != NULL
2602
2603 if not info or not info.vptr:
2604 raise KeyError('Invalid INFO field: {}'.format(key))
2605
2606 return bcf_info_get_value(self.record, info)
2607
2608 def __setitem__(self, key, value):
2609 cdef bytes bkey = force_bytes(key)
2610
2611 if strcmp(bkey, b'END') == 0:
2612 raise KeyError('END is a reserved attribute; access is via record.stop')
2613
2614 if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0:
2615 raise ValueError('Error unpacking VariantRecord')
2616
2617 bcf_info_set_value(self.record, key, value)
2618
2619 def __delitem__(self, key):
2620 cdef bcf_hdr_t *hdr = self.record.header.ptr
2621 cdef bcf1_t *r = self.record.ptr
2622
2623 cdef bytes bkey = force_bytes(key)
2624 if strcmp(bkey, b'END') == 0:
2625 raise KeyError('END is a reserved attribute; access is via record.stop')
2626
2627 if bcf_unpack(r, BCF_UN_INFO) < 0:
2628 raise ValueError('Error unpacking VariantRecord')
2629
2630 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
2631
2632 # Cannot stop here if info == NULL, since flags must return False
2633 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
2634
2635 if info_id < 0:
2636 raise KeyError('Unknown INFO field: {}'.format(key))
2637
2638 if not check_header_id(hdr, BCF_HL_INFO, info_id):
2639 raise ValueError('Invalid header')
2640
2641 # Handle flags
2642 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
2643 return
2644
2645 if not info or not info.vptr:
2646 raise KeyError('Unknown INFO field: {}'.format(key))
2647
2648 if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
2649 raise ValueError('Unable to delete INFO')
2650
2651 def clear(self):
2652 """Clear all info data"""
2653 cdef bcf_hdr_t *hdr = self.record.header.ptr
2654 cdef bcf1_t *r = self.record.ptr
2655 cdef bcf_info_t *info
2656 cdef const char *key
2657 cdef int i
2658
2659 if bcf_unpack(r, BCF_UN_INFO) < 0:
2660 raise ValueError('Error unpacking VariantRecord')
2661
2662 for i in range(r.n_info):
2663 info = &r.d.info[i]
2664 if info and info.vptr:
2665 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
2666 if strcmp(key, b'END') == 0:
2667 continue
2668 if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
2669 raise ValueError('Unable to delete INFO')
2670
2671 def __iter__(self):
2672 cdef bcf_hdr_t *hdr = self.record.header.ptr
2673 cdef bcf1_t *r = self.record.ptr
2674 cdef bcf_info_t *info
2675 cdef const char *key
2676 cdef int i
2677
2678 if bcf_unpack(r, BCF_UN_INFO) < 0:
2679 raise ValueError('Error unpacking VariantRecord')
2680
2681 for i in range(r.n_info):
2682 info = &r.d.info[i]
2683 if info and info.vptr:
2684 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
2685 if strcmp(key, b'END') != 0:
2686 yield bcf_str_cache_get_charptr(key)
2687
2688 def get(self, key, default=None):
2689 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
2690 cdef bcf_hdr_t *hdr = self.record.header.ptr
2691 cdef bcf1_t *r = self.record.ptr
2692
2693 if bcf_unpack(r, BCF_UN_INFO) < 0:
2694 raise ValueError('Error unpacking VariantRecord')
2695
2696 cdef bytes bkey = force_bytes(key)
2697
2698 if strcmp(bkey, b'END') == 0:
2699 return default
2700
2701 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
2702
2703 # Cannot stop here if info == NULL, since flags must return False
2704 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
2705
2706 if not check_header_id(hdr, BCF_HL_INFO, info_id):
2707 raise ValueError('Invalid header')
2708
2709 # Handle flags
2710 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
2711 return info != NULL and info.vptr != NULL
2712
2713 if not info or not info.vptr:
2714 return default
2715
2716 return bcf_info_get_value(self.record, info)
2717
2718 def __contains__(self, key):
2719 cdef bcf_hdr_t *hdr = self.record.header.ptr
2720 cdef bcf1_t *r = self.record.ptr
2721
2722 if bcf_unpack(r, BCF_UN_INFO) < 0:
2723 raise ValueError('Error unpacking VariantRecord')
2724
2725 cdef bytes bkey = force_bytes(key)
2726
2727 if strcmp(bkey, b'END') == 0:
2728 return False
2729
2730 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
2731
2732 return info != NULL and info.vptr != NULL
2733
2734 def iterkeys(self):
2735 """D.iterkeys() -> an iterator over the keys of D"""
2736 return iter(self)
2737
2738 def itervalues(self):
2739 """D.itervalues() -> an iterator over the values of D"""
2740 cdef bcf_hdr_t *hdr = self.record.header.ptr
2741 cdef bcf1_t *r = self.record.ptr
2742 cdef bcf_info_t *info
2743 cdef const char *key
2744 cdef int i
2745
2746 if bcf_unpack(r, BCF_UN_INFO) < 0:
2747 raise ValueError('Error unpacking VariantRecord')
2748
2749 for i in range(r.n_info):
2750 info = &r.d.info[i]
2751 if info and info.vptr:
2752 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
2753 if strcmp(key, b'END') != 0:
2754 yield bcf_info_get_value(self.record, info)
2755
2756 def iteritems(self):
2757 """D.iteritems() -> an iterator over the (key, value) items of D"""
2758 cdef bcf_hdr_t *hdr = self.record.header.ptr
2759 cdef bcf1_t *r = self.record.ptr
2760 cdef bcf_info_t *info
2761 cdef const char *key
2762 cdef int i
2763
2764 if bcf_unpack(r, BCF_UN_INFO) < 0:
2765 raise ValueError('Error unpacking VariantRecord')
2766
2767 for i in range(r.n_info):
2768 info = &r.d.info[i]
2769 if info and info.vptr:
2770 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
2771 if strcmp(key, b'END') != 0:
2772 value = bcf_info_get_value(self.record, info)
2773 yield bcf_str_cache_get_charptr(key), value
2774
2775 def keys(self):
2776 """D.keys() -> list of D's keys"""
2777 return list(self)
2778
2779 def items(self):
2780 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
2781 return list(self.iteritems())
2782
2783 def values(self):
2784 """D.values() -> list of D's values"""
2785 return list(self.itervalues())
2786
2787 def update(self, items=None, **kwargs):
2788 """D.update([E, ]**F) -> None.
2789
2790 Update D from dict/iterable E and F.
2791 """
2792 for k, v in items.items():
2793 if k != 'END':
2794 self[k] = v
2795
2796 if kwargs:
2797 kwargs.pop('END', None)
2798 for k, v in kwargs.items():
2799 self[k] = v
2800
2801 def pop(self, key, default=_nothing):
2802 cdef bcf_hdr_t *hdr = self.record.header.ptr
2803 cdef bcf1_t *r = self.record.ptr
2804
2805 if bcf_unpack(r, BCF_UN_INFO) < 0:
2806 raise ValueError('Error unpacking VariantRecord')
2807
2808 cdef bytes bkey = force_bytes(key)
2809 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
2810
2811 # Cannot stop here if info == NULL, since flags must return False
2812 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
2813
2814 if info_id < 0:
2815 if default is _nothing:
2816 raise KeyError('Unknown INFO field: {}'.format(key))
2817 return default
2818
2819 if not check_header_id(hdr, BCF_HL_INFO, info_id):
2820 raise ValueError('Invalid header')
2821
2822 # Handle flags
2823 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
2824 return
2825
2826 if not info or not info.vptr:
2827 if default is _nothing:
2828 raise KeyError('Unknown INFO field: {}'.format(key))
2829 return default
2830
2831 value = bcf_info_get_value(self.record, info)
2832
2833 if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
2834 raise ValueError('Unable to delete INFO')
2835
2836 return value
2837
2838 def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op):
2839 if op != 2 and op != 3:
2840 return NotImplemented
2841
2842 cdef bcf1_t *s = self.record.ptr
2843 cdef bcf1_t *o = other.record.ptr
2844
2845 # Cannot use n_info as shortcut logic, since null values may remain
2846 cdef bint cmp = dict(self) == dict(other)
2847
2848 if op == 3:
2849 cmp = not cmp
2850
2851 return cmp
2852
2853 # Mappings are not hashable by default, but subclasses can change this
2854 __hash__ = None
2855
2856
2857 cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
2858 if not record:
2859 raise ValueError('invalid VariantRecord')
2860
2861 cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
2862 info.record = record
2863
2864 return info
2865
2866
2867 cdef class VariantRecordSamples(object):
2868 """mapping from sample index or name to :class:`VariantRecordSample` object."""
2869 def __init__(self, *args, **kwargs):
2870 raise TypeError('this class cannot be instantiated from Python')
2871
2872 def __len__(self):
2873 return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr)
2874
2875 def __bool__(self):
2876 return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0
2877
2878 def __getitem__(self, key):
2879 cdef bcf_hdr_t *hdr = self.record.header.ptr
2880 cdef bcf1_t *r = self.record.ptr
2881 cdef int n = self.record.ptr.n_sample
2882 cdef int sample_index
2883 cdef vdict_t *d
2884 cdef khiter_t k
2885
2886 if isinstance(key, int):
2887 sample_index = key
2888 else:
2889 bkey = force_bytes(key)
2890 sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
2891 if sample_index < 0:
2892 raise KeyError('invalid sample name: {}'.format(key))
2893
2894 if sample_index < 0 or sample_index >= n:
2895 raise IndexError('invalid sample index')
2896
2897 return makeVariantRecordSample(self.record, sample_index)
2898
2899 def __iter__(self):
2900 cdef bcf_hdr_t *hdr = self.record.header.ptr
2901 cdef bcf1_t *r = self.record.ptr
2902 cdef int32_t i, n = self.record.ptr.n_sample
2903
2904 for i in range(n):
2905 yield charptr_to_str(hdr.samples[i])
2906
2907 def get(self, key, default=None):
2908 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
2909 try:
2910 return self[key]
2911 except KeyError:
2912 return default
2913
2914 def __contains__(self, key):
2915 cdef bcf_hdr_t *hdr = self.record.header.ptr
2916 cdef bcf1_t *r = self.record.ptr
2917 cdef int n = self.record.ptr.n_sample
2918 cdef int sample_index
2919 cdef vdict_t *d
2920 cdef khiter_t k
2921
2922 if isinstance(key, int):
2923 sample_index = key
2924 else:
2925 bkey = force_bytes(key)
2926 sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
2927 if sample_index < 0:
2928 raise KeyError('invalid sample name: {}'.format(key))
2929
2930 return 0 <= sample_index < n
2931
2932 def iterkeys(self):
2933 """D.iterkeys() -> an iterator over the keys of D"""
2934 return iter(self)
2935
2936 def itervalues(self):
2937 """D.itervalues() -> an iterator over the values of D"""
2938 cdef bcf_hdr_t *hdr = self.record.header.ptr
2939 cdef bcf1_t *r = self.record.ptr
2940 cdef int32_t i, n = self.record.ptr.n_sample
2941
2942 for i in range(n):
2943 yield makeVariantRecordSample(self.record, i)
2944
2945 def iteritems(self):
2946 """D.iteritems() -> an iterator over the (key, value) items of D"""
2947 cdef bcf_hdr_t *hdr = self.record.header.ptr
2948 cdef bcf1_t *r = self.record.ptr
2949 cdef int32_t i, n = self.record.ptr.n_sample
2950
2951 for i in range(n):
2952 yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
2953
2954 def keys(self):
2955 """D.keys() -> list of D's keys"""
2956 return list(self)
2957
2958 def items(self):
2959 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
2960 return list(self.iteritems())
2961
2962 def values(self):
2963 """D.values() -> list of D's values"""
2964 return list(self.itervalues())
2965
2966 def update(self, items=None, **kwargs):
2967 """D.update([E, ]**F) -> None.
2968
2969 Update D from dict/iterable E and F.
2970 """
2971 for k, v in items.items():
2972 self[k] = v
2973
2974 if kwargs:
2975 for k, v in kwargs.items():
2976 self[k] = v
2977
2978 def pop(self, key, default=_nothing):
2979 try:
2980 value = self[key]
2981 del self[key]
2982 return value
2983 except KeyError:
2984 if default is not _nothing:
2985 return default
2986 raise
2987
2988 def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op):
2989 if op != 2 and op != 3:
2990 return NotImplemented
2991
2992 cdef bcf1_t *s = self.record.ptr
2993 cdef bcf1_t *o = other.record.ptr
2994
2995 cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values())
2996
2997 if op == 3:
2998 cmp = not cmp
2999
3000 return cmp
3001
3002 # Mappings are not hashable by default, but subclasses can change this
3003 __hash__ = None
3004
3005
3006 cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
3007 if not record:
3008 raise ValueError('invalid VariantRecord')
3009
3010 cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
3011 VariantRecordSamples)
3012 samples.record = record
3013
3014 return samples
3015
3016
3017 cdef class VariantRecord(object):
3018 """Variant record"""
3019 def __init__(self, *args, **kwargs):
3020 raise TypeError('this class cannot be instantiated from Python')
3021
3022 def __dealloc__(self):
3023 if self.ptr:
3024 bcf_destroy1(self.ptr)
3025 self.ptr = NULL
3026
3027 def copy(self):
3028 """return a copy of this VariantRecord object"""
3029 return makeVariantRecord(self.header, bcf_dup(self.ptr))
3030
3031 def translate(self, VariantHeader dst_header):
3032 if dst_header is None:
3033 raise ValueError('dst_header must not be None')
3034
3035 cdef bcf_hdr_t *src_hdr = self.header.ptr
3036 cdef bcf_hdr_t *dst_hdr = dst_header.ptr
3037
3038 if src_hdr != dst_hdr:
3039 if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
3040 msg = 'Cannot translate record. Number of samples does not match header ({} vs {})'
3041 raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
3042
3043 bcf_translate(dst_hdr, src_hdr, self.ptr)
3044 self.header = dst_header
3045
3046 @property
3047 def rid(self):
3048 """internal reference id number"""
3049 return self.ptr.rid
3050
3051 @rid.setter
3052 def rid(self, value):
3053 cdef bcf_hdr_t *hdr = self.header.ptr
3054 cdef int r = value
3055 if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
3056 raise ValueError('invalid reference id')
3057 self.ptr.rid = r
3058
3059 @property
3060 def chrom(self):
3061 """chromosome/contig name"""
3062 cdef bcf_hdr_t *hdr = self.header.ptr
3063 cdef int rid = self.ptr.rid
3064 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
3065 raise ValueError('Invalid header')
3066 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
3067
3068 @chrom.setter
3069 def chrom(self, value):
3070 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
3071 bchrom = force_bytes(value)
3072 cdef khint_t k = kh_get_vdict(d, bchrom)
3073 if k == kh_end(d):
3074 raise ValueError('Invalid chromosome/contig')
3075 self.ptr.rid = kh_val_vdict(d, k).id
3076
3077 @property
3078 def contig(self):
3079 """chromosome/contig name"""
3080 cdef bcf_hdr_t *hdr = self.header.ptr
3081 cdef int rid = self.ptr.rid
3082 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
3083 raise ValueError('Invalid header')
3084 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
3085
3086 @contig.setter
3087 def contig(self, value):
3088 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
3089 bchrom = force_bytes(value)
3090 cdef khint_t k = kh_get_vdict(d, bchrom)
3091 if k == kh_end(d):
3092 raise ValueError('Invalid chromosome/contig')
3093 self.ptr.rid = kh_val_vdict(d, k).id
3094
3095 @property
3096 def pos(self):
3097 """record start position on chrom/contig (1-based inclusive)"""
3098 return self.ptr.pos + 1
3099
3100 @pos.setter
3101 def pos(self, value):
3102 cdef int p = value
3103 if p < 1:
3104 raise ValueError('Position must be positive')
3105 self.ptr.pos = p - 1
3106 bcf_sync_end(self)
3107
3108 @property
3109 def start(self):
3110 """record start position on chrom/contig (0-based inclusive)"""
3111 return self.ptr.pos
3112
3113 @start.setter
3114 def start(self, value):
3115 cdef int s = value
3116 if s < 0:
3117 raise ValueError('Start coordinate must be non-negative')
3118 self.ptr.pos = s
3119 bcf_sync_end(self)
3120
3121 @property
3122 def stop(self):
3123 """record stop position on chrom/contig (0-based exclusive)"""
3124 return self.ptr.pos + self.ptr.rlen
3125
3126 @stop.setter
3127 def stop(self, value):
3128 cdef int s = value
3129 if s < 0:
3130 raise ValueError('Stop coordinate must be non-negative')
3131 self.ptr.rlen = s - self.ptr.pos
3132 bcf_sync_end(self)
3133
3134 @property
3135 def rlen(self):
3136 """record length on chrom/contig (aka rec.stop - rec.start)"""
3137 return self.ptr.rlen
3138
3139 @rlen.setter
3140 def rlen(self, value):
3141 cdef int r = value
3142 self.ptr.rlen = r
3143 bcf_sync_end(self)
3144
3145 @property
3146 def qual(self):
3147 """phred scaled quality score or None if not available"""
3148 return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
3149
3150 @qual.setter
3151 def qual(self, value):
3152 if value is not None:
3153 self.ptr.qual = value
3154 else:
3155 bcf_float_set(&self.ptr.qual, bcf_float_missing)
3156
3157
3158 # @property
3159 # def n_allele(self):
3160 # return self.ptr.n_allele
3161
3162 # @property
3163 # def n_sample(self):
3164 # return self.ptr.n_sample
3165
3166 @property
3167 def id(self):
3168 """record identifier or None if not available"""
3169 cdef bcf1_t *r = self.ptr
3170 if bcf_unpack(r, BCF_UN_STR) < 0:
3171 raise ValueError('Error unpacking VariantRecord')
3172 # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
3173 # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
3174 if (r.d.m_id == 0):
3175 raise ValueError('Error extracting ID')
3176 return charptr_to_str(r.d.id) if r.d.id != b'.' else None
3177
3178 @id.setter
3179 def id(self, value):
3180 cdef bcf1_t *r = self.ptr
3181 if bcf_unpack(r, BCF_UN_STR) < 0:
3182 raise ValueError('Error unpacking VariantRecord')
3183 cdef char *idstr = NULL
3184 if value is not None:
3185 bid = force_bytes(value)
3186 idstr = bid
3187 if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
3188 raise ValueError('Error updating id')
3189
3190 @property
3191 def ref(self):
3192 """reference allele"""
3193 cdef bcf1_t *r = self.ptr
3194 if bcf_unpack(r, BCF_UN_STR) < 0:
3195 raise ValueError('Error unpacking VariantRecord')
3196 return charptr_to_str(r.d.allele[0]) if r.d.allele else None
3197
3198 @ref.setter
3199 def ref(self, value):
3200 cdef bcf1_t *r = self.ptr
3201 if bcf_unpack(r, BCF_UN_STR) < 0:
3202 raise ValueError('Error unpacking VariantRecord')
3203 #FIXME: Set alleles directly -- this is stupid
3204 if not value:
3205 raise ValueError('ref allele must not be null')
3206 value = force_bytes(value)
3207 if r.d.allele and r.n_allele:
3208 alleles = [r.d.allele[i] for i in range(r.n_allele)]
3209 alleles[0] = value
3210 else:
3211 alleles = [value, '<NON_REF>']
3212 self.alleles = alleles
3213 bcf_sync_end(self)
3214
3215 @property
3216 def alleles(self):
3217 """tuple of reference allele followed by alt alleles"""
3218 cdef bcf1_t *r = self.ptr
3219 if bcf_unpack(r, BCF_UN_STR) < 0:
3220 raise ValueError('Error unpacking VariantRecord')
3221 if not r.d.allele:
3222 return None
3223 cdef tuple res = PyTuple_New(r.n_allele)
3224 for i in range(r.n_allele):
3225 a = charptr_to_str(r.d.allele[i])
3226 PyTuple_SET_ITEM(res, i, a)
3227 Py_INCREF(a)
3228 return res
3229
3230 @alleles.setter
3231 def alleles(self, values):
3232 cdef bcf1_t *r = self.ptr
3233
3234 # Cache rlen of symbolic alleles before call to bcf_update_alleles_str
3235 cdef int rlen = r.rlen
3236
3237 if bcf_unpack(r, BCF_UN_STR) < 0:
3238 raise ValueError('Error unpacking VariantRecord')
3239
3240 values = [force_bytes(v) for v in values]
3241
3242 if len(values) < 2:
3243 raise ValueError('must set at least 2 alleles')
3244
3245 if b'' in values:
3246 raise ValueError('cannot set null allele')
3247
3248 value = b','.join(values)
3249
3250 if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
3251 raise ValueError('Error updating alleles')
3252
3253 # Reset rlen if alternate allele isn't symbolic, otherwise used cached
3254 if has_symbolic_allele(self):
3255 self.ptr.rlen = rlen
3256 else:
3257 self.ptr.rlen = len(values[0])
3258 r.d.var_type = -1
3259 bcf_sync_end(self)
3260
3261 @property
3262 def alts(self):
3263 """tuple of alt alleles"""
3264 cdef bcf1_t *r = self.ptr
3265 if bcf_unpack(r, BCF_UN_STR) < 0:
3266 raise ValueError('Error unpacking VariantRecord')
3267 if r.n_allele < 2 or not r.d.allele:
3268 return None
3269 cdef tuple res = PyTuple_New(r.n_allele - 1)
3270 for i in range(1, r.n_allele):
3271 a = charptr_to_str(r.d.allele[i])
3272 PyTuple_SET_ITEM(res, i - 1, a)
3273 Py_INCREF(a)
3274 return res
3275
3276 @alts.setter
3277 def alts(self, value):
3278 #FIXME: Set alleles directly -- this is stupid
3279 cdef bcf1_t *r = self.ptr
3280 if bcf_unpack(r, BCF_UN_STR) < 0:
3281 raise ValueError('Error unpacking VariantRecord')
3282 value = [force_bytes(v) for v in value]
3283 if b'' in value:
3284 raise ValueError('cannot set null alt allele')
3285 ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
3286 self.alleles = ref + value
3287 r.d.var_type = -1
3288
3289 @property
3290 def filter(self):
3291 """filter information (see :class:`VariantRecordFilter`)"""
3292 if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
3293 raise ValueError('Error unpacking VariantRecord')
3294 return makeVariantRecordFilter(self)
3295
3296 @property
3297 def info(self):
3298 """info data (see :class:`VariantRecordInfo`)"""
3299 if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
3300 raise ValueError('Error unpacking VariantRecord')
3301 return makeVariantRecordInfo(self)
3302
3303 @property
3304 def format(self):
3305 """sample format metadata (see :class:`VariantRecordFormat`)"""
3306 if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
3307 raise ValueError('Error unpacking VariantRecord')
3308 return makeVariantRecordFormat(self)
3309
3310 @property
3311 def samples(self):
3312 """sample data (see :class:`VariantRecordSamples`)"""
3313 if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
3314 raise ValueError('Error unpacking VariantRecord')
3315 return makeVariantRecordSamples(self)
3316
3317 property alleles_variant_types:
3318 def __get__(self):
3319 cdef bcf1_t *r = self.ptr
3320 cdef tuple result = PyTuple_New(r.n_allele)
3321
3322 for i in range(r.n_allele):
3323 tp = bcf_get_variant_type(r, i)
3324
3325 if tp == VCF_REF:
3326 v_type = "REF"
3327 elif tp == VCF_SNP:
3328 v_type = "SNP"
3329 elif tp == VCF_MNP:
3330 v_type = "MNP"
3331 elif tp == VCF_INDEL:
3332 v_type = "INDEL"
3333 elif tp == VCF_BND:
3334 v_type = "BND"
3335 elif tp == VCF_OVERLAP:
3336 v_type = "OVERLAP"
3337 else:
3338 v_type = "OTHER"
3339
3340 PyTuple_SET_ITEM(result, i, v_type)
3341 Py_INCREF(v_type)
3342
3343 return result
3344
3345 def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op):
3346 if op != 2 and op != 3:
3347 return NotImplemented
3348
3349 cdef bcf1_t *s = self.ptr
3350 cdef bcf1_t *o = other.ptr
3351
3352 cdef bint cmp = self is other or (
3353 s.pos == o.pos
3354 and s.rlen == o.rlen
3355 and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual))
3356 or s.qual == o.qual)
3357 and s.n_sample == o.n_sample
3358 and s.n_allele == o.n_allele
3359 and self.contig == other.contig
3360 and self.alleles == other.alleles
3361 and self.id == other.id
3362 and self.info == other.info
3363 and self.filter == other.filter
3364 and self.samples == other.samples)
3365
3366 if op == 3:
3367 cmp = not cmp
3368
3369 return cmp
3370
3371 def __str__(self):
3372 cdef kstring_t line
3373 cdef char c
3374
3375 line.l = line.m = 0
3376 line.s = NULL
3377
3378 if vcf_format(self.header.ptr, self.ptr, &line) < 0:
3379 if line.m:
3380 free(line.s)
3381 raise ValueError('vcf_format failed')
3382
3383 # Strip CR/LF?
3384 #while line.l:
3385 # c = line.s[line.l - 1]
3386 # if c != b'\n' and c != b'\r':
3387 # break
3388 # line.l -= 1
3389
3390 ret = charptr_to_str_w_len(line.s, line.l)
3391
3392 if line.m:
3393 free(line.s)
3394
3395 return ret
3396
3397
3398 cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
3399 if not header:
3400 raise ValueError('invalid VariantHeader')
3401
3402 if not r:
3403 raise ValueError('cannot create VariantRecord')
3404
3405 if r.errcode:
3406 msg = []
3407 #if r.errcode & BCF_ERR_CTG_UNDEF:
3408 # msg.append('undefined contig')
3409 #if r.errcode & BCF_ERR_TAG_UNDEF:
3410 # msg.append('undefined tag')
3411 if r.errcode & BCF_ERR_NCOLS:
3412 msg.append('invalid number of columns')
3413 if r.errcode & BCF_ERR_LIMITS:
3414 msg.append('limits violated')
3415 if r.errcode & BCF_ERR_CHAR:
3416 msg.append('invalid character found')
3417 if r.errcode & BCF_ERR_CTG_INVALID:
3418 msg.append('invalid contig')
3419 if r.errcode & BCF_ERR_TAG_INVALID:
3420 msg.append('invalid tag')
3421
3422 if msg:
3423 msg = ', '.join(msg)
3424 raise ValueError('Error(s) reading record: {}'.format(msg))
3425
3426 cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
3427 record.header = header
3428 record.ptr = r
3429
3430 return record
3431
3432
3433 ########################################################################
3434 ########################################################################
3435 ## Variant Sampletype object
3436 ########################################################################
3437
3438
3439 cdef class VariantRecordSample(object):
3440 """Data for a single sample from a :class:`VariantRecord` object.
3441 Provides data accessors for genotypes and a mapping interface
3442 from format name to values.
3443 """
3444 def __init__(self, *args, **kwargs):
3445 raise TypeError('this class cannot be instantiated from Python')
3446
3447 @property
3448 def name(self):
3449 """sample name"""
3450 cdef bcf_hdr_t *hdr = self.record.header.ptr
3451 cdef bcf1_t *r = self.record.ptr
3452 cdef int32_t n = r.n_sample
3453
3454 if self.index < 0 or self.index >= n:
3455 raise ValueError('invalid sample index')
3456
3457 return charptr_to_str(hdr.samples[self.index])
3458
3459 @property
3460 def allele_indices(self):
3461 """allele indices for called genotype, if present. Otherwise None"""
3462 return bcf_format_get_allele_indices(self)
3463
3464 @allele_indices.setter
3465 def allele_indices(self, value):
3466 self['GT'] = value
3467
3468 @allele_indices.deleter
3469 def allele_indices(self):
3470 self['GT'] = ()
3471
3472 @property
3473 def alleles(self):
3474 """alleles for called genotype, if present. Otherwise None"""
3475 return bcf_format_get_alleles(self)
3476
3477 @alleles.setter
3478 def alleles(self, value):
3479 # Sets the genotype, supply a tuple of alleles to set.
3480 # The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord
3481 # The genotype is reset when an empty tuple, None or (None,) is supplied
3482
3483 if value==(None,) or value==tuple() or value is None:
3484 self['GT'] = ()
3485 return
3486
3487 if any((type(x) == int for x in value)):
3488 raise ValueError('Use .allele_indices to set integer allele indices')
3489
3490 # determine and set allele indices:
3491 try:
3492 self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) )
3493 except ValueError:
3494 raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord."
3495 "First set the .alleles of this record to define the alleles")
3496
3497 @alleles.deleter
3498 def alleles(self):
3499 self['GT'] = ()
3500
3501 @property
3502 def phased(self):
3503 """False if genotype is missing or any allele is unphased. Otherwise True."""
3504 return bcf_sample_get_phased(self)
3505
3506 @phased.setter
3507 def phased(self, value):
3508 bcf_sample_set_phased(self, value)
3509
3510 def __len__(self):
3511 cdef bcf_hdr_t *hdr = self.record.header.ptr
3512 cdef bcf1_t *r = self.record.ptr
3513 cdef int i, n = 0
3514
3515 if bcf_unpack(r, BCF_UN_FMT) < 0:
3516 raise ValueError('Error unpacking VariantRecord')
3517
3518 for i in range(r.n_fmt):
3519 if r.d.fmt[i].p:
3520 n += 1
3521 return n
3522
3523 def __bool__(self):
3524 cdef bcf_hdr_t *hdr = self.record.header.ptr
3525 cdef bcf1_t *r = self.record.ptr
3526 cdef int i
3527
3528 if bcf_unpack(r, BCF_UN_FMT) < 0:
3529 raise ValueError('Error unpacking VariantRecord')
3530
3531 for i in range(r.n_fmt):
3532 if r.d.fmt[i].p:
3533 return True
3534 return False
3535
3536 def __getitem__(self, key):
3537 return bcf_format_get_value(self, key)
3538
3539 def __setitem__(self, key, value):
3540 bcf_format_set_value(self, key, value)
3541
3542 def __delitem__(self, key):
3543 bcf_format_del_value(self, key)
3544
3545 def clear(self):
3546 """Clear all format data (including genotype) for this sample"""
3547 cdef bcf_hdr_t *hdr = self.record.header.ptr
3548 cdef bcf1_t *r = self.record.ptr
3549 cdef bcf_fmt_t *fmt
3550 cdef int i
3551
3552 for i in range(r.n_fmt):
3553 fmt = &r.d.fmt[i]
3554 if fmt.p:
3555 bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
3556
3557 def __iter__(self):
3558 cdef bcf_hdr_t *hdr = self.record.header.ptr
3559 cdef bcf1_t *r = self.record.ptr
3560 cdef bcf_fmt_t *fmt
3561 cdef int i
3562
3563 for i in range(r.n_fmt):
3564 fmt = &r.d.fmt[i]
3565 if r.d.fmt[i].p:
3566 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
3567
3568 def get(self, key, default=None):
3569 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
3570 try:
3571 return self[key]
3572 except KeyError:
3573 return default
3574
3575 def __contains__(self, key):
3576 cdef bcf_hdr_t *hdr = self.record.header.ptr
3577 cdef bcf1_t *r = self.record.ptr
3578 cdef bytes bkey = force_bytes(key)
3579 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
3580 return fmt != NULL and fmt.p != NULL
3581
3582 def iterkeys(self):
3583 """D.iterkeys() -> an iterator over the keys of D"""
3584 return iter(self)
3585
3586 def itervalues(self):
3587 """D.itervalues() -> an iterator over the values of D"""
3588 for key in self:
3589 yield self[key]
3590
3591 def iteritems(self):
3592 """D.iteritems() -> an iterator over the (key, value) items of D"""
3593 for key in self:
3594 yield (key, self[key])
3595
3596 def keys(self):
3597 """D.keys() -> list of D's keys"""
3598 return list(self)
3599
3600 def items(self):
3601 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
3602 return list(self.iteritems())
3603
3604 def values(self):
3605 """D.values() -> list of D's values"""
3606 return list(self.itervalues())
3607
3608 def update(self, items=None, **kwargs):
3609 """D.update([E, ]**F) -> None.
3610
3611 Update D from dict/iterable E and F.
3612 """
3613 for k, v in items.items():
3614 self[k] = v
3615
3616 if kwargs:
3617 for k, v in kwargs.items():
3618 self[k] = v
3619
3620 def pop(self, key, default=_nothing):
3621 try:
3622 value = self[key]
3623 del self[key]
3624 return value
3625 except KeyError:
3626 if default is not _nothing:
3627 return default
3628 raise
3629
3630 def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op):
3631 if op != 2 and op != 3:
3632 return NotImplemented
3633
3634 cdef bint cmp = dict(self) == dict(other)
3635
3636 if op == 3:
3637 cmp = not cmp
3638
3639 return cmp
3640
3641 # Mappings are not hashable by default, but subclasses can change this
3642 __hash__ = None
3643
3644
3645 cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
3646 if not record or sample_index < 0:
3647 raise ValueError('cannot create VariantRecordSample')
3648
3649 cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
3650 sample.record = record
3651 sample.index = sample_index
3652
3653 return sample
3654
3655
3656 ########################################################################
3657 ########################################################################
3658 ## Index objects
3659 ########################################################################
3660
3661
3662 cdef class BaseIndex(object):
3663 def __init__(self):
3664 self.refs = ()
3665 self.remap = {}
3666
3667 def __len__(self):
3668 return len(self.refs)
3669
3670 def __bool__(self):
3671 return len(self.refs) != 0
3672
3673 def __getitem__(self, key):
3674 if isinstance(key, int):
3675 return self.refs[key]
3676 else:
3677 return self.refmap[key]
3678
3679 def __iter__(self):
3680 return iter(self.refs)
3681
3682 def get(self, key, default=None):
3683 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
3684 try:
3685 return self[key]
3686 except KeyError:
3687 return default
3688
3689 def __contains__(self, key):
3690 try:
3691 self[key]
3692 except KeyError:
3693 return False
3694 else:
3695 return True
3696
3697 def iterkeys(self):
3698 """D.iterkeys() -> an iterator over the keys of D"""
3699 return iter(self)
3700
3701 def itervalues(self):
3702 """D.itervalues() -> an iterator over the values of D"""
3703 for key in self:
3704 yield self[key]
3705
3706 def iteritems(self):
3707 """D.iteritems() -> an iterator over the (key, value) items of D"""
3708 for key in self:
3709 yield (key, self[key])
3710
3711 def keys(self):
3712 """D.keys() -> list of D's keys"""
3713 return list(self)
3714
3715 def items(self):
3716 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
3717 return list(self.iteritems())
3718
3719 def values(self):
3720 """D.values() -> list of D's values"""
3721 return list(self.itervalues())
3722
3723 def update(self, items=None, **kwargs):
3724 """D.update([E, ]**F) -> None.
3725
3726 Update D from dict/iterable E and F.
3727 """
3728 for k, v in items.items():
3729 self[k] = v
3730
3731 if kwargs:
3732 for k, v in kwargs.items():
3733 self[k] = v
3734
3735 def pop(self, key, default=_nothing):
3736 try:
3737 value = self[key]
3738 del self[key]
3739 return value
3740 except KeyError:
3741 if default is not _nothing:
3742 return default
3743 raise
3744
3745 # Mappings are not hashable by default, but subclasses can change this
3746 __hash__ = None
3747
3748 #TODO: implement __richcmp__
3749
3750
3751 cdef class BCFIndex(object):
3752 """CSI index data structure for BCF files"""
3753 def __init__(self):
3754 self.refs = ()
3755 self.refmap = {}
3756
3757 if not self.ptr:
3758 raise ValueError('Invalid index object')
3759
3760 cdef int n
3761 cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
3762
3763 self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
3764 self.refmap = { r:i for i,r in enumerate(self.refs) }
3765
3766 def __dealloc__(self):
3767 if self.ptr:
3768 hts_idx_destroy(self.ptr)
3769 self.ptr = NULL
3770
3771 def fetch(self, bcf, contig, start, stop, reopen):
3772 return BCFIterator(bcf, contig, start, stop, reopen)
3773
3774
3775 cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
3776 if not idx:
3777 return None
3778
3779 if not header:
3780 raise ValueError('invalid VariantHeader')
3781
3782 cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
3783 index.header = header
3784 index.ptr = idx
3785 index.__init__()
3786
3787 return index
3788
3789
3790 cdef class TabixIndex(BaseIndex):
3791 """Tabix index data structure for VCF files"""
3792 def __init__(self):
3793 self.refs = ()
3794 self.refmap = {}
3795
3796 if not self.ptr:
3797 raise ValueError('Invalid index object')
3798
3799 cdef int n
3800 cdef const char **refs = tbx_seqnames(self.ptr, &n)
3801
3802 self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
3803 self.refmap = { r:i for i,r in enumerate(self.refs) }
3804
3805 def __dealloc__(self):
3806 if self.ptr:
3807 tbx_destroy(self.ptr)
3808 self.ptr = NULL
3809
3810 def fetch(self, bcf, contig, start, stop, reopen):
3811 return TabixIterator(bcf, contig, start, stop, reopen)
3812
3813
3814 cdef TabixIndex makeTabixIndex(tbx_t *idx):
3815 if not idx:
3816 return None
3817
3818 cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
3819 index.ptr = idx
3820 index.__init__()
3821
3822 return index
3823
3824
3825 ########################################################################
3826 ########################################################################
3827 ## Iterators
3828 ########################################################################
3829
3830
3831 cdef class BaseIterator(object):
3832 pass
3833
3834
3835 # Internal function to clean up after iteration stop or failure.
3836 # This would be a nested function if it weren't a cdef function.
3837 cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
3838 bcf_destroy1(record)
3839
3840 # destroy iter so future calls to __next__ raise StopIteration
3841 bcf_itr_destroy(self.iter)
3842 self.iter = NULL
3843
3844
3845 cdef class BCFIterator(BaseIterator):
3846 def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
3847 if bcf is None:
3848 raise ValueError('bcf must not be None')
3849
3850 if contig is None:
3851 raise ValueError('contig must be specified')
3852
3853 if not isinstance(bcf.index, BCFIndex):
3854 raise ValueError('bcf index required')
3855
3856 cdef BCFIndex index = bcf.index
3857
3858 self.bcf = bcf
3859 self.index = index
3860
3861 cdef int rid, cstart, cstop
3862
3863 try:
3864 rid = index.refmap[contig]
3865 except KeyError:
3866 # A query for a non-existent contig yields an empty iterator, does not raise an error
3867 self.iter = NULL
3868 return
3869
3870 if reopen:
3871 self.bcf = self.bcf.copy()
3872
3873 cstart = start if start is not None else 0
3874 cstop = stop if stop is not None else MAX_POS
3875
3876 with nogil:
3877 self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
3878
3879 if not self.iter:
3880 if errno:
3881 raise IOError(errno, strerror(errno))
3882 else:
3883 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
3884
3885 def __dealloc__(self):
3886 if self.iter:
3887 bcf_itr_destroy(self.iter)
3888 self.iter = NULL
3889
3890 def __iter__(self):
3891 return self
3892
3893 def __next__(self):
3894 if not self.iter:
3895 raise StopIteration
3896
3897 cdef bcf1_t *record = bcf_init1()
3898
3899 if not record:
3900 raise MemoryError('unable to allocate BCF record')
3901
3902 record.pos = -1
3903 if self.bcf.drop_samples:
3904 record.max_unpack = BCF_UN_SHR
3905
3906 cdef int ret
3907
3908 with nogil:
3909 ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
3910
3911 if ret < 0:
3912 _stop_BCFIterator(self, record)
3913 if ret == -1:
3914 raise StopIteration
3915 elif ret == -2:
3916 raise IOError('truncated file')
3917 elif errno:
3918 raise IOError(errno, strerror(errno))
3919 else:
3920 raise IOError('unable to fetch next record')
3921
3922 ret = bcf_subset_format(self.bcf.header.ptr, record)
3923
3924 if ret < 0:
3925 _stop_BCFIterator(self, record)
3926 raise ValueError('error in bcf_subset_format')
3927
3928 return makeVariantRecord(self.bcf.header, record)
3929
3930
3931 cdef class TabixIterator(BaseIterator):
3932 def __cinit__(self, *args, **kwargs):
3933 self.line_buffer.l = 0
3934 self.line_buffer.m = 0
3935 self.line_buffer.s = NULL
3936
3937 def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
3938 if bcf is None:
3939 raise ValueError('bcf must not be None')
3940
3941 if not isinstance(bcf.index, TabixIndex):
3942 raise ValueError('tabix index required')
3943
3944 cdef TabixIndex index = bcf.index
3945
3946 self.bcf = bcf
3947 self.index = index
3948
3949 cdef int rid, cstart, cstop
3950
3951 try:
3952 rid = index.refmap[contig]
3953 except KeyError:
3954 # A query for a non-existent contig yields an empty iterator, does not raise an error
3955 self.iter = NULL
3956 return
3957
3958 if reopen:
3959 self.bcf = self.bcf.copy()
3960
3961 cstart = start if start is not None else 0
3962 cstop = stop if stop is not None else MAX_POS
3963
3964 self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
3965
3966 if not self.iter:
3967 if errno:
3968 raise IOError(errno, strerror(errno))
3969 else:
3970 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
3971
3972 def __dealloc__(self):
3973 if self.iter:
3974 tbx_itr_destroy(self.iter)
3975 self.iter = NULL
3976
3977 if self.line_buffer.m:
3978 free(self.line_buffer.s)
3979
3980 self.line_buffer.l = 0
3981 self.line_buffer.m = 0
3982 self.line_buffer.s = NULL
3983
3984 def __iter__(self):
3985 return self
3986
3987 def __next__(self):
3988 if not self.iter:
3989 raise StopIteration
3990
3991 cdef int ret
3992
3993 with nogil:
3994 ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
3995
3996 if ret < 0:
3997 tbx_itr_destroy(self.iter)
3998 self.iter = NULL
3999 if ret == -1:
4000 raise StopIteration
4001 elif ret == -2:
4002 raise IOError('truncated file')
4003 elif errno:
4004 raise IOError(errno, strerror(errno))
4005 else:
4006 raise IOError('unable to fetch next record')
4007
4008 cdef bcf1_t *record = bcf_init1()
4009
4010 if not record:
4011 raise MemoryError('unable to allocate BCF record')
4012
4013 record.pos = -1
4014 if self.bcf.drop_samples:
4015 record.max_unpack = BCF_UN_SHR
4016
4017 ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
4018
4019 # FIXME: stop iteration on parse failure?
4020 if ret < 0:
4021 bcf_destroy1(record)
4022 raise ValueError('error in vcf_parse')
4023
4024 return makeVariantRecord(self.bcf.header, record)
4025
4026
4027 ########################################################################
4028 ########################################################################
4029 ## Variant File
4030 ########################################################################
4031
4032
4033 cdef class VariantFile(HTSFile):
4034 """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
4035 duplicate_filehandle=True, ignore_truncation=False, threads=1)*
4036
4037 A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
4038 opened.
4039
4040 If an index for a variant file exists (.csi or .tbi), it will be
4041 opened automatically. Without an index random access to records
4042 via :meth:`fetch` is disabled.
4043
4044 For writing, a :class:`VariantHeader` object must be provided,
4045 typically obtained from another :term:`VCF` file/:term:`BCF`
4046 file.
4047
4048 Parameters
4049 ----------
4050 mode : string
4051 *mode* should be ``r`` for reading or ``w`` for writing. The default is
4052 text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
4053 ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
4054
4055 If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
4056 modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
4057 For instance, to open a :term:`BCF` formatted file for reading, type::
4058
4059 f = pysam.VariantFile('ex1.bcf','r')
4060
4061 If mode is not specified, we will try to auto-detect the file type. All
4062 of the following should work::
4063
4064 f1 = pysam.VariantFile('ex1.bcf')
4065 f2 = pysam.VariantFile('ex1.vcf')
4066 f3 = pysam.VariantFile('ex1.vcf.gz')
4067
4068 index_filename : string
4069 Explicit path to an index file.
4070
4071 header : VariantHeader
4072 :class:`VariantHeader` object required for writing.
4073
4074 drop_samples: bool
4075 Ignore sample information when reading.
4076
4077 duplicate_filehandle: bool
4078 By default, file handles passed either directly or through
4079 File-like objects will be duplicated before passing them to
4080 htslib. The duplication prevents issues where the same stream
4081 will be closed by htslib and through destruction of the
4082 high-level python object. Set to False to turn off
4083 duplication.
4084
4085 ignore_truncation: bool
4086 Issue a warning, instead of raising an error if the current file
4087 appears to be truncated due to a missing EOF marker. Only applies
4088 to bgzipped formats. (Default=False)
4089
4090 threads: integer
4091 Number of threads to use for compressing/decompressing VCF/BCF files.
4092 Setting threads to > 1 cannot be combined with `ignore_truncation`.
4093 (Default=1)
4094
4095 """
4096 def __cinit__(self, *args, **kwargs):
4097 self.htsfile = NULL
4098
4099 def __init__(self, *args, **kwargs):
4100 self.header = None
4101 self.index = None
4102 self.filename = None
4103 self.mode = None
4104 self.threads = 1
4105 self.index_filename = None
4106 self.is_stream = False
4107 self.is_remote = False
4108 self.is_reading = False
4109 self.drop_samples = False
4110 self.header_written = False
4111 self.start_offset = -1
4112
4113 self.open(*args, **kwargs)
4114
4115 def __dealloc__(self):
4116 if not self.htsfile or not self.header:
4117 return
4118
4119 # Write header if no records were written
4120 if self.htsfile.is_write and not self.header_written:
4121 with nogil:
4122 bcf_hdr_write(self.htsfile, self.header.ptr)
4123
4124 cdef int ret = hts_close(self.htsfile)
4125 self.htsfile = NULL
4126 self.header = self.index = None
4127
4128 if ret < 0:
4129 global errno
4130 if errno == EPIPE:
4131 errno = 0
4132 else:
4133 raise IOError(errno, force_str(strerror(errno)))
4134
4135 def close(self):
4136 """closes the :class:`pysam.VariantFile`."""
4137 if not self.htsfile:
4138 return
4139
4140 # Write header if no records were written
4141 if self.htsfile.is_write and not self.header_written:
4142 with nogil:
4143 bcf_hdr_write(self.htsfile, self.header.ptr)
4144
4145 cdef int ret = hts_close(self.htsfile)
4146 self.htsfile = NULL
4147 self.header = self.index = None
4148
4149 if ret < 0:
4150 global errno
4151 if errno == EPIPE:
4152 errno = 0
4153 else:
4154 raise IOError(errno, force_str(strerror(errno)))
4155
4156 def __iter__(self):
4157 if not self.is_open:
4158 raise ValueError('I/O operation on closed file')
4159
4160 if self.htsfile.is_write:
4161 raise ValueError('cannot iterate over Variantfile opened for writing')
4162
4163 self.is_reading = 1
4164 return self
4165
4166 def __next__(self):
4167 cdef int ret
4168 cdef int errcode
4169 cdef bcf1_t *record = bcf_init1()
4170
4171 if not record:
4172 raise MemoryError('unable to allocate BCF record')
4173
4174 record.pos = -1
4175 if self.drop_samples:
4176 record.max_unpack = BCF_UN_SHR
4177
4178 with nogil:
4179 ret = bcf_read1(self.htsfile, self.header.ptr, record)
4180
4181 if ret < 0:
4182 errcode = record.errcode
4183 bcf_destroy1(record)
4184 if errcode:
4185 raise IOError('unable to parse next record')
4186 if ret == -1:
4187 raise StopIteration
4188 elif ret == -2:
4189 raise IOError('truncated file')
4190 elif errno:
4191 raise IOError(errno, strerror(errno))
4192 else:
4193 raise IOError('unable to fetch next record')
4194
4195 return makeVariantRecord(self.header, record)
4196
4197 def copy(self):
4198 if not self.is_open:
4199 raise ValueError
4200
4201 cdef VariantFile vars = VariantFile.__new__(VariantFile)
4202 cdef bcf_hdr_t *hdr
4203
4204 # FIXME: re-open using fd or else header and index could be invalid
4205 vars.htsfile = self._open_htsfile()
4206
4207 if not vars.htsfile:
4208 raise ValueError('Cannot re-open htsfile')
4209
4210 # minimize overhead by re-using header and index. This approach is
4211 # currently risky, but see above for how this can be mitigated.
4212 vars.header = self.header
4213 vars.index = self.index
4214
4215 vars.filename = self.filename
4216 vars.mode = self.mode
4217 vars.threads = self.threads
4218 vars.index_filename = self.index_filename
4219 vars.drop_samples = self.drop_samples
4220 vars.is_stream = self.is_stream
4221 vars.is_remote = self.is_remote
4222 vars.is_reading = self.is_reading
4223 vars.start_offset = self.start_offset
4224 vars.header_written = self.header_written
4225
4226 if self.htsfile.is_bin:
4227 vars.seek(self.tell())
4228 else:
4229 with nogil:
4230 hdr = bcf_hdr_read(vars.htsfile)
4231 makeVariantHeader(hdr)
4232
4233 return vars
4234
4235 def open(self, filename, mode='r',
4236 index_filename=None,
4237 VariantHeader header=None,
4238 drop_samples=False,
4239 duplicate_filehandle=True,
4240 ignore_truncation=False,
4241 threads=1):
4242 """open a vcf/bcf file.
4243
4244 If open is called on an existing VariantFile, the current file will be
4245 closed and a new file will be opened.
4246 """
4247 cdef bcf_hdr_t *hdr
4248 cdef BGZF *bgzfp
4249 cdef hts_idx_t *idx
4250 cdef tbx_t *tidx
4251 cdef char *cfilename
4252 cdef char *cindex_filename = NULL
4253 cdef char *cmode
4254
4255 if threads > 1 and ignore_truncation:
4256 # This won't raise errors if reaching a truncated alignment,
4257 # because bgzf_mt_reader in htslib does not deal with
4258 # bgzf_mt_read_block returning non-zero values, contrary
4259 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
4260 # Better to avoid this (for now) than to produce seemingly correct results.
4261 raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
4262 self.threads = threads
4263
4264 # close a previously opened file
4265 if self.is_open:
4266 self.close()
4267
4268 if not mode or mode[0] not in 'rwa':
4269 raise ValueError('mode must begin with r, w or a')
4270
4271 self.duplicate_filehandle = duplicate_filehandle
4272
4273 format_modes = [m for m in mode[1:] if m in 'bcguz']
4274 if len(format_modes) > 1:
4275 raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
4276
4277 invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
4278 if invalid_modes:
4279 raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
4280
4281 # Autodetect mode from filename
4282 if mode == 'w' and isinstance(filename, str):
4283 if filename.endswith('.gz'):
4284 mode = 'wz'
4285 elif filename.endswith('.bcf'):
4286 mode = 'wb'
4287
4288 # for htslib, wbu seems to not work
4289 if mode == 'wbu':
4290 mode = 'wb0'
4291
4292 self.mode = mode = force_bytes(mode)
4293 try:
4294 filename = encode_filename(filename)
4295 self.is_remote = hisremote(filename)
4296 self.is_stream = filename == b'-'
4297 except TypeError:
4298 filename = filename
4299 self.is_remote = False
4300 self.is_stream = True
4301
4302 self.filename = filename
4303
4304 if index_filename is not None:
4305 self.index_filename = index_filename = encode_filename(index_filename)
4306 else:
4307 self.index_filename = None
4308
4309 self.drop_samples = bool(drop_samples)
4310 self.header = None
4311
4312 self.header_written = False
4313
4314 if mode.startswith(b'w'):
4315 # open file for writing
4316 if index_filename is not None:
4317 raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
4318
4319 # header structure (used for writing)
4320 if header:
4321 self.header = header.copy()
4322 else:
4323 self.header = VariantHeader()
4324 #raise ValueError('a VariantHeader must be specified')
4325
4326 # Header is not written until the first write or on close
4327 self.htsfile = self._open_htsfile()
4328
4329 if not self.htsfile:
4330 raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
4331
4332 elif mode.startswith(b'r'):
4333 # open file for reading
4334 self.htsfile = self._open_htsfile()
4335
4336 if not self.htsfile:
4337 if errno:
4338 raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno))))
4339 else:
4340 raise ValueError('could not open variant file `{}`'.format(filename))
4341
4342 if self.htsfile.format.format not in (bcf, vcf):
4343 raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
4344
4345 self.check_truncation(ignore_truncation)
4346
4347 with nogil:
4348 hdr = bcf_hdr_read(self.htsfile)
4349
4350 try:
4351 self.header = makeVariantHeader(hdr)
4352 except ValueError:
4353 raise ValueError('file `{}` does not have valid header (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
4354
4355 if isinstance(self.filename, bytes):
4356 cfilename = self.filename
4357 else:
4358 cfilename = NULL
4359
4360 # check for index and open if present
4361 if self.htsfile.format.format == bcf and cfilename:
4362 if index_filename is not None:
4363 cindex_filename = index_filename
4364 with nogil:
4365 idx = bcf_index_load2(cfilename, cindex_filename)
4366 self.index = makeBCFIndex(self.header, idx)
4367
4368 elif self.htsfile.format.compression == bgzf and cfilename:
4369 if index_filename is not None:
4370 cindex_filename = index_filename
4371 with nogil:
4372 tidx = tbx_index_load2(cfilename, cindex_filename)
4373 self.index = makeTabixIndex(tidx)
4374
4375 if not self.is_stream:
4376 self.start_offset = self.tell()
4377 else:
4378 raise ValueError('unknown mode {}'.format(mode))
4379
4380 def reset(self):
4381 """reset file position to beginning of file just after the header."""
4382 return self.seek(self.start_offset)
4383
4384 def is_valid_tid(self, tid):
4385 """
4386 return True if the numerical :term:`tid` is valid; False otherwise.
4387
4388 returns -1 if reference is not known.
4389 """
4390 if not self.is_open:
4391 raise ValueError('I/O operation on closed file')
4392
4393 cdef bcf_hdr_t *hdr = self.header.ptr
4394 cdef int rid = tid
4395 return 0 <= rid < hdr.n[BCF_DT_CTG]
4396
4397 def get_tid(self, reference):
4398 """
4399 return the numerical :term:`tid` corresponding to
4400 :term:`reference`
4401
4402 returns -1 if reference is not known.
4403 """
4404 if not self.is_open:
4405 raise ValueError('I/O operation on closed file')
4406
4407 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
4408 reference = force_bytes(reference)
4409 cdef khint_t k = kh_get_vdict(d, reference)
4410 return kh_val_vdict(d, k).id if k != kh_end(d) else -1
4411
4412 def get_reference_name(self, tid):
4413 """
4414 return :term:`reference` name corresponding to numerical :term:`tid`
4415 """
4416 if not self.is_open:
4417 raise ValueError('I/O operation on closed file')
4418
4419 cdef bcf_hdr_t *hdr = self.header.ptr
4420 cdef int rid = tid
4421 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
4422 raise ValueError('Invalid tid')
4423 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
4424
4425 def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
4426 """fetch records in a :term:`region`, specified either by
4427 :term:`contig`, *start*, and *end* (which are 0-based, half-open);
4428 or alternatively by a samtools :term:`region` string (which is
4429 1-based inclusive).
4430
4431 Without *contig* or *region* all mapped records will be fetched. The
4432 records will be returned ordered by contig, which will not necessarily
4433 be the order within the file.
4434
4435 Set *reopen* to true if you will be using multiple iterators on the
4436 same file at the same time. The iterator returned will receive its
4437 own copy of a filehandle to the file effectively re-opening the
4438 file. Re-opening a file incurrs some overhead, so use with care.
4439
4440 If only *contig* is set, all records on *contig* will be fetched.
4441 If both *region* and *contig* are given, an exception is raised.
4442
4443 Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
4444 (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
4445 read sequentially.
4446 """
4447 if not self.is_open:
4448 raise ValueError('I/O operation on closed file')
4449
4450 if self.htsfile.is_write:
4451 raise ValueError('cannot fetch from Variantfile opened for writing')
4452
4453 if contig is None and region is None:
4454 self.is_reading = 1
4455 bcf = self.copy() if reopen else self
4456 bcf.seek(self.start_offset)
4457 return iter(bcf)
4458
4459 if self.index is None:
4460 raise ValueError('fetch requires an index')
4461
4462 _, tid, start, stop = self.parse_region(contig, start, stop, region,
4463 None, end=end, reference=reference)
4464
4465 if contig is None:
4466 contig = self.get_reference_name(tid)
4467
4468 self.is_reading = 1
4469 return self.index.fetch(self, contig, start, stop, reopen)
4470
4471 def new_record(self, *args, **kwargs):
4472 """Create a new empty :class:`VariantRecord`.
4473
4474 See :meth:`VariantHeader.new_record`
4475 """
4476 return self.header.new_record(*args, **kwargs)
4477
4478 cpdef int write(self, VariantRecord record) except -1:
4479 """
4480 write a single :class:`pysam.VariantRecord` to disk.
4481
4482 returns the number of bytes written.
4483 """
4484 if record is None:
4485 raise ValueError('record must not be None')
4486
4487 if not self.is_open:
4488 return ValueError('I/O operation on closed file')
4489
4490 if not self.htsfile.is_write:
4491 raise ValueError('cannot write to a Variantfile opened for reading')
4492
4493 if not self.header_written:
4494 self.header_written = True
4495 with nogil:
4496 bcf_hdr_write(self.htsfile, self.header.ptr)
4497
4498 #if record.header is not self.header:
4499 # record.translate(self.header)
4500 # raise ValueError('Writing records from a different VariantFile is not yet supported')
4501
4502 if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
4503 msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
4504 raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
4505
4506 # Sync END annotation before writing
4507 bcf_sync_end(record)
4508
4509 cdef int ret
4510
4511 with nogil:
4512 ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
4513
4514 if ret < 0:
4515 raise IOError(errno, strerror(errno))
4516
4517 return ret
4518
4519 def subset_samples(self, include_samples):
4520 """
4521 Read only a subset of samples to reduce processing time and memory.
4522 Must be called prior to retrieving records.
4523 """
4524 if not self.is_open:
4525 raise ValueError('I/O operation on closed file')
4526
4527 if self.htsfile.is_write:
4528 raise ValueError('cannot subset samples from Variantfile opened for writing')
4529
4530 if self.is_reading:
4531 raise ValueError('cannot subset samples after fetching records')
4532
4533 self.header._subset_samples(include_samples)
4534
4535 # potentially unnecessary optimization that also sets max_unpack
4536 if not include_samples:
4537 self.drop_samples = True
4538