annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcbcf.pyx @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 # cython: language_level=3
jpayne@69 2 # cython: embedsignature=True
jpayne@69 3 # cython: profile=True
jpayne@69 4 ###############################################################################
jpayne@69 5 ###############################################################################
jpayne@69 6 ## Cython wrapper for htslib VCF/BCF reader/writer
jpayne@69 7 ###############################################################################
jpayne@69 8 #
jpayne@69 9 # NOTICE: This code is incomplete and preliminary. It offers a nearly
jpayne@69 10 # complete Pythonic interface to VCF/BCF metadata and data with
jpayne@69 11 # reading and writing capability. Documentation and a unit test suite
jpayne@69 12 # are in the works. The code is best tested under Python 2, but
jpayne@69 13 # should also work with Python 3. Please report any remaining
jpayne@69 14 # str/bytes issues on the github site when using Python 3 and I'll
jpayne@69 15 # fix them promptly.
jpayne@69 16 #
jpayne@69 17 # Here is a minimal example of how to use the API:
jpayne@69 18 #
jpayne@69 19 # $ cat bcfview.py
jpayne@69 20 # import sys
jpayne@69 21 # from pysam import VariantFile
jpayne@69 22 #
jpayne@69 23 # bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
jpayne@69 24 # bcf_out = VariantFile('-', 'w', header=bcf_in.header)
jpayne@69 25 #
jpayne@69 26 # for rec in bcf_in:
jpayne@69 27 # bcf_out.write(rec)
jpayne@69 28 #
jpayne@69 29 # Performance is fairly close to that of bcftools view. Here is an example
jpayne@69 30 # using some 1k Genomes data:
jpayne@69 31 #
jpayne@69 32 # $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
jpayne@69 33 # 1103799
jpayne@69 34 #
jpayne@69 35 # real 0m56.114s
jpayne@69 36 # user 1m4.489s
jpayne@69 37 # sys 0m3.102s
jpayne@69 38 #
jpayne@69 39 # $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
jpayne@69 40 # 1103800 # bcftools adds an extra header
jpayne@69 41 #
jpayne@69 42 # real 0m55.126s
jpayne@69 43 # user 1m3.502s
jpayne@69 44 # sys 0m3.459s
jpayne@69 45 #
jpayne@69 46 ###############################################################################
jpayne@69 47 #
jpayne@69 48 # TODO list:
jpayne@69 49 #
jpayne@69 50 # * more genotype methods
jpayne@69 51 # * unit test suite (perhaps py.test based)
jpayne@69 52 # * documentation
jpayne@69 53 # * pickle support
jpayne@69 54 # * left/right locus normalization
jpayne@69 55 # * fix reopen to re-use fd
jpayne@69 56 #
jpayne@69 57 ###############################################################################
jpayne@69 58 #
jpayne@69 59 # The MIT License
jpayne@69 60 #
jpayne@69 61 # Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com)
jpayne@69 62 #
jpayne@69 63 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@69 64 # copy of this software and associated documentation files (the "Software"),
jpayne@69 65 # to deal in the Software without restriction, including without limitation
jpayne@69 66 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@69 67 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@69 68 # Software is furnished to do so, subject to the following conditions:
jpayne@69 69 #
jpayne@69 70 # The above copyright notice and this permission notice shall be included in
jpayne@69 71 # all copies or substantial portions of the Software.
jpayne@69 72 #
jpayne@69 73 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@69 74 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@69 75 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@69 76 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@69 77 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@69 78 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@69 79 # DEALINGS IN THE SOFTWARE.
jpayne@69 80 #
jpayne@69 81 ###############################################################################
jpayne@69 82
jpayne@69 83 from __future__ import division, print_function
jpayne@69 84
jpayne@69 85 import os
jpayne@69 86 import sys
jpayne@69 87
jpayne@69 88 from libc.errno cimport errno, EPIPE
jpayne@69 89 from libc.string cimport strcmp, strpbrk, strerror
jpayne@69 90 from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
jpayne@69 91
jpayne@69 92 cimport cython
jpayne@69 93
jpayne@69 94 from cpython.object cimport PyObject
jpayne@69 95 from cpython.ref cimport Py_INCREF
jpayne@69 96 from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
jpayne@69 97 from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
jpayne@69 98 from cpython.bytes cimport PyBytes_FromStringAndSize
jpayne@69 99 from cpython.unicode cimport PyUnicode_DecodeUTF8
jpayne@69 100
jpayne@69 101 from pysam.libchtslib cimport HTSFile, hisremote
jpayne@69 102
jpayne@69 103 from pysam.utils import unquoted_str
jpayne@69 104
jpayne@69 105
jpayne@69 106 __all__ = ['VariantFile',
jpayne@69 107 'VariantHeader',
jpayne@69 108 'VariantHeaderRecord',
jpayne@69 109 'VariantHeaderRecords',
jpayne@69 110 'VariantMetadata',
jpayne@69 111 'VariantHeaderMetadata',
jpayne@69 112 'VariantContig',
jpayne@69 113 'VariantHeaderContigs',
jpayne@69 114 'VariantHeaderSamples',
jpayne@69 115 'VariantRecordFilter',
jpayne@69 116 'VariantRecordFormat',
jpayne@69 117 'VariantRecordInfo',
jpayne@69 118 'VariantRecordSamples',
jpayne@69 119 'VariantRecord',
jpayne@69 120 'VariantRecordSample',
jpayne@69 121 'BaseIndex',
jpayne@69 122 'BCFIndex',
jpayne@69 123 'TabixIndex',
jpayne@69 124 'BaseIterator',
jpayne@69 125 'BCFIterator',
jpayne@69 126 'TabixIterator',
jpayne@69 127 'VariantRecord']
jpayne@69 128
jpayne@69 129 ########################################################################
jpayne@69 130 ########################################################################
jpayne@69 131 ## Constants
jpayne@69 132 ########################################################################
jpayne@69 133
jpayne@69 134 cdef int MAX_POS = (1 << 31) - 1
jpayne@69 135 cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
jpayne@69 136 cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
jpayne@69 137 cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
jpayne@69 138
jpayne@69 139
jpayne@69 140 ########################################################################
jpayne@69 141 ########################################################################
jpayne@69 142 ## Python 3 compatibility functions
jpayne@69 143 ########################################################################
jpayne@69 144
jpayne@69 145 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
jpayne@69 146 from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes
jpayne@69 147
jpayne@69 148
jpayne@69 149 ########################################################################
jpayne@69 150 ########################################################################
jpayne@69 151 ## Sentinel object
jpayne@69 152 ########################################################################
jpayne@69 153
jpayne@69 154 cdef object _nothing = object()
jpayne@69 155
jpayne@69 156 ########################################################################
jpayne@69 157 ########################################################################
jpayne@69 158 ## VCF/BCF string intern system
jpayne@69 159 ########################################################################
jpayne@69 160
jpayne@69 161 cdef dict bcf_str_cache = {}
jpayne@69 162
jpayne@69 163 cdef inline bcf_str_cache_get_charptr(const char* s):
jpayne@69 164 if s == NULL:
jpayne@69 165 return None
jpayne@69 166
jpayne@69 167 cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
jpayne@69 168 if pystr:
jpayne@69 169 return <object>pystr
jpayne@69 170
jpayne@69 171 val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
jpayne@69 172
jpayne@69 173 PyDict_SetItemString(bcf_str_cache, s, val)
jpayne@69 174
jpayne@69 175 return val
jpayne@69 176
jpayne@69 177
jpayne@69 178 ########################################################################
jpayne@69 179 ########################################################################
jpayne@69 180 ## Genotype math
jpayne@69 181 ########################################################################
jpayne@69 182
jpayne@69 183 cdef int comb(int n, int k) except -1:
jpayne@69 184 """Return binomial coefficient: n choose k
jpayne@69 185
jpayne@69 186 >>> comb(5, 1)
jpayne@69 187 5
jpayne@69 188 >>> comb(5, 2)
jpayne@69 189 10
jpayne@69 190 >>> comb(2, 2)
jpayne@69 191 1
jpayne@69 192 >>> comb(100, 2)
jpayne@69 193 4950
jpayne@69 194 """
jpayne@69 195 if k > n:
jpayne@69 196 return 0
jpayne@69 197 elif k == n:
jpayne@69 198 return 1
jpayne@69 199 elif k > n // 2:
jpayne@69 200 k = n - k
jpayne@69 201
jpayne@69 202 cdef d, result
jpayne@69 203
jpayne@69 204 d = result = n - k + 1
jpayne@69 205 for i in range(2, k + 1):
jpayne@69 206 d += 1
jpayne@69 207 result *= d
jpayne@69 208 result //= i
jpayne@69 209 return result
jpayne@69 210
jpayne@69 211
jpayne@69 212 cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
jpayne@69 213 """Return the count of genotypes expected for the given ploidy and number of alleles.
jpayne@69 214
jpayne@69 215 >>> bcf_geno_combinations(1, 2)
jpayne@69 216 2
jpayne@69 217 >>> bcf_geno_combinations(2, 2)
jpayne@69 218 3
jpayne@69 219 >>> bcf_geno_combinations(2, 3)
jpayne@69 220 6
jpayne@69 221 >>> bcf_geno_combinations(3, 2)
jpayne@69 222 4
jpayne@69 223 """
jpayne@69 224 return comb(alleles + ploidy - 1, ploidy)
jpayne@69 225
jpayne@69 226
jpayne@69 227 ########################################################################
jpayne@69 228 ########################################################################
jpayne@69 229 ## Low level type conversion helpers
jpayne@69 230 ########################################################################
jpayne@69 231
jpayne@69 232
jpayne@69 233 cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
jpayne@69 234 return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)
jpayne@69 235
jpayne@69 236
jpayne@69 237 cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
jpayne@69 238 return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0
jpayne@69 239
jpayne@69 240
jpayne@69 241 cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:
jpayne@69 242
jpayne@69 243 if sample < 0:
jpayne@69 244 raise ValueError('genotype is only valid as a format field')
jpayne@69 245
jpayne@69 246 cdef int32_t *gt_arr = NULL
jpayne@69 247 cdef int ngt = 0
jpayne@69 248 ngt = bcf_get_genotypes(hdr, rec, &gt_arr, &ngt)
jpayne@69 249
jpayne@69 250 if ngt <= 0 or not gt_arr:
jpayne@69 251 return 0
jpayne@69 252
jpayne@69 253 assert ngt % rec.n_sample == 0
jpayne@69 254 cdef int max_ploidy = ngt // rec.n_sample
jpayne@69 255 cdef int32_t *gt = gt_arr + sample * max_ploidy
jpayne@69 256 cdef int ploidy = 0
jpayne@69 257
jpayne@69 258 while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
jpayne@69 259 gt += 1
jpayne@69 260 ploidy += 1
jpayne@69 261
jpayne@69 262 free(<void*>gt_arr)
jpayne@69 263
jpayne@69 264 return bcf_geno_combinations(ploidy, rec.n_allele)
jpayne@69 265
jpayne@69 266
jpayne@69 267 cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
jpayne@69 268 if not a:
jpayne@69 269 return None
jpayne@69 270 try:
jpayne@69 271 return tuple(charptr_to_str(a[i]) for i in range(n))
jpayne@69 272 finally:
jpayne@69 273 if free_after and a:
jpayne@69 274 free(a)
jpayne@69 275
jpayne@69 276
jpayne@69 277 cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
jpayne@69 278 cdef char *datac
jpayne@69 279 cdef int8_t *data8
jpayne@69 280 cdef int16_t *data16
jpayne@69 281 cdef int32_t *data32
jpayne@69 282 cdef float *dataf
jpayne@69 283 cdef int i
jpayne@69 284 cdef bytes b
jpayne@69 285
jpayne@69 286 if not data or n <= 0:
jpayne@69 287 return None
jpayne@69 288
jpayne@69 289 if type == BCF_BT_CHAR:
jpayne@69 290 datac = <char *>data
jpayne@69 291
jpayne@69 292 if not n:
jpayne@69 293 value = ()
jpayne@69 294 else:
jpayne@69 295 # Check if at least one null terminator is present
jpayne@69 296 if datac[n-1] == bcf_str_vector_end:
jpayne@69 297 # If so, create a string up to the first null terminator
jpayne@69 298 b = datac
jpayne@69 299 else:
jpayne@69 300 # Otherwise, copy the entire block
jpayne@69 301 b = datac[:n]
jpayne@69 302 value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
jpayne@69 303 else:
jpayne@69 304 value = []
jpayne@69 305 if type == BCF_BT_INT8:
jpayne@69 306 data8 = <int8_t *>data
jpayne@69 307 for i in range(n):
jpayne@69 308 if data8[i] == bcf_int8_vector_end:
jpayne@69 309 break
jpayne@69 310 value.append(data8[i] if data8[i] != bcf_int8_missing else None)
jpayne@69 311 elif type == BCF_BT_INT16:
jpayne@69 312 data16 = <int16_t *>data
jpayne@69 313 for i in range(n):
jpayne@69 314 if data16[i] == bcf_int16_vector_end:
jpayne@69 315 break
jpayne@69 316 value.append(data16[i] if data16[i] != bcf_int16_missing else None)
jpayne@69 317 elif type == BCF_BT_INT32:
jpayne@69 318 data32 = <int32_t *>data
jpayne@69 319 for i in range(n):
jpayne@69 320 if data32[i] == bcf_int32_vector_end:
jpayne@69 321 break
jpayne@69 322 value.append(data32[i] if data32[i] != bcf_int32_missing else None)
jpayne@69 323 elif type == BCF_BT_FLOAT:
jpayne@69 324 dataf = <float *>data
jpayne@69 325 for i in range(n):
jpayne@69 326 if bcf_float_is_vector_end(dataf[i]):
jpayne@69 327 break
jpayne@69 328 value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
jpayne@69 329 else:
jpayne@69 330 raise TypeError('unsupported info type code')
jpayne@69 331
jpayne@69 332 # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
jpayne@69 333 if not value:
jpayne@69 334 if scalar:
jpayne@69 335 value = None
jpayne@69 336 elif count <= 0:
jpayne@69 337 value = ()
jpayne@69 338 else:
jpayne@69 339 value = (None,)*count
jpayne@69 340 elif scalar and len(value) == 1:
jpayne@69 341 value = value[0]
jpayne@69 342 else:
jpayne@69 343 value = tuple(value)
jpayne@69 344
jpayne@69 345 return value
jpayne@69 346
jpayne@69 347
jpayne@69 348 cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
jpayne@69 349 cdef char *datac
jpayne@69 350 cdef int8_t *data8
jpayne@69 351 cdef int16_t *data16
jpayne@69 352 cdef int32_t *data32
jpayne@69 353 cdef float *dataf
jpayne@69 354 cdef ssize_t i, value_count = len(values)
jpayne@69 355
jpayne@69 356 assert value_count <= n
jpayne@69 357
jpayne@69 358 if bt_type == BCF_BT_CHAR:
jpayne@69 359 if not isinstance(values, (str, bytes)):
jpayne@69 360 values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
jpayne@69 361 value_count = len(values)
jpayne@69 362 assert value_count <= n
jpayne@69 363 datac = <char *>data
jpayne@69 364 memcpy(datac, <char *>values, value_count)
jpayne@69 365 for i in range(value_count, n):
jpayne@69 366 datac[i] = 0
jpayne@69 367 elif bt_type == BCF_BT_INT8:
jpayne@69 368 datai8 = <int8_t *>data
jpayne@69 369 for i in range(value_count):
jpayne@69 370 val = values[i]
jpayne@69 371 datai8[i] = val if val is not None else bcf_int8_missing
jpayne@69 372 for i in range(value_count, n):
jpayne@69 373 datai8[i] = bcf_int8_vector_end
jpayne@69 374 elif bt_type == BCF_BT_INT16:
jpayne@69 375 datai16 = <int16_t *>data
jpayne@69 376 for i in range(value_count):
jpayne@69 377 val = values[i]
jpayne@69 378 datai16[i] = val if val is not None else bcf_int16_missing
jpayne@69 379 for i in range(value_count, n):
jpayne@69 380 datai16[i] = bcf_int16_vector_end
jpayne@69 381 elif bt_type == BCF_BT_INT32:
jpayne@69 382 datai32 = <int32_t *>data
jpayne@69 383 for i in range(value_count):
jpayne@69 384 val = values[i]
jpayne@69 385 datai32[i] = val if val is not None else bcf_int32_missing
jpayne@69 386 for i in range(value_count, n):
jpayne@69 387 datai32[i] = bcf_int32_vector_end
jpayne@69 388 elif bt_type == BCF_BT_FLOAT:
jpayne@69 389 dataf = <float *>data
jpayne@69 390 for i in range(value_count):
jpayne@69 391 val = values[i]
jpayne@69 392 if val is None:
jpayne@69 393 bcf_float_set(dataf + i, bcf_float_missing)
jpayne@69 394 else:
jpayne@69 395 dataf[i] = val
jpayne@69 396 for i in range(value_count, n):
jpayne@69 397 bcf_float_set(dataf + i, bcf_float_vector_end)
jpayne@69 398 else:
jpayne@69 399 raise TypeError('unsupported type')
jpayne@69 400
jpayne@69 401
jpayne@69 402 cdef bcf_empty_array(int type, ssize_t n, int vlen):
jpayne@69 403 cdef char *datac
jpayne@69 404 cdef int32_t *data32
jpayne@69 405 cdef float *dataf
jpayne@69 406 cdef int i
jpayne@69 407
jpayne@69 408 if n <= 0:
jpayne@69 409 raise ValueError('Cannot create empty array')
jpayne@69 410
jpayne@69 411 if type == BCF_HT_STR:
jpayne@69 412 value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
jpayne@69 413 datac = <char *>value
jpayne@69 414 for i in range(n):
jpayne@69 415 datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
jpayne@69 416 elif type == BCF_HT_INT:
jpayne@69 417 value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
jpayne@69 418 data32 = <int32_t *><char *>value
jpayne@69 419 for i in range(n):
jpayne@69 420 data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@69 421 elif type == BCF_HT_REAL:
jpayne@69 422 value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
jpayne@69 423 dataf = <float *><char *>value
jpayne@69 424 for i in range(n):
jpayne@69 425 bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
jpayne@69 426 else:
jpayne@69 427 raise TypeError('unsupported header type code')
jpayne@69 428
jpayne@69 429 return value
jpayne@69 430
jpayne@69 431
jpayne@69 432 cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
jpayne@69 433 void *dst_data, int dst_type, size_t dst_values,
jpayne@69 434 int vlen):
jpayne@69 435 """copy data from src to dest where the size of the elements (src_type/dst_type) differ
jpayne@69 436 as well as the number of elements (src_values/dst_values).
jpayne@69 437 """
jpayne@69 438
jpayne@69 439 cdef char *src_datac
jpayne@69 440 cdef char *dst_datac
jpayne@69 441 cdef int8_t *src_datai8
jpayne@69 442 cdef int16_t *src_datai16
jpayne@69 443 cdef int32_t *src_datai32
jpayne@69 444 cdef int32_t *dst_datai
jpayne@69 445 cdef float *src_dataf
jpayne@69 446 cdef float *dst_dataf
jpayne@69 447 cdef ssize_t src_size, dst_size, i, j
jpayne@69 448 cdef int val
jpayne@69 449
jpayne@69 450 if src_values > dst_values:
jpayne@69 451 raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
jpayne@69 452
jpayne@69 453 if src_type == dst_type == BCF_BT_CHAR:
jpayne@69 454 src_datac = <char *>src_data
jpayne@69 455 dst_datac = <char *>dst_data
jpayne@69 456 memcpy(dst_datac, src_datac, src_values)
jpayne@69 457 for i in range(src_values, dst_values):
jpayne@69 458 dst_datac[i] = 0
jpayne@69 459 elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
jpayne@69 460 src_datai8 = <int8_t *>src_data
jpayne@69 461 dst_datai = <int32_t *>dst_data
jpayne@69 462 for i in range(src_values):
jpayne@69 463 val = src_datai8[i]
jpayne@69 464 if val == bcf_int8_missing:
jpayne@69 465 val = bcf_int32_missing
jpayne@69 466 elif val == bcf_int8_vector_end:
jpayne@69 467 val = bcf_int32_vector_end
jpayne@69 468 dst_datai[i] = val
jpayne@69 469 for i in range(src_values, dst_values):
jpayne@69 470 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@69 471 elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
jpayne@69 472 src_datai16 = <int16_t *>src_data
jpayne@69 473 dst_datai = <int32_t *>dst_data
jpayne@69 474 for i in range(src_values):
jpayne@69 475 val = src_datai16[i]
jpayne@69 476 if val == bcf_int16_missing:
jpayne@69 477 val = bcf_int32_missing
jpayne@69 478 elif val == bcf_int16_vector_end:
jpayne@69 479 val = bcf_int32_vector_end
jpayne@69 480 dst_datai[i] = val
jpayne@69 481 for i in range(src_values, dst_values):
jpayne@69 482 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@69 483 elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
jpayne@69 484 src_datai32 = <int32_t *>src_data
jpayne@69 485 dst_datai = <int32_t *>dst_data
jpayne@69 486 for i in range(src_values):
jpayne@69 487 dst_datai[i] = src_datai32[i]
jpayne@69 488 for i in range(src_values, dst_values):
jpayne@69 489 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@69 490 elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
jpayne@69 491 src_dataf = <float *>src_data
jpayne@69 492 dst_dataf = <float *>dst_data
jpayne@69 493 for i in range(src_values):
jpayne@69 494 dst_dataf[i] = src_dataf[i]
jpayne@69 495 for i in range(src_values, dst_values):
jpayne@69 496 bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
jpayne@69 497 else:
jpayne@69 498 raise TypeError('unsupported types')
jpayne@69 499
jpayne@69 500
jpayne@69 501 cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
jpayne@69 502 if record is None:
jpayne@69 503 raise ValueError('record must not be None')
jpayne@69 504
jpayne@69 505 cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@69 506 cdef bcf1_t *r = record.ptr
jpayne@69 507
jpayne@69 508 if not check_header_id(hdr, hl_type, id):
jpayne@69 509 raise ValueError('Invalid header')
jpayne@69 510
jpayne@69 511 cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
jpayne@69 512 cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
jpayne@69 513
jpayne@69 514 scalar[0] = 0
jpayne@69 515
jpayne@69 516 if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
jpayne@69 517 count[0] = number
jpayne@69 518 elif length == BCF_VL_FIXED:
jpayne@69 519 if number == 1:
jpayne@69 520 scalar[0] = 1
jpayne@69 521 count[0] = number
jpayne@69 522 elif length == BCF_VL_R:
jpayne@69 523 count[0] = r.n_allele
jpayne@69 524 elif length == BCF_VL_A:
jpayne@69 525 count[0] = r.n_allele - 1
jpayne@69 526 elif length == BCF_VL_G:
jpayne@69 527 count[0] = bcf_genotype_count(hdr, r, sample)
jpayne@69 528 elif length == BCF_VL_VAR:
jpayne@69 529 count[0] = -1
jpayne@69 530 else:
jpayne@69 531 raise ValueError('Unknown format length')
jpayne@69 532
jpayne@69 533
jpayne@69 534 cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
jpayne@69 535 if record is None:
jpayne@69 536 raise ValueError('record must not be None')
jpayne@69 537
jpayne@69 538 cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@69 539
jpayne@69 540 cdef char *s
jpayne@69 541 cdef ssize_t count
jpayne@69 542 cdef int scalar
jpayne@69 543
jpayne@69 544 bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)
jpayne@69 545
jpayne@69 546 if z.len == 0:
jpayne@69 547 if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
jpayne@69 548 value = True
jpayne@69 549 elif scalar:
jpayne@69 550 value = None
jpayne@69 551 else:
jpayne@69 552 value = ()
jpayne@69 553 elif z.len == 1:
jpayne@69 554 if z.type == BCF_BT_INT8:
jpayne@69 555 value = z.v1.i if z.v1.i != bcf_int8_missing else None
jpayne@69 556 elif z.type == BCF_BT_INT16:
jpayne@69 557 value = z.v1.i if z.v1.i != bcf_int16_missing else None
jpayne@69 558 elif z.type == BCF_BT_INT32:
jpayne@69 559 value = z.v1.i if z.v1.i != bcf_int32_missing else None
jpayne@69 560 elif z.type == BCF_BT_FLOAT:
jpayne@69 561 value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
jpayne@69 562 elif z.type == BCF_BT_CHAR:
jpayne@69 563 value = force_str(chr(z.v1.i))
jpayne@69 564 else:
jpayne@69 565 raise TypeError('unsupported info type code')
jpayne@69 566
jpayne@69 567 if not scalar and value != ():
jpayne@69 568 value = (value,)
jpayne@69 569 else:
jpayne@69 570 value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
jpayne@69 571
jpayne@69 572 return value
jpayne@69 573
jpayne@69 574
jpayne@69 575 cdef object bcf_check_values(VariantRecord record, value, int sample,
jpayne@69 576 int hl_type, int ht_type,
jpayne@69 577 int id, int bt_type, ssize_t bt_len,
jpayne@69 578 ssize_t *value_count, int *scalar, int *realloc):
jpayne@69 579
jpayne@69 580 if record is None:
jpayne@69 581 raise ValueError('record must not be None')
jpayne@69 582
jpayne@69 583 bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)
jpayne@69 584
jpayne@69 585 # Validate values now that we know the type and size
jpayne@69 586 values = (value,) if not isinstance(value, (list, tuple)) else value
jpayne@69 587
jpayne@69 588 # Validate values now that we know the type and size
jpayne@69 589 if ht_type == BCF_HT_FLAG:
jpayne@69 590 value_count[0] = 1
jpayne@69 591 elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
jpayne@69 592 # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
jpayne@69 593 value_count[0] = -1
jpayne@69 594
jpayne@69 595 cdef int given = len(values)
jpayne@69 596 if value_count[0] != -1 and value_count[0] != given:
jpayne@69 597 if scalar[0]:
jpayne@69 598 raise TypeError('value expected to be scalar, given len={}'.format(given))
jpayne@69 599 else:
jpayne@69 600 raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
jpayne@69 601
jpayne@69 602 if ht_type == BCF_HT_REAL:
jpayne@69 603 for v in values:
jpayne@69 604 if not(v is None or isinstance(v, (float, int))):
jpayne@69 605 raise TypeError('invalid value for Float format')
jpayne@69 606 elif ht_type == BCF_HT_INT:
jpayne@69 607 for v in values:
jpayne@69 608 if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
jpayne@69 609 raise TypeError('invalid value for Integer format')
jpayne@69 610 for v in values:
jpayne@69 611 if not(v is None or bcf_int32_missing < v <= INT32_MAX):
jpayne@69 612 raise ValueError('Integer value too small/large to store in VCF/BCF')
jpayne@69 613 elif ht_type == BCF_HT_STR:
jpayne@69 614 values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
jpayne@69 615 elif ht_type == BCF_HT_FLAG:
jpayne@69 616 if values[0] not in (True, False, None, 1, 0):
jpayne@69 617 raise ValueError('Flag values must be: True, False, None, 1, 0')
jpayne@69 618 else:
jpayne@69 619 raise TypeError('unsupported type')
jpayne@69 620
jpayne@69 621 realloc[0] = 0
jpayne@69 622 if len(values) <= 1 and hl_type == BCF_HL_INFO:
jpayne@69 623 realloc[0] = 0
jpayne@69 624 elif len(values) > bt_len:
jpayne@69 625 realloc[0] = 1
jpayne@69 626 elif bt_type == BCF_BT_INT8:
jpayne@69 627 for v in values:
jpayne@69 628 if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
jpayne@69 629 realloc[0] = 1
jpayne@69 630 break
jpayne@69 631 elif bt_type == BCF_BT_INT16:
jpayne@69 632 for v in values:
jpayne@69 633 if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
jpayne@69 634 realloc[0] = 1
jpayne@69 635 break
jpayne@69 636
jpayne@69 637 return values
jpayne@69 638
jpayne@69 639
jpayne@69 640 cdef bcf_encode_alleles(VariantRecord record, values):
jpayne@69 641 if record is None:
jpayne@69 642 raise ValueError('record must not be None')
jpayne@69 643
jpayne@69 644 cdef bcf1_t *r = record.ptr
jpayne@69 645 cdef int32_t nalleles = r.n_allele
jpayne@69 646 cdef list gt_values = []
jpayne@69 647 cdef char *s
jpayne@69 648 cdef int i
jpayne@69 649
jpayne@69 650 if values is None:
jpayne@69 651 return ()
jpayne@69 652
jpayne@69 653 if not isinstance(values, (list, tuple)):
jpayne@69 654 values = (values,)
jpayne@69 655
jpayne@69 656 for value in values:
jpayne@69 657 if value is None:
jpayne@69 658 gt_values.append(bcf_gt_missing)
jpayne@69 659 elif isinstance(value, (str, bytes)):
jpayne@69 660 bvalue = force_bytes(value)
jpayne@69 661 s = bvalue
jpayne@69 662 for i in range(r.n_allele):
jpayne@69 663 if strcmp(r.d.allele[i], s) != 0:
jpayne@69 664 gt_values.append(bcf_gt_unphased(i))
jpayne@69 665 break
jpayne@69 666 else:
jpayne@69 667 raise ValueError('Unknown allele')
jpayne@69 668 else:
jpayne@69 669 i = value
jpayne@69 670 if not (0 <= i < nalleles):
jpayne@69 671 raise ValueError('Invalid allele index')
jpayne@69 672 gt_values.append(bcf_gt_unphased(i))
jpayne@69 673
jpayne@69 674 return gt_values
jpayne@69 675
jpayne@69 676
jpayne@69 677 cdef bcf_info_set_value(VariantRecord record, key, value):
jpayne@69 678 if record is None:
jpayne@69 679 raise ValueError('record must not be None')
jpayne@69 680
jpayne@69 681 cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@69 682 cdef bcf1_t *r = record.ptr
jpayne@69 683 cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
jpayne@69 684 cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
jpayne@69 685
jpayne@69 686 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 687 raise ValueError('Error unpacking VariantRecord')
jpayne@69 688
jpayne@69 689 cdef bytes bkey = force_bytes(key)
jpayne@69 690 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@69 691
jpayne@69 692 if info:
jpayne@69 693 info_id = info.key
jpayne@69 694 else:
jpayne@69 695 info_id = bcf_header_get_info_id(hdr, bkey)
jpayne@69 696
jpayne@69 697 if info_id < 0:
jpayne@69 698 raise KeyError('unknown INFO: {}'.format(key))
jpayne@69 699
jpayne@69 700 if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@69 701 raise ValueError('Invalid header')
jpayne@69 702
jpayne@69 703 info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
jpayne@69 704 values = bcf_check_values(record, value, -1,
jpayne@69 705 BCF_HL_INFO, info_type, info_id,
jpayne@69 706 info.type if info else -1,
jpayne@69 707 info.len if info else -1,
jpayne@69 708 &value_count, &scalar, &realloc)
jpayne@69 709
jpayne@69 710 if info_type == BCF_HT_FLAG:
jpayne@69 711 if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
jpayne@69 712 raise ValueError('Unable to update INFO values')
jpayne@69 713 return
jpayne@69 714
jpayne@69 715 vlen = value_count < 0
jpayne@69 716 value_count = len(values)
jpayne@69 717
jpayne@69 718 # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
jpayne@69 719 # If we can, write updated values to existing allocated storage
jpayne@69 720 if 0 and info and not realloc:
jpayne@69 721 r.d.shared_dirty |= BCF1_DIRTY_INF
jpayne@69 722
jpayne@69 723 if value_count == 0:
jpayne@69 724 info.len = 0
jpayne@69 725 if not info.vptr:
jpayne@69 726 info.vptr = <uint8_t *>&info.v1.i
jpayne@69 727
jpayne@69 728 elif value_count == 1:
jpayne@69 729 # FIXME: Check if need to free vptr if info.len > 0?
jpayne@69 730 if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
jpayne@69 731 bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
jpayne@69 732 elif info.type == BCF_BT_FLOAT:
jpayne@69 733 bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
jpayne@69 734 else:
jpayne@69 735 raise TypeError('unsupported info type code')
jpayne@69 736
jpayne@69 737 info.len = 1
jpayne@69 738 if not info.vptr:
jpayne@69 739 info.vptr = <uint8_t *>&info.v1.i
jpayne@69 740 else:
jpayne@69 741 bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
jpayne@69 742
jpayne@69 743 return
jpayne@69 744
jpayne@69 745 alloc_len = max(1, value_count)
jpayne@69 746 if info and info.len > alloc_len:
jpayne@69 747 alloc_len = info.len
jpayne@69 748
jpayne@69 749 new_values = bcf_empty_array(info_type, alloc_len, vlen)
jpayne@69 750 cdef char *valp = <char *>new_values
jpayne@69 751
jpayne@69 752 if info_type == BCF_HT_INT:
jpayne@69 753 dst_type = BCF_BT_INT32
jpayne@69 754 elif info_type == BCF_HT_REAL:
jpayne@69 755 dst_type = BCF_BT_FLOAT
jpayne@69 756 elif info_type == BCF_HT_STR:
jpayne@69 757 dst_type = BCF_BT_CHAR
jpayne@69 758 else:
jpayne@69 759 raise ValueError('Unsupported INFO type')
jpayne@69 760
jpayne@69 761 bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
jpayne@69 762
jpayne@69 763 if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
jpayne@69 764 raise ValueError('Unable to update INFO values')
jpayne@69 765
jpayne@69 766
jpayne@69 767 cdef bcf_info_del_value(VariantRecord record, key):
jpayne@69 768 if record is None:
jpayne@69 769 raise ValueError('record must not be None')
jpayne@69 770
jpayne@69 771 cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@69 772 cdef bcf1_t *r = record.ptr
jpayne@69 773 cdef ssize_t value_count
jpayne@69 774 cdef int scalar
jpayne@69 775
jpayne@69 776 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 777 raise ValueError('Error unpacking VariantRecord')
jpayne@69 778
jpayne@69 779 cdef bytes bkey = force_bytes(key)
jpayne@69 780 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@69 781
jpayne@69 782 if not info:
jpayne@69 783 raise KeyError(key)
jpayne@69 784
jpayne@69 785 bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)
jpayne@69 786
jpayne@69 787 if value_count <= 0:
jpayne@69 788 null_value = ()
jpayne@69 789 elif scalar:
jpayne@69 790 null_value = None
jpayne@69 791 else:
jpayne@69 792 null_value = (None,)*value_count
jpayne@69 793
jpayne@69 794 bcf_info_set_value(record, bkey, null_value)
jpayne@69 795
jpayne@69 796
jpayne@69 797 cdef bcf_format_get_value(VariantRecordSample sample, key):
jpayne@69 798 if sample is None:
jpayne@69 799 raise ValueError('sample must not be None')
jpayne@69 800
jpayne@69 801 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@69 802 cdef bcf1_t *r = sample.record.ptr
jpayne@69 803 cdef ssize_t count
jpayne@69 804 cdef int scalar
jpayne@69 805
jpayne@69 806 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@69 807 raise ValueError('Error unpacking VariantRecord')
jpayne@69 808
jpayne@69 809 cdef bytes bkey = force_bytes(key)
jpayne@69 810 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@69 811
jpayne@69 812 if not fmt or not fmt.p:
jpayne@69 813 raise KeyError('invalid FORMAT: {}'.format(key))
jpayne@69 814
jpayne@69 815 if is_gt_fmt(hdr, fmt.id):
jpayne@69 816 return bcf_format_get_allele_indices(sample)
jpayne@69 817
jpayne@69 818 bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)
jpayne@69 819
jpayne@69 820 if fmt.p and fmt.n and fmt.size:
jpayne@69 821 return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
jpayne@69 822 elif scalar:
jpayne@69 823 return None
jpayne@69 824 elif count <= 0:
jpayne@69 825 return ()
jpayne@69 826 else:
jpayne@69 827 return (None,)*count
jpayne@69 828
jpayne@69 829
jpayne@69 830 cdef bcf_format_set_value(VariantRecordSample sample, key, value):
jpayne@69 831 if sample is None:
jpayne@69 832 raise ValueError('sample must not be None')
jpayne@69 833
jpayne@69 834 if key == 'phased':
jpayne@69 835 sample.phased = bool(value)
jpayne@69 836 return
jpayne@69 837
jpayne@69 838 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@69 839 cdef bcf1_t *r = sample.record.ptr
jpayne@69 840 cdef int fmt_id
jpayne@69 841 cdef vdict_t *d
jpayne@69 842 cdef khiter_t k
jpayne@69 843 cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
jpayne@69 844 cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size
jpayne@69 845
jpayne@69 846 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@69 847 raise ValueError('Error unpacking VariantRecord')
jpayne@69 848
jpayne@69 849 cdef bytes bkey = force_bytes(key)
jpayne@69 850 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@69 851
jpayne@69 852 if fmt:
jpayne@69 853 fmt_id = fmt.id
jpayne@69 854 else:
jpayne@69 855 d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@69 856 k = kh_get_vdict(d, bkey)
jpayne@69 857
jpayne@69 858 if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
jpayne@69 859 raise KeyError('unknown format: {}'.format(key))
jpayne@69 860
jpayne@69 861 fmt_id = kh_val_vdict(d, k).id
jpayne@69 862
jpayne@69 863 if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
jpayne@69 864 raise ValueError('Invalid header')
jpayne@69 865
jpayne@69 866 fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
jpayne@69 867
jpayne@69 868 if fmt_type == BCF_HT_FLAG:
jpayne@69 869 raise ValueError('Flag types are not allowed on FORMATs')
jpayne@69 870
jpayne@69 871 if is_gt_fmt(hdr, fmt_id):
jpayne@69 872 value = bcf_encode_alleles(sample.record, value)
jpayne@69 873 # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
jpayne@69 874 fmt_type = BCF_HT_INT
jpayne@69 875
jpayne@69 876 values = bcf_check_values(sample.record, value, sample.index,
jpayne@69 877 BCF_HL_FMT, fmt_type, fmt_id,
jpayne@69 878 fmt.type if fmt else -1,
jpayne@69 879 fmt.n if fmt else -1,
jpayne@69 880 &value_count, &scalar, &realloc)
jpayne@69 881 vlen = value_count < 0
jpayne@69 882 value_count = len(values)
jpayne@69 883
jpayne@69 884 # If we can, write updated values to existing allocated storage.
jpayne@69 885 if fmt and not realloc:
jpayne@69 886 r.d.indiv_dirty = 1
jpayne@69 887 bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
jpayne@69 888 return
jpayne@69 889
jpayne@69 890 alloc_len = max(1, value_count)
jpayne@69 891 if fmt and fmt.n > alloc_len:
jpayne@69 892 alloc_len = fmt.n
jpayne@69 893
jpayne@69 894 nsamples = r.n_sample
jpayne@69 895 new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
jpayne@69 896 cdef char *new_values_p = <char *>new_values
jpayne@69 897
jpayne@69 898 if fmt_type == BCF_HT_INT:
jpayne@69 899 dst_type = BCF_BT_INT32
jpayne@69 900 dst_size = sizeof(int32_t) * alloc_len
jpayne@69 901 elif fmt_type == BCF_HT_REAL:
jpayne@69 902 dst_type = BCF_BT_FLOAT
jpayne@69 903 dst_size = sizeof(float) * alloc_len
jpayne@69 904 elif fmt_type == BCF_HT_STR:
jpayne@69 905 dst_type = BCF_BT_CHAR
jpayne@69 906 dst_size = sizeof(char) * alloc_len
jpayne@69 907 else:
jpayne@69 908 raise ValueError('Unsupported FORMAT type')
jpayne@69 909
jpayne@69 910 if fmt and nsamples > 1:
jpayne@69 911 for i in range(nsamples):
jpayne@69 912 bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
jpayne@69 913 new_values_p + i * dst_size, dst_type, alloc_len,
jpayne@69 914 vlen)
jpayne@69 915
jpayne@69 916 bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)
jpayne@69 917
jpayne@69 918 if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
jpayne@69 919 raise ValueError('Unable to update format values')
jpayne@69 920
jpayne@69 921
jpayne@69 922 cdef bcf_format_del_value(VariantRecordSample sample, key):
jpayne@69 923 if sample is None:
jpayne@69 924 raise ValueError('sample must not be None')
jpayne@69 925
jpayne@69 926 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@69 927 cdef bcf1_t *r = sample.record.ptr
jpayne@69 928 cdef ssize_t value_count
jpayne@69 929 cdef int scalar
jpayne@69 930
jpayne@69 931 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@69 932 raise ValueError('Error unpacking VariantRecord')
jpayne@69 933
jpayne@69 934 cdef bytes bkey = force_bytes(key)
jpayne@69 935 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@69 936
jpayne@69 937 if not fmt or not fmt.p:
jpayne@69 938 raise KeyError(key)
jpayne@69 939
jpayne@69 940 bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index)
jpayne@69 941
jpayne@69 942 if value_count <= 0:
jpayne@69 943 null_value = ()
jpayne@69 944 elif scalar:
jpayne@69 945 null_value = None
jpayne@69 946 else:
jpayne@69 947 null_value = (None,)*value_count
jpayne@69 948
jpayne@69 949 bcf_format_set_value(sample, bkey, null_value)
jpayne@69 950
jpayne@69 951
jpayne@69 952 cdef bcf_format_get_allele_indices(VariantRecordSample sample):
jpayne@69 953 if sample is None:
jpayne@69 954 raise ValueError('sample must not be None')
jpayne@69 955
jpayne@69 956 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@69 957 cdef bcf1_t *r = sample.record.ptr
jpayne@69 958 cdef int32_t n = r.n_sample
jpayne@69 959
jpayne@69 960 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@69 961 raise ValueError('Error unpacking VariantRecord')
jpayne@69 962
jpayne@69 963 if sample.index < 0 or sample.index >= n or not r.n_fmt:
jpayne@69 964 return ()
jpayne@69 965
jpayne@69 966 cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@69 967 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@69 968
jpayne@69 969 if not gt0 or not fmt0.n:
jpayne@69 970 return ()
jpayne@69 971
jpayne@69 972 cdef int8_t *data8
jpayne@69 973 cdef int16_t *data16
jpayne@69 974 cdef int32_t *data32
jpayne@69 975 cdef int32_t a, nalleles = r.n_allele
jpayne@69 976 cdef list alleles = []
jpayne@69 977
jpayne@69 978 if fmt0.type == BCF_BT_INT8:
jpayne@69 979 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 980 for i in range(fmt0.n):
jpayne@69 981 if data8[i] == bcf_int8_vector_end:
jpayne@69 982 break
jpayne@69 983 elif data8[i] == bcf_gt_missing:
jpayne@69 984 a = -1
jpayne@69 985 else:
jpayne@69 986 a = bcf_gt_allele(data8[i])
jpayne@69 987 alleles.append(a if 0 <= a < nalleles else None)
jpayne@69 988 elif fmt0.type == BCF_BT_INT16:
jpayne@69 989 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 990 for i in range(fmt0.n):
jpayne@69 991 if data16[i] == bcf_int16_vector_end:
jpayne@69 992 break
jpayne@69 993 elif data16[i] == bcf_gt_missing:
jpayne@69 994 a = -1
jpayne@69 995 else:
jpayne@69 996 a = bcf_gt_allele(data16[i])
jpayne@69 997 alleles.append(a if 0 <= a < nalleles else None)
jpayne@69 998 elif fmt0.type == BCF_BT_INT32:
jpayne@69 999 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 1000 for i in range(fmt0.n):
jpayne@69 1001 if data32[i] == bcf_int32_vector_end:
jpayne@69 1002 break
jpayne@69 1003 elif data32[i] == bcf_gt_missing:
jpayne@69 1004 a = -1
jpayne@69 1005 else:
jpayne@69 1006 a = bcf_gt_allele(data32[i])
jpayne@69 1007 alleles.append(a if 0 <= a < nalleles else None)
jpayne@69 1008
jpayne@69 1009 return tuple(alleles)
jpayne@69 1010
jpayne@69 1011
jpayne@69 1012 cdef bcf_format_get_alleles(VariantRecordSample sample):
jpayne@69 1013 if sample is None:
jpayne@69 1014 raise ValueError('sample must not be None')
jpayne@69 1015
jpayne@69 1016 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@69 1017 cdef bcf1_t *r = sample.record.ptr
jpayne@69 1018 cdef int32_t nsamples = r.n_sample
jpayne@69 1019
jpayne@69 1020 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@69 1021 raise ValueError('Error unpacking VariantRecord')
jpayne@69 1022
jpayne@69 1023 cdef int32_t nalleles = r.n_allele
jpayne@69 1024
jpayne@69 1025 if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
jpayne@69 1026 return ()
jpayne@69 1027
jpayne@69 1028 cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@69 1029 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@69 1030
jpayne@69 1031 if not gt0 or not fmt0.n:
jpayne@69 1032 return ()
jpayne@69 1033
jpayne@69 1034 cdef int32_t a
jpayne@69 1035 cdef int8_t *data8
jpayne@69 1036 cdef int16_t *data16
jpayne@69 1037 cdef int32_t *data32
jpayne@69 1038 alleles = []
jpayne@69 1039 if fmt0.type == BCF_BT_INT8:
jpayne@69 1040 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 1041 for i in range(fmt0.n):
jpayne@69 1042 if data8[i] == bcf_int8_vector_end:
jpayne@69 1043 break
jpayne@69 1044 a = bcf_gt_allele(data8[i])
jpayne@69 1045 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
jpayne@69 1046 elif fmt0.type == BCF_BT_INT16:
jpayne@69 1047 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 1048 for i in range(fmt0.n):
jpayne@69 1049 if data16[i] == bcf_int16_vector_end:
jpayne@69 1050 break
jpayne@69 1051 a = bcf_gt_allele(data16[i])
jpayne@69 1052 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
jpayne@69 1053 elif fmt0.type == BCF_BT_INT32:
jpayne@69 1054 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 1055 for i in range(fmt0.n):
jpayne@69 1056 if data32[i] == bcf_int32_vector_end:
jpayne@69 1057 break
jpayne@69 1058 a = bcf_gt_allele(data32[i])
jpayne@69 1059 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
jpayne@69 1060 return tuple(alleles)
jpayne@69 1061
jpayne@69 1062
jpayne@69 1063 cdef bint bcf_sample_get_phased(VariantRecordSample sample):
jpayne@69 1064 if sample is None:
jpayne@69 1065 raise ValueError('sample must not be None')
jpayne@69 1066
jpayne@69 1067 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@69 1068 cdef bcf1_t *r = sample.record.ptr
jpayne@69 1069 cdef int32_t n = r.n_sample
jpayne@69 1070
jpayne@69 1071 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@69 1072 raise ValueError('Error unpacking VariantRecord')
jpayne@69 1073
jpayne@69 1074 if sample.index < 0 or sample.index >= n or not r.n_fmt:
jpayne@69 1075 return False
jpayne@69 1076
jpayne@69 1077 cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@69 1078 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@69 1079
jpayne@69 1080 if not gt0 or not fmt0.n:
jpayne@69 1081 return False
jpayne@69 1082
jpayne@69 1083 cdef int8_t *data8
jpayne@69 1084 cdef int16_t *data16
jpayne@69 1085 cdef int32_t *data32
jpayne@69 1086
jpayne@69 1087 cdef bint phased = False
jpayne@69 1088
jpayne@69 1089 if fmt0.type == BCF_BT_INT8:
jpayne@69 1090 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 1091 for i in range(fmt0.n):
jpayne@69 1092 if data8[i] == bcf_int8_vector_end:
jpayne@69 1093 break
jpayne@69 1094 elif data8[i] == bcf_int8_missing:
jpayne@69 1095 continue
jpayne@69 1096 elif i and not bcf_gt_is_phased(data8[i]):
jpayne@69 1097 return False
jpayne@69 1098 else:
jpayne@69 1099 phased = True
jpayne@69 1100 elif fmt0.type == BCF_BT_INT16:
jpayne@69 1101 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 1102 for i in range(fmt0.n):
jpayne@69 1103 if data16[i] == bcf_int16_vector_end:
jpayne@69 1104 break
jpayne@69 1105 elif data16[i] == bcf_int16_missing:
jpayne@69 1106 continue
jpayne@69 1107 elif i and not bcf_gt_is_phased(data16[i]):
jpayne@69 1108 return False
jpayne@69 1109 else:
jpayne@69 1110 phased = True
jpayne@69 1111 elif fmt0.type == BCF_BT_INT32:
jpayne@69 1112 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 1113 for i in range(fmt0.n):
jpayne@69 1114 if data32[i] == bcf_int32_vector_end:
jpayne@69 1115 break
jpayne@69 1116 elif data32[i] == bcf_int32_missing:
jpayne@69 1117 continue
jpayne@69 1118 elif i and not bcf_gt_is_phased(data32[i]):
jpayne@69 1119 return False
jpayne@69 1120 else:
jpayne@69 1121 phased = True
jpayne@69 1122
jpayne@69 1123 return phased
jpayne@69 1124
jpayne@69 1125
jpayne@69 1126 cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
jpayne@69 1127 if sample is None:
jpayne@69 1128 raise ValueError('sample must not be None')
jpayne@69 1129
jpayne@69 1130 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@69 1131 cdef bcf1_t *r = sample.record.ptr
jpayne@69 1132 cdef int32_t n = r.n_sample
jpayne@69 1133
jpayne@69 1134 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@69 1135 raise ValueError('Error unpacking VariantRecord')
jpayne@69 1136
jpayne@69 1137 if sample.index < 0 or sample.index >= n or not r.n_fmt:
jpayne@69 1138 return
jpayne@69 1139
jpayne@69 1140 cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@69 1141 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@69 1142
jpayne@69 1143 if not gt0 or not fmt0.n:
jpayne@69 1144 raise ValueError('Cannot set phased before genotype is set')
jpayne@69 1145
jpayne@69 1146 cdef int8_t *data8
jpayne@69 1147 cdef int16_t *data16
jpayne@69 1148 cdef int32_t *data32
jpayne@69 1149
jpayne@69 1150 if fmt0.type == BCF_BT_INT8:
jpayne@69 1151 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 1152 for i in range(fmt0.n):
jpayne@69 1153 if data8[i] == bcf_int8_vector_end:
jpayne@69 1154 break
jpayne@69 1155 elif data8[i] == bcf_int8_missing:
jpayne@69 1156 continue
jpayne@69 1157 elif i:
jpayne@69 1158 data8[i] = (data8[i] & 0xFE) | phased
jpayne@69 1159 elif fmt0.type == BCF_BT_INT16:
jpayne@69 1160 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 1161 for i in range(fmt0.n):
jpayne@69 1162 if data16[i] == bcf_int16_vector_end:
jpayne@69 1163 break
jpayne@69 1164 elif data16[i] == bcf_int16_missing:
jpayne@69 1165 continue
jpayne@69 1166 elif i:
jpayne@69 1167 data16[i] = (data16[i] & 0xFFFE) | phased
jpayne@69 1168 elif fmt0.type == BCF_BT_INT32:
jpayne@69 1169 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@69 1170 for i in range(fmt0.n):
jpayne@69 1171 if data32[i] == bcf_int32_vector_end:
jpayne@69 1172 break
jpayne@69 1173 elif data32[i] == bcf_int32_missing:
jpayne@69 1174 continue
jpayne@69 1175 elif i:
jpayne@69 1176 data32[i] = (data32[i] & 0xFFFFFFFE) | phased
jpayne@69 1177
jpayne@69 1178
jpayne@69 1179 cdef inline bcf_sync_end(VariantRecord record):
jpayne@69 1180 cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@69 1181 cdef bcf_info_t *info
jpayne@69 1182 cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
jpayne@69 1183 cdef int ref_len
jpayne@69 1184
jpayne@69 1185 # allow missing ref when instantiating a new record
jpayne@69 1186 if record.ref is not None:
jpayne@69 1187 ref_len = len(record.ref)
jpayne@69 1188 else:
jpayne@69 1189 ref_len = 0
jpayne@69 1190
jpayne@69 1191 # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
jpayne@69 1192 # Always keep END for symbolic alleles
jpayne@69 1193 if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len):
jpayne@69 1194 # If INFO/END is not defined in the header, it doesn't exist in the record
jpayne@69 1195 if end_id >= 0:
jpayne@69 1196 info = bcf_get_info(hdr, record.ptr, b'END')
jpayne@69 1197 if info and info.vptr:
jpayne@69 1198 if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
jpayne@69 1199 raise ValueError('Unable to delete END')
jpayne@69 1200 else:
jpayne@69 1201 # Create END header, if not present
jpayne@69 1202 if end_id < 0:
jpayne@69 1203 record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')
jpayne@69 1204
jpayne@69 1205 # Update to reflect stop position
jpayne@69 1206 bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)
jpayne@69 1207
jpayne@69 1208
jpayne@69 1209 cdef inline int has_symbolic_allele(VariantRecord record):
jpayne@69 1210 """Return index of first symbolic allele. 0 if no symbolic alleles."""
jpayne@69 1211
jpayne@69 1212 for i in range(1, record.ptr.n_allele):
jpayne@69 1213 alt = record.ptr.d.allele[i]
jpayne@69 1214 if alt[0] == b'<' and alt[len(alt) - 1] == b'>':
jpayne@69 1215 return i
jpayne@69 1216
jpayne@69 1217 return 0
jpayne@69 1218
jpayne@69 1219
jpayne@69 1220 ########################################################################
jpayne@69 1221 ########################################################################
jpayne@69 1222 ## Variant Header objects
jpayne@69 1223 ########################################################################
jpayne@69 1224
jpayne@69 1225
jpayne@69 1226 cdef bcf_header_remove_hrec(VariantHeader header, int i):
jpayne@69 1227 if header is None:
jpayne@69 1228 raise ValueError('header must not be None')
jpayne@69 1229
jpayne@69 1230 cdef bcf_hdr_t *hdr = header.ptr
jpayne@69 1231
jpayne@69 1232 if i < 0 or i >= hdr.nhrec:
jpayne@69 1233 raise ValueError('Invalid header record index')
jpayne@69 1234
jpayne@69 1235 cdef bcf_hrec_t *hrec = hdr.hrec[i]
jpayne@69 1236 hdr.nhrec -= 1
jpayne@69 1237
jpayne@69 1238 if i < hdr.nhrec:
jpayne@69 1239 memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))
jpayne@69 1240
jpayne@69 1241 bcf_hrec_destroy(hrec)
jpayne@69 1242 hdr.hrec[hdr.nhrec] = NULL
jpayne@69 1243 hdr.dirty = 1
jpayne@69 1244
jpayne@69 1245
jpayne@69 1246 #FIXME: implement a full mapping interface
jpayne@69 1247 #FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
jpayne@69 1248 # object lifetime.
jpayne@69 1249 cdef class VariantHeaderRecord(object):
jpayne@69 1250 """header record from a :class:`VariantHeader` object"""
jpayne@69 1251 def __init__(self, *args, **kwargs):
jpayne@69 1252 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 1253
jpayne@69 1254 @property
jpayne@69 1255 def type(self):
jpayne@69 1256 """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
jpayne@69 1257 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1258 if not r:
jpayne@69 1259 return None
jpayne@69 1260 return METADATA_TYPES[r.type]
jpayne@69 1261
jpayne@69 1262 @property
jpayne@69 1263 def key(self):
jpayne@69 1264 """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
jpayne@69 1265 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1266 return bcf_str_cache_get_charptr(r.key) if r and r.key else None
jpayne@69 1267
jpayne@69 1268 @property
jpayne@69 1269 def value(self):
jpayne@69 1270 """header value. Set only for generic lines, None for FILTER/INFO, etc."""
jpayne@69 1271 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1272 return charptr_to_str(r.value) if r and r.value else None
jpayne@69 1273
jpayne@69 1274 @property
jpayne@69 1275 def attrs(self):
jpayne@69 1276 """sequence of additional header attributes"""
jpayne@69 1277 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1278 if not r:
jpayne@69 1279 return ()
jpayne@69 1280 cdef int i
jpayne@69 1281 return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
jpayne@69 1282 charptr_to_str(r.vals[i]) if r.vals[i] else None)
jpayne@69 1283 for i in range(r.nkeys))
jpayne@69 1284
jpayne@69 1285 def __len__(self):
jpayne@69 1286 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1287 return r.nkeys if r else 0
jpayne@69 1288
jpayne@69 1289 def __bool__(self):
jpayne@69 1290 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1291 return r != NULL and r.nkeys != 0
jpayne@69 1292
jpayne@69 1293 def __getitem__(self, key):
jpayne@69 1294 """get attribute value"""
jpayne@69 1295 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1296 cdef int i
jpayne@69 1297 if r:
jpayne@69 1298 bkey = force_bytes(key)
jpayne@69 1299 for i in range(r.nkeys):
jpayne@69 1300 if r.keys[i] and r.keys[i] == bkey:
jpayne@69 1301 return charptr_to_str(r.vals[i]) if r.vals[i] else None
jpayne@69 1302 raise KeyError('cannot find metadata key')
jpayne@69 1303
jpayne@69 1304 def __iter__(self):
jpayne@69 1305 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1306 if not r:
jpayne@69 1307 return
jpayne@69 1308 cdef int i
jpayne@69 1309 for i in range(r.nkeys):
jpayne@69 1310 if r.keys[i]:
jpayne@69 1311 yield bcf_str_cache_get_charptr(r.keys[i])
jpayne@69 1312
jpayne@69 1313 def get(self, key, default=None):
jpayne@69 1314 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@69 1315 try:
jpayne@69 1316 return self[key]
jpayne@69 1317 except KeyError:
jpayne@69 1318 return default
jpayne@69 1319
jpayne@69 1320 def __contains__(self, key):
jpayne@69 1321 try:
jpayne@69 1322 self[key]
jpayne@69 1323 except KeyError:
jpayne@69 1324 return False
jpayne@69 1325 else:
jpayne@69 1326 return True
jpayne@69 1327
jpayne@69 1328 def iterkeys(self):
jpayne@69 1329 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@69 1330 return iter(self)
jpayne@69 1331
jpayne@69 1332 def itervalues(self):
jpayne@69 1333 """D.itervalues() -> an iterator over the values of D"""
jpayne@69 1334 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1335 if not r:
jpayne@69 1336 return
jpayne@69 1337 cdef int i
jpayne@69 1338 for i in range(r.nkeys):
jpayne@69 1339 if r.keys[i]:
jpayne@69 1340 yield charptr_to_str(r.vals[i]) if r.vals[i] else None
jpayne@69 1341
jpayne@69 1342 def iteritems(self):
jpayne@69 1343 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@69 1344 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1345 if not r:
jpayne@69 1346 return
jpayne@69 1347 cdef int i
jpayne@69 1348 for i in range(r.nkeys):
jpayne@69 1349 if r.keys[i]:
jpayne@69 1350 yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
jpayne@69 1351
jpayne@69 1352 def keys(self):
jpayne@69 1353 """D.keys() -> list of D's keys"""
jpayne@69 1354 return list(self)
jpayne@69 1355
jpayne@69 1356 def items(self):
jpayne@69 1357 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@69 1358 return list(self.iteritems())
jpayne@69 1359
jpayne@69 1360 def values(self):
jpayne@69 1361 """D.values() -> list of D's values"""
jpayne@69 1362 return list(self.itervalues())
jpayne@69 1363
jpayne@69 1364 def update(self, items=None, **kwargs):
jpayne@69 1365 """D.update([E, ]**F) -> None.
jpayne@69 1366
jpayne@69 1367 Update D from dict/iterable E and F.
jpayne@69 1368 """
jpayne@69 1369 for k, v in items.items():
jpayne@69 1370 self[k] = v
jpayne@69 1371
jpayne@69 1372 if kwargs:
jpayne@69 1373 for k, v in kwargs.items():
jpayne@69 1374 self[k] = v
jpayne@69 1375
jpayne@69 1376 def pop(self, key, default=_nothing):
jpayne@69 1377 try:
jpayne@69 1378 value = self[key]
jpayne@69 1379 del self[key]
jpayne@69 1380 return value
jpayne@69 1381 except KeyError:
jpayne@69 1382 if default is not _nothing:
jpayne@69 1383 return default
jpayne@69 1384 raise
jpayne@69 1385
jpayne@69 1386 # Mappings are not hashable by default, but subclasses can change this
jpayne@69 1387 __hash__ = None
jpayne@69 1388
jpayne@69 1389 #TODO: implement __richcmp__
jpayne@69 1390
jpayne@69 1391 def __str__(self):
jpayne@69 1392 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1393
jpayne@69 1394 if not r:
jpayne@69 1395 raise ValueError('cannot convert deleted record to str')
jpayne@69 1396
jpayne@69 1397 cdef kstring_t hrec_str
jpayne@69 1398 hrec_str.l = hrec_str.m = 0
jpayne@69 1399 hrec_str.s = NULL
jpayne@69 1400
jpayne@69 1401 bcf_hrec_format(r, &hrec_str)
jpayne@69 1402
jpayne@69 1403 ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)
jpayne@69 1404
jpayne@69 1405 if hrec_str.m:
jpayne@69 1406 free(hrec_str.s)
jpayne@69 1407
jpayne@69 1408 return ret
jpayne@69 1409
jpayne@69 1410 # FIXME: Not safe -- causes trivial segfaults at the moment
jpayne@69 1411 def remove(self):
jpayne@69 1412 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1413 cdef bcf_hrec_t *r = self.ptr
jpayne@69 1414 if not r:
jpayne@69 1415 return
jpayne@69 1416 assert r.key
jpayne@69 1417 cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
jpayne@69 1418 bcf_hdr_remove(hdr, r.type, key)
jpayne@69 1419 self.ptr = NULL
jpayne@69 1420
jpayne@69 1421
jpayne@69 1422 cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
jpayne@69 1423 if not header:
jpayne@69 1424 raise ValueError('invalid VariantHeader')
jpayne@69 1425
jpayne@69 1426 if not hdr:
jpayne@69 1427 return None
jpayne@69 1428
jpayne@69 1429 cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
jpayne@69 1430 record.header = header
jpayne@69 1431 record.ptr = hdr
jpayne@69 1432
jpayne@69 1433 return record
jpayne@69 1434
jpayne@69 1435
jpayne@69 1436 cdef class VariantHeaderRecords(object):
jpayne@69 1437 """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
jpayne@69 1438 def __init__(self, *args, **kwargs):
jpayne@69 1439 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 1440
jpayne@69 1441 def __len__(self):
jpayne@69 1442 return self.header.ptr.nhrec
jpayne@69 1443
jpayne@69 1444 def __bool__(self):
jpayne@69 1445 return self.header.ptr.nhrec != 0
jpayne@69 1446
jpayne@69 1447 def __getitem__(self, index):
jpayne@69 1448 cdef int32_t i = index
jpayne@69 1449 if i < 0 or i >= self.header.ptr.nhrec:
jpayne@69 1450 raise IndexError('invalid header record index')
jpayne@69 1451 return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
jpayne@69 1452
jpayne@69 1453 def __iter__(self):
jpayne@69 1454 cdef int32_t i
jpayne@69 1455 for i in range(self.header.ptr.nhrec):
jpayne@69 1456 yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
jpayne@69 1457
jpayne@69 1458 __hash__ = None
jpayne@69 1459
jpayne@69 1460
jpayne@69 1461 cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
jpayne@69 1462 if not header:
jpayne@69 1463 raise ValueError('invalid VariantHeader')
jpayne@69 1464
jpayne@69 1465 cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
jpayne@69 1466 records.header = header
jpayne@69 1467 return records
jpayne@69 1468
jpayne@69 1469
jpayne@69 1470 cdef class VariantMetadata(object):
jpayne@69 1471 """filter, info or format metadata record from a :class:`VariantHeader` object"""
jpayne@69 1472 def __init__(self, *args, **kwargs):
jpayne@69 1473 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 1474
jpayne@69 1475 @property
jpayne@69 1476 def name(self):
jpayne@69 1477 """metadata name"""
jpayne@69 1478 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1479 return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
jpayne@69 1480
jpayne@69 1481 # Q: Should this be exposed?
jpayne@69 1482 @property
jpayne@69 1483 def id(self):
jpayne@69 1484 """metadata internal header id number"""
jpayne@69 1485 return self.id
jpayne@69 1486
jpayne@69 1487 @property
jpayne@69 1488 def number(self):
jpayne@69 1489 """metadata number (i.e. cardinality)"""
jpayne@69 1490 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1491
jpayne@69 1492 if not check_header_id(hdr, self.type, self.id):
jpayne@69 1493 raise ValueError('Invalid header id')
jpayne@69 1494
jpayne@69 1495 if self.type == BCF_HL_FLT:
jpayne@69 1496 return None
jpayne@69 1497
jpayne@69 1498 cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
jpayne@69 1499 if l == BCF_VL_FIXED:
jpayne@69 1500 return bcf_hdr_id2number(hdr, self.type, self.id)
jpayne@69 1501 elif l == BCF_VL_VAR:
jpayne@69 1502 return '.'
jpayne@69 1503 else:
jpayne@69 1504 return METADATA_LENGTHS[l]
jpayne@69 1505
jpayne@69 1506 @property
jpayne@69 1507 def type(self):
jpayne@69 1508 """metadata value type"""
jpayne@69 1509 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1510 if not check_header_id(hdr, self.type, self.id):
jpayne@69 1511 raise ValueError('Invalid header id')
jpayne@69 1512
jpayne@69 1513 if self.type == BCF_HL_FLT:
jpayne@69 1514 return None
jpayne@69 1515 return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
jpayne@69 1516
jpayne@69 1517 @property
jpayne@69 1518 def description(self):
jpayne@69 1519 """metadata description (or None if not set)"""
jpayne@69 1520 descr = self.record.get('Description')
jpayne@69 1521 if descr:
jpayne@69 1522 descr = descr.strip('"')
jpayne@69 1523 return force_str(descr)
jpayne@69 1524
jpayne@69 1525 @property
jpayne@69 1526 def record(self):
jpayne@69 1527 """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
jpayne@69 1528 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1529 if not check_header_id(hdr, self.type, self.id):
jpayne@69 1530 raise ValueError('Invalid header id')
jpayne@69 1531 cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
jpayne@69 1532 if not hrec:
jpayne@69 1533 return None
jpayne@69 1534 return makeVariantHeaderRecord(self.header, hrec)
jpayne@69 1535
jpayne@69 1536 def remove_header(self):
jpayne@69 1537 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1538 cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
jpayne@69 1539 bcf_hdr_remove(hdr, self.type, key)
jpayne@69 1540
jpayne@69 1541
jpayne@69 1542 cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
jpayne@69 1543 if not header:
jpayne@69 1544 raise ValueError('invalid VariantHeader')
jpayne@69 1545
jpayne@69 1546 if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
jpayne@69 1547 raise ValueError('invalid metadata type')
jpayne@69 1548
jpayne@69 1549 if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
jpayne@69 1550 raise ValueError('invalid metadata id')
jpayne@69 1551
jpayne@69 1552 cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
jpayne@69 1553 meta.header = header
jpayne@69 1554 meta.type = type
jpayne@69 1555 meta.id = id
jpayne@69 1556
jpayne@69 1557 return meta
jpayne@69 1558
jpayne@69 1559
jpayne@69 1560 cdef class VariantHeaderMetadata(object):
jpayne@69 1561 """mapping from filter, info or format name to :class:`VariantMetadata` object"""
jpayne@69 1562 def __init__(self, *args, **kwargs):
jpayne@69 1563 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 1564
jpayne@69 1565 def add(self, id, number, type, description, **kwargs):
jpayne@69 1566 """Add a new filter, info or format record"""
jpayne@69 1567 if id in self:
jpayne@69 1568 raise ValueError('Header already exists for id={}'.format(id))
jpayne@69 1569
jpayne@69 1570 if self.type == BCF_HL_FLT:
jpayne@69 1571 if number is not None:
jpayne@69 1572 raise ValueError('Number must be None when adding a filter')
jpayne@69 1573 if type is not None:
jpayne@69 1574 raise ValueError('Type must be None when adding a filter')
jpayne@69 1575
jpayne@69 1576 items = [('ID', unquoted_str(id)), ('Description', description)]
jpayne@69 1577 else:
jpayne@69 1578 if type not in VALUE_TYPES:
jpayne@69 1579 raise ValueError('unknown type specified: {}'.format(type))
jpayne@69 1580 if number is None:
jpayne@69 1581 number = '.'
jpayne@69 1582
jpayne@69 1583 items = [('ID', unquoted_str(id)),
jpayne@69 1584 ('Number', unquoted_str(number)),
jpayne@69 1585 ('Type', unquoted_str(type)),
jpayne@69 1586 ('Description', description)]
jpayne@69 1587
jpayne@69 1588 items += kwargs.items()
jpayne@69 1589 self.header.add_meta(METADATA_TYPES[self.type], items=items)
jpayne@69 1590
jpayne@69 1591 def __len__(self):
jpayne@69 1592 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1593 cdef bcf_idpair_t *idpair
jpayne@69 1594 cdef int32_t i, n = 0
jpayne@69 1595
jpayne@69 1596 for i in range(hdr.n[BCF_DT_ID]):
jpayne@69 1597 idpair = hdr.id[BCF_DT_ID] + i
jpayne@69 1598 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
jpayne@69 1599 n += 1
jpayne@69 1600 return n
jpayne@69 1601
jpayne@69 1602 def __bool__(self):
jpayne@69 1603 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1604 cdef bcf_idpair_t *idpair
jpayne@69 1605 cdef int32_t i
jpayne@69 1606
jpayne@69 1607 for i in range(hdr.n[BCF_DT_ID]):
jpayne@69 1608 idpair = hdr.id[BCF_DT_ID] + i
jpayne@69 1609 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
jpayne@69 1610 return True
jpayne@69 1611 return False
jpayne@69 1612
jpayne@69 1613 def __getitem__(self, key):
jpayne@69 1614 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1615 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@69 1616
jpayne@69 1617 cdef bytes bkey = force_bytes(key)
jpayne@69 1618 cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@69 1619
jpayne@69 1620 if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
jpayne@69 1621 raise KeyError('invalid key: {}'.format(key))
jpayne@69 1622
jpayne@69 1623 return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
jpayne@69 1624
jpayne@69 1625 def remove_header(self, key):
jpayne@69 1626 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1627 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@69 1628
jpayne@69 1629 cdef bytes bkey = force_bytes(key)
jpayne@69 1630 cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@69 1631
jpayne@69 1632 if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
jpayne@69 1633 raise KeyError('invalid key: {}'.format(key))
jpayne@69 1634
jpayne@69 1635 bcf_hdr_remove(hdr, self.type, bkey)
jpayne@69 1636 #bcf_hdr_sync(hdr)
jpayne@69 1637
jpayne@69 1638 def clear_header(self):
jpayne@69 1639 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1640 bcf_hdr_remove(hdr, self.type, NULL)
jpayne@69 1641 #bcf_hdr_sync(hdr)
jpayne@69 1642
jpayne@69 1643 def __iter__(self):
jpayne@69 1644 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1645 cdef bcf_idpair_t *idpair
jpayne@69 1646 cdef int32_t i
jpayne@69 1647
jpayne@69 1648 for i in range(hdr.n[BCF_DT_ID]):
jpayne@69 1649 idpair = hdr.id[BCF_DT_ID] + i
jpayne@69 1650 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
jpayne@69 1651 yield bcf_str_cache_get_charptr(idpair.key)
jpayne@69 1652
jpayne@69 1653 def get(self, key, default=None):
jpayne@69 1654 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@69 1655 try:
jpayne@69 1656 return self[key]
jpayne@69 1657 except KeyError:
jpayne@69 1658 return default
jpayne@69 1659
jpayne@69 1660 def __contains__(self, key):
jpayne@69 1661 try:
jpayne@69 1662 self[key]
jpayne@69 1663 except KeyError:
jpayne@69 1664 return False
jpayne@69 1665 else:
jpayne@69 1666 return True
jpayne@69 1667
jpayne@69 1668 def iterkeys(self):
jpayne@69 1669 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@69 1670 return iter(self)
jpayne@69 1671
jpayne@69 1672 def itervalues(self):
jpayne@69 1673 """D.itervalues() -> an iterator over the values of D"""
jpayne@69 1674 for key in self:
jpayne@69 1675 yield self[key]
jpayne@69 1676
jpayne@69 1677 def iteritems(self):
jpayne@69 1678 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@69 1679 for key in self:
jpayne@69 1680 yield (key, self[key])
jpayne@69 1681
jpayne@69 1682 def keys(self):
jpayne@69 1683 """D.keys() -> list of D's keys"""
jpayne@69 1684 return list(self)
jpayne@69 1685
jpayne@69 1686 def items(self):
jpayne@69 1687 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@69 1688 return list(self.iteritems())
jpayne@69 1689
jpayne@69 1690 def values(self):
jpayne@69 1691 """D.values() -> list of D's values"""
jpayne@69 1692 return list(self.itervalues())
jpayne@69 1693
jpayne@69 1694 # Mappings are not hashable by default, but subclasses can change this
jpayne@69 1695 __hash__ = None
jpayne@69 1696
jpayne@69 1697 #TODO: implement __richcmp__
jpayne@69 1698
jpayne@69 1699
jpayne@69 1700 cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
jpayne@69 1701 if not header:
jpayne@69 1702 raise ValueError('invalid VariantHeader')
jpayne@69 1703
jpayne@69 1704 cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
jpayne@69 1705 meta.header = header
jpayne@69 1706 meta.type = type
jpayne@69 1707
jpayne@69 1708 return meta
jpayne@69 1709
jpayne@69 1710
jpayne@69 1711 cdef class VariantContig(object):
jpayne@69 1712 """contig metadata from a :class:`VariantHeader`"""
jpayne@69 1713 def __init__(self, *args, **kwargs):
jpayne@69 1714 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 1715
jpayne@69 1716 @property
jpayne@69 1717 def name(self):
jpayne@69 1718 """contig name"""
jpayne@69 1719 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1720 return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
jpayne@69 1721
jpayne@69 1722 @property
jpayne@69 1723 def id(self):
jpayne@69 1724 """contig internal id number"""
jpayne@69 1725 return self.id
jpayne@69 1726
jpayne@69 1727 @property
jpayne@69 1728 def length(self):
jpayne@69 1729 """contig length or None if not available"""
jpayne@69 1730 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1731 cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
jpayne@69 1732 return length if length else None
jpayne@69 1733
jpayne@69 1734 @property
jpayne@69 1735 def header_record(self):
jpayne@69 1736 """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
jpayne@69 1737 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1738 cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
jpayne@69 1739 return makeVariantHeaderRecord(self.header, hrec)
jpayne@69 1740
jpayne@69 1741 def remove_header(self):
jpayne@69 1742 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1743 cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key
jpayne@69 1744 bcf_hdr_remove(hdr, BCF_HL_CTG, key)
jpayne@69 1745
jpayne@69 1746
jpayne@69 1747 cdef VariantContig makeVariantContig(VariantHeader header, int id):
jpayne@69 1748 if not header:
jpayne@69 1749 raise ValueError('invalid VariantHeader')
jpayne@69 1750
jpayne@69 1751 if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
jpayne@69 1752 raise ValueError('invalid contig id')
jpayne@69 1753
jpayne@69 1754 cdef VariantContig contig = VariantContig.__new__(VariantContig)
jpayne@69 1755 contig.header = header
jpayne@69 1756 contig.id = id
jpayne@69 1757
jpayne@69 1758 return contig
jpayne@69 1759
jpayne@69 1760
jpayne@69 1761 cdef class VariantHeaderContigs(object):
jpayne@69 1762 """mapping from contig name or index to :class:`VariantContig` object."""
jpayne@69 1763 def __init__(self, *args, **kwargs):
jpayne@69 1764 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 1765
jpayne@69 1766 def __len__(self):
jpayne@69 1767 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1768 assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
jpayne@69 1769 return hdr.n[BCF_DT_CTG]
jpayne@69 1770
jpayne@69 1771 def __bool__(self):
jpayne@69 1772 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1773 assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
jpayne@69 1774 return hdr.n[BCF_DT_CTG] != 0
jpayne@69 1775
jpayne@69 1776 def __getitem__(self, key):
jpayne@69 1777 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1778 cdef int index
jpayne@69 1779
jpayne@69 1780 if isinstance(key, int):
jpayne@69 1781 index = key
jpayne@69 1782 if index < 0 or index >= hdr.n[BCF_DT_CTG]:
jpayne@69 1783 raise IndexError('invalid contig index')
jpayne@69 1784 return makeVariantContig(self.header, index)
jpayne@69 1785
jpayne@69 1786 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
jpayne@69 1787 cdef bytes bkey = force_bytes(key)
jpayne@69 1788 cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@69 1789
jpayne@69 1790 if k == kh_end(d):
jpayne@69 1791 raise KeyError('invalid contig: {}'.format(key))
jpayne@69 1792
jpayne@69 1793 cdef int id = kh_val_vdict(d, k).id
jpayne@69 1794
jpayne@69 1795 return makeVariantContig(self.header, id)
jpayne@69 1796
jpayne@69 1797 def remove_header(self, key):
jpayne@69 1798 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1799 cdef int index
jpayne@69 1800 cdef const char *ckey
jpayne@69 1801 cdef vdict_t *d
jpayne@69 1802 cdef khiter_t k
jpayne@69 1803
jpayne@69 1804 if isinstance(key, int):
jpayne@69 1805 index = key
jpayne@69 1806 if index < 0 or index >= hdr.n[BCF_DT_CTG]:
jpayne@69 1807 raise IndexError('invalid contig index')
jpayne@69 1808 ckey = hdr.id[BCF_DT_CTG][self.id].key
jpayne@69 1809 else:
jpayne@69 1810 d = <vdict_t *>hdr.dict[BCF_DT_CTG]
jpayne@69 1811 key = force_bytes(key)
jpayne@69 1812 if kh_get_vdict(d, key) == kh_end(d):
jpayne@69 1813 raise KeyError('invalid contig: {}'.format(key))
jpayne@69 1814 ckey = key
jpayne@69 1815
jpayne@69 1816 bcf_hdr_remove(hdr, BCF_HL_CTG, ckey)
jpayne@69 1817
jpayne@69 1818 def clear_header(self):
jpayne@69 1819 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1820 bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
jpayne@69 1821 #bcf_hdr_sync(hdr)
jpayne@69 1822
jpayne@69 1823 def __iter__(self):
jpayne@69 1824 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1825 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
jpayne@69 1826 cdef uint32_t n = kh_size(d)
jpayne@69 1827
jpayne@69 1828 assert n == hdr.n[BCF_DT_CTG]
jpayne@69 1829
jpayne@69 1830 for i in range(n):
jpayne@69 1831 yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
jpayne@69 1832
jpayne@69 1833 def get(self, key, default=None):
jpayne@69 1834 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@69 1835 try:
jpayne@69 1836 return self[key]
jpayne@69 1837 except KeyError:
jpayne@69 1838 return default
jpayne@69 1839
jpayne@69 1840 def __contains__(self, key):
jpayne@69 1841 try:
jpayne@69 1842 self[key]
jpayne@69 1843 except KeyError:
jpayne@69 1844 return False
jpayne@69 1845 else:
jpayne@69 1846 return True
jpayne@69 1847
jpayne@69 1848 def iterkeys(self):
jpayne@69 1849 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@69 1850 return iter(self)
jpayne@69 1851
jpayne@69 1852 def itervalues(self):
jpayne@69 1853 """D.itervalues() -> an iterator over the values of D"""
jpayne@69 1854 for key in self:
jpayne@69 1855 yield self[key]
jpayne@69 1856
jpayne@69 1857 def iteritems(self):
jpayne@69 1858 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@69 1859 for key in self:
jpayne@69 1860 yield (key, self[key])
jpayne@69 1861
jpayne@69 1862 def keys(self):
jpayne@69 1863 """D.keys() -> list of D's keys"""
jpayne@69 1864 return list(self)
jpayne@69 1865
jpayne@69 1866 def items(self):
jpayne@69 1867 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@69 1868 return list(self.iteritems())
jpayne@69 1869
jpayne@69 1870 def values(self):
jpayne@69 1871 """D.values() -> list of D's values"""
jpayne@69 1872 return list(self.itervalues())
jpayne@69 1873
jpayne@69 1874 # Mappings are not hashable by default, but subclasses can change this
jpayne@69 1875 __hash__ = None
jpayne@69 1876
jpayne@69 1877 #TODO: implement __richcmp__
jpayne@69 1878
jpayne@69 1879 def add(self, id, length=None, **kwargs):
jpayne@69 1880 """Add a new contig record"""
jpayne@69 1881 if id in self:
jpayne@69 1882 raise ValueError('Header already exists for contig {}'.format(id))
jpayne@69 1883
jpayne@69 1884 items = [('ID', unquoted_str(id))]
jpayne@69 1885 if length is not None:
jpayne@69 1886 items.append(("length", unquoted_str(length)))
jpayne@69 1887 items += kwargs.items()
jpayne@69 1888 self.header.add_meta('contig', items=items)
jpayne@69 1889
jpayne@69 1890
jpayne@69 1891 cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
jpayne@69 1892 if not header:
jpayne@69 1893 raise ValueError('invalid VariantHeader')
jpayne@69 1894
jpayne@69 1895 cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
jpayne@69 1896 contigs.header = header
jpayne@69 1897
jpayne@69 1898 return contigs
jpayne@69 1899
jpayne@69 1900
jpayne@69 1901 cdef class VariantHeaderSamples(object):
jpayne@69 1902 """sequence of sample names from a :class:`VariantHeader` object"""
jpayne@69 1903 def __init__(self, *args, **kwargs):
jpayne@69 1904 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 1905
jpayne@69 1906 def __len__(self):
jpayne@69 1907 return bcf_hdr_nsamples(self.header.ptr)
jpayne@69 1908
jpayne@69 1909 def __bool__(self):
jpayne@69 1910 return bcf_hdr_nsamples(self.header.ptr) != 0
jpayne@69 1911
jpayne@69 1912 def __getitem__(self, index):
jpayne@69 1913 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1914 cdef int32_t n = bcf_hdr_nsamples(hdr)
jpayne@69 1915 cdef int32_t i = index
jpayne@69 1916
jpayne@69 1917 if i < 0 or i >= n:
jpayne@69 1918 raise IndexError('invalid sample index')
jpayne@69 1919
jpayne@69 1920 return charptr_to_str(hdr.samples[i])
jpayne@69 1921
jpayne@69 1922 def __iter__(self):
jpayne@69 1923 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1924 cdef int32_t i, n = bcf_hdr_nsamples(hdr)
jpayne@69 1925
jpayne@69 1926 for i in range(n):
jpayne@69 1927 yield charptr_to_str(hdr.samples[i])
jpayne@69 1928
jpayne@69 1929 def __contains__(self, key):
jpayne@69 1930 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 1931 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
jpayne@69 1932 cdef bytes bkey = force_bytes(key)
jpayne@69 1933 cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@69 1934
jpayne@69 1935 return k != kh_end(d)
jpayne@69 1936
jpayne@69 1937 # Mappings are not hashable by default, but subclasses can change this
jpayne@69 1938 __hash__ = None
jpayne@69 1939
jpayne@69 1940 #TODO: implement __richcmp__
jpayne@69 1941
jpayne@69 1942 def add(self, name):
jpayne@69 1943 """Add a new sample"""
jpayne@69 1944 self.header.add_sample(name)
jpayne@69 1945
jpayne@69 1946
jpayne@69 1947 cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
jpayne@69 1948 if not header:
jpayne@69 1949 raise ValueError('invalid VariantHeader')
jpayne@69 1950
jpayne@69 1951 cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
jpayne@69 1952 samples.header = header
jpayne@69 1953
jpayne@69 1954 return samples
jpayne@69 1955
jpayne@69 1956
jpayne@69 1957 cdef class VariantHeader(object):
jpayne@69 1958 """header information for a :class:`VariantFile` object"""
jpayne@69 1959 #FIXME: Add structured proxy
jpayne@69 1960 #FIXME: Add generic proxy
jpayne@69 1961 #FIXME: Add mutable methods
jpayne@69 1962
jpayne@69 1963 # See makeVariantHeader for C constructor
jpayne@69 1964 def __cinit__(self):
jpayne@69 1965 self.ptr = NULL
jpayne@69 1966
jpayne@69 1967 # Python constructor
jpayne@69 1968 def __init__(self):
jpayne@69 1969 self.ptr = bcf_hdr_init(b'w')
jpayne@69 1970 if not self.ptr:
jpayne@69 1971 raise ValueError('cannot create VariantHeader')
jpayne@69 1972
jpayne@69 1973 def __dealloc__(self):
jpayne@69 1974 if self.ptr:
jpayne@69 1975 bcf_hdr_destroy(self.ptr)
jpayne@69 1976 self.ptr = NULL
jpayne@69 1977
jpayne@69 1978 def __bool__(self):
jpayne@69 1979 return self.ptr != NULL
jpayne@69 1980
jpayne@69 1981 def copy(self):
jpayne@69 1982 return makeVariantHeader(bcf_hdr_dup(self.ptr))
jpayne@69 1983
jpayne@69 1984 def merge(self, VariantHeader header):
jpayne@69 1985 if header is None:
jpayne@69 1986 raise ValueError('header must not be None')
jpayne@69 1987 bcf_hdr_merge(self.ptr, header.ptr)
jpayne@69 1988
jpayne@69 1989 @property
jpayne@69 1990 def version(self):
jpayne@69 1991 """VCF version"""
jpayne@69 1992 return force_str(bcf_hdr_get_version(self.ptr))
jpayne@69 1993
jpayne@69 1994 @property
jpayne@69 1995 def samples(self):
jpayne@69 1996 """samples (:class:`VariantHeaderSamples`)"""
jpayne@69 1997 return makeVariantHeaderSamples(self)
jpayne@69 1998
jpayne@69 1999 @property
jpayne@69 2000 def records(self):
jpayne@69 2001 """header records (:class:`VariantHeaderRecords`)"""
jpayne@69 2002 return makeVariantHeaderRecords(self)
jpayne@69 2003
jpayne@69 2004 @property
jpayne@69 2005 def contigs(self):
jpayne@69 2006 """contig information (:class:`VariantHeaderContigs`)"""
jpayne@69 2007 return makeVariantHeaderContigs(self)
jpayne@69 2008
jpayne@69 2009 @property
jpayne@69 2010 def filters(self):
jpayne@69 2011 """filter metadata (:class:`VariantHeaderMetadata`)"""
jpayne@69 2012 return makeVariantHeaderMetadata(self, BCF_HL_FLT)
jpayne@69 2013
jpayne@69 2014 @property
jpayne@69 2015 def info(self):
jpayne@69 2016 """info metadata (:class:`VariantHeaderMetadata`)"""
jpayne@69 2017 return makeVariantHeaderMetadata(self, BCF_HL_INFO)
jpayne@69 2018
jpayne@69 2019 @property
jpayne@69 2020 def formats(self):
jpayne@69 2021 """format metadata (:class:`VariantHeaderMetadata`)"""
jpayne@69 2022 return makeVariantHeaderMetadata(self, BCF_HL_FMT)
jpayne@69 2023
jpayne@69 2024 @property
jpayne@69 2025 def alts(self):
jpayne@69 2026 """alt metadata (:class:`dict` ID->record).
jpayne@69 2027
jpayne@69 2028 The data returned just a snapshot of alt records, is created
jpayne@69 2029 every time the property is requested, and modifications will
jpayne@69 2030 not be reflected in the header metadata and vice versa.
jpayne@69 2031
jpayne@69 2032 i.e. it is just a dict that reflects the state of alt records
jpayne@69 2033 at the time it is created.
jpayne@69 2034 """
jpayne@69 2035 return {record['ID']:record for record in self.records
jpayne@69 2036 if record.key.upper() == 'ALT' }
jpayne@69 2037
jpayne@69 2038 # only safe to do when opening an htsfile
jpayne@69 2039 cdef _subset_samples(self, include_samples):
jpayne@69 2040 keep_samples = set(self.samples)
jpayne@69 2041 include_samples = set(include_samples)
jpayne@69 2042 missing_samples = include_samples - keep_samples
jpayne@69 2043 keep_samples &= include_samples
jpayne@69 2044
jpayne@69 2045 if missing_samples:
jpayne@69 2046 # FIXME: add specialized exception with payload
jpayne@69 2047 raise ValueError(
jpayne@69 2048 'missing {:d} requested samples'.format(
jpayne@69 2049 len(missing_samples)))
jpayne@69 2050
jpayne@69 2051 keep_samples = force_bytes(','.join(keep_samples))
jpayne@69 2052 cdef char *keep = <char *>keep_samples if keep_samples else NULL
jpayne@69 2053 cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
jpayne@69 2054
jpayne@69 2055 if ret != 0:
jpayne@69 2056 raise ValueError(
jpayne@69 2057 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
jpayne@69 2058
jpayne@69 2059 def __str__(self):
jpayne@69 2060 cdef int hlen
jpayne@69 2061 cdef kstring_t line
jpayne@69 2062 line.l = line.m = 0
jpayne@69 2063 line.s = NULL
jpayne@69 2064
jpayne@69 2065 if bcf_hdr_format(self.ptr, 0, &line) < 0:
jpayne@69 2066 if line.m:
jpayne@69 2067 free(line.s)
jpayne@69 2068 raise ValueError('bcf_hdr_format failed')
jpayne@69 2069
jpayne@69 2070 ret = charptr_to_str_w_len(line.s, line.l)
jpayne@69 2071
jpayne@69 2072 if line.m:
jpayne@69 2073 free(line.s)
jpayne@69 2074 return ret
jpayne@69 2075
jpayne@69 2076 def new_record(self, contig=None, start=0, stop=0, alleles=None,
jpayne@69 2077 id=None, qual=None, filter=None, info=None, samples=None,
jpayne@69 2078 **kwargs):
jpayne@69 2079 """Create a new empty VariantRecord.
jpayne@69 2080
jpayne@69 2081 Arguments are currently experimental. Use with caution and expect
jpayne@69 2082 changes in upcoming releases.
jpayne@69 2083
jpayne@69 2084 """
jpayne@69 2085 rec = makeVariantRecord(self, bcf_init())
jpayne@69 2086
jpayne@69 2087 if not rec:
jpayne@69 2088 raise MemoryError('unable to allocate BCF record')
jpayne@69 2089
jpayne@69 2090 rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
jpayne@69 2091
jpayne@69 2092 if contig is not None:
jpayne@69 2093 rec.contig = contig
jpayne@69 2094
jpayne@69 2095 rec.start = start
jpayne@69 2096 rec.stop = stop
jpayne@69 2097 rec.id = id
jpayne@69 2098 rec.qual = qual
jpayne@69 2099
jpayne@69 2100 if alleles is not None:
jpayne@69 2101 rec.alleles = alleles
jpayne@69 2102
jpayne@69 2103 if filter is not None:
jpayne@69 2104 if isinstance(filter, (list, tuple, VariantRecordFilter)):
jpayne@69 2105 for f in filter:
jpayne@69 2106 rec.filter.add(f)
jpayne@69 2107 else:
jpayne@69 2108 rec.filter.add(filter)
jpayne@69 2109
jpayne@69 2110 if info:
jpayne@69 2111 rec.info.update(info)
jpayne@69 2112
jpayne@69 2113 if kwargs:
jpayne@69 2114 rec.samples[0].update(kwargs)
jpayne@69 2115
jpayne@69 2116 if samples:
jpayne@69 2117 for i, sample in enumerate(samples):
jpayne@69 2118 rec.samples[i].update(sample)
jpayne@69 2119
jpayne@69 2120 return rec
jpayne@69 2121
jpayne@69 2122 def add_record(self, VariantHeaderRecord record):
jpayne@69 2123 """Add an existing :class:`VariantHeaderRecord` to this header"""
jpayne@69 2124 if record is None:
jpayne@69 2125 raise ValueError('record must not be None')
jpayne@69 2126
jpayne@69 2127 cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr)
jpayne@69 2128
jpayne@69 2129 bcf_hdr_add_hrec(self.ptr, hrec)
jpayne@69 2130
jpayne@69 2131 self._hdr_sync()
jpayne@69 2132
jpayne@69 2133 def add_line(self, line):
jpayne@69 2134 """Add a metadata line to this header"""
jpayne@69 2135 bline = force_bytes(line)
jpayne@69 2136 if bcf_hdr_append(self.ptr, bline) < 0:
jpayne@69 2137 raise ValueError('invalid header line')
jpayne@69 2138
jpayne@69 2139 self._hdr_sync()
jpayne@69 2140
jpayne@69 2141
jpayne@69 2142 def add_meta(self, key, value=None, items=None):
jpayne@69 2143 """Add metadata to this header"""
jpayne@69 2144 if not ((value is not None) ^ (items is not None)):
jpayne@69 2145 raise ValueError('either value or items must be specified')
jpayne@69 2146
jpayne@69 2147 cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
jpayne@69 2148 cdef int quoted
jpayne@69 2149
jpayne@69 2150 try:
jpayne@69 2151 key = force_bytes(key)
jpayne@69 2152 hrec.key = strdup(key)
jpayne@69 2153
jpayne@69 2154 if value is not None:
jpayne@69 2155 hrec.value = strdup(force_bytes(value))
jpayne@69 2156 else:
jpayne@69 2157 for key, value in items:
jpayne@69 2158 quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type")
jpayne@69 2159
jpayne@69 2160 key = force_bytes(key)
jpayne@69 2161 bcf_hrec_add_key(hrec, key, <int>len(key))
jpayne@69 2162
jpayne@69 2163 value = force_bytes(str(value))
jpayne@69 2164 bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
jpayne@69 2165 except:
jpayne@69 2166 bcf_hrec_destroy(hrec)
jpayne@69 2167 raise
jpayne@69 2168
jpayne@69 2169 bcf_hdr_add_hrec(self.ptr, hrec)
jpayne@69 2170
jpayne@69 2171 self._hdr_sync()
jpayne@69 2172
jpayne@69 2173 cdef _add_sample(self, name):
jpayne@69 2174 bname = force_bytes(name)
jpayne@69 2175 if bcf_hdr_add_sample(self.ptr, bname) < 0:
jpayne@69 2176 raise ValueError('Duplicated sample name: {}'.format(name))
jpayne@69 2177
jpayne@69 2178 cdef _hdr_sync(self):
jpayne@69 2179 cdef bcf_hdr_t *hdr = self.ptr
jpayne@69 2180 if hdr.dirty:
jpayne@69 2181 if bcf_hdr_sync(hdr) < 0:
jpayne@69 2182 raise MemoryError('unable to reallocate VariantHeader')
jpayne@69 2183
jpayne@69 2184 def add_sample(self, name):
jpayne@69 2185 """Add a new sample to this header"""
jpayne@69 2186 self._add_sample(name)
jpayne@69 2187 self._hdr_sync()
jpayne@69 2188
jpayne@69 2189 def add_samples(self, *args):
jpayne@69 2190 """Add several new samples to this header.
jpayne@69 2191 This function takes multiple arguments, each of which may
jpayne@69 2192 be either a sample name or an iterable returning sample names
jpayne@69 2193 (e.g., a list of sample names).
jpayne@69 2194 """
jpayne@69 2195 for arg in args:
jpayne@69 2196 if isinstance(arg, str):
jpayne@69 2197 self._add_sample(arg)
jpayne@69 2198 else:
jpayne@69 2199 for name in arg:
jpayne@69 2200 self._add_sample(name)
jpayne@69 2201 self._hdr_sync()
jpayne@69 2202
jpayne@69 2203
jpayne@69 2204 cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
jpayne@69 2205 if not hdr:
jpayne@69 2206 raise ValueError('cannot create VariantHeader')
jpayne@69 2207
jpayne@69 2208 cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
jpayne@69 2209 header.ptr = hdr
jpayne@69 2210
jpayne@69 2211 return header
jpayne@69 2212
jpayne@69 2213
jpayne@69 2214 cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2:
jpayne@69 2215 cdef vdict_t *d
jpayne@69 2216 cdef khiter_t k
jpayne@69 2217 cdef int info_id
jpayne@69 2218
jpayne@69 2219 if isinstance(key, str):
jpayne@69 2220 key = force_bytes(key)
jpayne@69 2221
jpayne@69 2222 d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@69 2223 k = kh_get_vdict(d, key)
jpayne@69 2224
jpayne@69 2225 if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
jpayne@69 2226 return -1
jpayne@69 2227
jpayne@69 2228 return kh_val_vdict(d, k).id
jpayne@69 2229
jpayne@69 2230
jpayne@69 2231 ########################################################################
jpayne@69 2232 ########################################################################
jpayne@69 2233 ## Variant Record objects
jpayne@69 2234 ########################################################################
jpayne@69 2235
jpayne@69 2236 cdef class VariantRecordFilter(object):
jpayne@69 2237 """Filters set on a :class:`VariantRecord` object, presented as a mapping from
jpayne@69 2238 filter index or name to :class:`VariantMetadata` object"""
jpayne@69 2239 def __init__(self, *args, **kwargs):
jpayne@69 2240 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 2241
jpayne@69 2242 def __len__(self):
jpayne@69 2243 return self.record.ptr.d.n_flt
jpayne@69 2244
jpayne@69 2245 def __bool__(self):
jpayne@69 2246 return self.record.ptr.d.n_flt != 0
jpayne@69 2247
jpayne@69 2248 def __getitem__(self, key):
jpayne@69 2249 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2250 cdef bcf1_t *r = self.record.ptr
jpayne@69 2251 cdef int index, id
jpayne@69 2252 cdef int n = r.d.n_flt
jpayne@69 2253
jpayne@69 2254 if isinstance(key, int):
jpayne@69 2255 index = key
jpayne@69 2256
jpayne@69 2257 if index < 0 or index >= n:
jpayne@69 2258 raise IndexError('invalid filter index')
jpayne@69 2259
jpayne@69 2260 id = r.d.flt[index]
jpayne@69 2261 else:
jpayne@69 2262 if key == '.':
jpayne@69 2263 key = 'PASS'
jpayne@69 2264
jpayne@69 2265 bkey = force_bytes(key)
jpayne@69 2266 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
jpayne@69 2267
jpayne@69 2268 if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
jpayne@69 2269 raise KeyError('Invalid filter: {}'.format(key))
jpayne@69 2270
jpayne@69 2271 return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
jpayne@69 2272
jpayne@69 2273 def add(self, key):
jpayne@69 2274 """Add a new filter"""
jpayne@69 2275 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2276 cdef bcf1_t *r = self.record.ptr
jpayne@69 2277 cdef int id
jpayne@69 2278
jpayne@69 2279 if key == '.':
jpayne@69 2280 key = 'PASS'
jpayne@69 2281
jpayne@69 2282 cdef bytes bkey = force_bytes(key)
jpayne@69 2283 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
jpayne@69 2284
jpayne@69 2285 if not check_header_id(hdr, BCF_HL_FLT, id):
jpayne@69 2286 raise KeyError('Invalid filter: {}'.format(key))
jpayne@69 2287
jpayne@69 2288 bcf_add_filter(hdr, r, id)
jpayne@69 2289
jpayne@69 2290 def __delitem__(self, key):
jpayne@69 2291 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2292 cdef bcf1_t *r = self.record.ptr
jpayne@69 2293 cdef int index, id
jpayne@69 2294 cdef int n = r.d.n_flt
jpayne@69 2295
jpayne@69 2296 if isinstance(key, int):
jpayne@69 2297 index = key
jpayne@69 2298
jpayne@69 2299 if index < 0 or index >= n:
jpayne@69 2300 raise IndexError('invalid filter index')
jpayne@69 2301
jpayne@69 2302 id = r.d.flt[index]
jpayne@69 2303 else:
jpayne@69 2304 if key == '.':
jpayne@69 2305 key = 'PASS'
jpayne@69 2306
jpayne@69 2307 bkey = force_bytes(key)
jpayne@69 2308 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
jpayne@69 2309
jpayne@69 2310 if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
jpayne@69 2311 raise KeyError('Invalid filter: {}'.format(key))
jpayne@69 2312
jpayne@69 2313 bcf_remove_filter(hdr, r, id, 0)
jpayne@69 2314
jpayne@69 2315 def clear(self):
jpayne@69 2316 """Clear all filters"""
jpayne@69 2317 cdef bcf1_t *r = self.record.ptr
jpayne@69 2318 r.d.shared_dirty |= BCF1_DIRTY_FLT
jpayne@69 2319 r.d.n_flt = 0
jpayne@69 2320
jpayne@69 2321 def __iter__(self):
jpayne@69 2322 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2323 cdef bcf1_t *r = self.record.ptr
jpayne@69 2324 cdef int i
jpayne@69 2325
jpayne@69 2326 for i in range(r.d.n_flt):
jpayne@69 2327 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
jpayne@69 2328
jpayne@69 2329 def get(self, key, default=None):
jpayne@69 2330 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@69 2331 try:
jpayne@69 2332 return self[key]
jpayne@69 2333 except KeyError:
jpayne@69 2334 return default
jpayne@69 2335
jpayne@69 2336 def __contains__(self, key):
jpayne@69 2337 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2338 cdef bcf1_t *r = self.record.ptr
jpayne@69 2339 cdef bytes bkey = force_bytes(key)
jpayne@69 2340 return bcf_has_filter(hdr, r, bkey) == 1
jpayne@69 2341
jpayne@69 2342 def iterkeys(self):
jpayne@69 2343 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@69 2344 return iter(self)
jpayne@69 2345
jpayne@69 2346 def itervalues(self):
jpayne@69 2347 """D.itervalues() -> an iterator over the values of D"""
jpayne@69 2348 for key in self:
jpayne@69 2349 yield self[key]
jpayne@69 2350
jpayne@69 2351 def iteritems(self):
jpayne@69 2352 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@69 2353 for key in self:
jpayne@69 2354 yield (key, self[key])
jpayne@69 2355
jpayne@69 2356 def keys(self):
jpayne@69 2357 """D.keys() -> list of D's keys"""
jpayne@69 2358 return list(self)
jpayne@69 2359
jpayne@69 2360 def items(self):
jpayne@69 2361 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@69 2362 return list(self.iteritems())
jpayne@69 2363
jpayne@69 2364 def values(self):
jpayne@69 2365 """D.values() -> list of D's values"""
jpayne@69 2366 return list(self.itervalues())
jpayne@69 2367
jpayne@69 2368 def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op):
jpayne@69 2369 if op != 2 and op != 3:
jpayne@69 2370 return NotImplemented
jpayne@69 2371
jpayne@69 2372 cdef bcf1_t *s = self.record.ptr
jpayne@69 2373 cdef bcf1_t *o = other.record.ptr
jpayne@69 2374
jpayne@69 2375 cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other))
jpayne@69 2376
jpayne@69 2377 if op == 3:
jpayne@69 2378 cmp = not cmp
jpayne@69 2379
jpayne@69 2380 return cmp
jpayne@69 2381
jpayne@69 2382 # Mappings are not hashable by default, but subclasses can change this
jpayne@69 2383 __hash__ = None
jpayne@69 2384
jpayne@69 2385 #TODO: implement __richcmp__
jpayne@69 2386
jpayne@69 2387
jpayne@69 2388 cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
jpayne@69 2389 if not record:
jpayne@69 2390 raise ValueError('invalid VariantRecord')
jpayne@69 2391
jpayne@69 2392 cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
jpayne@69 2393 filter.record = record
jpayne@69 2394
jpayne@69 2395 return filter
jpayne@69 2396
jpayne@69 2397
jpayne@69 2398 cdef class VariantRecordFormat(object):
jpayne@69 2399 """Format data present for each sample in a :class:`VariantRecord` object,
jpayne@69 2400 presented as mapping from format name to :class:`VariantMetadata` object."""
jpayne@69 2401 def __init__(self, *args, **kwargs):
jpayne@69 2402 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 2403
jpayne@69 2404 def __len__(self):
jpayne@69 2405 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2406 cdef bcf1_t *r = self.record.ptr
jpayne@69 2407 cdef int i, n = 0
jpayne@69 2408
jpayne@69 2409 for i in range(r.n_fmt):
jpayne@69 2410 if r.d.fmt[i].p:
jpayne@69 2411 n += 1
jpayne@69 2412 return n
jpayne@69 2413
jpayne@69 2414 def __bool__(self):
jpayne@69 2415 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2416 cdef bcf1_t *r = self.record.ptr
jpayne@69 2417 cdef int i
jpayne@69 2418
jpayne@69 2419 for i in range(r.n_fmt):
jpayne@69 2420 if r.d.fmt[i].p:
jpayne@69 2421 return True
jpayne@69 2422 return False
jpayne@69 2423
jpayne@69 2424 def __getitem__(self, key):
jpayne@69 2425 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2426 cdef bcf1_t *r = self.record.ptr
jpayne@69 2427
jpayne@69 2428 cdef bytes bkey = force_bytes(key)
jpayne@69 2429 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@69 2430
jpayne@69 2431 if not fmt or not fmt.p:
jpayne@69 2432 raise KeyError('unknown format: {}'.format(key))
jpayne@69 2433
jpayne@69 2434 return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
jpayne@69 2435
jpayne@69 2436 def __delitem__(self, key):
jpayne@69 2437 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2438 cdef bcf1_t *r = self.record.ptr
jpayne@69 2439
jpayne@69 2440 cdef bytes bkey = force_bytes(key)
jpayne@69 2441 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@69 2442
jpayne@69 2443 if not fmt or not fmt.p:
jpayne@69 2444 raise KeyError('unknown format: {}'.format(key))
jpayne@69 2445
jpayne@69 2446 if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
jpayne@69 2447 raise ValueError('Unable to delete FORMAT')
jpayne@69 2448
jpayne@69 2449 def clear(self):
jpayne@69 2450 """Clear all formats for all samples within the associated
jpayne@69 2451 :class:`VariantRecord` instance"""
jpayne@69 2452 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2453 cdef bcf1_t *r = self.record.ptr
jpayne@69 2454 cdef bcf_fmt_t *fmt
jpayne@69 2455 cdef const char *key
jpayne@69 2456 cdef int i
jpayne@69 2457
jpayne@69 2458 for i in reversed(range(r.n_fmt)):
jpayne@69 2459 fmt = &r.d.fmt[i]
jpayne@69 2460 if fmt.p:
jpayne@69 2461 key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
jpayne@69 2462 if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
jpayne@69 2463 raise ValueError('Unable to delete FORMAT')
jpayne@69 2464
jpayne@69 2465 def __iter__(self):
jpayne@69 2466 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2467 cdef bcf1_t *r = self.record.ptr
jpayne@69 2468 cdef bcf_fmt_t *fmt
jpayne@69 2469 cdef int i
jpayne@69 2470
jpayne@69 2471 for i in range(r.n_fmt):
jpayne@69 2472 fmt = &r.d.fmt[i]
jpayne@69 2473 if fmt.p:
jpayne@69 2474 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
jpayne@69 2475
jpayne@69 2476 def get(self, key, default=None):
jpayne@69 2477 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@69 2478 try:
jpayne@69 2479 return self[key]
jpayne@69 2480 except KeyError:
jpayne@69 2481 return default
jpayne@69 2482
jpayne@69 2483 def __contains__(self, key):
jpayne@69 2484 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2485 cdef bcf1_t *r = self.record.ptr
jpayne@69 2486 cdef bytes bkey = force_bytes(key)
jpayne@69 2487 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@69 2488 return fmt != NULL and fmt.p != NULL
jpayne@69 2489
jpayne@69 2490 def iterkeys(self):
jpayne@69 2491 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@69 2492 return iter(self)
jpayne@69 2493
jpayne@69 2494 def itervalues(self):
jpayne@69 2495 """D.itervalues() -> an iterator over the values of D"""
jpayne@69 2496 for key in self:
jpayne@69 2497 yield self[key]
jpayne@69 2498
jpayne@69 2499 def iteritems(self):
jpayne@69 2500 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@69 2501 for key in self:
jpayne@69 2502 yield (key, self[key])
jpayne@69 2503
jpayne@69 2504 def keys(self):
jpayne@69 2505 """D.keys() -> list of D's keys"""
jpayne@69 2506 return list(self)
jpayne@69 2507
jpayne@69 2508 def items(self):
jpayne@69 2509 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@69 2510 return list(self.iteritems())
jpayne@69 2511
jpayne@69 2512 def values(self):
jpayne@69 2513 """D.values() -> list of D's values"""
jpayne@69 2514 return list(self.itervalues())
jpayne@69 2515
jpayne@69 2516 # Mappings are not hashable by default, but subclasses can change this
jpayne@69 2517 __hash__ = None
jpayne@69 2518
jpayne@69 2519 #TODO: implement __richcmp__
jpayne@69 2520
jpayne@69 2521
jpayne@69 2522 cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
jpayne@69 2523 if not record:
jpayne@69 2524 raise ValueError('invalid VariantRecord')
jpayne@69 2525
jpayne@69 2526 cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
jpayne@69 2527 format.record = record
jpayne@69 2528
jpayne@69 2529 return format
jpayne@69 2530
jpayne@69 2531
jpayne@69 2532 #TODO: Add a getmeta method to return the corresponding VariantMetadata?
jpayne@69 2533 cdef class VariantRecordInfo(object):
jpayne@69 2534 """Info data stored in a :class:`VariantRecord` object, presented as a
jpayne@69 2535 mapping from info metadata name to value."""
jpayne@69 2536
jpayne@69 2537 def __init__(self, *args, **kwargs):
jpayne@69 2538 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 2539
jpayne@69 2540 def __len__(self):
jpayne@69 2541 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2542 cdef bcf1_t *r = self.record.ptr
jpayne@69 2543 cdef bcf_info_t *info
jpayne@69 2544 cdef const char *key
jpayne@69 2545 cdef int i, count = 0
jpayne@69 2546
jpayne@69 2547 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2548 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2549
jpayne@69 2550 for i in range(r.n_info):
jpayne@69 2551 info = &r.d.info[i]
jpayne@69 2552 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@69 2553 if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
jpayne@69 2554 count += 1
jpayne@69 2555
jpayne@69 2556 return count
jpayne@69 2557
jpayne@69 2558 def __bool__(self):
jpayne@69 2559 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2560 cdef bcf1_t *r = self.record.ptr
jpayne@69 2561 cdef bcf_info_t *info
jpayne@69 2562 cdef const char *key
jpayne@69 2563 cdef int i
jpayne@69 2564
jpayne@69 2565 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2566 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2567
jpayne@69 2568 for i in range(r.n_info):
jpayne@69 2569 info = &r.d.info[i]
jpayne@69 2570 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@69 2571 if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
jpayne@69 2572 return True
jpayne@69 2573
jpayne@69 2574 return False
jpayne@69 2575
jpayne@69 2576 def __getitem__(self, key):
jpayne@69 2577 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2578 cdef bcf1_t *r = self.record.ptr
jpayne@69 2579
jpayne@69 2580 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2581 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2582
jpayne@69 2583 cdef bytes bkey = force_bytes(key)
jpayne@69 2584
jpayne@69 2585 if strcmp(bkey, b'END') == 0:
jpayne@69 2586 raise KeyError('END is a reserved attribute; access is via record.stop')
jpayne@69 2587
jpayne@69 2588 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@69 2589
jpayne@69 2590 # Cannot stop here if info == NULL, since flags must return False
jpayne@69 2591 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@69 2592
jpayne@69 2593 if info_id < 0:
jpayne@69 2594 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@69 2595
jpayne@69 2596 if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@69 2597 raise ValueError('Invalid header')
jpayne@69 2598
jpayne@69 2599 # Handle type=Flag values
jpayne@69 2600 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
jpayne@69 2601 return info != NULL and info.vptr != NULL
jpayne@69 2602
jpayne@69 2603 if not info or not info.vptr:
jpayne@69 2604 raise KeyError('Invalid INFO field: {}'.format(key))
jpayne@69 2605
jpayne@69 2606 return bcf_info_get_value(self.record, info)
jpayne@69 2607
jpayne@69 2608 def __setitem__(self, key, value):
jpayne@69 2609 cdef bytes bkey = force_bytes(key)
jpayne@69 2610
jpayne@69 2611 if strcmp(bkey, b'END') == 0:
jpayne@69 2612 raise KeyError('END is a reserved attribute; access is via record.stop')
jpayne@69 2613
jpayne@69 2614 if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0:
jpayne@69 2615 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2616
jpayne@69 2617 bcf_info_set_value(self.record, key, value)
jpayne@69 2618
jpayne@69 2619 def __delitem__(self, key):
jpayne@69 2620 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2621 cdef bcf1_t *r = self.record.ptr
jpayne@69 2622
jpayne@69 2623 cdef bytes bkey = force_bytes(key)
jpayne@69 2624 if strcmp(bkey, b'END') == 0:
jpayne@69 2625 raise KeyError('END is a reserved attribute; access is via record.stop')
jpayne@69 2626
jpayne@69 2627 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2628 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2629
jpayne@69 2630 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@69 2631
jpayne@69 2632 # Cannot stop here if info == NULL, since flags must return False
jpayne@69 2633 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@69 2634
jpayne@69 2635 if info_id < 0:
jpayne@69 2636 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@69 2637
jpayne@69 2638 if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@69 2639 raise ValueError('Invalid header')
jpayne@69 2640
jpayne@69 2641 # Handle flags
jpayne@69 2642 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
jpayne@69 2643 return
jpayne@69 2644
jpayne@69 2645 if not info or not info.vptr:
jpayne@69 2646 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@69 2647
jpayne@69 2648 if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
jpayne@69 2649 raise ValueError('Unable to delete INFO')
jpayne@69 2650
jpayne@69 2651 def clear(self):
jpayne@69 2652 """Clear all info data"""
jpayne@69 2653 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2654 cdef bcf1_t *r = self.record.ptr
jpayne@69 2655 cdef bcf_info_t *info
jpayne@69 2656 cdef const char *key
jpayne@69 2657 cdef int i
jpayne@69 2658
jpayne@69 2659 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2660 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2661
jpayne@69 2662 for i in range(r.n_info):
jpayne@69 2663 info = &r.d.info[i]
jpayne@69 2664 if info and info.vptr:
jpayne@69 2665 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@69 2666 if strcmp(key, b'END') == 0:
jpayne@69 2667 continue
jpayne@69 2668 if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
jpayne@69 2669 raise ValueError('Unable to delete INFO')
jpayne@69 2670
jpayne@69 2671 def __iter__(self):
jpayne@69 2672 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2673 cdef bcf1_t *r = self.record.ptr
jpayne@69 2674 cdef bcf_info_t *info
jpayne@69 2675 cdef const char *key
jpayne@69 2676 cdef int i
jpayne@69 2677
jpayne@69 2678 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2679 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2680
jpayne@69 2681 for i in range(r.n_info):
jpayne@69 2682 info = &r.d.info[i]
jpayne@69 2683 if info and info.vptr:
jpayne@69 2684 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@69 2685 if strcmp(key, b'END') != 0:
jpayne@69 2686 yield bcf_str_cache_get_charptr(key)
jpayne@69 2687
jpayne@69 2688 def get(self, key, default=None):
jpayne@69 2689 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@69 2690 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2691 cdef bcf1_t *r = self.record.ptr
jpayne@69 2692
jpayne@69 2693 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2694 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2695
jpayne@69 2696 cdef bytes bkey = force_bytes(key)
jpayne@69 2697
jpayne@69 2698 if strcmp(bkey, b'END') == 0:
jpayne@69 2699 return default
jpayne@69 2700
jpayne@69 2701 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@69 2702
jpayne@69 2703 # Cannot stop here if info == NULL, since flags must return False
jpayne@69 2704 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@69 2705
jpayne@69 2706 if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@69 2707 raise ValueError('Invalid header')
jpayne@69 2708
jpayne@69 2709 # Handle flags
jpayne@69 2710 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
jpayne@69 2711 return info != NULL and info.vptr != NULL
jpayne@69 2712
jpayne@69 2713 if not info or not info.vptr:
jpayne@69 2714 return default
jpayne@69 2715
jpayne@69 2716 return bcf_info_get_value(self.record, info)
jpayne@69 2717
jpayne@69 2718 def __contains__(self, key):
jpayne@69 2719 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2720 cdef bcf1_t *r = self.record.ptr
jpayne@69 2721
jpayne@69 2722 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2723 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2724
jpayne@69 2725 cdef bytes bkey = force_bytes(key)
jpayne@69 2726
jpayne@69 2727 if strcmp(bkey, b'END') == 0:
jpayne@69 2728 return False
jpayne@69 2729
jpayne@69 2730 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@69 2731
jpayne@69 2732 return info != NULL and info.vptr != NULL
jpayne@69 2733
jpayne@69 2734 def iterkeys(self):
jpayne@69 2735 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@69 2736 return iter(self)
jpayne@69 2737
jpayne@69 2738 def itervalues(self):
jpayne@69 2739 """D.itervalues() -> an iterator over the values of D"""
jpayne@69 2740 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2741 cdef bcf1_t *r = self.record.ptr
jpayne@69 2742 cdef bcf_info_t *info
jpayne@69 2743 cdef const char *key
jpayne@69 2744 cdef int i
jpayne@69 2745
jpayne@69 2746 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2747 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2748
jpayne@69 2749 for i in range(r.n_info):
jpayne@69 2750 info = &r.d.info[i]
jpayne@69 2751 if info and info.vptr:
jpayne@69 2752 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@69 2753 if strcmp(key, b'END') != 0:
jpayne@69 2754 yield bcf_info_get_value(self.record, info)
jpayne@69 2755
jpayne@69 2756 def iteritems(self):
jpayne@69 2757 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@69 2758 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2759 cdef bcf1_t *r = self.record.ptr
jpayne@69 2760 cdef bcf_info_t *info
jpayne@69 2761 cdef const char *key
jpayne@69 2762 cdef int i
jpayne@69 2763
jpayne@69 2764 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2765 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2766
jpayne@69 2767 for i in range(r.n_info):
jpayne@69 2768 info = &r.d.info[i]
jpayne@69 2769 if info and info.vptr:
jpayne@69 2770 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@69 2771 if strcmp(key, b'END') != 0:
jpayne@69 2772 value = bcf_info_get_value(self.record, info)
jpayne@69 2773 yield bcf_str_cache_get_charptr(key), value
jpayne@69 2774
jpayne@69 2775 def keys(self):
jpayne@69 2776 """D.keys() -> list of D's keys"""
jpayne@69 2777 return list(self)
jpayne@69 2778
jpayne@69 2779 def items(self):
jpayne@69 2780 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@69 2781 return list(self.iteritems())
jpayne@69 2782
jpayne@69 2783 def values(self):
jpayne@69 2784 """D.values() -> list of D's values"""
jpayne@69 2785 return list(self.itervalues())
jpayne@69 2786
jpayne@69 2787 def update(self, items=None, **kwargs):
jpayne@69 2788 """D.update([E, ]**F) -> None.
jpayne@69 2789
jpayne@69 2790 Update D from dict/iterable E and F.
jpayne@69 2791 """
jpayne@69 2792 for k, v in items.items():
jpayne@69 2793 if k != 'END':
jpayne@69 2794 self[k] = v
jpayne@69 2795
jpayne@69 2796 if kwargs:
jpayne@69 2797 kwargs.pop('END', None)
jpayne@69 2798 for k, v in kwargs.items():
jpayne@69 2799 self[k] = v
jpayne@69 2800
jpayne@69 2801 def pop(self, key, default=_nothing):
jpayne@69 2802 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2803 cdef bcf1_t *r = self.record.ptr
jpayne@69 2804
jpayne@69 2805 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@69 2806 raise ValueError('Error unpacking VariantRecord')
jpayne@69 2807
jpayne@69 2808 cdef bytes bkey = force_bytes(key)
jpayne@69 2809 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@69 2810
jpayne@69 2811 # Cannot stop here if info == NULL, since flags must return False
jpayne@69 2812 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@69 2813
jpayne@69 2814 if info_id < 0:
jpayne@69 2815 if default is _nothing:
jpayne@69 2816 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@69 2817 return default
jpayne@69 2818
jpayne@69 2819 if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@69 2820 raise ValueError('Invalid header')
jpayne@69 2821
jpayne@69 2822 # Handle flags
jpayne@69 2823 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
jpayne@69 2824 return
jpayne@69 2825
jpayne@69 2826 if not info or not info.vptr:
jpayne@69 2827 if default is _nothing:
jpayne@69 2828 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@69 2829 return default
jpayne@69 2830
jpayne@69 2831 value = bcf_info_get_value(self.record, info)
jpayne@69 2832
jpayne@69 2833 if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
jpayne@69 2834 raise ValueError('Unable to delete INFO')
jpayne@69 2835
jpayne@69 2836 return value
jpayne@69 2837
jpayne@69 2838 def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op):
jpayne@69 2839 if op != 2 and op != 3:
jpayne@69 2840 return NotImplemented
jpayne@69 2841
jpayne@69 2842 cdef bcf1_t *s = self.record.ptr
jpayne@69 2843 cdef bcf1_t *o = other.record.ptr
jpayne@69 2844
jpayne@69 2845 # Cannot use n_info as shortcut logic, since null values may remain
jpayne@69 2846 cdef bint cmp = dict(self) == dict(other)
jpayne@69 2847
jpayne@69 2848 if op == 3:
jpayne@69 2849 cmp = not cmp
jpayne@69 2850
jpayne@69 2851 return cmp
jpayne@69 2852
jpayne@69 2853 # Mappings are not hashable by default, but subclasses can change this
jpayne@69 2854 __hash__ = None
jpayne@69 2855
jpayne@69 2856
jpayne@69 2857 cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
jpayne@69 2858 if not record:
jpayne@69 2859 raise ValueError('invalid VariantRecord')
jpayne@69 2860
jpayne@69 2861 cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
jpayne@69 2862 info.record = record
jpayne@69 2863
jpayne@69 2864 return info
jpayne@69 2865
jpayne@69 2866
jpayne@69 2867 cdef class VariantRecordSamples(object):
jpayne@69 2868 """mapping from sample index or name to :class:`VariantRecordSample` object."""
jpayne@69 2869 def __init__(self, *args, **kwargs):
jpayne@69 2870 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 2871
jpayne@69 2872 def __len__(self):
jpayne@69 2873 return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr)
jpayne@69 2874
jpayne@69 2875 def __bool__(self):
jpayne@69 2876 return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0
jpayne@69 2877
jpayne@69 2878 def __getitem__(self, key):
jpayne@69 2879 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2880 cdef bcf1_t *r = self.record.ptr
jpayne@69 2881 cdef int n = self.record.ptr.n_sample
jpayne@69 2882 cdef int sample_index
jpayne@69 2883 cdef vdict_t *d
jpayne@69 2884 cdef khiter_t k
jpayne@69 2885
jpayne@69 2886 if isinstance(key, int):
jpayne@69 2887 sample_index = key
jpayne@69 2888 else:
jpayne@69 2889 bkey = force_bytes(key)
jpayne@69 2890 sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
jpayne@69 2891 if sample_index < 0:
jpayne@69 2892 raise KeyError('invalid sample name: {}'.format(key))
jpayne@69 2893
jpayne@69 2894 if sample_index < 0 or sample_index >= n:
jpayne@69 2895 raise IndexError('invalid sample index')
jpayne@69 2896
jpayne@69 2897 return makeVariantRecordSample(self.record, sample_index)
jpayne@69 2898
jpayne@69 2899 def __iter__(self):
jpayne@69 2900 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2901 cdef bcf1_t *r = self.record.ptr
jpayne@69 2902 cdef int32_t i, n = self.record.ptr.n_sample
jpayne@69 2903
jpayne@69 2904 for i in range(n):
jpayne@69 2905 yield charptr_to_str(hdr.samples[i])
jpayne@69 2906
jpayne@69 2907 def get(self, key, default=None):
jpayne@69 2908 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@69 2909 try:
jpayne@69 2910 return self[key]
jpayne@69 2911 except KeyError:
jpayne@69 2912 return default
jpayne@69 2913
jpayne@69 2914 def __contains__(self, key):
jpayne@69 2915 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2916 cdef bcf1_t *r = self.record.ptr
jpayne@69 2917 cdef int n = self.record.ptr.n_sample
jpayne@69 2918 cdef int sample_index
jpayne@69 2919 cdef vdict_t *d
jpayne@69 2920 cdef khiter_t k
jpayne@69 2921
jpayne@69 2922 if isinstance(key, int):
jpayne@69 2923 sample_index = key
jpayne@69 2924 else:
jpayne@69 2925 bkey = force_bytes(key)
jpayne@69 2926 sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
jpayne@69 2927 if sample_index < 0:
jpayne@69 2928 raise KeyError('invalid sample name: {}'.format(key))
jpayne@69 2929
jpayne@69 2930 return 0 <= sample_index < n
jpayne@69 2931
jpayne@69 2932 def iterkeys(self):
jpayne@69 2933 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@69 2934 return iter(self)
jpayne@69 2935
jpayne@69 2936 def itervalues(self):
jpayne@69 2937 """D.itervalues() -> an iterator over the values of D"""
jpayne@69 2938 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2939 cdef bcf1_t *r = self.record.ptr
jpayne@69 2940 cdef int32_t i, n = self.record.ptr.n_sample
jpayne@69 2941
jpayne@69 2942 for i in range(n):
jpayne@69 2943 yield makeVariantRecordSample(self.record, i)
jpayne@69 2944
jpayne@69 2945 def iteritems(self):
jpayne@69 2946 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@69 2947 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 2948 cdef bcf1_t *r = self.record.ptr
jpayne@69 2949 cdef int32_t i, n = self.record.ptr.n_sample
jpayne@69 2950
jpayne@69 2951 for i in range(n):
jpayne@69 2952 yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
jpayne@69 2953
jpayne@69 2954 def keys(self):
jpayne@69 2955 """D.keys() -> list of D's keys"""
jpayne@69 2956 return list(self)
jpayne@69 2957
jpayne@69 2958 def items(self):
jpayne@69 2959 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@69 2960 return list(self.iteritems())
jpayne@69 2961
jpayne@69 2962 def values(self):
jpayne@69 2963 """D.values() -> list of D's values"""
jpayne@69 2964 return list(self.itervalues())
jpayne@69 2965
jpayne@69 2966 def update(self, items=None, **kwargs):
jpayne@69 2967 """D.update([E, ]**F) -> None.
jpayne@69 2968
jpayne@69 2969 Update D from dict/iterable E and F.
jpayne@69 2970 """
jpayne@69 2971 for k, v in items.items():
jpayne@69 2972 self[k] = v
jpayne@69 2973
jpayne@69 2974 if kwargs:
jpayne@69 2975 for k, v in kwargs.items():
jpayne@69 2976 self[k] = v
jpayne@69 2977
jpayne@69 2978 def pop(self, key, default=_nothing):
jpayne@69 2979 try:
jpayne@69 2980 value = self[key]
jpayne@69 2981 del self[key]
jpayne@69 2982 return value
jpayne@69 2983 except KeyError:
jpayne@69 2984 if default is not _nothing:
jpayne@69 2985 return default
jpayne@69 2986 raise
jpayne@69 2987
jpayne@69 2988 def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op):
jpayne@69 2989 if op != 2 and op != 3:
jpayne@69 2990 return NotImplemented
jpayne@69 2991
jpayne@69 2992 cdef bcf1_t *s = self.record.ptr
jpayne@69 2993 cdef bcf1_t *o = other.record.ptr
jpayne@69 2994
jpayne@69 2995 cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values())
jpayne@69 2996
jpayne@69 2997 if op == 3:
jpayne@69 2998 cmp = not cmp
jpayne@69 2999
jpayne@69 3000 return cmp
jpayne@69 3001
jpayne@69 3002 # Mappings are not hashable by default, but subclasses can change this
jpayne@69 3003 __hash__ = None
jpayne@69 3004
jpayne@69 3005
jpayne@69 3006 cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
jpayne@69 3007 if not record:
jpayne@69 3008 raise ValueError('invalid VariantRecord')
jpayne@69 3009
jpayne@69 3010 cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
jpayne@69 3011 VariantRecordSamples)
jpayne@69 3012 samples.record = record
jpayne@69 3013
jpayne@69 3014 return samples
jpayne@69 3015
jpayne@69 3016
jpayne@69 3017 cdef class VariantRecord(object):
jpayne@69 3018 """Variant record"""
jpayne@69 3019 def __init__(self, *args, **kwargs):
jpayne@69 3020 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 3021
jpayne@69 3022 def __dealloc__(self):
jpayne@69 3023 if self.ptr:
jpayne@69 3024 bcf_destroy1(self.ptr)
jpayne@69 3025 self.ptr = NULL
jpayne@69 3026
jpayne@69 3027 def copy(self):
jpayne@69 3028 """return a copy of this VariantRecord object"""
jpayne@69 3029 return makeVariantRecord(self.header, bcf_dup(self.ptr))
jpayne@69 3030
jpayne@69 3031 def translate(self, VariantHeader dst_header):
jpayne@69 3032 if dst_header is None:
jpayne@69 3033 raise ValueError('dst_header must not be None')
jpayne@69 3034
jpayne@69 3035 cdef bcf_hdr_t *src_hdr = self.header.ptr
jpayne@69 3036 cdef bcf_hdr_t *dst_hdr = dst_header.ptr
jpayne@69 3037
jpayne@69 3038 if src_hdr != dst_hdr:
jpayne@69 3039 if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
jpayne@69 3040 msg = 'Cannot translate record. Number of samples does not match header ({} vs {})'
jpayne@69 3041 raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
jpayne@69 3042
jpayne@69 3043 bcf_translate(dst_hdr, src_hdr, self.ptr)
jpayne@69 3044 self.header = dst_header
jpayne@69 3045
jpayne@69 3046 @property
jpayne@69 3047 def rid(self):
jpayne@69 3048 """internal reference id number"""
jpayne@69 3049 return self.ptr.rid
jpayne@69 3050
jpayne@69 3051 @rid.setter
jpayne@69 3052 def rid(self, value):
jpayne@69 3053 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 3054 cdef int r = value
jpayne@69 3055 if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
jpayne@69 3056 raise ValueError('invalid reference id')
jpayne@69 3057 self.ptr.rid = r
jpayne@69 3058
jpayne@69 3059 @property
jpayne@69 3060 def chrom(self):
jpayne@69 3061 """chromosome/contig name"""
jpayne@69 3062 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 3063 cdef int rid = self.ptr.rid
jpayne@69 3064 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
jpayne@69 3065 raise ValueError('Invalid header')
jpayne@69 3066 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
jpayne@69 3067
jpayne@69 3068 @chrom.setter
jpayne@69 3069 def chrom(self, value):
jpayne@69 3070 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
jpayne@69 3071 bchrom = force_bytes(value)
jpayne@69 3072 cdef khint_t k = kh_get_vdict(d, bchrom)
jpayne@69 3073 if k == kh_end(d):
jpayne@69 3074 raise ValueError('Invalid chromosome/contig')
jpayne@69 3075 self.ptr.rid = kh_val_vdict(d, k).id
jpayne@69 3076
jpayne@69 3077 @property
jpayne@69 3078 def contig(self):
jpayne@69 3079 """chromosome/contig name"""
jpayne@69 3080 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 3081 cdef int rid = self.ptr.rid
jpayne@69 3082 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
jpayne@69 3083 raise ValueError('Invalid header')
jpayne@69 3084 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
jpayne@69 3085
jpayne@69 3086 @contig.setter
jpayne@69 3087 def contig(self, value):
jpayne@69 3088 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
jpayne@69 3089 bchrom = force_bytes(value)
jpayne@69 3090 cdef khint_t k = kh_get_vdict(d, bchrom)
jpayne@69 3091 if k == kh_end(d):
jpayne@69 3092 raise ValueError('Invalid chromosome/contig')
jpayne@69 3093 self.ptr.rid = kh_val_vdict(d, k).id
jpayne@69 3094
jpayne@69 3095 @property
jpayne@69 3096 def pos(self):
jpayne@69 3097 """record start position on chrom/contig (1-based inclusive)"""
jpayne@69 3098 return self.ptr.pos + 1
jpayne@69 3099
jpayne@69 3100 @pos.setter
jpayne@69 3101 def pos(self, value):
jpayne@69 3102 cdef int p = value
jpayne@69 3103 if p < 1:
jpayne@69 3104 raise ValueError('Position must be positive')
jpayne@69 3105 self.ptr.pos = p - 1
jpayne@69 3106 bcf_sync_end(self)
jpayne@69 3107
jpayne@69 3108 @property
jpayne@69 3109 def start(self):
jpayne@69 3110 """record start position on chrom/contig (0-based inclusive)"""
jpayne@69 3111 return self.ptr.pos
jpayne@69 3112
jpayne@69 3113 @start.setter
jpayne@69 3114 def start(self, value):
jpayne@69 3115 cdef int s = value
jpayne@69 3116 if s < 0:
jpayne@69 3117 raise ValueError('Start coordinate must be non-negative')
jpayne@69 3118 self.ptr.pos = s
jpayne@69 3119 bcf_sync_end(self)
jpayne@69 3120
jpayne@69 3121 @property
jpayne@69 3122 def stop(self):
jpayne@69 3123 """record stop position on chrom/contig (0-based exclusive)"""
jpayne@69 3124 return self.ptr.pos + self.ptr.rlen
jpayne@69 3125
jpayne@69 3126 @stop.setter
jpayne@69 3127 def stop(self, value):
jpayne@69 3128 cdef int s = value
jpayne@69 3129 if s < 0:
jpayne@69 3130 raise ValueError('Stop coordinate must be non-negative')
jpayne@69 3131 self.ptr.rlen = s - self.ptr.pos
jpayne@69 3132 bcf_sync_end(self)
jpayne@69 3133
jpayne@69 3134 @property
jpayne@69 3135 def rlen(self):
jpayne@69 3136 """record length on chrom/contig (aka rec.stop - rec.start)"""
jpayne@69 3137 return self.ptr.rlen
jpayne@69 3138
jpayne@69 3139 @rlen.setter
jpayne@69 3140 def rlen(self, value):
jpayne@69 3141 cdef int r = value
jpayne@69 3142 self.ptr.rlen = r
jpayne@69 3143 bcf_sync_end(self)
jpayne@69 3144
jpayne@69 3145 @property
jpayne@69 3146 def qual(self):
jpayne@69 3147 """phred scaled quality score or None if not available"""
jpayne@69 3148 return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
jpayne@69 3149
jpayne@69 3150 @qual.setter
jpayne@69 3151 def qual(self, value):
jpayne@69 3152 if value is not None:
jpayne@69 3153 self.ptr.qual = value
jpayne@69 3154 else:
jpayne@69 3155 bcf_float_set(&self.ptr.qual, bcf_float_missing)
jpayne@69 3156
jpayne@69 3157
jpayne@69 3158 # @property
jpayne@69 3159 # def n_allele(self):
jpayne@69 3160 # return self.ptr.n_allele
jpayne@69 3161
jpayne@69 3162 # @property
jpayne@69 3163 # def n_sample(self):
jpayne@69 3164 # return self.ptr.n_sample
jpayne@69 3165
jpayne@69 3166 @property
jpayne@69 3167 def id(self):
jpayne@69 3168 """record identifier or None if not available"""
jpayne@69 3169 cdef bcf1_t *r = self.ptr
jpayne@69 3170 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@69 3171 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3172 # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
jpayne@69 3173 # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
jpayne@69 3174 if (r.d.m_id == 0):
jpayne@69 3175 raise ValueError('Error extracting ID')
jpayne@69 3176 return charptr_to_str(r.d.id) if r.d.id != b'.' else None
jpayne@69 3177
jpayne@69 3178 @id.setter
jpayne@69 3179 def id(self, value):
jpayne@69 3180 cdef bcf1_t *r = self.ptr
jpayne@69 3181 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@69 3182 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3183 cdef char *idstr = NULL
jpayne@69 3184 if value is not None:
jpayne@69 3185 bid = force_bytes(value)
jpayne@69 3186 idstr = bid
jpayne@69 3187 if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
jpayne@69 3188 raise ValueError('Error updating id')
jpayne@69 3189
jpayne@69 3190 @property
jpayne@69 3191 def ref(self):
jpayne@69 3192 """reference allele"""
jpayne@69 3193 cdef bcf1_t *r = self.ptr
jpayne@69 3194 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@69 3195 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3196 return charptr_to_str(r.d.allele[0]) if r.d.allele else None
jpayne@69 3197
jpayne@69 3198 @ref.setter
jpayne@69 3199 def ref(self, value):
jpayne@69 3200 cdef bcf1_t *r = self.ptr
jpayne@69 3201 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@69 3202 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3203 #FIXME: Set alleles directly -- this is stupid
jpayne@69 3204 if not value:
jpayne@69 3205 raise ValueError('ref allele must not be null')
jpayne@69 3206 value = force_bytes(value)
jpayne@69 3207 if r.d.allele and r.n_allele:
jpayne@69 3208 alleles = [r.d.allele[i] for i in range(r.n_allele)]
jpayne@69 3209 alleles[0] = value
jpayne@69 3210 else:
jpayne@69 3211 alleles = [value, '<NON_REF>']
jpayne@69 3212 self.alleles = alleles
jpayne@69 3213 bcf_sync_end(self)
jpayne@69 3214
jpayne@69 3215 @property
jpayne@69 3216 def alleles(self):
jpayne@69 3217 """tuple of reference allele followed by alt alleles"""
jpayne@69 3218 cdef bcf1_t *r = self.ptr
jpayne@69 3219 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@69 3220 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3221 if not r.d.allele:
jpayne@69 3222 return None
jpayne@69 3223 cdef tuple res = PyTuple_New(r.n_allele)
jpayne@69 3224 for i in range(r.n_allele):
jpayne@69 3225 a = charptr_to_str(r.d.allele[i])
jpayne@69 3226 PyTuple_SET_ITEM(res, i, a)
jpayne@69 3227 Py_INCREF(a)
jpayne@69 3228 return res
jpayne@69 3229
jpayne@69 3230 @alleles.setter
jpayne@69 3231 def alleles(self, values):
jpayne@69 3232 cdef bcf1_t *r = self.ptr
jpayne@69 3233
jpayne@69 3234 # Cache rlen of symbolic alleles before call to bcf_update_alleles_str
jpayne@69 3235 cdef int rlen = r.rlen
jpayne@69 3236
jpayne@69 3237 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@69 3238 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3239
jpayne@69 3240 values = [force_bytes(v) for v in values]
jpayne@69 3241
jpayne@69 3242 if len(values) < 2:
jpayne@69 3243 raise ValueError('must set at least 2 alleles')
jpayne@69 3244
jpayne@69 3245 if b'' in values:
jpayne@69 3246 raise ValueError('cannot set null allele')
jpayne@69 3247
jpayne@69 3248 value = b','.join(values)
jpayne@69 3249
jpayne@69 3250 if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
jpayne@69 3251 raise ValueError('Error updating alleles')
jpayne@69 3252
jpayne@69 3253 # Reset rlen if alternate allele isn't symbolic, otherwise used cached
jpayne@69 3254 if has_symbolic_allele(self):
jpayne@69 3255 self.ptr.rlen = rlen
jpayne@69 3256 else:
jpayne@69 3257 self.ptr.rlen = len(values[0])
jpayne@69 3258 r.d.var_type = -1
jpayne@69 3259 bcf_sync_end(self)
jpayne@69 3260
jpayne@69 3261 @property
jpayne@69 3262 def alts(self):
jpayne@69 3263 """tuple of alt alleles"""
jpayne@69 3264 cdef bcf1_t *r = self.ptr
jpayne@69 3265 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@69 3266 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3267 if r.n_allele < 2 or not r.d.allele:
jpayne@69 3268 return None
jpayne@69 3269 cdef tuple res = PyTuple_New(r.n_allele - 1)
jpayne@69 3270 for i in range(1, r.n_allele):
jpayne@69 3271 a = charptr_to_str(r.d.allele[i])
jpayne@69 3272 PyTuple_SET_ITEM(res, i - 1, a)
jpayne@69 3273 Py_INCREF(a)
jpayne@69 3274 return res
jpayne@69 3275
jpayne@69 3276 @alts.setter
jpayne@69 3277 def alts(self, value):
jpayne@69 3278 #FIXME: Set alleles directly -- this is stupid
jpayne@69 3279 cdef bcf1_t *r = self.ptr
jpayne@69 3280 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@69 3281 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3282 value = [force_bytes(v) for v in value]
jpayne@69 3283 if b'' in value:
jpayne@69 3284 raise ValueError('cannot set null alt allele')
jpayne@69 3285 ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
jpayne@69 3286 self.alleles = ref + value
jpayne@69 3287 r.d.var_type = -1
jpayne@69 3288
jpayne@69 3289 @property
jpayne@69 3290 def filter(self):
jpayne@69 3291 """filter information (see :class:`VariantRecordFilter`)"""
jpayne@69 3292 if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
jpayne@69 3293 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3294 return makeVariantRecordFilter(self)
jpayne@69 3295
jpayne@69 3296 @property
jpayne@69 3297 def info(self):
jpayne@69 3298 """info data (see :class:`VariantRecordInfo`)"""
jpayne@69 3299 if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
jpayne@69 3300 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3301 return makeVariantRecordInfo(self)
jpayne@69 3302
jpayne@69 3303 @property
jpayne@69 3304 def format(self):
jpayne@69 3305 """sample format metadata (see :class:`VariantRecordFormat`)"""
jpayne@69 3306 if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
jpayne@69 3307 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3308 return makeVariantRecordFormat(self)
jpayne@69 3309
jpayne@69 3310 @property
jpayne@69 3311 def samples(self):
jpayne@69 3312 """sample data (see :class:`VariantRecordSamples`)"""
jpayne@69 3313 if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
jpayne@69 3314 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3315 return makeVariantRecordSamples(self)
jpayne@69 3316
jpayne@69 3317 property alleles_variant_types:
jpayne@69 3318 def __get__(self):
jpayne@69 3319 cdef bcf1_t *r = self.ptr
jpayne@69 3320 cdef tuple result = PyTuple_New(r.n_allele)
jpayne@69 3321
jpayne@69 3322 for i in range(r.n_allele):
jpayne@69 3323 tp = bcf_get_variant_type(r, i)
jpayne@69 3324
jpayne@69 3325 if tp == VCF_REF:
jpayne@69 3326 v_type = "REF"
jpayne@69 3327 elif tp == VCF_SNP:
jpayne@69 3328 v_type = "SNP"
jpayne@69 3329 elif tp == VCF_MNP:
jpayne@69 3330 v_type = "MNP"
jpayne@69 3331 elif tp == VCF_INDEL:
jpayne@69 3332 v_type = "INDEL"
jpayne@69 3333 elif tp == VCF_BND:
jpayne@69 3334 v_type = "BND"
jpayne@69 3335 elif tp == VCF_OVERLAP:
jpayne@69 3336 v_type = "OVERLAP"
jpayne@69 3337 else:
jpayne@69 3338 v_type = "OTHER"
jpayne@69 3339
jpayne@69 3340 PyTuple_SET_ITEM(result, i, v_type)
jpayne@69 3341 Py_INCREF(v_type)
jpayne@69 3342
jpayne@69 3343 return result
jpayne@69 3344
jpayne@69 3345 def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op):
jpayne@69 3346 if op != 2 and op != 3:
jpayne@69 3347 return NotImplemented
jpayne@69 3348
jpayne@69 3349 cdef bcf1_t *s = self.ptr
jpayne@69 3350 cdef bcf1_t *o = other.ptr
jpayne@69 3351
jpayne@69 3352 cdef bint cmp = self is other or (
jpayne@69 3353 s.pos == o.pos
jpayne@69 3354 and s.rlen == o.rlen
jpayne@69 3355 and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual))
jpayne@69 3356 or s.qual == o.qual)
jpayne@69 3357 and s.n_sample == o.n_sample
jpayne@69 3358 and s.n_allele == o.n_allele
jpayne@69 3359 and self.contig == other.contig
jpayne@69 3360 and self.alleles == other.alleles
jpayne@69 3361 and self.id == other.id
jpayne@69 3362 and self.info == other.info
jpayne@69 3363 and self.filter == other.filter
jpayne@69 3364 and self.samples == other.samples)
jpayne@69 3365
jpayne@69 3366 if op == 3:
jpayne@69 3367 cmp = not cmp
jpayne@69 3368
jpayne@69 3369 return cmp
jpayne@69 3370
jpayne@69 3371 def __str__(self):
jpayne@69 3372 cdef kstring_t line
jpayne@69 3373 cdef char c
jpayne@69 3374
jpayne@69 3375 line.l = line.m = 0
jpayne@69 3376 line.s = NULL
jpayne@69 3377
jpayne@69 3378 if vcf_format(self.header.ptr, self.ptr, &line) < 0:
jpayne@69 3379 if line.m:
jpayne@69 3380 free(line.s)
jpayne@69 3381 raise ValueError('vcf_format failed')
jpayne@69 3382
jpayne@69 3383 # Strip CR/LF?
jpayne@69 3384 #while line.l:
jpayne@69 3385 # c = line.s[line.l - 1]
jpayne@69 3386 # if c != b'\n' and c != b'\r':
jpayne@69 3387 # break
jpayne@69 3388 # line.l -= 1
jpayne@69 3389
jpayne@69 3390 ret = charptr_to_str_w_len(line.s, line.l)
jpayne@69 3391
jpayne@69 3392 if line.m:
jpayne@69 3393 free(line.s)
jpayne@69 3394
jpayne@69 3395 return ret
jpayne@69 3396
jpayne@69 3397
jpayne@69 3398 cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
jpayne@69 3399 if not header:
jpayne@69 3400 raise ValueError('invalid VariantHeader')
jpayne@69 3401
jpayne@69 3402 if not r:
jpayne@69 3403 raise ValueError('cannot create VariantRecord')
jpayne@69 3404
jpayne@69 3405 if r.errcode:
jpayne@69 3406 msg = []
jpayne@69 3407 #if r.errcode & BCF_ERR_CTG_UNDEF:
jpayne@69 3408 # msg.append('undefined contig')
jpayne@69 3409 #if r.errcode & BCF_ERR_TAG_UNDEF:
jpayne@69 3410 # msg.append('undefined tag')
jpayne@69 3411 if r.errcode & BCF_ERR_NCOLS:
jpayne@69 3412 msg.append('invalid number of columns')
jpayne@69 3413 if r.errcode & BCF_ERR_LIMITS:
jpayne@69 3414 msg.append('limits violated')
jpayne@69 3415 if r.errcode & BCF_ERR_CHAR:
jpayne@69 3416 msg.append('invalid character found')
jpayne@69 3417 if r.errcode & BCF_ERR_CTG_INVALID:
jpayne@69 3418 msg.append('invalid contig')
jpayne@69 3419 if r.errcode & BCF_ERR_TAG_INVALID:
jpayne@69 3420 msg.append('invalid tag')
jpayne@69 3421
jpayne@69 3422 if msg:
jpayne@69 3423 msg = ', '.join(msg)
jpayne@69 3424 raise ValueError('Error(s) reading record: {}'.format(msg))
jpayne@69 3425
jpayne@69 3426 cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
jpayne@69 3427 record.header = header
jpayne@69 3428 record.ptr = r
jpayne@69 3429
jpayne@69 3430 return record
jpayne@69 3431
jpayne@69 3432
jpayne@69 3433 ########################################################################
jpayne@69 3434 ########################################################################
jpayne@69 3435 ## Variant Sampletype object
jpayne@69 3436 ########################################################################
jpayne@69 3437
jpayne@69 3438
jpayne@69 3439 cdef class VariantRecordSample(object):
jpayne@69 3440 """Data for a single sample from a :class:`VariantRecord` object.
jpayne@69 3441 Provides data accessors for genotypes and a mapping interface
jpayne@69 3442 from format name to values.
jpayne@69 3443 """
jpayne@69 3444 def __init__(self, *args, **kwargs):
jpayne@69 3445 raise TypeError('this class cannot be instantiated from Python')
jpayne@69 3446
jpayne@69 3447 @property
jpayne@69 3448 def name(self):
jpayne@69 3449 """sample name"""
jpayne@69 3450 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 3451 cdef bcf1_t *r = self.record.ptr
jpayne@69 3452 cdef int32_t n = r.n_sample
jpayne@69 3453
jpayne@69 3454 if self.index < 0 or self.index >= n:
jpayne@69 3455 raise ValueError('invalid sample index')
jpayne@69 3456
jpayne@69 3457 return charptr_to_str(hdr.samples[self.index])
jpayne@69 3458
jpayne@69 3459 @property
jpayne@69 3460 def allele_indices(self):
jpayne@69 3461 """allele indices for called genotype, if present. Otherwise None"""
jpayne@69 3462 return bcf_format_get_allele_indices(self)
jpayne@69 3463
jpayne@69 3464 @allele_indices.setter
jpayne@69 3465 def allele_indices(self, value):
jpayne@69 3466 self['GT'] = value
jpayne@69 3467
jpayne@69 3468 @allele_indices.deleter
jpayne@69 3469 def allele_indices(self):
jpayne@69 3470 self['GT'] = ()
jpayne@69 3471
jpayne@69 3472 @property
jpayne@69 3473 def alleles(self):
jpayne@69 3474 """alleles for called genotype, if present. Otherwise None"""
jpayne@69 3475 return bcf_format_get_alleles(self)
jpayne@69 3476
jpayne@69 3477 @alleles.setter
jpayne@69 3478 def alleles(self, value):
jpayne@69 3479 # Sets the genotype, supply a tuple of alleles to set.
jpayne@69 3480 # The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord
jpayne@69 3481 # The genotype is reset when an empty tuple, None or (None,) is supplied
jpayne@69 3482
jpayne@69 3483 if value==(None,) or value==tuple() or value is None:
jpayne@69 3484 self['GT'] = ()
jpayne@69 3485 return
jpayne@69 3486
jpayne@69 3487 if any((type(x) == int for x in value)):
jpayne@69 3488 raise ValueError('Use .allele_indices to set integer allele indices')
jpayne@69 3489
jpayne@69 3490 # determine and set allele indices:
jpayne@69 3491 try:
jpayne@69 3492 self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) )
jpayne@69 3493 except ValueError:
jpayne@69 3494 raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord."
jpayne@69 3495 "First set the .alleles of this record to define the alleles")
jpayne@69 3496
jpayne@69 3497 @alleles.deleter
jpayne@69 3498 def alleles(self):
jpayne@69 3499 self['GT'] = ()
jpayne@69 3500
jpayne@69 3501 @property
jpayne@69 3502 def phased(self):
jpayne@69 3503 """False if genotype is missing or any allele is unphased. Otherwise True."""
jpayne@69 3504 return bcf_sample_get_phased(self)
jpayne@69 3505
jpayne@69 3506 @phased.setter
jpayne@69 3507 def phased(self, value):
jpayne@69 3508 bcf_sample_set_phased(self, value)
jpayne@69 3509
jpayne@69 3510 def __len__(self):
jpayne@69 3511 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 3512 cdef bcf1_t *r = self.record.ptr
jpayne@69 3513 cdef int i, n = 0
jpayne@69 3514
jpayne@69 3515 if bcf_unpack(r, BCF_UN_FMT) < 0:
jpayne@69 3516 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3517
jpayne@69 3518 for i in range(r.n_fmt):
jpayne@69 3519 if r.d.fmt[i].p:
jpayne@69 3520 n += 1
jpayne@69 3521 return n
jpayne@69 3522
jpayne@69 3523 def __bool__(self):
jpayne@69 3524 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 3525 cdef bcf1_t *r = self.record.ptr
jpayne@69 3526 cdef int i
jpayne@69 3527
jpayne@69 3528 if bcf_unpack(r, BCF_UN_FMT) < 0:
jpayne@69 3529 raise ValueError('Error unpacking VariantRecord')
jpayne@69 3530
jpayne@69 3531 for i in range(r.n_fmt):
jpayne@69 3532 if r.d.fmt[i].p:
jpayne@69 3533 return True
jpayne@69 3534 return False
jpayne@69 3535
jpayne@69 3536 def __getitem__(self, key):
jpayne@69 3537 return bcf_format_get_value(self, key)
jpayne@69 3538
jpayne@69 3539 def __setitem__(self, key, value):
jpayne@69 3540 bcf_format_set_value(self, key, value)
jpayne@69 3541
jpayne@69 3542 def __delitem__(self, key):
jpayne@69 3543 bcf_format_del_value(self, key)
jpayne@69 3544
jpayne@69 3545 def clear(self):
jpayne@69 3546 """Clear all format data (including genotype) for this sample"""
jpayne@69 3547 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 3548 cdef bcf1_t *r = self.record.ptr
jpayne@69 3549 cdef bcf_fmt_t *fmt
jpayne@69 3550 cdef int i
jpayne@69 3551
jpayne@69 3552 for i in range(r.n_fmt):
jpayne@69 3553 fmt = &r.d.fmt[i]
jpayne@69 3554 if fmt.p:
jpayne@69 3555 bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
jpayne@69 3556
jpayne@69 3557 def __iter__(self):
jpayne@69 3558 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 3559 cdef bcf1_t *r = self.record.ptr
jpayne@69 3560 cdef bcf_fmt_t *fmt
jpayne@69 3561 cdef int i
jpayne@69 3562
jpayne@69 3563 for i in range(r.n_fmt):
jpayne@69 3564 fmt = &r.d.fmt[i]
jpayne@69 3565 if r.d.fmt[i].p:
jpayne@69 3566 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
jpayne@69 3567
jpayne@69 3568 def get(self, key, default=None):
jpayne@69 3569 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@69 3570 try:
jpayne@69 3571 return self[key]
jpayne@69 3572 except KeyError:
jpayne@69 3573 return default
jpayne@69 3574
jpayne@69 3575 def __contains__(self, key):
jpayne@69 3576 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@69 3577 cdef bcf1_t *r = self.record.ptr
jpayne@69 3578 cdef bytes bkey = force_bytes(key)
jpayne@69 3579 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@69 3580 return fmt != NULL and fmt.p != NULL
jpayne@69 3581
jpayne@69 3582 def iterkeys(self):
jpayne@69 3583 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@69 3584 return iter(self)
jpayne@69 3585
jpayne@69 3586 def itervalues(self):
jpayne@69 3587 """D.itervalues() -> an iterator over the values of D"""
jpayne@69 3588 for key in self:
jpayne@69 3589 yield self[key]
jpayne@69 3590
jpayne@69 3591 def iteritems(self):
jpayne@69 3592 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@69 3593 for key in self:
jpayne@69 3594 yield (key, self[key])
jpayne@69 3595
jpayne@69 3596 def keys(self):
jpayne@69 3597 """D.keys() -> list of D's keys"""
jpayne@69 3598 return list(self)
jpayne@69 3599
jpayne@69 3600 def items(self):
jpayne@69 3601 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@69 3602 return list(self.iteritems())
jpayne@69 3603
jpayne@69 3604 def values(self):
jpayne@69 3605 """D.values() -> list of D's values"""
jpayne@69 3606 return list(self.itervalues())
jpayne@69 3607
jpayne@69 3608 def update(self, items=None, **kwargs):
jpayne@69 3609 """D.update([E, ]**F) -> None.
jpayne@69 3610
jpayne@69 3611 Update D from dict/iterable E and F.
jpayne@69 3612 """
jpayne@69 3613 for k, v in items.items():
jpayne@69 3614 self[k] = v
jpayne@69 3615
jpayne@69 3616 if kwargs:
jpayne@69 3617 for k, v in kwargs.items():
jpayne@69 3618 self[k] = v
jpayne@69 3619
jpayne@69 3620 def pop(self, key, default=_nothing):
jpayne@69 3621 try:
jpayne@69 3622 value = self[key]
jpayne@69 3623 del self[key]
jpayne@69 3624 return value
jpayne@69 3625 except KeyError:
jpayne@69 3626 if default is not _nothing:
jpayne@69 3627 return default
jpayne@69 3628 raise
jpayne@69 3629
jpayne@69 3630 def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op):
jpayne@69 3631 if op != 2 and op != 3:
jpayne@69 3632 return NotImplemented
jpayne@69 3633
jpayne@69 3634 cdef bint cmp = dict(self) == dict(other)
jpayne@69 3635
jpayne@69 3636 if op == 3:
jpayne@69 3637 cmp = not cmp
jpayne@69 3638
jpayne@69 3639 return cmp
jpayne@69 3640
jpayne@69 3641 # Mappings are not hashable by default, but subclasses can change this
jpayne@69 3642 __hash__ = None
jpayne@69 3643
jpayne@69 3644
jpayne@69 3645 cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
jpayne@69 3646 if not record or sample_index < 0:
jpayne@69 3647 raise ValueError('cannot create VariantRecordSample')
jpayne@69 3648
jpayne@69 3649 cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
jpayne@69 3650 sample.record = record
jpayne@69 3651 sample.index = sample_index
jpayne@69 3652
jpayne@69 3653 return sample
jpayne@69 3654
jpayne@69 3655
jpayne@69 3656 ########################################################################
jpayne@69 3657 ########################################################################
jpayne@69 3658 ## Index objects
jpayne@69 3659 ########################################################################
jpayne@69 3660
jpayne@69 3661
jpayne@69 3662 cdef class BaseIndex(object):
jpayne@69 3663 def __init__(self):
jpayne@69 3664 self.refs = ()
jpayne@69 3665 self.remap = {}
jpayne@69 3666
jpayne@69 3667 def __len__(self):
jpayne@69 3668 return len(self.refs)
jpayne@69 3669
jpayne@69 3670 def __bool__(self):
jpayne@69 3671 return len(self.refs) != 0
jpayne@69 3672
jpayne@69 3673 def __getitem__(self, key):
jpayne@69 3674 if isinstance(key, int):
jpayne@69 3675 return self.refs[key]
jpayne@69 3676 else:
jpayne@69 3677 return self.refmap[key]
jpayne@69 3678
jpayne@69 3679 def __iter__(self):
jpayne@69 3680 return iter(self.refs)
jpayne@69 3681
jpayne@69 3682 def get(self, key, default=None):
jpayne@69 3683 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@69 3684 try:
jpayne@69 3685 return self[key]
jpayne@69 3686 except KeyError:
jpayne@69 3687 return default
jpayne@69 3688
jpayne@69 3689 def __contains__(self, key):
jpayne@69 3690 try:
jpayne@69 3691 self[key]
jpayne@69 3692 except KeyError:
jpayne@69 3693 return False
jpayne@69 3694 else:
jpayne@69 3695 return True
jpayne@69 3696
jpayne@69 3697 def iterkeys(self):
jpayne@69 3698 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@69 3699 return iter(self)
jpayne@69 3700
jpayne@69 3701 def itervalues(self):
jpayne@69 3702 """D.itervalues() -> an iterator over the values of D"""
jpayne@69 3703 for key in self:
jpayne@69 3704 yield self[key]
jpayne@69 3705
jpayne@69 3706 def iteritems(self):
jpayne@69 3707 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@69 3708 for key in self:
jpayne@69 3709 yield (key, self[key])
jpayne@69 3710
jpayne@69 3711 def keys(self):
jpayne@69 3712 """D.keys() -> list of D's keys"""
jpayne@69 3713 return list(self)
jpayne@69 3714
jpayne@69 3715 def items(self):
jpayne@69 3716 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@69 3717 return list(self.iteritems())
jpayne@69 3718
jpayne@69 3719 def values(self):
jpayne@69 3720 """D.values() -> list of D's values"""
jpayne@69 3721 return list(self.itervalues())
jpayne@69 3722
jpayne@69 3723 def update(self, items=None, **kwargs):
jpayne@69 3724 """D.update([E, ]**F) -> None.
jpayne@69 3725
jpayne@69 3726 Update D from dict/iterable E and F.
jpayne@69 3727 """
jpayne@69 3728 for k, v in items.items():
jpayne@69 3729 self[k] = v
jpayne@69 3730
jpayne@69 3731 if kwargs:
jpayne@69 3732 for k, v in kwargs.items():
jpayne@69 3733 self[k] = v
jpayne@69 3734
jpayne@69 3735 def pop(self, key, default=_nothing):
jpayne@69 3736 try:
jpayne@69 3737 value = self[key]
jpayne@69 3738 del self[key]
jpayne@69 3739 return value
jpayne@69 3740 except KeyError:
jpayne@69 3741 if default is not _nothing:
jpayne@69 3742 return default
jpayne@69 3743 raise
jpayne@69 3744
jpayne@69 3745 # Mappings are not hashable by default, but subclasses can change this
jpayne@69 3746 __hash__ = None
jpayne@69 3747
jpayne@69 3748 #TODO: implement __richcmp__
jpayne@69 3749
jpayne@69 3750
jpayne@69 3751 cdef class BCFIndex(object):
jpayne@69 3752 """CSI index data structure for BCF files"""
jpayne@69 3753 def __init__(self):
jpayne@69 3754 self.refs = ()
jpayne@69 3755 self.refmap = {}
jpayne@69 3756
jpayne@69 3757 if not self.ptr:
jpayne@69 3758 raise ValueError('Invalid index object')
jpayne@69 3759
jpayne@69 3760 cdef int n
jpayne@69 3761 cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
jpayne@69 3762
jpayne@69 3763 self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
jpayne@69 3764 self.refmap = { r:i for i,r in enumerate(self.refs) }
jpayne@69 3765
jpayne@69 3766 def __dealloc__(self):
jpayne@69 3767 if self.ptr:
jpayne@69 3768 hts_idx_destroy(self.ptr)
jpayne@69 3769 self.ptr = NULL
jpayne@69 3770
jpayne@69 3771 def fetch(self, bcf, contig, start, stop, reopen):
jpayne@69 3772 return BCFIterator(bcf, contig, start, stop, reopen)
jpayne@69 3773
jpayne@69 3774
jpayne@69 3775 cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
jpayne@69 3776 if not idx:
jpayne@69 3777 return None
jpayne@69 3778
jpayne@69 3779 if not header:
jpayne@69 3780 raise ValueError('invalid VariantHeader')
jpayne@69 3781
jpayne@69 3782 cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
jpayne@69 3783 index.header = header
jpayne@69 3784 index.ptr = idx
jpayne@69 3785 index.__init__()
jpayne@69 3786
jpayne@69 3787 return index
jpayne@69 3788
jpayne@69 3789
jpayne@69 3790 cdef class TabixIndex(BaseIndex):
jpayne@69 3791 """Tabix index data structure for VCF files"""
jpayne@69 3792 def __init__(self):
jpayne@69 3793 self.refs = ()
jpayne@69 3794 self.refmap = {}
jpayne@69 3795
jpayne@69 3796 if not self.ptr:
jpayne@69 3797 raise ValueError('Invalid index object')
jpayne@69 3798
jpayne@69 3799 cdef int n
jpayne@69 3800 cdef const char **refs = tbx_seqnames(self.ptr, &n)
jpayne@69 3801
jpayne@69 3802 self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
jpayne@69 3803 self.refmap = { r:i for i,r in enumerate(self.refs) }
jpayne@69 3804
jpayne@69 3805 def __dealloc__(self):
jpayne@69 3806 if self.ptr:
jpayne@69 3807 tbx_destroy(self.ptr)
jpayne@69 3808 self.ptr = NULL
jpayne@69 3809
jpayne@69 3810 def fetch(self, bcf, contig, start, stop, reopen):
jpayne@69 3811 return TabixIterator(bcf, contig, start, stop, reopen)
jpayne@69 3812
jpayne@69 3813
jpayne@69 3814 cdef TabixIndex makeTabixIndex(tbx_t *idx):
jpayne@69 3815 if not idx:
jpayne@69 3816 return None
jpayne@69 3817
jpayne@69 3818 cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
jpayne@69 3819 index.ptr = idx
jpayne@69 3820 index.__init__()
jpayne@69 3821
jpayne@69 3822 return index
jpayne@69 3823
jpayne@69 3824
jpayne@69 3825 ########################################################################
jpayne@69 3826 ########################################################################
jpayne@69 3827 ## Iterators
jpayne@69 3828 ########################################################################
jpayne@69 3829
jpayne@69 3830
jpayne@69 3831 cdef class BaseIterator(object):
jpayne@69 3832 pass
jpayne@69 3833
jpayne@69 3834
jpayne@69 3835 # Internal function to clean up after iteration stop or failure.
jpayne@69 3836 # This would be a nested function if it weren't a cdef function.
jpayne@69 3837 cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
jpayne@69 3838 bcf_destroy1(record)
jpayne@69 3839
jpayne@69 3840 # destroy iter so future calls to __next__ raise StopIteration
jpayne@69 3841 bcf_itr_destroy(self.iter)
jpayne@69 3842 self.iter = NULL
jpayne@69 3843
jpayne@69 3844
jpayne@69 3845 cdef class BCFIterator(BaseIterator):
jpayne@69 3846 def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
jpayne@69 3847 if bcf is None:
jpayne@69 3848 raise ValueError('bcf must not be None')
jpayne@69 3849
jpayne@69 3850 if contig is None:
jpayne@69 3851 raise ValueError('contig must be specified')
jpayne@69 3852
jpayne@69 3853 if not isinstance(bcf.index, BCFIndex):
jpayne@69 3854 raise ValueError('bcf index required')
jpayne@69 3855
jpayne@69 3856 cdef BCFIndex index = bcf.index
jpayne@69 3857
jpayne@69 3858 self.bcf = bcf
jpayne@69 3859 self.index = index
jpayne@69 3860
jpayne@69 3861 cdef int rid, cstart, cstop
jpayne@69 3862
jpayne@69 3863 try:
jpayne@69 3864 rid = index.refmap[contig]
jpayne@69 3865 except KeyError:
jpayne@69 3866 # A query for a non-existent contig yields an empty iterator, does not raise an error
jpayne@69 3867 self.iter = NULL
jpayne@69 3868 return
jpayne@69 3869
jpayne@69 3870 if reopen:
jpayne@69 3871 self.bcf = self.bcf.copy()
jpayne@69 3872
jpayne@69 3873 cstart = start if start is not None else 0
jpayne@69 3874 cstop = stop if stop is not None else MAX_POS
jpayne@69 3875
jpayne@69 3876 with nogil:
jpayne@69 3877 self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
jpayne@69 3878
jpayne@69 3879 if not self.iter:
jpayne@69 3880 if errno:
jpayne@69 3881 raise IOError(errno, strerror(errno))
jpayne@69 3882 else:
jpayne@69 3883 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
jpayne@69 3884
jpayne@69 3885 def __dealloc__(self):
jpayne@69 3886 if self.iter:
jpayne@69 3887 bcf_itr_destroy(self.iter)
jpayne@69 3888 self.iter = NULL
jpayne@69 3889
jpayne@69 3890 def __iter__(self):
jpayne@69 3891 return self
jpayne@69 3892
jpayne@69 3893 def __next__(self):
jpayne@69 3894 if not self.iter:
jpayne@69 3895 raise StopIteration
jpayne@69 3896
jpayne@69 3897 cdef bcf1_t *record = bcf_init1()
jpayne@69 3898
jpayne@69 3899 if not record:
jpayne@69 3900 raise MemoryError('unable to allocate BCF record')
jpayne@69 3901
jpayne@69 3902 record.pos = -1
jpayne@69 3903 if self.bcf.drop_samples:
jpayne@69 3904 record.max_unpack = BCF_UN_SHR
jpayne@69 3905
jpayne@69 3906 cdef int ret
jpayne@69 3907
jpayne@69 3908 with nogil:
jpayne@69 3909 ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
jpayne@69 3910
jpayne@69 3911 if ret < 0:
jpayne@69 3912 _stop_BCFIterator(self, record)
jpayne@69 3913 if ret == -1:
jpayne@69 3914 raise StopIteration
jpayne@69 3915 elif ret == -2:
jpayne@69 3916 raise IOError('truncated file')
jpayne@69 3917 elif errno:
jpayne@69 3918 raise IOError(errno, strerror(errno))
jpayne@69 3919 else:
jpayne@69 3920 raise IOError('unable to fetch next record')
jpayne@69 3921
jpayne@69 3922 ret = bcf_subset_format(self.bcf.header.ptr, record)
jpayne@69 3923
jpayne@69 3924 if ret < 0:
jpayne@69 3925 _stop_BCFIterator(self, record)
jpayne@69 3926 raise ValueError('error in bcf_subset_format')
jpayne@69 3927
jpayne@69 3928 return makeVariantRecord(self.bcf.header, record)
jpayne@69 3929
jpayne@69 3930
jpayne@69 3931 cdef class TabixIterator(BaseIterator):
jpayne@69 3932 def __cinit__(self, *args, **kwargs):
jpayne@69 3933 self.line_buffer.l = 0
jpayne@69 3934 self.line_buffer.m = 0
jpayne@69 3935 self.line_buffer.s = NULL
jpayne@69 3936
jpayne@69 3937 def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
jpayne@69 3938 if bcf is None:
jpayne@69 3939 raise ValueError('bcf must not be None')
jpayne@69 3940
jpayne@69 3941 if not isinstance(bcf.index, TabixIndex):
jpayne@69 3942 raise ValueError('tabix index required')
jpayne@69 3943
jpayne@69 3944 cdef TabixIndex index = bcf.index
jpayne@69 3945
jpayne@69 3946 self.bcf = bcf
jpayne@69 3947 self.index = index
jpayne@69 3948
jpayne@69 3949 cdef int rid, cstart, cstop
jpayne@69 3950
jpayne@69 3951 try:
jpayne@69 3952 rid = index.refmap[contig]
jpayne@69 3953 except KeyError:
jpayne@69 3954 # A query for a non-existent contig yields an empty iterator, does not raise an error
jpayne@69 3955 self.iter = NULL
jpayne@69 3956 return
jpayne@69 3957
jpayne@69 3958 if reopen:
jpayne@69 3959 self.bcf = self.bcf.copy()
jpayne@69 3960
jpayne@69 3961 cstart = start if start is not None else 0
jpayne@69 3962 cstop = stop if stop is not None else MAX_POS
jpayne@69 3963
jpayne@69 3964 self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
jpayne@69 3965
jpayne@69 3966 if not self.iter:
jpayne@69 3967 if errno:
jpayne@69 3968 raise IOError(errno, strerror(errno))
jpayne@69 3969 else:
jpayne@69 3970 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
jpayne@69 3971
jpayne@69 3972 def __dealloc__(self):
jpayne@69 3973 if self.iter:
jpayne@69 3974 tbx_itr_destroy(self.iter)
jpayne@69 3975 self.iter = NULL
jpayne@69 3976
jpayne@69 3977 if self.line_buffer.m:
jpayne@69 3978 free(self.line_buffer.s)
jpayne@69 3979
jpayne@69 3980 self.line_buffer.l = 0
jpayne@69 3981 self.line_buffer.m = 0
jpayne@69 3982 self.line_buffer.s = NULL
jpayne@69 3983
jpayne@69 3984 def __iter__(self):
jpayne@69 3985 return self
jpayne@69 3986
jpayne@69 3987 def __next__(self):
jpayne@69 3988 if not self.iter:
jpayne@69 3989 raise StopIteration
jpayne@69 3990
jpayne@69 3991 cdef int ret
jpayne@69 3992
jpayne@69 3993 with nogil:
jpayne@69 3994 ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
jpayne@69 3995
jpayne@69 3996 if ret < 0:
jpayne@69 3997 tbx_itr_destroy(self.iter)
jpayne@69 3998 self.iter = NULL
jpayne@69 3999 if ret == -1:
jpayne@69 4000 raise StopIteration
jpayne@69 4001 elif ret == -2:
jpayne@69 4002 raise IOError('truncated file')
jpayne@69 4003 elif errno:
jpayne@69 4004 raise IOError(errno, strerror(errno))
jpayne@69 4005 else:
jpayne@69 4006 raise IOError('unable to fetch next record')
jpayne@69 4007
jpayne@69 4008 cdef bcf1_t *record = bcf_init1()
jpayne@69 4009
jpayne@69 4010 if not record:
jpayne@69 4011 raise MemoryError('unable to allocate BCF record')
jpayne@69 4012
jpayne@69 4013 record.pos = -1
jpayne@69 4014 if self.bcf.drop_samples:
jpayne@69 4015 record.max_unpack = BCF_UN_SHR
jpayne@69 4016
jpayne@69 4017 ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
jpayne@69 4018
jpayne@69 4019 # FIXME: stop iteration on parse failure?
jpayne@69 4020 if ret < 0:
jpayne@69 4021 bcf_destroy1(record)
jpayne@69 4022 raise ValueError('error in vcf_parse')
jpayne@69 4023
jpayne@69 4024 return makeVariantRecord(self.bcf.header, record)
jpayne@69 4025
jpayne@69 4026
jpayne@69 4027 ########################################################################
jpayne@69 4028 ########################################################################
jpayne@69 4029 ## Variant File
jpayne@69 4030 ########################################################################
jpayne@69 4031
jpayne@69 4032
jpayne@69 4033 cdef class VariantFile(HTSFile):
jpayne@69 4034 """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
jpayne@69 4035 duplicate_filehandle=True, ignore_truncation=False, threads=1)*
jpayne@69 4036
jpayne@69 4037 A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
jpayne@69 4038 opened.
jpayne@69 4039
jpayne@69 4040 If an index for a variant file exists (.csi or .tbi), it will be
jpayne@69 4041 opened automatically. Without an index random access to records
jpayne@69 4042 via :meth:`fetch` is disabled.
jpayne@69 4043
jpayne@69 4044 For writing, a :class:`VariantHeader` object must be provided,
jpayne@69 4045 typically obtained from another :term:`VCF` file/:term:`BCF`
jpayne@69 4046 file.
jpayne@69 4047
jpayne@69 4048 Parameters
jpayne@69 4049 ----------
jpayne@69 4050 mode : string
jpayne@69 4051 *mode* should be ``r`` for reading or ``w`` for writing. The default is
jpayne@69 4052 text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
jpayne@69 4053 ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
jpayne@69 4054
jpayne@69 4055 If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
jpayne@69 4056 modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
jpayne@69 4057 For instance, to open a :term:`BCF` formatted file for reading, type::
jpayne@69 4058
jpayne@69 4059 f = pysam.VariantFile('ex1.bcf','r')
jpayne@69 4060
jpayne@69 4061 If mode is not specified, we will try to auto-detect the file type. All
jpayne@69 4062 of the following should work::
jpayne@69 4063
jpayne@69 4064 f1 = pysam.VariantFile('ex1.bcf')
jpayne@69 4065 f2 = pysam.VariantFile('ex1.vcf')
jpayne@69 4066 f3 = pysam.VariantFile('ex1.vcf.gz')
jpayne@69 4067
jpayne@69 4068 index_filename : string
jpayne@69 4069 Explicit path to an index file.
jpayne@69 4070
jpayne@69 4071 header : VariantHeader
jpayne@69 4072 :class:`VariantHeader` object required for writing.
jpayne@69 4073
jpayne@69 4074 drop_samples: bool
jpayne@69 4075 Ignore sample information when reading.
jpayne@69 4076
jpayne@69 4077 duplicate_filehandle: bool
jpayne@69 4078 By default, file handles passed either directly or through
jpayne@69 4079 File-like objects will be duplicated before passing them to
jpayne@69 4080 htslib. The duplication prevents issues where the same stream
jpayne@69 4081 will be closed by htslib and through destruction of the
jpayne@69 4082 high-level python object. Set to False to turn off
jpayne@69 4083 duplication.
jpayne@69 4084
jpayne@69 4085 ignore_truncation: bool
jpayne@69 4086 Issue a warning, instead of raising an error if the current file
jpayne@69 4087 appears to be truncated due to a missing EOF marker. Only applies
jpayne@69 4088 to bgzipped formats. (Default=False)
jpayne@69 4089
jpayne@69 4090 threads: integer
jpayne@69 4091 Number of threads to use for compressing/decompressing VCF/BCF files.
jpayne@69 4092 Setting threads to > 1 cannot be combined with `ignore_truncation`.
jpayne@69 4093 (Default=1)
jpayne@69 4094
jpayne@69 4095 """
jpayne@69 4096 def __cinit__(self, *args, **kwargs):
jpayne@69 4097 self.htsfile = NULL
jpayne@69 4098
jpayne@69 4099 def __init__(self, *args, **kwargs):
jpayne@69 4100 self.header = None
jpayne@69 4101 self.index = None
jpayne@69 4102 self.filename = None
jpayne@69 4103 self.mode = None
jpayne@69 4104 self.threads = 1
jpayne@69 4105 self.index_filename = None
jpayne@69 4106 self.is_stream = False
jpayne@69 4107 self.is_remote = False
jpayne@69 4108 self.is_reading = False
jpayne@69 4109 self.drop_samples = False
jpayne@69 4110 self.header_written = False
jpayne@69 4111 self.start_offset = -1
jpayne@69 4112
jpayne@69 4113 self.open(*args, **kwargs)
jpayne@69 4114
jpayne@69 4115 def __dealloc__(self):
jpayne@69 4116 if not self.htsfile or not self.header:
jpayne@69 4117 return
jpayne@69 4118
jpayne@69 4119 # Write header if no records were written
jpayne@69 4120 if self.htsfile.is_write and not self.header_written:
jpayne@69 4121 with nogil:
jpayne@69 4122 bcf_hdr_write(self.htsfile, self.header.ptr)
jpayne@69 4123
jpayne@69 4124 cdef int ret = hts_close(self.htsfile)
jpayne@69 4125 self.htsfile = NULL
jpayne@69 4126 self.header = self.index = None
jpayne@69 4127
jpayne@69 4128 if ret < 0:
jpayne@69 4129 global errno
jpayne@69 4130 if errno == EPIPE:
jpayne@69 4131 errno = 0
jpayne@69 4132 else:
jpayne@69 4133 raise IOError(errno, force_str(strerror(errno)))
jpayne@69 4134
jpayne@69 4135 def close(self):
jpayne@69 4136 """closes the :class:`pysam.VariantFile`."""
jpayne@69 4137 if not self.htsfile:
jpayne@69 4138 return
jpayne@69 4139
jpayne@69 4140 # Write header if no records were written
jpayne@69 4141 if self.htsfile.is_write and not self.header_written:
jpayne@69 4142 with nogil:
jpayne@69 4143 bcf_hdr_write(self.htsfile, self.header.ptr)
jpayne@69 4144
jpayne@69 4145 cdef int ret = hts_close(self.htsfile)
jpayne@69 4146 self.htsfile = NULL
jpayne@69 4147 self.header = self.index = None
jpayne@69 4148
jpayne@69 4149 if ret < 0:
jpayne@69 4150 global errno
jpayne@69 4151 if errno == EPIPE:
jpayne@69 4152 errno = 0
jpayne@69 4153 else:
jpayne@69 4154 raise IOError(errno, force_str(strerror(errno)))
jpayne@69 4155
jpayne@69 4156 def __iter__(self):
jpayne@69 4157 if not self.is_open:
jpayne@69 4158 raise ValueError('I/O operation on closed file')
jpayne@69 4159
jpayne@69 4160 if self.htsfile.is_write:
jpayne@69 4161 raise ValueError('cannot iterate over Variantfile opened for writing')
jpayne@69 4162
jpayne@69 4163 self.is_reading = 1
jpayne@69 4164 return self
jpayne@69 4165
jpayne@69 4166 def __next__(self):
jpayne@69 4167 cdef int ret
jpayne@69 4168 cdef int errcode
jpayne@69 4169 cdef bcf1_t *record = bcf_init1()
jpayne@69 4170
jpayne@69 4171 if not record:
jpayne@69 4172 raise MemoryError('unable to allocate BCF record')
jpayne@69 4173
jpayne@69 4174 record.pos = -1
jpayne@69 4175 if self.drop_samples:
jpayne@69 4176 record.max_unpack = BCF_UN_SHR
jpayne@69 4177
jpayne@69 4178 with nogil:
jpayne@69 4179 ret = bcf_read1(self.htsfile, self.header.ptr, record)
jpayne@69 4180
jpayne@69 4181 if ret < 0:
jpayne@69 4182 errcode = record.errcode
jpayne@69 4183 bcf_destroy1(record)
jpayne@69 4184 if errcode:
jpayne@69 4185 raise IOError('unable to parse next record')
jpayne@69 4186 if ret == -1:
jpayne@69 4187 raise StopIteration
jpayne@69 4188 elif ret == -2:
jpayne@69 4189 raise IOError('truncated file')
jpayne@69 4190 elif errno:
jpayne@69 4191 raise IOError(errno, strerror(errno))
jpayne@69 4192 else:
jpayne@69 4193 raise IOError('unable to fetch next record')
jpayne@69 4194
jpayne@69 4195 return makeVariantRecord(self.header, record)
jpayne@69 4196
jpayne@69 4197 def copy(self):
jpayne@69 4198 if not self.is_open:
jpayne@69 4199 raise ValueError
jpayne@69 4200
jpayne@69 4201 cdef VariantFile vars = VariantFile.__new__(VariantFile)
jpayne@69 4202 cdef bcf_hdr_t *hdr
jpayne@69 4203
jpayne@69 4204 # FIXME: re-open using fd or else header and index could be invalid
jpayne@69 4205 vars.htsfile = self._open_htsfile()
jpayne@69 4206
jpayne@69 4207 if not vars.htsfile:
jpayne@69 4208 raise ValueError('Cannot re-open htsfile')
jpayne@69 4209
jpayne@69 4210 # minimize overhead by re-using header and index. This approach is
jpayne@69 4211 # currently risky, but see above for how this can be mitigated.
jpayne@69 4212 vars.header = self.header
jpayne@69 4213 vars.index = self.index
jpayne@69 4214
jpayne@69 4215 vars.filename = self.filename
jpayne@69 4216 vars.mode = self.mode
jpayne@69 4217 vars.threads = self.threads
jpayne@69 4218 vars.index_filename = self.index_filename
jpayne@69 4219 vars.drop_samples = self.drop_samples
jpayne@69 4220 vars.is_stream = self.is_stream
jpayne@69 4221 vars.is_remote = self.is_remote
jpayne@69 4222 vars.is_reading = self.is_reading
jpayne@69 4223 vars.start_offset = self.start_offset
jpayne@69 4224 vars.header_written = self.header_written
jpayne@69 4225
jpayne@69 4226 if self.htsfile.is_bin:
jpayne@69 4227 vars.seek(self.tell())
jpayne@69 4228 else:
jpayne@69 4229 with nogil:
jpayne@69 4230 hdr = bcf_hdr_read(vars.htsfile)
jpayne@69 4231 makeVariantHeader(hdr)
jpayne@69 4232
jpayne@69 4233 return vars
jpayne@69 4234
jpayne@69 4235 def open(self, filename, mode='r',
jpayne@69 4236 index_filename=None,
jpayne@69 4237 VariantHeader header=None,
jpayne@69 4238 drop_samples=False,
jpayne@69 4239 duplicate_filehandle=True,
jpayne@69 4240 ignore_truncation=False,
jpayne@69 4241 threads=1):
jpayne@69 4242 """open a vcf/bcf file.
jpayne@69 4243
jpayne@69 4244 If open is called on an existing VariantFile, the current file will be
jpayne@69 4245 closed and a new file will be opened.
jpayne@69 4246 """
jpayne@69 4247 cdef bcf_hdr_t *hdr
jpayne@69 4248 cdef BGZF *bgzfp
jpayne@69 4249 cdef hts_idx_t *idx
jpayne@69 4250 cdef tbx_t *tidx
jpayne@69 4251 cdef char *cfilename
jpayne@69 4252 cdef char *cindex_filename = NULL
jpayne@69 4253 cdef char *cmode
jpayne@69 4254
jpayne@69 4255 if threads > 1 and ignore_truncation:
jpayne@69 4256 # This won't raise errors if reaching a truncated alignment,
jpayne@69 4257 # because bgzf_mt_reader in htslib does not deal with
jpayne@69 4258 # bgzf_mt_read_block returning non-zero values, contrary
jpayne@69 4259 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
jpayne@69 4260 # Better to avoid this (for now) than to produce seemingly correct results.
jpayne@69 4261 raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
jpayne@69 4262 self.threads = threads
jpayne@69 4263
jpayne@69 4264 # close a previously opened file
jpayne@69 4265 if self.is_open:
jpayne@69 4266 self.close()
jpayne@69 4267
jpayne@69 4268 if not mode or mode[0] not in 'rwa':
jpayne@69 4269 raise ValueError('mode must begin with r, w or a')
jpayne@69 4270
jpayne@69 4271 self.duplicate_filehandle = duplicate_filehandle
jpayne@69 4272
jpayne@69 4273 format_modes = [m for m in mode[1:] if m in 'bcguz']
jpayne@69 4274 if len(format_modes) > 1:
jpayne@69 4275 raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
jpayne@69 4276
jpayne@69 4277 invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
jpayne@69 4278 if invalid_modes:
jpayne@69 4279 raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
jpayne@69 4280
jpayne@69 4281 # Autodetect mode from filename
jpayne@69 4282 if mode == 'w' and isinstance(filename, str):
jpayne@69 4283 if filename.endswith('.gz'):
jpayne@69 4284 mode = 'wz'
jpayne@69 4285 elif filename.endswith('.bcf'):
jpayne@69 4286 mode = 'wb'
jpayne@69 4287
jpayne@69 4288 # for htslib, wbu seems to not work
jpayne@69 4289 if mode == 'wbu':
jpayne@69 4290 mode = 'wb0'
jpayne@69 4291
jpayne@69 4292 self.mode = mode = force_bytes(mode)
jpayne@69 4293 try:
jpayne@69 4294 filename = encode_filename(filename)
jpayne@69 4295 self.is_remote = hisremote(filename)
jpayne@69 4296 self.is_stream = filename == b'-'
jpayne@69 4297 except TypeError:
jpayne@69 4298 filename = filename
jpayne@69 4299 self.is_remote = False
jpayne@69 4300 self.is_stream = True
jpayne@69 4301
jpayne@69 4302 self.filename = filename
jpayne@69 4303
jpayne@69 4304 if index_filename is not None:
jpayne@69 4305 self.index_filename = index_filename = encode_filename(index_filename)
jpayne@69 4306 else:
jpayne@69 4307 self.index_filename = None
jpayne@69 4308
jpayne@69 4309 self.drop_samples = bool(drop_samples)
jpayne@69 4310 self.header = None
jpayne@69 4311
jpayne@69 4312 self.header_written = False
jpayne@69 4313
jpayne@69 4314 if mode.startswith(b'w'):
jpayne@69 4315 # open file for writing
jpayne@69 4316 if index_filename is not None:
jpayne@69 4317 raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
jpayne@69 4318
jpayne@69 4319 # header structure (used for writing)
jpayne@69 4320 if header:
jpayne@69 4321 self.header = header.copy()
jpayne@69 4322 else:
jpayne@69 4323 self.header = VariantHeader()
jpayne@69 4324 #raise ValueError('a VariantHeader must be specified')
jpayne@69 4325
jpayne@69 4326 # Header is not written until the first write or on close
jpayne@69 4327 self.htsfile = self._open_htsfile()
jpayne@69 4328
jpayne@69 4329 if not self.htsfile:
jpayne@69 4330 raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
jpayne@69 4331
jpayne@69 4332 elif mode.startswith(b'r'):
jpayne@69 4333 # open file for reading
jpayne@69 4334 self.htsfile = self._open_htsfile()
jpayne@69 4335
jpayne@69 4336 if not self.htsfile:
jpayne@69 4337 if errno:
jpayne@69 4338 raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno))))
jpayne@69 4339 else:
jpayne@69 4340 raise ValueError('could not open variant file `{}`'.format(filename))
jpayne@69 4341
jpayne@69 4342 if self.htsfile.format.format not in (bcf, vcf):
jpayne@69 4343 raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
jpayne@69 4344
jpayne@69 4345 self.check_truncation(ignore_truncation)
jpayne@69 4346
jpayne@69 4347 with nogil:
jpayne@69 4348 hdr = bcf_hdr_read(self.htsfile)
jpayne@69 4349
jpayne@69 4350 try:
jpayne@69 4351 self.header = makeVariantHeader(hdr)
jpayne@69 4352 except ValueError:
jpayne@69 4353 raise ValueError('file `{}` does not have valid header (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
jpayne@69 4354
jpayne@69 4355 if isinstance(self.filename, bytes):
jpayne@69 4356 cfilename = self.filename
jpayne@69 4357 else:
jpayne@69 4358 cfilename = NULL
jpayne@69 4359
jpayne@69 4360 # check for index and open if present
jpayne@69 4361 if self.htsfile.format.format == bcf and cfilename:
jpayne@69 4362 if index_filename is not None:
jpayne@69 4363 cindex_filename = index_filename
jpayne@69 4364 with nogil:
jpayne@69 4365 idx = bcf_index_load2(cfilename, cindex_filename)
jpayne@69 4366 self.index = makeBCFIndex(self.header, idx)
jpayne@69 4367
jpayne@69 4368 elif self.htsfile.format.compression == bgzf and cfilename:
jpayne@69 4369 if index_filename is not None:
jpayne@69 4370 cindex_filename = index_filename
jpayne@69 4371 with nogil:
jpayne@69 4372 tidx = tbx_index_load2(cfilename, cindex_filename)
jpayne@69 4373 self.index = makeTabixIndex(tidx)
jpayne@69 4374
jpayne@69 4375 if not self.is_stream:
jpayne@69 4376 self.start_offset = self.tell()
jpayne@69 4377 else:
jpayne@69 4378 raise ValueError('unknown mode {}'.format(mode))
jpayne@69 4379
jpayne@69 4380 def reset(self):
jpayne@69 4381 """reset file position to beginning of file just after the header."""
jpayne@69 4382 return self.seek(self.start_offset)
jpayne@69 4383
jpayne@69 4384 def is_valid_tid(self, tid):
jpayne@69 4385 """
jpayne@69 4386 return True if the numerical :term:`tid` is valid; False otherwise.
jpayne@69 4387
jpayne@69 4388 returns -1 if reference is not known.
jpayne@69 4389 """
jpayne@69 4390 if not self.is_open:
jpayne@69 4391 raise ValueError('I/O operation on closed file')
jpayne@69 4392
jpayne@69 4393 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 4394 cdef int rid = tid
jpayne@69 4395 return 0 <= rid < hdr.n[BCF_DT_CTG]
jpayne@69 4396
jpayne@69 4397 def get_tid(self, reference):
jpayne@69 4398 """
jpayne@69 4399 return the numerical :term:`tid` corresponding to
jpayne@69 4400 :term:`reference`
jpayne@69 4401
jpayne@69 4402 returns -1 if reference is not known.
jpayne@69 4403 """
jpayne@69 4404 if not self.is_open:
jpayne@69 4405 raise ValueError('I/O operation on closed file')
jpayne@69 4406
jpayne@69 4407 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
jpayne@69 4408 reference = force_bytes(reference)
jpayne@69 4409 cdef khint_t k = kh_get_vdict(d, reference)
jpayne@69 4410 return kh_val_vdict(d, k).id if k != kh_end(d) else -1
jpayne@69 4411
jpayne@69 4412 def get_reference_name(self, tid):
jpayne@69 4413 """
jpayne@69 4414 return :term:`reference` name corresponding to numerical :term:`tid`
jpayne@69 4415 """
jpayne@69 4416 if not self.is_open:
jpayne@69 4417 raise ValueError('I/O operation on closed file')
jpayne@69 4418
jpayne@69 4419 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@69 4420 cdef int rid = tid
jpayne@69 4421 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
jpayne@69 4422 raise ValueError('Invalid tid')
jpayne@69 4423 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
jpayne@69 4424
jpayne@69 4425 def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
jpayne@69 4426 """fetch records in a :term:`region`, specified either by
jpayne@69 4427 :term:`contig`, *start*, and *end* (which are 0-based, half-open);
jpayne@69 4428 or alternatively by a samtools :term:`region` string (which is
jpayne@69 4429 1-based inclusive).
jpayne@69 4430
jpayne@69 4431 Without *contig* or *region* all mapped records will be fetched. The
jpayne@69 4432 records will be returned ordered by contig, which will not necessarily
jpayne@69 4433 be the order within the file.
jpayne@69 4434
jpayne@69 4435 Set *reopen* to true if you will be using multiple iterators on the
jpayne@69 4436 same file at the same time. The iterator returned will receive its
jpayne@69 4437 own copy of a filehandle to the file effectively re-opening the
jpayne@69 4438 file. Re-opening a file incurrs some overhead, so use with care.
jpayne@69 4439
jpayne@69 4440 If only *contig* is set, all records on *contig* will be fetched.
jpayne@69 4441 If both *region* and *contig* are given, an exception is raised.
jpayne@69 4442
jpayne@69 4443 Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
jpayne@69 4444 (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
jpayne@69 4445 read sequentially.
jpayne@69 4446 """
jpayne@69 4447 if not self.is_open:
jpayne@69 4448 raise ValueError('I/O operation on closed file')
jpayne@69 4449
jpayne@69 4450 if self.htsfile.is_write:
jpayne@69 4451 raise ValueError('cannot fetch from Variantfile opened for writing')
jpayne@69 4452
jpayne@69 4453 if contig is None and region is None:
jpayne@69 4454 self.is_reading = 1
jpayne@69 4455 bcf = self.copy() if reopen else self
jpayne@69 4456 bcf.seek(self.start_offset)
jpayne@69 4457 return iter(bcf)
jpayne@69 4458
jpayne@69 4459 if self.index is None:
jpayne@69 4460 raise ValueError('fetch requires an index')
jpayne@69 4461
jpayne@69 4462 _, tid, start, stop = self.parse_region(contig, start, stop, region,
jpayne@69 4463 None, end=end, reference=reference)
jpayne@69 4464
jpayne@69 4465 if contig is None:
jpayne@69 4466 contig = self.get_reference_name(tid)
jpayne@69 4467
jpayne@69 4468 self.is_reading = 1
jpayne@69 4469 return self.index.fetch(self, contig, start, stop, reopen)
jpayne@69 4470
jpayne@69 4471 def new_record(self, *args, **kwargs):
jpayne@69 4472 """Create a new empty :class:`VariantRecord`.
jpayne@69 4473
jpayne@69 4474 See :meth:`VariantHeader.new_record`
jpayne@69 4475 """
jpayne@69 4476 return self.header.new_record(*args, **kwargs)
jpayne@69 4477
jpayne@69 4478 cpdef int write(self, VariantRecord record) except -1:
jpayne@69 4479 """
jpayne@69 4480 write a single :class:`pysam.VariantRecord` to disk.
jpayne@69 4481
jpayne@69 4482 returns the number of bytes written.
jpayne@69 4483 """
jpayne@69 4484 if record is None:
jpayne@69 4485 raise ValueError('record must not be None')
jpayne@69 4486
jpayne@69 4487 if not self.is_open:
jpayne@69 4488 return ValueError('I/O operation on closed file')
jpayne@69 4489
jpayne@69 4490 if not self.htsfile.is_write:
jpayne@69 4491 raise ValueError('cannot write to a Variantfile opened for reading')
jpayne@69 4492
jpayne@69 4493 if not self.header_written:
jpayne@69 4494 self.header_written = True
jpayne@69 4495 with nogil:
jpayne@69 4496 bcf_hdr_write(self.htsfile, self.header.ptr)
jpayne@69 4497
jpayne@69 4498 #if record.header is not self.header:
jpayne@69 4499 # record.translate(self.header)
jpayne@69 4500 # raise ValueError('Writing records from a different VariantFile is not yet supported')
jpayne@69 4501
jpayne@69 4502 if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
jpayne@69 4503 msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
jpayne@69 4504 raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
jpayne@69 4505
jpayne@69 4506 # Sync END annotation before writing
jpayne@69 4507 bcf_sync_end(record)
jpayne@69 4508
jpayne@69 4509 cdef int ret
jpayne@69 4510
jpayne@69 4511 with nogil:
jpayne@69 4512 ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
jpayne@69 4513
jpayne@69 4514 if ret < 0:
jpayne@69 4515 raise IOError(errno, strerror(errno))
jpayne@69 4516
jpayne@69 4517 return ret
jpayne@69 4518
jpayne@69 4519 def subset_samples(self, include_samples):
jpayne@69 4520 """
jpayne@69 4521 Read only a subset of samples to reduce processing time and memory.
jpayne@69 4522 Must be called prior to retrieving records.
jpayne@69 4523 """
jpayne@69 4524 if not self.is_open:
jpayne@69 4525 raise ValueError('I/O operation on closed file')
jpayne@69 4526
jpayne@69 4527 if self.htsfile.is_write:
jpayne@69 4528 raise ValueError('cannot subset samples from Variantfile opened for writing')
jpayne@69 4529
jpayne@69 4530 if self.is_reading:
jpayne@69 4531 raise ValueError('cannot subset samples after fetching records')
jpayne@69 4532
jpayne@69 4533 self.header._subset_samples(include_samples)
jpayne@69 4534
jpayne@69 4535 # potentially unnecessary optimization that also sets max_unpack
jpayne@69 4536 if not include_samples:
jpayne@69 4537 self.drop_samples = True
jpayne@69 4538