annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcbcf.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # cython: language_level=3
jpayne@68 2 # cython: embedsignature=True
jpayne@68 3 # cython: profile=True
jpayne@68 4 ###############################################################################
jpayne@68 5 ###############################################################################
jpayne@68 6 ## Cython wrapper for htslib VCF/BCF reader/writer
jpayne@68 7 ###############################################################################
jpayne@68 8 #
jpayne@68 9 # NOTICE: This code is incomplete and preliminary. It offers a nearly
jpayne@68 10 # complete Pythonic interface to VCF/BCF metadata and data with
jpayne@68 11 # reading and writing capability. Documentation and a unit test suite
jpayne@68 12 # are in the works. The code is best tested under Python 2, but
jpayne@68 13 # should also work with Python 3. Please report any remaining
jpayne@68 14 # str/bytes issues on the github site when using Python 3 and I'll
jpayne@68 15 # fix them promptly.
jpayne@68 16 #
jpayne@68 17 # Here is a minimal example of how to use the API:
jpayne@68 18 #
jpayne@68 19 # $ cat bcfview.py
jpayne@68 20 # import sys
jpayne@68 21 # from pysam import VariantFile
jpayne@68 22 #
jpayne@68 23 # bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
jpayne@68 24 # bcf_out = VariantFile('-', 'w', header=bcf_in.header)
jpayne@68 25 #
jpayne@68 26 # for rec in bcf_in:
jpayne@68 27 # bcf_out.write(rec)
jpayne@68 28 #
jpayne@68 29 # Performance is fairly close to that of bcftools view. Here is an example
jpayne@68 30 # using some 1k Genomes data:
jpayne@68 31 #
jpayne@68 32 # $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
jpayne@68 33 # 1103799
jpayne@68 34 #
jpayne@68 35 # real 0m56.114s
jpayne@68 36 # user 1m4.489s
jpayne@68 37 # sys 0m3.102s
jpayne@68 38 #
jpayne@68 39 # $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
jpayne@68 40 # 1103800 # bcftools adds an extra header
jpayne@68 41 #
jpayne@68 42 # real 0m55.126s
jpayne@68 43 # user 1m3.502s
jpayne@68 44 # sys 0m3.459s
jpayne@68 45 #
jpayne@68 46 ###############################################################################
jpayne@68 47 #
jpayne@68 48 # TODO list:
jpayne@68 49 #
jpayne@68 50 # * more genotype methods
jpayne@68 51 # * unit test suite (perhaps py.test based)
jpayne@68 52 # * documentation
jpayne@68 53 # * pickle support
jpayne@68 54 # * left/right locus normalization
jpayne@68 55 # * fix reopen to re-use fd
jpayne@68 56 #
jpayne@68 57 ###############################################################################
jpayne@68 58 #
jpayne@68 59 # The MIT License
jpayne@68 60 #
jpayne@68 61 # Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com)
jpayne@68 62 #
jpayne@68 63 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@68 64 # copy of this software and associated documentation files (the "Software"),
jpayne@68 65 # to deal in the Software without restriction, including without limitation
jpayne@68 66 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@68 67 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@68 68 # Software is furnished to do so, subject to the following conditions:
jpayne@68 69 #
jpayne@68 70 # The above copyright notice and this permission notice shall be included in
jpayne@68 71 # all copies or substantial portions of the Software.
jpayne@68 72 #
jpayne@68 73 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@68 74 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@68 75 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@68 76 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@68 77 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@68 78 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@68 79 # DEALINGS IN THE SOFTWARE.
jpayne@68 80 #
jpayne@68 81 ###############################################################################
jpayne@68 82
jpayne@68 83 from __future__ import division, print_function
jpayne@68 84
jpayne@68 85 import os
jpayne@68 86 import sys
jpayne@68 87
jpayne@68 88 from libc.errno cimport errno, EPIPE
jpayne@68 89 from libc.string cimport strcmp, strpbrk, strerror
jpayne@68 90 from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
jpayne@68 91
jpayne@68 92 cimport cython
jpayne@68 93
jpayne@68 94 from cpython.object cimport PyObject
jpayne@68 95 from cpython.ref cimport Py_INCREF
jpayne@68 96 from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
jpayne@68 97 from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
jpayne@68 98 from cpython.bytes cimport PyBytes_FromStringAndSize
jpayne@68 99 from cpython.unicode cimport PyUnicode_DecodeUTF8
jpayne@68 100
jpayne@68 101 from pysam.libchtslib cimport HTSFile, hisremote
jpayne@68 102
jpayne@68 103 from pysam.utils import unquoted_str
jpayne@68 104
jpayne@68 105
jpayne@68 106 __all__ = ['VariantFile',
jpayne@68 107 'VariantHeader',
jpayne@68 108 'VariantHeaderRecord',
jpayne@68 109 'VariantHeaderRecords',
jpayne@68 110 'VariantMetadata',
jpayne@68 111 'VariantHeaderMetadata',
jpayne@68 112 'VariantContig',
jpayne@68 113 'VariantHeaderContigs',
jpayne@68 114 'VariantHeaderSamples',
jpayne@68 115 'VariantRecordFilter',
jpayne@68 116 'VariantRecordFormat',
jpayne@68 117 'VariantRecordInfo',
jpayne@68 118 'VariantRecordSamples',
jpayne@68 119 'VariantRecord',
jpayne@68 120 'VariantRecordSample',
jpayne@68 121 'BaseIndex',
jpayne@68 122 'BCFIndex',
jpayne@68 123 'TabixIndex',
jpayne@68 124 'BaseIterator',
jpayne@68 125 'BCFIterator',
jpayne@68 126 'TabixIterator',
jpayne@68 127 'VariantRecord']
jpayne@68 128
jpayne@68 129 ########################################################################
jpayne@68 130 ########################################################################
jpayne@68 131 ## Constants
jpayne@68 132 ########################################################################
jpayne@68 133
jpayne@68 134 cdef int MAX_POS = (1 << 31) - 1
jpayne@68 135 cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
jpayne@68 136 cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
jpayne@68 137 cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
jpayne@68 138
jpayne@68 139
jpayne@68 140 ########################################################################
jpayne@68 141 ########################################################################
jpayne@68 142 ## Python 3 compatibility functions
jpayne@68 143 ########################################################################
jpayne@68 144
jpayne@68 145 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
jpayne@68 146 from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes
jpayne@68 147
jpayne@68 148
jpayne@68 149 ########################################################################
jpayne@68 150 ########################################################################
jpayne@68 151 ## Sentinel object
jpayne@68 152 ########################################################################
jpayne@68 153
jpayne@68 154 cdef object _nothing = object()
jpayne@68 155
jpayne@68 156 ########################################################################
jpayne@68 157 ########################################################################
jpayne@68 158 ## VCF/BCF string intern system
jpayne@68 159 ########################################################################
jpayne@68 160
jpayne@68 161 cdef dict bcf_str_cache = {}
jpayne@68 162
jpayne@68 163 cdef inline bcf_str_cache_get_charptr(const char* s):
jpayne@68 164 if s == NULL:
jpayne@68 165 return None
jpayne@68 166
jpayne@68 167 cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
jpayne@68 168 if pystr:
jpayne@68 169 return <object>pystr
jpayne@68 170
jpayne@68 171 val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
jpayne@68 172
jpayne@68 173 PyDict_SetItemString(bcf_str_cache, s, val)
jpayne@68 174
jpayne@68 175 return val
jpayne@68 176
jpayne@68 177
jpayne@68 178 ########################################################################
jpayne@68 179 ########################################################################
jpayne@68 180 ## Genotype math
jpayne@68 181 ########################################################################
jpayne@68 182
jpayne@68 183 cdef int comb(int n, int k) except -1:
jpayne@68 184 """Return binomial coefficient: n choose k
jpayne@68 185
jpayne@68 186 >>> comb(5, 1)
jpayne@68 187 5
jpayne@68 188 >>> comb(5, 2)
jpayne@68 189 10
jpayne@68 190 >>> comb(2, 2)
jpayne@68 191 1
jpayne@68 192 >>> comb(100, 2)
jpayne@68 193 4950
jpayne@68 194 """
jpayne@68 195 if k > n:
jpayne@68 196 return 0
jpayne@68 197 elif k == n:
jpayne@68 198 return 1
jpayne@68 199 elif k > n // 2:
jpayne@68 200 k = n - k
jpayne@68 201
jpayne@68 202 cdef d, result
jpayne@68 203
jpayne@68 204 d = result = n - k + 1
jpayne@68 205 for i in range(2, k + 1):
jpayne@68 206 d += 1
jpayne@68 207 result *= d
jpayne@68 208 result //= i
jpayne@68 209 return result
jpayne@68 210
jpayne@68 211
jpayne@68 212 cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
jpayne@68 213 """Return the count of genotypes expected for the given ploidy and number of alleles.
jpayne@68 214
jpayne@68 215 >>> bcf_geno_combinations(1, 2)
jpayne@68 216 2
jpayne@68 217 >>> bcf_geno_combinations(2, 2)
jpayne@68 218 3
jpayne@68 219 >>> bcf_geno_combinations(2, 3)
jpayne@68 220 6
jpayne@68 221 >>> bcf_geno_combinations(3, 2)
jpayne@68 222 4
jpayne@68 223 """
jpayne@68 224 return comb(alleles + ploidy - 1, ploidy)
jpayne@68 225
jpayne@68 226
jpayne@68 227 ########################################################################
jpayne@68 228 ########################################################################
jpayne@68 229 ## Low level type conversion helpers
jpayne@68 230 ########################################################################
jpayne@68 231
jpayne@68 232
jpayne@68 233 cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
jpayne@68 234 return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)
jpayne@68 235
jpayne@68 236
jpayne@68 237 cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
jpayne@68 238 return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0
jpayne@68 239
jpayne@68 240
jpayne@68 241 cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:
jpayne@68 242
jpayne@68 243 if sample < 0:
jpayne@68 244 raise ValueError('genotype is only valid as a format field')
jpayne@68 245
jpayne@68 246 cdef int32_t *gt_arr = NULL
jpayne@68 247 cdef int ngt = 0
jpayne@68 248 ngt = bcf_get_genotypes(hdr, rec, &gt_arr, &ngt)
jpayne@68 249
jpayne@68 250 if ngt <= 0 or not gt_arr:
jpayne@68 251 return 0
jpayne@68 252
jpayne@68 253 assert ngt % rec.n_sample == 0
jpayne@68 254 cdef int max_ploidy = ngt // rec.n_sample
jpayne@68 255 cdef int32_t *gt = gt_arr + sample * max_ploidy
jpayne@68 256 cdef int ploidy = 0
jpayne@68 257
jpayne@68 258 while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
jpayne@68 259 gt += 1
jpayne@68 260 ploidy += 1
jpayne@68 261
jpayne@68 262 free(<void*>gt_arr)
jpayne@68 263
jpayne@68 264 return bcf_geno_combinations(ploidy, rec.n_allele)
jpayne@68 265
jpayne@68 266
jpayne@68 267 cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
jpayne@68 268 if not a:
jpayne@68 269 return None
jpayne@68 270 try:
jpayne@68 271 return tuple(charptr_to_str(a[i]) for i in range(n))
jpayne@68 272 finally:
jpayne@68 273 if free_after and a:
jpayne@68 274 free(a)
jpayne@68 275
jpayne@68 276
jpayne@68 277 cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
jpayne@68 278 cdef char *datac
jpayne@68 279 cdef int8_t *data8
jpayne@68 280 cdef int16_t *data16
jpayne@68 281 cdef int32_t *data32
jpayne@68 282 cdef float *dataf
jpayne@68 283 cdef int i
jpayne@68 284 cdef bytes b
jpayne@68 285
jpayne@68 286 if not data or n <= 0:
jpayne@68 287 return None
jpayne@68 288
jpayne@68 289 if type == BCF_BT_CHAR:
jpayne@68 290 datac = <char *>data
jpayne@68 291
jpayne@68 292 if not n:
jpayne@68 293 value = ()
jpayne@68 294 else:
jpayne@68 295 # Check if at least one null terminator is present
jpayne@68 296 if datac[n-1] == bcf_str_vector_end:
jpayne@68 297 # If so, create a string up to the first null terminator
jpayne@68 298 b = datac
jpayne@68 299 else:
jpayne@68 300 # Otherwise, copy the entire block
jpayne@68 301 b = datac[:n]
jpayne@68 302 value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
jpayne@68 303 else:
jpayne@68 304 value = []
jpayne@68 305 if type == BCF_BT_INT8:
jpayne@68 306 data8 = <int8_t *>data
jpayne@68 307 for i in range(n):
jpayne@68 308 if data8[i] == bcf_int8_vector_end:
jpayne@68 309 break
jpayne@68 310 value.append(data8[i] if data8[i] != bcf_int8_missing else None)
jpayne@68 311 elif type == BCF_BT_INT16:
jpayne@68 312 data16 = <int16_t *>data
jpayne@68 313 for i in range(n):
jpayne@68 314 if data16[i] == bcf_int16_vector_end:
jpayne@68 315 break
jpayne@68 316 value.append(data16[i] if data16[i] != bcf_int16_missing else None)
jpayne@68 317 elif type == BCF_BT_INT32:
jpayne@68 318 data32 = <int32_t *>data
jpayne@68 319 for i in range(n):
jpayne@68 320 if data32[i] == bcf_int32_vector_end:
jpayne@68 321 break
jpayne@68 322 value.append(data32[i] if data32[i] != bcf_int32_missing else None)
jpayne@68 323 elif type == BCF_BT_FLOAT:
jpayne@68 324 dataf = <float *>data
jpayne@68 325 for i in range(n):
jpayne@68 326 if bcf_float_is_vector_end(dataf[i]):
jpayne@68 327 break
jpayne@68 328 value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
jpayne@68 329 else:
jpayne@68 330 raise TypeError('unsupported info type code')
jpayne@68 331
jpayne@68 332 # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
jpayne@68 333 if not value:
jpayne@68 334 if scalar:
jpayne@68 335 value = None
jpayne@68 336 elif count <= 0:
jpayne@68 337 value = ()
jpayne@68 338 else:
jpayne@68 339 value = (None,)*count
jpayne@68 340 elif scalar and len(value) == 1:
jpayne@68 341 value = value[0]
jpayne@68 342 else:
jpayne@68 343 value = tuple(value)
jpayne@68 344
jpayne@68 345 return value
jpayne@68 346
jpayne@68 347
jpayne@68 348 cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
jpayne@68 349 cdef char *datac
jpayne@68 350 cdef int8_t *data8
jpayne@68 351 cdef int16_t *data16
jpayne@68 352 cdef int32_t *data32
jpayne@68 353 cdef float *dataf
jpayne@68 354 cdef ssize_t i, value_count = len(values)
jpayne@68 355
jpayne@68 356 assert value_count <= n
jpayne@68 357
jpayne@68 358 if bt_type == BCF_BT_CHAR:
jpayne@68 359 if not isinstance(values, (str, bytes)):
jpayne@68 360 values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
jpayne@68 361 value_count = len(values)
jpayne@68 362 assert value_count <= n
jpayne@68 363 datac = <char *>data
jpayne@68 364 memcpy(datac, <char *>values, value_count)
jpayne@68 365 for i in range(value_count, n):
jpayne@68 366 datac[i] = 0
jpayne@68 367 elif bt_type == BCF_BT_INT8:
jpayne@68 368 datai8 = <int8_t *>data
jpayne@68 369 for i in range(value_count):
jpayne@68 370 val = values[i]
jpayne@68 371 datai8[i] = val if val is not None else bcf_int8_missing
jpayne@68 372 for i in range(value_count, n):
jpayne@68 373 datai8[i] = bcf_int8_vector_end
jpayne@68 374 elif bt_type == BCF_BT_INT16:
jpayne@68 375 datai16 = <int16_t *>data
jpayne@68 376 for i in range(value_count):
jpayne@68 377 val = values[i]
jpayne@68 378 datai16[i] = val if val is not None else bcf_int16_missing
jpayne@68 379 for i in range(value_count, n):
jpayne@68 380 datai16[i] = bcf_int16_vector_end
jpayne@68 381 elif bt_type == BCF_BT_INT32:
jpayne@68 382 datai32 = <int32_t *>data
jpayne@68 383 for i in range(value_count):
jpayne@68 384 val = values[i]
jpayne@68 385 datai32[i] = val if val is not None else bcf_int32_missing
jpayne@68 386 for i in range(value_count, n):
jpayne@68 387 datai32[i] = bcf_int32_vector_end
jpayne@68 388 elif bt_type == BCF_BT_FLOAT:
jpayne@68 389 dataf = <float *>data
jpayne@68 390 for i in range(value_count):
jpayne@68 391 val = values[i]
jpayne@68 392 if val is None:
jpayne@68 393 bcf_float_set(dataf + i, bcf_float_missing)
jpayne@68 394 else:
jpayne@68 395 dataf[i] = val
jpayne@68 396 for i in range(value_count, n):
jpayne@68 397 bcf_float_set(dataf + i, bcf_float_vector_end)
jpayne@68 398 else:
jpayne@68 399 raise TypeError('unsupported type')
jpayne@68 400
jpayne@68 401
jpayne@68 402 cdef bcf_empty_array(int type, ssize_t n, int vlen):
jpayne@68 403 cdef char *datac
jpayne@68 404 cdef int32_t *data32
jpayne@68 405 cdef float *dataf
jpayne@68 406 cdef int i
jpayne@68 407
jpayne@68 408 if n <= 0:
jpayne@68 409 raise ValueError('Cannot create empty array')
jpayne@68 410
jpayne@68 411 if type == BCF_HT_STR:
jpayne@68 412 value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
jpayne@68 413 datac = <char *>value
jpayne@68 414 for i in range(n):
jpayne@68 415 datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
jpayne@68 416 elif type == BCF_HT_INT:
jpayne@68 417 value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
jpayne@68 418 data32 = <int32_t *><char *>value
jpayne@68 419 for i in range(n):
jpayne@68 420 data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@68 421 elif type == BCF_HT_REAL:
jpayne@68 422 value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
jpayne@68 423 dataf = <float *><char *>value
jpayne@68 424 for i in range(n):
jpayne@68 425 bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
jpayne@68 426 else:
jpayne@68 427 raise TypeError('unsupported header type code')
jpayne@68 428
jpayne@68 429 return value
jpayne@68 430
jpayne@68 431
jpayne@68 432 cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
jpayne@68 433 void *dst_data, int dst_type, size_t dst_values,
jpayne@68 434 int vlen):
jpayne@68 435 """copy data from src to dest where the size of the elements (src_type/dst_type) differ
jpayne@68 436 as well as the number of elements (src_values/dst_values).
jpayne@68 437 """
jpayne@68 438
jpayne@68 439 cdef char *src_datac
jpayne@68 440 cdef char *dst_datac
jpayne@68 441 cdef int8_t *src_datai8
jpayne@68 442 cdef int16_t *src_datai16
jpayne@68 443 cdef int32_t *src_datai32
jpayne@68 444 cdef int32_t *dst_datai
jpayne@68 445 cdef float *src_dataf
jpayne@68 446 cdef float *dst_dataf
jpayne@68 447 cdef ssize_t src_size, dst_size, i, j
jpayne@68 448 cdef int val
jpayne@68 449
jpayne@68 450 if src_values > dst_values:
jpayne@68 451 raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
jpayne@68 452
jpayne@68 453 if src_type == dst_type == BCF_BT_CHAR:
jpayne@68 454 src_datac = <char *>src_data
jpayne@68 455 dst_datac = <char *>dst_data
jpayne@68 456 memcpy(dst_datac, src_datac, src_values)
jpayne@68 457 for i in range(src_values, dst_values):
jpayne@68 458 dst_datac[i] = 0
jpayne@68 459 elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
jpayne@68 460 src_datai8 = <int8_t *>src_data
jpayne@68 461 dst_datai = <int32_t *>dst_data
jpayne@68 462 for i in range(src_values):
jpayne@68 463 val = src_datai8[i]
jpayne@68 464 if val == bcf_int8_missing:
jpayne@68 465 val = bcf_int32_missing
jpayne@68 466 elif val == bcf_int8_vector_end:
jpayne@68 467 val = bcf_int32_vector_end
jpayne@68 468 dst_datai[i] = val
jpayne@68 469 for i in range(src_values, dst_values):
jpayne@68 470 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@68 471 elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
jpayne@68 472 src_datai16 = <int16_t *>src_data
jpayne@68 473 dst_datai = <int32_t *>dst_data
jpayne@68 474 for i in range(src_values):
jpayne@68 475 val = src_datai16[i]
jpayne@68 476 if val == bcf_int16_missing:
jpayne@68 477 val = bcf_int32_missing
jpayne@68 478 elif val == bcf_int16_vector_end:
jpayne@68 479 val = bcf_int32_vector_end
jpayne@68 480 dst_datai[i] = val
jpayne@68 481 for i in range(src_values, dst_values):
jpayne@68 482 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@68 483 elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
jpayne@68 484 src_datai32 = <int32_t *>src_data
jpayne@68 485 dst_datai = <int32_t *>dst_data
jpayne@68 486 for i in range(src_values):
jpayne@68 487 dst_datai[i] = src_datai32[i]
jpayne@68 488 for i in range(src_values, dst_values):
jpayne@68 489 dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@68 490 elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
jpayne@68 491 src_dataf = <float *>src_data
jpayne@68 492 dst_dataf = <float *>dst_data
jpayne@68 493 for i in range(src_values):
jpayne@68 494 dst_dataf[i] = src_dataf[i]
jpayne@68 495 for i in range(src_values, dst_values):
jpayne@68 496 bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
jpayne@68 497 else:
jpayne@68 498 raise TypeError('unsupported types')
jpayne@68 499
jpayne@68 500
jpayne@68 501 cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
jpayne@68 502 if record is None:
jpayne@68 503 raise ValueError('record must not be None')
jpayne@68 504
jpayne@68 505 cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@68 506 cdef bcf1_t *r = record.ptr
jpayne@68 507
jpayne@68 508 if not check_header_id(hdr, hl_type, id):
jpayne@68 509 raise ValueError('Invalid header')
jpayne@68 510
jpayne@68 511 cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
jpayne@68 512 cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
jpayne@68 513
jpayne@68 514 scalar[0] = 0
jpayne@68 515
jpayne@68 516 if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
jpayne@68 517 count[0] = number
jpayne@68 518 elif length == BCF_VL_FIXED:
jpayne@68 519 if number == 1:
jpayne@68 520 scalar[0] = 1
jpayne@68 521 count[0] = number
jpayne@68 522 elif length == BCF_VL_R:
jpayne@68 523 count[0] = r.n_allele
jpayne@68 524 elif length == BCF_VL_A:
jpayne@68 525 count[0] = r.n_allele - 1
jpayne@68 526 elif length == BCF_VL_G:
jpayne@68 527 count[0] = bcf_genotype_count(hdr, r, sample)
jpayne@68 528 elif length == BCF_VL_VAR:
jpayne@68 529 count[0] = -1
jpayne@68 530 else:
jpayne@68 531 raise ValueError('Unknown format length')
jpayne@68 532
jpayne@68 533
jpayne@68 534 cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
jpayne@68 535 if record is None:
jpayne@68 536 raise ValueError('record must not be None')
jpayne@68 537
jpayne@68 538 cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@68 539
jpayne@68 540 cdef char *s
jpayne@68 541 cdef ssize_t count
jpayne@68 542 cdef int scalar
jpayne@68 543
jpayne@68 544 bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)
jpayne@68 545
jpayne@68 546 if z.len == 0:
jpayne@68 547 if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
jpayne@68 548 value = True
jpayne@68 549 elif scalar:
jpayne@68 550 value = None
jpayne@68 551 else:
jpayne@68 552 value = ()
jpayne@68 553 elif z.len == 1:
jpayne@68 554 if z.type == BCF_BT_INT8:
jpayne@68 555 value = z.v1.i if z.v1.i != bcf_int8_missing else None
jpayne@68 556 elif z.type == BCF_BT_INT16:
jpayne@68 557 value = z.v1.i if z.v1.i != bcf_int16_missing else None
jpayne@68 558 elif z.type == BCF_BT_INT32:
jpayne@68 559 value = z.v1.i if z.v1.i != bcf_int32_missing else None
jpayne@68 560 elif z.type == BCF_BT_FLOAT:
jpayne@68 561 value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
jpayne@68 562 elif z.type == BCF_BT_CHAR:
jpayne@68 563 value = force_str(chr(z.v1.i))
jpayne@68 564 else:
jpayne@68 565 raise TypeError('unsupported info type code')
jpayne@68 566
jpayne@68 567 if not scalar and value != ():
jpayne@68 568 value = (value,)
jpayne@68 569 else:
jpayne@68 570 value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
jpayne@68 571
jpayne@68 572 return value
jpayne@68 573
jpayne@68 574
jpayne@68 575 cdef object bcf_check_values(VariantRecord record, value, int sample,
jpayne@68 576 int hl_type, int ht_type,
jpayne@68 577 int id, int bt_type, ssize_t bt_len,
jpayne@68 578 ssize_t *value_count, int *scalar, int *realloc):
jpayne@68 579
jpayne@68 580 if record is None:
jpayne@68 581 raise ValueError('record must not be None')
jpayne@68 582
jpayne@68 583 bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)
jpayne@68 584
jpayne@68 585 # Validate values now that we know the type and size
jpayne@68 586 values = (value,) if not isinstance(value, (list, tuple)) else value
jpayne@68 587
jpayne@68 588 # Validate values now that we know the type and size
jpayne@68 589 if ht_type == BCF_HT_FLAG:
jpayne@68 590 value_count[0] = 1
jpayne@68 591 elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
jpayne@68 592 # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
jpayne@68 593 value_count[0] = -1
jpayne@68 594
jpayne@68 595 cdef int given = len(values)
jpayne@68 596 if value_count[0] != -1 and value_count[0] != given:
jpayne@68 597 if scalar[0]:
jpayne@68 598 raise TypeError('value expected to be scalar, given len={}'.format(given))
jpayne@68 599 else:
jpayne@68 600 raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
jpayne@68 601
jpayne@68 602 if ht_type == BCF_HT_REAL:
jpayne@68 603 for v in values:
jpayne@68 604 if not(v is None or isinstance(v, (float, int))):
jpayne@68 605 raise TypeError('invalid value for Float format')
jpayne@68 606 elif ht_type == BCF_HT_INT:
jpayne@68 607 for v in values:
jpayne@68 608 if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
jpayne@68 609 raise TypeError('invalid value for Integer format')
jpayne@68 610 for v in values:
jpayne@68 611 if not(v is None or bcf_int32_missing < v <= INT32_MAX):
jpayne@68 612 raise ValueError('Integer value too small/large to store in VCF/BCF')
jpayne@68 613 elif ht_type == BCF_HT_STR:
jpayne@68 614 values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
jpayne@68 615 elif ht_type == BCF_HT_FLAG:
jpayne@68 616 if values[0] not in (True, False, None, 1, 0):
jpayne@68 617 raise ValueError('Flag values must be: True, False, None, 1, 0')
jpayne@68 618 else:
jpayne@68 619 raise TypeError('unsupported type')
jpayne@68 620
jpayne@68 621 realloc[0] = 0
jpayne@68 622 if len(values) <= 1 and hl_type == BCF_HL_INFO:
jpayne@68 623 realloc[0] = 0
jpayne@68 624 elif len(values) > bt_len:
jpayne@68 625 realloc[0] = 1
jpayne@68 626 elif bt_type == BCF_BT_INT8:
jpayne@68 627 for v in values:
jpayne@68 628 if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
jpayne@68 629 realloc[0] = 1
jpayne@68 630 break
jpayne@68 631 elif bt_type == BCF_BT_INT16:
jpayne@68 632 for v in values:
jpayne@68 633 if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
jpayne@68 634 realloc[0] = 1
jpayne@68 635 break
jpayne@68 636
jpayne@68 637 return values
jpayne@68 638
jpayne@68 639
jpayne@68 640 cdef bcf_encode_alleles(VariantRecord record, values):
jpayne@68 641 if record is None:
jpayne@68 642 raise ValueError('record must not be None')
jpayne@68 643
jpayne@68 644 cdef bcf1_t *r = record.ptr
jpayne@68 645 cdef int32_t nalleles = r.n_allele
jpayne@68 646 cdef list gt_values = []
jpayne@68 647 cdef char *s
jpayne@68 648 cdef int i
jpayne@68 649
jpayne@68 650 if values is None:
jpayne@68 651 return ()
jpayne@68 652
jpayne@68 653 if not isinstance(values, (list, tuple)):
jpayne@68 654 values = (values,)
jpayne@68 655
jpayne@68 656 for value in values:
jpayne@68 657 if value is None:
jpayne@68 658 gt_values.append(bcf_gt_missing)
jpayne@68 659 elif isinstance(value, (str, bytes)):
jpayne@68 660 bvalue = force_bytes(value)
jpayne@68 661 s = bvalue
jpayne@68 662 for i in range(r.n_allele):
jpayne@68 663 if strcmp(r.d.allele[i], s) != 0:
jpayne@68 664 gt_values.append(bcf_gt_unphased(i))
jpayne@68 665 break
jpayne@68 666 else:
jpayne@68 667 raise ValueError('Unknown allele')
jpayne@68 668 else:
jpayne@68 669 i = value
jpayne@68 670 if not (0 <= i < nalleles):
jpayne@68 671 raise ValueError('Invalid allele index')
jpayne@68 672 gt_values.append(bcf_gt_unphased(i))
jpayne@68 673
jpayne@68 674 return gt_values
jpayne@68 675
jpayne@68 676
jpayne@68 677 cdef bcf_info_set_value(VariantRecord record, key, value):
jpayne@68 678 if record is None:
jpayne@68 679 raise ValueError('record must not be None')
jpayne@68 680
jpayne@68 681 cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@68 682 cdef bcf1_t *r = record.ptr
jpayne@68 683 cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
jpayne@68 684 cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
jpayne@68 685
jpayne@68 686 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 687 raise ValueError('Error unpacking VariantRecord')
jpayne@68 688
jpayne@68 689 cdef bytes bkey = force_bytes(key)
jpayne@68 690 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68 691
jpayne@68 692 if info:
jpayne@68 693 info_id = info.key
jpayne@68 694 else:
jpayne@68 695 info_id = bcf_header_get_info_id(hdr, bkey)
jpayne@68 696
jpayne@68 697 if info_id < 0:
jpayne@68 698 raise KeyError('unknown INFO: {}'.format(key))
jpayne@68 699
jpayne@68 700 if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@68 701 raise ValueError('Invalid header')
jpayne@68 702
jpayne@68 703 info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
jpayne@68 704 values = bcf_check_values(record, value, -1,
jpayne@68 705 BCF_HL_INFO, info_type, info_id,
jpayne@68 706 info.type if info else -1,
jpayne@68 707 info.len if info else -1,
jpayne@68 708 &value_count, &scalar, &realloc)
jpayne@68 709
jpayne@68 710 if info_type == BCF_HT_FLAG:
jpayne@68 711 if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
jpayne@68 712 raise ValueError('Unable to update INFO values')
jpayne@68 713 return
jpayne@68 714
jpayne@68 715 vlen = value_count < 0
jpayne@68 716 value_count = len(values)
jpayne@68 717
jpayne@68 718 # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
jpayne@68 719 # If we can, write updated values to existing allocated storage
jpayne@68 720 if 0 and info and not realloc:
jpayne@68 721 r.d.shared_dirty |= BCF1_DIRTY_INF
jpayne@68 722
jpayne@68 723 if value_count == 0:
jpayne@68 724 info.len = 0
jpayne@68 725 if not info.vptr:
jpayne@68 726 info.vptr = <uint8_t *>&info.v1.i
jpayne@68 727
jpayne@68 728 elif value_count == 1:
jpayne@68 729 # FIXME: Check if need to free vptr if info.len > 0?
jpayne@68 730 if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
jpayne@68 731 bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
jpayne@68 732 elif info.type == BCF_BT_FLOAT:
jpayne@68 733 bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
jpayne@68 734 else:
jpayne@68 735 raise TypeError('unsupported info type code')
jpayne@68 736
jpayne@68 737 info.len = 1
jpayne@68 738 if not info.vptr:
jpayne@68 739 info.vptr = <uint8_t *>&info.v1.i
jpayne@68 740 else:
jpayne@68 741 bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
jpayne@68 742
jpayne@68 743 return
jpayne@68 744
jpayne@68 745 alloc_len = max(1, value_count)
jpayne@68 746 if info and info.len > alloc_len:
jpayne@68 747 alloc_len = info.len
jpayne@68 748
jpayne@68 749 new_values = bcf_empty_array(info_type, alloc_len, vlen)
jpayne@68 750 cdef char *valp = <char *>new_values
jpayne@68 751
jpayne@68 752 if info_type == BCF_HT_INT:
jpayne@68 753 dst_type = BCF_BT_INT32
jpayne@68 754 elif info_type == BCF_HT_REAL:
jpayne@68 755 dst_type = BCF_BT_FLOAT
jpayne@68 756 elif info_type == BCF_HT_STR:
jpayne@68 757 dst_type = BCF_BT_CHAR
jpayne@68 758 else:
jpayne@68 759 raise ValueError('Unsupported INFO type')
jpayne@68 760
jpayne@68 761 bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
jpayne@68 762
jpayne@68 763 if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
jpayne@68 764 raise ValueError('Unable to update INFO values')
jpayne@68 765
jpayne@68 766
jpayne@68 767 cdef bcf_info_del_value(VariantRecord record, key):
jpayne@68 768 if record is None:
jpayne@68 769 raise ValueError('record must not be None')
jpayne@68 770
jpayne@68 771 cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@68 772 cdef bcf1_t *r = record.ptr
jpayne@68 773 cdef ssize_t value_count
jpayne@68 774 cdef int scalar
jpayne@68 775
jpayne@68 776 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 777 raise ValueError('Error unpacking VariantRecord')
jpayne@68 778
jpayne@68 779 cdef bytes bkey = force_bytes(key)
jpayne@68 780 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68 781
jpayne@68 782 if not info:
jpayne@68 783 raise KeyError(key)
jpayne@68 784
jpayne@68 785 bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)
jpayne@68 786
jpayne@68 787 if value_count <= 0:
jpayne@68 788 null_value = ()
jpayne@68 789 elif scalar:
jpayne@68 790 null_value = None
jpayne@68 791 else:
jpayne@68 792 null_value = (None,)*value_count
jpayne@68 793
jpayne@68 794 bcf_info_set_value(record, bkey, null_value)
jpayne@68 795
jpayne@68 796
jpayne@68 797 cdef bcf_format_get_value(VariantRecordSample sample, key):
jpayne@68 798 if sample is None:
jpayne@68 799 raise ValueError('sample must not be None')
jpayne@68 800
jpayne@68 801 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68 802 cdef bcf1_t *r = sample.record.ptr
jpayne@68 803 cdef ssize_t count
jpayne@68 804 cdef int scalar
jpayne@68 805
jpayne@68 806 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68 807 raise ValueError('Error unpacking VariantRecord')
jpayne@68 808
jpayne@68 809 cdef bytes bkey = force_bytes(key)
jpayne@68 810 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68 811
jpayne@68 812 if not fmt or not fmt.p:
jpayne@68 813 raise KeyError('invalid FORMAT: {}'.format(key))
jpayne@68 814
jpayne@68 815 if is_gt_fmt(hdr, fmt.id):
jpayne@68 816 return bcf_format_get_allele_indices(sample)
jpayne@68 817
jpayne@68 818 bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)
jpayne@68 819
jpayne@68 820 if fmt.p and fmt.n and fmt.size:
jpayne@68 821 return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
jpayne@68 822 elif scalar:
jpayne@68 823 return None
jpayne@68 824 elif count <= 0:
jpayne@68 825 return ()
jpayne@68 826 else:
jpayne@68 827 return (None,)*count
jpayne@68 828
jpayne@68 829
jpayne@68 830 cdef bcf_format_set_value(VariantRecordSample sample, key, value):
jpayne@68 831 if sample is None:
jpayne@68 832 raise ValueError('sample must not be None')
jpayne@68 833
jpayne@68 834 if key == 'phased':
jpayne@68 835 sample.phased = bool(value)
jpayne@68 836 return
jpayne@68 837
jpayne@68 838 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68 839 cdef bcf1_t *r = sample.record.ptr
jpayne@68 840 cdef int fmt_id
jpayne@68 841 cdef vdict_t *d
jpayne@68 842 cdef khiter_t k
jpayne@68 843 cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
jpayne@68 844 cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size
jpayne@68 845
jpayne@68 846 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68 847 raise ValueError('Error unpacking VariantRecord')
jpayne@68 848
jpayne@68 849 cdef bytes bkey = force_bytes(key)
jpayne@68 850 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68 851
jpayne@68 852 if fmt:
jpayne@68 853 fmt_id = fmt.id
jpayne@68 854 else:
jpayne@68 855 d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@68 856 k = kh_get_vdict(d, bkey)
jpayne@68 857
jpayne@68 858 if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
jpayne@68 859 raise KeyError('unknown format: {}'.format(key))
jpayne@68 860
jpayne@68 861 fmt_id = kh_val_vdict(d, k).id
jpayne@68 862
jpayne@68 863 if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
jpayne@68 864 raise ValueError('Invalid header')
jpayne@68 865
jpayne@68 866 fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
jpayne@68 867
jpayne@68 868 if fmt_type == BCF_HT_FLAG:
jpayne@68 869 raise ValueError('Flag types are not allowed on FORMATs')
jpayne@68 870
jpayne@68 871 if is_gt_fmt(hdr, fmt_id):
jpayne@68 872 value = bcf_encode_alleles(sample.record, value)
jpayne@68 873 # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
jpayne@68 874 fmt_type = BCF_HT_INT
jpayne@68 875
jpayne@68 876 values = bcf_check_values(sample.record, value, sample.index,
jpayne@68 877 BCF_HL_FMT, fmt_type, fmt_id,
jpayne@68 878 fmt.type if fmt else -1,
jpayne@68 879 fmt.n if fmt else -1,
jpayne@68 880 &value_count, &scalar, &realloc)
jpayne@68 881 vlen = value_count < 0
jpayne@68 882 value_count = len(values)
jpayne@68 883
jpayne@68 884 # If we can, write updated values to existing allocated storage.
jpayne@68 885 if fmt and not realloc:
jpayne@68 886 r.d.indiv_dirty = 1
jpayne@68 887 bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
jpayne@68 888 return
jpayne@68 889
jpayne@68 890 alloc_len = max(1, value_count)
jpayne@68 891 if fmt and fmt.n > alloc_len:
jpayne@68 892 alloc_len = fmt.n
jpayne@68 893
jpayne@68 894 nsamples = r.n_sample
jpayne@68 895 new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
jpayne@68 896 cdef char *new_values_p = <char *>new_values
jpayne@68 897
jpayne@68 898 if fmt_type == BCF_HT_INT:
jpayne@68 899 dst_type = BCF_BT_INT32
jpayne@68 900 dst_size = sizeof(int32_t) * alloc_len
jpayne@68 901 elif fmt_type == BCF_HT_REAL:
jpayne@68 902 dst_type = BCF_BT_FLOAT
jpayne@68 903 dst_size = sizeof(float) * alloc_len
jpayne@68 904 elif fmt_type == BCF_HT_STR:
jpayne@68 905 dst_type = BCF_BT_CHAR
jpayne@68 906 dst_size = sizeof(char) * alloc_len
jpayne@68 907 else:
jpayne@68 908 raise ValueError('Unsupported FORMAT type')
jpayne@68 909
jpayne@68 910 if fmt and nsamples > 1:
jpayne@68 911 for i in range(nsamples):
jpayne@68 912 bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
jpayne@68 913 new_values_p + i * dst_size, dst_type, alloc_len,
jpayne@68 914 vlen)
jpayne@68 915
jpayne@68 916 bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)
jpayne@68 917
jpayne@68 918 if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
jpayne@68 919 raise ValueError('Unable to update format values')
jpayne@68 920
jpayne@68 921
jpayne@68 922 cdef bcf_format_del_value(VariantRecordSample sample, key):
jpayne@68 923 if sample is None:
jpayne@68 924 raise ValueError('sample must not be None')
jpayne@68 925
jpayne@68 926 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68 927 cdef bcf1_t *r = sample.record.ptr
jpayne@68 928 cdef ssize_t value_count
jpayne@68 929 cdef int scalar
jpayne@68 930
jpayne@68 931 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68 932 raise ValueError('Error unpacking VariantRecord')
jpayne@68 933
jpayne@68 934 cdef bytes bkey = force_bytes(key)
jpayne@68 935 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68 936
jpayne@68 937 if not fmt or not fmt.p:
jpayne@68 938 raise KeyError(key)
jpayne@68 939
jpayne@68 940 bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index)
jpayne@68 941
jpayne@68 942 if value_count <= 0:
jpayne@68 943 null_value = ()
jpayne@68 944 elif scalar:
jpayne@68 945 null_value = None
jpayne@68 946 else:
jpayne@68 947 null_value = (None,)*value_count
jpayne@68 948
jpayne@68 949 bcf_format_set_value(sample, bkey, null_value)
jpayne@68 950
jpayne@68 951
jpayne@68 952 cdef bcf_format_get_allele_indices(VariantRecordSample sample):
jpayne@68 953 if sample is None:
jpayne@68 954 raise ValueError('sample must not be None')
jpayne@68 955
jpayne@68 956 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68 957 cdef bcf1_t *r = sample.record.ptr
jpayne@68 958 cdef int32_t n = r.n_sample
jpayne@68 959
jpayne@68 960 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68 961 raise ValueError('Error unpacking VariantRecord')
jpayne@68 962
jpayne@68 963 if sample.index < 0 or sample.index >= n or not r.n_fmt:
jpayne@68 964 return ()
jpayne@68 965
jpayne@68 966 cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@68 967 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@68 968
jpayne@68 969 if not gt0 or not fmt0.n:
jpayne@68 970 return ()
jpayne@68 971
jpayne@68 972 cdef int8_t *data8
jpayne@68 973 cdef int16_t *data16
jpayne@68 974 cdef int32_t *data32
jpayne@68 975 cdef int32_t a, nalleles = r.n_allele
jpayne@68 976 cdef list alleles = []
jpayne@68 977
jpayne@68 978 if fmt0.type == BCF_BT_INT8:
jpayne@68 979 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 980 for i in range(fmt0.n):
jpayne@68 981 if data8[i] == bcf_int8_vector_end:
jpayne@68 982 break
jpayne@68 983 elif data8[i] == bcf_gt_missing:
jpayne@68 984 a = -1
jpayne@68 985 else:
jpayne@68 986 a = bcf_gt_allele(data8[i])
jpayne@68 987 alleles.append(a if 0 <= a < nalleles else None)
jpayne@68 988 elif fmt0.type == BCF_BT_INT16:
jpayne@68 989 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 990 for i in range(fmt0.n):
jpayne@68 991 if data16[i] == bcf_int16_vector_end:
jpayne@68 992 break
jpayne@68 993 elif data16[i] == bcf_gt_missing:
jpayne@68 994 a = -1
jpayne@68 995 else:
jpayne@68 996 a = bcf_gt_allele(data16[i])
jpayne@68 997 alleles.append(a if 0 <= a < nalleles else None)
jpayne@68 998 elif fmt0.type == BCF_BT_INT32:
jpayne@68 999 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 1000 for i in range(fmt0.n):
jpayne@68 1001 if data32[i] == bcf_int32_vector_end:
jpayne@68 1002 break
jpayne@68 1003 elif data32[i] == bcf_gt_missing:
jpayne@68 1004 a = -1
jpayne@68 1005 else:
jpayne@68 1006 a = bcf_gt_allele(data32[i])
jpayne@68 1007 alleles.append(a if 0 <= a < nalleles else None)
jpayne@68 1008
jpayne@68 1009 return tuple(alleles)
jpayne@68 1010
jpayne@68 1011
jpayne@68 1012 cdef bcf_format_get_alleles(VariantRecordSample sample):
jpayne@68 1013 if sample is None:
jpayne@68 1014 raise ValueError('sample must not be None')
jpayne@68 1015
jpayne@68 1016 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68 1017 cdef bcf1_t *r = sample.record.ptr
jpayne@68 1018 cdef int32_t nsamples = r.n_sample
jpayne@68 1019
jpayne@68 1020 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68 1021 raise ValueError('Error unpacking VariantRecord')
jpayne@68 1022
jpayne@68 1023 cdef int32_t nalleles = r.n_allele
jpayne@68 1024
jpayne@68 1025 if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
jpayne@68 1026 return ()
jpayne@68 1027
jpayne@68 1028 cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@68 1029 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@68 1030
jpayne@68 1031 if not gt0 or not fmt0.n:
jpayne@68 1032 return ()
jpayne@68 1033
jpayne@68 1034 cdef int32_t a
jpayne@68 1035 cdef int8_t *data8
jpayne@68 1036 cdef int16_t *data16
jpayne@68 1037 cdef int32_t *data32
jpayne@68 1038 alleles = []
jpayne@68 1039 if fmt0.type == BCF_BT_INT8:
jpayne@68 1040 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 1041 for i in range(fmt0.n):
jpayne@68 1042 if data8[i] == bcf_int8_vector_end:
jpayne@68 1043 break
jpayne@68 1044 a = bcf_gt_allele(data8[i])
jpayne@68 1045 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
jpayne@68 1046 elif fmt0.type == BCF_BT_INT16:
jpayne@68 1047 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 1048 for i in range(fmt0.n):
jpayne@68 1049 if data16[i] == bcf_int16_vector_end:
jpayne@68 1050 break
jpayne@68 1051 a = bcf_gt_allele(data16[i])
jpayne@68 1052 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
jpayne@68 1053 elif fmt0.type == BCF_BT_INT32:
jpayne@68 1054 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 1055 for i in range(fmt0.n):
jpayne@68 1056 if data32[i] == bcf_int32_vector_end:
jpayne@68 1057 break
jpayne@68 1058 a = bcf_gt_allele(data32[i])
jpayne@68 1059 alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
jpayne@68 1060 return tuple(alleles)
jpayne@68 1061
jpayne@68 1062
jpayne@68 1063 cdef bint bcf_sample_get_phased(VariantRecordSample sample):
jpayne@68 1064 if sample is None:
jpayne@68 1065 raise ValueError('sample must not be None')
jpayne@68 1066
jpayne@68 1067 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68 1068 cdef bcf1_t *r = sample.record.ptr
jpayne@68 1069 cdef int32_t n = r.n_sample
jpayne@68 1070
jpayne@68 1071 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68 1072 raise ValueError('Error unpacking VariantRecord')
jpayne@68 1073
jpayne@68 1074 if sample.index < 0 or sample.index >= n or not r.n_fmt:
jpayne@68 1075 return False
jpayne@68 1076
jpayne@68 1077 cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@68 1078 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@68 1079
jpayne@68 1080 if not gt0 or not fmt0.n:
jpayne@68 1081 return False
jpayne@68 1082
jpayne@68 1083 cdef int8_t *data8
jpayne@68 1084 cdef int16_t *data16
jpayne@68 1085 cdef int32_t *data32
jpayne@68 1086
jpayne@68 1087 cdef bint phased = False
jpayne@68 1088
jpayne@68 1089 if fmt0.type == BCF_BT_INT8:
jpayne@68 1090 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 1091 for i in range(fmt0.n):
jpayne@68 1092 if data8[i] == bcf_int8_vector_end:
jpayne@68 1093 break
jpayne@68 1094 elif data8[i] == bcf_int8_missing:
jpayne@68 1095 continue
jpayne@68 1096 elif i and not bcf_gt_is_phased(data8[i]):
jpayne@68 1097 return False
jpayne@68 1098 else:
jpayne@68 1099 phased = True
jpayne@68 1100 elif fmt0.type == BCF_BT_INT16:
jpayne@68 1101 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 1102 for i in range(fmt0.n):
jpayne@68 1103 if data16[i] == bcf_int16_vector_end:
jpayne@68 1104 break
jpayne@68 1105 elif data16[i] == bcf_int16_missing:
jpayne@68 1106 continue
jpayne@68 1107 elif i and not bcf_gt_is_phased(data16[i]):
jpayne@68 1108 return False
jpayne@68 1109 else:
jpayne@68 1110 phased = True
jpayne@68 1111 elif fmt0.type == BCF_BT_INT32:
jpayne@68 1112 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 1113 for i in range(fmt0.n):
jpayne@68 1114 if data32[i] == bcf_int32_vector_end:
jpayne@68 1115 break
jpayne@68 1116 elif data32[i] == bcf_int32_missing:
jpayne@68 1117 continue
jpayne@68 1118 elif i and not bcf_gt_is_phased(data32[i]):
jpayne@68 1119 return False
jpayne@68 1120 else:
jpayne@68 1121 phased = True
jpayne@68 1122
jpayne@68 1123 return phased
jpayne@68 1124
jpayne@68 1125
jpayne@68 1126 cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
jpayne@68 1127 if sample is None:
jpayne@68 1128 raise ValueError('sample must not be None')
jpayne@68 1129
jpayne@68 1130 cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68 1131 cdef bcf1_t *r = sample.record.ptr
jpayne@68 1132 cdef int32_t n = r.n_sample
jpayne@68 1133
jpayne@68 1134 if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68 1135 raise ValueError('Error unpacking VariantRecord')
jpayne@68 1136
jpayne@68 1137 if sample.index < 0 or sample.index >= n or not r.n_fmt:
jpayne@68 1138 return
jpayne@68 1139
jpayne@68 1140 cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@68 1141 cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@68 1142
jpayne@68 1143 if not gt0 or not fmt0.n:
jpayne@68 1144 raise ValueError('Cannot set phased before genotype is set')
jpayne@68 1145
jpayne@68 1146 cdef int8_t *data8
jpayne@68 1147 cdef int16_t *data16
jpayne@68 1148 cdef int32_t *data32
jpayne@68 1149
jpayne@68 1150 if fmt0.type == BCF_BT_INT8:
jpayne@68 1151 data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 1152 for i in range(fmt0.n):
jpayne@68 1153 if data8[i] == bcf_int8_vector_end:
jpayne@68 1154 break
jpayne@68 1155 elif data8[i] == bcf_int8_missing:
jpayne@68 1156 continue
jpayne@68 1157 elif i:
jpayne@68 1158 data8[i] = (data8[i] & 0xFE) | phased
jpayne@68 1159 elif fmt0.type == BCF_BT_INT16:
jpayne@68 1160 data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 1161 for i in range(fmt0.n):
jpayne@68 1162 if data16[i] == bcf_int16_vector_end:
jpayne@68 1163 break
jpayne@68 1164 elif data16[i] == bcf_int16_missing:
jpayne@68 1165 continue
jpayne@68 1166 elif i:
jpayne@68 1167 data16[i] = (data16[i] & 0xFFFE) | phased
jpayne@68 1168 elif fmt0.type == BCF_BT_INT32:
jpayne@68 1169 data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68 1170 for i in range(fmt0.n):
jpayne@68 1171 if data32[i] == bcf_int32_vector_end:
jpayne@68 1172 break
jpayne@68 1173 elif data32[i] == bcf_int32_missing:
jpayne@68 1174 continue
jpayne@68 1175 elif i:
jpayne@68 1176 data32[i] = (data32[i] & 0xFFFFFFFE) | phased
jpayne@68 1177
jpayne@68 1178
jpayne@68 1179 cdef inline bcf_sync_end(VariantRecord record):
jpayne@68 1180 cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@68 1181 cdef bcf_info_t *info
jpayne@68 1182 cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
jpayne@68 1183 cdef int ref_len
jpayne@68 1184
jpayne@68 1185 # allow missing ref when instantiating a new record
jpayne@68 1186 if record.ref is not None:
jpayne@68 1187 ref_len = len(record.ref)
jpayne@68 1188 else:
jpayne@68 1189 ref_len = 0
jpayne@68 1190
jpayne@68 1191 # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
jpayne@68 1192 # Always keep END for symbolic alleles
jpayne@68 1193 if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len):
jpayne@68 1194 # If INFO/END is not defined in the header, it doesn't exist in the record
jpayne@68 1195 if end_id >= 0:
jpayne@68 1196 info = bcf_get_info(hdr, record.ptr, b'END')
jpayne@68 1197 if info and info.vptr:
jpayne@68 1198 if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
jpayne@68 1199 raise ValueError('Unable to delete END')
jpayne@68 1200 else:
jpayne@68 1201 # Create END header, if not present
jpayne@68 1202 if end_id < 0:
jpayne@68 1203 record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')
jpayne@68 1204
jpayne@68 1205 # Update to reflect stop position
jpayne@68 1206 bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)
jpayne@68 1207
jpayne@68 1208
jpayne@68 1209 cdef inline int has_symbolic_allele(VariantRecord record):
jpayne@68 1210 """Return index of first symbolic allele. 0 if no symbolic alleles."""
jpayne@68 1211
jpayne@68 1212 for i in range(1, record.ptr.n_allele):
jpayne@68 1213 alt = record.ptr.d.allele[i]
jpayne@68 1214 if alt[0] == b'<' and alt[len(alt) - 1] == b'>':
jpayne@68 1215 return i
jpayne@68 1216
jpayne@68 1217 return 0
jpayne@68 1218
jpayne@68 1219
jpayne@68 1220 ########################################################################
jpayne@68 1221 ########################################################################
jpayne@68 1222 ## Variant Header objects
jpayne@68 1223 ########################################################################
jpayne@68 1224
jpayne@68 1225
jpayne@68 1226 cdef bcf_header_remove_hrec(VariantHeader header, int i):
jpayne@68 1227 if header is None:
jpayne@68 1228 raise ValueError('header must not be None')
jpayne@68 1229
jpayne@68 1230 cdef bcf_hdr_t *hdr = header.ptr
jpayne@68 1231
jpayne@68 1232 if i < 0 or i >= hdr.nhrec:
jpayne@68 1233 raise ValueError('Invalid header record index')
jpayne@68 1234
jpayne@68 1235 cdef bcf_hrec_t *hrec = hdr.hrec[i]
jpayne@68 1236 hdr.nhrec -= 1
jpayne@68 1237
jpayne@68 1238 if i < hdr.nhrec:
jpayne@68 1239 memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))
jpayne@68 1240
jpayne@68 1241 bcf_hrec_destroy(hrec)
jpayne@68 1242 hdr.hrec[hdr.nhrec] = NULL
jpayne@68 1243 hdr.dirty = 1
jpayne@68 1244
jpayne@68 1245
jpayne@68 1246 #FIXME: implement a full mapping interface
jpayne@68 1247 #FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
jpayne@68 1248 # object lifetime.
jpayne@68 1249 cdef class VariantHeaderRecord(object):
jpayne@68 1250 """header record from a :class:`VariantHeader` object"""
jpayne@68 1251 def __init__(self, *args, **kwargs):
jpayne@68 1252 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 1253
jpayne@68 1254 @property
jpayne@68 1255 def type(self):
jpayne@68 1256 """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
jpayne@68 1257 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1258 if not r:
jpayne@68 1259 return None
jpayne@68 1260 return METADATA_TYPES[r.type]
jpayne@68 1261
jpayne@68 1262 @property
jpayne@68 1263 def key(self):
jpayne@68 1264 """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
jpayne@68 1265 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1266 return bcf_str_cache_get_charptr(r.key) if r and r.key else None
jpayne@68 1267
jpayne@68 1268 @property
jpayne@68 1269 def value(self):
jpayne@68 1270 """header value. Set only for generic lines, None for FILTER/INFO, etc."""
jpayne@68 1271 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1272 return charptr_to_str(r.value) if r and r.value else None
jpayne@68 1273
jpayne@68 1274 @property
jpayne@68 1275 def attrs(self):
jpayne@68 1276 """sequence of additional header attributes"""
jpayne@68 1277 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1278 if not r:
jpayne@68 1279 return ()
jpayne@68 1280 cdef int i
jpayne@68 1281 return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
jpayne@68 1282 charptr_to_str(r.vals[i]) if r.vals[i] else None)
jpayne@68 1283 for i in range(r.nkeys))
jpayne@68 1284
jpayne@68 1285 def __len__(self):
jpayne@68 1286 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1287 return r.nkeys if r else 0
jpayne@68 1288
jpayne@68 1289 def __bool__(self):
jpayne@68 1290 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1291 return r != NULL and r.nkeys != 0
jpayne@68 1292
jpayne@68 1293 def __getitem__(self, key):
jpayne@68 1294 """get attribute value"""
jpayne@68 1295 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1296 cdef int i
jpayne@68 1297 if r:
jpayne@68 1298 bkey = force_bytes(key)
jpayne@68 1299 for i in range(r.nkeys):
jpayne@68 1300 if r.keys[i] and r.keys[i] == bkey:
jpayne@68 1301 return charptr_to_str(r.vals[i]) if r.vals[i] else None
jpayne@68 1302 raise KeyError('cannot find metadata key')
jpayne@68 1303
jpayne@68 1304 def __iter__(self):
jpayne@68 1305 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1306 if not r:
jpayne@68 1307 return
jpayne@68 1308 cdef int i
jpayne@68 1309 for i in range(r.nkeys):
jpayne@68 1310 if r.keys[i]:
jpayne@68 1311 yield bcf_str_cache_get_charptr(r.keys[i])
jpayne@68 1312
jpayne@68 1313 def get(self, key, default=None):
jpayne@68 1314 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@68 1315 try:
jpayne@68 1316 return self[key]
jpayne@68 1317 except KeyError:
jpayne@68 1318 return default
jpayne@68 1319
jpayne@68 1320 def __contains__(self, key):
jpayne@68 1321 try:
jpayne@68 1322 self[key]
jpayne@68 1323 except KeyError:
jpayne@68 1324 return False
jpayne@68 1325 else:
jpayne@68 1326 return True
jpayne@68 1327
jpayne@68 1328 def iterkeys(self):
jpayne@68 1329 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68 1330 return iter(self)
jpayne@68 1331
jpayne@68 1332 def itervalues(self):
jpayne@68 1333 """D.itervalues() -> an iterator over the values of D"""
jpayne@68 1334 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1335 if not r:
jpayne@68 1336 return
jpayne@68 1337 cdef int i
jpayne@68 1338 for i in range(r.nkeys):
jpayne@68 1339 if r.keys[i]:
jpayne@68 1340 yield charptr_to_str(r.vals[i]) if r.vals[i] else None
jpayne@68 1341
jpayne@68 1342 def iteritems(self):
jpayne@68 1343 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68 1344 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1345 if not r:
jpayne@68 1346 return
jpayne@68 1347 cdef int i
jpayne@68 1348 for i in range(r.nkeys):
jpayne@68 1349 if r.keys[i]:
jpayne@68 1350 yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
jpayne@68 1351
jpayne@68 1352 def keys(self):
jpayne@68 1353 """D.keys() -> list of D's keys"""
jpayne@68 1354 return list(self)
jpayne@68 1355
jpayne@68 1356 def items(self):
jpayne@68 1357 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68 1358 return list(self.iteritems())
jpayne@68 1359
jpayne@68 1360 def values(self):
jpayne@68 1361 """D.values() -> list of D's values"""
jpayne@68 1362 return list(self.itervalues())
jpayne@68 1363
jpayne@68 1364 def update(self, items=None, **kwargs):
jpayne@68 1365 """D.update([E, ]**F) -> None.
jpayne@68 1366
jpayne@68 1367 Update D from dict/iterable E and F.
jpayne@68 1368 """
jpayne@68 1369 for k, v in items.items():
jpayne@68 1370 self[k] = v
jpayne@68 1371
jpayne@68 1372 if kwargs:
jpayne@68 1373 for k, v in kwargs.items():
jpayne@68 1374 self[k] = v
jpayne@68 1375
jpayne@68 1376 def pop(self, key, default=_nothing):
jpayne@68 1377 try:
jpayne@68 1378 value = self[key]
jpayne@68 1379 del self[key]
jpayne@68 1380 return value
jpayne@68 1381 except KeyError:
jpayne@68 1382 if default is not _nothing:
jpayne@68 1383 return default
jpayne@68 1384 raise
jpayne@68 1385
jpayne@68 1386 # Mappings are not hashable by default, but subclasses can change this
jpayne@68 1387 __hash__ = None
jpayne@68 1388
jpayne@68 1389 #TODO: implement __richcmp__
jpayne@68 1390
jpayne@68 1391 def __str__(self):
jpayne@68 1392 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1393
jpayne@68 1394 if not r:
jpayne@68 1395 raise ValueError('cannot convert deleted record to str')
jpayne@68 1396
jpayne@68 1397 cdef kstring_t hrec_str
jpayne@68 1398 hrec_str.l = hrec_str.m = 0
jpayne@68 1399 hrec_str.s = NULL
jpayne@68 1400
jpayne@68 1401 bcf_hrec_format(r, &hrec_str)
jpayne@68 1402
jpayne@68 1403 ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)
jpayne@68 1404
jpayne@68 1405 if hrec_str.m:
jpayne@68 1406 free(hrec_str.s)
jpayne@68 1407
jpayne@68 1408 return ret
jpayne@68 1409
jpayne@68 1410 # FIXME: Not safe -- causes trivial segfaults at the moment
jpayne@68 1411 def remove(self):
jpayne@68 1412 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1413 cdef bcf_hrec_t *r = self.ptr
jpayne@68 1414 if not r:
jpayne@68 1415 return
jpayne@68 1416 assert r.key
jpayne@68 1417 cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
jpayne@68 1418 bcf_hdr_remove(hdr, r.type, key)
jpayne@68 1419 self.ptr = NULL
jpayne@68 1420
jpayne@68 1421
jpayne@68 1422 cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
jpayne@68 1423 if not header:
jpayne@68 1424 raise ValueError('invalid VariantHeader')
jpayne@68 1425
jpayne@68 1426 if not hdr:
jpayne@68 1427 return None
jpayne@68 1428
jpayne@68 1429 cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
jpayne@68 1430 record.header = header
jpayne@68 1431 record.ptr = hdr
jpayne@68 1432
jpayne@68 1433 return record
jpayne@68 1434
jpayne@68 1435
jpayne@68 1436 cdef class VariantHeaderRecords(object):
jpayne@68 1437 """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
jpayne@68 1438 def __init__(self, *args, **kwargs):
jpayne@68 1439 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 1440
jpayne@68 1441 def __len__(self):
jpayne@68 1442 return self.header.ptr.nhrec
jpayne@68 1443
jpayne@68 1444 def __bool__(self):
jpayne@68 1445 return self.header.ptr.nhrec != 0
jpayne@68 1446
jpayne@68 1447 def __getitem__(self, index):
jpayne@68 1448 cdef int32_t i = index
jpayne@68 1449 if i < 0 or i >= self.header.ptr.nhrec:
jpayne@68 1450 raise IndexError('invalid header record index')
jpayne@68 1451 return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
jpayne@68 1452
jpayne@68 1453 def __iter__(self):
jpayne@68 1454 cdef int32_t i
jpayne@68 1455 for i in range(self.header.ptr.nhrec):
jpayne@68 1456 yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
jpayne@68 1457
jpayne@68 1458 __hash__ = None
jpayne@68 1459
jpayne@68 1460
jpayne@68 1461 cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
jpayne@68 1462 if not header:
jpayne@68 1463 raise ValueError('invalid VariantHeader')
jpayne@68 1464
jpayne@68 1465 cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
jpayne@68 1466 records.header = header
jpayne@68 1467 return records
jpayne@68 1468
jpayne@68 1469
jpayne@68 1470 cdef class VariantMetadata(object):
jpayne@68 1471 """filter, info or format metadata record from a :class:`VariantHeader` object"""
jpayne@68 1472 def __init__(self, *args, **kwargs):
jpayne@68 1473 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 1474
jpayne@68 1475 @property
jpayne@68 1476 def name(self):
jpayne@68 1477 """metadata name"""
jpayne@68 1478 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1479 return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
jpayne@68 1480
jpayne@68 1481 # Q: Should this be exposed?
jpayne@68 1482 @property
jpayne@68 1483 def id(self):
jpayne@68 1484 """metadata internal header id number"""
jpayne@68 1485 return self.id
jpayne@68 1486
jpayne@68 1487 @property
jpayne@68 1488 def number(self):
jpayne@68 1489 """metadata number (i.e. cardinality)"""
jpayne@68 1490 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1491
jpayne@68 1492 if not check_header_id(hdr, self.type, self.id):
jpayne@68 1493 raise ValueError('Invalid header id')
jpayne@68 1494
jpayne@68 1495 if self.type == BCF_HL_FLT:
jpayne@68 1496 return None
jpayne@68 1497
jpayne@68 1498 cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
jpayne@68 1499 if l == BCF_VL_FIXED:
jpayne@68 1500 return bcf_hdr_id2number(hdr, self.type, self.id)
jpayne@68 1501 elif l == BCF_VL_VAR:
jpayne@68 1502 return '.'
jpayne@68 1503 else:
jpayne@68 1504 return METADATA_LENGTHS[l]
jpayne@68 1505
jpayne@68 1506 @property
jpayne@68 1507 def type(self):
jpayne@68 1508 """metadata value type"""
jpayne@68 1509 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1510 if not check_header_id(hdr, self.type, self.id):
jpayne@68 1511 raise ValueError('Invalid header id')
jpayne@68 1512
jpayne@68 1513 if self.type == BCF_HL_FLT:
jpayne@68 1514 return None
jpayne@68 1515 return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
jpayne@68 1516
jpayne@68 1517 @property
jpayne@68 1518 def description(self):
jpayne@68 1519 """metadata description (or None if not set)"""
jpayne@68 1520 descr = self.record.get('Description')
jpayne@68 1521 if descr:
jpayne@68 1522 descr = descr.strip('"')
jpayne@68 1523 return force_str(descr)
jpayne@68 1524
jpayne@68 1525 @property
jpayne@68 1526 def record(self):
jpayne@68 1527 """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
jpayne@68 1528 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1529 if not check_header_id(hdr, self.type, self.id):
jpayne@68 1530 raise ValueError('Invalid header id')
jpayne@68 1531 cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
jpayne@68 1532 if not hrec:
jpayne@68 1533 return None
jpayne@68 1534 return makeVariantHeaderRecord(self.header, hrec)
jpayne@68 1535
jpayne@68 1536 def remove_header(self):
jpayne@68 1537 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1538 cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
jpayne@68 1539 bcf_hdr_remove(hdr, self.type, key)
jpayne@68 1540
jpayne@68 1541
jpayne@68 1542 cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
jpayne@68 1543 if not header:
jpayne@68 1544 raise ValueError('invalid VariantHeader')
jpayne@68 1545
jpayne@68 1546 if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
jpayne@68 1547 raise ValueError('invalid metadata type')
jpayne@68 1548
jpayne@68 1549 if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
jpayne@68 1550 raise ValueError('invalid metadata id')
jpayne@68 1551
jpayne@68 1552 cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
jpayne@68 1553 meta.header = header
jpayne@68 1554 meta.type = type
jpayne@68 1555 meta.id = id
jpayne@68 1556
jpayne@68 1557 return meta
jpayne@68 1558
jpayne@68 1559
jpayne@68 1560 cdef class VariantHeaderMetadata(object):
jpayne@68 1561 """mapping from filter, info or format name to :class:`VariantMetadata` object"""
jpayne@68 1562 def __init__(self, *args, **kwargs):
jpayne@68 1563 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 1564
jpayne@68 1565 def add(self, id, number, type, description, **kwargs):
jpayne@68 1566 """Add a new filter, info or format record"""
jpayne@68 1567 if id in self:
jpayne@68 1568 raise ValueError('Header already exists for id={}'.format(id))
jpayne@68 1569
jpayne@68 1570 if self.type == BCF_HL_FLT:
jpayne@68 1571 if number is not None:
jpayne@68 1572 raise ValueError('Number must be None when adding a filter')
jpayne@68 1573 if type is not None:
jpayne@68 1574 raise ValueError('Type must be None when adding a filter')
jpayne@68 1575
jpayne@68 1576 items = [('ID', unquoted_str(id)), ('Description', description)]
jpayne@68 1577 else:
jpayne@68 1578 if type not in VALUE_TYPES:
jpayne@68 1579 raise ValueError('unknown type specified: {}'.format(type))
jpayne@68 1580 if number is None:
jpayne@68 1581 number = '.'
jpayne@68 1582
jpayne@68 1583 items = [('ID', unquoted_str(id)),
jpayne@68 1584 ('Number', unquoted_str(number)),
jpayne@68 1585 ('Type', unquoted_str(type)),
jpayne@68 1586 ('Description', description)]
jpayne@68 1587
jpayne@68 1588 items += kwargs.items()
jpayne@68 1589 self.header.add_meta(METADATA_TYPES[self.type], items=items)
jpayne@68 1590
jpayne@68 1591 def __len__(self):
jpayne@68 1592 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1593 cdef bcf_idpair_t *idpair
jpayne@68 1594 cdef int32_t i, n = 0
jpayne@68 1595
jpayne@68 1596 for i in range(hdr.n[BCF_DT_ID]):
jpayne@68 1597 idpair = hdr.id[BCF_DT_ID] + i
jpayne@68 1598 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
jpayne@68 1599 n += 1
jpayne@68 1600 return n
jpayne@68 1601
jpayne@68 1602 def __bool__(self):
jpayne@68 1603 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1604 cdef bcf_idpair_t *idpair
jpayne@68 1605 cdef int32_t i
jpayne@68 1606
jpayne@68 1607 for i in range(hdr.n[BCF_DT_ID]):
jpayne@68 1608 idpair = hdr.id[BCF_DT_ID] + i
jpayne@68 1609 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
jpayne@68 1610 return True
jpayne@68 1611 return False
jpayne@68 1612
jpayne@68 1613 def __getitem__(self, key):
jpayne@68 1614 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1615 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@68 1616
jpayne@68 1617 cdef bytes bkey = force_bytes(key)
jpayne@68 1618 cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@68 1619
jpayne@68 1620 if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
jpayne@68 1621 raise KeyError('invalid key: {}'.format(key))
jpayne@68 1622
jpayne@68 1623 return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
jpayne@68 1624
jpayne@68 1625 def remove_header(self, key):
jpayne@68 1626 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1627 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@68 1628
jpayne@68 1629 cdef bytes bkey = force_bytes(key)
jpayne@68 1630 cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@68 1631
jpayne@68 1632 if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
jpayne@68 1633 raise KeyError('invalid key: {}'.format(key))
jpayne@68 1634
jpayne@68 1635 bcf_hdr_remove(hdr, self.type, bkey)
jpayne@68 1636 #bcf_hdr_sync(hdr)
jpayne@68 1637
jpayne@68 1638 def clear_header(self):
jpayne@68 1639 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1640 bcf_hdr_remove(hdr, self.type, NULL)
jpayne@68 1641 #bcf_hdr_sync(hdr)
jpayne@68 1642
jpayne@68 1643 def __iter__(self):
jpayne@68 1644 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1645 cdef bcf_idpair_t *idpair
jpayne@68 1646 cdef int32_t i
jpayne@68 1647
jpayne@68 1648 for i in range(hdr.n[BCF_DT_ID]):
jpayne@68 1649 idpair = hdr.id[BCF_DT_ID] + i
jpayne@68 1650 if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
jpayne@68 1651 yield bcf_str_cache_get_charptr(idpair.key)
jpayne@68 1652
jpayne@68 1653 def get(self, key, default=None):
jpayne@68 1654 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@68 1655 try:
jpayne@68 1656 return self[key]
jpayne@68 1657 except KeyError:
jpayne@68 1658 return default
jpayne@68 1659
jpayne@68 1660 def __contains__(self, key):
jpayne@68 1661 try:
jpayne@68 1662 self[key]
jpayne@68 1663 except KeyError:
jpayne@68 1664 return False
jpayne@68 1665 else:
jpayne@68 1666 return True
jpayne@68 1667
jpayne@68 1668 def iterkeys(self):
jpayne@68 1669 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68 1670 return iter(self)
jpayne@68 1671
jpayne@68 1672 def itervalues(self):
jpayne@68 1673 """D.itervalues() -> an iterator over the values of D"""
jpayne@68 1674 for key in self:
jpayne@68 1675 yield self[key]
jpayne@68 1676
jpayne@68 1677 def iteritems(self):
jpayne@68 1678 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68 1679 for key in self:
jpayne@68 1680 yield (key, self[key])
jpayne@68 1681
jpayne@68 1682 def keys(self):
jpayne@68 1683 """D.keys() -> list of D's keys"""
jpayne@68 1684 return list(self)
jpayne@68 1685
jpayne@68 1686 def items(self):
jpayne@68 1687 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68 1688 return list(self.iteritems())
jpayne@68 1689
jpayne@68 1690 def values(self):
jpayne@68 1691 """D.values() -> list of D's values"""
jpayne@68 1692 return list(self.itervalues())
jpayne@68 1693
jpayne@68 1694 # Mappings are not hashable by default, but subclasses can change this
jpayne@68 1695 __hash__ = None
jpayne@68 1696
jpayne@68 1697 #TODO: implement __richcmp__
jpayne@68 1698
jpayne@68 1699
jpayne@68 1700 cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
jpayne@68 1701 if not header:
jpayne@68 1702 raise ValueError('invalid VariantHeader')
jpayne@68 1703
jpayne@68 1704 cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
jpayne@68 1705 meta.header = header
jpayne@68 1706 meta.type = type
jpayne@68 1707
jpayne@68 1708 return meta
jpayne@68 1709
jpayne@68 1710
jpayne@68 1711 cdef class VariantContig(object):
jpayne@68 1712 """contig metadata from a :class:`VariantHeader`"""
jpayne@68 1713 def __init__(self, *args, **kwargs):
jpayne@68 1714 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 1715
jpayne@68 1716 @property
jpayne@68 1717 def name(self):
jpayne@68 1718 """contig name"""
jpayne@68 1719 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1720 return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
jpayne@68 1721
jpayne@68 1722 @property
jpayne@68 1723 def id(self):
jpayne@68 1724 """contig internal id number"""
jpayne@68 1725 return self.id
jpayne@68 1726
jpayne@68 1727 @property
jpayne@68 1728 def length(self):
jpayne@68 1729 """contig length or None if not available"""
jpayne@68 1730 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1731 cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
jpayne@68 1732 return length if length else None
jpayne@68 1733
jpayne@68 1734 @property
jpayne@68 1735 def header_record(self):
jpayne@68 1736 """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
jpayne@68 1737 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1738 cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
jpayne@68 1739 return makeVariantHeaderRecord(self.header, hrec)
jpayne@68 1740
jpayne@68 1741 def remove_header(self):
jpayne@68 1742 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1743 cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key
jpayne@68 1744 bcf_hdr_remove(hdr, BCF_HL_CTG, key)
jpayne@68 1745
jpayne@68 1746
jpayne@68 1747 cdef VariantContig makeVariantContig(VariantHeader header, int id):
jpayne@68 1748 if not header:
jpayne@68 1749 raise ValueError('invalid VariantHeader')
jpayne@68 1750
jpayne@68 1751 if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
jpayne@68 1752 raise ValueError('invalid contig id')
jpayne@68 1753
jpayne@68 1754 cdef VariantContig contig = VariantContig.__new__(VariantContig)
jpayne@68 1755 contig.header = header
jpayne@68 1756 contig.id = id
jpayne@68 1757
jpayne@68 1758 return contig
jpayne@68 1759
jpayne@68 1760
jpayne@68 1761 cdef class VariantHeaderContigs(object):
jpayne@68 1762 """mapping from contig name or index to :class:`VariantContig` object."""
jpayne@68 1763 def __init__(self, *args, **kwargs):
jpayne@68 1764 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 1765
jpayne@68 1766 def __len__(self):
jpayne@68 1767 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1768 assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
jpayne@68 1769 return hdr.n[BCF_DT_CTG]
jpayne@68 1770
jpayne@68 1771 def __bool__(self):
jpayne@68 1772 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1773 assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
jpayne@68 1774 return hdr.n[BCF_DT_CTG] != 0
jpayne@68 1775
jpayne@68 1776 def __getitem__(self, key):
jpayne@68 1777 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1778 cdef int index
jpayne@68 1779
jpayne@68 1780 if isinstance(key, int):
jpayne@68 1781 index = key
jpayne@68 1782 if index < 0 or index >= hdr.n[BCF_DT_CTG]:
jpayne@68 1783 raise IndexError('invalid contig index')
jpayne@68 1784 return makeVariantContig(self.header, index)
jpayne@68 1785
jpayne@68 1786 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
jpayne@68 1787 cdef bytes bkey = force_bytes(key)
jpayne@68 1788 cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@68 1789
jpayne@68 1790 if k == kh_end(d):
jpayne@68 1791 raise KeyError('invalid contig: {}'.format(key))
jpayne@68 1792
jpayne@68 1793 cdef int id = kh_val_vdict(d, k).id
jpayne@68 1794
jpayne@68 1795 return makeVariantContig(self.header, id)
jpayne@68 1796
jpayne@68 1797 def remove_header(self, key):
jpayne@68 1798 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1799 cdef int index
jpayne@68 1800 cdef const char *ckey
jpayne@68 1801 cdef vdict_t *d
jpayne@68 1802 cdef khiter_t k
jpayne@68 1803
jpayne@68 1804 if isinstance(key, int):
jpayne@68 1805 index = key
jpayne@68 1806 if index < 0 or index >= hdr.n[BCF_DT_CTG]:
jpayne@68 1807 raise IndexError('invalid contig index')
jpayne@68 1808 ckey = hdr.id[BCF_DT_CTG][self.id].key
jpayne@68 1809 else:
jpayne@68 1810 d = <vdict_t *>hdr.dict[BCF_DT_CTG]
jpayne@68 1811 key = force_bytes(key)
jpayne@68 1812 if kh_get_vdict(d, key) == kh_end(d):
jpayne@68 1813 raise KeyError('invalid contig: {}'.format(key))
jpayne@68 1814 ckey = key
jpayne@68 1815
jpayne@68 1816 bcf_hdr_remove(hdr, BCF_HL_CTG, ckey)
jpayne@68 1817
jpayne@68 1818 def clear_header(self):
jpayne@68 1819 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1820 bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
jpayne@68 1821 #bcf_hdr_sync(hdr)
jpayne@68 1822
jpayne@68 1823 def __iter__(self):
jpayne@68 1824 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1825 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
jpayne@68 1826 cdef uint32_t n = kh_size(d)
jpayne@68 1827
jpayne@68 1828 assert n == hdr.n[BCF_DT_CTG]
jpayne@68 1829
jpayne@68 1830 for i in range(n):
jpayne@68 1831 yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
jpayne@68 1832
jpayne@68 1833 def get(self, key, default=None):
jpayne@68 1834 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@68 1835 try:
jpayne@68 1836 return self[key]
jpayne@68 1837 except KeyError:
jpayne@68 1838 return default
jpayne@68 1839
jpayne@68 1840 def __contains__(self, key):
jpayne@68 1841 try:
jpayne@68 1842 self[key]
jpayne@68 1843 except KeyError:
jpayne@68 1844 return False
jpayne@68 1845 else:
jpayne@68 1846 return True
jpayne@68 1847
jpayne@68 1848 def iterkeys(self):
jpayne@68 1849 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68 1850 return iter(self)
jpayne@68 1851
jpayne@68 1852 def itervalues(self):
jpayne@68 1853 """D.itervalues() -> an iterator over the values of D"""
jpayne@68 1854 for key in self:
jpayne@68 1855 yield self[key]
jpayne@68 1856
jpayne@68 1857 def iteritems(self):
jpayne@68 1858 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68 1859 for key in self:
jpayne@68 1860 yield (key, self[key])
jpayne@68 1861
jpayne@68 1862 def keys(self):
jpayne@68 1863 """D.keys() -> list of D's keys"""
jpayne@68 1864 return list(self)
jpayne@68 1865
jpayne@68 1866 def items(self):
jpayne@68 1867 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68 1868 return list(self.iteritems())
jpayne@68 1869
jpayne@68 1870 def values(self):
jpayne@68 1871 """D.values() -> list of D's values"""
jpayne@68 1872 return list(self.itervalues())
jpayne@68 1873
jpayne@68 1874 # Mappings are not hashable by default, but subclasses can change this
jpayne@68 1875 __hash__ = None
jpayne@68 1876
jpayne@68 1877 #TODO: implement __richcmp__
jpayne@68 1878
jpayne@68 1879 def add(self, id, length=None, **kwargs):
jpayne@68 1880 """Add a new contig record"""
jpayne@68 1881 if id in self:
jpayne@68 1882 raise ValueError('Header already exists for contig {}'.format(id))
jpayne@68 1883
jpayne@68 1884 items = [('ID', unquoted_str(id))]
jpayne@68 1885 if length is not None:
jpayne@68 1886 items.append(("length", unquoted_str(length)))
jpayne@68 1887 items += kwargs.items()
jpayne@68 1888 self.header.add_meta('contig', items=items)
jpayne@68 1889
jpayne@68 1890
jpayne@68 1891 cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
jpayne@68 1892 if not header:
jpayne@68 1893 raise ValueError('invalid VariantHeader')
jpayne@68 1894
jpayne@68 1895 cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
jpayne@68 1896 contigs.header = header
jpayne@68 1897
jpayne@68 1898 return contigs
jpayne@68 1899
jpayne@68 1900
jpayne@68 1901 cdef class VariantHeaderSamples(object):
jpayne@68 1902 """sequence of sample names from a :class:`VariantHeader` object"""
jpayne@68 1903 def __init__(self, *args, **kwargs):
jpayne@68 1904 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 1905
jpayne@68 1906 def __len__(self):
jpayne@68 1907 return bcf_hdr_nsamples(self.header.ptr)
jpayne@68 1908
jpayne@68 1909 def __bool__(self):
jpayne@68 1910 return bcf_hdr_nsamples(self.header.ptr) != 0
jpayne@68 1911
jpayne@68 1912 def __getitem__(self, index):
jpayne@68 1913 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1914 cdef int32_t n = bcf_hdr_nsamples(hdr)
jpayne@68 1915 cdef int32_t i = index
jpayne@68 1916
jpayne@68 1917 if i < 0 or i >= n:
jpayne@68 1918 raise IndexError('invalid sample index')
jpayne@68 1919
jpayne@68 1920 return charptr_to_str(hdr.samples[i])
jpayne@68 1921
jpayne@68 1922 def __iter__(self):
jpayne@68 1923 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1924 cdef int32_t i, n = bcf_hdr_nsamples(hdr)
jpayne@68 1925
jpayne@68 1926 for i in range(n):
jpayne@68 1927 yield charptr_to_str(hdr.samples[i])
jpayne@68 1928
jpayne@68 1929 def __contains__(self, key):
jpayne@68 1930 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 1931 cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
jpayne@68 1932 cdef bytes bkey = force_bytes(key)
jpayne@68 1933 cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@68 1934
jpayne@68 1935 return k != kh_end(d)
jpayne@68 1936
jpayne@68 1937 # Mappings are not hashable by default, but subclasses can change this
jpayne@68 1938 __hash__ = None
jpayne@68 1939
jpayne@68 1940 #TODO: implement __richcmp__
jpayne@68 1941
jpayne@68 1942 def add(self, name):
jpayne@68 1943 """Add a new sample"""
jpayne@68 1944 self.header.add_sample(name)
jpayne@68 1945
jpayne@68 1946
jpayne@68 1947 cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
jpayne@68 1948 if not header:
jpayne@68 1949 raise ValueError('invalid VariantHeader')
jpayne@68 1950
jpayne@68 1951 cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
jpayne@68 1952 samples.header = header
jpayne@68 1953
jpayne@68 1954 return samples
jpayne@68 1955
jpayne@68 1956
jpayne@68 1957 cdef class VariantHeader(object):
jpayne@68 1958 """header information for a :class:`VariantFile` object"""
jpayne@68 1959 #FIXME: Add structured proxy
jpayne@68 1960 #FIXME: Add generic proxy
jpayne@68 1961 #FIXME: Add mutable methods
jpayne@68 1962
jpayne@68 1963 # See makeVariantHeader for C constructor
jpayne@68 1964 def __cinit__(self):
jpayne@68 1965 self.ptr = NULL
jpayne@68 1966
jpayne@68 1967 # Python constructor
jpayne@68 1968 def __init__(self):
jpayne@68 1969 self.ptr = bcf_hdr_init(b'w')
jpayne@68 1970 if not self.ptr:
jpayne@68 1971 raise ValueError('cannot create VariantHeader')
jpayne@68 1972
jpayne@68 1973 def __dealloc__(self):
jpayne@68 1974 if self.ptr:
jpayne@68 1975 bcf_hdr_destroy(self.ptr)
jpayne@68 1976 self.ptr = NULL
jpayne@68 1977
jpayne@68 1978 def __bool__(self):
jpayne@68 1979 return self.ptr != NULL
jpayne@68 1980
jpayne@68 1981 def copy(self):
jpayne@68 1982 return makeVariantHeader(bcf_hdr_dup(self.ptr))
jpayne@68 1983
jpayne@68 1984 def merge(self, VariantHeader header):
jpayne@68 1985 if header is None:
jpayne@68 1986 raise ValueError('header must not be None')
jpayne@68 1987 bcf_hdr_merge(self.ptr, header.ptr)
jpayne@68 1988
jpayne@68 1989 @property
jpayne@68 1990 def version(self):
jpayne@68 1991 """VCF version"""
jpayne@68 1992 return force_str(bcf_hdr_get_version(self.ptr))
jpayne@68 1993
jpayne@68 1994 @property
jpayne@68 1995 def samples(self):
jpayne@68 1996 """samples (:class:`VariantHeaderSamples`)"""
jpayne@68 1997 return makeVariantHeaderSamples(self)
jpayne@68 1998
jpayne@68 1999 @property
jpayne@68 2000 def records(self):
jpayne@68 2001 """header records (:class:`VariantHeaderRecords`)"""
jpayne@68 2002 return makeVariantHeaderRecords(self)
jpayne@68 2003
jpayne@68 2004 @property
jpayne@68 2005 def contigs(self):
jpayne@68 2006 """contig information (:class:`VariantHeaderContigs`)"""
jpayne@68 2007 return makeVariantHeaderContigs(self)
jpayne@68 2008
jpayne@68 2009 @property
jpayne@68 2010 def filters(self):
jpayne@68 2011 """filter metadata (:class:`VariantHeaderMetadata`)"""
jpayne@68 2012 return makeVariantHeaderMetadata(self, BCF_HL_FLT)
jpayne@68 2013
jpayne@68 2014 @property
jpayne@68 2015 def info(self):
jpayne@68 2016 """info metadata (:class:`VariantHeaderMetadata`)"""
jpayne@68 2017 return makeVariantHeaderMetadata(self, BCF_HL_INFO)
jpayne@68 2018
jpayne@68 2019 @property
jpayne@68 2020 def formats(self):
jpayne@68 2021 """format metadata (:class:`VariantHeaderMetadata`)"""
jpayne@68 2022 return makeVariantHeaderMetadata(self, BCF_HL_FMT)
jpayne@68 2023
jpayne@68 2024 @property
jpayne@68 2025 def alts(self):
jpayne@68 2026 """alt metadata (:class:`dict` ID->record).
jpayne@68 2027
jpayne@68 2028 The data returned just a snapshot of alt records, is created
jpayne@68 2029 every time the property is requested, and modifications will
jpayne@68 2030 not be reflected in the header metadata and vice versa.
jpayne@68 2031
jpayne@68 2032 i.e. it is just a dict that reflects the state of alt records
jpayne@68 2033 at the time it is created.
jpayne@68 2034 """
jpayne@68 2035 return {record['ID']:record for record in self.records
jpayne@68 2036 if record.key.upper() == 'ALT' }
jpayne@68 2037
jpayne@68 2038 # only safe to do when opening an htsfile
jpayne@68 2039 cdef _subset_samples(self, include_samples):
jpayne@68 2040 keep_samples = set(self.samples)
jpayne@68 2041 include_samples = set(include_samples)
jpayne@68 2042 missing_samples = include_samples - keep_samples
jpayne@68 2043 keep_samples &= include_samples
jpayne@68 2044
jpayne@68 2045 if missing_samples:
jpayne@68 2046 # FIXME: add specialized exception with payload
jpayne@68 2047 raise ValueError(
jpayne@68 2048 'missing {:d} requested samples'.format(
jpayne@68 2049 len(missing_samples)))
jpayne@68 2050
jpayne@68 2051 keep_samples = force_bytes(','.join(keep_samples))
jpayne@68 2052 cdef char *keep = <char *>keep_samples if keep_samples else NULL
jpayne@68 2053 cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
jpayne@68 2054
jpayne@68 2055 if ret != 0:
jpayne@68 2056 raise ValueError(
jpayne@68 2057 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
jpayne@68 2058
jpayne@68 2059 def __str__(self):
jpayne@68 2060 cdef int hlen
jpayne@68 2061 cdef kstring_t line
jpayne@68 2062 line.l = line.m = 0
jpayne@68 2063 line.s = NULL
jpayne@68 2064
jpayne@68 2065 if bcf_hdr_format(self.ptr, 0, &line) < 0:
jpayne@68 2066 if line.m:
jpayne@68 2067 free(line.s)
jpayne@68 2068 raise ValueError('bcf_hdr_format failed')
jpayne@68 2069
jpayne@68 2070 ret = charptr_to_str_w_len(line.s, line.l)
jpayne@68 2071
jpayne@68 2072 if line.m:
jpayne@68 2073 free(line.s)
jpayne@68 2074 return ret
jpayne@68 2075
jpayne@68 2076 def new_record(self, contig=None, start=0, stop=0, alleles=None,
jpayne@68 2077 id=None, qual=None, filter=None, info=None, samples=None,
jpayne@68 2078 **kwargs):
jpayne@68 2079 """Create a new empty VariantRecord.
jpayne@68 2080
jpayne@68 2081 Arguments are currently experimental. Use with caution and expect
jpayne@68 2082 changes in upcoming releases.
jpayne@68 2083
jpayne@68 2084 """
jpayne@68 2085 rec = makeVariantRecord(self, bcf_init())
jpayne@68 2086
jpayne@68 2087 if not rec:
jpayne@68 2088 raise MemoryError('unable to allocate BCF record')
jpayne@68 2089
jpayne@68 2090 rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
jpayne@68 2091
jpayne@68 2092 if contig is not None:
jpayne@68 2093 rec.contig = contig
jpayne@68 2094
jpayne@68 2095 rec.start = start
jpayne@68 2096 rec.stop = stop
jpayne@68 2097 rec.id = id
jpayne@68 2098 rec.qual = qual
jpayne@68 2099
jpayne@68 2100 if alleles is not None:
jpayne@68 2101 rec.alleles = alleles
jpayne@68 2102
jpayne@68 2103 if filter is not None:
jpayne@68 2104 if isinstance(filter, (list, tuple, VariantRecordFilter)):
jpayne@68 2105 for f in filter:
jpayne@68 2106 rec.filter.add(f)
jpayne@68 2107 else:
jpayne@68 2108 rec.filter.add(filter)
jpayne@68 2109
jpayne@68 2110 if info:
jpayne@68 2111 rec.info.update(info)
jpayne@68 2112
jpayne@68 2113 if kwargs:
jpayne@68 2114 rec.samples[0].update(kwargs)
jpayne@68 2115
jpayne@68 2116 if samples:
jpayne@68 2117 for i, sample in enumerate(samples):
jpayne@68 2118 rec.samples[i].update(sample)
jpayne@68 2119
jpayne@68 2120 return rec
jpayne@68 2121
jpayne@68 2122 def add_record(self, VariantHeaderRecord record):
jpayne@68 2123 """Add an existing :class:`VariantHeaderRecord` to this header"""
jpayne@68 2124 if record is None:
jpayne@68 2125 raise ValueError('record must not be None')
jpayne@68 2126
jpayne@68 2127 cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr)
jpayne@68 2128
jpayne@68 2129 bcf_hdr_add_hrec(self.ptr, hrec)
jpayne@68 2130
jpayne@68 2131 self._hdr_sync()
jpayne@68 2132
jpayne@68 2133 def add_line(self, line):
jpayne@68 2134 """Add a metadata line to this header"""
jpayne@68 2135 bline = force_bytes(line)
jpayne@68 2136 if bcf_hdr_append(self.ptr, bline) < 0:
jpayne@68 2137 raise ValueError('invalid header line')
jpayne@68 2138
jpayne@68 2139 self._hdr_sync()
jpayne@68 2140
jpayne@68 2141
jpayne@68 2142 def add_meta(self, key, value=None, items=None):
jpayne@68 2143 """Add metadata to this header"""
jpayne@68 2144 if not ((value is not None) ^ (items is not None)):
jpayne@68 2145 raise ValueError('either value or items must be specified')
jpayne@68 2146
jpayne@68 2147 cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
jpayne@68 2148 cdef int quoted
jpayne@68 2149
jpayne@68 2150 try:
jpayne@68 2151 key = force_bytes(key)
jpayne@68 2152 hrec.key = strdup(key)
jpayne@68 2153
jpayne@68 2154 if value is not None:
jpayne@68 2155 hrec.value = strdup(force_bytes(value))
jpayne@68 2156 else:
jpayne@68 2157 for key, value in items:
jpayne@68 2158 quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type")
jpayne@68 2159
jpayne@68 2160 key = force_bytes(key)
jpayne@68 2161 bcf_hrec_add_key(hrec, key, <int>len(key))
jpayne@68 2162
jpayne@68 2163 value = force_bytes(str(value))
jpayne@68 2164 bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
jpayne@68 2165 except:
jpayne@68 2166 bcf_hrec_destroy(hrec)
jpayne@68 2167 raise
jpayne@68 2168
jpayne@68 2169 bcf_hdr_add_hrec(self.ptr, hrec)
jpayne@68 2170
jpayne@68 2171 self._hdr_sync()
jpayne@68 2172
jpayne@68 2173 cdef _add_sample(self, name):
jpayne@68 2174 bname = force_bytes(name)
jpayne@68 2175 if bcf_hdr_add_sample(self.ptr, bname) < 0:
jpayne@68 2176 raise ValueError('Duplicated sample name: {}'.format(name))
jpayne@68 2177
jpayne@68 2178 cdef _hdr_sync(self):
jpayne@68 2179 cdef bcf_hdr_t *hdr = self.ptr
jpayne@68 2180 if hdr.dirty:
jpayne@68 2181 if bcf_hdr_sync(hdr) < 0:
jpayne@68 2182 raise MemoryError('unable to reallocate VariantHeader')
jpayne@68 2183
jpayne@68 2184 def add_sample(self, name):
jpayne@68 2185 """Add a new sample to this header"""
jpayne@68 2186 self._add_sample(name)
jpayne@68 2187 self._hdr_sync()
jpayne@68 2188
jpayne@68 2189 def add_samples(self, *args):
jpayne@68 2190 """Add several new samples to this header.
jpayne@68 2191 This function takes multiple arguments, each of which may
jpayne@68 2192 be either a sample name or an iterable returning sample names
jpayne@68 2193 (e.g., a list of sample names).
jpayne@68 2194 """
jpayne@68 2195 for arg in args:
jpayne@68 2196 if isinstance(arg, str):
jpayne@68 2197 self._add_sample(arg)
jpayne@68 2198 else:
jpayne@68 2199 for name in arg:
jpayne@68 2200 self._add_sample(name)
jpayne@68 2201 self._hdr_sync()
jpayne@68 2202
jpayne@68 2203
jpayne@68 2204 cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
jpayne@68 2205 if not hdr:
jpayne@68 2206 raise ValueError('cannot create VariantHeader')
jpayne@68 2207
jpayne@68 2208 cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
jpayne@68 2209 header.ptr = hdr
jpayne@68 2210
jpayne@68 2211 return header
jpayne@68 2212
jpayne@68 2213
jpayne@68 2214 cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2:
jpayne@68 2215 cdef vdict_t *d
jpayne@68 2216 cdef khiter_t k
jpayne@68 2217 cdef int info_id
jpayne@68 2218
jpayne@68 2219 if isinstance(key, str):
jpayne@68 2220 key = force_bytes(key)
jpayne@68 2221
jpayne@68 2222 d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@68 2223 k = kh_get_vdict(d, key)
jpayne@68 2224
jpayne@68 2225 if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
jpayne@68 2226 return -1
jpayne@68 2227
jpayne@68 2228 return kh_val_vdict(d, k).id
jpayne@68 2229
jpayne@68 2230
jpayne@68 2231 ########################################################################
jpayne@68 2232 ########################################################################
jpayne@68 2233 ## Variant Record objects
jpayne@68 2234 ########################################################################
jpayne@68 2235
jpayne@68 2236 cdef class VariantRecordFilter(object):
jpayne@68 2237 """Filters set on a :class:`VariantRecord` object, presented as a mapping from
jpayne@68 2238 filter index or name to :class:`VariantMetadata` object"""
jpayne@68 2239 def __init__(self, *args, **kwargs):
jpayne@68 2240 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 2241
jpayne@68 2242 def __len__(self):
jpayne@68 2243 return self.record.ptr.d.n_flt
jpayne@68 2244
jpayne@68 2245 def __bool__(self):
jpayne@68 2246 return self.record.ptr.d.n_flt != 0
jpayne@68 2247
jpayne@68 2248 def __getitem__(self, key):
jpayne@68 2249 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2250 cdef bcf1_t *r = self.record.ptr
jpayne@68 2251 cdef int index, id
jpayne@68 2252 cdef int n = r.d.n_flt
jpayne@68 2253
jpayne@68 2254 if isinstance(key, int):
jpayne@68 2255 index = key
jpayne@68 2256
jpayne@68 2257 if index < 0 or index >= n:
jpayne@68 2258 raise IndexError('invalid filter index')
jpayne@68 2259
jpayne@68 2260 id = r.d.flt[index]
jpayne@68 2261 else:
jpayne@68 2262 if key == '.':
jpayne@68 2263 key = 'PASS'
jpayne@68 2264
jpayne@68 2265 bkey = force_bytes(key)
jpayne@68 2266 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
jpayne@68 2267
jpayne@68 2268 if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
jpayne@68 2269 raise KeyError('Invalid filter: {}'.format(key))
jpayne@68 2270
jpayne@68 2271 return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
jpayne@68 2272
jpayne@68 2273 def add(self, key):
jpayne@68 2274 """Add a new filter"""
jpayne@68 2275 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2276 cdef bcf1_t *r = self.record.ptr
jpayne@68 2277 cdef int id
jpayne@68 2278
jpayne@68 2279 if key == '.':
jpayne@68 2280 key = 'PASS'
jpayne@68 2281
jpayne@68 2282 cdef bytes bkey = force_bytes(key)
jpayne@68 2283 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
jpayne@68 2284
jpayne@68 2285 if not check_header_id(hdr, BCF_HL_FLT, id):
jpayne@68 2286 raise KeyError('Invalid filter: {}'.format(key))
jpayne@68 2287
jpayne@68 2288 bcf_add_filter(hdr, r, id)
jpayne@68 2289
jpayne@68 2290 def __delitem__(self, key):
jpayne@68 2291 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2292 cdef bcf1_t *r = self.record.ptr
jpayne@68 2293 cdef int index, id
jpayne@68 2294 cdef int n = r.d.n_flt
jpayne@68 2295
jpayne@68 2296 if isinstance(key, int):
jpayne@68 2297 index = key
jpayne@68 2298
jpayne@68 2299 if index < 0 or index >= n:
jpayne@68 2300 raise IndexError('invalid filter index')
jpayne@68 2301
jpayne@68 2302 id = r.d.flt[index]
jpayne@68 2303 else:
jpayne@68 2304 if key == '.':
jpayne@68 2305 key = 'PASS'
jpayne@68 2306
jpayne@68 2307 bkey = force_bytes(key)
jpayne@68 2308 id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
jpayne@68 2309
jpayne@68 2310 if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
jpayne@68 2311 raise KeyError('Invalid filter: {}'.format(key))
jpayne@68 2312
jpayne@68 2313 bcf_remove_filter(hdr, r, id, 0)
jpayne@68 2314
jpayne@68 2315 def clear(self):
jpayne@68 2316 """Clear all filters"""
jpayne@68 2317 cdef bcf1_t *r = self.record.ptr
jpayne@68 2318 r.d.shared_dirty |= BCF1_DIRTY_FLT
jpayne@68 2319 r.d.n_flt = 0
jpayne@68 2320
jpayne@68 2321 def __iter__(self):
jpayne@68 2322 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2323 cdef bcf1_t *r = self.record.ptr
jpayne@68 2324 cdef int i
jpayne@68 2325
jpayne@68 2326 for i in range(r.d.n_flt):
jpayne@68 2327 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
jpayne@68 2328
jpayne@68 2329 def get(self, key, default=None):
jpayne@68 2330 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@68 2331 try:
jpayne@68 2332 return self[key]
jpayne@68 2333 except KeyError:
jpayne@68 2334 return default
jpayne@68 2335
jpayne@68 2336 def __contains__(self, key):
jpayne@68 2337 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2338 cdef bcf1_t *r = self.record.ptr
jpayne@68 2339 cdef bytes bkey = force_bytes(key)
jpayne@68 2340 return bcf_has_filter(hdr, r, bkey) == 1
jpayne@68 2341
jpayne@68 2342 def iterkeys(self):
jpayne@68 2343 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68 2344 return iter(self)
jpayne@68 2345
jpayne@68 2346 def itervalues(self):
jpayne@68 2347 """D.itervalues() -> an iterator over the values of D"""
jpayne@68 2348 for key in self:
jpayne@68 2349 yield self[key]
jpayne@68 2350
jpayne@68 2351 def iteritems(self):
jpayne@68 2352 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68 2353 for key in self:
jpayne@68 2354 yield (key, self[key])
jpayne@68 2355
jpayne@68 2356 def keys(self):
jpayne@68 2357 """D.keys() -> list of D's keys"""
jpayne@68 2358 return list(self)
jpayne@68 2359
jpayne@68 2360 def items(self):
jpayne@68 2361 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68 2362 return list(self.iteritems())
jpayne@68 2363
jpayne@68 2364 def values(self):
jpayne@68 2365 """D.values() -> list of D's values"""
jpayne@68 2366 return list(self.itervalues())
jpayne@68 2367
jpayne@68 2368 def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op):
jpayne@68 2369 if op != 2 and op != 3:
jpayne@68 2370 return NotImplemented
jpayne@68 2371
jpayne@68 2372 cdef bcf1_t *s = self.record.ptr
jpayne@68 2373 cdef bcf1_t *o = other.record.ptr
jpayne@68 2374
jpayne@68 2375 cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other))
jpayne@68 2376
jpayne@68 2377 if op == 3:
jpayne@68 2378 cmp = not cmp
jpayne@68 2379
jpayne@68 2380 return cmp
jpayne@68 2381
jpayne@68 2382 # Mappings are not hashable by default, but subclasses can change this
jpayne@68 2383 __hash__ = None
jpayne@68 2384
jpayne@68 2385 #TODO: implement __richcmp__
jpayne@68 2386
jpayne@68 2387
jpayne@68 2388 cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
jpayne@68 2389 if not record:
jpayne@68 2390 raise ValueError('invalid VariantRecord')
jpayne@68 2391
jpayne@68 2392 cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
jpayne@68 2393 filter.record = record
jpayne@68 2394
jpayne@68 2395 return filter
jpayne@68 2396
jpayne@68 2397
jpayne@68 2398 cdef class VariantRecordFormat(object):
jpayne@68 2399 """Format data present for each sample in a :class:`VariantRecord` object,
jpayne@68 2400 presented as mapping from format name to :class:`VariantMetadata` object."""
jpayne@68 2401 def __init__(self, *args, **kwargs):
jpayne@68 2402 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 2403
jpayne@68 2404 def __len__(self):
jpayne@68 2405 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2406 cdef bcf1_t *r = self.record.ptr
jpayne@68 2407 cdef int i, n = 0
jpayne@68 2408
jpayne@68 2409 for i in range(r.n_fmt):
jpayne@68 2410 if r.d.fmt[i].p:
jpayne@68 2411 n += 1
jpayne@68 2412 return n
jpayne@68 2413
jpayne@68 2414 def __bool__(self):
jpayne@68 2415 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2416 cdef bcf1_t *r = self.record.ptr
jpayne@68 2417 cdef int i
jpayne@68 2418
jpayne@68 2419 for i in range(r.n_fmt):
jpayne@68 2420 if r.d.fmt[i].p:
jpayne@68 2421 return True
jpayne@68 2422 return False
jpayne@68 2423
jpayne@68 2424 def __getitem__(self, key):
jpayne@68 2425 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2426 cdef bcf1_t *r = self.record.ptr
jpayne@68 2427
jpayne@68 2428 cdef bytes bkey = force_bytes(key)
jpayne@68 2429 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68 2430
jpayne@68 2431 if not fmt or not fmt.p:
jpayne@68 2432 raise KeyError('unknown format: {}'.format(key))
jpayne@68 2433
jpayne@68 2434 return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
jpayne@68 2435
jpayne@68 2436 def __delitem__(self, key):
jpayne@68 2437 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2438 cdef bcf1_t *r = self.record.ptr
jpayne@68 2439
jpayne@68 2440 cdef bytes bkey = force_bytes(key)
jpayne@68 2441 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68 2442
jpayne@68 2443 if not fmt or not fmt.p:
jpayne@68 2444 raise KeyError('unknown format: {}'.format(key))
jpayne@68 2445
jpayne@68 2446 if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
jpayne@68 2447 raise ValueError('Unable to delete FORMAT')
jpayne@68 2448
jpayne@68 2449 def clear(self):
jpayne@68 2450 """Clear all formats for all samples within the associated
jpayne@68 2451 :class:`VariantRecord` instance"""
jpayne@68 2452 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2453 cdef bcf1_t *r = self.record.ptr
jpayne@68 2454 cdef bcf_fmt_t *fmt
jpayne@68 2455 cdef const char *key
jpayne@68 2456 cdef int i
jpayne@68 2457
jpayne@68 2458 for i in reversed(range(r.n_fmt)):
jpayne@68 2459 fmt = &r.d.fmt[i]
jpayne@68 2460 if fmt.p:
jpayne@68 2461 key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
jpayne@68 2462 if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
jpayne@68 2463 raise ValueError('Unable to delete FORMAT')
jpayne@68 2464
jpayne@68 2465 def __iter__(self):
jpayne@68 2466 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2467 cdef bcf1_t *r = self.record.ptr
jpayne@68 2468 cdef bcf_fmt_t *fmt
jpayne@68 2469 cdef int i
jpayne@68 2470
jpayne@68 2471 for i in range(r.n_fmt):
jpayne@68 2472 fmt = &r.d.fmt[i]
jpayne@68 2473 if fmt.p:
jpayne@68 2474 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
jpayne@68 2475
jpayne@68 2476 def get(self, key, default=None):
jpayne@68 2477 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@68 2478 try:
jpayne@68 2479 return self[key]
jpayne@68 2480 except KeyError:
jpayne@68 2481 return default
jpayne@68 2482
jpayne@68 2483 def __contains__(self, key):
jpayne@68 2484 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2485 cdef bcf1_t *r = self.record.ptr
jpayne@68 2486 cdef bytes bkey = force_bytes(key)
jpayne@68 2487 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68 2488 return fmt != NULL and fmt.p != NULL
jpayne@68 2489
jpayne@68 2490 def iterkeys(self):
jpayne@68 2491 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68 2492 return iter(self)
jpayne@68 2493
jpayne@68 2494 def itervalues(self):
jpayne@68 2495 """D.itervalues() -> an iterator over the values of D"""
jpayne@68 2496 for key in self:
jpayne@68 2497 yield self[key]
jpayne@68 2498
jpayne@68 2499 def iteritems(self):
jpayne@68 2500 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68 2501 for key in self:
jpayne@68 2502 yield (key, self[key])
jpayne@68 2503
jpayne@68 2504 def keys(self):
jpayne@68 2505 """D.keys() -> list of D's keys"""
jpayne@68 2506 return list(self)
jpayne@68 2507
jpayne@68 2508 def items(self):
jpayne@68 2509 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68 2510 return list(self.iteritems())
jpayne@68 2511
jpayne@68 2512 def values(self):
jpayne@68 2513 """D.values() -> list of D's values"""
jpayne@68 2514 return list(self.itervalues())
jpayne@68 2515
jpayne@68 2516 # Mappings are not hashable by default, but subclasses can change this
jpayne@68 2517 __hash__ = None
jpayne@68 2518
jpayne@68 2519 #TODO: implement __richcmp__
jpayne@68 2520
jpayne@68 2521
jpayne@68 2522 cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
jpayne@68 2523 if not record:
jpayne@68 2524 raise ValueError('invalid VariantRecord')
jpayne@68 2525
jpayne@68 2526 cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
jpayne@68 2527 format.record = record
jpayne@68 2528
jpayne@68 2529 return format
jpayne@68 2530
jpayne@68 2531
jpayne@68 2532 #TODO: Add a getmeta method to return the corresponding VariantMetadata?
jpayne@68 2533 cdef class VariantRecordInfo(object):
jpayne@68 2534 """Info data stored in a :class:`VariantRecord` object, presented as a
jpayne@68 2535 mapping from info metadata name to value."""
jpayne@68 2536
jpayne@68 2537 def __init__(self, *args, **kwargs):
jpayne@68 2538 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 2539
jpayne@68 2540 def __len__(self):
jpayne@68 2541 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2542 cdef bcf1_t *r = self.record.ptr
jpayne@68 2543 cdef bcf_info_t *info
jpayne@68 2544 cdef const char *key
jpayne@68 2545 cdef int i, count = 0
jpayne@68 2546
jpayne@68 2547 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2548 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2549
jpayne@68 2550 for i in range(r.n_info):
jpayne@68 2551 info = &r.d.info[i]
jpayne@68 2552 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68 2553 if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
jpayne@68 2554 count += 1
jpayne@68 2555
jpayne@68 2556 return count
jpayne@68 2557
jpayne@68 2558 def __bool__(self):
jpayne@68 2559 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2560 cdef bcf1_t *r = self.record.ptr
jpayne@68 2561 cdef bcf_info_t *info
jpayne@68 2562 cdef const char *key
jpayne@68 2563 cdef int i
jpayne@68 2564
jpayne@68 2565 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2566 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2567
jpayne@68 2568 for i in range(r.n_info):
jpayne@68 2569 info = &r.d.info[i]
jpayne@68 2570 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68 2571 if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
jpayne@68 2572 return True
jpayne@68 2573
jpayne@68 2574 return False
jpayne@68 2575
jpayne@68 2576 def __getitem__(self, key):
jpayne@68 2577 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2578 cdef bcf1_t *r = self.record.ptr
jpayne@68 2579
jpayne@68 2580 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2581 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2582
jpayne@68 2583 cdef bytes bkey = force_bytes(key)
jpayne@68 2584
jpayne@68 2585 if strcmp(bkey, b'END') == 0:
jpayne@68 2586 raise KeyError('END is a reserved attribute; access is via record.stop')
jpayne@68 2587
jpayne@68 2588 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68 2589
jpayne@68 2590 # Cannot stop here if info == NULL, since flags must return False
jpayne@68 2591 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@68 2592
jpayne@68 2593 if info_id < 0:
jpayne@68 2594 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@68 2595
jpayne@68 2596 if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@68 2597 raise ValueError('Invalid header')
jpayne@68 2598
jpayne@68 2599 # Handle type=Flag values
jpayne@68 2600 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
jpayne@68 2601 return info != NULL and info.vptr != NULL
jpayne@68 2602
jpayne@68 2603 if not info or not info.vptr:
jpayne@68 2604 raise KeyError('Invalid INFO field: {}'.format(key))
jpayne@68 2605
jpayne@68 2606 return bcf_info_get_value(self.record, info)
jpayne@68 2607
jpayne@68 2608 def __setitem__(self, key, value):
jpayne@68 2609 cdef bytes bkey = force_bytes(key)
jpayne@68 2610
jpayne@68 2611 if strcmp(bkey, b'END') == 0:
jpayne@68 2612 raise KeyError('END is a reserved attribute; access is via record.stop')
jpayne@68 2613
jpayne@68 2614 if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0:
jpayne@68 2615 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2616
jpayne@68 2617 bcf_info_set_value(self.record, key, value)
jpayne@68 2618
jpayne@68 2619 def __delitem__(self, key):
jpayne@68 2620 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2621 cdef bcf1_t *r = self.record.ptr
jpayne@68 2622
jpayne@68 2623 cdef bytes bkey = force_bytes(key)
jpayne@68 2624 if strcmp(bkey, b'END') == 0:
jpayne@68 2625 raise KeyError('END is a reserved attribute; access is via record.stop')
jpayne@68 2626
jpayne@68 2627 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2628 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2629
jpayne@68 2630 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68 2631
jpayne@68 2632 # Cannot stop here if info == NULL, since flags must return False
jpayne@68 2633 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@68 2634
jpayne@68 2635 if info_id < 0:
jpayne@68 2636 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@68 2637
jpayne@68 2638 if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@68 2639 raise ValueError('Invalid header')
jpayne@68 2640
jpayne@68 2641 # Handle flags
jpayne@68 2642 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
jpayne@68 2643 return
jpayne@68 2644
jpayne@68 2645 if not info or not info.vptr:
jpayne@68 2646 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@68 2647
jpayne@68 2648 if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
jpayne@68 2649 raise ValueError('Unable to delete INFO')
jpayne@68 2650
jpayne@68 2651 def clear(self):
jpayne@68 2652 """Clear all info data"""
jpayne@68 2653 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2654 cdef bcf1_t *r = self.record.ptr
jpayne@68 2655 cdef bcf_info_t *info
jpayne@68 2656 cdef const char *key
jpayne@68 2657 cdef int i
jpayne@68 2658
jpayne@68 2659 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2660 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2661
jpayne@68 2662 for i in range(r.n_info):
jpayne@68 2663 info = &r.d.info[i]
jpayne@68 2664 if info and info.vptr:
jpayne@68 2665 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68 2666 if strcmp(key, b'END') == 0:
jpayne@68 2667 continue
jpayne@68 2668 if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
jpayne@68 2669 raise ValueError('Unable to delete INFO')
jpayne@68 2670
jpayne@68 2671 def __iter__(self):
jpayne@68 2672 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2673 cdef bcf1_t *r = self.record.ptr
jpayne@68 2674 cdef bcf_info_t *info
jpayne@68 2675 cdef const char *key
jpayne@68 2676 cdef int i
jpayne@68 2677
jpayne@68 2678 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2679 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2680
jpayne@68 2681 for i in range(r.n_info):
jpayne@68 2682 info = &r.d.info[i]
jpayne@68 2683 if info and info.vptr:
jpayne@68 2684 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68 2685 if strcmp(key, b'END') != 0:
jpayne@68 2686 yield bcf_str_cache_get_charptr(key)
jpayne@68 2687
jpayne@68 2688 def get(self, key, default=None):
jpayne@68 2689 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@68 2690 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2691 cdef bcf1_t *r = self.record.ptr
jpayne@68 2692
jpayne@68 2693 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2694 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2695
jpayne@68 2696 cdef bytes bkey = force_bytes(key)
jpayne@68 2697
jpayne@68 2698 if strcmp(bkey, b'END') == 0:
jpayne@68 2699 return default
jpayne@68 2700
jpayne@68 2701 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68 2702
jpayne@68 2703 # Cannot stop here if info == NULL, since flags must return False
jpayne@68 2704 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@68 2705
jpayne@68 2706 if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@68 2707 raise ValueError('Invalid header')
jpayne@68 2708
jpayne@68 2709 # Handle flags
jpayne@68 2710 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
jpayne@68 2711 return info != NULL and info.vptr != NULL
jpayne@68 2712
jpayne@68 2713 if not info or not info.vptr:
jpayne@68 2714 return default
jpayne@68 2715
jpayne@68 2716 return bcf_info_get_value(self.record, info)
jpayne@68 2717
jpayne@68 2718 def __contains__(self, key):
jpayne@68 2719 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2720 cdef bcf1_t *r = self.record.ptr
jpayne@68 2721
jpayne@68 2722 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2723 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2724
jpayne@68 2725 cdef bytes bkey = force_bytes(key)
jpayne@68 2726
jpayne@68 2727 if strcmp(bkey, b'END') == 0:
jpayne@68 2728 return False
jpayne@68 2729
jpayne@68 2730 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68 2731
jpayne@68 2732 return info != NULL and info.vptr != NULL
jpayne@68 2733
jpayne@68 2734 def iterkeys(self):
jpayne@68 2735 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68 2736 return iter(self)
jpayne@68 2737
jpayne@68 2738 def itervalues(self):
jpayne@68 2739 """D.itervalues() -> an iterator over the values of D"""
jpayne@68 2740 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2741 cdef bcf1_t *r = self.record.ptr
jpayne@68 2742 cdef bcf_info_t *info
jpayne@68 2743 cdef const char *key
jpayne@68 2744 cdef int i
jpayne@68 2745
jpayne@68 2746 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2747 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2748
jpayne@68 2749 for i in range(r.n_info):
jpayne@68 2750 info = &r.d.info[i]
jpayne@68 2751 if info and info.vptr:
jpayne@68 2752 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68 2753 if strcmp(key, b'END') != 0:
jpayne@68 2754 yield bcf_info_get_value(self.record, info)
jpayne@68 2755
jpayne@68 2756 def iteritems(self):
jpayne@68 2757 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68 2758 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2759 cdef bcf1_t *r = self.record.ptr
jpayne@68 2760 cdef bcf_info_t *info
jpayne@68 2761 cdef const char *key
jpayne@68 2762 cdef int i
jpayne@68 2763
jpayne@68 2764 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2765 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2766
jpayne@68 2767 for i in range(r.n_info):
jpayne@68 2768 info = &r.d.info[i]
jpayne@68 2769 if info and info.vptr:
jpayne@68 2770 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68 2771 if strcmp(key, b'END') != 0:
jpayne@68 2772 value = bcf_info_get_value(self.record, info)
jpayne@68 2773 yield bcf_str_cache_get_charptr(key), value
jpayne@68 2774
jpayne@68 2775 def keys(self):
jpayne@68 2776 """D.keys() -> list of D's keys"""
jpayne@68 2777 return list(self)
jpayne@68 2778
jpayne@68 2779 def items(self):
jpayne@68 2780 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68 2781 return list(self.iteritems())
jpayne@68 2782
jpayne@68 2783 def values(self):
jpayne@68 2784 """D.values() -> list of D's values"""
jpayne@68 2785 return list(self.itervalues())
jpayne@68 2786
jpayne@68 2787 def update(self, items=None, **kwargs):
jpayne@68 2788 """D.update([E, ]**F) -> None.
jpayne@68 2789
jpayne@68 2790 Update D from dict/iterable E and F.
jpayne@68 2791 """
jpayne@68 2792 for k, v in items.items():
jpayne@68 2793 if k != 'END':
jpayne@68 2794 self[k] = v
jpayne@68 2795
jpayne@68 2796 if kwargs:
jpayne@68 2797 kwargs.pop('END', None)
jpayne@68 2798 for k, v in kwargs.items():
jpayne@68 2799 self[k] = v
jpayne@68 2800
jpayne@68 2801 def pop(self, key, default=_nothing):
jpayne@68 2802 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2803 cdef bcf1_t *r = self.record.ptr
jpayne@68 2804
jpayne@68 2805 if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68 2806 raise ValueError('Error unpacking VariantRecord')
jpayne@68 2807
jpayne@68 2808 cdef bytes bkey = force_bytes(key)
jpayne@68 2809 cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68 2810
jpayne@68 2811 # Cannot stop here if info == NULL, since flags must return False
jpayne@68 2812 cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@68 2813
jpayne@68 2814 if info_id < 0:
jpayne@68 2815 if default is _nothing:
jpayne@68 2816 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@68 2817 return default
jpayne@68 2818
jpayne@68 2819 if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@68 2820 raise ValueError('Invalid header')
jpayne@68 2821
jpayne@68 2822 # Handle flags
jpayne@68 2823 if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
jpayne@68 2824 return
jpayne@68 2825
jpayne@68 2826 if not info or not info.vptr:
jpayne@68 2827 if default is _nothing:
jpayne@68 2828 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@68 2829 return default
jpayne@68 2830
jpayne@68 2831 value = bcf_info_get_value(self.record, info)
jpayne@68 2832
jpayne@68 2833 if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
jpayne@68 2834 raise ValueError('Unable to delete INFO')
jpayne@68 2835
jpayne@68 2836 return value
jpayne@68 2837
jpayne@68 2838 def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op):
jpayne@68 2839 if op != 2 and op != 3:
jpayne@68 2840 return NotImplemented
jpayne@68 2841
jpayne@68 2842 cdef bcf1_t *s = self.record.ptr
jpayne@68 2843 cdef bcf1_t *o = other.record.ptr
jpayne@68 2844
jpayne@68 2845 # Cannot use n_info as shortcut logic, since null values may remain
jpayne@68 2846 cdef bint cmp = dict(self) == dict(other)
jpayne@68 2847
jpayne@68 2848 if op == 3:
jpayne@68 2849 cmp = not cmp
jpayne@68 2850
jpayne@68 2851 return cmp
jpayne@68 2852
jpayne@68 2853 # Mappings are not hashable by default, but subclasses can change this
jpayne@68 2854 __hash__ = None
jpayne@68 2855
jpayne@68 2856
jpayne@68 2857 cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
jpayne@68 2858 if not record:
jpayne@68 2859 raise ValueError('invalid VariantRecord')
jpayne@68 2860
jpayne@68 2861 cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
jpayne@68 2862 info.record = record
jpayne@68 2863
jpayne@68 2864 return info
jpayne@68 2865
jpayne@68 2866
jpayne@68 2867 cdef class VariantRecordSamples(object):
jpayne@68 2868 """mapping from sample index or name to :class:`VariantRecordSample` object."""
jpayne@68 2869 def __init__(self, *args, **kwargs):
jpayne@68 2870 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 2871
jpayne@68 2872 def __len__(self):
jpayne@68 2873 return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr)
jpayne@68 2874
jpayne@68 2875 def __bool__(self):
jpayne@68 2876 return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0
jpayne@68 2877
jpayne@68 2878 def __getitem__(self, key):
jpayne@68 2879 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2880 cdef bcf1_t *r = self.record.ptr
jpayne@68 2881 cdef int n = self.record.ptr.n_sample
jpayne@68 2882 cdef int sample_index
jpayne@68 2883 cdef vdict_t *d
jpayne@68 2884 cdef khiter_t k
jpayne@68 2885
jpayne@68 2886 if isinstance(key, int):
jpayne@68 2887 sample_index = key
jpayne@68 2888 else:
jpayne@68 2889 bkey = force_bytes(key)
jpayne@68 2890 sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
jpayne@68 2891 if sample_index < 0:
jpayne@68 2892 raise KeyError('invalid sample name: {}'.format(key))
jpayne@68 2893
jpayne@68 2894 if sample_index < 0 or sample_index >= n:
jpayne@68 2895 raise IndexError('invalid sample index')
jpayne@68 2896
jpayne@68 2897 return makeVariantRecordSample(self.record, sample_index)
jpayne@68 2898
jpayne@68 2899 def __iter__(self):
jpayne@68 2900 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2901 cdef bcf1_t *r = self.record.ptr
jpayne@68 2902 cdef int32_t i, n = self.record.ptr.n_sample
jpayne@68 2903
jpayne@68 2904 for i in range(n):
jpayne@68 2905 yield charptr_to_str(hdr.samples[i])
jpayne@68 2906
jpayne@68 2907 def get(self, key, default=None):
jpayne@68 2908 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@68 2909 try:
jpayne@68 2910 return self[key]
jpayne@68 2911 except KeyError:
jpayne@68 2912 return default
jpayne@68 2913
jpayne@68 2914 def __contains__(self, key):
jpayne@68 2915 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2916 cdef bcf1_t *r = self.record.ptr
jpayne@68 2917 cdef int n = self.record.ptr.n_sample
jpayne@68 2918 cdef int sample_index
jpayne@68 2919 cdef vdict_t *d
jpayne@68 2920 cdef khiter_t k
jpayne@68 2921
jpayne@68 2922 if isinstance(key, int):
jpayne@68 2923 sample_index = key
jpayne@68 2924 else:
jpayne@68 2925 bkey = force_bytes(key)
jpayne@68 2926 sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
jpayne@68 2927 if sample_index < 0:
jpayne@68 2928 raise KeyError('invalid sample name: {}'.format(key))
jpayne@68 2929
jpayne@68 2930 return 0 <= sample_index < n
jpayne@68 2931
jpayne@68 2932 def iterkeys(self):
jpayne@68 2933 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68 2934 return iter(self)
jpayne@68 2935
jpayne@68 2936 def itervalues(self):
jpayne@68 2937 """D.itervalues() -> an iterator over the values of D"""
jpayne@68 2938 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2939 cdef bcf1_t *r = self.record.ptr
jpayne@68 2940 cdef int32_t i, n = self.record.ptr.n_sample
jpayne@68 2941
jpayne@68 2942 for i in range(n):
jpayne@68 2943 yield makeVariantRecordSample(self.record, i)
jpayne@68 2944
jpayne@68 2945 def iteritems(self):
jpayne@68 2946 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68 2947 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 2948 cdef bcf1_t *r = self.record.ptr
jpayne@68 2949 cdef int32_t i, n = self.record.ptr.n_sample
jpayne@68 2950
jpayne@68 2951 for i in range(n):
jpayne@68 2952 yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
jpayne@68 2953
jpayne@68 2954 def keys(self):
jpayne@68 2955 """D.keys() -> list of D's keys"""
jpayne@68 2956 return list(self)
jpayne@68 2957
jpayne@68 2958 def items(self):
jpayne@68 2959 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68 2960 return list(self.iteritems())
jpayne@68 2961
jpayne@68 2962 def values(self):
jpayne@68 2963 """D.values() -> list of D's values"""
jpayne@68 2964 return list(self.itervalues())
jpayne@68 2965
jpayne@68 2966 def update(self, items=None, **kwargs):
jpayne@68 2967 """D.update([E, ]**F) -> None.
jpayne@68 2968
jpayne@68 2969 Update D from dict/iterable E and F.
jpayne@68 2970 """
jpayne@68 2971 for k, v in items.items():
jpayne@68 2972 self[k] = v
jpayne@68 2973
jpayne@68 2974 if kwargs:
jpayne@68 2975 for k, v in kwargs.items():
jpayne@68 2976 self[k] = v
jpayne@68 2977
jpayne@68 2978 def pop(self, key, default=_nothing):
jpayne@68 2979 try:
jpayne@68 2980 value = self[key]
jpayne@68 2981 del self[key]
jpayne@68 2982 return value
jpayne@68 2983 except KeyError:
jpayne@68 2984 if default is not _nothing:
jpayne@68 2985 return default
jpayne@68 2986 raise
jpayne@68 2987
jpayne@68 2988 def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op):
jpayne@68 2989 if op != 2 and op != 3:
jpayne@68 2990 return NotImplemented
jpayne@68 2991
jpayne@68 2992 cdef bcf1_t *s = self.record.ptr
jpayne@68 2993 cdef bcf1_t *o = other.record.ptr
jpayne@68 2994
jpayne@68 2995 cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values())
jpayne@68 2996
jpayne@68 2997 if op == 3:
jpayne@68 2998 cmp = not cmp
jpayne@68 2999
jpayne@68 3000 return cmp
jpayne@68 3001
jpayne@68 3002 # Mappings are not hashable by default, but subclasses can change this
jpayne@68 3003 __hash__ = None
jpayne@68 3004
jpayne@68 3005
jpayne@68 3006 cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
jpayne@68 3007 if not record:
jpayne@68 3008 raise ValueError('invalid VariantRecord')
jpayne@68 3009
jpayne@68 3010 cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
jpayne@68 3011 VariantRecordSamples)
jpayne@68 3012 samples.record = record
jpayne@68 3013
jpayne@68 3014 return samples
jpayne@68 3015
jpayne@68 3016
jpayne@68 3017 cdef class VariantRecord(object):
jpayne@68 3018 """Variant record"""
jpayne@68 3019 def __init__(self, *args, **kwargs):
jpayne@68 3020 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 3021
jpayne@68 3022 def __dealloc__(self):
jpayne@68 3023 if self.ptr:
jpayne@68 3024 bcf_destroy1(self.ptr)
jpayne@68 3025 self.ptr = NULL
jpayne@68 3026
jpayne@68 3027 def copy(self):
jpayne@68 3028 """return a copy of this VariantRecord object"""
jpayne@68 3029 return makeVariantRecord(self.header, bcf_dup(self.ptr))
jpayne@68 3030
jpayne@68 3031 def translate(self, VariantHeader dst_header):
jpayne@68 3032 if dst_header is None:
jpayne@68 3033 raise ValueError('dst_header must not be None')
jpayne@68 3034
jpayne@68 3035 cdef bcf_hdr_t *src_hdr = self.header.ptr
jpayne@68 3036 cdef bcf_hdr_t *dst_hdr = dst_header.ptr
jpayne@68 3037
jpayne@68 3038 if src_hdr != dst_hdr:
jpayne@68 3039 if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
jpayne@68 3040 msg = 'Cannot translate record. Number of samples does not match header ({} vs {})'
jpayne@68 3041 raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
jpayne@68 3042
jpayne@68 3043 bcf_translate(dst_hdr, src_hdr, self.ptr)
jpayne@68 3044 self.header = dst_header
jpayne@68 3045
jpayne@68 3046 @property
jpayne@68 3047 def rid(self):
jpayne@68 3048 """internal reference id number"""
jpayne@68 3049 return self.ptr.rid
jpayne@68 3050
jpayne@68 3051 @rid.setter
jpayne@68 3052 def rid(self, value):
jpayne@68 3053 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 3054 cdef int r = value
jpayne@68 3055 if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
jpayne@68 3056 raise ValueError('invalid reference id')
jpayne@68 3057 self.ptr.rid = r
jpayne@68 3058
jpayne@68 3059 @property
jpayne@68 3060 def chrom(self):
jpayne@68 3061 """chromosome/contig name"""
jpayne@68 3062 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 3063 cdef int rid = self.ptr.rid
jpayne@68 3064 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
jpayne@68 3065 raise ValueError('Invalid header')
jpayne@68 3066 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
jpayne@68 3067
jpayne@68 3068 @chrom.setter
jpayne@68 3069 def chrom(self, value):
jpayne@68 3070 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
jpayne@68 3071 bchrom = force_bytes(value)
jpayne@68 3072 cdef khint_t k = kh_get_vdict(d, bchrom)
jpayne@68 3073 if k == kh_end(d):
jpayne@68 3074 raise ValueError('Invalid chromosome/contig')
jpayne@68 3075 self.ptr.rid = kh_val_vdict(d, k).id
jpayne@68 3076
jpayne@68 3077 @property
jpayne@68 3078 def contig(self):
jpayne@68 3079 """chromosome/contig name"""
jpayne@68 3080 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 3081 cdef int rid = self.ptr.rid
jpayne@68 3082 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
jpayne@68 3083 raise ValueError('Invalid header')
jpayne@68 3084 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
jpayne@68 3085
jpayne@68 3086 @contig.setter
jpayne@68 3087 def contig(self, value):
jpayne@68 3088 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
jpayne@68 3089 bchrom = force_bytes(value)
jpayne@68 3090 cdef khint_t k = kh_get_vdict(d, bchrom)
jpayne@68 3091 if k == kh_end(d):
jpayne@68 3092 raise ValueError('Invalid chromosome/contig')
jpayne@68 3093 self.ptr.rid = kh_val_vdict(d, k).id
jpayne@68 3094
jpayne@68 3095 @property
jpayne@68 3096 def pos(self):
jpayne@68 3097 """record start position on chrom/contig (1-based inclusive)"""
jpayne@68 3098 return self.ptr.pos + 1
jpayne@68 3099
jpayne@68 3100 @pos.setter
jpayne@68 3101 def pos(self, value):
jpayne@68 3102 cdef int p = value
jpayne@68 3103 if p < 1:
jpayne@68 3104 raise ValueError('Position must be positive')
jpayne@68 3105 self.ptr.pos = p - 1
jpayne@68 3106 bcf_sync_end(self)
jpayne@68 3107
jpayne@68 3108 @property
jpayne@68 3109 def start(self):
jpayne@68 3110 """record start position on chrom/contig (0-based inclusive)"""
jpayne@68 3111 return self.ptr.pos
jpayne@68 3112
jpayne@68 3113 @start.setter
jpayne@68 3114 def start(self, value):
jpayne@68 3115 cdef int s = value
jpayne@68 3116 if s < 0:
jpayne@68 3117 raise ValueError('Start coordinate must be non-negative')
jpayne@68 3118 self.ptr.pos = s
jpayne@68 3119 bcf_sync_end(self)
jpayne@68 3120
jpayne@68 3121 @property
jpayne@68 3122 def stop(self):
jpayne@68 3123 """record stop position on chrom/contig (0-based exclusive)"""
jpayne@68 3124 return self.ptr.pos + self.ptr.rlen
jpayne@68 3125
jpayne@68 3126 @stop.setter
jpayne@68 3127 def stop(self, value):
jpayne@68 3128 cdef int s = value
jpayne@68 3129 if s < 0:
jpayne@68 3130 raise ValueError('Stop coordinate must be non-negative')
jpayne@68 3131 self.ptr.rlen = s - self.ptr.pos
jpayne@68 3132 bcf_sync_end(self)
jpayne@68 3133
jpayne@68 3134 @property
jpayne@68 3135 def rlen(self):
jpayne@68 3136 """record length on chrom/contig (aka rec.stop - rec.start)"""
jpayne@68 3137 return self.ptr.rlen
jpayne@68 3138
jpayne@68 3139 @rlen.setter
jpayne@68 3140 def rlen(self, value):
jpayne@68 3141 cdef int r = value
jpayne@68 3142 self.ptr.rlen = r
jpayne@68 3143 bcf_sync_end(self)
jpayne@68 3144
jpayne@68 3145 @property
jpayne@68 3146 def qual(self):
jpayne@68 3147 """phred scaled quality score or None if not available"""
jpayne@68 3148 return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
jpayne@68 3149
jpayne@68 3150 @qual.setter
jpayne@68 3151 def qual(self, value):
jpayne@68 3152 if value is not None:
jpayne@68 3153 self.ptr.qual = value
jpayne@68 3154 else:
jpayne@68 3155 bcf_float_set(&self.ptr.qual, bcf_float_missing)
jpayne@68 3156
jpayne@68 3157
jpayne@68 3158 # @property
jpayne@68 3159 # def n_allele(self):
jpayne@68 3160 # return self.ptr.n_allele
jpayne@68 3161
jpayne@68 3162 # @property
jpayne@68 3163 # def n_sample(self):
jpayne@68 3164 # return self.ptr.n_sample
jpayne@68 3165
jpayne@68 3166 @property
jpayne@68 3167 def id(self):
jpayne@68 3168 """record identifier or None if not available"""
jpayne@68 3169 cdef bcf1_t *r = self.ptr
jpayne@68 3170 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68 3171 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3172 # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
jpayne@68 3173 # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
jpayne@68 3174 if (r.d.m_id == 0):
jpayne@68 3175 raise ValueError('Error extracting ID')
jpayne@68 3176 return charptr_to_str(r.d.id) if r.d.id != b'.' else None
jpayne@68 3177
jpayne@68 3178 @id.setter
jpayne@68 3179 def id(self, value):
jpayne@68 3180 cdef bcf1_t *r = self.ptr
jpayne@68 3181 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68 3182 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3183 cdef char *idstr = NULL
jpayne@68 3184 if value is not None:
jpayne@68 3185 bid = force_bytes(value)
jpayne@68 3186 idstr = bid
jpayne@68 3187 if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
jpayne@68 3188 raise ValueError('Error updating id')
jpayne@68 3189
jpayne@68 3190 @property
jpayne@68 3191 def ref(self):
jpayne@68 3192 """reference allele"""
jpayne@68 3193 cdef bcf1_t *r = self.ptr
jpayne@68 3194 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68 3195 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3196 return charptr_to_str(r.d.allele[0]) if r.d.allele else None
jpayne@68 3197
jpayne@68 3198 @ref.setter
jpayne@68 3199 def ref(self, value):
jpayne@68 3200 cdef bcf1_t *r = self.ptr
jpayne@68 3201 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68 3202 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3203 #FIXME: Set alleles directly -- this is stupid
jpayne@68 3204 if not value:
jpayne@68 3205 raise ValueError('ref allele must not be null')
jpayne@68 3206 value = force_bytes(value)
jpayne@68 3207 if r.d.allele and r.n_allele:
jpayne@68 3208 alleles = [r.d.allele[i] for i in range(r.n_allele)]
jpayne@68 3209 alleles[0] = value
jpayne@68 3210 else:
jpayne@68 3211 alleles = [value, '<NON_REF>']
jpayne@68 3212 self.alleles = alleles
jpayne@68 3213 bcf_sync_end(self)
jpayne@68 3214
jpayne@68 3215 @property
jpayne@68 3216 def alleles(self):
jpayne@68 3217 """tuple of reference allele followed by alt alleles"""
jpayne@68 3218 cdef bcf1_t *r = self.ptr
jpayne@68 3219 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68 3220 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3221 if not r.d.allele:
jpayne@68 3222 return None
jpayne@68 3223 cdef tuple res = PyTuple_New(r.n_allele)
jpayne@68 3224 for i in range(r.n_allele):
jpayne@68 3225 a = charptr_to_str(r.d.allele[i])
jpayne@68 3226 PyTuple_SET_ITEM(res, i, a)
jpayne@68 3227 Py_INCREF(a)
jpayne@68 3228 return res
jpayne@68 3229
jpayne@68 3230 @alleles.setter
jpayne@68 3231 def alleles(self, values):
jpayne@68 3232 cdef bcf1_t *r = self.ptr
jpayne@68 3233
jpayne@68 3234 # Cache rlen of symbolic alleles before call to bcf_update_alleles_str
jpayne@68 3235 cdef int rlen = r.rlen
jpayne@68 3236
jpayne@68 3237 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68 3238 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3239
jpayne@68 3240 values = [force_bytes(v) for v in values]
jpayne@68 3241
jpayne@68 3242 if len(values) < 2:
jpayne@68 3243 raise ValueError('must set at least 2 alleles')
jpayne@68 3244
jpayne@68 3245 if b'' in values:
jpayne@68 3246 raise ValueError('cannot set null allele')
jpayne@68 3247
jpayne@68 3248 value = b','.join(values)
jpayne@68 3249
jpayne@68 3250 if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
jpayne@68 3251 raise ValueError('Error updating alleles')
jpayne@68 3252
jpayne@68 3253 # Reset rlen if alternate allele isn't symbolic, otherwise used cached
jpayne@68 3254 if has_symbolic_allele(self):
jpayne@68 3255 self.ptr.rlen = rlen
jpayne@68 3256 else:
jpayne@68 3257 self.ptr.rlen = len(values[0])
jpayne@68 3258 r.d.var_type = -1
jpayne@68 3259 bcf_sync_end(self)
jpayne@68 3260
jpayne@68 3261 @property
jpayne@68 3262 def alts(self):
jpayne@68 3263 """tuple of alt alleles"""
jpayne@68 3264 cdef bcf1_t *r = self.ptr
jpayne@68 3265 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68 3266 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3267 if r.n_allele < 2 or not r.d.allele:
jpayne@68 3268 return None
jpayne@68 3269 cdef tuple res = PyTuple_New(r.n_allele - 1)
jpayne@68 3270 for i in range(1, r.n_allele):
jpayne@68 3271 a = charptr_to_str(r.d.allele[i])
jpayne@68 3272 PyTuple_SET_ITEM(res, i - 1, a)
jpayne@68 3273 Py_INCREF(a)
jpayne@68 3274 return res
jpayne@68 3275
jpayne@68 3276 @alts.setter
jpayne@68 3277 def alts(self, value):
jpayne@68 3278 #FIXME: Set alleles directly -- this is stupid
jpayne@68 3279 cdef bcf1_t *r = self.ptr
jpayne@68 3280 if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68 3281 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3282 value = [force_bytes(v) for v in value]
jpayne@68 3283 if b'' in value:
jpayne@68 3284 raise ValueError('cannot set null alt allele')
jpayne@68 3285 ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
jpayne@68 3286 self.alleles = ref + value
jpayne@68 3287 r.d.var_type = -1
jpayne@68 3288
jpayne@68 3289 @property
jpayne@68 3290 def filter(self):
jpayne@68 3291 """filter information (see :class:`VariantRecordFilter`)"""
jpayne@68 3292 if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
jpayne@68 3293 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3294 return makeVariantRecordFilter(self)
jpayne@68 3295
jpayne@68 3296 @property
jpayne@68 3297 def info(self):
jpayne@68 3298 """info data (see :class:`VariantRecordInfo`)"""
jpayne@68 3299 if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
jpayne@68 3300 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3301 return makeVariantRecordInfo(self)
jpayne@68 3302
jpayne@68 3303 @property
jpayne@68 3304 def format(self):
jpayne@68 3305 """sample format metadata (see :class:`VariantRecordFormat`)"""
jpayne@68 3306 if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
jpayne@68 3307 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3308 return makeVariantRecordFormat(self)
jpayne@68 3309
jpayne@68 3310 @property
jpayne@68 3311 def samples(self):
jpayne@68 3312 """sample data (see :class:`VariantRecordSamples`)"""
jpayne@68 3313 if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
jpayne@68 3314 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3315 return makeVariantRecordSamples(self)
jpayne@68 3316
jpayne@68 3317 property alleles_variant_types:
jpayne@68 3318 def __get__(self):
jpayne@68 3319 cdef bcf1_t *r = self.ptr
jpayne@68 3320 cdef tuple result = PyTuple_New(r.n_allele)
jpayne@68 3321
jpayne@68 3322 for i in range(r.n_allele):
jpayne@68 3323 tp = bcf_get_variant_type(r, i)
jpayne@68 3324
jpayne@68 3325 if tp == VCF_REF:
jpayne@68 3326 v_type = "REF"
jpayne@68 3327 elif tp == VCF_SNP:
jpayne@68 3328 v_type = "SNP"
jpayne@68 3329 elif tp == VCF_MNP:
jpayne@68 3330 v_type = "MNP"
jpayne@68 3331 elif tp == VCF_INDEL:
jpayne@68 3332 v_type = "INDEL"
jpayne@68 3333 elif tp == VCF_BND:
jpayne@68 3334 v_type = "BND"
jpayne@68 3335 elif tp == VCF_OVERLAP:
jpayne@68 3336 v_type = "OVERLAP"
jpayne@68 3337 else:
jpayne@68 3338 v_type = "OTHER"
jpayne@68 3339
jpayne@68 3340 PyTuple_SET_ITEM(result, i, v_type)
jpayne@68 3341 Py_INCREF(v_type)
jpayne@68 3342
jpayne@68 3343 return result
jpayne@68 3344
jpayne@68 3345 def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op):
jpayne@68 3346 if op != 2 and op != 3:
jpayne@68 3347 return NotImplemented
jpayne@68 3348
jpayne@68 3349 cdef bcf1_t *s = self.ptr
jpayne@68 3350 cdef bcf1_t *o = other.ptr
jpayne@68 3351
jpayne@68 3352 cdef bint cmp = self is other or (
jpayne@68 3353 s.pos == o.pos
jpayne@68 3354 and s.rlen == o.rlen
jpayne@68 3355 and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual))
jpayne@68 3356 or s.qual == o.qual)
jpayne@68 3357 and s.n_sample == o.n_sample
jpayne@68 3358 and s.n_allele == o.n_allele
jpayne@68 3359 and self.contig == other.contig
jpayne@68 3360 and self.alleles == other.alleles
jpayne@68 3361 and self.id == other.id
jpayne@68 3362 and self.info == other.info
jpayne@68 3363 and self.filter == other.filter
jpayne@68 3364 and self.samples == other.samples)
jpayne@68 3365
jpayne@68 3366 if op == 3:
jpayne@68 3367 cmp = not cmp
jpayne@68 3368
jpayne@68 3369 return cmp
jpayne@68 3370
jpayne@68 3371 def __str__(self):
jpayne@68 3372 cdef kstring_t line
jpayne@68 3373 cdef char c
jpayne@68 3374
jpayne@68 3375 line.l = line.m = 0
jpayne@68 3376 line.s = NULL
jpayne@68 3377
jpayne@68 3378 if vcf_format(self.header.ptr, self.ptr, &line) < 0:
jpayne@68 3379 if line.m:
jpayne@68 3380 free(line.s)
jpayne@68 3381 raise ValueError('vcf_format failed')
jpayne@68 3382
jpayne@68 3383 # Strip CR/LF?
jpayne@68 3384 #while line.l:
jpayne@68 3385 # c = line.s[line.l - 1]
jpayne@68 3386 # if c != b'\n' and c != b'\r':
jpayne@68 3387 # break
jpayne@68 3388 # line.l -= 1
jpayne@68 3389
jpayne@68 3390 ret = charptr_to_str_w_len(line.s, line.l)
jpayne@68 3391
jpayne@68 3392 if line.m:
jpayne@68 3393 free(line.s)
jpayne@68 3394
jpayne@68 3395 return ret
jpayne@68 3396
jpayne@68 3397
jpayne@68 3398 cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
jpayne@68 3399 if not header:
jpayne@68 3400 raise ValueError('invalid VariantHeader')
jpayne@68 3401
jpayne@68 3402 if not r:
jpayne@68 3403 raise ValueError('cannot create VariantRecord')
jpayne@68 3404
jpayne@68 3405 if r.errcode:
jpayne@68 3406 msg = []
jpayne@68 3407 #if r.errcode & BCF_ERR_CTG_UNDEF:
jpayne@68 3408 # msg.append('undefined contig')
jpayne@68 3409 #if r.errcode & BCF_ERR_TAG_UNDEF:
jpayne@68 3410 # msg.append('undefined tag')
jpayne@68 3411 if r.errcode & BCF_ERR_NCOLS:
jpayne@68 3412 msg.append('invalid number of columns')
jpayne@68 3413 if r.errcode & BCF_ERR_LIMITS:
jpayne@68 3414 msg.append('limits violated')
jpayne@68 3415 if r.errcode & BCF_ERR_CHAR:
jpayne@68 3416 msg.append('invalid character found')
jpayne@68 3417 if r.errcode & BCF_ERR_CTG_INVALID:
jpayne@68 3418 msg.append('invalid contig')
jpayne@68 3419 if r.errcode & BCF_ERR_TAG_INVALID:
jpayne@68 3420 msg.append('invalid tag')
jpayne@68 3421
jpayne@68 3422 if msg:
jpayne@68 3423 msg = ', '.join(msg)
jpayne@68 3424 raise ValueError('Error(s) reading record: {}'.format(msg))
jpayne@68 3425
jpayne@68 3426 cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
jpayne@68 3427 record.header = header
jpayne@68 3428 record.ptr = r
jpayne@68 3429
jpayne@68 3430 return record
jpayne@68 3431
jpayne@68 3432
jpayne@68 3433 ########################################################################
jpayne@68 3434 ########################################################################
jpayne@68 3435 ## Variant Sampletype object
jpayne@68 3436 ########################################################################
jpayne@68 3437
jpayne@68 3438
jpayne@68 3439 cdef class VariantRecordSample(object):
jpayne@68 3440 """Data for a single sample from a :class:`VariantRecord` object.
jpayne@68 3441 Provides data accessors for genotypes and a mapping interface
jpayne@68 3442 from format name to values.
jpayne@68 3443 """
jpayne@68 3444 def __init__(self, *args, **kwargs):
jpayne@68 3445 raise TypeError('this class cannot be instantiated from Python')
jpayne@68 3446
jpayne@68 3447 @property
jpayne@68 3448 def name(self):
jpayne@68 3449 """sample name"""
jpayne@68 3450 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 3451 cdef bcf1_t *r = self.record.ptr
jpayne@68 3452 cdef int32_t n = r.n_sample
jpayne@68 3453
jpayne@68 3454 if self.index < 0 or self.index >= n:
jpayne@68 3455 raise ValueError('invalid sample index')
jpayne@68 3456
jpayne@68 3457 return charptr_to_str(hdr.samples[self.index])
jpayne@68 3458
jpayne@68 3459 @property
jpayne@68 3460 def allele_indices(self):
jpayne@68 3461 """allele indices for called genotype, if present. Otherwise None"""
jpayne@68 3462 return bcf_format_get_allele_indices(self)
jpayne@68 3463
jpayne@68 3464 @allele_indices.setter
jpayne@68 3465 def allele_indices(self, value):
jpayne@68 3466 self['GT'] = value
jpayne@68 3467
jpayne@68 3468 @allele_indices.deleter
jpayne@68 3469 def allele_indices(self):
jpayne@68 3470 self['GT'] = ()
jpayne@68 3471
jpayne@68 3472 @property
jpayne@68 3473 def alleles(self):
jpayne@68 3474 """alleles for called genotype, if present. Otherwise None"""
jpayne@68 3475 return bcf_format_get_alleles(self)
jpayne@68 3476
jpayne@68 3477 @alleles.setter
jpayne@68 3478 def alleles(self, value):
jpayne@68 3479 # Sets the genotype, supply a tuple of alleles to set.
jpayne@68 3480 # The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord
jpayne@68 3481 # The genotype is reset when an empty tuple, None or (None,) is supplied
jpayne@68 3482
jpayne@68 3483 if value==(None,) or value==tuple() or value is None:
jpayne@68 3484 self['GT'] = ()
jpayne@68 3485 return
jpayne@68 3486
jpayne@68 3487 if any((type(x) == int for x in value)):
jpayne@68 3488 raise ValueError('Use .allele_indices to set integer allele indices')
jpayne@68 3489
jpayne@68 3490 # determine and set allele indices:
jpayne@68 3491 try:
jpayne@68 3492 self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) )
jpayne@68 3493 except ValueError:
jpayne@68 3494 raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord."
jpayne@68 3495 "First set the .alleles of this record to define the alleles")
jpayne@68 3496
jpayne@68 3497 @alleles.deleter
jpayne@68 3498 def alleles(self):
jpayne@68 3499 self['GT'] = ()
jpayne@68 3500
jpayne@68 3501 @property
jpayne@68 3502 def phased(self):
jpayne@68 3503 """False if genotype is missing or any allele is unphased. Otherwise True."""
jpayne@68 3504 return bcf_sample_get_phased(self)
jpayne@68 3505
jpayne@68 3506 @phased.setter
jpayne@68 3507 def phased(self, value):
jpayne@68 3508 bcf_sample_set_phased(self, value)
jpayne@68 3509
jpayne@68 3510 def __len__(self):
jpayne@68 3511 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 3512 cdef bcf1_t *r = self.record.ptr
jpayne@68 3513 cdef int i, n = 0
jpayne@68 3514
jpayne@68 3515 if bcf_unpack(r, BCF_UN_FMT) < 0:
jpayne@68 3516 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3517
jpayne@68 3518 for i in range(r.n_fmt):
jpayne@68 3519 if r.d.fmt[i].p:
jpayne@68 3520 n += 1
jpayne@68 3521 return n
jpayne@68 3522
jpayne@68 3523 def __bool__(self):
jpayne@68 3524 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 3525 cdef bcf1_t *r = self.record.ptr
jpayne@68 3526 cdef int i
jpayne@68 3527
jpayne@68 3528 if bcf_unpack(r, BCF_UN_FMT) < 0:
jpayne@68 3529 raise ValueError('Error unpacking VariantRecord')
jpayne@68 3530
jpayne@68 3531 for i in range(r.n_fmt):
jpayne@68 3532 if r.d.fmt[i].p:
jpayne@68 3533 return True
jpayne@68 3534 return False
jpayne@68 3535
jpayne@68 3536 def __getitem__(self, key):
jpayne@68 3537 return bcf_format_get_value(self, key)
jpayne@68 3538
jpayne@68 3539 def __setitem__(self, key, value):
jpayne@68 3540 bcf_format_set_value(self, key, value)
jpayne@68 3541
jpayne@68 3542 def __delitem__(self, key):
jpayne@68 3543 bcf_format_del_value(self, key)
jpayne@68 3544
jpayne@68 3545 def clear(self):
jpayne@68 3546 """Clear all format data (including genotype) for this sample"""
jpayne@68 3547 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 3548 cdef bcf1_t *r = self.record.ptr
jpayne@68 3549 cdef bcf_fmt_t *fmt
jpayne@68 3550 cdef int i
jpayne@68 3551
jpayne@68 3552 for i in range(r.n_fmt):
jpayne@68 3553 fmt = &r.d.fmt[i]
jpayne@68 3554 if fmt.p:
jpayne@68 3555 bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
jpayne@68 3556
jpayne@68 3557 def __iter__(self):
jpayne@68 3558 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 3559 cdef bcf1_t *r = self.record.ptr
jpayne@68 3560 cdef bcf_fmt_t *fmt
jpayne@68 3561 cdef int i
jpayne@68 3562
jpayne@68 3563 for i in range(r.n_fmt):
jpayne@68 3564 fmt = &r.d.fmt[i]
jpayne@68 3565 if r.d.fmt[i].p:
jpayne@68 3566 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
jpayne@68 3567
jpayne@68 3568 def get(self, key, default=None):
jpayne@68 3569 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@68 3570 try:
jpayne@68 3571 return self[key]
jpayne@68 3572 except KeyError:
jpayne@68 3573 return default
jpayne@68 3574
jpayne@68 3575 def __contains__(self, key):
jpayne@68 3576 cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68 3577 cdef bcf1_t *r = self.record.ptr
jpayne@68 3578 cdef bytes bkey = force_bytes(key)
jpayne@68 3579 cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68 3580 return fmt != NULL and fmt.p != NULL
jpayne@68 3581
jpayne@68 3582 def iterkeys(self):
jpayne@68 3583 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68 3584 return iter(self)
jpayne@68 3585
jpayne@68 3586 def itervalues(self):
jpayne@68 3587 """D.itervalues() -> an iterator over the values of D"""
jpayne@68 3588 for key in self:
jpayne@68 3589 yield self[key]
jpayne@68 3590
jpayne@68 3591 def iteritems(self):
jpayne@68 3592 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68 3593 for key in self:
jpayne@68 3594 yield (key, self[key])
jpayne@68 3595
jpayne@68 3596 def keys(self):
jpayne@68 3597 """D.keys() -> list of D's keys"""
jpayne@68 3598 return list(self)
jpayne@68 3599
jpayne@68 3600 def items(self):
jpayne@68 3601 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68 3602 return list(self.iteritems())
jpayne@68 3603
jpayne@68 3604 def values(self):
jpayne@68 3605 """D.values() -> list of D's values"""
jpayne@68 3606 return list(self.itervalues())
jpayne@68 3607
jpayne@68 3608 def update(self, items=None, **kwargs):
jpayne@68 3609 """D.update([E, ]**F) -> None.
jpayne@68 3610
jpayne@68 3611 Update D from dict/iterable E and F.
jpayne@68 3612 """
jpayne@68 3613 for k, v in items.items():
jpayne@68 3614 self[k] = v
jpayne@68 3615
jpayne@68 3616 if kwargs:
jpayne@68 3617 for k, v in kwargs.items():
jpayne@68 3618 self[k] = v
jpayne@68 3619
jpayne@68 3620 def pop(self, key, default=_nothing):
jpayne@68 3621 try:
jpayne@68 3622 value = self[key]
jpayne@68 3623 del self[key]
jpayne@68 3624 return value
jpayne@68 3625 except KeyError:
jpayne@68 3626 if default is not _nothing:
jpayne@68 3627 return default
jpayne@68 3628 raise
jpayne@68 3629
jpayne@68 3630 def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op):
jpayne@68 3631 if op != 2 and op != 3:
jpayne@68 3632 return NotImplemented
jpayne@68 3633
jpayne@68 3634 cdef bint cmp = dict(self) == dict(other)
jpayne@68 3635
jpayne@68 3636 if op == 3:
jpayne@68 3637 cmp = not cmp
jpayne@68 3638
jpayne@68 3639 return cmp
jpayne@68 3640
jpayne@68 3641 # Mappings are not hashable by default, but subclasses can change this
jpayne@68 3642 __hash__ = None
jpayne@68 3643
jpayne@68 3644
jpayne@68 3645 cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
jpayne@68 3646 if not record or sample_index < 0:
jpayne@68 3647 raise ValueError('cannot create VariantRecordSample')
jpayne@68 3648
jpayne@68 3649 cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
jpayne@68 3650 sample.record = record
jpayne@68 3651 sample.index = sample_index
jpayne@68 3652
jpayne@68 3653 return sample
jpayne@68 3654
jpayne@68 3655
jpayne@68 3656 ########################################################################
jpayne@68 3657 ########################################################################
jpayne@68 3658 ## Index objects
jpayne@68 3659 ########################################################################
jpayne@68 3660
jpayne@68 3661
jpayne@68 3662 cdef class BaseIndex(object):
jpayne@68 3663 def __init__(self):
jpayne@68 3664 self.refs = ()
jpayne@68 3665 self.remap = {}
jpayne@68 3666
jpayne@68 3667 def __len__(self):
jpayne@68 3668 return len(self.refs)
jpayne@68 3669
jpayne@68 3670 def __bool__(self):
jpayne@68 3671 return len(self.refs) != 0
jpayne@68 3672
jpayne@68 3673 def __getitem__(self, key):
jpayne@68 3674 if isinstance(key, int):
jpayne@68 3675 return self.refs[key]
jpayne@68 3676 else:
jpayne@68 3677 return self.refmap[key]
jpayne@68 3678
jpayne@68 3679 def __iter__(self):
jpayne@68 3680 return iter(self.refs)
jpayne@68 3681
jpayne@68 3682 def get(self, key, default=None):
jpayne@68 3683 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
jpayne@68 3684 try:
jpayne@68 3685 return self[key]
jpayne@68 3686 except KeyError:
jpayne@68 3687 return default
jpayne@68 3688
jpayne@68 3689 def __contains__(self, key):
jpayne@68 3690 try:
jpayne@68 3691 self[key]
jpayne@68 3692 except KeyError:
jpayne@68 3693 return False
jpayne@68 3694 else:
jpayne@68 3695 return True
jpayne@68 3696
jpayne@68 3697 def iterkeys(self):
jpayne@68 3698 """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68 3699 return iter(self)
jpayne@68 3700
jpayne@68 3701 def itervalues(self):
jpayne@68 3702 """D.itervalues() -> an iterator over the values of D"""
jpayne@68 3703 for key in self:
jpayne@68 3704 yield self[key]
jpayne@68 3705
jpayne@68 3706 def iteritems(self):
jpayne@68 3707 """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68 3708 for key in self:
jpayne@68 3709 yield (key, self[key])
jpayne@68 3710
jpayne@68 3711 def keys(self):
jpayne@68 3712 """D.keys() -> list of D's keys"""
jpayne@68 3713 return list(self)
jpayne@68 3714
jpayne@68 3715 def items(self):
jpayne@68 3716 """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68 3717 return list(self.iteritems())
jpayne@68 3718
jpayne@68 3719 def values(self):
jpayne@68 3720 """D.values() -> list of D's values"""
jpayne@68 3721 return list(self.itervalues())
jpayne@68 3722
jpayne@68 3723 def update(self, items=None, **kwargs):
jpayne@68 3724 """D.update([E, ]**F) -> None.
jpayne@68 3725
jpayne@68 3726 Update D from dict/iterable E and F.
jpayne@68 3727 """
jpayne@68 3728 for k, v in items.items():
jpayne@68 3729 self[k] = v
jpayne@68 3730
jpayne@68 3731 if kwargs:
jpayne@68 3732 for k, v in kwargs.items():
jpayne@68 3733 self[k] = v
jpayne@68 3734
jpayne@68 3735 def pop(self, key, default=_nothing):
jpayne@68 3736 try:
jpayne@68 3737 value = self[key]
jpayne@68 3738 del self[key]
jpayne@68 3739 return value
jpayne@68 3740 except KeyError:
jpayne@68 3741 if default is not _nothing:
jpayne@68 3742 return default
jpayne@68 3743 raise
jpayne@68 3744
jpayne@68 3745 # Mappings are not hashable by default, but subclasses can change this
jpayne@68 3746 __hash__ = None
jpayne@68 3747
jpayne@68 3748 #TODO: implement __richcmp__
jpayne@68 3749
jpayne@68 3750
jpayne@68 3751 cdef class BCFIndex(object):
jpayne@68 3752 """CSI index data structure for BCF files"""
jpayne@68 3753 def __init__(self):
jpayne@68 3754 self.refs = ()
jpayne@68 3755 self.refmap = {}
jpayne@68 3756
jpayne@68 3757 if not self.ptr:
jpayne@68 3758 raise ValueError('Invalid index object')
jpayne@68 3759
jpayne@68 3760 cdef int n
jpayne@68 3761 cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
jpayne@68 3762
jpayne@68 3763 self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
jpayne@68 3764 self.refmap = { r:i for i,r in enumerate(self.refs) }
jpayne@68 3765
jpayne@68 3766 def __dealloc__(self):
jpayne@68 3767 if self.ptr:
jpayne@68 3768 hts_idx_destroy(self.ptr)
jpayne@68 3769 self.ptr = NULL
jpayne@68 3770
jpayne@68 3771 def fetch(self, bcf, contig, start, stop, reopen):
jpayne@68 3772 return BCFIterator(bcf, contig, start, stop, reopen)
jpayne@68 3773
jpayne@68 3774
jpayne@68 3775 cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
jpayne@68 3776 if not idx:
jpayne@68 3777 return None
jpayne@68 3778
jpayne@68 3779 if not header:
jpayne@68 3780 raise ValueError('invalid VariantHeader')
jpayne@68 3781
jpayne@68 3782 cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
jpayne@68 3783 index.header = header
jpayne@68 3784 index.ptr = idx
jpayne@68 3785 index.__init__()
jpayne@68 3786
jpayne@68 3787 return index
jpayne@68 3788
jpayne@68 3789
jpayne@68 3790 cdef class TabixIndex(BaseIndex):
jpayne@68 3791 """Tabix index data structure for VCF files"""
jpayne@68 3792 def __init__(self):
jpayne@68 3793 self.refs = ()
jpayne@68 3794 self.refmap = {}
jpayne@68 3795
jpayne@68 3796 if not self.ptr:
jpayne@68 3797 raise ValueError('Invalid index object')
jpayne@68 3798
jpayne@68 3799 cdef int n
jpayne@68 3800 cdef const char **refs = tbx_seqnames(self.ptr, &n)
jpayne@68 3801
jpayne@68 3802 self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
jpayne@68 3803 self.refmap = { r:i for i,r in enumerate(self.refs) }
jpayne@68 3804
jpayne@68 3805 def __dealloc__(self):
jpayne@68 3806 if self.ptr:
jpayne@68 3807 tbx_destroy(self.ptr)
jpayne@68 3808 self.ptr = NULL
jpayne@68 3809
jpayne@68 3810 def fetch(self, bcf, contig, start, stop, reopen):
jpayne@68 3811 return TabixIterator(bcf, contig, start, stop, reopen)
jpayne@68 3812
jpayne@68 3813
jpayne@68 3814 cdef TabixIndex makeTabixIndex(tbx_t *idx):
jpayne@68 3815 if not idx:
jpayne@68 3816 return None
jpayne@68 3817
jpayne@68 3818 cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
jpayne@68 3819 index.ptr = idx
jpayne@68 3820 index.__init__()
jpayne@68 3821
jpayne@68 3822 return index
jpayne@68 3823
jpayne@68 3824
jpayne@68 3825 ########################################################################
jpayne@68 3826 ########################################################################
jpayne@68 3827 ## Iterators
jpayne@68 3828 ########################################################################
jpayne@68 3829
jpayne@68 3830
jpayne@68 3831 cdef class BaseIterator(object):
jpayne@68 3832 pass
jpayne@68 3833
jpayne@68 3834
jpayne@68 3835 # Internal function to clean up after iteration stop or failure.
jpayne@68 3836 # This would be a nested function if it weren't a cdef function.
jpayne@68 3837 cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
jpayne@68 3838 bcf_destroy1(record)
jpayne@68 3839
jpayne@68 3840 # destroy iter so future calls to __next__ raise StopIteration
jpayne@68 3841 bcf_itr_destroy(self.iter)
jpayne@68 3842 self.iter = NULL
jpayne@68 3843
jpayne@68 3844
jpayne@68 3845 cdef class BCFIterator(BaseIterator):
jpayne@68 3846 def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
jpayne@68 3847 if bcf is None:
jpayne@68 3848 raise ValueError('bcf must not be None')
jpayne@68 3849
jpayne@68 3850 if contig is None:
jpayne@68 3851 raise ValueError('contig must be specified')
jpayne@68 3852
jpayne@68 3853 if not isinstance(bcf.index, BCFIndex):
jpayne@68 3854 raise ValueError('bcf index required')
jpayne@68 3855
jpayne@68 3856 cdef BCFIndex index = bcf.index
jpayne@68 3857
jpayne@68 3858 self.bcf = bcf
jpayne@68 3859 self.index = index
jpayne@68 3860
jpayne@68 3861 cdef int rid, cstart, cstop
jpayne@68 3862
jpayne@68 3863 try:
jpayne@68 3864 rid = index.refmap[contig]
jpayne@68 3865 except KeyError:
jpayne@68 3866 # A query for a non-existent contig yields an empty iterator, does not raise an error
jpayne@68 3867 self.iter = NULL
jpayne@68 3868 return
jpayne@68 3869
jpayne@68 3870 if reopen:
jpayne@68 3871 self.bcf = self.bcf.copy()
jpayne@68 3872
jpayne@68 3873 cstart = start if start is not None else 0
jpayne@68 3874 cstop = stop if stop is not None else MAX_POS
jpayne@68 3875
jpayne@68 3876 with nogil:
jpayne@68 3877 self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
jpayne@68 3878
jpayne@68 3879 if not self.iter:
jpayne@68 3880 if errno:
jpayne@68 3881 raise IOError(errno, strerror(errno))
jpayne@68 3882 else:
jpayne@68 3883 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
jpayne@68 3884
jpayne@68 3885 def __dealloc__(self):
jpayne@68 3886 if self.iter:
jpayne@68 3887 bcf_itr_destroy(self.iter)
jpayne@68 3888 self.iter = NULL
jpayne@68 3889
jpayne@68 3890 def __iter__(self):
jpayne@68 3891 return self
jpayne@68 3892
jpayne@68 3893 def __next__(self):
jpayne@68 3894 if not self.iter:
jpayne@68 3895 raise StopIteration
jpayne@68 3896
jpayne@68 3897 cdef bcf1_t *record = bcf_init1()
jpayne@68 3898
jpayne@68 3899 if not record:
jpayne@68 3900 raise MemoryError('unable to allocate BCF record')
jpayne@68 3901
jpayne@68 3902 record.pos = -1
jpayne@68 3903 if self.bcf.drop_samples:
jpayne@68 3904 record.max_unpack = BCF_UN_SHR
jpayne@68 3905
jpayne@68 3906 cdef int ret
jpayne@68 3907
jpayne@68 3908 with nogil:
jpayne@68 3909 ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
jpayne@68 3910
jpayne@68 3911 if ret < 0:
jpayne@68 3912 _stop_BCFIterator(self, record)
jpayne@68 3913 if ret == -1:
jpayne@68 3914 raise StopIteration
jpayne@68 3915 elif ret == -2:
jpayne@68 3916 raise IOError('truncated file')
jpayne@68 3917 elif errno:
jpayne@68 3918 raise IOError(errno, strerror(errno))
jpayne@68 3919 else:
jpayne@68 3920 raise IOError('unable to fetch next record')
jpayne@68 3921
jpayne@68 3922 ret = bcf_subset_format(self.bcf.header.ptr, record)
jpayne@68 3923
jpayne@68 3924 if ret < 0:
jpayne@68 3925 _stop_BCFIterator(self, record)
jpayne@68 3926 raise ValueError('error in bcf_subset_format')
jpayne@68 3927
jpayne@68 3928 return makeVariantRecord(self.bcf.header, record)
jpayne@68 3929
jpayne@68 3930
jpayne@68 3931 cdef class TabixIterator(BaseIterator):
jpayne@68 3932 def __cinit__(self, *args, **kwargs):
jpayne@68 3933 self.line_buffer.l = 0
jpayne@68 3934 self.line_buffer.m = 0
jpayne@68 3935 self.line_buffer.s = NULL
jpayne@68 3936
jpayne@68 3937 def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
jpayne@68 3938 if bcf is None:
jpayne@68 3939 raise ValueError('bcf must not be None')
jpayne@68 3940
jpayne@68 3941 if not isinstance(bcf.index, TabixIndex):
jpayne@68 3942 raise ValueError('tabix index required')
jpayne@68 3943
jpayne@68 3944 cdef TabixIndex index = bcf.index
jpayne@68 3945
jpayne@68 3946 self.bcf = bcf
jpayne@68 3947 self.index = index
jpayne@68 3948
jpayne@68 3949 cdef int rid, cstart, cstop
jpayne@68 3950
jpayne@68 3951 try:
jpayne@68 3952 rid = index.refmap[contig]
jpayne@68 3953 except KeyError:
jpayne@68 3954 # A query for a non-existent contig yields an empty iterator, does not raise an error
jpayne@68 3955 self.iter = NULL
jpayne@68 3956 return
jpayne@68 3957
jpayne@68 3958 if reopen:
jpayne@68 3959 self.bcf = self.bcf.copy()
jpayne@68 3960
jpayne@68 3961 cstart = start if start is not None else 0
jpayne@68 3962 cstop = stop if stop is not None else MAX_POS
jpayne@68 3963
jpayne@68 3964 self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
jpayne@68 3965
jpayne@68 3966 if not self.iter:
jpayne@68 3967 if errno:
jpayne@68 3968 raise IOError(errno, strerror(errno))
jpayne@68 3969 else:
jpayne@68 3970 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
jpayne@68 3971
jpayne@68 3972 def __dealloc__(self):
jpayne@68 3973 if self.iter:
jpayne@68 3974 tbx_itr_destroy(self.iter)
jpayne@68 3975 self.iter = NULL
jpayne@68 3976
jpayne@68 3977 if self.line_buffer.m:
jpayne@68 3978 free(self.line_buffer.s)
jpayne@68 3979
jpayne@68 3980 self.line_buffer.l = 0
jpayne@68 3981 self.line_buffer.m = 0
jpayne@68 3982 self.line_buffer.s = NULL
jpayne@68 3983
jpayne@68 3984 def __iter__(self):
jpayne@68 3985 return self
jpayne@68 3986
jpayne@68 3987 def __next__(self):
jpayne@68 3988 if not self.iter:
jpayne@68 3989 raise StopIteration
jpayne@68 3990
jpayne@68 3991 cdef int ret
jpayne@68 3992
jpayne@68 3993 with nogil:
jpayne@68 3994 ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
jpayne@68 3995
jpayne@68 3996 if ret < 0:
jpayne@68 3997 tbx_itr_destroy(self.iter)
jpayne@68 3998 self.iter = NULL
jpayne@68 3999 if ret == -1:
jpayne@68 4000 raise StopIteration
jpayne@68 4001 elif ret == -2:
jpayne@68 4002 raise IOError('truncated file')
jpayne@68 4003 elif errno:
jpayne@68 4004 raise IOError(errno, strerror(errno))
jpayne@68 4005 else:
jpayne@68 4006 raise IOError('unable to fetch next record')
jpayne@68 4007
jpayne@68 4008 cdef bcf1_t *record = bcf_init1()
jpayne@68 4009
jpayne@68 4010 if not record:
jpayne@68 4011 raise MemoryError('unable to allocate BCF record')
jpayne@68 4012
jpayne@68 4013 record.pos = -1
jpayne@68 4014 if self.bcf.drop_samples:
jpayne@68 4015 record.max_unpack = BCF_UN_SHR
jpayne@68 4016
jpayne@68 4017 ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
jpayne@68 4018
jpayne@68 4019 # FIXME: stop iteration on parse failure?
jpayne@68 4020 if ret < 0:
jpayne@68 4021 bcf_destroy1(record)
jpayne@68 4022 raise ValueError('error in vcf_parse')
jpayne@68 4023
jpayne@68 4024 return makeVariantRecord(self.bcf.header, record)
jpayne@68 4025
jpayne@68 4026
jpayne@68 4027 ########################################################################
jpayne@68 4028 ########################################################################
jpayne@68 4029 ## Variant File
jpayne@68 4030 ########################################################################
jpayne@68 4031
jpayne@68 4032
jpayne@68 4033 cdef class VariantFile(HTSFile):
jpayne@68 4034 """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
jpayne@68 4035 duplicate_filehandle=True, ignore_truncation=False, threads=1)*
jpayne@68 4036
jpayne@68 4037 A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
jpayne@68 4038 opened.
jpayne@68 4039
jpayne@68 4040 If an index for a variant file exists (.csi or .tbi), it will be
jpayne@68 4041 opened automatically. Without an index random access to records
jpayne@68 4042 via :meth:`fetch` is disabled.
jpayne@68 4043
jpayne@68 4044 For writing, a :class:`VariantHeader` object must be provided,
jpayne@68 4045 typically obtained from another :term:`VCF` file/:term:`BCF`
jpayne@68 4046 file.
jpayne@68 4047
jpayne@68 4048 Parameters
jpayne@68 4049 ----------
jpayne@68 4050 mode : string
jpayne@68 4051 *mode* should be ``r`` for reading or ``w`` for writing. The default is
jpayne@68 4052 text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
jpayne@68 4053 ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
jpayne@68 4054
jpayne@68 4055 If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
jpayne@68 4056 modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
jpayne@68 4057 For instance, to open a :term:`BCF` formatted file for reading, type::
jpayne@68 4058
jpayne@68 4059 f = pysam.VariantFile('ex1.bcf','r')
jpayne@68 4060
jpayne@68 4061 If mode is not specified, we will try to auto-detect the file type. All
jpayne@68 4062 of the following should work::
jpayne@68 4063
jpayne@68 4064 f1 = pysam.VariantFile('ex1.bcf')
jpayne@68 4065 f2 = pysam.VariantFile('ex1.vcf')
jpayne@68 4066 f3 = pysam.VariantFile('ex1.vcf.gz')
jpayne@68 4067
jpayne@68 4068 index_filename : string
jpayne@68 4069 Explicit path to an index file.
jpayne@68 4070
jpayne@68 4071 header : VariantHeader
jpayne@68 4072 :class:`VariantHeader` object required for writing.
jpayne@68 4073
jpayne@68 4074 drop_samples: bool
jpayne@68 4075 Ignore sample information when reading.
jpayne@68 4076
jpayne@68 4077 duplicate_filehandle: bool
jpayne@68 4078 By default, file handles passed either directly or through
jpayne@68 4079 File-like objects will be duplicated before passing them to
jpayne@68 4080 htslib. The duplication prevents issues where the same stream
jpayne@68 4081 will be closed by htslib and through destruction of the
jpayne@68 4082 high-level python object. Set to False to turn off
jpayne@68 4083 duplication.
jpayne@68 4084
jpayne@68 4085 ignore_truncation: bool
jpayne@68 4086 Issue a warning, instead of raising an error if the current file
jpayne@68 4087 appears to be truncated due to a missing EOF marker. Only applies
jpayne@68 4088 to bgzipped formats. (Default=False)
jpayne@68 4089
jpayne@68 4090 threads: integer
jpayne@68 4091 Number of threads to use for compressing/decompressing VCF/BCF files.
jpayne@68 4092 Setting threads to > 1 cannot be combined with `ignore_truncation`.
jpayne@68 4093 (Default=1)
jpayne@68 4094
jpayne@68 4095 """
jpayne@68 4096 def __cinit__(self, *args, **kwargs):
jpayne@68 4097 self.htsfile = NULL
jpayne@68 4098
jpayne@68 4099 def __init__(self, *args, **kwargs):
jpayne@68 4100 self.header = None
jpayne@68 4101 self.index = None
jpayne@68 4102 self.filename = None
jpayne@68 4103 self.mode = None
jpayne@68 4104 self.threads = 1
jpayne@68 4105 self.index_filename = None
jpayne@68 4106 self.is_stream = False
jpayne@68 4107 self.is_remote = False
jpayne@68 4108 self.is_reading = False
jpayne@68 4109 self.drop_samples = False
jpayne@68 4110 self.header_written = False
jpayne@68 4111 self.start_offset = -1
jpayne@68 4112
jpayne@68 4113 self.open(*args, **kwargs)
jpayne@68 4114
jpayne@68 4115 def __dealloc__(self):
jpayne@68 4116 if not self.htsfile or not self.header:
jpayne@68 4117 return
jpayne@68 4118
jpayne@68 4119 # Write header if no records were written
jpayne@68 4120 if self.htsfile.is_write and not self.header_written:
jpayne@68 4121 with nogil:
jpayne@68 4122 bcf_hdr_write(self.htsfile, self.header.ptr)
jpayne@68 4123
jpayne@68 4124 cdef int ret = hts_close(self.htsfile)
jpayne@68 4125 self.htsfile = NULL
jpayne@68 4126 self.header = self.index = None
jpayne@68 4127
jpayne@68 4128 if ret < 0:
jpayne@68 4129 global errno
jpayne@68 4130 if errno == EPIPE:
jpayne@68 4131 errno = 0
jpayne@68 4132 else:
jpayne@68 4133 raise IOError(errno, force_str(strerror(errno)))
jpayne@68 4134
jpayne@68 4135 def close(self):
jpayne@68 4136 """closes the :class:`pysam.VariantFile`."""
jpayne@68 4137 if not self.htsfile:
jpayne@68 4138 return
jpayne@68 4139
jpayne@68 4140 # Write header if no records were written
jpayne@68 4141 if self.htsfile.is_write and not self.header_written:
jpayne@68 4142 with nogil:
jpayne@68 4143 bcf_hdr_write(self.htsfile, self.header.ptr)
jpayne@68 4144
jpayne@68 4145 cdef int ret = hts_close(self.htsfile)
jpayne@68 4146 self.htsfile = NULL
jpayne@68 4147 self.header = self.index = None
jpayne@68 4148
jpayne@68 4149 if ret < 0:
jpayne@68 4150 global errno
jpayne@68 4151 if errno == EPIPE:
jpayne@68 4152 errno = 0
jpayne@68 4153 else:
jpayne@68 4154 raise IOError(errno, force_str(strerror(errno)))
jpayne@68 4155
jpayne@68 4156 def __iter__(self):
jpayne@68 4157 if not self.is_open:
jpayne@68 4158 raise ValueError('I/O operation on closed file')
jpayne@68 4159
jpayne@68 4160 if self.htsfile.is_write:
jpayne@68 4161 raise ValueError('cannot iterate over Variantfile opened for writing')
jpayne@68 4162
jpayne@68 4163 self.is_reading = 1
jpayne@68 4164 return self
jpayne@68 4165
jpayne@68 4166 def __next__(self):
jpayne@68 4167 cdef int ret
jpayne@68 4168 cdef int errcode
jpayne@68 4169 cdef bcf1_t *record = bcf_init1()
jpayne@68 4170
jpayne@68 4171 if not record:
jpayne@68 4172 raise MemoryError('unable to allocate BCF record')
jpayne@68 4173
jpayne@68 4174 record.pos = -1
jpayne@68 4175 if self.drop_samples:
jpayne@68 4176 record.max_unpack = BCF_UN_SHR
jpayne@68 4177
jpayne@68 4178 with nogil:
jpayne@68 4179 ret = bcf_read1(self.htsfile, self.header.ptr, record)
jpayne@68 4180
jpayne@68 4181 if ret < 0:
jpayne@68 4182 errcode = record.errcode
jpayne@68 4183 bcf_destroy1(record)
jpayne@68 4184 if errcode:
jpayne@68 4185 raise IOError('unable to parse next record')
jpayne@68 4186 if ret == -1:
jpayne@68 4187 raise StopIteration
jpayne@68 4188 elif ret == -2:
jpayne@68 4189 raise IOError('truncated file')
jpayne@68 4190 elif errno:
jpayne@68 4191 raise IOError(errno, strerror(errno))
jpayne@68 4192 else:
jpayne@68 4193 raise IOError('unable to fetch next record')
jpayne@68 4194
jpayne@68 4195 return makeVariantRecord(self.header, record)
jpayne@68 4196
jpayne@68 4197 def copy(self):
jpayne@68 4198 if not self.is_open:
jpayne@68 4199 raise ValueError
jpayne@68 4200
jpayne@68 4201 cdef VariantFile vars = VariantFile.__new__(VariantFile)
jpayne@68 4202 cdef bcf_hdr_t *hdr
jpayne@68 4203
jpayne@68 4204 # FIXME: re-open using fd or else header and index could be invalid
jpayne@68 4205 vars.htsfile = self._open_htsfile()
jpayne@68 4206
jpayne@68 4207 if not vars.htsfile:
jpayne@68 4208 raise ValueError('Cannot re-open htsfile')
jpayne@68 4209
jpayne@68 4210 # minimize overhead by re-using header and index. This approach is
jpayne@68 4211 # currently risky, but see above for how this can be mitigated.
jpayne@68 4212 vars.header = self.header
jpayne@68 4213 vars.index = self.index
jpayne@68 4214
jpayne@68 4215 vars.filename = self.filename
jpayne@68 4216 vars.mode = self.mode
jpayne@68 4217 vars.threads = self.threads
jpayne@68 4218 vars.index_filename = self.index_filename
jpayne@68 4219 vars.drop_samples = self.drop_samples
jpayne@68 4220 vars.is_stream = self.is_stream
jpayne@68 4221 vars.is_remote = self.is_remote
jpayne@68 4222 vars.is_reading = self.is_reading
jpayne@68 4223 vars.start_offset = self.start_offset
jpayne@68 4224 vars.header_written = self.header_written
jpayne@68 4225
jpayne@68 4226 if self.htsfile.is_bin:
jpayne@68 4227 vars.seek(self.tell())
jpayne@68 4228 else:
jpayne@68 4229 with nogil:
jpayne@68 4230 hdr = bcf_hdr_read(vars.htsfile)
jpayne@68 4231 makeVariantHeader(hdr)
jpayne@68 4232
jpayne@68 4233 return vars
jpayne@68 4234
jpayne@68 4235 def open(self, filename, mode='r',
jpayne@68 4236 index_filename=None,
jpayne@68 4237 VariantHeader header=None,
jpayne@68 4238 drop_samples=False,
jpayne@68 4239 duplicate_filehandle=True,
jpayne@68 4240 ignore_truncation=False,
jpayne@68 4241 threads=1):
jpayne@68 4242 """open a vcf/bcf file.
jpayne@68 4243
jpayne@68 4244 If open is called on an existing VariantFile, the current file will be
jpayne@68 4245 closed and a new file will be opened.
jpayne@68 4246 """
jpayne@68 4247 cdef bcf_hdr_t *hdr
jpayne@68 4248 cdef BGZF *bgzfp
jpayne@68 4249 cdef hts_idx_t *idx
jpayne@68 4250 cdef tbx_t *tidx
jpayne@68 4251 cdef char *cfilename
jpayne@68 4252 cdef char *cindex_filename = NULL
jpayne@68 4253 cdef char *cmode
jpayne@68 4254
jpayne@68 4255 if threads > 1 and ignore_truncation:
jpayne@68 4256 # This won't raise errors if reaching a truncated alignment,
jpayne@68 4257 # because bgzf_mt_reader in htslib does not deal with
jpayne@68 4258 # bgzf_mt_read_block returning non-zero values, contrary
jpayne@68 4259 # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
jpayne@68 4260 # Better to avoid this (for now) than to produce seemingly correct results.
jpayne@68 4261 raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
jpayne@68 4262 self.threads = threads
jpayne@68 4263
jpayne@68 4264 # close a previously opened file
jpayne@68 4265 if self.is_open:
jpayne@68 4266 self.close()
jpayne@68 4267
jpayne@68 4268 if not mode or mode[0] not in 'rwa':
jpayne@68 4269 raise ValueError('mode must begin with r, w or a')
jpayne@68 4270
jpayne@68 4271 self.duplicate_filehandle = duplicate_filehandle
jpayne@68 4272
jpayne@68 4273 format_modes = [m for m in mode[1:] if m in 'bcguz']
jpayne@68 4274 if len(format_modes) > 1:
jpayne@68 4275 raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
jpayne@68 4276
jpayne@68 4277 invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
jpayne@68 4278 if invalid_modes:
jpayne@68 4279 raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
jpayne@68 4280
jpayne@68 4281 # Autodetect mode from filename
jpayne@68 4282 if mode == 'w' and isinstance(filename, str):
jpayne@68 4283 if filename.endswith('.gz'):
jpayne@68 4284 mode = 'wz'
jpayne@68 4285 elif filename.endswith('.bcf'):
jpayne@68 4286 mode = 'wb'
jpayne@68 4287
jpayne@68 4288 # for htslib, wbu seems to not work
jpayne@68 4289 if mode == 'wbu':
jpayne@68 4290 mode = 'wb0'
jpayne@68 4291
jpayne@68 4292 self.mode = mode = force_bytes(mode)
jpayne@68 4293 try:
jpayne@68 4294 filename = encode_filename(filename)
jpayne@68 4295 self.is_remote = hisremote(filename)
jpayne@68 4296 self.is_stream = filename == b'-'
jpayne@68 4297 except TypeError:
jpayne@68 4298 filename = filename
jpayne@68 4299 self.is_remote = False
jpayne@68 4300 self.is_stream = True
jpayne@68 4301
jpayne@68 4302 self.filename = filename
jpayne@68 4303
jpayne@68 4304 if index_filename is not None:
jpayne@68 4305 self.index_filename = index_filename = encode_filename(index_filename)
jpayne@68 4306 else:
jpayne@68 4307 self.index_filename = None
jpayne@68 4308
jpayne@68 4309 self.drop_samples = bool(drop_samples)
jpayne@68 4310 self.header = None
jpayne@68 4311
jpayne@68 4312 self.header_written = False
jpayne@68 4313
jpayne@68 4314 if mode.startswith(b'w'):
jpayne@68 4315 # open file for writing
jpayne@68 4316 if index_filename is not None:
jpayne@68 4317 raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
jpayne@68 4318
jpayne@68 4319 # header structure (used for writing)
jpayne@68 4320 if header:
jpayne@68 4321 self.header = header.copy()
jpayne@68 4322 else:
jpayne@68 4323 self.header = VariantHeader()
jpayne@68 4324 #raise ValueError('a VariantHeader must be specified')
jpayne@68 4325
jpayne@68 4326 # Header is not written until the first write or on close
jpayne@68 4327 self.htsfile = self._open_htsfile()
jpayne@68 4328
jpayne@68 4329 if not self.htsfile:
jpayne@68 4330 raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
jpayne@68 4331
jpayne@68 4332 elif mode.startswith(b'r'):
jpayne@68 4333 # open file for reading
jpayne@68 4334 self.htsfile = self._open_htsfile()
jpayne@68 4335
jpayne@68 4336 if not self.htsfile:
jpayne@68 4337 if errno:
jpayne@68 4338 raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno))))
jpayne@68 4339 else:
jpayne@68 4340 raise ValueError('could not open variant file `{}`'.format(filename))
jpayne@68 4341
jpayne@68 4342 if self.htsfile.format.format not in (bcf, vcf):
jpayne@68 4343 raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
jpayne@68 4344
jpayne@68 4345 self.check_truncation(ignore_truncation)
jpayne@68 4346
jpayne@68 4347 with nogil:
jpayne@68 4348 hdr = bcf_hdr_read(self.htsfile)
jpayne@68 4349
jpayne@68 4350 try:
jpayne@68 4351 self.header = makeVariantHeader(hdr)
jpayne@68 4352 except ValueError:
jpayne@68 4353 raise ValueError('file `{}` does not have valid header (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
jpayne@68 4354
jpayne@68 4355 if isinstance(self.filename, bytes):
jpayne@68 4356 cfilename = self.filename
jpayne@68 4357 else:
jpayne@68 4358 cfilename = NULL
jpayne@68 4359
jpayne@68 4360 # check for index and open if present
jpayne@68 4361 if self.htsfile.format.format == bcf and cfilename:
jpayne@68 4362 if index_filename is not None:
jpayne@68 4363 cindex_filename = index_filename
jpayne@68 4364 with nogil:
jpayne@68 4365 idx = bcf_index_load2(cfilename, cindex_filename)
jpayne@68 4366 self.index = makeBCFIndex(self.header, idx)
jpayne@68 4367
jpayne@68 4368 elif self.htsfile.format.compression == bgzf and cfilename:
jpayne@68 4369 if index_filename is not None:
jpayne@68 4370 cindex_filename = index_filename
jpayne@68 4371 with nogil:
jpayne@68 4372 tidx = tbx_index_load2(cfilename, cindex_filename)
jpayne@68 4373 self.index = makeTabixIndex(tidx)
jpayne@68 4374
jpayne@68 4375 if not self.is_stream:
jpayne@68 4376 self.start_offset = self.tell()
jpayne@68 4377 else:
jpayne@68 4378 raise ValueError('unknown mode {}'.format(mode))
jpayne@68 4379
jpayne@68 4380 def reset(self):
jpayne@68 4381 """reset file position to beginning of file just after the header."""
jpayne@68 4382 return self.seek(self.start_offset)
jpayne@68 4383
jpayne@68 4384 def is_valid_tid(self, tid):
jpayne@68 4385 """
jpayne@68 4386 return True if the numerical :term:`tid` is valid; False otherwise.
jpayne@68 4387
jpayne@68 4388 returns -1 if reference is not known.
jpayne@68 4389 """
jpayne@68 4390 if not self.is_open:
jpayne@68 4391 raise ValueError('I/O operation on closed file')
jpayne@68 4392
jpayne@68 4393 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 4394 cdef int rid = tid
jpayne@68 4395 return 0 <= rid < hdr.n[BCF_DT_CTG]
jpayne@68 4396
jpayne@68 4397 def get_tid(self, reference):
jpayne@68 4398 """
jpayne@68 4399 return the numerical :term:`tid` corresponding to
jpayne@68 4400 :term:`reference`
jpayne@68 4401
jpayne@68 4402 returns -1 if reference is not known.
jpayne@68 4403 """
jpayne@68 4404 if not self.is_open:
jpayne@68 4405 raise ValueError('I/O operation on closed file')
jpayne@68 4406
jpayne@68 4407 cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
jpayne@68 4408 reference = force_bytes(reference)
jpayne@68 4409 cdef khint_t k = kh_get_vdict(d, reference)
jpayne@68 4410 return kh_val_vdict(d, k).id if k != kh_end(d) else -1
jpayne@68 4411
jpayne@68 4412 def get_reference_name(self, tid):
jpayne@68 4413 """
jpayne@68 4414 return :term:`reference` name corresponding to numerical :term:`tid`
jpayne@68 4415 """
jpayne@68 4416 if not self.is_open:
jpayne@68 4417 raise ValueError('I/O operation on closed file')
jpayne@68 4418
jpayne@68 4419 cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68 4420 cdef int rid = tid
jpayne@68 4421 if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
jpayne@68 4422 raise ValueError('Invalid tid')
jpayne@68 4423 return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
jpayne@68 4424
jpayne@68 4425 def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
jpayne@68 4426 """fetch records in a :term:`region`, specified either by
jpayne@68 4427 :term:`contig`, *start*, and *end* (which are 0-based, half-open);
jpayne@68 4428 or alternatively by a samtools :term:`region` string (which is
jpayne@68 4429 1-based inclusive).
jpayne@68 4430
jpayne@68 4431 Without *contig* or *region* all mapped records will be fetched. The
jpayne@68 4432 records will be returned ordered by contig, which will not necessarily
jpayne@68 4433 be the order within the file.
jpayne@68 4434
jpayne@68 4435 Set *reopen* to true if you will be using multiple iterators on the
jpayne@68 4436 same file at the same time. The iterator returned will receive its
jpayne@68 4437 own copy of a filehandle to the file effectively re-opening the
jpayne@68 4438 file. Re-opening a file incurrs some overhead, so use with care.
jpayne@68 4439
jpayne@68 4440 If only *contig* is set, all records on *contig* will be fetched.
jpayne@68 4441 If both *region* and *contig* are given, an exception is raised.
jpayne@68 4442
jpayne@68 4443 Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
jpayne@68 4444 (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
jpayne@68 4445 read sequentially.
jpayne@68 4446 """
jpayne@68 4447 if not self.is_open:
jpayne@68 4448 raise ValueError('I/O operation on closed file')
jpayne@68 4449
jpayne@68 4450 if self.htsfile.is_write:
jpayne@68 4451 raise ValueError('cannot fetch from Variantfile opened for writing')
jpayne@68 4452
jpayne@68 4453 if contig is None and region is None:
jpayne@68 4454 self.is_reading = 1
jpayne@68 4455 bcf = self.copy() if reopen else self
jpayne@68 4456 bcf.seek(self.start_offset)
jpayne@68 4457 return iter(bcf)
jpayne@68 4458
jpayne@68 4459 if self.index is None:
jpayne@68 4460 raise ValueError('fetch requires an index')
jpayne@68 4461
jpayne@68 4462 _, tid, start, stop = self.parse_region(contig, start, stop, region,
jpayne@68 4463 None, end=end, reference=reference)
jpayne@68 4464
jpayne@68 4465 if contig is None:
jpayne@68 4466 contig = self.get_reference_name(tid)
jpayne@68 4467
jpayne@68 4468 self.is_reading = 1
jpayne@68 4469 return self.index.fetch(self, contig, start, stop, reopen)
jpayne@68 4470
jpayne@68 4471 def new_record(self, *args, **kwargs):
jpayne@68 4472 """Create a new empty :class:`VariantRecord`.
jpayne@68 4473
jpayne@68 4474 See :meth:`VariantHeader.new_record`
jpayne@68 4475 """
jpayne@68 4476 return self.header.new_record(*args, **kwargs)
jpayne@68 4477
jpayne@68 4478 cpdef int write(self, VariantRecord record) except -1:
jpayne@68 4479 """
jpayne@68 4480 write a single :class:`pysam.VariantRecord` to disk.
jpayne@68 4481
jpayne@68 4482 returns the number of bytes written.
jpayne@68 4483 """
jpayne@68 4484 if record is None:
jpayne@68 4485 raise ValueError('record must not be None')
jpayne@68 4486
jpayne@68 4487 if not self.is_open:
jpayne@68 4488 return ValueError('I/O operation on closed file')
jpayne@68 4489
jpayne@68 4490 if not self.htsfile.is_write:
jpayne@68 4491 raise ValueError('cannot write to a Variantfile opened for reading')
jpayne@68 4492
jpayne@68 4493 if not self.header_written:
jpayne@68 4494 self.header_written = True
jpayne@68 4495 with nogil:
jpayne@68 4496 bcf_hdr_write(self.htsfile, self.header.ptr)
jpayne@68 4497
jpayne@68 4498 #if record.header is not self.header:
jpayne@68 4499 # record.translate(self.header)
jpayne@68 4500 # raise ValueError('Writing records from a different VariantFile is not yet supported')
jpayne@68 4501
jpayne@68 4502 if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
jpayne@68 4503 msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
jpayne@68 4504 raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
jpayne@68 4505
jpayne@68 4506 # Sync END annotation before writing
jpayne@68 4507 bcf_sync_end(record)
jpayne@68 4508
jpayne@68 4509 cdef int ret
jpayne@68 4510
jpayne@68 4511 with nogil:
jpayne@68 4512 ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
jpayne@68 4513
jpayne@68 4514 if ret < 0:
jpayne@68 4515 raise IOError(errno, strerror(errno))
jpayne@68 4516
jpayne@68 4517 return ret
jpayne@68 4518
jpayne@68 4519 def subset_samples(self, include_samples):
jpayne@68 4520 """
jpayne@68 4521 Read only a subset of samples to reduce processing time and memory.
jpayne@68 4522 Must be called prior to retrieving records.
jpayne@68 4523 """
jpayne@68 4524 if not self.is_open:
jpayne@68 4525 raise ValueError('I/O operation on closed file')
jpayne@68 4526
jpayne@68 4527 if self.htsfile.is_write:
jpayne@68 4528 raise ValueError('cannot subset samples from Variantfile opened for writing')
jpayne@68 4529
jpayne@68 4530 if self.is_reading:
jpayne@68 4531 raise ValueError('cannot subset samples after fetching records')
jpayne@68 4532
jpayne@68 4533 self.header._subset_samples(include_samples)
jpayne@68 4534
jpayne@68 4535 # potentially unnecessary optimization that also sets max_unpack
jpayne@68 4536 if not include_samples:
jpayne@68 4537 self.drop_samples = True
jpayne@68 4538