# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
###############################################################################
###############################################################################
## Cython wrapper for htslib VCF/BCF reader/writer
###############################################################################
#
# NOTICE: This code is incomplete and preliminary.  It offers a nearly
#         complete Pythonic interface to VCF/BCF metadata and data with
#         reading and writing capability.  Documentation and a unit test suite
#         are in the works.  The code is best tested under Python 2, but
#         should also work with Python 3.  Please report any remaining
#         str/bytes issues on the github site when using Python 3 and I'll
#         fix them promptly.
#
# Here is a minimal example of how to use the API:
#
#     $ cat bcfview.py
#     import sys
#     from pysam import VariantFile
#
#     bcf_in = VariantFile(sys.argv[1])  # auto-detect input format
#     bcf_out = VariantFile('-', 'w', header=bcf_in.header)
#
#     for rec in bcf_in:
#         bcf_out.write(rec)
#
# Performance is fairly close to that of bcftools view.  Here is an example
# using some 1k Genomes data:
#
#     $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf | wc -l
#     1103799
#
#     real    0m56.114s
#     user    1m4.489s
#     sys     0m3.102s
#
#     $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf | wc -l
#     1103800   # bcftools adds an extra header
#
#     real    0m55.126s
#     user    1m3.502s
#     sys     0m3.459s
#
###############################################################################
#
# TODO list:
#
#   * more genotype methods
#   * unit test suite (perhaps py.test based)
#   * documentation
#   * pickle support
#   * left/right locus normalization
#   * fix reopen to re-use fd
#
###############################################################################
#
# The MIT License
#
# Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com)
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
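
# An additional usage sketch: indexed VCF/BCF files can also be queried by
# coordinate range.  The filename and region below are placeholders only, and
# fetch() requires a .tbi or .csi index to be present:
#
#     from pysam import VariantFile
#
#     vcf_in = VariantFile('example.vcf.gz')
#     for rec in vcf_in.fetch('22', 20000000, 20100000):
#         print(rec.chrom, rec.pos, rec.alleles)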

from __future__ import division, print_function

import os
import sys

from libc.errno cimport errno, EPIPE
from libc.string cimport strcmp, strpbrk, strerror
from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX

cimport cython

from cpython.object cimport PyObject
from cpython.ref cimport Py_INCREF
from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
from cpython.bytes cimport PyBytes_FromStringAndSize
from cpython.unicode cimport PyUnicode_DecodeUTF8

from pysam.libchtslib cimport HTSFile, hisremote

from pysam.utils import unquoted_str


__all__ = ['VariantFile',
           'VariantHeader',
           'VariantHeaderRecord',
           'VariantHeaderRecords',
           'VariantMetadata',
           'VariantHeaderMetadata',
           'VariantContig',
           'VariantHeaderContigs',
           'VariantHeaderSamples',
           'VariantRecordFilter',
           'VariantRecordFormat',
           'VariantRecordInfo',
           'VariantRecordSamples',
           'VariantRecord',
           'VariantRecordSample',
           'BaseIndex',
           'BCFIndex',
           'TabixIndex',
           'BaseIterator',
           'BCFIterator',
           'TabixIterator']

########################################################################
########################################################################
## Constants
########################################################################

cdef int MAX_POS = (1 << 31) - 1
cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')


########################################################################
########################################################################
## Python 3 compatibility functions
########################################################################

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes


########################################################################
########################################################################
## Sentinel object
########################################################################

cdef object _nothing = object()

########################################################################
########################################################################
## VCF/BCF string intern system
########################################################################

cdef dict bcf_str_cache = {}

cdef inline bcf_str_cache_get_charptr(const char* s):
    if s == NULL:
        return None

    cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
    if pystr:
        return <object>pystr

    val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)

    PyDict_SetItemString(bcf_str_cache, s, val)

    return val


########################################################################
########################################################################
## Genotype math
########################################################################

cdef int comb(int n, int k) except -1:
    """Return binomial coefficient: n choose k

    >>> comb(5, 1)
    5
    >>> comb(5, 2)
    10
    >>> comb(2, 2)
    1
    >>> comb(100, 2)
    4950
    """
    if k > n:
        return 0
    elif k == n:
        return 1
    elif k > n // 2:
        k = n - k

    cdef d, result

    d = result = n - k + 1
    for i in range(2, k + 1):
        d += 1
        result *= d
        result //= i
    return result


cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
    """Return the count of genotypes expected for the given ploidy and number of alleles.

    >>> bcf_geno_combinations(1, 2)
    2
    >>> bcf_geno_combinations(2, 2)
    3
    >>> bcf_geno_combinations(2, 3)
    6
    >>> bcf_geno_combinations(3, 2)
    4
    """
    return comb(alleles + ploidy - 1, ploidy)
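

# For a VCF/BCF field declared with Number=G, one value is expected per
# possible genotype, and the count above is the multiset coefficient.
# For example, a diploid call at a biallelic site has
# comb(2 + 2 - 1, 2) = 3 genotypes (0/0, 0/1, 1/1); adding a second ALT
# allele raises this to comb(3 + 2 - 1, 2) = 6.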


########################################################################
########################################################################
## Low level type conversion helpers
########################################################################


cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
    return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)


cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
    return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0


cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:

    if sample < 0:
        raise ValueError('genotype is only valid as a format field')

    cdef int32_t *gt_arr = NULL
    cdef int ngt = 0
    ngt = bcf_get_genotypes(hdr, rec, &gt_arr, &ngt)

    if ngt <= 0 or not gt_arr:
        return 0

    assert ngt % rec.n_sample == 0
    cdef int max_ploidy = ngt // rec.n_sample
    cdef int32_t *gt = gt_arr + sample * max_ploidy
    cdef int ploidy = 0

    while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
        gt += 1
        ploidy += 1

    free(gt_arr)

    return bcf_geno_combinations(ploidy, rec.n_allele)


cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
    if not a:
        return None
    try:
        return tuple(charptr_to_str(a[i]) for i in range(n))
    finally:
        if free_after and a:
            free(a)


cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
    cdef char *datac
    cdef int8_t *data8
    cdef int16_t *data16
    cdef int32_t *data32
    cdef float *dataf
    cdef int i
    cdef bytes b

    if not data or n <= 0:
        return None

    if type == BCF_BT_CHAR:
        datac = <char *>data

        if not n:
            value = ()
        else:
            # Check if at least one null terminator is present
            if datac[n-1] == bcf_str_vector_end:
                # If so, create a string up to the first null terminator
                b = datac
            else:
                # Otherwise, copy the entire block
                b = datac[:n]
            value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
    else:
        value = []
        if type == BCF_BT_INT8:
            data8 = <int8_t *>data
            for i in range(n):
                if data8[i] == bcf_int8_vector_end:
                    break
                value.append(data8[i] if data8[i] != bcf_int8_missing else None)
        elif type == BCF_BT_INT16:
            data16 = <int16_t *>data
            for i in range(n):
                if data16[i] == bcf_int16_vector_end:
                    break
                value.append(data16[i] if data16[i] != bcf_int16_missing else None)
        elif type == BCF_BT_INT32:
            data32 = <int32_t *>data
            for i in range(n):
                if data32[i] == bcf_int32_vector_end:
                    break
                value.append(data32[i] if data32[i] != bcf_int32_missing else None)
        elif type == BCF_BT_FLOAT:
            dataf = <float *>data
            for i in range(n):
                if bcf_float_is_vector_end(dataf[i]):
                    break
                value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
        else:
            raise TypeError('unsupported info type code')

    # FIXME: Need to know length?  Report errors?  Pad with missing values?  Not clear what to do.
    if not value:
        if scalar:
            value = None
        elif count <= 0:
            value = ()
        else:
            value = (None,)*count
    elif scalar and len(value) == 1:
        value = value[0]
    else:
        value = tuple(value)

    return value
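

# Both the unpacking helper above and the packing helper below rely on the
# BCF sentinel convention: each integer width and the float type reserve one
# bit pattern for a missing value (e.g. bcf_int32_missing) and another for
# end-of-vector padding (e.g. bcf_int32_vector_end).  Short vectors are
# therefore padded with *_vector_end, and absent entries map to None.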


cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
    cdef char *datac
    cdef int8_t *datai8
    cdef int16_t *datai16
    cdef int32_t *datai32
    cdef float *dataf
    cdef ssize_t i, value_count = len(values)

    assert value_count <= n

    if bt_type == BCF_BT_CHAR:
        if not isinstance(values, (str, bytes)):
            values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
            value_count = len(values)
        assert value_count <= n
        datac = <char *>data
        memcpy(datac, <char *>values, value_count)
        for i in range(value_count, n):
            datac[i] = 0
    elif bt_type == BCF_BT_INT8:
        datai8 = <int8_t *>data
        for i in range(value_count):
            val = values[i]
            datai8[i] = val if val is not None else bcf_int8_missing
        for i in range(value_count, n):
            datai8[i] = bcf_int8_vector_end
    elif bt_type == BCF_BT_INT16:
        datai16 = <int16_t *>data
        for i in range(value_count):
            val = values[i]
            datai16[i] = val if val is not None else bcf_int16_missing
        for i in range(value_count, n):
            datai16[i] = bcf_int16_vector_end
    elif bt_type == BCF_BT_INT32:
        datai32 = <int32_t *>data
        for i in range(value_count):
            val = values[i]
            datai32[i] = val if val is not None else bcf_int32_missing
        for i in range(value_count, n):
            datai32[i] = bcf_int32_vector_end
    elif bt_type == BCF_BT_FLOAT:
        dataf = <float *>data
        for i in range(value_count):
            val = values[i]
            if val is None:
                bcf_float_set(dataf + i, bcf_float_missing)
            else:
                dataf[i] = val
        for i in range(value_count, n):
            bcf_float_set(dataf + i, bcf_float_vector_end)
    else:
        raise TypeError('unsupported type')


cdef bcf_empty_array(int type, ssize_t n, int vlen):
    cdef char *datac
    cdef int32_t *data32
    cdef float *dataf
    cdef int i

    if n <= 0:
        raise ValueError('Cannot create empty array')

    if type == BCF_HT_STR:
        value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
        datac = <char *>value
        for i in range(n):
            datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
    elif type == BCF_HT_INT:
        value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
        data32 = <int32_t *><char *>value
        for i in range(n):
            data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
    elif type == BCF_HT_REAL:
        value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
        dataf = <float *><char *>value
        for i in range(n):
            bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
    else:
        raise TypeError('unsupported header type code')

    return value


cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
                           void *dst_data, int dst_type, size_t dst_values,
                           int vlen):
    """copy data from src to dest where the element types (src_type/dst_type)
    and the number of elements (src_values/dst_values) may differ.
    """
    cdef char *src_datac
    cdef char *dst_datac
    cdef int8_t *src_datai8
    cdef int16_t *src_datai16
    cdef int32_t *src_datai32
    cdef int32_t *dst_datai
    cdef float *src_dataf
    cdef float *dst_dataf
    cdef ssize_t src_size, dst_size, i, j
    cdef int val

    if src_values > dst_values:
        raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))

    if src_type == dst_type == BCF_BT_CHAR:
        src_datac = <char *>src_data
        dst_datac = <char *>dst_data
        memcpy(dst_datac, src_datac, src_values)
        for i in range(src_values, dst_values):
            dst_datac[i] = 0
    elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
        src_datai8 = <int8_t *>src_data
        dst_datai = <int32_t *>dst_data
        for i in range(src_values):
            val = src_datai8[i]
            if val == bcf_int8_missing:
                val = bcf_int32_missing
            elif val == bcf_int8_vector_end:
                val = bcf_int32_vector_end
            dst_datai[i] = val
        for i in range(src_values, dst_values):
            dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
    elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
        src_datai16 = <int16_t *>src_data
        dst_datai = <int32_t *>dst_data
        for i in range(src_values):
            val = src_datai16[i]
            if val == bcf_int16_missing:
                val = bcf_int32_missing
            elif val == bcf_int16_vector_end:
                val = bcf_int32_vector_end
            dst_datai[i] = val
        for i in range(src_values, dst_values):
            dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
    elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
        src_datai32 = <int32_t *>src_data
        dst_datai = <int32_t *>dst_data
        for i in range(src_values):
            dst_datai[i] = src_datai32[i]
        for i in range(src_values, dst_values):
            dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
    elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
        src_dataf = <float *>src_data
        dst_dataf = <float *>dst_data
        for i in range(src_values):
            dst_dataf[i] = src_dataf[i]
        for i in range(src_values, dst_values):
            bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
    else:
        raise TypeError('unsupported types')


cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
    if record is None:
        raise ValueError('record must not be None')

    cdef bcf_hdr_t *hdr = record.header.ptr
    cdef bcf1_t *r = record.ptr

    if not check_header_id(hdr, hl_type, id):
        raise ValueError('Invalid header')

    cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
    cdef int number = bcf_hdr_id2number(hdr, hl_type, id)

    scalar[0] = 0

    if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
        count[0] = number
    elif length == BCF_VL_FIXED:
        if number == 1:
            scalar[0] = 1
        count[0] = number
    elif length == BCF_VL_R:
        count[0] = r.n_allele
    elif length == BCF_VL_A:
        count[0] = r.n_allele - 1
    elif length == BCF_VL_G:
        count[0] = bcf_genotype_count(hdr, r, sample)
    elif length == BCF_VL_VAR:
        count[0] = -1
    else:
        raise ValueError('Unknown format length')


cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
    if record is None:
        raise ValueError('record must not be None')

    cdef bcf_hdr_t *hdr = record.header.ptr

    cdef char *s
    cdef ssize_t count
    cdef int scalar

    bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)

    if z.len == 0:
        if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
            value = True
        elif scalar:
            value = None
        else:
            value = ()
    elif z.len == 1:
        if z.type == BCF_BT_INT8:
            value = z.v1.i if z.v1.i != bcf_int8_missing else None
        elif z.type == BCF_BT_INT16:
            value = z.v1.i if z.v1.i != bcf_int16_missing else None
        elif z.type == BCF_BT_INT32:
            value = z.v1.i if z.v1.i != bcf_int32_missing else None
        elif z.type == BCF_BT_FLOAT:
            value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
        elif z.type == BCF_BT_CHAR:
            value = force_str(chr(z.v1.i))
        else:
            raise TypeError('unsupported info type code')

        if not scalar and value != ():
            value = (value,)
    else:
        value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)

    return value


cdef object bcf_check_values(VariantRecord record, value, int sample,
                             int hl_type, int ht_type,
                             int id, int bt_type, ssize_t bt_len,
                             ssize_t *value_count, int *scalar, int *realloc):

    if record is None:
        raise ValueError('record must not be None')

    bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)

    # Validate values now that we know the type and size
    values = (value,) if not isinstance(value, (list, tuple)) else value

    if ht_type == BCF_HT_FLAG:
        value_count[0] = 1
    elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
        # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
        value_count[0] = -1

    cdef int given = len(values)
    if value_count[0] != -1 and value_count[0] != given:
        if scalar[0]:
            raise TypeError('value expected to be scalar, given len={}'.format(given))
        else:
            raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))

    if ht_type == BCF_HT_REAL:
        for v in values:
            if not(v is None or isinstance(v, (float, int))):
                raise TypeError('invalid value for Float format')
    elif ht_type == BCF_HT_INT:
        for v in values:
            if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
                raise TypeError('invalid value for Integer format')
        for v in values:
            if not(v is None or bcf_int32_missing < v <= INT32_MAX):
                raise ValueError('Integer value too small/large to store in VCF/BCF')
    elif ht_type == BCF_HT_STR:
        values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
    elif ht_type == BCF_HT_FLAG:
        if values[0] not in (True, False, None, 1, 0):
            raise ValueError('Flag values must be: True, False, None, 1, 0')
    else:
        raise TypeError('unsupported type')

    realloc[0] = 0
    if len(values) <= 1 and hl_type == BCF_HL_INFO:
        realloc[0] = 0
    elif len(values) > bt_len:
        realloc[0] = 1
    elif bt_type == BCF_BT_INT8:
        for v in values:
            if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
                realloc[0] = 1
                break
    elif bt_type == BCF_BT_INT16:
        for v in values:
            if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
                realloc[0] = 1
                break

    return values


cdef bcf_encode_alleles(VariantRecord record, values):
    if record is None:
        raise ValueError('record must not be None')

    cdef bcf1_t *r = record.ptr
    cdef int32_t nalleles = r.n_allele
    cdef list gt_values = []
    cdef char *s
    cdef int i

    if values is None:
        return ()

    if not isinstance(values, (list, tuple)):
        values = (values,)

    for value in values:
        if value is None:
            gt_values.append(bcf_gt_missing)
        elif isinstance(value, (str, bytes)):
            bvalue = force_bytes(value)
            s = bvalue
            for i in range(r.n_allele):
                if strcmp(r.d.allele[i], s) == 0:
                    gt_values.append(bcf_gt_unphased(i))
                    break
            else:
                raise ValueError('Unknown allele')
        else:
            i = value
            if not (0 <= i < nalleles):
                raise ValueError('Invalid allele index')
            gt_values.append(bcf_gt_unphased(i))

    return gt_values


cdef bcf_info_set_value(VariantRecord record, key, value):
    if record is None:
        raise ValueError('record must not be None')

    cdef bcf_hdr_t *hdr = record.header.ptr
    cdef bcf1_t *r = record.ptr
    cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
    cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size

    if bcf_unpack(r, BCF_UN_INFO) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef bytes bkey = force_bytes(key)
    cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)

    if info:
        info_id = info.key
    else:
        info_id = bcf_header_get_info_id(hdr, bkey)

    if info_id < 0:
        raise KeyError('unknown INFO: {}'.format(key))

    if not check_header_id(hdr, BCF_HL_INFO, info_id):
        raise ValueError('Invalid header')

    info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
    values = bcf_check_values(record, value, -1,
                              BCF_HL_INFO, info_type, info_id,
                              info.type if info else -1,
                              info.len if info else -1,
                              &value_count, &scalar, &realloc)

    if info_type == BCF_HT_FLAG:
        if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
            raise ValueError('Unable to update INFO values')
        return

    vlen = value_count < 0
    value_count = len(values)

    # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
    # If we can, write updated values to existing allocated storage
    if 0 and info and not realloc:
        r.d.shared_dirty |= BCF1_DIRTY_INF

        if value_count == 0:
            info.len = 0
            if not info.vptr:
                info.vptr = <uint8_t *>&info.v1.i

        elif value_count == 1:
            # FIXME: Check if need to free vptr if info.len > 0?
            if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
                bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
            elif info.type == BCF_BT_FLOAT:
                bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
            else:
                raise TypeError('unsupported info type code')

            info.len = 1
            if not info.vptr:
                info.vptr = <uint8_t *>&info.v1.i
        else:
            bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)

        return

    alloc_len = max(1, value_count)
    if info and info.len > alloc_len:
        alloc_len = info.len

    new_values = bcf_empty_array(info_type, alloc_len, vlen)
    cdef char *valp = <char *>new_values

    if info_type == BCF_HT_INT:
        dst_type = BCF_BT_INT32
    elif info_type == BCF_HT_REAL:
        dst_type = BCF_BT_FLOAT
    elif info_type == BCF_HT_STR:
        dst_type = BCF_BT_CHAR
    else:
        raise ValueError('Unsupported INFO type')

    bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)

    if bcf_update_info(hdr, r, bkey, valp, alloc_len, info_type) < 0:
        raise ValueError('Unable to update INFO values')
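

# At the Python level these helpers back VariantRecord.info, so typical use
# looks roughly like the following sketch (the field names are illustrative
# and must be declared in the header):
#
#     rec.info['DP'] = 42          # scalar Integer
#     rec.info['AC'] = (1, 2)      # Number=A tuple
#     del rec.info['VALIDATED']    # values/flags can be removed again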


cdef bcf_info_del_value(VariantRecord record, key):
    if record is None:
        raise ValueError('record must not be None')

    cdef bcf_hdr_t *hdr = record.header.ptr
    cdef bcf1_t *r = record.ptr
    cdef ssize_t value_count
    cdef int scalar

    if bcf_unpack(r, BCF_UN_INFO) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef bytes bkey = force_bytes(key)
    cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)

    if not info:
        raise KeyError(key)

    bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)

    if value_count <= 0:
        null_value = ()
    elif scalar:
        null_value = None
    else:
        null_value = (None,)*value_count

    bcf_info_set_value(record, bkey, null_value)


cdef bcf_format_get_value(VariantRecordSample sample, key):
    if sample is None:
        raise ValueError('sample must not be None')

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef ssize_t count
    cdef int scalar

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef bytes bkey = force_bytes(key)
    cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)

    if not fmt or not fmt.p:
        raise KeyError('invalid FORMAT: {}'.format(key))

    if is_gt_fmt(hdr, fmt.id):
        return bcf_format_get_allele_indices(sample)

    bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)

    if fmt.p and fmt.n and fmt.size:
        return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
    elif scalar:
        return None
    elif count <= 0:
        return ()
    else:
        return (None,)*count


cdef bcf_format_set_value(VariantRecordSample sample, key, value):
    if sample is None:
        raise ValueError('sample must not be None')

    if key == 'phased':
        sample.phased = bool(value)
        return

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef int fmt_id
    cdef vdict_t *d
    cdef khiter_t k
    cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
    cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef bytes bkey = force_bytes(key)
    cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)

    if fmt:
        fmt_id = fmt.id
    else:
        d = hdr.dict[BCF_DT_ID]
        k = kh_get_vdict(d, bkey)

        if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
            raise KeyError('unknown format: {}'.format(key))

        fmt_id = kh_val_vdict(d, k).id

    if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
        raise ValueError('Invalid header')

    fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)

    if fmt_type == BCF_HT_FLAG:
        raise ValueError('Flag types are not allowed on FORMATs')

    if is_gt_fmt(hdr, fmt_id):
        value = bcf_encode_alleles(sample.record, value)
        # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
        fmt_type = BCF_HT_INT

    values = bcf_check_values(sample.record, value, sample.index,
                              BCF_HL_FMT, fmt_type, fmt_id,
                              fmt.type if fmt else -1,
                              fmt.n if fmt else -1,
                              &value_count, &scalar, &realloc)
    vlen = value_count < 0
    value_count = len(values)

    # If we can, write updated values to existing allocated storage.
    if fmt and not realloc:
        r.d.indiv_dirty = 1
        bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
        return

    alloc_len = max(1, value_count)
    if fmt and fmt.n > alloc_len:
        alloc_len = fmt.n

    nsamples = r.n_sample
    new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
    cdef char *new_values_p = <char *>new_values

    if fmt_type == BCF_HT_INT:
        dst_type = BCF_BT_INT32
        dst_size = sizeof(int32_t) * alloc_len
    elif fmt_type == BCF_HT_REAL:
        dst_type = BCF_BT_FLOAT
        dst_size = sizeof(float) * alloc_len
    elif fmt_type == BCF_HT_STR:
        dst_type = BCF_BT_CHAR
        dst_size = sizeof(char) * alloc_len
    else:
        raise ValueError('Unsupported FORMAT type')

    if fmt and nsamples > 1:
        for i in range(nsamples):
            bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
                                  new_values_p + i * dst_size, dst_type, alloc_len,
                                  vlen)

    bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)

    if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
        raise ValueError('Unable to update format values')


cdef bcf_format_del_value(VariantRecordSample sample, key):
    if sample is None:
        raise ValueError('sample must not be None')

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef ssize_t value_count
    cdef int scalar

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef bytes bkey = force_bytes(key)
    cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)

    if not fmt or not fmt.p:
        raise KeyError(key)

    bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index)

    if value_count <= 0:
        null_value = ()
    elif scalar:
        null_value = None
    else:
        null_value = (None,)*value_count

    bcf_format_set_value(sample, bkey, null_value)
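

# These FORMAT helpers back per-sample item access on VariantRecordSample,
# so sample fields can be read and written roughly like this sketch (the
# sample name and 'GQ' field are illustrative only):
#
#     sample = rec.samples['NA12878']
#     sample['GT'] = (0, 1)     # allele indices; allele strings also accepted
#     sample.phased = True      # flips the phase bit on the stored GT
#     gq = sample['GQ']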


cdef bcf_format_get_allele_indices(VariantRecordSample sample):
    if sample is None:
        raise ValueError('sample must not be None')

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef int32_t n = r.n_sample

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    if sample.index < 0 or sample.index >= n or not r.n_fmt:
        return ()

    cdef bcf_fmt_t *fmt0 = r.d.fmt
    cdef int gt0 = is_gt_fmt(hdr, fmt0.id)

    if not gt0 or not fmt0.n:
        return ()

    cdef int8_t *data8
    cdef int16_t *data16
    cdef int32_t *data32
    cdef int32_t a, nalleles = r.n_allele
    cdef list alleles = []

    if fmt0.type == BCF_BT_INT8:
        data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data8[i] == bcf_int8_vector_end:
                break
            elif data8[i] == bcf_gt_missing:
                a = -1
            else:
                a = bcf_gt_allele(data8[i])
            alleles.append(a if 0 <= a < nalleles else None)
    elif fmt0.type == BCF_BT_INT16:
        data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data16[i] == bcf_int16_vector_end:
                break
            elif data16[i] == bcf_gt_missing:
                a = -1
            else:
                a = bcf_gt_allele(data16[i])
            alleles.append(a if 0 <= a < nalleles else None)
    elif fmt0.type == BCF_BT_INT32:
        data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data32[i] == bcf_int32_vector_end:
                break
            elif data32[i] == bcf_gt_missing:
                a = -1
            else:
                a = bcf_gt_allele(data32[i])
            alleles.append(a if 0 <= a < nalleles else None)

    return tuple(alleles)


cdef bcf_format_get_alleles(VariantRecordSample sample):
    if sample is None:
        raise ValueError('sample must not be None')

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef int32_t nsamples = r.n_sample

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef int32_t nalleles = r.n_allele

    if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
        return ()

    cdef bcf_fmt_t *fmt0 = r.d.fmt
    cdef int gt0 = is_gt_fmt(hdr, fmt0.id)

    if not gt0 or not fmt0.n:
        return ()

    cdef int32_t a
    cdef int8_t *data8
    cdef int16_t *data16
    cdef int32_t *data32
    alleles = []
    if fmt0.type == BCF_BT_INT8:
        data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data8[i] == bcf_int8_vector_end:
                break
            a = bcf_gt_allele(data8[i])
            alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
    elif fmt0.type == BCF_BT_INT16:
        data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data16[i] == bcf_int16_vector_end:
                break
            a = bcf_gt_allele(data16[i])
            alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
    elif fmt0.type == BCF_BT_INT32:
        data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data32[i] == bcf_int32_vector_end:
                break
            a = bcf_gt_allele(data32[i])
            alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
    return tuple(alleles)
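

# Reminder on htslib's GT encoding, which the phasing helpers below rely on:
# each call is stored as (allele_index + 1) << 1 with the lowest bit set when
# that allele is phased with the previous one (see bcf_gt_phased,
# bcf_gt_unphased, bcf_gt_allele and bcf_gt_is_phased).  Setting or clearing
# that low bit is therefore all that is needed to toggle phasing in place.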


cdef bint bcf_sample_get_phased(VariantRecordSample sample):
    if sample is None:
        raise ValueError('sample must not be None')

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef int32_t n = r.n_sample

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    if sample.index < 0 or sample.index >= n or not r.n_fmt:
        return False

    cdef bcf_fmt_t *fmt0 = r.d.fmt
    cdef int gt0 = is_gt_fmt(hdr, fmt0.id)

    if not gt0 or not fmt0.n:
        return False

    cdef int8_t *data8
    cdef int16_t *data16
    cdef int32_t *data32

    cdef bint phased = False

    if fmt0.type == BCF_BT_INT8:
        data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data8[i] == bcf_int8_vector_end:
                break
            elif data8[i] == bcf_int8_missing:
                continue
            elif i and not bcf_gt_is_phased(data8[i]):
                return False
            else:
                phased = True
    elif fmt0.type == BCF_BT_INT16:
        data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data16[i] == bcf_int16_vector_end:
                break
            elif data16[i] == bcf_int16_missing:
                continue
            elif i and not bcf_gt_is_phased(data16[i]):
                return False
            else:
                phased = True
    elif fmt0.type == BCF_BT_INT32:
        data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data32[i] == bcf_int32_vector_end:
                break
            elif data32[i] == bcf_int32_missing:
                continue
            elif i and not bcf_gt_is_phased(data32[i]):
                return False
            else:
                phased = True

    return phased


cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
    if sample is None:
        raise ValueError('sample must not be None')

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef int32_t n = r.n_sample

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    if sample.index < 0 or sample.index >= n or not r.n_fmt:
        return

    cdef bcf_fmt_t *fmt0 = r.d.fmt
    cdef int gt0 = is_gt_fmt(hdr, fmt0.id)

    if not gt0 or not fmt0.n:
        raise ValueError('Cannot set phased before genotype is set')

    cdef int8_t *data8
    cdef int16_t *data16
    cdef int32_t *data32

    if fmt0.type == BCF_BT_INT8:
        data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data8[i] == bcf_int8_vector_end:
                break
            elif data8[i] == bcf_int8_missing:
                continue
            elif i:
                data8[i] = (data8[i] & 0xFE) | phased
    elif fmt0.type == BCF_BT_INT16:
        data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data16[i] == bcf_int16_vector_end:
                break
            elif data16[i] == bcf_int16_missing:
                continue
            elif i:
                data16[i] = (data16[i] & 0xFFFE) | phased
    elif fmt0.type == BCF_BT_INT32:
        data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data32[i] == bcf_int32_vector_end:
                break
            elif data32[i] == bcf_int32_missing:
                continue
            elif i:
                data32[i] = (data32[i] & 0xFFFFFFFE) | phased


cdef inline bcf_sync_end(VariantRecord record):
    cdef bcf_hdr_t *hdr = record.header.ptr
    cdef bcf_info_t *info
    cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
    cdef int ref_len

    # allow missing ref when instantiating a new record
    if record.ref is not None:
        ref_len = len(record.ref)
    else:
        ref_len = 0

    # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
    # Always keep END for symbolic alleles
    if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len):
        # If INFO/END is not defined in the header, it doesn't exist in the record
        if end_id >= 0:
            info = bcf_get_info(hdr, record.ptr, b'END')
            if info and info.vptr:
                if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
                    raise ValueError('Unable to delete END')
    else:
        # Create END header, if not present
        if end_id < 0:
            record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')

        # Update to reflect stop position
        bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)


cdef inline int has_symbolic_allele(VariantRecord record):
    """Return index of first symbolic allele. 0 if no symbolic alleles."""

    for i in range(1, record.ptr.n_allele):
        alt = record.ptr.d.allele[i]
        if alt[0] == b'<' and alt[len(alt) - 1] == b'>':
            return i

    return 0
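

# Worked example for the END bookkeeping above: a symbolic deletion with a
# 0-based pos of 999 (1-based POS 1000) and rlen of 100 keeps
# INFO/END = pos + rlen = 1099, the 1-based inclusive stop, so that rlen
# survives a round trip even though the REF string does not span the event.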


########################################################################
########################################################################
## Variant Header objects
########################################################################


cdef bcf_header_remove_hrec(VariantHeader header, int i):
    if header is None:
        raise ValueError('header must not be None')

    cdef bcf_hdr_t *hdr = header.ptr

    if i < 0 or i >= hdr.nhrec:
        raise ValueError('Invalid header record index')

    cdef bcf_hrec_t *hrec = hdr.hrec[i]
    hdr.nhrec -= 1

    if i < hdr.nhrec:
        memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))

    bcf_hrec_destroy(hrec)
    hdr.hrec[hdr.nhrec] = NULL
    hdr.dirty = 1


#FIXME: implement a full mapping interface
#FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
#       object lifetime.
cdef class VariantHeaderRecord(object):
    """header record from a :class:`VariantHeader` object"""
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    @property
    def type(self):
        """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return None
        return METADATA_TYPES[r.type]

    @property
    def key(self):
        """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
        cdef bcf_hrec_t *r = self.ptr
        return bcf_str_cache_get_charptr(r.key) if r and r.key else None

    @property
    def value(self):
        """header value.  Set only for generic lines, None for FILTER/INFO, etc."""
        cdef bcf_hrec_t *r = self.ptr
        return charptr_to_str(r.value) if r and r.value else None

    @property
    def attrs(self):
        """sequence of additional header attributes"""
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return ()
        cdef int i
        return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
                      charptr_to_str(r.vals[i]) if r.vals[i] else None)
                     for i in range(r.nkeys))

    def __len__(self):
        cdef bcf_hrec_t *r = self.ptr
        return r.nkeys if r else 0

    def __bool__(self):
        cdef bcf_hrec_t *r = self.ptr
        return r != NULL and r.nkeys != 0

    def __getitem__(self, key):
        """get attribute value"""
        cdef bcf_hrec_t *r = self.ptr
        cdef int i
        if r:
            bkey = force_bytes(key)
            for i in range(r.nkeys):
                if r.keys[i] and r.keys[i] == bkey:
                    return charptr_to_str(r.vals[i]) if r.vals[i] else None
        raise KeyError('cannot find metadata key')

    def __iter__(self):
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return
        cdef int i
        for i in range(r.nkeys):
            if r.keys[i]:
                yield bcf_str_cache_get_charptr(r.keys[i])

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
        try:
            return self[key]
        except KeyError:
            return default

    def __contains__(self, key):
        try:
            self[key]
        except KeyError:
            return False
        else:
            return True

    def iterkeys(self):
        """D.iterkeys() -> an iterator over the keys of D"""
        return iter(self)

    def itervalues(self):
        """D.itervalues() -> an iterator over the values of D"""
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return
        cdef int i
        for i in range(r.nkeys):
            if r.keys[i]:
                yield charptr_to_str(r.vals[i]) if r.vals[i] else None

    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return
        cdef int i
        for i in range(r.nkeys):
            if r.keys[i]:
                yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)

    def keys(self):
        """D.keys() -> list of D's keys"""
        return list(self)

    def items(self):
        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
        return list(self.iteritems())

    def values(self):
        """D.values() -> list of D's values"""
        return list(self.itervalues())

    def update(self, items=None, **kwargs):
        """D.update([E, ]**F) -> None.

        Update D from dict/iterable E and F.
        """
        for k, v in items.items():
            self[k] = v

        if kwargs:
            for k, v in kwargs.items():
                self[k] = v

    def pop(self, key, default=_nothing):
        try:
            value = self[key]
            del self[key]
            return value
        except KeyError:
            if default is not _nothing:
                return default
            raise

    # Mappings are not hashable by default, but subclasses can change this
    __hash__ = None

    #TODO: implement __richcmp__

    def __str__(self):
        cdef bcf_hrec_t *r = self.ptr

        if not r:
            raise ValueError('cannot convert deleted record to str')

        cdef kstring_t hrec_str
        hrec_str.l = hrec_str.m = 0
        hrec_str.s = NULL

        bcf_hrec_format(r, &hrec_str)

        ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)

        if hrec_str.m:
            free(hrec_str.s)

        return ret

    # FIXME: Not safe -- causes trivial segfaults at the moment
    def remove(self):
        cdef bcf_hdr_t *hdr = self.header.ptr
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return
        assert r.key
        cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
        bcf_hdr_remove(hdr, r.type, key)
        self.ptr = NULL


cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
    if not header:
        raise ValueError('invalid VariantHeader')

    if not hdr:
        return None

    cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
    record.header = header
    record.ptr = hdr

    return record


cdef class VariantHeaderRecords(object):
    """sequence of :class:`VariantHeaderRecord` objects from a :class:`VariantHeader` object"""
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    def __len__(self):
        return self.header.ptr.nhrec

    def __bool__(self):
        return self.header.ptr.nhrec != 0

    def __getitem__(self, index):
        cdef int32_t i = index
        if i < 0 or i >= self.header.ptr.nhrec:
            raise IndexError('invalid header record index')
        return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])

    def __iter__(self):
        cdef int32_t i
        for i in range(self.header.ptr.nhrec):
            yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])

    __hash__ = None


cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
    if not header:
        raise ValueError('invalid VariantHeader')

    cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
    records.header = header
    return records


cdef class VariantMetadata(object):
    """filter, info or format metadata record from a :class:`VariantHeader` object"""
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    @property
    def name(self):
        """metadata name"""
        cdef bcf_hdr_t *hdr = self.header.ptr
        return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)

    # Q: Should this be exposed?
    @property
    def id(self):
        """metadata internal header id number"""
        return self.id

    @property
    def number(self):
        """metadata number (i.e. cardinality)"""
        cdef bcf_hdr_t *hdr = self.header.ptr

        if not check_header_id(hdr, self.type, self.id):
            raise ValueError('Invalid header id')

        if self.type == BCF_HL_FLT:
            return None

        cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
        if l == BCF_VL_FIXED:
            return bcf_hdr_id2number(hdr, self.type, self.id)
        elif l == BCF_VL_VAR:
            return '.'
        else:
            return METADATA_LENGTHS[l]

    @property
    def type(self):
        """metadata value type"""
        cdef bcf_hdr_t *hdr = self.header.ptr
        if not check_header_id(hdr, self.type, self.id):
            raise ValueError('Invalid header id')

        if self.type == BCF_HL_FLT:
            return None
        return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]

    @property
    def description(self):
        """metadata description (or None if not set)"""
        descr = self.record.get('Description')
        if descr:
            descr = descr.strip('"')
        return force_str(descr)

    @property
    def record(self):
        """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
        cdef bcf_hdr_t *hdr = self.header.ptr
        if not check_header_id(hdr, self.type, self.id):
            raise ValueError('Invalid header id')
        cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
        if not hrec:
            return None
        return makeVariantHeaderRecord(self.header, hrec)

    def remove_header(self):
        cdef bcf_hdr_t *hdr = self.header.ptr
        cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
        bcf_hdr_remove(hdr, self.type, key)


cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
    if not header:
        raise ValueError('invalid VariantHeader')

    if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
        raise ValueError('invalid metadata type')

    if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
        raise ValueError('invalid metadata id')

    cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
    meta.header = header
    meta.type = type
    meta.id = id

    return meta


cdef class VariantHeaderMetadata(object):
    """mapping from filter, info or format name to :class:`VariantMetadata` object"""
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    def add(self, id, number, type, description, **kwargs):
        """Add a new filter, info or format record"""
        if id in self:
            raise ValueError('Header already exists for id={}'.format(id))

        if self.type == BCF_HL_FLT:
            if number is not None:
                raise ValueError('Number must be None when adding a filter')
            if type is not None:
                raise ValueError('Type must be None when adding a filter')

            items = [('ID', unquoted_str(id)), ('Description', description)]
        else:
            if type not in VALUE_TYPES:
                raise ValueError('unknown type specified: {}'.format(type))
            if number is None:
                number = '.'

            items = [('ID', unquoted_str(id)),
                     ('Number', unquoted_str(number)),
                     ('Type', unquoted_str(type)),
                     ('Description', description)]

        items += kwargs.items()
        self.header.add_meta(METADATA_TYPES[self.type], items=items)
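
    # For example, a Float INFO field with one value per ALT allele could be
    # declared through this mapping roughly as follows (the 'AF' field name
    # is only illustrative):
    #
    #     header.info.add('AF', number='A', type='Float',
    #                     description='Allele frequency')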
jpayne@68: jpayne@68: items = [('ID', unquoted_str(id)), jpayne@68: ('Number', unquoted_str(number)), jpayne@68: ('Type', unquoted_str(type)), jpayne@68: ('Description', description)] jpayne@68: jpayne@68: items += kwargs.items() jpayne@68: self.header.add_meta(METADATA_TYPES[self.type], items=items) jpayne@68: jpayne@68: def __len__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef bcf_idpair_t *idpair jpayne@68: cdef int32_t i, n = 0 jpayne@68: jpayne@68: for i in range(hdr.n[BCF_DT_ID]): jpayne@68: idpair = hdr.id[BCF_DT_ID] + i jpayne@68: if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: jpayne@68: n += 1 jpayne@68: return n jpayne@68: jpayne@68: def __bool__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef bcf_idpair_t *idpair jpayne@68: cdef int32_t i jpayne@68: jpayne@68: for i in range(hdr.n[BCF_DT_ID]): jpayne@68: idpair = hdr.id[BCF_DT_ID] + i jpayne@68: if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: jpayne@68: return True jpayne@68: return False jpayne@68: jpayne@68: def __getitem__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef vdict_t *d = hdr.dict[BCF_DT_ID] jpayne@68: jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: cdef khiter_t k = kh_get_vdict(d, bkey) jpayne@68: jpayne@68: if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: jpayne@68: raise KeyError('invalid key: {}'.format(key)) jpayne@68: jpayne@68: return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id) jpayne@68: jpayne@68: def remove_header(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef vdict_t *d = hdr.dict[BCF_DT_ID] jpayne@68: jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: cdef khiter_t k = kh_get_vdict(d, bkey) jpayne@68: jpayne@68: if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: jpayne@68: raise KeyError('invalid key: {}'.format(key)) jpayne@68: jpayne@68: bcf_hdr_remove(hdr, self.type, bkey) jpayne@68: #bcf_hdr_sync(hdr) jpayne@68: jpayne@68: def clear_header(self): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: bcf_hdr_remove(hdr, self.type, NULL) jpayne@68: #bcf_hdr_sync(hdr) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef bcf_idpair_t *idpair jpayne@68: cdef int32_t i jpayne@68: jpayne@68: for i in range(hdr.n[BCF_DT_ID]): jpayne@68: idpair = hdr.id[BCF_DT_ID] + i jpayne@68: if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: jpayne@68: yield bcf_str_cache_get_charptr(idpair.key) jpayne@68: jpayne@68: def get(self, key, default=None): jpayne@68: """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" jpayne@68: try: jpayne@68: return self[key] jpayne@68: except KeyError: jpayne@68: return default jpayne@68: jpayne@68: def __contains__(self, key): jpayne@68: try: jpayne@68: self[key] jpayne@68: except KeyError: jpayne@68: return False jpayne@68: else: jpayne@68: return True jpayne@68: jpayne@68: def iterkeys(self): jpayne@68: """D.iterkeys() -> an iterator over the keys of D""" jpayne@68: return iter(self) jpayne@68: jpayne@68: def itervalues(self): jpayne@68: """D.itervalues() -> an iterator over the values of D""" jpayne@68: for key in self: jpayne@68: yield self[key] jpayne@68: jpayne@68: def iteritems(self): jpayne@68: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@68: for key in self: jpayne@68: yield (key, self[key]) jpayne@68: jpayne@68: def keys(self): jpayne@68: """D.keys() -> list of D's keys""" jpayne@68: return list(self) jpayne@68: jpayne@68: def items(self): jpayne@68: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@68: return list(self.iteritems()) jpayne@68: jpayne@68: def values(self): jpayne@68: """D.values() -> list of D's values""" jpayne@68: return list(self.itervalues()) jpayne@68: jpayne@68: # Mappings are not hashable by default, but subclasses can change this jpayne@68: __hash__ = None jpayne@68: jpayne@68: #TODO: implement __richcmp__ jpayne@68: jpayne@68: jpayne@68: cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type): jpayne@68: if not header: jpayne@68: raise ValueError('invalid VariantHeader') jpayne@68: jpayne@68: cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata) jpayne@68: meta.header = header jpayne@68: meta.type = type jpayne@68: jpayne@68: return meta jpayne@68: jpayne@68: jpayne@68: cdef class VariantContig(object): jpayne@68: """contig metadata from a :class:`VariantHeader`""" jpayne@68: def __init__(self, *args, **kwargs): jpayne@68: raise TypeError('this class cannot be instantiated from Python') jpayne@68: jpayne@68: @property jpayne@68: def name(self): jpayne@68: """contig name""" jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key) jpayne@68: jpayne@68: @property jpayne@68: def id(self): jpayne@68: """contig internal id number""" jpayne@68: return self.id jpayne@68: jpayne@68: @property jpayne@68: def length(self): jpayne@68: """contig length or None if not available""" jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0] jpayne@68: return length if length else None jpayne@68: jpayne@68: @property jpayne@68: def header_record(self): jpayne@68: """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object""" jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0] jpayne@68: return makeVariantHeaderRecord(self.header, hrec) jpayne@68: jpayne@68: def remove_header(self): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key jpayne@68: bcf_hdr_remove(hdr, BCF_HL_CTG, key) jpayne@68: jpayne@68: jpayne@68: cdef VariantContig makeVariantContig(VariantHeader header, int id): jpayne@68: if not header: jpayne@68: raise ValueError('invalid VariantHeader') jpayne@68: jpayne@68: if id < 0 or id >= header.ptr.n[BCF_DT_CTG]: jpayne@68: raise ValueError('invalid contig id') jpayne@68: jpayne@68: cdef VariantContig contig = 
VariantContig.__new__(VariantContig) jpayne@68: contig.header = header jpayne@68: contig.id = id jpayne@68: jpayne@68: return contig jpayne@68: jpayne@68: jpayne@68: cdef class VariantHeaderContigs(object): jpayne@68: """mapping from contig name or index to :class:`VariantContig` object.""" jpayne@68: def __init__(self, *args, **kwargs): jpayne@68: raise TypeError('this class cannot be instantiated from Python') jpayne@68: jpayne@68: def __len__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: assert kh_size(hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG] jpayne@68: return hdr.n[BCF_DT_CTG] jpayne@68: jpayne@68: def __bool__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: assert kh_size(hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG] jpayne@68: return hdr.n[BCF_DT_CTG] != 0 jpayne@68: jpayne@68: def __getitem__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef int index jpayne@68: jpayne@68: if isinstance(key, int): jpayne@68: index = key jpayne@68: if index < 0 or index >= hdr.n[BCF_DT_CTG]: jpayne@68: raise IndexError('invalid contig index') jpayne@68: return makeVariantContig(self.header, index) jpayne@68: jpayne@68: cdef vdict_t *d = hdr.dict[BCF_DT_CTG] jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: cdef khiter_t k = kh_get_vdict(d, bkey) jpayne@68: jpayne@68: if k == kh_end(d): jpayne@68: raise KeyError('invalid contig: {}'.format(key)) jpayne@68: jpayne@68: cdef int id = kh_val_vdict(d, k).id jpayne@68: jpayne@68: return makeVariantContig(self.header, id) jpayne@68: jpayne@68: def remove_header(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef int index jpayne@68: cdef const char *ckey jpayne@68: cdef vdict_t *d jpayne@68: cdef khiter_t k jpayne@68: jpayne@68: if isinstance(key, int): jpayne@68: index = key jpayne@68: if index < 0 or index >= hdr.n[BCF_DT_CTG]: jpayne@68: raise IndexError('invalid contig index') jpayne@68: ckey = hdr.id[BCF_DT_CTG][self.id].key jpayne@68: else: jpayne@68: d = hdr.dict[BCF_DT_CTG] jpayne@68: key = force_bytes(key) jpayne@68: if kh_get_vdict(d, key) == kh_end(d): jpayne@68: raise KeyError('invalid contig: {}'.format(key)) jpayne@68: ckey = key jpayne@68: jpayne@68: bcf_hdr_remove(hdr, BCF_HL_CTG, ckey) jpayne@68: jpayne@68: def clear_header(self): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: bcf_hdr_remove(hdr, BCF_HL_CTG, NULL) jpayne@68: #bcf_hdr_sync(hdr) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef vdict_t *d = hdr.dict[BCF_DT_CTG] jpayne@68: cdef uint32_t n = kh_size(d) jpayne@68: jpayne@68: assert n == hdr.n[BCF_DT_CTG] jpayne@68: jpayne@68: for i in range(n): jpayne@68: yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i)) jpayne@68: jpayne@68: def get(self, key, default=None): jpayne@68: """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" jpayne@68: try: jpayne@68: return self[key] jpayne@68: except KeyError: jpayne@68: return default jpayne@68: jpayne@68: def __contains__(self, key): jpayne@68: try: jpayne@68: self[key] jpayne@68: except KeyError: jpayne@68: return False jpayne@68: else: jpayne@68: return True jpayne@68: jpayne@68: def iterkeys(self): jpayne@68: """D.iterkeys() -> an iterator over the keys of D""" jpayne@68: return iter(self) jpayne@68: jpayne@68: def itervalues(self): jpayne@68: """D.itervalues() -> an iterator over the values of D""" jpayne@68: for key in self: jpayne@68: yield self[key] jpayne@68: jpayne@68: def iteritems(self): jpayne@68: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@68: for key in self: jpayne@68: yield (key, self[key]) jpayne@68: jpayne@68: def keys(self): jpayne@68: """D.keys() -> list of D's keys""" jpayne@68: return list(self) jpayne@68: jpayne@68: def items(self): jpayne@68: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@68: return list(self.iteritems()) jpayne@68: jpayne@68: def values(self): jpayne@68: """D.values() -> list of D's values""" jpayne@68: return list(self.itervalues()) jpayne@68: jpayne@68: # Mappings are not hashable by default, but subclasses can change this jpayne@68: __hash__ = None jpayne@68: jpayne@68: #TODO: implement __richcmp__ jpayne@68: jpayne@68: def add(self, id, length=None, **kwargs): jpayne@68: """Add a new contig record""" jpayne@68: if id in self: jpayne@68: raise ValueError('Header already exists for contig {}'.format(id)) jpayne@68: jpayne@68: items = [('ID', unquoted_str(id))] jpayne@68: if length is not None: jpayne@68: items.append(("length", unquoted_str(length))) jpayne@68: items += kwargs.items() jpayne@68: self.header.add_meta('contig', items=items) jpayne@68: jpayne@68: jpayne@68: cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header): jpayne@68: if not header: jpayne@68: raise ValueError('invalid VariantHeader') jpayne@68: jpayne@68: cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs) jpayne@68: contigs.header = header jpayne@68: jpayne@68: return contigs jpayne@68: jpayne@68: jpayne@68: cdef class VariantHeaderSamples(object): jpayne@68: """sequence of sample names from a :class:`VariantHeader` object""" jpayne@68: def __init__(self, *args, **kwargs): jpayne@68: raise TypeError('this class cannot be instantiated from Python') jpayne@68: jpayne@68: def __len__(self): jpayne@68: return bcf_hdr_nsamples(self.header.ptr) jpayne@68: jpayne@68: def __bool__(self): jpayne@68: return bcf_hdr_nsamples(self.header.ptr) != 0 jpayne@68: jpayne@68: def __getitem__(self, index): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef int32_t n = bcf_hdr_nsamples(hdr) jpayne@68: cdef int32_t i = index jpayne@68: jpayne@68: if i < 0 or i >= n: jpayne@68: raise IndexError('invalid sample index') jpayne@68: jpayne@68: return charptr_to_str(hdr.samples[i]) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef int32_t i, n = bcf_hdr_nsamples(hdr) jpayne@68: jpayne@68: for i in range(n): jpayne@68: yield charptr_to_str(hdr.samples[i]) jpayne@68: jpayne@68: def __contains__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef vdict_t *d = hdr.dict[BCF_DT_SAMPLE] jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: cdef khiter_t k = kh_get_vdict(d, bkey) jpayne@68: jpayne@68: return k != kh_end(d) jpayne@68: jpayne@68: # Mappings are not 
hashable by default, but subclasses can change this jpayne@68: __hash__ = None jpayne@68: jpayne@68: #TODO: implement __richcmp__ jpayne@68: jpayne@68: def add(self, name): jpayne@68: """Add a new sample""" jpayne@68: self.header.add_sample(name) jpayne@68: jpayne@68: jpayne@68: cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header): jpayne@68: if not header: jpayne@68: raise ValueError('invalid VariantHeader') jpayne@68: jpayne@68: cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples) jpayne@68: samples.header = header jpayne@68: jpayne@68: return samples jpayne@68: jpayne@68: jpayne@68: cdef class VariantHeader(object): jpayne@68: """header information for a :class:`VariantFile` object""" jpayne@68: #FIXME: Add structured proxy jpayne@68: #FIXME: Add generic proxy jpayne@68: #FIXME: Add mutable methods jpayne@68: jpayne@68: # See makeVariantHeader for C constructor jpayne@68: def __cinit__(self): jpayne@68: self.ptr = NULL jpayne@68: jpayne@68: # Python constructor jpayne@68: def __init__(self): jpayne@68: self.ptr = bcf_hdr_init(b'w') jpayne@68: if not self.ptr: jpayne@68: raise ValueError('cannot create VariantHeader') jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: if self.ptr: jpayne@68: bcf_hdr_destroy(self.ptr) jpayne@68: self.ptr = NULL jpayne@68: jpayne@68: def __bool__(self): jpayne@68: return self.ptr != NULL jpayne@68: jpayne@68: def copy(self): jpayne@68: return makeVariantHeader(bcf_hdr_dup(self.ptr)) jpayne@68: jpayne@68: def merge(self, VariantHeader header): jpayne@68: if header is None: jpayne@68: raise ValueError('header must not be None') jpayne@68: bcf_hdr_merge(self.ptr, header.ptr) jpayne@68: jpayne@68: @property jpayne@68: def version(self): jpayne@68: """VCF version""" jpayne@68: return force_str(bcf_hdr_get_version(self.ptr)) jpayne@68: jpayne@68: @property jpayne@68: def samples(self): jpayne@68: """samples (:class:`VariantHeaderSamples`)""" jpayne@68: return makeVariantHeaderSamples(self) jpayne@68: jpayne@68: @property jpayne@68: def records(self): jpayne@68: """header records (:class:`VariantHeaderRecords`)""" jpayne@68: return makeVariantHeaderRecords(self) jpayne@68: jpayne@68: @property jpayne@68: def contigs(self): jpayne@68: """contig information (:class:`VariantHeaderContigs`)""" jpayne@68: return makeVariantHeaderContigs(self) jpayne@68: jpayne@68: @property jpayne@68: def filters(self): jpayne@68: """filter metadata (:class:`VariantHeaderMetadata`)""" jpayne@68: return makeVariantHeaderMetadata(self, BCF_HL_FLT) jpayne@68: jpayne@68: @property jpayne@68: def info(self): jpayne@68: """info metadata (:class:`VariantHeaderMetadata`)""" jpayne@68: return makeVariantHeaderMetadata(self, BCF_HL_INFO) jpayne@68: jpayne@68: @property jpayne@68: def formats(self): jpayne@68: """format metadata (:class:`VariantHeaderMetadata`)""" jpayne@68: return makeVariantHeaderMetadata(self, BCF_HL_FMT) jpayne@68: jpayne@68: @property jpayne@68: def alts(self): jpayne@68: """alt metadata (:class:`dict` ID->record). jpayne@68: jpayne@68: The data returned just a snapshot of alt records, is created jpayne@68: every time the property is requested, and modifications will jpayne@68: not be reflected in the header metadata and vice versa. jpayne@68: jpayne@68: i.e. it is just a dict that reflects the state of alt records jpayne@68: at the time it is created. 
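        For example, given a VariantHeader instance ``header`` whose
        metadata declares an ALT record with ID 'DEL' (a hypothetical ID
        used only for illustration), a snapshot lookup might look like:

            alt_records = header.alts            # plain dict built on access
            del_rec = alt_records.get('DEL')     # VariantHeaderRecord or None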
jpayne@68: """ jpayne@68: return {record['ID']:record for record in self.records jpayne@68: if record.key.upper() == 'ALT' } jpayne@68: jpayne@68: # only safe to do when opening an htsfile jpayne@68: cdef _subset_samples(self, include_samples): jpayne@68: keep_samples = set(self.samples) jpayne@68: include_samples = set(include_samples) jpayne@68: missing_samples = include_samples - keep_samples jpayne@68: keep_samples &= include_samples jpayne@68: jpayne@68: if missing_samples: jpayne@68: # FIXME: add specialized exception with payload jpayne@68: raise ValueError( jpayne@68: 'missing {:d} requested samples'.format( jpayne@68: len(missing_samples))) jpayne@68: jpayne@68: keep_samples = force_bytes(','.join(keep_samples)) jpayne@68: cdef char *keep = keep_samples if keep_samples else NULL jpayne@68: cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0) jpayne@68: jpayne@68: if ret != 0: jpayne@68: raise ValueError( jpayne@68: 'bcf_hdr_set_samples failed: ret = {}'.format(ret)) jpayne@68: jpayne@68: def __str__(self): jpayne@68: cdef int hlen jpayne@68: cdef kstring_t line jpayne@68: line.l = line.m = 0 jpayne@68: line.s = NULL jpayne@68: jpayne@68: if bcf_hdr_format(self.ptr, 0, &line) < 0: jpayne@68: if line.m: jpayne@68: free(line.s) jpayne@68: raise ValueError('bcf_hdr_format failed') jpayne@68: jpayne@68: ret = charptr_to_str_w_len(line.s, line.l) jpayne@68: jpayne@68: if line.m: jpayne@68: free(line.s) jpayne@68: return ret jpayne@68: jpayne@68: def new_record(self, contig=None, start=0, stop=0, alleles=None, jpayne@68: id=None, qual=None, filter=None, info=None, samples=None, jpayne@68: **kwargs): jpayne@68: """Create a new empty VariantRecord. jpayne@68: jpayne@68: Arguments are currently experimental. Use with caution and expect jpayne@68: changes in upcoming releases. 
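        A minimal usage sketch (the contig name and the 'DP' INFO field
        are illustrative assumptions; both must already be declared in
        this header):

            rec = header.new_record(contig='chr1', start=999, stop=1000,
                                    alleles=('A', 'T'), qual=30,
                                    info={'DP': 10})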
jpayne@68: jpayne@68: """ jpayne@68: rec = makeVariantRecord(self, bcf_init()) jpayne@68: jpayne@68: if not rec: jpayne@68: raise MemoryError('unable to allocate BCF record') jpayne@68: jpayne@68: rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr) jpayne@68: jpayne@68: if contig is not None: jpayne@68: rec.contig = contig jpayne@68: jpayne@68: rec.start = start jpayne@68: rec.stop = stop jpayne@68: rec.id = id jpayne@68: rec.qual = qual jpayne@68: jpayne@68: if alleles is not None: jpayne@68: rec.alleles = alleles jpayne@68: jpayne@68: if filter is not None: jpayne@68: if isinstance(filter, (list, tuple, VariantRecordFilter)): jpayne@68: for f in filter: jpayne@68: rec.filter.add(f) jpayne@68: else: jpayne@68: rec.filter.add(filter) jpayne@68: jpayne@68: if info: jpayne@68: rec.info.update(info) jpayne@68: jpayne@68: if kwargs: jpayne@68: rec.samples[0].update(kwargs) jpayne@68: jpayne@68: if samples: jpayne@68: for i, sample in enumerate(samples): jpayne@68: rec.samples[i].update(sample) jpayne@68: jpayne@68: return rec jpayne@68: jpayne@68: def add_record(self, VariantHeaderRecord record): jpayne@68: """Add an existing :class:`VariantHeaderRecord` to this header""" jpayne@68: if record is None: jpayne@68: raise ValueError('record must not be None') jpayne@68: jpayne@68: cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr) jpayne@68: jpayne@68: bcf_hdr_add_hrec(self.ptr, hrec) jpayne@68: jpayne@68: self._hdr_sync() jpayne@68: jpayne@68: def add_line(self, line): jpayne@68: """Add a metadata line to this header""" jpayne@68: bline = force_bytes(line) jpayne@68: if bcf_hdr_append(self.ptr, bline) < 0: jpayne@68: raise ValueError('invalid header line') jpayne@68: jpayne@68: self._hdr_sync() jpayne@68: jpayne@68: jpayne@68: def add_meta(self, key, value=None, items=None): jpayne@68: """Add metadata to this header""" jpayne@68: if not ((value is not None) ^ (items is not None)): jpayne@68: raise ValueError('either value or items must be specified') jpayne@68: jpayne@68: cdef bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t)) jpayne@68: cdef int quoted jpayne@68: jpayne@68: try: jpayne@68: key = force_bytes(key) jpayne@68: hrec.key = strdup(key) jpayne@68: jpayne@68: if value is not None: jpayne@68: hrec.value = strdup(force_bytes(value)) jpayne@68: else: jpayne@68: for key, value in items: jpayne@68: quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type") jpayne@68: jpayne@68: key = force_bytes(key) jpayne@68: bcf_hrec_add_key(hrec, key, len(key)) jpayne@68: jpayne@68: value = force_bytes(str(value)) jpayne@68: bcf_hrec_set_val(hrec, hrec.nkeys-1, value, len(value), quoted) jpayne@68: except: jpayne@68: bcf_hrec_destroy(hrec) jpayne@68: raise jpayne@68: jpayne@68: bcf_hdr_add_hrec(self.ptr, hrec) jpayne@68: jpayne@68: self._hdr_sync() jpayne@68: jpayne@68: cdef _add_sample(self, name): jpayne@68: bname = force_bytes(name) jpayne@68: if bcf_hdr_add_sample(self.ptr, bname) < 0: jpayne@68: raise ValueError('Duplicated sample name: {}'.format(name)) jpayne@68: jpayne@68: cdef _hdr_sync(self): jpayne@68: cdef bcf_hdr_t *hdr = self.ptr jpayne@68: if hdr.dirty: jpayne@68: if bcf_hdr_sync(hdr) < 0: jpayne@68: raise MemoryError('unable to reallocate VariantHeader') jpayne@68: jpayne@68: def add_sample(self, name): jpayne@68: """Add a new sample to this header""" jpayne@68: self._add_sample(name) jpayne@68: self._hdr_sync() jpayne@68: jpayne@68: def add_samples(self, *args): jpayne@68: """Add several new samples to this header. 
jpayne@68: This function takes multiple arguments, each of which may jpayne@68: be either a sample name or an iterable returning sample names jpayne@68: (e.g., a list of sample names). jpayne@68: """ jpayne@68: for arg in args: jpayne@68: if isinstance(arg, str): jpayne@68: self._add_sample(arg) jpayne@68: else: jpayne@68: for name in arg: jpayne@68: self._add_sample(name) jpayne@68: self._hdr_sync() jpayne@68: jpayne@68: jpayne@68: cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr): jpayne@68: if not hdr: jpayne@68: raise ValueError('cannot create VariantHeader') jpayne@68: jpayne@68: cdef VariantHeader header = VariantHeader.__new__(VariantHeader) jpayne@68: header.ptr = hdr jpayne@68: jpayne@68: return header jpayne@68: jpayne@68: jpayne@68: cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2: jpayne@68: cdef vdict_t *d jpayne@68: cdef khiter_t k jpayne@68: cdef int info_id jpayne@68: jpayne@68: if isinstance(key, str): jpayne@68: key = force_bytes(key) jpayne@68: jpayne@68: d = hdr.dict[BCF_DT_ID] jpayne@68: k = kh_get_vdict(d, key) jpayne@68: jpayne@68: if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF: jpayne@68: return -1 jpayne@68: jpayne@68: return kh_val_vdict(d, k).id jpayne@68: jpayne@68: jpayne@68: ######################################################################## jpayne@68: ######################################################################## jpayne@68: ## Variant Record objects jpayne@68: ######################################################################## jpayne@68: jpayne@68: cdef class VariantRecordFilter(object): jpayne@68: """Filters set on a :class:`VariantRecord` object, presented as a mapping from jpayne@68: filter index or name to :class:`VariantMetadata` object""" jpayne@68: def __init__(self, *args, **kwargs): jpayne@68: raise TypeError('this class cannot be instantiated from Python') jpayne@68: jpayne@68: def __len__(self): jpayne@68: return self.record.ptr.d.n_flt jpayne@68: jpayne@68: def __bool__(self): jpayne@68: return self.record.ptr.d.n_flt != 0 jpayne@68: jpayne@68: def __getitem__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int index, id jpayne@68: cdef int n = r.d.n_flt jpayne@68: jpayne@68: if isinstance(key, int): jpayne@68: index = key jpayne@68: jpayne@68: if index < 0 or index >= n: jpayne@68: raise IndexError('invalid filter index') jpayne@68: jpayne@68: id = r.d.flt[index] jpayne@68: else: jpayne@68: if key == '.': jpayne@68: key = 'PASS' jpayne@68: jpayne@68: bkey = force_bytes(key) jpayne@68: id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) jpayne@68: jpayne@68: if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey): jpayne@68: raise KeyError('Invalid filter: {}'.format(key)) jpayne@68: jpayne@68: return makeVariantMetadata(self.record.header, BCF_HL_FLT, id) jpayne@68: jpayne@68: def add(self, key): jpayne@68: """Add a new filter""" jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int id jpayne@68: jpayne@68: if key == '.': jpayne@68: key = 'PASS' jpayne@68: jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) jpayne@68: jpayne@68: if not check_header_id(hdr, BCF_HL_FLT, id): jpayne@68: raise KeyError('Invalid filter: {}'.format(key)) jpayne@68: jpayne@68: bcf_add_filter(hdr, r, id) jpayne@68: jpayne@68: def __delitem__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = 
self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int index, id jpayne@68: cdef int n = r.d.n_flt jpayne@68: jpayne@68: if isinstance(key, int): jpayne@68: index = key jpayne@68: jpayne@68: if index < 0 or index >= n: jpayne@68: raise IndexError('invalid filter index') jpayne@68: jpayne@68: id = r.d.flt[index] jpayne@68: else: jpayne@68: if key == '.': jpayne@68: key = 'PASS' jpayne@68: jpayne@68: bkey = force_bytes(key) jpayne@68: id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) jpayne@68: jpayne@68: if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey): jpayne@68: raise KeyError('Invalid filter: {}'.format(key)) jpayne@68: jpayne@68: bcf_remove_filter(hdr, r, id, 0) jpayne@68: jpayne@68: def clear(self): jpayne@68: """Clear all filters""" jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: r.d.shared_dirty |= BCF1_DIRTY_FLT jpayne@68: r.d.n_flt = 0 jpayne@68: jpayne@68: def __iter__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int i jpayne@68: jpayne@68: for i in range(r.d.n_flt): jpayne@68: yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i])) jpayne@68: jpayne@68: def get(self, key, default=None): jpayne@68: """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" jpayne@68: try: jpayne@68: return self[key] jpayne@68: except KeyError: jpayne@68: return default jpayne@68: jpayne@68: def __contains__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: return bcf_has_filter(hdr, r, bkey) == 1 jpayne@68: jpayne@68: def iterkeys(self): jpayne@68: """D.iterkeys() -> an iterator over the keys of D""" jpayne@68: return iter(self) jpayne@68: jpayne@68: def itervalues(self): jpayne@68: """D.itervalues() -> an iterator over the values of D""" jpayne@68: for key in self: jpayne@68: yield self[key] jpayne@68: jpayne@68: def iteritems(self): jpayne@68: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@68: for key in self: jpayne@68: yield (key, self[key]) jpayne@68: jpayne@68: def keys(self): jpayne@68: """D.keys() -> list of D's keys""" jpayne@68: return list(self) jpayne@68: jpayne@68: def items(self): jpayne@68: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@68: return list(self.iteritems()) jpayne@68: jpayne@68: def values(self): jpayne@68: """D.values() -> list of D's values""" jpayne@68: return list(self.itervalues()) jpayne@68: jpayne@68: def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op): jpayne@68: if op != 2 and op != 3: jpayne@68: return NotImplemented jpayne@68: jpayne@68: cdef bcf1_t *s = self.record.ptr jpayne@68: cdef bcf1_t *o = other.record.ptr jpayne@68: jpayne@68: cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other)) jpayne@68: jpayne@68: if op == 3: jpayne@68: cmp = not cmp jpayne@68: jpayne@68: return cmp jpayne@68: jpayne@68: # Mappings are not hashable by default, but subclasses can change this jpayne@68: __hash__ = None jpayne@68: jpayne@68: #TODO: implement __richcmp__ jpayne@68: jpayne@68: jpayne@68: cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record): jpayne@68: if not record: jpayne@68: raise ValueError('invalid VariantRecord') jpayne@68: jpayne@68: cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter) jpayne@68: filter.record = record 
jpayne@68: jpayne@68: return filter jpayne@68: jpayne@68: jpayne@68: cdef class VariantRecordFormat(object): jpayne@68: """Format data present for each sample in a :class:`VariantRecord` object, jpayne@68: presented as mapping from format name to :class:`VariantMetadata` object.""" jpayne@68: def __init__(self, *args, **kwargs): jpayne@68: raise TypeError('this class cannot be instantiated from Python') jpayne@68: jpayne@68: def __len__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int i, n = 0 jpayne@68: jpayne@68: for i in range(r.n_fmt): jpayne@68: if r.d.fmt[i].p: jpayne@68: n += 1 jpayne@68: return n jpayne@68: jpayne@68: def __bool__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int i jpayne@68: jpayne@68: for i in range(r.n_fmt): jpayne@68: if r.d.fmt[i].p: jpayne@68: return True jpayne@68: return False jpayne@68: jpayne@68: def __getitem__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) jpayne@68: jpayne@68: if not fmt or not fmt.p: jpayne@68: raise KeyError('unknown format: {}'.format(key)) jpayne@68: jpayne@68: return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id) jpayne@68: jpayne@68: def __delitem__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) jpayne@68: jpayne@68: if not fmt or not fmt.p: jpayne@68: raise KeyError('unknown format: {}'.format(key)) jpayne@68: jpayne@68: if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0: jpayne@68: raise ValueError('Unable to delete FORMAT') jpayne@68: jpayne@68: def clear(self): jpayne@68: """Clear all formats for all samples within the associated jpayne@68: :class:`VariantRecord` instance""" jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bcf_fmt_t *fmt jpayne@68: cdef const char *key jpayne@68: cdef int i jpayne@68: jpayne@68: for i in reversed(range(r.n_fmt)): jpayne@68: fmt = &r.d.fmt[i] jpayne@68: if fmt.p: jpayne@68: key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id) jpayne@68: if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0: jpayne@68: raise ValueError('Unable to delete FORMAT') jpayne@68: jpayne@68: def __iter__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bcf_fmt_t *fmt jpayne@68: cdef int i jpayne@68: jpayne@68: for i in range(r.n_fmt): jpayne@68: fmt = &r.d.fmt[i] jpayne@68: if fmt.p: jpayne@68: yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)) jpayne@68: jpayne@68: def get(self, key, default=None): jpayne@68: """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" jpayne@68: try: jpayne@68: return self[key] jpayne@68: except KeyError: jpayne@68: return default jpayne@68: jpayne@68: def __contains__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) jpayne@68: return fmt != NULL and fmt.p != NULL jpayne@68: jpayne@68: def iterkeys(self): jpayne@68: """D.iterkeys() -> an iterator over the keys of D""" jpayne@68: return iter(self) jpayne@68: jpayne@68: def itervalues(self): jpayne@68: """D.itervalues() -> an iterator over the values of D""" jpayne@68: for key in self: jpayne@68: yield self[key] jpayne@68: jpayne@68: def iteritems(self): jpayne@68: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@68: for key in self: jpayne@68: yield (key, self[key]) jpayne@68: jpayne@68: def keys(self): jpayne@68: """D.keys() -> list of D's keys""" jpayne@68: return list(self) jpayne@68: jpayne@68: def items(self): jpayne@68: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@68: return list(self.iteritems()) jpayne@68: jpayne@68: def values(self): jpayne@68: """D.values() -> list of D's values""" jpayne@68: return list(self.itervalues()) jpayne@68: jpayne@68: # Mappings are not hashable by default, but subclasses can change this jpayne@68: __hash__ = None jpayne@68: jpayne@68: #TODO: implement __richcmp__ jpayne@68: jpayne@68: jpayne@68: cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record): jpayne@68: if not record: jpayne@68: raise ValueError('invalid VariantRecord') jpayne@68: jpayne@68: cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat) jpayne@68: format.record = record jpayne@68: jpayne@68: return format jpayne@68: jpayne@68: jpayne@68: #TODO: Add a getmeta method to return the corresponding VariantMetadata? 
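# Usage sketch: VariantRecordFormat (above) and VariantRecordInfo (below)
# are normally obtained through the `format` and `info` properties of a
# VariantRecord. Assuming `rec` is a VariantRecord whose header declares
# a 'DP' INFO field and a 'GT' FORMAT field (illustrative names only):
#
#     if 'DP' in rec.info:              # VariantRecordInfo membership test
#         depth = rec.info['DP']        # decoded INFO value
#     for name in rec.format:           # FORMAT keys present on this record
#         meta = rec.format[name]       # corresponding VariantMetadata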
jpayne@68: cdef class VariantRecordInfo(object): jpayne@68: """Info data stored in a :class:`VariantRecord` object, presented as a jpayne@68: mapping from info metadata name to value.""" jpayne@68: jpayne@68: def __init__(self, *args, **kwargs): jpayne@68: raise TypeError('this class cannot be instantiated from Python') jpayne@68: jpayne@68: def __len__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bcf_info_t *info jpayne@68: cdef const char *key jpayne@68: cdef int i, count = 0 jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: for i in range(r.n_info): jpayne@68: info = &r.d.info[i] jpayne@68: key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) jpayne@68: if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0: jpayne@68: count += 1 jpayne@68: jpayne@68: return count jpayne@68: jpayne@68: def __bool__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bcf_info_t *info jpayne@68: cdef const char *key jpayne@68: cdef int i jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: for i in range(r.n_info): jpayne@68: info = &r.d.info[i] jpayne@68: key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) jpayne@68: if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0: jpayne@68: return True jpayne@68: jpayne@68: return False jpayne@68: jpayne@68: def __getitem__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: jpayne@68: if strcmp(bkey, b'END') == 0: jpayne@68: raise KeyError('END is a reserved attribute; access is via record.stop') jpayne@68: jpayne@68: cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) jpayne@68: jpayne@68: # Cannot stop here if info == NULL, since flags must return False jpayne@68: cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key jpayne@68: jpayne@68: if info_id < 0: jpayne@68: raise KeyError('Unknown INFO field: {}'.format(key)) jpayne@68: jpayne@68: if not check_header_id(hdr, BCF_HL_INFO, info_id): jpayne@68: raise ValueError('Invalid header') jpayne@68: jpayne@68: # Handle type=Flag values jpayne@68: if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG: jpayne@68: return info != NULL and info.vptr != NULL jpayne@68: jpayne@68: if not info or not info.vptr: jpayne@68: raise KeyError('Invalid INFO field: {}'.format(key)) jpayne@68: jpayne@68: return bcf_info_get_value(self.record, info) jpayne@68: jpayne@68: def __setitem__(self, key, value): jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: jpayne@68: if strcmp(bkey, b'END') == 0: jpayne@68: raise KeyError('END is a reserved attribute; access is via record.stop') jpayne@68: jpayne@68: if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: bcf_info_set_value(self.record, key, value) jpayne@68: jpayne@68: def __delitem__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: if strcmp(bkey, b'END') == 0: 
jpayne@68: raise KeyError('END is a reserved attribute; access is via record.stop') jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) jpayne@68: jpayne@68: # Cannot stop here if info == NULL, since flags must return False jpayne@68: cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key jpayne@68: jpayne@68: if info_id < 0: jpayne@68: raise KeyError('Unknown INFO field: {}'.format(key)) jpayne@68: jpayne@68: if not check_header_id(hdr, BCF_HL_INFO, info_id): jpayne@68: raise ValueError('Invalid header') jpayne@68: jpayne@68: # Handle flags jpayne@68: if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr): jpayne@68: return jpayne@68: jpayne@68: if not info or not info.vptr: jpayne@68: raise KeyError('Unknown INFO field: {}'.format(key)) jpayne@68: jpayne@68: if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0: jpayne@68: raise ValueError('Unable to delete INFO') jpayne@68: jpayne@68: def clear(self): jpayne@68: """Clear all info data""" jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bcf_info_t *info jpayne@68: cdef const char *key jpayne@68: cdef int i jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: for i in range(r.n_info): jpayne@68: info = &r.d.info[i] jpayne@68: if info and info.vptr: jpayne@68: key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) jpayne@68: if strcmp(key, b'END') == 0: jpayne@68: continue jpayne@68: if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0: jpayne@68: raise ValueError('Unable to delete INFO') jpayne@68: jpayne@68: def __iter__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bcf_info_t *info jpayne@68: cdef const char *key jpayne@68: cdef int i jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: for i in range(r.n_info): jpayne@68: info = &r.d.info[i] jpayne@68: if info and info.vptr: jpayne@68: key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) jpayne@68: if strcmp(key, b'END') != 0: jpayne@68: yield bcf_str_cache_get_charptr(key) jpayne@68: jpayne@68: def get(self, key, default=None): jpayne@68: """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: jpayne@68: if strcmp(bkey, b'END') == 0: jpayne@68: return default jpayne@68: jpayne@68: cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) jpayne@68: jpayne@68: # Cannot stop here if info == NULL, since flags must return False jpayne@68: cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key jpayne@68: jpayne@68: if not check_header_id(hdr, BCF_HL_INFO, info_id): jpayne@68: raise ValueError('Invalid header') jpayne@68: jpayne@68: # Handle flags jpayne@68: if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG: jpayne@68: return info != NULL and info.vptr != NULL jpayne@68: jpayne@68: if not info or not info.vptr: jpayne@68: return default jpayne@68: jpayne@68: return bcf_info_get_value(self.record, info) jpayne@68: jpayne@68: def __contains__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: jpayne@68: if strcmp(bkey, b'END') == 0: jpayne@68: return False jpayne@68: jpayne@68: cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) jpayne@68: jpayne@68: return info != NULL and info.vptr != NULL jpayne@68: jpayne@68: def iterkeys(self): jpayne@68: """D.iterkeys() -> an iterator over the keys of D""" jpayne@68: return iter(self) jpayne@68: jpayne@68: def itervalues(self): jpayne@68: """D.itervalues() -> an iterator over the values of D""" jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bcf_info_t *info jpayne@68: cdef const char *key jpayne@68: cdef int i jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: for i in range(r.n_info): jpayne@68: info = &r.d.info[i] jpayne@68: if info and info.vptr: jpayne@68: key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) jpayne@68: if strcmp(key, b'END') != 0: jpayne@68: yield bcf_info_get_value(self.record, info) jpayne@68: jpayne@68: def iteritems(self): jpayne@68: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bcf_info_t *info jpayne@68: cdef const char *key jpayne@68: cdef int i jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: for i in range(r.n_info): jpayne@68: info = &r.d.info[i] jpayne@68: if info and info.vptr: jpayne@68: key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) jpayne@68: if strcmp(key, b'END') != 0: jpayne@68: value = bcf_info_get_value(self.record, info) jpayne@68: yield bcf_str_cache_get_charptr(key), value jpayne@68: jpayne@68: def keys(self): jpayne@68: """D.keys() -> list of D's keys""" jpayne@68: return list(self) jpayne@68: jpayne@68: def items(self): jpayne@68: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@68: return list(self.iteritems()) jpayne@68: jpayne@68: def values(self): jpayne@68: """D.values() -> list of D's values""" jpayne@68: return 
list(self.itervalues()) jpayne@68: jpayne@68: def update(self, items=None, **kwargs): jpayne@68: """D.update([E, ]**F) -> None. jpayne@68: jpayne@68: Update D from dict/iterable E and F. jpayne@68: """ jpayne@68: for k, v in items.items(): jpayne@68: if k != 'END': jpayne@68: self[k] = v jpayne@68: jpayne@68: if kwargs: jpayne@68: kwargs.pop('END', None) jpayne@68: for k, v in kwargs.items(): jpayne@68: self[k] = v jpayne@68: jpayne@68: def pop(self, key, default=_nothing): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) jpayne@68: jpayne@68: # Cannot stop here if info == NULL, since flags must return False jpayne@68: cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key jpayne@68: jpayne@68: if info_id < 0: jpayne@68: if default is _nothing: jpayne@68: raise KeyError('Unknown INFO field: {}'.format(key)) jpayne@68: return default jpayne@68: jpayne@68: if not check_header_id(hdr, BCF_HL_INFO, info_id): jpayne@68: raise ValueError('Invalid header') jpayne@68: jpayne@68: # Handle flags jpayne@68: if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr): jpayne@68: return jpayne@68: jpayne@68: if not info or not info.vptr: jpayne@68: if default is _nothing: jpayne@68: raise KeyError('Unknown INFO field: {}'.format(key)) jpayne@68: return default jpayne@68: jpayne@68: value = bcf_info_get_value(self.record, info) jpayne@68: jpayne@68: if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0: jpayne@68: raise ValueError('Unable to delete INFO') jpayne@68: jpayne@68: return value jpayne@68: jpayne@68: def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op): jpayne@68: if op != 2 and op != 3: jpayne@68: return NotImplemented jpayne@68: jpayne@68: cdef bcf1_t *s = self.record.ptr jpayne@68: cdef bcf1_t *o = other.record.ptr jpayne@68: jpayne@68: # Cannot use n_info as shortcut logic, since null values may remain jpayne@68: cdef bint cmp = dict(self) == dict(other) jpayne@68: jpayne@68: if op == 3: jpayne@68: cmp = not cmp jpayne@68: jpayne@68: return cmp jpayne@68: jpayne@68: # Mappings are not hashable by default, but subclasses can change this jpayne@68: __hash__ = None jpayne@68: jpayne@68: jpayne@68: cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record): jpayne@68: if not record: jpayne@68: raise ValueError('invalid VariantRecord') jpayne@68: jpayne@68: cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo) jpayne@68: info.record = record jpayne@68: jpayne@68: return info jpayne@68: jpayne@68: jpayne@68: cdef class VariantRecordSamples(object): jpayne@68: """mapping from sample index or name to :class:`VariantRecordSample` object.""" jpayne@68: def __init__(self, *args, **kwargs): jpayne@68: raise TypeError('this class cannot be instantiated from Python') jpayne@68: jpayne@68: def __len__(self): jpayne@68: return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr) jpayne@68: jpayne@68: def __bool__(self): jpayne@68: return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0 jpayne@68: jpayne@68: def __getitem__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int n 
= self.record.ptr.n_sample jpayne@68: cdef int sample_index jpayne@68: cdef vdict_t *d jpayne@68: cdef khiter_t k jpayne@68: jpayne@68: if isinstance(key, int): jpayne@68: sample_index = key jpayne@68: else: jpayne@68: bkey = force_bytes(key) jpayne@68: sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey) jpayne@68: if sample_index < 0: jpayne@68: raise KeyError('invalid sample name: {}'.format(key)) jpayne@68: jpayne@68: if sample_index < 0 or sample_index >= n: jpayne@68: raise IndexError('invalid sample index') jpayne@68: jpayne@68: return makeVariantRecordSample(self.record, sample_index) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int32_t i, n = self.record.ptr.n_sample jpayne@68: jpayne@68: for i in range(n): jpayne@68: yield charptr_to_str(hdr.samples[i]) jpayne@68: jpayne@68: def get(self, key, default=None): jpayne@68: """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" jpayne@68: try: jpayne@68: return self[key] jpayne@68: except KeyError: jpayne@68: return default jpayne@68: jpayne@68: def __contains__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int n = self.record.ptr.n_sample jpayne@68: cdef int sample_index jpayne@68: cdef vdict_t *d jpayne@68: cdef khiter_t k jpayne@68: jpayne@68: if isinstance(key, int): jpayne@68: sample_index = key jpayne@68: else: jpayne@68: bkey = force_bytes(key) jpayne@68: sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey) jpayne@68: if sample_index < 0: jpayne@68: raise KeyError('invalid sample name: {}'.format(key)) jpayne@68: jpayne@68: return 0 <= sample_index < n jpayne@68: jpayne@68: def iterkeys(self): jpayne@68: """D.iterkeys() -> an iterator over the keys of D""" jpayne@68: return iter(self) jpayne@68: jpayne@68: def itervalues(self): jpayne@68: """D.itervalues() -> an iterator over the values of D""" jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int32_t i, n = self.record.ptr.n_sample jpayne@68: jpayne@68: for i in range(n): jpayne@68: yield makeVariantRecordSample(self.record, i) jpayne@68: jpayne@68: def iteritems(self): jpayne@68: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int32_t i, n = self.record.ptr.n_sample jpayne@68: jpayne@68: for i in range(n): jpayne@68: yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i)) jpayne@68: jpayne@68: def keys(self): jpayne@68: """D.keys() -> list of D's keys""" jpayne@68: return list(self) jpayne@68: jpayne@68: def items(self): jpayne@68: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@68: return list(self.iteritems()) jpayne@68: jpayne@68: def values(self): jpayne@68: """D.values() -> list of D's values""" jpayne@68: return list(self.itervalues()) jpayne@68: jpayne@68: def update(self, items=None, **kwargs): jpayne@68: """D.update([E, ]**F) -> None. jpayne@68: jpayne@68: Update D from dict/iterable E and F. 
jpayne@68: """ jpayne@68: for k, v in items.items(): jpayne@68: self[k] = v jpayne@68: jpayne@68: if kwargs: jpayne@68: for k, v in kwargs.items(): jpayne@68: self[k] = v jpayne@68: jpayne@68: def pop(self, key, default=_nothing): jpayne@68: try: jpayne@68: value = self[key] jpayne@68: del self[key] jpayne@68: return value jpayne@68: except KeyError: jpayne@68: if default is not _nothing: jpayne@68: return default jpayne@68: raise jpayne@68: jpayne@68: def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op): jpayne@68: if op != 2 and op != 3: jpayne@68: return NotImplemented jpayne@68: jpayne@68: cdef bcf1_t *s = self.record.ptr jpayne@68: cdef bcf1_t *o = other.record.ptr jpayne@68: jpayne@68: cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values()) jpayne@68: jpayne@68: if op == 3: jpayne@68: cmp = not cmp jpayne@68: jpayne@68: return cmp jpayne@68: jpayne@68: # Mappings are not hashable by default, but subclasses can change this jpayne@68: __hash__ = None jpayne@68: jpayne@68: jpayne@68: cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record): jpayne@68: if not record: jpayne@68: raise ValueError('invalid VariantRecord') jpayne@68: jpayne@68: cdef VariantRecordSamples samples = VariantRecordSamples.__new__( jpayne@68: VariantRecordSamples) jpayne@68: samples.record = record jpayne@68: jpayne@68: return samples jpayne@68: jpayne@68: jpayne@68: cdef class VariantRecord(object): jpayne@68: """Variant record""" jpayne@68: def __init__(self, *args, **kwargs): jpayne@68: raise TypeError('this class cannot be instantiated from Python') jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: if self.ptr: jpayne@68: bcf_destroy1(self.ptr) jpayne@68: self.ptr = NULL jpayne@68: jpayne@68: def copy(self): jpayne@68: """return a copy of this VariantRecord object""" jpayne@68: return makeVariantRecord(self.header, bcf_dup(self.ptr)) jpayne@68: jpayne@68: def translate(self, VariantHeader dst_header): jpayne@68: if dst_header is None: jpayne@68: raise ValueError('dst_header must not be None') jpayne@68: jpayne@68: cdef bcf_hdr_t *src_hdr = self.header.ptr jpayne@68: cdef bcf_hdr_t *dst_hdr = dst_header.ptr jpayne@68: jpayne@68: if src_hdr != dst_hdr: jpayne@68: if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr): jpayne@68: msg = 'Cannot translate record. 
Number of samples does not match header ({} vs {})' jpayne@68: raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr))) jpayne@68: jpayne@68: bcf_translate(dst_hdr, src_hdr, self.ptr) jpayne@68: self.header = dst_header jpayne@68: jpayne@68: @property jpayne@68: def rid(self): jpayne@68: """internal reference id number""" jpayne@68: return self.ptr.rid jpayne@68: jpayne@68: @rid.setter jpayne@68: def rid(self, value): jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef int r = value jpayne@68: if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val: jpayne@68: raise ValueError('invalid reference id') jpayne@68: self.ptr.rid = r jpayne@68: jpayne@68: @property jpayne@68: def chrom(self): jpayne@68: """chromosome/contig name""" jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef int rid = self.ptr.rid jpayne@68: if rid < 0 or rid >= hdr.n[BCF_DT_CTG]: jpayne@68: raise ValueError('Invalid header') jpayne@68: return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid)) jpayne@68: jpayne@68: @chrom.setter jpayne@68: def chrom(self, value): jpayne@68: cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG] jpayne@68: bchrom = force_bytes(value) jpayne@68: cdef khint_t k = kh_get_vdict(d, bchrom) jpayne@68: if k == kh_end(d): jpayne@68: raise ValueError('Invalid chromosome/contig') jpayne@68: self.ptr.rid = kh_val_vdict(d, k).id jpayne@68: jpayne@68: @property jpayne@68: def contig(self): jpayne@68: """chromosome/contig name""" jpayne@68: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@68: cdef int rid = self.ptr.rid jpayne@68: if rid < 0 or rid >= hdr.n[BCF_DT_CTG]: jpayne@68: raise ValueError('Invalid header') jpayne@68: return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid)) jpayne@68: jpayne@68: @contig.setter jpayne@68: def contig(self, value): jpayne@68: cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG] jpayne@68: bchrom = force_bytes(value) jpayne@68: cdef khint_t k = kh_get_vdict(d, bchrom) jpayne@68: if k == kh_end(d): jpayne@68: raise ValueError('Invalid chromosome/contig') jpayne@68: self.ptr.rid = kh_val_vdict(d, k).id jpayne@68: jpayne@68: @property jpayne@68: def pos(self): jpayne@68: """record start position on chrom/contig (1-based inclusive)""" jpayne@68: return self.ptr.pos + 1 jpayne@68: jpayne@68: @pos.setter jpayne@68: def pos(self, value): jpayne@68: cdef int p = value jpayne@68: if p < 1: jpayne@68: raise ValueError('Position must be positive') jpayne@68: self.ptr.pos = p - 1 jpayne@68: bcf_sync_end(self) jpayne@68: jpayne@68: @property jpayne@68: def start(self): jpayne@68: """record start position on chrom/contig (0-based inclusive)""" jpayne@68: return self.ptr.pos jpayne@68: jpayne@68: @start.setter jpayne@68: def start(self, value): jpayne@68: cdef int s = value jpayne@68: if s < 0: jpayne@68: raise ValueError('Start coordinate must be non-negative') jpayne@68: self.ptr.pos = s jpayne@68: bcf_sync_end(self) jpayne@68: jpayne@68: @property jpayne@68: def stop(self): jpayne@68: """record stop position on chrom/contig (0-based exclusive)""" jpayne@68: return self.ptr.pos + self.ptr.rlen jpayne@68: jpayne@68: @stop.setter jpayne@68: def stop(self, value): jpayne@68: cdef int s = value jpayne@68: if s < 0: jpayne@68: raise ValueError('Stop coordinate must be non-negative') jpayne@68: self.ptr.rlen = s - self.ptr.pos jpayne@68: bcf_sync_end(self) jpayne@68: jpayne@68: @property jpayne@68: def rlen(self): jpayne@68: """record length on chrom/contig (aka rec.stop - rec.start)""" jpayne@68: return self.ptr.rlen jpayne@68: 
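    # Worked example of the coordinate conventions above: for a record
    # covering the first ten bases of its contig one would expect
    #     rec.start == 0, rec.pos == 1, rec.stop == 10, rec.rlen == 10
    # since `pos` is 1-based inclusive while `start`/`stop` are 0-based
    # half-open (rlen == stop - start).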
jpayne@68: @rlen.setter jpayne@68: def rlen(self, value): jpayne@68: cdef int r = value jpayne@68: self.ptr.rlen = r jpayne@68: bcf_sync_end(self) jpayne@68: jpayne@68: @property jpayne@68: def qual(self): jpayne@68: """phred scaled quality score or None if not available""" jpayne@68: return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None jpayne@68: jpayne@68: @qual.setter jpayne@68: def qual(self, value): jpayne@68: if value is not None: jpayne@68: self.ptr.qual = value jpayne@68: else: jpayne@68: bcf_float_set(&self.ptr.qual, bcf_float_missing) jpayne@68: jpayne@68: jpayne@68: # @property jpayne@68: # def n_allele(self): jpayne@68: # return self.ptr.n_allele jpayne@68: jpayne@68: # @property jpayne@68: # def n_sample(self): jpayne@68: # return self.ptr.n_sample jpayne@68: jpayne@68: @property jpayne@68: def id(self): jpayne@68: """record identifier or None if not available""" jpayne@68: cdef bcf1_t *r = self.ptr jpayne@68: if bcf_unpack(r, BCF_UN_STR) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: # causes a memory leak https://github.com/pysam-developers/pysam/issues/773 jpayne@68: # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None jpayne@68: if (r.d.m_id == 0): jpayne@68: raise ValueError('Error extracting ID') jpayne@68: return charptr_to_str(r.d.id) if r.d.id != b'.' else None jpayne@68: jpayne@68: @id.setter jpayne@68: def id(self, value): jpayne@68: cdef bcf1_t *r = self.ptr jpayne@68: if bcf_unpack(r, BCF_UN_STR) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: cdef char *idstr = NULL jpayne@68: if value is not None: jpayne@68: bid = force_bytes(value) jpayne@68: idstr = bid jpayne@68: if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0: jpayne@68: raise ValueError('Error updating id') jpayne@68: jpayne@68: @property jpayne@68: def ref(self): jpayne@68: """reference allele""" jpayne@68: cdef bcf1_t *r = self.ptr jpayne@68: if bcf_unpack(r, BCF_UN_STR) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: return charptr_to_str(r.d.allele[0]) if r.d.allele else None jpayne@68: jpayne@68: @ref.setter jpayne@68: def ref(self, value): jpayne@68: cdef bcf1_t *r = self.ptr jpayne@68: if bcf_unpack(r, BCF_UN_STR) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: #FIXME: Set alleles directly -- this is stupid jpayne@68: if not value: jpayne@68: raise ValueError('ref allele must not be null') jpayne@68: value = force_bytes(value) jpayne@68: if r.d.allele and r.n_allele: jpayne@68: alleles = [r.d.allele[i] for i in range(r.n_allele)] jpayne@68: alleles[0] = value jpayne@68: else: jpayne@68: alleles = [value, ''] jpayne@68: self.alleles = alleles jpayne@68: bcf_sync_end(self) jpayne@68: jpayne@68: @property jpayne@68: def alleles(self): jpayne@68: """tuple of reference allele followed by alt alleles""" jpayne@68: cdef bcf1_t *r = self.ptr jpayne@68: if bcf_unpack(r, BCF_UN_STR) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: if not r.d.allele: jpayne@68: return None jpayne@68: cdef tuple res = PyTuple_New(r.n_allele) jpayne@68: for i in range(r.n_allele): jpayne@68: a = charptr_to_str(r.d.allele[i]) jpayne@68: PyTuple_SET_ITEM(res, i, a) jpayne@68: Py_INCREF(a) jpayne@68: return res jpayne@68: jpayne@68: @alleles.setter jpayne@68: def alleles(self, values): jpayne@68: cdef bcf1_t *r = self.ptr jpayne@68: jpayne@68: # Cache rlen of symbolic alleles before call to bcf_update_alleles_str jpayne@68: cdef int rlen = 
r.rlen jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_STR) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: values = [force_bytes(v) for v in values] jpayne@68: jpayne@68: if len(values) < 2: jpayne@68: raise ValueError('must set at least 2 alleles') jpayne@68: jpayne@68: if b'' in values: jpayne@68: raise ValueError('cannot set null allele') jpayne@68: jpayne@68: value = b','.join(values) jpayne@68: jpayne@68: if bcf_update_alleles_str(self.header.ptr, r, value) < 0: jpayne@68: raise ValueError('Error updating alleles') jpayne@68: jpayne@68: # Reset rlen if alternate allele isn't symbolic, otherwise used cached jpayne@68: if has_symbolic_allele(self): jpayne@68: self.ptr.rlen = rlen jpayne@68: else: jpayne@68: self.ptr.rlen = len(values[0]) jpayne@68: r.d.var_type = -1 jpayne@68: bcf_sync_end(self) jpayne@68: jpayne@68: @property jpayne@68: def alts(self): jpayne@68: """tuple of alt alleles""" jpayne@68: cdef bcf1_t *r = self.ptr jpayne@68: if bcf_unpack(r, BCF_UN_STR) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: if r.n_allele < 2 or not r.d.allele: jpayne@68: return None jpayne@68: cdef tuple res = PyTuple_New(r.n_allele - 1) jpayne@68: for i in range(1, r.n_allele): jpayne@68: a = charptr_to_str(r.d.allele[i]) jpayne@68: PyTuple_SET_ITEM(res, i - 1, a) jpayne@68: Py_INCREF(a) jpayne@68: return res jpayne@68: jpayne@68: @alts.setter jpayne@68: def alts(self, value): jpayne@68: #FIXME: Set alleles directly -- this is stupid jpayne@68: cdef bcf1_t *r = self.ptr jpayne@68: if bcf_unpack(r, BCF_UN_STR) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: value = [force_bytes(v) for v in value] jpayne@68: if b'' in value: jpayne@68: raise ValueError('cannot set null alt allele') jpayne@68: ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.'] jpayne@68: self.alleles = ref + value jpayne@68: r.d.var_type = -1 jpayne@68: jpayne@68: @property jpayne@68: def filter(self): jpayne@68: """filter information (see :class:`VariantRecordFilter`)""" jpayne@68: if bcf_unpack(self.ptr, BCF_UN_FLT) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: return makeVariantRecordFilter(self) jpayne@68: jpayne@68: @property jpayne@68: def info(self): jpayne@68: """info data (see :class:`VariantRecordInfo`)""" jpayne@68: if bcf_unpack(self.ptr, BCF_UN_INFO) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: return makeVariantRecordInfo(self) jpayne@68: jpayne@68: @property jpayne@68: def format(self): jpayne@68: """sample format metadata (see :class:`VariantRecordFormat`)""" jpayne@68: if bcf_unpack(self.ptr, BCF_UN_FMT) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: return makeVariantRecordFormat(self) jpayne@68: jpayne@68: @property jpayne@68: def samples(self): jpayne@68: """sample data (see :class:`VariantRecordSamples`)""" jpayne@68: if bcf_unpack(self.ptr, BCF_UN_ALL) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: return makeVariantRecordSamples(self) jpayne@68: jpayne@68: property alleles_variant_types: jpayne@68: def __get__(self): jpayne@68: cdef bcf1_t *r = self.ptr jpayne@68: cdef tuple result = PyTuple_New(r.n_allele) jpayne@68: jpayne@68: for i in range(r.n_allele): jpayne@68: tp = bcf_get_variant_type(r, i) jpayne@68: jpayne@68: if tp == VCF_REF: jpayne@68: v_type = "REF" jpayne@68: elif tp == VCF_SNP: jpayne@68: v_type = "SNP" jpayne@68: elif tp == VCF_MNP: jpayne@68: v_type = "MNP" 
jpayne@68: elif tp == VCF_INDEL: jpayne@68: v_type = "INDEL" jpayne@68: elif tp == VCF_BND: jpayne@68: v_type = "BND" jpayne@68: elif tp == VCF_OVERLAP: jpayne@68: v_type = "OVERLAP" jpayne@68: else: jpayne@68: v_type = "OTHER" jpayne@68: jpayne@68: PyTuple_SET_ITEM(result, i, v_type) jpayne@68: Py_INCREF(v_type) jpayne@68: jpayne@68: return result jpayne@68: jpayne@68: def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op): jpayne@68: if op != 2 and op != 3: jpayne@68: return NotImplemented jpayne@68: jpayne@68: cdef bcf1_t *s = self.ptr jpayne@68: cdef bcf1_t *o = other.ptr jpayne@68: jpayne@68: cdef bint cmp = self is other or ( jpayne@68: s.pos == o.pos jpayne@68: and s.rlen == o.rlen jpayne@68: and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual)) jpayne@68: or s.qual == o.qual) jpayne@68: and s.n_sample == o.n_sample jpayne@68: and s.n_allele == o.n_allele jpayne@68: and self.contig == other.contig jpayne@68: and self.alleles == other.alleles jpayne@68: and self.id == other.id jpayne@68: and self.info == other.info jpayne@68: and self.filter == other.filter jpayne@68: and self.samples == other.samples) jpayne@68: jpayne@68: if op == 3: jpayne@68: cmp = not cmp jpayne@68: jpayne@68: return cmp jpayne@68: jpayne@68: def __str__(self): jpayne@68: cdef kstring_t line jpayne@68: cdef char c jpayne@68: jpayne@68: line.l = line.m = 0 jpayne@68: line.s = NULL jpayne@68: jpayne@68: if vcf_format(self.header.ptr, self.ptr, &line) < 0: jpayne@68: if line.m: jpayne@68: free(line.s) jpayne@68: raise ValueError('vcf_format failed') jpayne@68: jpayne@68: # Strip CR/LF? jpayne@68: #while line.l: jpayne@68: # c = line.s[line.l - 1] jpayne@68: # if c != b'\n' and c != b'\r': jpayne@68: # break jpayne@68: # line.l -= 1 jpayne@68: jpayne@68: ret = charptr_to_str_w_len(line.s, line.l) jpayne@68: jpayne@68: if line.m: jpayne@68: free(line.s) jpayne@68: jpayne@68: return ret jpayne@68: jpayne@68: jpayne@68: cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r): jpayne@68: if not header: jpayne@68: raise ValueError('invalid VariantHeader') jpayne@68: jpayne@68: if not r: jpayne@68: raise ValueError('cannot create VariantRecord') jpayne@68: jpayne@68: if r.errcode: jpayne@68: msg = [] jpayne@68: #if r.errcode & BCF_ERR_CTG_UNDEF: jpayne@68: # msg.append('undefined contig') jpayne@68: #if r.errcode & BCF_ERR_TAG_UNDEF: jpayne@68: # msg.append('undefined tag') jpayne@68: if r.errcode & BCF_ERR_NCOLS: jpayne@68: msg.append('invalid number of columns') jpayne@68: if r.errcode & BCF_ERR_LIMITS: jpayne@68: msg.append('limits violated') jpayne@68: if r.errcode & BCF_ERR_CHAR: jpayne@68: msg.append('invalid character found') jpayne@68: if r.errcode & BCF_ERR_CTG_INVALID: jpayne@68: msg.append('invalid contig') jpayne@68: if r.errcode & BCF_ERR_TAG_INVALID: jpayne@68: msg.append('invalid tag') jpayne@68: jpayne@68: if msg: jpayne@68: msg = ', '.join(msg) jpayne@68: raise ValueError('Error(s) reading record: {}'.format(msg)) jpayne@68: jpayne@68: cdef VariantRecord record = VariantRecord.__new__(VariantRecord) jpayne@68: record.header = header jpayne@68: record.ptr = r jpayne@68: jpayne@68: return record jpayne@68: jpayne@68: jpayne@68: ######################################################################## jpayne@68: ######################################################################## jpayne@68: ## Variant Sampletype object jpayne@68: ######################################################################## jpayne@68: jpayne@68: jpayne@68: cdef class 
VariantRecordSample(object):
jpayne@68:     """Data for a single sample from a :class:`VariantRecord` object.
jpayne@68:     Provides data accessors for genotypes and a mapping interface
jpayne@68:     from format name to values.
jpayne@68:     """
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def name(self):
jpayne@68:         """sample name"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int32_t n = r.n_sample
jpayne@68: 
jpayne@68:         if self.index < 0 or self.index >= n:
jpayne@68:             raise ValueError('invalid sample index')
jpayne@68: 
jpayne@68:         return charptr_to_str(hdr.samples[self.index])
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def allele_indices(self):
jpayne@68:         """allele indices for called genotype, if present. Otherwise None"""
jpayne@68:         return bcf_format_get_allele_indices(self)
jpayne@68: 
jpayne@68:     @allele_indices.setter
jpayne@68:     def allele_indices(self, value):
jpayne@68:         self['GT'] = value
jpayne@68: 
jpayne@68:     @allele_indices.deleter
jpayne@68:     def allele_indices(self):
jpayne@68:         self['GT'] = ()
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def alleles(self):
jpayne@68:         """alleles for called genotype, if present. Otherwise None"""
jpayne@68:         return bcf_format_get_alleles(self)
jpayne@68: 
jpayne@68:     @alleles.setter
jpayne@68:     def alleles(self, value):
jpayne@68:         # Sets the genotype: supply a tuple of alleles to set.
jpayne@68:         # The supplied alleles need to be defined in the corresponding pysam.libcbcf.VariantRecord
jpayne@68:         # The genotype is reset when an empty tuple, None or (None,) is supplied
jpayne@68: 
jpayne@68:         if value == (None,) or value == tuple() or value is None:
jpayne@68:             self['GT'] = ()
jpayne@68:             return
jpayne@68: 
jpayne@68:         if any(type(x) == int for x in value):
jpayne@68:             raise ValueError('Use .allele_indices to set integer allele indices')
jpayne@68: 
jpayne@68:         # determine and set allele indices:
jpayne@68:         try:
jpayne@68:             self['GT'] = tuple(self.record.alleles.index(allele) for allele in value)
jpayne@68:         except ValueError:
jpayne@68:             raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord. "
jpayne@68:                              "First set the .alleles of this record to define the alleles")
jpayne@68: 
jpayne@68:     @alleles.deleter
jpayne@68:     def alleles(self):
jpayne@68:         self['GT'] = ()
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def phased(self):
jpayne@68:         """False if genotype is missing or any allele is unphased.
Otherwise True.""" jpayne@68: return bcf_sample_get_phased(self) jpayne@68: jpayne@68: @phased.setter jpayne@68: def phased(self, value): jpayne@68: bcf_sample_set_phased(self, value) jpayne@68: jpayne@68: def __len__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int i, n = 0 jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_FMT) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: for i in range(r.n_fmt): jpayne@68: if r.d.fmt[i].p: jpayne@68: n += 1 jpayne@68: return n jpayne@68: jpayne@68: def __bool__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef int i jpayne@68: jpayne@68: if bcf_unpack(r, BCF_UN_FMT) < 0: jpayne@68: raise ValueError('Error unpacking VariantRecord') jpayne@68: jpayne@68: for i in range(r.n_fmt): jpayne@68: if r.d.fmt[i].p: jpayne@68: return True jpayne@68: return False jpayne@68: jpayne@68: def __getitem__(self, key): jpayne@68: return bcf_format_get_value(self, key) jpayne@68: jpayne@68: def __setitem__(self, key, value): jpayne@68: bcf_format_set_value(self, key, value) jpayne@68: jpayne@68: def __delitem__(self, key): jpayne@68: bcf_format_del_value(self, key) jpayne@68: jpayne@68: def clear(self): jpayne@68: """Clear all format data (including genotype) for this sample""" jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bcf_fmt_t *fmt jpayne@68: cdef int i jpayne@68: jpayne@68: for i in range(r.n_fmt): jpayne@68: fmt = &r.d.fmt[i] jpayne@68: if fmt.p: jpayne@68: bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bcf_fmt_t *fmt jpayne@68: cdef int i jpayne@68: jpayne@68: for i in range(r.n_fmt): jpayne@68: fmt = &r.d.fmt[i] jpayne@68: if r.d.fmt[i].p: jpayne@68: yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)) jpayne@68: jpayne@68: def get(self, key, default=None): jpayne@68: """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" jpayne@68: try: jpayne@68: return self[key] jpayne@68: except KeyError: jpayne@68: return default jpayne@68: jpayne@68: def __contains__(self, key): jpayne@68: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@68: cdef bcf1_t *r = self.record.ptr jpayne@68: cdef bytes bkey = force_bytes(key) jpayne@68: cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) jpayne@68: return fmt != NULL and fmt.p != NULL jpayne@68: jpayne@68: def iterkeys(self): jpayne@68: """D.iterkeys() -> an iterator over the keys of D""" jpayne@68: return iter(self) jpayne@68: jpayne@68: def itervalues(self): jpayne@68: """D.itervalues() -> an iterator over the values of D""" jpayne@68: for key in self: jpayne@68: yield self[key] jpayne@68: jpayne@68: def iteritems(self): jpayne@68: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@68: for key in self: jpayne@68: yield (key, self[key]) jpayne@68: jpayne@68: def keys(self): jpayne@68: """D.keys() -> list of D's keys""" jpayne@68: return list(self) jpayne@68: jpayne@68: def items(self): jpayne@68: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@68: return list(self.iteritems()) jpayne@68: jpayne@68: def values(self): jpayne@68: """D.values() -> list of D's values""" jpayne@68: return list(self.itervalues()) jpayne@68: jpayne@68: def update(self, items=None, **kwargs): jpayne@68: """D.update([E, ]**F) -> None. jpayne@68: jpayne@68: Update D from dict/iterable E and F. jpayne@68: """ jpayne@68: for k, v in items.items(): jpayne@68: self[k] = v jpayne@68: jpayne@68: if kwargs: jpayne@68: for k, v in kwargs.items(): jpayne@68: self[k] = v jpayne@68: jpayne@68: def pop(self, key, default=_nothing): jpayne@68: try: jpayne@68: value = self[key] jpayne@68: del self[key] jpayne@68: return value jpayne@68: except KeyError: jpayne@68: if default is not _nothing: jpayne@68: return default jpayne@68: raise jpayne@68: jpayne@68: def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op): jpayne@68: if op != 2 and op != 3: jpayne@68: return NotImplemented jpayne@68: jpayne@68: cdef bint cmp = dict(self) == dict(other) jpayne@68: jpayne@68: if op == 3: jpayne@68: cmp = not cmp jpayne@68: jpayne@68: return cmp jpayne@68: jpayne@68: # Mappings are not hashable by default, but subclasses can change this jpayne@68: __hash__ = None jpayne@68: jpayne@68: jpayne@68: cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index): jpayne@68: if not record or sample_index < 0: jpayne@68: raise ValueError('cannot create VariantRecordSample') jpayne@68: jpayne@68: cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample) jpayne@68: sample.record = record jpayne@68: sample.index = sample_index jpayne@68: jpayne@68: return sample jpayne@68: jpayne@68: jpayne@68: ######################################################################## jpayne@68: ######################################################################## jpayne@68: ## Index objects jpayne@68: ######################################################################## jpayne@68: jpayne@68: jpayne@68: cdef class BaseIndex(object): jpayne@68: def __init__(self): jpayne@68: self.refs = () jpayne@68: self.remap = {} jpayne@68: jpayne@68: def __len__(self): jpayne@68: return len(self.refs) jpayne@68: jpayne@68: def __bool__(self): jpayne@68: return len(self.refs) != 0 jpayne@68: jpayne@68: def __getitem__(self, key): jpayne@68: if isinstance(key, int): jpayne@68: return self.refs[key] jpayne@68: 
else: jpayne@68: return self.refmap[key] jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return iter(self.refs) jpayne@68: jpayne@68: def get(self, key, default=None): jpayne@68: """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" jpayne@68: try: jpayne@68: return self[key] jpayne@68: except KeyError: jpayne@68: return default jpayne@68: jpayne@68: def __contains__(self, key): jpayne@68: try: jpayne@68: self[key] jpayne@68: except KeyError: jpayne@68: return False jpayne@68: else: jpayne@68: return True jpayne@68: jpayne@68: def iterkeys(self): jpayne@68: """D.iterkeys() -> an iterator over the keys of D""" jpayne@68: return iter(self) jpayne@68: jpayne@68: def itervalues(self): jpayne@68: """D.itervalues() -> an iterator over the values of D""" jpayne@68: for key in self: jpayne@68: yield self[key] jpayne@68: jpayne@68: def iteritems(self): jpayne@68: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@68: for key in self: jpayne@68: yield (key, self[key]) jpayne@68: jpayne@68: def keys(self): jpayne@68: """D.keys() -> list of D's keys""" jpayne@68: return list(self) jpayne@68: jpayne@68: def items(self): jpayne@68: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@68: return list(self.iteritems()) jpayne@68: jpayne@68: def values(self): jpayne@68: """D.values() -> list of D's values""" jpayne@68: return list(self.itervalues()) jpayne@68: jpayne@68: def update(self, items=None, **kwargs): jpayne@68: """D.update([E, ]**F) -> None. jpayne@68: jpayne@68: Update D from dict/iterable E and F. jpayne@68: """ jpayne@68: for k, v in items.items(): jpayne@68: self[k] = v jpayne@68: jpayne@68: if kwargs: jpayne@68: for k, v in kwargs.items(): jpayne@68: self[k] = v jpayne@68: jpayne@68: def pop(self, key, default=_nothing): jpayne@68: try: jpayne@68: value = self[key] jpayne@68: del self[key] jpayne@68: return value jpayne@68: except KeyError: jpayne@68: if default is not _nothing: jpayne@68: return default jpayne@68: raise jpayne@68: jpayne@68: # Mappings are not hashable by default, but subclasses can change this jpayne@68: __hash__ = None jpayne@68: jpayne@68: #TODO: implement __richcmp__ jpayne@68: jpayne@68: jpayne@68: cdef class BCFIndex(object): jpayne@68: """CSI index data structure for BCF files""" jpayne@68: def __init__(self): jpayne@68: self.refs = () jpayne@68: self.refmap = {} jpayne@68: jpayne@68: if not self.ptr: jpayne@68: raise ValueError('Invalid index object') jpayne@68: jpayne@68: cdef int n jpayne@68: cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n) jpayne@68: jpayne@68: self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else () jpayne@68: self.refmap = { r:i for i,r in enumerate(self.refs) } jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: if self.ptr: jpayne@68: hts_idx_destroy(self.ptr) jpayne@68: self.ptr = NULL jpayne@68: jpayne@68: def fetch(self, bcf, contig, start, stop, reopen): jpayne@68: return BCFIterator(bcf, contig, start, stop, reopen) jpayne@68: jpayne@68: jpayne@68: cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx): jpayne@68: if not idx: jpayne@68: return None jpayne@68: jpayne@68: if not header: jpayne@68: raise ValueError('invalid VariantHeader') jpayne@68: jpayne@68: cdef BCFIndex index = BCFIndex.__new__(BCFIndex) jpayne@68: index.header = header jpayne@68: index.ptr = idx jpayne@68: index.__init__() jpayne@68: jpayne@68: return index jpayne@68: jpayne@68: jpayne@68: cdef class TabixIndex(BaseIndex): jpayne@68: """Tabix index data 
structure for VCF files""" jpayne@68: def __init__(self): jpayne@68: self.refs = () jpayne@68: self.refmap = {} jpayne@68: jpayne@68: if not self.ptr: jpayne@68: raise ValueError('Invalid index object') jpayne@68: jpayne@68: cdef int n jpayne@68: cdef const char **refs = tbx_seqnames(self.ptr, &n) jpayne@68: jpayne@68: self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else () jpayne@68: self.refmap = { r:i for i,r in enumerate(self.refs) } jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: if self.ptr: jpayne@68: tbx_destroy(self.ptr) jpayne@68: self.ptr = NULL jpayne@68: jpayne@68: def fetch(self, bcf, contig, start, stop, reopen): jpayne@68: return TabixIterator(bcf, contig, start, stop, reopen) jpayne@68: jpayne@68: jpayne@68: cdef TabixIndex makeTabixIndex(tbx_t *idx): jpayne@68: if not idx: jpayne@68: return None jpayne@68: jpayne@68: cdef TabixIndex index = TabixIndex.__new__(TabixIndex) jpayne@68: index.ptr = idx jpayne@68: index.__init__() jpayne@68: jpayne@68: return index jpayne@68: jpayne@68: jpayne@68: ######################################################################## jpayne@68: ######################################################################## jpayne@68: ## Iterators jpayne@68: ######################################################################## jpayne@68: jpayne@68: jpayne@68: cdef class BaseIterator(object): jpayne@68: pass jpayne@68: jpayne@68: jpayne@68: # Internal function to clean up after iteration stop or failure. jpayne@68: # This would be a nested function if it weren't a cdef function. jpayne@68: cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record): jpayne@68: bcf_destroy1(record) jpayne@68: jpayne@68: # destroy iter so future calls to __next__ raise StopIteration jpayne@68: bcf_itr_destroy(self.iter) jpayne@68: self.iter = NULL jpayne@68: jpayne@68: jpayne@68: cdef class BCFIterator(BaseIterator): jpayne@68: def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True): jpayne@68: if bcf is None: jpayne@68: raise ValueError('bcf must not be None') jpayne@68: jpayne@68: if contig is None: jpayne@68: raise ValueError('contig must be specified') jpayne@68: jpayne@68: if not isinstance(bcf.index, BCFIndex): jpayne@68: raise ValueError('bcf index required') jpayne@68: jpayne@68: cdef BCFIndex index = bcf.index jpayne@68: jpayne@68: self.bcf = bcf jpayne@68: self.index = index jpayne@68: jpayne@68: cdef int rid, cstart, cstop jpayne@68: jpayne@68: try: jpayne@68: rid = index.refmap[contig] jpayne@68: except KeyError: jpayne@68: # A query for a non-existent contig yields an empty iterator, does not raise an error jpayne@68: self.iter = NULL jpayne@68: return jpayne@68: jpayne@68: if reopen: jpayne@68: self.bcf = self.bcf.copy() jpayne@68: jpayne@68: cstart = start if start is not None else 0 jpayne@68: cstop = stop if stop is not None else MAX_POS jpayne@68: jpayne@68: with nogil: jpayne@68: self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop) jpayne@68: jpayne@68: if not self.iter: jpayne@68: if errno: jpayne@68: raise IOError(errno, strerror(errno)) jpayne@68: else: jpayne@68: raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop)) jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: if self.iter: jpayne@68: bcf_itr_destroy(self.iter) jpayne@68: self.iter = NULL jpayne@68: jpayne@68: def __iter__(self): jpayne@68: return self jpayne@68: jpayne@68: def __next__(self): jpayne@68: if not self.iter: jpayne@68: raise StopIteration jpayne@68: jpayne@68: cdef bcf1_t *record = bcf_init1() 
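jpayne@68:         # The bcf1_t allocated here is handed to the VariantRecord returned
jpayne@68:         # below; on any error path it is freed via _stop_BCFIterator(), which
jpayne@68:         # also destroys the iterator so later __next__ calls raise StopIteration.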
jpayne@68: 
jpayne@68:         if not record:
jpayne@68:             raise MemoryError('unable to allocate BCF record')
jpayne@68: 
jpayne@68:         record.pos = -1
jpayne@68:         if self.bcf.drop_samples:
jpayne@68:             record.max_unpack = BCF_UN_SHR
jpayne@68: 
jpayne@68:         cdef int ret
jpayne@68: 
jpayne@68:         with nogil:
jpayne@68:             ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             _stop_BCFIterator(self, record)
jpayne@68:             if ret == -1:
jpayne@68:                 raise StopIteration
jpayne@68:             elif ret == -2:
jpayne@68:                 raise IOError('truncated file')
jpayne@68:             elif errno:
jpayne@68:                 raise IOError(errno, strerror(errno))
jpayne@68:             else:
jpayne@68:                 raise IOError('unable to fetch next record')
jpayne@68: 
jpayne@68:         ret = bcf_subset_format(self.bcf.header.ptr, record)
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             _stop_BCFIterator(self, record)
jpayne@68:             raise ValueError('error in bcf_subset_format')
jpayne@68: 
jpayne@68:         return makeVariantRecord(self.bcf.header, record)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class TabixIterator(BaseIterator):
jpayne@68:     def __cinit__(self, *args, **kwargs):
jpayne@68:         self.line_buffer.l = 0
jpayne@68:         self.line_buffer.m = 0
jpayne@68:         self.line_buffer.s = NULL
jpayne@68: 
jpayne@68:     def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
jpayne@68:         if bcf is None:
jpayne@68:             raise ValueError('bcf must not be None')
jpayne@68: 
jpayne@68:         if not isinstance(bcf.index, TabixIndex):
jpayne@68:             raise ValueError('tabix index required')
jpayne@68: 
jpayne@68:         cdef TabixIndex index = bcf.index
jpayne@68: 
jpayne@68:         self.bcf = bcf
jpayne@68:         self.index = index
jpayne@68: 
jpayne@68:         cdef int rid, cstart, cstop
jpayne@68: 
jpayne@68:         try:
jpayne@68:             rid = index.refmap[contig]
jpayne@68:         except KeyError:
jpayne@68:             # A query for a non-existent contig yields an empty iterator rather than raising an error
jpayne@68:             self.iter = NULL
jpayne@68:             return
jpayne@68: 
jpayne@68:         if reopen:
jpayne@68:             self.bcf = self.bcf.copy()
jpayne@68: 
jpayne@68:         cstart = start if start is not None else 0
jpayne@68:         cstop = stop if stop is not None else MAX_POS
jpayne@68: 
jpayne@68:         # start/stop may be None; use the clamped cstart/cstop computed above
jpayne@68:         self.iter = tbx_itr_queryi(index.ptr, rid, cstart, cstop)
jpayne@68: 
jpayne@68:         if not self.iter:
jpayne@68:             if errno:
jpayne@68:                 raise IOError(errno, strerror(errno))
jpayne@68:             else:
jpayne@68:                 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
jpayne@68: 
jpayne@68:     def __dealloc__(self):
jpayne@68:         if self.iter:
jpayne@68:             tbx_itr_destroy(self.iter)
jpayne@68:             self.iter = NULL
jpayne@68: 
jpayne@68:         if self.line_buffer.m:
jpayne@68:             free(self.line_buffer.s)
jpayne@68: 
jpayne@68:         self.line_buffer.l = 0
jpayne@68:         self.line_buffer.m = 0
jpayne@68:         self.line_buffer.s = NULL
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         return self
jpayne@68: 
jpayne@68:     def __next__(self):
jpayne@68:         if not self.iter:
jpayne@68:             raise StopIteration
jpayne@68: 
jpayne@68:         cdef int ret
jpayne@68: 
jpayne@68:         with nogil:
jpayne@68:             ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             tbx_itr_destroy(self.iter)
jpayne@68:             self.iter = NULL
jpayne@68:             if ret == -1:
jpayne@68:                 raise StopIteration
jpayne@68:             elif ret == -2:
jpayne@68:                 raise IOError('truncated file')
jpayne@68:             elif errno:
jpayne@68:                 raise IOError(errno, strerror(errno))
jpayne@68:             else:
jpayne@68:                 raise IOError('unable to fetch next record')
jpayne@68: 
jpayne@68:         cdef bcf1_t *record = bcf_init1()
jpayne@68: 
jpayne@68:         if not record:
jpayne@68:             raise 
MemoryError('unable to allocate BCF record') jpayne@68: jpayne@68: record.pos = -1 jpayne@68: if self.bcf.drop_samples: jpayne@68: record.max_unpack = BCF_UN_SHR jpayne@68: jpayne@68: ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record) jpayne@68: jpayne@68: # FIXME: stop iteration on parse failure? jpayne@68: if ret < 0: jpayne@68: bcf_destroy1(record) jpayne@68: raise ValueError('error in vcf_parse') jpayne@68: jpayne@68: return makeVariantRecord(self.bcf.header, record) jpayne@68: jpayne@68: jpayne@68: ######################################################################## jpayne@68: ######################################################################## jpayne@68: ## Variant File jpayne@68: ######################################################################## jpayne@68: jpayne@68: jpayne@68: cdef class VariantFile(HTSFile): jpayne@68: """*(filename, mode=None, index_filename=None, header=None, drop_samples=False, jpayne@68: duplicate_filehandle=True, ignore_truncation=False, threads=1)* jpayne@68: jpayne@68: A :term:`VCF`/:term:`BCF` formatted file. The file is automatically jpayne@68: opened. jpayne@68: jpayne@68: If an index for a variant file exists (.csi or .tbi), it will be jpayne@68: opened automatically. Without an index random access to records jpayne@68: via :meth:`fetch` is disabled. jpayne@68: jpayne@68: For writing, a :class:`VariantHeader` object must be provided, jpayne@68: typically obtained from another :term:`VCF` file/:term:`BCF` jpayne@68: file. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: mode : string jpayne@68: *mode* should be ``r`` for reading or ``w`` for writing. The default is jpayne@68: text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append jpayne@68: ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output. jpayne@68: jpayne@68: If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid jpayne@68: modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``. jpayne@68: For instance, to open a :term:`BCF` formatted file for reading, type:: jpayne@68: jpayne@68: f = pysam.VariantFile('ex1.bcf','r') jpayne@68: jpayne@68: If mode is not specified, we will try to auto-detect the file type. All jpayne@68: of the following should work:: jpayne@68: jpayne@68: f1 = pysam.VariantFile('ex1.bcf') jpayne@68: f2 = pysam.VariantFile('ex1.vcf') jpayne@68: f3 = pysam.VariantFile('ex1.vcf.gz') jpayne@68: jpayne@68: index_filename : string jpayne@68: Explicit path to an index file. jpayne@68: jpayne@68: header : VariantHeader jpayne@68: :class:`VariantHeader` object required for writing. jpayne@68: jpayne@68: drop_samples: bool jpayne@68: Ignore sample information when reading. jpayne@68: jpayne@68: duplicate_filehandle: bool jpayne@68: By default, file handles passed either directly or through jpayne@68: File-like objects will be duplicated before passing them to jpayne@68: htslib. The duplication prevents issues where the same stream jpayne@68: will be closed by htslib and through destruction of the jpayne@68: high-level python object. Set to False to turn off jpayne@68: duplication. jpayne@68: jpayne@68: ignore_truncation: bool jpayne@68: Issue a warning, instead of raising an error if the current file jpayne@68: appears to be truncated due to a missing EOF marker. Only applies jpayne@68: to bgzipped formats. (Default=False) jpayne@68: jpayne@68: threads: integer jpayne@68: Number of threads to use for compressing/decompressing VCF/BCF files. 
jpayne@68: Setting threads to > 1 cannot be combined with `ignore_truncation`. jpayne@68: (Default=1) jpayne@68: jpayne@68: """ jpayne@68: def __cinit__(self, *args, **kwargs): jpayne@68: self.htsfile = NULL jpayne@68: jpayne@68: def __init__(self, *args, **kwargs): jpayne@68: self.header = None jpayne@68: self.index = None jpayne@68: self.filename = None jpayne@68: self.mode = None jpayne@68: self.threads = 1 jpayne@68: self.index_filename = None jpayne@68: self.is_stream = False jpayne@68: self.is_remote = False jpayne@68: self.is_reading = False jpayne@68: self.drop_samples = False jpayne@68: self.header_written = False jpayne@68: self.start_offset = -1 jpayne@68: jpayne@68: self.open(*args, **kwargs) jpayne@68: jpayne@68: def __dealloc__(self): jpayne@68: if not self.htsfile or not self.header: jpayne@68: return jpayne@68: jpayne@68: # Write header if no records were written jpayne@68: if self.htsfile.is_write and not self.header_written: jpayne@68: with nogil: jpayne@68: bcf_hdr_write(self.htsfile, self.header.ptr) jpayne@68: jpayne@68: cdef int ret = hts_close(self.htsfile) jpayne@68: self.htsfile = NULL jpayne@68: self.header = self.index = None jpayne@68: jpayne@68: if ret < 0: jpayne@68: global errno jpayne@68: if errno == EPIPE: jpayne@68: errno = 0 jpayne@68: else: jpayne@68: raise IOError(errno, force_str(strerror(errno))) jpayne@68: jpayne@68: def close(self): jpayne@68: """closes the :class:`pysam.VariantFile`.""" jpayne@68: if not self.htsfile: jpayne@68: return jpayne@68: jpayne@68: # Write header if no records were written jpayne@68: if self.htsfile.is_write and not self.header_written: jpayne@68: with nogil: jpayne@68: bcf_hdr_write(self.htsfile, self.header.ptr) jpayne@68: jpayne@68: cdef int ret = hts_close(self.htsfile) jpayne@68: self.htsfile = NULL jpayne@68: self.header = self.index = None jpayne@68: jpayne@68: if ret < 0: jpayne@68: global errno jpayne@68: if errno == EPIPE: jpayne@68: errno = 0 jpayne@68: else: jpayne@68: raise IOError(errno, force_str(strerror(errno))) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: if not self.is_open: jpayne@68: raise ValueError('I/O operation on closed file') jpayne@68: jpayne@68: if self.htsfile.is_write: jpayne@68: raise ValueError('cannot iterate over Variantfile opened for writing') jpayne@68: jpayne@68: self.is_reading = 1 jpayne@68: return self jpayne@68: jpayne@68: def __next__(self): jpayne@68: cdef int ret jpayne@68: cdef int errcode jpayne@68: cdef bcf1_t *record = bcf_init1() jpayne@68: jpayne@68: if not record: jpayne@68: raise MemoryError('unable to allocate BCF record') jpayne@68: jpayne@68: record.pos = -1 jpayne@68: if self.drop_samples: jpayne@68: record.max_unpack = BCF_UN_SHR jpayne@68: jpayne@68: with nogil: jpayne@68: ret = bcf_read1(self.htsfile, self.header.ptr, record) jpayne@68: jpayne@68: if ret < 0: jpayne@68: errcode = record.errcode jpayne@68: bcf_destroy1(record) jpayne@68: if errcode: jpayne@68: raise IOError('unable to parse next record') jpayne@68: if ret == -1: jpayne@68: raise StopIteration jpayne@68: elif ret == -2: jpayne@68: raise IOError('truncated file') jpayne@68: elif errno: jpayne@68: raise IOError(errno, strerror(errno)) jpayne@68: else: jpayne@68: raise IOError('unable to fetch next record') jpayne@68: jpayne@68: return makeVariantRecord(self.header, record) jpayne@68: jpayne@68: def copy(self): jpayne@68: if not self.is_open: jpayne@68: raise ValueError jpayne@68: jpayne@68: cdef VariantFile vars = VariantFile.__new__(VariantFile) jpayne@68: cdef bcf_hdr_t *hdr jpayne@68: 
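jpayne@68:         # copy() opens a second handle on the same path so that iterators
jpayne@68:         # created with fetch(..., reopen=True) get their own file position;
jpayne@68:         # the VariantHeader and index objects are re-used rather than rebuilt.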
jpayne@68: # FIXME: re-open using fd or else header and index could be invalid jpayne@68: vars.htsfile = self._open_htsfile() jpayne@68: jpayne@68: if not vars.htsfile: jpayne@68: raise ValueError('Cannot re-open htsfile') jpayne@68: jpayne@68: # minimize overhead by re-using header and index. This approach is jpayne@68: # currently risky, but see above for how this can be mitigated. jpayne@68: vars.header = self.header jpayne@68: vars.index = self.index jpayne@68: jpayne@68: vars.filename = self.filename jpayne@68: vars.mode = self.mode jpayne@68: vars.threads = self.threads jpayne@68: vars.index_filename = self.index_filename jpayne@68: vars.drop_samples = self.drop_samples jpayne@68: vars.is_stream = self.is_stream jpayne@68: vars.is_remote = self.is_remote jpayne@68: vars.is_reading = self.is_reading jpayne@68: vars.start_offset = self.start_offset jpayne@68: vars.header_written = self.header_written jpayne@68: jpayne@68: if self.htsfile.is_bin: jpayne@68: vars.seek(self.tell()) jpayne@68: else: jpayne@68: with nogil: jpayne@68: hdr = bcf_hdr_read(vars.htsfile) jpayne@68: makeVariantHeader(hdr) jpayne@68: jpayne@68: return vars jpayne@68: jpayne@68: def open(self, filename, mode='r', jpayne@68: index_filename=None, jpayne@68: VariantHeader header=None, jpayne@68: drop_samples=False, jpayne@68: duplicate_filehandle=True, jpayne@68: ignore_truncation=False, jpayne@68: threads=1): jpayne@68: """open a vcf/bcf file. jpayne@68: jpayne@68: If open is called on an existing VariantFile, the current file will be jpayne@68: closed and a new file will be opened. jpayne@68: """ jpayne@68: cdef bcf_hdr_t *hdr jpayne@68: cdef BGZF *bgzfp jpayne@68: cdef hts_idx_t *idx jpayne@68: cdef tbx_t *tidx jpayne@68: cdef char *cfilename jpayne@68: cdef char *cindex_filename = NULL jpayne@68: cdef char *cmode jpayne@68: jpayne@68: if threads > 1 and ignore_truncation: jpayne@68: # This won't raise errors if reaching a truncated alignment, jpayne@68: # because bgzf_mt_reader in htslib does not deal with jpayne@68: # bgzf_mt_read_block returning non-zero values, contrary jpayne@68: # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888) jpayne@68: # Better to avoid this (for now) than to produce seemingly correct results. 
jpayne@68: raise ValueError('Cannot add extra threads when "ignore_truncation" is True') jpayne@68: self.threads = threads jpayne@68: jpayne@68: # close a previously opened file jpayne@68: if self.is_open: jpayne@68: self.close() jpayne@68: jpayne@68: if not mode or mode[0] not in 'rwa': jpayne@68: raise ValueError('mode must begin with r, w or a') jpayne@68: jpayne@68: self.duplicate_filehandle = duplicate_filehandle jpayne@68: jpayne@68: format_modes = [m for m in mode[1:] if m in 'bcguz'] jpayne@68: if len(format_modes) > 1: jpayne@68: raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes))) jpayne@68: jpayne@68: invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex'] jpayne@68: if invalid_modes: jpayne@68: raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes))) jpayne@68: jpayne@68: # Autodetect mode from filename jpayne@68: if mode == 'w' and isinstance(filename, str): jpayne@68: if filename.endswith('.gz'): jpayne@68: mode = 'wz' jpayne@68: elif filename.endswith('.bcf'): jpayne@68: mode = 'wb' jpayne@68: jpayne@68: # for htslib, wbu seems to not work jpayne@68: if mode == 'wbu': jpayne@68: mode = 'wb0' jpayne@68: jpayne@68: self.mode = mode = force_bytes(mode) jpayne@68: try: jpayne@68: filename = encode_filename(filename) jpayne@68: self.is_remote = hisremote(filename) jpayne@68: self.is_stream = filename == b'-' jpayne@68: except TypeError: jpayne@68: filename = filename jpayne@68: self.is_remote = False jpayne@68: self.is_stream = True jpayne@68: jpayne@68: self.filename = filename jpayne@68: jpayne@68: if index_filename is not None: jpayne@68: self.index_filename = index_filename = encode_filename(index_filename) jpayne@68: else: jpayne@68: self.index_filename = None jpayne@68: jpayne@68: self.drop_samples = bool(drop_samples) jpayne@68: self.header = None jpayne@68: jpayne@68: self.header_written = False jpayne@68: jpayne@68: if mode.startswith(b'w'): jpayne@68: # open file for writing jpayne@68: if index_filename is not None: jpayne@68: raise ValueError('Cannot specify an index filename when writing a VCF/BCF file') jpayne@68: jpayne@68: # header structure (used for writing) jpayne@68: if header: jpayne@68: self.header = header.copy() jpayne@68: else: jpayne@68: self.header = VariantHeader() jpayne@68: #raise ValueError('a VariantHeader must be specified') jpayne@68: jpayne@68: # Header is not written until the first write or on close jpayne@68: self.htsfile = self._open_htsfile() jpayne@68: jpayne@68: if not self.htsfile: jpayne@68: raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode)) jpayne@68: jpayne@68: elif mode.startswith(b'r'): jpayne@68: # open file for reading jpayne@68: self.htsfile = self._open_htsfile() jpayne@68: jpayne@68: if not self.htsfile: jpayne@68: if errno: jpayne@68: raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno)))) jpayne@68: else: jpayne@68: raise ValueError('could not open variant file `{}`'.format(filename)) jpayne@68: jpayne@68: if self.htsfile.format.format not in (bcf, vcf): jpayne@68: raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode)) jpayne@68: jpayne@68: self.check_truncation(ignore_truncation) jpayne@68: jpayne@68: with nogil: jpayne@68: hdr = bcf_hdr_read(self.htsfile) jpayne@68: jpayne@68: try: jpayne@68: self.header = makeVariantHeader(hdr) jpayne@68: except ValueError: jpayne@68: raise ValueError('file `{}` does not have valid header 
(mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
jpayne@68: 
jpayne@68:             if isinstance(self.filename, bytes):
jpayne@68:                 cfilename = self.filename
jpayne@68:             else:
jpayne@68:                 cfilename = NULL
jpayne@68: 
jpayne@68:             # check for index and open if present
jpayne@68:             if self.htsfile.format.format == bcf and cfilename:
jpayne@68:                 if index_filename is not None:
jpayne@68:                     cindex_filename = index_filename
jpayne@68:                 with nogil:
jpayne@68:                     idx = bcf_index_load2(cfilename, cindex_filename)
jpayne@68:                 self.index = makeBCFIndex(self.header, idx)
jpayne@68: 
jpayne@68:             elif self.htsfile.format.compression == bgzf and cfilename:
jpayne@68:                 if index_filename is not None:
jpayne@68:                     cindex_filename = index_filename
jpayne@68:                 with nogil:
jpayne@68:                     tidx = tbx_index_load2(cfilename, cindex_filename)
jpayne@68:                 self.index = makeTabixIndex(tidx)
jpayne@68: 
jpayne@68:             if not self.is_stream:
jpayne@68:                 self.start_offset = self.tell()
jpayne@68:         else:
jpayne@68:             raise ValueError('unknown mode {}'.format(mode))
jpayne@68: 
jpayne@68:     def reset(self):
jpayne@68:         """reset file position to beginning of file just after the header."""
jpayne@68:         return self.seek(self.start_offset)
jpayne@68: 
jpayne@68:     def is_valid_tid(self, tid):
jpayne@68:         """
jpayne@68:         return True if the numerical :term:`tid` is valid; False otherwise.
jpayne@68:         """
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int rid = tid
jpayne@68:         return 0 <= rid < hdr.n[BCF_DT_CTG]
jpayne@68: 
jpayne@68:     def get_tid(self, reference):
jpayne@68:         """
jpayne@68:         return the numerical :term:`tid` corresponding to
jpayne@68:         :term:`reference`
jpayne@68: 
jpayne@68:         returns -1 if reference is not known.
jpayne@68:         """
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG]
jpayne@68:         reference = force_bytes(reference)
jpayne@68:         cdef khint_t k = kh_get_vdict(d, reference)
jpayne@68:         return kh_val_vdict(d, k).id if k != kh_end(d) else -1
jpayne@68: 
jpayne@68:     def get_reference_name(self, tid):
jpayne@68:         """
jpayne@68:         return :term:`reference` name corresponding to numerical :term:`tid`
jpayne@68:         """
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int rid = tid
jpayne@68:         if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
jpayne@68:             raise ValueError('Invalid tid')
jpayne@68:         return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
jpayne@68: 
jpayne@68:     def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
jpayne@68:         """fetch records in a :term:`region`, specified either by
jpayne@68:         :term:`contig`, *start*, and *end* (which are 0-based, half-open);
jpayne@68:         or alternatively by a samtools :term:`region` string (which is
jpayne@68:         1-based inclusive).
jpayne@68: 
jpayne@68:         Without *contig* or *region* all mapped records will be fetched. The
jpayne@68:         records will be returned ordered by contig, which will not necessarily
jpayne@68:         be the order within the file.
jpayne@68: 
jpayne@68:         Set *reopen* to True if you will be using multiple iterators on the
jpayne@68:         same file at the same time. The iterator returned will receive its
jpayne@68:         own copy of a filehandle to the file, effectively re-opening the
jpayne@68:         file. Re-opening a file incurs some overhead, so use with care.
jpayne@68: 
jpayne@68:         If only *contig* is set, all records on *contig* will be fetched.
jpayne@68:         If both *region* and *contig* are given, an exception is raised.
jpayne@68: 
jpayne@68:         Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
jpayne@68:         (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
jpayne@68:         read sequentially.
jpayne@68:         """
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         if self.htsfile.is_write:
jpayne@68:             raise ValueError('cannot fetch from a VariantFile opened for writing')
jpayne@68: 
jpayne@68:         if contig is None and region is None:
jpayne@68:             self.is_reading = 1
jpayne@68:             bcf = self.copy() if reopen else self
jpayne@68:             bcf.seek(self.start_offset)
jpayne@68:             return iter(bcf)
jpayne@68: 
jpayne@68:         if self.index is None:
jpayne@68:             raise ValueError('fetch requires an index')
jpayne@68: 
jpayne@68:         _, tid, start, stop = self.parse_region(contig, start, stop, region,
jpayne@68:                                                 None, end=end, reference=reference)
jpayne@68: 
jpayne@68:         if contig is None:
jpayne@68:             contig = self.get_reference_name(tid)
jpayne@68: 
jpayne@68:         self.is_reading = 1
jpayne@68:         return self.index.fetch(self, contig, start, stop, reopen)
jpayne@68: 
jpayne@68:     def new_record(self, *args, **kwargs):
jpayne@68:         """Create a new empty :class:`VariantRecord`.
jpayne@68: 
jpayne@68:         See :meth:`VariantHeader.new_record`
jpayne@68:         """
jpayne@68:         return self.header.new_record(*args, **kwargs)
jpayne@68: 
jpayne@68:     cpdef int write(self, VariantRecord record) except -1:
jpayne@68:         """
jpayne@68:         write a single :class:`pysam.VariantRecord` to disk.
jpayne@68: 
jpayne@68:         returns the number of bytes written.
jpayne@68:         """
jpayne@68:         if record is None:
jpayne@68:             raise ValueError('record must not be None')
jpayne@68: 
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         if not self.htsfile.is_write:
jpayne@68:             raise ValueError('cannot write to a VariantFile opened for reading')
jpayne@68: 
jpayne@68:         if not self.header_written:
jpayne@68:             self.header_written = True
jpayne@68:             with nogil:
jpayne@68:                 bcf_hdr_write(self.htsfile, self.header.ptr)
jpayne@68: 
jpayne@68:         #if record.header is not self.header:
jpayne@68:         #    record.translate(self.header)
jpayne@68:         #    raise ValueError('Writing records from a different VariantFile is not yet supported')
jpayne@68: 
jpayne@68:         if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
jpayne@68:             msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
jpayne@68:             raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
jpayne@68: 
jpayne@68:         # Sync END annotation before writing
jpayne@68:         bcf_sync_end(record)
jpayne@68: 
jpayne@68:         cdef int ret
jpayne@68: 
jpayne@68:         with nogil:
jpayne@68:             ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             raise IOError(errno, strerror(errno))
jpayne@68: 
jpayne@68:         return ret
jpayne@68: 
jpayne@68:     def subset_samples(self, include_samples):
jpayne@68:         """
jpayne@68:         Read only a subset of samples to reduce processing time and memory.
jpayne@68:         Must be called prior to retrieving records.
jpayne@68: """ jpayne@68: if not self.is_open: jpayne@68: raise ValueError('I/O operation on closed file') jpayne@68: jpayne@68: if self.htsfile.is_write: jpayne@68: raise ValueError('cannot subset samples from Variantfile opened for writing') jpayne@68: jpayne@68: if self.is_reading: jpayne@68: raise ValueError('cannot subset samples after fetching records') jpayne@68: jpayne@68: self.header._subset_samples(include_samples) jpayne@68: jpayne@68: # potentially unnecessary optimization that also sets max_unpack jpayne@68: if not include_samples: jpayne@68: self.drop_samples = True jpayne@68:
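jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ## Illustrative sketch of indexed access (not part of the module above).
jpayne@68: ## The file name, contig and coordinates are assumptions; any bgzipped,
jpayne@68: ## tabix/CSI-indexed VCF or CSI-indexed BCF will do.
jpayne@68: ##
jpayne@68: ##   from pysam import VariantFile
jpayne@68: ##
jpayne@68: ##   vcf_in  = VariantFile('example.vcf.gz')        # index opened automatically
jpayne@68: ##   vcf_out = VariantFile('region.vcf.gz', 'wz', header=vcf_in.header)
jpayne@68: ##
jpayne@68: ##   # fetch() takes 0-based, half-open coordinates
jpayne@68: ##   for rec in vcf_in.fetch('chr22', 1000000, 2000000):
jpayne@68: ##       if rec.qual is not None and rec.qual >= 30:
jpayne@68: ##           vcf_out.write(rec)
jpayne@68: ##
jpayne@68: ##   vcf_out.close()
jpayne@68: ##   vcf_in.close()
jpayne@68: ########################################################################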