jpayne@69: # cython: language_level=3 jpayne@69: # cython: embedsignature=True jpayne@69: # cython: profile=True jpayne@69: ############################################################################### jpayne@69: ############################################################################### jpayne@69: ## Cython wrapper for htslib VCF/BCF reader/writer jpayne@69: ############################################################################### jpayne@69: # jpayne@69: # NOTICE: This code is incomplete and preliminary. It offers a nearly jpayne@69: # complete Pythonic interface to VCF/BCF metadata and data with jpayne@69: # reading and writing capability. Documentation and a unit test suite jpayne@69: # are in the works. The code is best tested under Python 2, but jpayne@69: # should also work with Python 3. Please report any remaining jpayne@69: # str/bytes issues on the github site when using Python 3 and I'll jpayne@69: # fix them promptly. jpayne@69: # jpayne@69: # Here is a minimal example of how to use the API: jpayne@69: # jpayne@69: # $ cat bcfview.py jpayne@69: # import sys jpayne@69: # from pysam import VariantFile jpayne@69: # jpayne@69: # bcf_in = VariantFile(sys.argv[1]) # auto-detect input format jpayne@69: # bcf_out = VariantFile('-', 'w', header=bcf_in.header) jpayne@69: # jpayne@69: # for rec in bcf_in: jpayne@69: # bcf_out.write(rec) jpayne@69: # jpayne@69: # Performance is fairly close to that of bcftools view. 
Here is an example jpayne@69: # using some 1k Genomes data: jpayne@69: # jpayne@69: # $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l jpayne@69: # 1103799 jpayne@69: # jpayne@69: # real 0m56.114s jpayne@69: # user 1m4.489s jpayne@69: # sys 0m3.102s jpayne@69: # jpayne@69: # $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l jpayne@69: # 1103800 # bcftools adds an extra header jpayne@69: # jpayne@69: # real 0m55.126s jpayne@69: # user 1m3.502s jpayne@69: # sys 0m3.459s jpayne@69: # jpayne@69: ############################################################################### jpayne@69: # jpayne@69: # TODO list: jpayne@69: # jpayne@69: # * more genotype methods jpayne@69: # * unit test suite (perhaps py.test based) jpayne@69: # * documentation jpayne@69: # * pickle support jpayne@69: # * left/right locus normalization jpayne@69: # * fix reopen to re-use fd jpayne@69: # jpayne@69: ############################################################################### jpayne@69: # jpayne@69: # The MIT License jpayne@69: # jpayne@69: # Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com) jpayne@69: # jpayne@69: # Permission is hereby granted, free of charge, to any person obtaining a jpayne@69: # copy of this software and associated documentation files (the "Software"), jpayne@69: # to deal in the Software without restriction, including without limitation jpayne@69: # the rights to use, copy, modify, merge, publish, distribute, sublicense, jpayne@69: # and/or sell copies of the Software, and to permit persons to whom the jpayne@69: # Software is furnished to do so, subject to the following conditions: jpayne@69: # jpayne@69: # The above copyright notice and this permission notice shall be included in jpayne@69: # all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################

from __future__ import division, print_function

import os
import sys

from libc.errno  cimport errno, EPIPE
# strlen/memcpy are called by the low-level conversion helpers below;
# free is called by bcf_genotype_count and char_array_to_tuple.
from libc.stdlib cimport free
from libc.string cimport memcpy, strcmp, strlen, strpbrk, strerror
from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX

cimport cython

from cpython.object  cimport PyObject
from cpython.ref     cimport Py_INCREF
from cpython.dict    cimport PyDict_GetItemString, PyDict_SetItemString
from cpython.tuple   cimport PyTuple_New, PyTuple_SET_ITEM
from cpython.bytes   cimport PyBytes_FromStringAndSize
from cpython.unicode cimport PyUnicode_DecodeUTF8

from pysam.libchtslib cimport HTSFile, hisremote

from pysam.utils import unquoted_str


# Public API of this module (duplicate trailing 'VariantRecord' entry removed).
__all__ = ['VariantFile',
           'VariantHeader',
           'VariantHeaderRecord',
           'VariantHeaderRecords',
           'VariantMetadata',
           'VariantHeaderMetadata',
           'VariantContig',
           'VariantHeaderContigs',
           'VariantHeaderSamples',
           'VariantRecordFilter',
           'VariantRecordFormat',
           'VariantRecordInfo',
           'VariantRecordSamples',
           'VariantRecord',
           'VariantRecordSample',
           'BaseIndex',
           'BCFIndex',
           'TabixIndex',
           'BaseIterator',
           'BCFIterator',
           'TabixIterator']

########################################################################
########################################################################
## Constants
########################################################################

# Largest position representable in VCF/BCF (31-bit signed maximum).
cdef int MAX_POS = (1 << 31) - 1
cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')


########################################################################
########################################################################
## Python 3 compatibility functions
########################################################################

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes


########################################################################
########################################################################
## Sentinel object
########################################################################

cdef object _nothing = object()

########################################################################
########################################################################
## VCF/BCF string intern system
########################################################################

# Cache of interned unicode strings keyed by their UTF-8 byte content, so
# repeated header/record lookups of the same name share one str object.
cdef dict bcf_str_cache = {}

cdef inline bcf_str_cache_get_charptr(const char* s):
    """Return an interned str for the C string *s*, or None when s is NULL."""
    if s == NULL:
        return None

    cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
    if pystr:
        # PyDict_GetItemString returns a borrowed reference; the <object>
        # cast makes Cython take a new reference for the caller.
        return <object>pystr

    val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)

    PyDict_SetItemString(bcf_str_cache, s, val)

    return val


########################################################################
########################################################################
## Genotype math
########################################################################

cdef int comb(int n, int k) except -1:
    """Return binomial coefficient: n choose k

    >>> comb(5, 1)
    5
    >>> comb(5, 2)
    10
    >>> comb(2, 2)
    1
    >>> comb(100, 2)
    4950
    """
    if k > n:
        return 0
    elif k == n:
        return 1
    elif k > n // 2:
        # Exploit symmetry: C(n, k) == C(n, n - k)
        k = n - k

    # Deliberately untyped (Python ints) so intermediate products never overflow.
    cdef d, result

    d = result = n - k + 1
    for i in range(2, k + 1):
        d += 1
        result *= d
        result //= i
    return result


cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
    """Return the count of genotypes expected for the given ploidy and number of alleles.

    >>> bcf_geno_combinations(1, 2)
    2
    >>> bcf_geno_combinations(2, 2)
    3
    >>> bcf_geno_combinations(2, 3)
    6
    >>> bcf_geno_combinations(3, 2)
    4
    """
    # Multiset coefficient: C(alleles + ploidy - 1, ploidy)
    return comb(alleles + ploidy - 1, ploidy)


########################################################################
########################################################################
## Low level type conversion helpers
########################################################################


cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
    # Valid when the id is in range and the header defines metadata for this
    # header-line type (FILTER/INFO/FORMAT/...).
    return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)


cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
    # True when this FORMAT id is the genotype (GT) field.
    return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0


cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:
    """Return the number of genotype combinations for *sample* in *rec*."""
    if sample < 0:
        raise ValueError('genotype is only valid as a format field')

    cdef int32_t *gt_arr = NULL
    cdef int ngt = 0
    # NOTE: was corrupted to '>_arr' by HTML-entity mangling; htslib's
    # bcf_get_genotypes takes the address of the destination pointer.
    ngt = bcf_get_genotypes(hdr, rec, &gt_arr, &ngt)

    if ngt <= 0 or not gt_arr:
        return 0

    assert ngt % rec.n_sample == 0
    cdef int max_ploidy = ngt // rec.n_sample
    cdef int32_t *gt = gt_arr + sample * max_ploidy
    cdef int ploidy = 0

    # Count alleles for this sample up to the vector-end sentinel.
    while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
        gt += 1
        ploidy += 1

    # bcf_get_genotypes allocates; caller owns the buffer.
    free(gt_arr)

    return bcf_geno_combinations(ploidy, rec.n_allele)


cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
    """Convert a C array of *n* strings to a tuple of str, optionally freeing it."""
    if not a:
        return None
    try:
        return tuple(charptr_to_str(a[i]) for i in range(n))
    finally:
        if free_after and a:
            free(a)


cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
    """Decode a BCF typed array into a Python value.

    Returns a scalar when *scalar* is true, otherwise a tuple padded/derived
    from *count*; missing entries become None.
    """
    cdef char    *datac
    cdef int8_t  *data8
    cdef int16_t *data16
    cdef int32_t *data32
    cdef float   *dataf
    cdef int      i
    cdef bytes    b

    if not data or n <= 0:
        return None

    if type == BCF_BT_CHAR:
        datac = <char *>data

        if not n:
            value = ()
        else:
            # Check if at least one null terminator is present
            if datac[n-1] == bcf_str_vector_end:
                # If so, create a string up to the first null terminator
                b = datac
            else:
                # Otherwise, copy the entire block
                b = datac[:n]
            value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
    else:
        value = []
        if type == BCF_BT_INT8:
            data8 = <int8_t *>data
            for i in range(n):
                if data8[i] == bcf_int8_vector_end:
                    break
                value.append(data8[i] if data8[i] != bcf_int8_missing else None)
        elif type == BCF_BT_INT16:
            data16 = <int16_t *>data
            for i in range(n):
                if data16[i] == bcf_int16_vector_end:
                    break
                value.append(data16[i] if data16[i] != bcf_int16_missing else None)
        elif type == BCF_BT_INT32:
            data32 = <int32_t *>data
            for i in range(n):
                if data32[i] == bcf_int32_vector_end:
                    break
                value.append(data32[i] if data32[i] != bcf_int32_missing else None)
        elif type == BCF_BT_FLOAT:
            dataf = <float *>data
            for i in range(n):
                if bcf_float_is_vector_end(dataf[i]):
                    break
                value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
        else:
            raise TypeError('unsupported info type code')

    # FIXME: Need to know length?  Report errors?  Pad with missing values?  Not clear what to do.
    if not value:
        if scalar:
            value = None
        elif count <= 0:
            value = ()
        else:
            value = (None,)*count
    elif scalar and len(value) == 1:
        value = value[0]
    else:
        value = tuple(value)

    return value


cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
    """Encode Python *values* into a pre-allocated BCF typed array of length *n*.

    None entries become the type's missing sentinel; unused positions are
    filled with vector-end (or 0 for char data).
    """
    cdef char    *datac
    cdef int8_t  *datai8
    cdef int16_t *datai16
    cdef int32_t *datai32
    cdef float   *dataf
    cdef ssize_t  i, value_count = len(values)

    assert value_count <= n

    if bt_type == BCF_BT_CHAR:
        if not isinstance(values, (str, bytes)):
            values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
            value_count = len(values)
        assert value_count <= n
        datac = <char *>data
        memcpy(datac, <char *>values, value_count)
        for i in range(value_count, n):
            datac[i] = 0
    elif bt_type == BCF_BT_INT8:
        datai8 = <int8_t *>data
        for i in range(value_count):
            val = values[i]
            datai8[i] = val if val is not None else bcf_int8_missing
        for i in range(value_count, n):
            datai8[i] = bcf_int8_vector_end
    elif bt_type == BCF_BT_INT16:
        datai16 = <int16_t *>data
        for i in range(value_count):
            val = values[i]
            datai16[i] = val if val is not None else bcf_int16_missing
        for i in range(value_count, n):
            datai16[i] = bcf_int16_vector_end
    elif bt_type == BCF_BT_INT32:
        datai32 = <int32_t *>data
        for i in range(value_count):
            val = values[i]
            datai32[i] = val if val is not None else bcf_int32_missing
        for i in range(value_count, n):
            datai32[i] = bcf_int32_vector_end
    elif bt_type == BCF_BT_FLOAT:
        dataf = <float *>data
        for i in range(value_count):
            val = values[i]
            if val is None:
                bcf_float_set(dataf + i, bcf_float_missing)
            else:
                dataf[i] = val
        for i in range(value_count, n):
            bcf_float_set(dataf + i, bcf_float_vector_end)
    else:
        raise TypeError('unsupported type')


cdef bcf_empty_array(int type, ssize_t n, int vlen):
    """Return a bytes buffer of *n* elements pre-filled with missing values.

    When *vlen* is true, elements are filled with vector-end sentinels instead.
    """
    cdef char    *datac
    cdef int32_t *data32
    cdef float   *dataf
    cdef int      i

    if n <= 0:
        raise ValueError('Cannot create empty array')

    if type == BCF_HT_STR:
        value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
        datac = <char *>value
        for i in range(n):
            datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
    elif type == BCF_HT_INT:
        value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
        data32 = <int32_t *>value
        for i in range(n):
            data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
    elif type == BCF_HT_REAL:
        value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
        dataf = <float *>value
        for i in range(n):
            bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
    else:
        raise TypeError('unsupported header type code')

    return value


cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
                           void *dst_data, int dst_type, size_t dst_values,
                           int vlen):
    """copy data from src to dest where the size of the elements (src_type/dst_type) differ
    as well as the number of elements (src_values/dst_values).
    """
    cdef char    *src_datac
    cdef char    *dst_datac
    cdef int8_t  *src_datai8
    cdef int16_t *src_datai16
    cdef int32_t *src_datai32
    cdef int32_t *dst_datai
    cdef float   *src_dataf
    cdef float   *dst_dataf
    cdef ssize_t  src_size, dst_size, i, j
    cdef int      val

    if src_values > dst_values:
        raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))

    if src_type == dst_type == BCF_BT_CHAR:
        src_datac = <char *>src_data
        dst_datac = <char *>dst_data
        memcpy(dst_datac, src_datac, src_values)
        for i in range(src_values, dst_values):
            dst_datac[i] = 0
    elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
        src_datai8 = <int8_t *>src_data
        dst_datai = <int32_t *>dst_data
        for i in range(src_values):
            val = src_datai8[i]
            # Widen the type-specific sentinels along with the data.
            if val == bcf_int8_missing:
                val = bcf_int32_missing
            elif val == bcf_int8_vector_end:
                val = bcf_int32_vector_end
            dst_datai[i] = val
        for i in range(src_values, dst_values):
            dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
    elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
        src_datai16 = <int16_t *>src_data
        dst_datai = <int32_t *>dst_data
        for i in range(src_values):
            val = src_datai16[i]
            if val == bcf_int16_missing:
                val = bcf_int32_missing
            elif val == bcf_int16_vector_end:
                val = bcf_int32_vector_end
            dst_datai[i] = val
        for i in range(src_values, dst_values):
            dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
    elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
        src_datai32 = <int32_t *>src_data
        dst_datai = <int32_t *>dst_data
        for i in range(src_values):
            dst_datai[i] = src_datai32[i]
        for i in range(src_values, dst_values):
            dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
    elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
        src_dataf = <float *>src_data
        dst_dataf = <float *>dst_data
        for i in range(src_values):
            dst_dataf[i] = src_dataf[i]
        for i in range(src_values, dst_values):
            bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
    else:
        raise TypeError('unsupported types')


cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
    """Compute the expected value count for a header id on *record*.

    Writes the count to *count* (-1 for variable length) and sets *scalar*
    when exactly one value is expected.
    """
    if record is None:
        raise ValueError('record must not be None')

    cdef bcf_hdr_t *hdr = record.header.ptr
    cdef bcf1_t *r = record.ptr

    if not check_header_id(hdr, hl_type, id):
        raise ValueError('Invalid header')

    cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
    cdef int number = bcf_hdr_id2number(hdr, hl_type, id)

    scalar[0] = 0

    if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
        count[0] = number
    elif length == BCF_VL_FIXED:
        if number == 1:
            scalar[0] = 1
        count[0] = number
    elif length == BCF_VL_R:
        # One value per allele, including the reference.
        count[0] = r.n_allele
    elif length == BCF_VL_A:
        # One value per alternate allele.
        count[0] = r.n_allele - 1
    elif length == BCF_VL_G:
        # One value per possible genotype.
        count[0] = bcf_genotype_count(hdr, r, sample)
    elif length == BCF_VL_VAR:
        count[0] = -1
    else:
        raise ValueError('Unknown format length')


cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
    """Decode the value of INFO field *z* on *record* into a Python object."""
    if record is None:
        raise ValueError('record must not be None')

    cdef bcf_hdr_t *hdr = record.header.ptr

    cdef ssize_t count
    cdef int scalar

    bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)

    if z.len == 0:
        # Zero-length INFO: a Flag's presence means True.
        if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
            value = True
        elif scalar:
            value = None
        else:
            value = ()
    elif z.len == 1:
        # Single value stored inline in the v1 union.
        if z.type == BCF_BT_INT8:
            value = z.v1.i if z.v1.i != bcf_int8_missing else None
        elif z.type == BCF_BT_INT16:
            value = z.v1.i if z.v1.i != bcf_int16_missing else None
        elif z.type == BCF_BT_INT32:
            value = z.v1.i if z.v1.i != bcf_int32_missing else None
        elif z.type == BCF_BT_FLOAT:
            value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
        elif z.type == BCF_BT_CHAR:
            value = force_str(chr(z.v1.i))
        else:
            raise TypeError('unsupported info type code')

        if not scalar and value != ():
            value = (value,)
    else:
        value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)

    return value


cdef object bcf_check_values(VariantRecord record, value, int sample,
                             int hl_type, int ht_type,
                             int id, int bt_type, ssize_t bt_len,
                             ssize_t *value_count, int *scalar, int *realloc):
    """Normalize and validate *value* for storage in an INFO/FORMAT field.

    Returns the normalized values (tuple, or bytes for String fields) and
    writes the expected count, scalar flag, and whether reallocation of the
    underlying BCF storage is required.
    """
    if record is None:
        raise ValueError('record must not be None')

    bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)

    values = (value,) if not isinstance(value, (list, tuple)) else value

    # Validate values now that we know the type and size
    if ht_type == BCF_HT_FLAG:
        value_count[0] = 1
    elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
        # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
        value_count[0] = -1

    cdef int given = len(values)
    if value_count[0] != -1 and value_count[0] != given:
        if scalar[0]:
            raise TypeError('value expected to be scalar, given len={}'.format(given))
        else:
            raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))

    if ht_type == BCF_HT_REAL:
        for v in values:
            if not(v is None or isinstance(v, (float, int))):
                raise TypeError('invalid value for Float format')
    elif ht_type == BCF_HT_INT:
        for v in values:
            if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
                raise TypeError('invalid value for Integer format')
        for v in values:
            if not(v is None or bcf_int32_missing < v <= INT32_MAX):
                raise ValueError('Integer value too small/large to store in VCF/BCF')
    elif ht_type == BCF_HT_STR:
        values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
    elif ht_type == BCF_HT_FLAG:
        if values[0] not in (True, False, None, 1, 0):
            raise ValueError('Flag values must be: True, False, None, 1, 0')
    else:
        raise TypeError('unsupported type')

    # Decide whether the existing storage (bt_type/bt_len) can hold the new
    # values or whether a wider/longer buffer must be allocated.
    realloc[0] = 0
    if len(values) <= 1 and hl_type == BCF_HL_INFO:
        realloc[0] = 0
    elif len(values) > bt_len:
        realloc[0] = 1
    elif bt_type == BCF_BT_INT8:
        for v in values:
            if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
                realloc[0] = 1
                break
    elif bt_type == BCF_BT_INT16:
        for v in values:
            if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
                realloc[0] = 1
                break

    return values


cdef bcf_encode_alleles(VariantRecord record, values):
    """Convert allele names or indices to encoded (unphased) genotype values."""
    if record is None:
        raise ValueError('record must not be None')

    cdef bcf1_t *r = record.ptr
    cdef int32_t nalleles = r.n_allele
    cdef list gt_values = []
    cdef char *s
    cdef int i

    if values is None:
        return ()

    if not isinstance(values, (list, tuple)):
        values = (values,)

    for value in values:
        if value is None:
            gt_values.append(bcf_gt_missing)
        elif isinstance(value, (str, bytes)):
            bvalue = force_bytes(value)
            s = bvalue
            # FIX: was 'strcmp(...) != 0', which appended the first
            # NON-matching allele; a match is strcmp(...) == 0.
            for i in range(r.n_allele):
                if strcmp(r.d.allele[i], s) == 0:
                    gt_values.append(bcf_gt_unphased(i))
                    break
            else:
                raise ValueError('Unknown allele')
        else:
            i = value
            if not (0 <= i < nalleles):
                raise ValueError('Invalid allele index')
            gt_values.append(bcf_gt_unphased(i))

    return gt_values


cdef bcf_info_set_value(VariantRecord record, key, value):
    """Set INFO field *key* on *record* to *value*, reallocating as needed."""
    if record is None:
        raise ValueError('record must not be None')

    cdef bcf_hdr_t *hdr = record.header.ptr
    cdef bcf1_t *r = record.ptr
    cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
    cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size

    if bcf_unpack(r, BCF_UN_INFO) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef bytes bkey = force_bytes(key)
    cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)

    if info:
        info_id = info.key
    else:
        info_id = bcf_header_get_info_id(hdr, bkey)

    if info_id < 0:
        raise KeyError('unknown INFO: {}'.format(key))

    if not check_header_id(hdr, BCF_HL_INFO, info_id):
        raise ValueError('Invalid header')

    info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
    values = bcf_check_values(record, value, -1,
                              BCF_HL_INFO, info_type, info_id,
                              info.type if info else -1,
                              info.len if info else -1,
                              &value_count, &scalar, &realloc)

    if info_type == BCF_HT_FLAG:
        if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
            raise ValueError('Unable to update INFO values')
        return

    vlen = value_count < 0
    value_count = len(values)

    # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
    # If we can, write updated values to existing allocated storage
    if 0 and info and not realloc:
        r.d.shared_dirty |= BCF1_DIRTY_INF

        if value_count == 0:
            info.len = 0
            if not info.vptr:
                info.vptr = <uint8_t *>&info.v1.i

        elif value_count == 1:
            # FIXME: Check if need to free vptr if info.len > 0?
            if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
                bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
            elif info.type == BCF_BT_FLOAT:
                bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
            else:
                raise TypeError('unsupported info type code')

            info.len = 1
            if not info.vptr:
                info.vptr = <uint8_t *>&info.v1.i
        else:
            bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)

        return

    alloc_len = max(1, value_count)
    if info and info.len > alloc_len:
        alloc_len = info.len

    new_values = bcf_empty_array(info_type, alloc_len, vlen)
    cdef char *valp = <char *>new_values

    if info_type == BCF_HT_INT:
        dst_type = BCF_BT_INT32
    elif info_type == BCF_HT_REAL:
        dst_type = BCF_BT_FLOAT
    elif info_type == BCF_HT_STR:
        dst_type = BCF_BT_CHAR
    else:
        raise ValueError('Unsupported INFO type')

    bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)

    if bcf_update_info(hdr, r, bkey, valp, alloc_len, info_type) < 0:
        raise ValueError('Unable to update INFO values')


cdef bcf_info_del_value(VariantRecord record, key):
    """Clear INFO field *key* on *record* by writing missing values."""
    if record is None:
        raise ValueError('record must not be None')

    cdef bcf_hdr_t *hdr = record.header.ptr
    cdef bcf1_t *r = record.ptr
    cdef ssize_t value_count
    cdef int scalar

    if bcf_unpack(r, BCF_UN_INFO) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef bytes bkey = force_bytes(key)
    cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)

    if not info:
        raise KeyError(key)

    bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)

    if value_count <= 0:
        null_value = ()
    elif scalar:
        null_value = None
    else:
        null_value = (None,)*value_count

    bcf_info_set_value(record, bkey, null_value)


cdef bcf_format_get_value(VariantRecordSample sample, key):
    """Return the decoded value of FORMAT field *key* for *sample*."""
    if sample is None:
        raise ValueError('sample must not be None')

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef ssize_t count
    cdef int scalar

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef bytes bkey = force_bytes(key)
    cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)

    if not fmt or not fmt.p:
        raise KeyError('invalid FORMAT: {}'.format(key))

    # GT is special-cased: decoded to allele indices rather than raw ints.
    if is_gt_fmt(hdr, fmt.id):
        return bcf_format_get_allele_indices(sample)

    bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)

    if fmt.p and fmt.n and fmt.size:
        return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
    elif scalar:
        return None
    elif count <= 0:
        return ()
    else:
        return (None,)*count


cdef bcf_format_set_value(VariantRecordSample sample, key, value):
    """Set FORMAT field *key* for *sample*, expanding storage for all samples
    when the new values do not fit the existing per-sample allocation."""
    if sample is None:
        raise ValueError('sample must not be None')

    if key == 'phased':
        sample.phased = bool(value)
        return

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef int fmt_id
    cdef vdict_t *d
    cdef khiter_t k
    cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
    cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef bytes bkey = force_bytes(key)
    cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)

    if fmt:
        fmt_id = fmt.id
    else:
        # Field not present on this record: look its id up in the header dict.
        d = <vdict_t *>hdr.dict[BCF_DT_ID]
        k = kh_get_vdict(d, bkey)

        if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
            raise KeyError('unknown format: {}'.format(key))

        fmt_id = kh_val_vdict(d, k).id

    if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
        raise ValueError('Invalid header')

    fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)

    if fmt_type == BCF_HT_FLAG:
        raise ValueError('Flag types are not allowed on FORMATs')

    if is_gt_fmt(hdr, fmt_id):
        value = bcf_encode_alleles(sample.record, value)
        # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
        fmt_type = BCF_HT_INT

    values = bcf_check_values(sample.record, value, sample.index,
                              BCF_HL_FMT, fmt_type, fmt_id,
                              fmt.type if fmt else -1,
                              fmt.n if fmt else -1,
                              &value_count, &scalar, &realloc)
    vlen = value_count < 0
    value_count = len(values)

    # If we can, write updated values to existing allocated storage.
    if fmt and not realloc:
        r.d.indiv_dirty = 1
        bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
        return

    alloc_len = max(1, value_count)
    if fmt and fmt.n > alloc_len:
        alloc_len = fmt.n

    # FORMAT storage is per-sample: allocate a full matrix and copy the other
    # samples' (expanded) values before writing this sample's new values.
    nsamples = r.n_sample
    new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
    cdef char *new_values_p = <char *>new_values

    if fmt_type == BCF_HT_INT:
        dst_type = BCF_BT_INT32
        dst_size = sizeof(int32_t) * alloc_len
    elif fmt_type == BCF_HT_REAL:
        dst_type = BCF_BT_FLOAT
        dst_size = sizeof(float) * alloc_len
    elif fmt_type == BCF_HT_STR:
        dst_type = BCF_BT_CHAR
        dst_size = sizeof(char) * alloc_len
    else:
        raise ValueError('Unsupported FORMAT type')

    if fmt and nsamples > 1:
        for i in range(nsamples):
            bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
                                  new_values_p + i * dst_size, dst_type, alloc_len,
                                  vlen)

    bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)

    if bcf_update_format(hdr, r, bkey, new_values_p, (nsamples * alloc_len), fmt_type) < 0:
        raise ValueError('Unable to update format values')


cdef bcf_format_del_value(VariantRecordSample sample, key):
    if sample is None:
        raise ValueError('sample must not be None')

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef ssize_t value_count
    cdef int scalar

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    cdef bytes bkey = force_bytes(key)
    cdef
bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) jpayne@69: jpayne@69: if not fmt or not fmt.p: jpayne@69: raise KeyError(key) jpayne@69: jpayne@69: bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index) jpayne@69: jpayne@69: if value_count <= 0: jpayne@69: null_value = () jpayne@69: elif scalar: jpayne@69: null_value = None jpayne@69: else: jpayne@69: null_value = (None,)*value_count jpayne@69: jpayne@69: bcf_format_set_value(sample, bkey, null_value) jpayne@69: jpayne@69: jpayne@69: cdef bcf_format_get_allele_indices(VariantRecordSample sample): jpayne@69: if sample is None: jpayne@69: raise ValueError('sample must not be None') jpayne@69: jpayne@69: cdef bcf_hdr_t *hdr = sample.record.header.ptr jpayne@69: cdef bcf1_t *r = sample.record.ptr jpayne@69: cdef int32_t n = r.n_sample jpayne@69: jpayne@69: if bcf_unpack(r, BCF_UN_ALL) < 0: jpayne@69: raise ValueError('Error unpacking VariantRecord') jpayne@69: jpayne@69: if sample.index < 0 or sample.index >= n or not r.n_fmt: jpayne@69: return () jpayne@69: jpayne@69: cdef bcf_fmt_t *fmt0 = r.d.fmt jpayne@69: cdef int gt0 = is_gt_fmt(hdr, fmt0.id) jpayne@69: jpayne@69: if not gt0 or not fmt0.n: jpayne@69: return () jpayne@69: jpayne@69: cdef int8_t *data8 jpayne@69: cdef int16_t *data16 jpayne@69: cdef int32_t *data32 jpayne@69: cdef int32_t a, nalleles = r.n_allele jpayne@69: cdef list alleles = [] jpayne@69: jpayne@69: if fmt0.type == BCF_BT_INT8: jpayne@69: data8 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: if data8[i] == bcf_int8_vector_end: jpayne@69: break jpayne@69: elif data8[i] == bcf_gt_missing: jpayne@69: a = -1 jpayne@69: else: jpayne@69: a = bcf_gt_allele(data8[i]) jpayne@69: alleles.append(a if 0 <= a < nalleles else None) jpayne@69: elif fmt0.type == BCF_BT_INT16: jpayne@69: data16 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: if data16[i] == bcf_int16_vector_end: jpayne@69: break jpayne@69: elif 
data16[i] == bcf_gt_missing: jpayne@69: a = -1 jpayne@69: else: jpayne@69: a = bcf_gt_allele(data16[i]) jpayne@69: alleles.append(a if 0 <= a < nalleles else None) jpayne@69: elif fmt0.type == BCF_BT_INT32: jpayne@69: data32 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: if data32[i] == bcf_int32_vector_end: jpayne@69: break jpayne@69: elif data32[i] == bcf_gt_missing: jpayne@69: a = -1 jpayne@69: else: jpayne@69: a = bcf_gt_allele(data32[i]) jpayne@69: alleles.append(a if 0 <= a < nalleles else None) jpayne@69: jpayne@69: return tuple(alleles) jpayne@69: jpayne@69: jpayne@69: cdef bcf_format_get_alleles(VariantRecordSample sample): jpayne@69: if sample is None: jpayne@69: raise ValueError('sample must not be None') jpayne@69: jpayne@69: cdef bcf_hdr_t *hdr = sample.record.header.ptr jpayne@69: cdef bcf1_t *r = sample.record.ptr jpayne@69: cdef int32_t nsamples = r.n_sample jpayne@69: jpayne@69: if bcf_unpack(r, BCF_UN_ALL) < 0: jpayne@69: raise ValueError('Error unpacking VariantRecord') jpayne@69: jpayne@69: cdef int32_t nalleles = r.n_allele jpayne@69: jpayne@69: if sample.index < 0 or sample.index >= nsamples or not r.n_fmt: jpayne@69: return () jpayne@69: jpayne@69: cdef bcf_fmt_t *fmt0 = r.d.fmt jpayne@69: cdef int gt0 = is_gt_fmt(hdr, fmt0.id) jpayne@69: jpayne@69: if not gt0 or not fmt0.n: jpayne@69: return () jpayne@69: jpayne@69: cdef int32_t a jpayne@69: cdef int8_t *data8 jpayne@69: cdef int16_t *data16 jpayne@69: cdef int32_t *data32 jpayne@69: alleles = [] jpayne@69: if fmt0.type == BCF_BT_INT8: jpayne@69: data8 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: if data8[i] == bcf_int8_vector_end: jpayne@69: break jpayne@69: a = bcf_gt_allele(data8[i]) jpayne@69: alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) jpayne@69: elif fmt0.type == BCF_BT_INT16: jpayne@69: data16 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: 
if data16[i] == bcf_int16_vector_end: jpayne@69: break jpayne@69: a = bcf_gt_allele(data16[i]) jpayne@69: alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) jpayne@69: elif fmt0.type == BCF_BT_INT32: jpayne@69: data32 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: if data32[i] == bcf_int32_vector_end: jpayne@69: break jpayne@69: a = bcf_gt_allele(data32[i]) jpayne@69: alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) jpayne@69: return tuple(alleles) jpayne@69: jpayne@69: jpayne@69: cdef bint bcf_sample_get_phased(VariantRecordSample sample): jpayne@69: if sample is None: jpayne@69: raise ValueError('sample must not be None') jpayne@69: jpayne@69: cdef bcf_hdr_t *hdr = sample.record.header.ptr jpayne@69: cdef bcf1_t *r = sample.record.ptr jpayne@69: cdef int32_t n = r.n_sample jpayne@69: jpayne@69: if bcf_unpack(r, BCF_UN_ALL) < 0: jpayne@69: raise ValueError('Error unpacking VariantRecord') jpayne@69: jpayne@69: if sample.index < 0 or sample.index >= n or not r.n_fmt: jpayne@69: return False jpayne@69: jpayne@69: cdef bcf_fmt_t *fmt0 = r.d.fmt jpayne@69: cdef int gt0 = is_gt_fmt(hdr, fmt0.id) jpayne@69: jpayne@69: if not gt0 or not fmt0.n: jpayne@69: return False jpayne@69: jpayne@69: cdef int8_t *data8 jpayne@69: cdef int16_t *data16 jpayne@69: cdef int32_t *data32 jpayne@69: jpayne@69: cdef bint phased = False jpayne@69: jpayne@69: if fmt0.type == BCF_BT_INT8: jpayne@69: data8 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: if data8[i] == bcf_int8_vector_end: jpayne@69: break jpayne@69: elif data8[i] == bcf_int8_missing: jpayne@69: continue jpayne@69: elif i and not bcf_gt_is_phased(data8[i]): jpayne@69: return False jpayne@69: else: jpayne@69: phased = True jpayne@69: elif fmt0.type == BCF_BT_INT16: jpayne@69: data16 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: if data16[i] == bcf_int16_vector_end: 
jpayne@69: break jpayne@69: elif data16[i] == bcf_int16_missing: jpayne@69: continue jpayne@69: elif i and not bcf_gt_is_phased(data16[i]): jpayne@69: return False jpayne@69: else: jpayne@69: phased = True jpayne@69: elif fmt0.type == BCF_BT_INT32: jpayne@69: data32 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: if data32[i] == bcf_int32_vector_end: jpayne@69: break jpayne@69: elif data32[i] == bcf_int32_missing: jpayne@69: continue jpayne@69: elif i and not bcf_gt_is_phased(data32[i]): jpayne@69: return False jpayne@69: else: jpayne@69: phased = True jpayne@69: jpayne@69: return phased jpayne@69: jpayne@69: jpayne@69: cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased): jpayne@69: if sample is None: jpayne@69: raise ValueError('sample must not be None') jpayne@69: jpayne@69: cdef bcf_hdr_t *hdr = sample.record.header.ptr jpayne@69: cdef bcf1_t *r = sample.record.ptr jpayne@69: cdef int32_t n = r.n_sample jpayne@69: jpayne@69: if bcf_unpack(r, BCF_UN_ALL) < 0: jpayne@69: raise ValueError('Error unpacking VariantRecord') jpayne@69: jpayne@69: if sample.index < 0 or sample.index >= n or not r.n_fmt: jpayne@69: return jpayne@69: jpayne@69: cdef bcf_fmt_t *fmt0 = r.d.fmt jpayne@69: cdef int gt0 = is_gt_fmt(hdr, fmt0.id) jpayne@69: jpayne@69: if not gt0 or not fmt0.n: jpayne@69: raise ValueError('Cannot set phased before genotype is set') jpayne@69: jpayne@69: cdef int8_t *data8 jpayne@69: cdef int16_t *data16 jpayne@69: cdef int32_t *data32 jpayne@69: jpayne@69: if fmt0.type == BCF_BT_INT8: jpayne@69: data8 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: if data8[i] == bcf_int8_vector_end: jpayne@69: break jpayne@69: elif data8[i] == bcf_int8_missing: jpayne@69: continue jpayne@69: elif i: jpayne@69: data8[i] = (data8[i] & 0xFE) | phased jpayne@69: elif fmt0.type == BCF_BT_INT16: jpayne@69: data16 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: 
if data16[i] == bcf_int16_vector_end: jpayne@69: break jpayne@69: elif data16[i] == bcf_int16_missing: jpayne@69: continue jpayne@69: elif i: jpayne@69: data16[i] = (data16[i] & 0xFFFE) | phased jpayne@69: elif fmt0.type == BCF_BT_INT32: jpayne@69: data32 = (fmt0.p + sample.index * fmt0.size) jpayne@69: for i in range(fmt0.n): jpayne@69: if data32[i] == bcf_int32_vector_end: jpayne@69: break jpayne@69: elif data32[i] == bcf_int32_missing: jpayne@69: continue jpayne@69: elif i: jpayne@69: data32[i] = (data32[i] & 0xFFFFFFFE) | phased jpayne@69: jpayne@69: jpayne@69: cdef inline bcf_sync_end(VariantRecord record): jpayne@69: cdef bcf_hdr_t *hdr = record.header.ptr jpayne@69: cdef bcf_info_t *info jpayne@69: cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END') jpayne@69: cdef int ref_len jpayne@69: jpayne@69: # allow missing ref when instantiating a new record jpayne@69: if record.ref is not None: jpayne@69: ref_len = len(record.ref) jpayne@69: else: jpayne@69: ref_len = 0 jpayne@69: jpayne@69: # Delete INFO/END if no alleles are present or if rlen is equal to len(ref) jpayne@69: # Always keep END for symbolic alleles jpayne@69: if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len): jpayne@69: # If INFO/END is not defined in the header, it doesn't exist in the record jpayne@69: if end_id >= 0: jpayne@69: info = bcf_get_info(hdr, record.ptr, b'END') jpayne@69: if info and info.vptr: jpayne@69: if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0: jpayne@69: raise ValueError('Unable to delete END') jpayne@69: else: jpayne@69: # Create END header, if not present jpayne@69: if end_id < 0: jpayne@69: record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval') jpayne@69: jpayne@69: # Update to reflect stop position jpayne@69: bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen) jpayne@69: jpayne@69: jpayne@69: cdef inline int 
has_symbolic_allele(VariantRecord record): jpayne@69: """Return index of first symbolic allele. 0 if no symbolic alleles.""" jpayne@69: jpayne@69: for i in range(1, record.ptr.n_allele): jpayne@69: alt = record.ptr.d.allele[i] jpayne@69: if alt[0] == b'<' and alt[len(alt) - 1] == b'>': jpayne@69: return i jpayne@69: jpayne@69: return 0 jpayne@69: jpayne@69: jpayne@69: ######################################################################## jpayne@69: ######################################################################## jpayne@69: ## Variant Header objects jpayne@69: ######################################################################## jpayne@69: jpayne@69: jpayne@69: cdef bcf_header_remove_hrec(VariantHeader header, int i): jpayne@69: if header is None: jpayne@69: raise ValueError('header must not be None') jpayne@69: jpayne@69: cdef bcf_hdr_t *hdr = header.ptr jpayne@69: jpayne@69: if i < 0 or i >= hdr.nhrec: jpayne@69: raise ValueError('Invalid header record index') jpayne@69: jpayne@69: cdef bcf_hrec_t *hrec = hdr.hrec[i] jpayne@69: hdr.nhrec -= 1 jpayne@69: jpayne@69: if i < hdr.nhrec: jpayne@69: memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*)) jpayne@69: jpayne@69: bcf_hrec_destroy(hrec) jpayne@69: hdr.hrec[hdr.nhrec] = NULL jpayne@69: hdr.dirty = 1 jpayne@69: jpayne@69: jpayne@69: #FIXME: implement a full mapping interface jpayne@69: #FIXME: passing bcf_hrec_t* is not safe, since we cannot control the jpayne@69: # object lifetime. 
cdef class VariantHeaderRecord(object):
    """header record from a :class:`VariantHeader` object

    Exposes a read-mostly mapping interface over one ``bcf_hrec_t``:
    keys/values are the record's attribute pairs (e.g. ID, Description).
    """
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    @property
    def type(self):
        """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return None
        return METADATA_TYPES[r.type]

    @property
    def key(self):
        """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
        cdef bcf_hrec_t *r = self.ptr
        return bcf_str_cache_get_charptr(r.key) if r and r.key else None

    @property
    def value(self):
        """header value.  Set only for generic lines, None for FILTER/INFO, etc."""
        cdef bcf_hrec_t *r = self.ptr
        return charptr_to_str(r.value) if r and r.value else None

    @property
    def attrs(self):
        """sequence of additional header attributes"""
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return ()
        cdef int i
        return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
                      charptr_to_str(r.vals[i]) if r.vals[i] else None)
                     for i in range(r.nkeys))

    def __len__(self):
        cdef bcf_hrec_t *r = self.ptr
        return r.nkeys if r else 0

    def __bool__(self):
        cdef bcf_hrec_t *r = self.ptr
        return r != NULL and r.nkeys != 0

    def __getitem__(self, key):
        """get attribute value"""
        cdef bcf_hrec_t *r = self.ptr
        cdef int i
        if r:
            bkey = force_bytes(key)
            for i in range(r.nkeys):
                if r.keys[i] and r.keys[i] == bkey:
                    return charptr_to_str(r.vals[i]) if r.vals[i] else None
        raise KeyError('cannot find metadata key')

    def __iter__(self):
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return
        cdef int i
        for i in range(r.nkeys):
            if r.keys[i]:
                yield bcf_str_cache_get_charptr(r.keys[i])

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
        try:
            return self[key]
        except KeyError:
            return default

    def __contains__(self, key):
        try:
            self[key]
        except KeyError:
            return False
        else:
            return True

    def iterkeys(self):
        """D.iterkeys() -> an iterator over the keys of D"""
        return iter(self)

    def itervalues(self):
        """D.itervalues() -> an iterator over the values of D"""
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return
        cdef int i
        for i in range(r.nkeys):
            if r.keys[i]:
                yield charptr_to_str(r.vals[i]) if r.vals[i] else None

    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return
        cdef int i
        for i in range(r.nkeys):
            if r.keys[i]:
                yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)

    def keys(self):
        """D.keys() -> list of D's keys"""
        return list(self)

    def items(self):
        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
        return list(self.iteritems())

    def values(self):
        """D.values() -> list of D's values"""
        return list(self.itervalues())

    def update(self, items=None, **kwargs):
        """D.update([E, ]**F) -> None.

        Update D from dict/iterable E and F.
        """
        # Fix: the original called items.items() unconditionally, so
        # update() with only keyword arguments raised AttributeError.
        if items:
            for k, v in items.items():
                self[k] = v

        if kwargs:
            for k, v in kwargs.items():
                self[k] = v

    def pop(self, key, default=_nothing):
        try:
            value = self[key]
            del self[key]
            return value
        except KeyError:
            if default is not _nothing:
                return default
            raise

    # Mappings are not hashable by default, but subclasses can change this
    __hash__ = None

    #TODO: implement __richcmp__

    def __str__(self):
        cdef bcf_hrec_t *r = self.ptr

        if not r:
            raise ValueError('cannot convert deleted record to str')

        cdef kstring_t hrec_str
        hrec_str.l = hrec_str.m = 0
        hrec_str.s = NULL

        bcf_hrec_format(r, &hrec_str)

        ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)

        # Free the kstring buffer only if htslib actually allocated one.
        if hrec_str.m:
            free(hrec_str.s)

        return ret

    # FIXME: Not safe -- causes trivial segfaults at the moment
    def remove(self):
        cdef bcf_hdr_t *hdr = self.header.ptr
        cdef bcf_hrec_t *r = self.ptr
        if not r:
            return
        assert r.key
        # Generic lines are keyed by r.key; structured ones by their ID value.
        cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
        bcf_hdr_remove(hdr, r.type, key)
        self.ptr = NULL


cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
    if not header:
        raise ValueError('invalid VariantHeader')

    if not hdr:
        return None
    # Bypass __init__ (which forbids Python instantiation) and wire up the
    # wrapper's back-references directly.
    cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
    record.header = header
    record.ptr = hdr

    return record


cdef class VariantHeaderRecords(object):
    """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    def __len__(self):
        return self.header.ptr.nhrec

    def __bool__(self):
        return self.header.ptr.nhrec != 0

    def __getitem__(self, index):
        # Only plain non-negative integer indices are supported.
        cdef int32_t i = index
        if i < 0 or i >= self.header.ptr.nhrec:
            raise IndexError('invalid header record index')
        return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])

    def __iter__(self):
        cdef int32_t i
        for i in range(self.header.ptr.nhrec):
            yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])

    __hash__ = None


cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
    if not header:
        raise ValueError('invalid VariantHeader')

    cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
    records.header = header
    return records


cdef class VariantMetadata(object):
    """filter, info or format metadata record from a :class:`VariantHeader` object"""
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    @property
    def name(self):
        """metadata name"""
        cdef bcf_hdr_t *hdr = self.header.ptr
        return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)

    # Q: Should this be exposed?
    @property
    def id(self):
        """metadata internal header id number"""
        # NOTE(review): inside a cdef class, self.id resolves to the C-level
        # attribute (declared in the .pxd), not recursively to this property
        # -- confirm against the .pxd before refactoring.
        return self.id

    @property
    def number(self):
        """metadata number (i.e. cardinality)"""
        cdef bcf_hdr_t *hdr = self.header.ptr

        if not check_header_id(hdr, self.type, self.id):
            raise ValueError('Invalid header id')

        # FILTER records have no Number attribute.
        if self.type == BCF_HL_FLT:
            return None

        cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
        if l == BCF_VL_FIXED:
            return bcf_hdr_id2number(hdr, self.type, self.id)
        elif l == BCF_VL_VAR:
            return '.'
        else:
            # A/G/R style lengths are mapped through METADATA_LENGTHS.
            return METADATA_LENGTHS[l]

    @property
    def type(self):
        """metadata value type"""
        cdef bcf_hdr_t *hdr = self.header.ptr
        if not check_header_id(hdr, self.type, self.id):
            raise ValueError('Invalid header id')

        # FILTER records have no Type attribute.
        if self.type == BCF_HL_FLT:
            return None
        return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]

    @property
    def description(self):
        """metadata description (or None if not set)"""
        descr = self.record.get('Description')
        if descr:
            # Stored with surrounding quotes in the header line.
            descr = descr.strip('"')
        return force_str(descr)

    @property
    def record(self):
        """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
        cdef bcf_hdr_t *hdr = self.header.ptr
        if not check_header_id(hdr, self.type, self.id):
            raise ValueError('Invalid header id')
        cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
        if not hrec:
            return None
        return makeVariantHeaderRecord(self.header, hrec)

    def remove_header(self):
        cdef bcf_hdr_t *hdr = self.header.ptr
        cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
        bcf_hdr_remove(hdr, self.type, key)


cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
    if not header:
        raise ValueError('invalid VariantHeader')

    if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
        raise ValueError('invalid metadata type')

    if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
        raise ValueError('invalid metadata id')

    cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
    meta.header = header
    meta.type = type
    meta.id = id

    return meta


cdef class VariantHeaderMetadata(object):
    """mapping from filter, info or format name to :class:`VariantMetadata` object"""
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    def add(self, id, number, type, description, **kwargs):
        """Add a new filter, info or format record"""
        if id in self:
            raise ValueError('Header already exists for id={}'.format(id))

        if self.type == BCF_HL_FLT:
            # Filters carry only ID and Description.
            if number is not None:
                raise ValueError('Number must be None when adding a filter')
            if type is not None:
                raise ValueError('Type must be None when adding a filter')

            items = [('ID', unquoted_str(id)), ('Description', description)]
        else:
            if type not in VALUE_TYPES:
                raise ValueError('unknown type specified: {}'.format(type))
            if number is None:
                number = '.'
jpayne@69: jpayne@69: items = [('ID', unquoted_str(id)), jpayne@69: ('Number', unquoted_str(number)), jpayne@69: ('Type', unquoted_str(type)), jpayne@69: ('Description', description)] jpayne@69: jpayne@69: items += kwargs.items() jpayne@69: self.header.add_meta(METADATA_TYPES[self.type], items=items) jpayne@69: jpayne@69: def __len__(self): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef bcf_idpair_t *idpair jpayne@69: cdef int32_t i, n = 0 jpayne@69: jpayne@69: for i in range(hdr.n[BCF_DT_ID]): jpayne@69: idpair = hdr.id[BCF_DT_ID] + i jpayne@69: if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: jpayne@69: n += 1 jpayne@69: return n jpayne@69: jpayne@69: def __bool__(self): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef bcf_idpair_t *idpair jpayne@69: cdef int32_t i jpayne@69: jpayne@69: for i in range(hdr.n[BCF_DT_ID]): jpayne@69: idpair = hdr.id[BCF_DT_ID] + i jpayne@69: if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: jpayne@69: return True jpayne@69: return False jpayne@69: jpayne@69: def __getitem__(self, key): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef vdict_t *d = hdr.dict[BCF_DT_ID] jpayne@69: jpayne@69: cdef bytes bkey = force_bytes(key) jpayne@69: cdef khiter_t k = kh_get_vdict(d, bkey) jpayne@69: jpayne@69: if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: jpayne@69: raise KeyError('invalid key: {}'.format(key)) jpayne@69: jpayne@69: return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id) jpayne@69: jpayne@69: def remove_header(self, key): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef vdict_t *d = hdr.dict[BCF_DT_ID] jpayne@69: jpayne@69: cdef bytes bkey = force_bytes(key) jpayne@69: cdef khiter_t k = kh_get_vdict(d, bkey) jpayne@69: jpayne@69: if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: jpayne@69: raise KeyError('invalid key: {}'.format(key)) jpayne@69: jpayne@69: 
bcf_hdr_remove(hdr, self.type, bkey) jpayne@69: #bcf_hdr_sync(hdr) jpayne@69: jpayne@69: def clear_header(self): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: bcf_hdr_remove(hdr, self.type, NULL) jpayne@69: #bcf_hdr_sync(hdr) jpayne@69: jpayne@69: def __iter__(self): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef bcf_idpair_t *idpair jpayne@69: cdef int32_t i jpayne@69: jpayne@69: for i in range(hdr.n[BCF_DT_ID]): jpayne@69: idpair = hdr.id[BCF_DT_ID] + i jpayne@69: if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: jpayne@69: yield bcf_str_cache_get_charptr(idpair.key) jpayne@69: jpayne@69: def get(self, key, default=None): jpayne@69: """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" jpayne@69: try: jpayne@69: return self[key] jpayne@69: except KeyError: jpayne@69: return default jpayne@69: jpayne@69: def __contains__(self, key): jpayne@69: try: jpayne@69: self[key] jpayne@69: except KeyError: jpayne@69: return False jpayne@69: else: jpayne@69: return True jpayne@69: jpayne@69: def iterkeys(self): jpayne@69: """D.iterkeys() -> an iterator over the keys of D""" jpayne@69: return iter(self) jpayne@69: jpayne@69: def itervalues(self): jpayne@69: """D.itervalues() -> an iterator over the values of D""" jpayne@69: for key in self: jpayne@69: yield self[key] jpayne@69: jpayne@69: def iteritems(self): jpayne@69: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@69: for key in self: jpayne@69: yield (key, self[key]) jpayne@69: jpayne@69: def keys(self): jpayne@69: """D.keys() -> list of D's keys""" jpayne@69: return list(self) jpayne@69: jpayne@69: def items(self): jpayne@69: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@69: return list(self.iteritems()) jpayne@69: jpayne@69: def values(self): jpayne@69: """D.values() -> list of D's values""" jpayne@69: return list(self.itervalues()) jpayne@69: jpayne@69: # Mappings are not hashable by default, but 
subclasses can change this jpayne@69: __hash__ = None jpayne@69: jpayne@69: #TODO: implement __richcmp__ jpayne@69: jpayne@69: jpayne@69: cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type): jpayne@69: if not header: jpayne@69: raise ValueError('invalid VariantHeader') jpayne@69: jpayne@69: cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata) jpayne@69: meta.header = header jpayne@69: meta.type = type jpayne@69: jpayne@69: return meta jpayne@69: jpayne@69: jpayne@69: cdef class VariantContig(object): jpayne@69: """contig metadata from a :class:`VariantHeader`""" jpayne@69: def __init__(self, *args, **kwargs): jpayne@69: raise TypeError('this class cannot be instantiated from Python') jpayne@69: jpayne@69: @property jpayne@69: def name(self): jpayne@69: """contig name""" jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key) jpayne@69: jpayne@69: @property jpayne@69: def id(self): jpayne@69: """contig internal id number""" jpayne@69: return self.id jpayne@69: jpayne@69: @property jpayne@69: def length(self): jpayne@69: """contig length or None if not available""" jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0] jpayne@69: return length if length else None jpayne@69: jpayne@69: @property jpayne@69: def header_record(self): jpayne@69: """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object""" jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0] jpayne@69: return makeVariantHeaderRecord(self.header, hrec) jpayne@69: jpayne@69: def remove_header(self): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key jpayne@69: bcf_hdr_remove(hdr, BCF_HL_CTG, key) jpayne@69: jpayne@69: jpayne@69: cdef 
VariantContig makeVariantContig(VariantHeader header, int id): jpayne@69: if not header: jpayne@69: raise ValueError('invalid VariantHeader') jpayne@69: jpayne@69: if id < 0 or id >= header.ptr.n[BCF_DT_CTG]: jpayne@69: raise ValueError('invalid contig id') jpayne@69: jpayne@69: cdef VariantContig contig = VariantContig.__new__(VariantContig) jpayne@69: contig.header = header jpayne@69: contig.id = id jpayne@69: jpayne@69: return contig jpayne@69: jpayne@69: jpayne@69: cdef class VariantHeaderContigs(object): jpayne@69: """mapping from contig name or index to :class:`VariantContig` object.""" jpayne@69: def __init__(self, *args, **kwargs): jpayne@69: raise TypeError('this class cannot be instantiated from Python') jpayne@69: jpayne@69: def __len__(self): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: assert kh_size(hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG] jpayne@69: return hdr.n[BCF_DT_CTG] jpayne@69: jpayne@69: def __bool__(self): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: assert kh_size(hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG] jpayne@69: return hdr.n[BCF_DT_CTG] != 0 jpayne@69: jpayne@69: def __getitem__(self, key): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef int index jpayne@69: jpayne@69: if isinstance(key, int): jpayne@69: index = key jpayne@69: if index < 0 or index >= hdr.n[BCF_DT_CTG]: jpayne@69: raise IndexError('invalid contig index') jpayne@69: return makeVariantContig(self.header, index) jpayne@69: jpayne@69: cdef vdict_t *d = hdr.dict[BCF_DT_CTG] jpayne@69: cdef bytes bkey = force_bytes(key) jpayne@69: cdef khiter_t k = kh_get_vdict(d, bkey) jpayne@69: jpayne@69: if k == kh_end(d): jpayne@69: raise KeyError('invalid contig: {}'.format(key)) jpayne@69: jpayne@69: cdef int id = kh_val_vdict(d, k).id jpayne@69: jpayne@69: return makeVariantContig(self.header, id) jpayne@69: jpayne@69: def remove_header(self, key): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef int index jpayne@69: 
cdef const char *ckey jpayne@69: cdef vdict_t *d jpayne@69: cdef khiter_t k jpayne@69: jpayne@69: if isinstance(key, int): jpayne@69: index = key jpayne@69: if index < 0 or index >= hdr.n[BCF_DT_CTG]: jpayne@69: raise IndexError('invalid contig index') jpayne@69: ckey = hdr.id[BCF_DT_CTG][self.id].key jpayne@69: else: jpayne@69: d = hdr.dict[BCF_DT_CTG] jpayne@69: key = force_bytes(key) jpayne@69: if kh_get_vdict(d, key) == kh_end(d): jpayne@69: raise KeyError('invalid contig: {}'.format(key)) jpayne@69: ckey = key jpayne@69: jpayne@69: bcf_hdr_remove(hdr, BCF_HL_CTG, ckey) jpayne@69: jpayne@69: def clear_header(self): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: bcf_hdr_remove(hdr, BCF_HL_CTG, NULL) jpayne@69: #bcf_hdr_sync(hdr) jpayne@69: jpayne@69: def __iter__(self): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef vdict_t *d = hdr.dict[BCF_DT_CTG] jpayne@69: cdef uint32_t n = kh_size(d) jpayne@69: jpayne@69: assert n == hdr.n[BCF_DT_CTG] jpayne@69: jpayne@69: for i in range(n): jpayne@69: yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i)) jpayne@69: jpayne@69: def get(self, key, default=None): jpayne@69: """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" jpayne@69: try: jpayne@69: return self[key] jpayne@69: except KeyError: jpayne@69: return default jpayne@69: jpayne@69: def __contains__(self, key): jpayne@69: try: jpayne@69: self[key] jpayne@69: except KeyError: jpayne@69: return False jpayne@69: else: jpayne@69: return True jpayne@69: jpayne@69: def iterkeys(self): jpayne@69: """D.iterkeys() -> an iterator over the keys of D""" jpayne@69: return iter(self) jpayne@69: jpayne@69: def itervalues(self): jpayne@69: """D.itervalues() -> an iterator over the values of D""" jpayne@69: for key in self: jpayne@69: yield self[key] jpayne@69: jpayne@69: def iteritems(self): jpayne@69: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@69: for key in self: jpayne@69: yield (key, self[key]) jpayne@69: jpayne@69: def keys(self): jpayne@69: """D.keys() -> list of D's keys""" jpayne@69: return list(self) jpayne@69: jpayne@69: def items(self): jpayne@69: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@69: return list(self.iteritems()) jpayne@69: jpayne@69: def values(self): jpayne@69: """D.values() -> list of D's values""" jpayne@69: return list(self.itervalues()) jpayne@69: jpayne@69: # Mappings are not hashable by default, but subclasses can change this jpayne@69: __hash__ = None jpayne@69: jpayne@69: #TODO: implement __richcmp__ jpayne@69: jpayne@69: def add(self, id, length=None, **kwargs): jpayne@69: """Add a new contig record""" jpayne@69: if id in self: jpayne@69: raise ValueError('Header already exists for contig {}'.format(id)) jpayne@69: jpayne@69: items = [('ID', unquoted_str(id))] jpayne@69: if length is not None: jpayne@69: items.append(("length", unquoted_str(length))) jpayne@69: items += kwargs.items() jpayne@69: self.header.add_meta('contig', items=items) jpayne@69: jpayne@69: jpayne@69: cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header): jpayne@69: if not header: jpayne@69: raise ValueError('invalid VariantHeader') 
jpayne@69: jpayne@69: cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs) jpayne@69: contigs.header = header jpayne@69: jpayne@69: return contigs jpayne@69: jpayne@69: jpayne@69: cdef class VariantHeaderSamples(object): jpayne@69: """sequence of sample names from a :class:`VariantHeader` object""" jpayne@69: def __init__(self, *args, **kwargs): jpayne@69: raise TypeError('this class cannot be instantiated from Python') jpayne@69: jpayne@69: def __len__(self): jpayne@69: return bcf_hdr_nsamples(self.header.ptr) jpayne@69: jpayne@69: def __bool__(self): jpayne@69: return bcf_hdr_nsamples(self.header.ptr) != 0 jpayne@69: jpayne@69: def __getitem__(self, index): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef int32_t n = bcf_hdr_nsamples(hdr) jpayne@69: cdef int32_t i = index jpayne@69: jpayne@69: if i < 0 or i >= n: jpayne@69: raise IndexError('invalid sample index') jpayne@69: jpayne@69: return charptr_to_str(hdr.samples[i]) jpayne@69: jpayne@69: def __iter__(self): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef int32_t i, n = bcf_hdr_nsamples(hdr) jpayne@69: jpayne@69: for i in range(n): jpayne@69: yield charptr_to_str(hdr.samples[i]) jpayne@69: jpayne@69: def __contains__(self, key): jpayne@69: cdef bcf_hdr_t *hdr = self.header.ptr jpayne@69: cdef vdict_t *d = hdr.dict[BCF_DT_SAMPLE] jpayne@69: cdef bytes bkey = force_bytes(key) jpayne@69: cdef khiter_t k = kh_get_vdict(d, bkey) jpayne@69: jpayne@69: return k != kh_end(d) jpayne@69: jpayne@69: # Mappings are not hashable by default, but subclasses can change this jpayne@69: __hash__ = None jpayne@69: jpayne@69: #TODO: implement __richcmp__ jpayne@69: jpayne@69: def add(self, name): jpayne@69: """Add a new sample""" jpayne@69: self.header.add_sample(name) jpayne@69: jpayne@69: jpayne@69: cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header): jpayne@69: if not header: jpayne@69: raise ValueError('invalid VariantHeader') 
jpayne@69: jpayne@69: cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples) jpayne@69: samples.header = header jpayne@69: jpayne@69: return samples jpayne@69: jpayne@69: jpayne@69: cdef class VariantHeader(object): jpayne@69: """header information for a :class:`VariantFile` object""" jpayne@69: #FIXME: Add structured proxy jpayne@69: #FIXME: Add generic proxy jpayne@69: #FIXME: Add mutable methods jpayne@69: jpayne@69: # See makeVariantHeader for C constructor jpayne@69: def __cinit__(self): jpayne@69: self.ptr = NULL jpayne@69: jpayne@69: # Python constructor jpayne@69: def __init__(self): jpayne@69: self.ptr = bcf_hdr_init(b'w') jpayne@69: if not self.ptr: jpayne@69: raise ValueError('cannot create VariantHeader') jpayne@69: jpayne@69: def __dealloc__(self): jpayne@69: if self.ptr: jpayne@69: bcf_hdr_destroy(self.ptr) jpayne@69: self.ptr = NULL jpayne@69: jpayne@69: def __bool__(self): jpayne@69: return self.ptr != NULL jpayne@69: jpayne@69: def copy(self): jpayne@69: return makeVariantHeader(bcf_hdr_dup(self.ptr)) jpayne@69: jpayne@69: def merge(self, VariantHeader header): jpayne@69: if header is None: jpayne@69: raise ValueError('header must not be None') jpayne@69: bcf_hdr_merge(self.ptr, header.ptr) jpayne@69: jpayne@69: @property jpayne@69: def version(self): jpayne@69: """VCF version""" jpayne@69: return force_str(bcf_hdr_get_version(self.ptr)) jpayne@69: jpayne@69: @property jpayne@69: def samples(self): jpayne@69: """samples (:class:`VariantHeaderSamples`)""" jpayne@69: return makeVariantHeaderSamples(self) jpayne@69: jpayne@69: @property jpayne@69: def records(self): jpayne@69: """header records (:class:`VariantHeaderRecords`)""" jpayne@69: return makeVariantHeaderRecords(self) jpayne@69: jpayne@69: @property jpayne@69: def contigs(self): jpayne@69: """contig information (:class:`VariantHeaderContigs`)""" jpayne@69: return makeVariantHeaderContigs(self) jpayne@69: jpayne@69: @property jpayne@69: def filters(self): 
jpayne@69: """filter metadata (:class:`VariantHeaderMetadata`)""" jpayne@69: return makeVariantHeaderMetadata(self, BCF_HL_FLT) jpayne@69: jpayne@69: @property jpayne@69: def info(self): jpayne@69: """info metadata (:class:`VariantHeaderMetadata`)""" jpayne@69: return makeVariantHeaderMetadata(self, BCF_HL_INFO) jpayne@69: jpayne@69: @property jpayne@69: def formats(self): jpayne@69: """format metadata (:class:`VariantHeaderMetadata`)""" jpayne@69: return makeVariantHeaderMetadata(self, BCF_HL_FMT) jpayne@69: jpayne@69: @property jpayne@69: def alts(self): jpayne@69: """alt metadata (:class:`dict` ID->record). jpayne@69: jpayne@69: The data returned just a snapshot of alt records, is created jpayne@69: every time the property is requested, and modifications will jpayne@69: not be reflected in the header metadata and vice versa. jpayne@69: jpayne@69: i.e. it is just a dict that reflects the state of alt records jpayne@69: at the time it is created. jpayne@69: """ jpayne@69: return {record['ID']:record for record in self.records jpayne@69: if record.key.upper() == 'ALT' } jpayne@69: jpayne@69: # only safe to do when opening an htsfile jpayne@69: cdef _subset_samples(self, include_samples): jpayne@69: keep_samples = set(self.samples) jpayne@69: include_samples = set(include_samples) jpayne@69: missing_samples = include_samples - keep_samples jpayne@69: keep_samples &= include_samples jpayne@69: jpayne@69: if missing_samples: jpayne@69: # FIXME: add specialized exception with payload jpayne@69: raise ValueError( jpayne@69: 'missing {:d} requested samples'.format( jpayne@69: len(missing_samples))) jpayne@69: jpayne@69: keep_samples = force_bytes(','.join(keep_samples)) jpayne@69: cdef char *keep = keep_samples if keep_samples else NULL jpayne@69: cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0) jpayne@69: jpayne@69: if ret != 0: jpayne@69: raise ValueError( jpayne@69: 'bcf_hdr_set_samples failed: ret = {}'.format(ret)) jpayne@69: jpayne@69: def __str__(self): 
jpayne@69: cdef int hlen jpayne@69: cdef kstring_t line jpayne@69: line.l = line.m = 0 jpayne@69: line.s = NULL jpayne@69: jpayne@69: if bcf_hdr_format(self.ptr, 0, &line) < 0: jpayne@69: if line.m: jpayne@69: free(line.s) jpayne@69: raise ValueError('bcf_hdr_format failed') jpayne@69: jpayne@69: ret = charptr_to_str_w_len(line.s, line.l) jpayne@69: jpayne@69: if line.m: jpayne@69: free(line.s) jpayne@69: return ret jpayne@69: jpayne@69: def new_record(self, contig=None, start=0, stop=0, alleles=None, jpayne@69: id=None, qual=None, filter=None, info=None, samples=None, jpayne@69: **kwargs): jpayne@69: """Create a new empty VariantRecord. jpayne@69: jpayne@69: Arguments are currently experimental. Use with caution and expect jpayne@69: changes in upcoming releases. jpayne@69: jpayne@69: """ jpayne@69: rec = makeVariantRecord(self, bcf_init()) jpayne@69: jpayne@69: if not rec: jpayne@69: raise MemoryError('unable to allocate BCF record') jpayne@69: jpayne@69: rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr) jpayne@69: jpayne@69: if contig is not None: jpayne@69: rec.contig = contig jpayne@69: jpayne@69: rec.start = start jpayne@69: rec.stop = stop jpayne@69: rec.id = id jpayne@69: rec.qual = qual jpayne@69: jpayne@69: if alleles is not None: jpayne@69: rec.alleles = alleles jpayne@69: jpayne@69: if filter is not None: jpayne@69: if isinstance(filter, (list, tuple, VariantRecordFilter)): jpayne@69: for f in filter: jpayne@69: rec.filter.add(f) jpayne@69: else: jpayne@69: rec.filter.add(filter) jpayne@69: jpayne@69: if info: jpayne@69: rec.info.update(info) jpayne@69: jpayne@69: if kwargs: jpayne@69: rec.samples[0].update(kwargs) jpayne@69: jpayne@69: if samples: jpayne@69: for i, sample in enumerate(samples): jpayne@69: rec.samples[i].update(sample) jpayne@69: jpayne@69: return rec jpayne@69: jpayne@69: def add_record(self, VariantHeaderRecord record): jpayne@69: """Add an existing :class:`VariantHeaderRecord` to this header""" jpayne@69: if record is None: jpayne@69: 
raise ValueError('record must not be None') jpayne@69: jpayne@69: cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr) jpayne@69: jpayne@69: bcf_hdr_add_hrec(self.ptr, hrec) jpayne@69: jpayne@69: self._hdr_sync() jpayne@69: jpayne@69: def add_line(self, line): jpayne@69: """Add a metadata line to this header""" jpayne@69: bline = force_bytes(line) jpayne@69: if bcf_hdr_append(self.ptr, bline) < 0: jpayne@69: raise ValueError('invalid header line') jpayne@69: jpayne@69: self._hdr_sync() jpayne@69: jpayne@69: jpayne@69: def add_meta(self, key, value=None, items=None): jpayne@69: """Add metadata to this header""" jpayne@69: if not ((value is not None) ^ (items is not None)): jpayne@69: raise ValueError('either value or items must be specified') jpayne@69: jpayne@69: cdef bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t)) jpayne@69: cdef int quoted jpayne@69: jpayne@69: try: jpayne@69: key = force_bytes(key) jpayne@69: hrec.key = strdup(key) jpayne@69: jpayne@69: if value is not None: jpayne@69: hrec.value = strdup(force_bytes(value)) jpayne@69: else: jpayne@69: for key, value in items: jpayne@69: quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type") jpayne@69: jpayne@69: key = force_bytes(key) jpayne@69: bcf_hrec_add_key(hrec, key, len(key)) jpayne@69: jpayne@69: value = force_bytes(str(value)) jpayne@69: bcf_hrec_set_val(hrec, hrec.nkeys-1, value, len(value), quoted) jpayne@69: except: jpayne@69: bcf_hrec_destroy(hrec) jpayne@69: raise jpayne@69: jpayne@69: bcf_hdr_add_hrec(self.ptr, hrec) jpayne@69: jpayne@69: self._hdr_sync() jpayne@69: jpayne@69: cdef _add_sample(self, name): jpayne@69: bname = force_bytes(name) jpayne@69: if bcf_hdr_add_sample(self.ptr, bname) < 0: jpayne@69: raise ValueError('Duplicated sample name: {}'.format(name)) jpayne@69: jpayne@69: cdef _hdr_sync(self): jpayne@69: cdef bcf_hdr_t *hdr = self.ptr jpayne@69: if hdr.dirty: jpayne@69: if bcf_hdr_sync(hdr) < 0: jpayne@69: raise MemoryError('unable to reallocate 
VariantHeader') jpayne@69: jpayne@69: def add_sample(self, name): jpayne@69: """Add a new sample to this header""" jpayne@69: self._add_sample(name) jpayne@69: self._hdr_sync() jpayne@69: jpayne@69: def add_samples(self, *args): jpayne@69: """Add several new samples to this header. jpayne@69: This function takes multiple arguments, each of which may jpayne@69: be either a sample name or an iterable returning sample names jpayne@69: (e.g., a list of sample names). jpayne@69: """ jpayne@69: for arg in args: jpayne@69: if isinstance(arg, str): jpayne@69: self._add_sample(arg) jpayne@69: else: jpayne@69: for name in arg: jpayne@69: self._add_sample(name) jpayne@69: self._hdr_sync() jpayne@69: jpayne@69: jpayne@69: cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr): jpayne@69: if not hdr: jpayne@69: raise ValueError('cannot create VariantHeader') jpayne@69: jpayne@69: cdef VariantHeader header = VariantHeader.__new__(VariantHeader) jpayne@69: header.ptr = hdr jpayne@69: jpayne@69: return header jpayne@69: jpayne@69: jpayne@69: cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? 
-2: jpayne@69: cdef vdict_t *d jpayne@69: cdef khiter_t k jpayne@69: cdef int info_id jpayne@69: jpayne@69: if isinstance(key, str): jpayne@69: key = force_bytes(key) jpayne@69: jpayne@69: d = hdr.dict[BCF_DT_ID] jpayne@69: k = kh_get_vdict(d, key) jpayne@69: jpayne@69: if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF: jpayne@69: return -1 jpayne@69: jpayne@69: return kh_val_vdict(d, k).id jpayne@69: jpayne@69: jpayne@69: ######################################################################## jpayne@69: ######################################################################## jpayne@69: ## Variant Record objects jpayne@69: ######################################################################## jpayne@69: jpayne@69: cdef class VariantRecordFilter(object): jpayne@69: """Filters set on a :class:`VariantRecord` object, presented as a mapping from jpayne@69: filter index or name to :class:`VariantMetadata` object""" jpayne@69: def __init__(self, *args, **kwargs): jpayne@69: raise TypeError('this class cannot be instantiated from Python') jpayne@69: jpayne@69: def __len__(self): jpayne@69: return self.record.ptr.d.n_flt jpayne@69: jpayne@69: def __bool__(self): jpayne@69: return self.record.ptr.d.n_flt != 0 jpayne@69: jpayne@69: def __getitem__(self, key): jpayne@69: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@69: cdef bcf1_t *r = self.record.ptr jpayne@69: cdef int index, id jpayne@69: cdef int n = r.d.n_flt jpayne@69: jpayne@69: if isinstance(key, int): jpayne@69: index = key jpayne@69: jpayne@69: if index < 0 or index >= n: jpayne@69: raise IndexError('invalid filter index') jpayne@69: jpayne@69: id = r.d.flt[index] jpayne@69: else: jpayne@69: if key == '.': jpayne@69: key = 'PASS' jpayne@69: jpayne@69: bkey = force_bytes(key) jpayne@69: id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) jpayne@69: jpayne@69: if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey): jpayne@69: raise KeyError('Invalid filter: 
{}'.format(key)) jpayne@69: jpayne@69: return makeVariantMetadata(self.record.header, BCF_HL_FLT, id) jpayne@69: jpayne@69: def add(self, key): jpayne@69: """Add a new filter""" jpayne@69: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@69: cdef bcf1_t *r = self.record.ptr jpayne@69: cdef int id jpayne@69: jpayne@69: if key == '.': jpayne@69: key = 'PASS' jpayne@69: jpayne@69: cdef bytes bkey = force_bytes(key) jpayne@69: id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) jpayne@69: jpayne@69: if not check_header_id(hdr, BCF_HL_FLT, id): jpayne@69: raise KeyError('Invalid filter: {}'.format(key)) jpayne@69: jpayne@69: bcf_add_filter(hdr, r, id) jpayne@69: jpayne@69: def __delitem__(self, key): jpayne@69: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@69: cdef bcf1_t *r = self.record.ptr jpayne@69: cdef int index, id jpayne@69: cdef int n = r.d.n_flt jpayne@69: jpayne@69: if isinstance(key, int): jpayne@69: index = key jpayne@69: jpayne@69: if index < 0 or index >= n: jpayne@69: raise IndexError('invalid filter index') jpayne@69: jpayne@69: id = r.d.flt[index] jpayne@69: else: jpayne@69: if key == '.': jpayne@69: key = 'PASS' jpayne@69: jpayne@69: bkey = force_bytes(key) jpayne@69: id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) jpayne@69: jpayne@69: if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey): jpayne@69: raise KeyError('Invalid filter: {}'.format(key)) jpayne@69: jpayne@69: bcf_remove_filter(hdr, r, id, 0) jpayne@69: jpayne@69: def clear(self): jpayne@69: """Clear all filters""" jpayne@69: cdef bcf1_t *r = self.record.ptr jpayne@69: r.d.shared_dirty |= BCF1_DIRTY_FLT jpayne@69: r.d.n_flt = 0 jpayne@69: jpayne@69: def __iter__(self): jpayne@69: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@69: cdef bcf1_t *r = self.record.ptr jpayne@69: cdef int i jpayne@69: jpayne@69: for i in range(r.d.n_flt): jpayne@69: yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i])) jpayne@69: jpayne@69: def get(self, key, 
default=None): jpayne@69: """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" jpayne@69: try: jpayne@69: return self[key] jpayne@69: except KeyError: jpayne@69: return default jpayne@69: jpayne@69: def __contains__(self, key): jpayne@69: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@69: cdef bcf1_t *r = self.record.ptr jpayne@69: cdef bytes bkey = force_bytes(key) jpayne@69: return bcf_has_filter(hdr, r, bkey) == 1 jpayne@69: jpayne@69: def iterkeys(self): jpayne@69: """D.iterkeys() -> an iterator over the keys of D""" jpayne@69: return iter(self) jpayne@69: jpayne@69: def itervalues(self): jpayne@69: """D.itervalues() -> an iterator over the values of D""" jpayne@69: for key in self: jpayne@69: yield self[key] jpayne@69: jpayne@69: def iteritems(self): jpayne@69: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@69: for key in self: jpayne@69: yield (key, self[key]) jpayne@69: jpayne@69: def keys(self): jpayne@69: """D.keys() -> list of D's keys""" jpayne@69: return list(self) jpayne@69: jpayne@69: def items(self): jpayne@69: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@69: return list(self.iteritems()) jpayne@69: jpayne@69: def values(self): jpayne@69: """D.values() -> list of D's values""" jpayne@69: return list(self.itervalues()) jpayne@69: jpayne@69: def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op): jpayne@69: if op != 2 and op != 3: jpayne@69: return NotImplemented jpayne@69: jpayne@69: cdef bcf1_t *s = self.record.ptr jpayne@69: cdef bcf1_t *o = other.record.ptr jpayne@69: jpayne@69: cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other)) jpayne@69: jpayne@69: if op == 3: jpayne@69: cmp = not cmp jpayne@69: jpayne@69: return cmp jpayne@69: jpayne@69: # Mappings are not hashable by default, but subclasses can change this jpayne@69: __hash__ = None jpayne@69: jpayne@69: #TODO: implement __richcmp__ jpayne@69: jpayne@69: jpayne@69: 
cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
    """C-level constructor for VariantRecordFilter (not callable from Python)."""
    if not record:
        raise ValueError('invalid VariantRecord')

    cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
    filter.record = record

    return filter


cdef class VariantRecordFormat(object):
    """Format data present for each sample in a :class:`VariantRecord` object,
    presented as mapping from format name to :class:`VariantMetadata` object."""

    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    def __len__(self):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int idx, count = 0

        # only formats with live data (fmt.p != NULL) are counted
        for idx in range(r.n_fmt):
            if r.d.fmt[idx].p:
                count += 1
        return count

    def __bool__(self):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int idx

        for idx in range(r.n_fmt):
            if r.d.fmt[idx].p:
                return True
        return False

    def __getitem__(self, key):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr

        cdef bytes bkey = force_bytes(key)
        cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)

        if fmt == NULL or fmt.p == NULL:
            raise KeyError('unknown format: {}'.format(key))

        return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)

    def __delitem__(self, key):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr

        cdef bytes bkey = force_bytes(key)
        cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)

        if fmt == NULL or fmt.p == NULL:
            raise KeyError('unknown format: {}'.format(key))

        # updating the field with zero items removes it
        if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
            raise ValueError('Unable to delete FORMAT')

    def clear(self):
        """Clear all formats for all samples within the associated
        :class:`VariantRecord` instance"""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_fmt_t *fmt
        cdef const char *key
        cdef int idx

        # walk backwards so removal does not disturb the remaining entries
        for idx in reversed(range(r.n_fmt)):
            fmt = &r.d.fmt[idx]
            if fmt.p:
                key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
                if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
                    raise ValueError('Unable to delete FORMAT')

    def __iter__(self):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_fmt_t *fmt
        cdef int idx

        for idx in range(r.n_fmt):
            fmt = &r.d.fmt[idx]
            if fmt.p:
                yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
        try:
            return self[key]
        except KeyError:
            return default

    def __contains__(self, key):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bytes bkey = force_bytes(key)
        cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
        return fmt != NULL and fmt.p != NULL

    def iterkeys(self):
        """D.iterkeys() -> an iterator over the keys of D"""
        return iter(self)

    def itervalues(self):
        """D.itervalues() -> an iterator over the values of D"""
        for name in self:
            yield self[name]

    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        for name in self:
            yield (name, self[name])

    def keys(self):
        """D.keys() -> list of D's keys"""
        return list(self)

    def items(self):
        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
        return list(self.iteritems())

    def values(self):
        """D.values() -> list of D's values"""
        return list(self.itervalues())

    # Mappings are not hashable by default, but subclasses can change this
    __hash__ = None

    #TODO: implement __richcmp__


cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
    """C-level constructor for VariantRecordFormat (not callable from Python)."""
    if not record:
        raise ValueError('invalid VariantRecord')

    cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
    format.record = record

    return format


#TODO: Add a getmeta method to return the corresponding VariantMetadata?
jpayne@69: cdef class VariantRecordInfo(object): jpayne@69: """Info data stored in a :class:`VariantRecord` object, presented as a jpayne@69: mapping from info metadata name to value.""" jpayne@69: jpayne@69: def __init__(self, *args, **kwargs): jpayne@69: raise TypeError('this class cannot be instantiated from Python') jpayne@69: jpayne@69: def __len__(self): jpayne@69: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@69: cdef bcf1_t *r = self.record.ptr jpayne@69: cdef bcf_info_t *info jpayne@69: cdef const char *key jpayne@69: cdef int i, count = 0 jpayne@69: jpayne@69: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@69: raise ValueError('Error unpacking VariantRecord') jpayne@69: jpayne@69: for i in range(r.n_info): jpayne@69: info = &r.d.info[i] jpayne@69: key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) jpayne@69: if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0: jpayne@69: count += 1 jpayne@69: jpayne@69: return count jpayne@69: jpayne@69: def __bool__(self): jpayne@69: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@69: cdef bcf1_t *r = self.record.ptr jpayne@69: cdef bcf_info_t *info jpayne@69: cdef const char *key jpayne@69: cdef int i jpayne@69: jpayne@69: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@69: raise ValueError('Error unpacking VariantRecord') jpayne@69: jpayne@69: for i in range(r.n_info): jpayne@69: info = &r.d.info[i] jpayne@69: key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) jpayne@69: if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0: jpayne@69: return True jpayne@69: jpayne@69: return False jpayne@69: jpayne@69: def __getitem__(self, key): jpayne@69: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@69: cdef bcf1_t *r = self.record.ptr jpayne@69: jpayne@69: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@69: raise ValueError('Error unpacking VariantRecord') jpayne@69: jpayne@69: cdef bytes bkey = force_bytes(key) jpayne@69: jpayne@69: if strcmp(bkey, b'END') == 0: jpayne@69: raise KeyError('END is a reserved 
attribute; access is via record.stop') jpayne@69: jpayne@69: cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) jpayne@69: jpayne@69: # Cannot stop here if info == NULL, since flags must return False jpayne@69: cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key jpayne@69: jpayne@69: if info_id < 0: jpayne@69: raise KeyError('Unknown INFO field: {}'.format(key)) jpayne@69: jpayne@69: if not check_header_id(hdr, BCF_HL_INFO, info_id): jpayne@69: raise ValueError('Invalid header') jpayne@69: jpayne@69: # Handle type=Flag values jpayne@69: if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG: jpayne@69: return info != NULL and info.vptr != NULL jpayne@69: jpayne@69: if not info or not info.vptr: jpayne@69: raise KeyError('Invalid INFO field: {}'.format(key)) jpayne@69: jpayne@69: return bcf_info_get_value(self.record, info) jpayne@69: jpayne@69: def __setitem__(self, key, value): jpayne@69: cdef bytes bkey = force_bytes(key) jpayne@69: jpayne@69: if strcmp(bkey, b'END') == 0: jpayne@69: raise KeyError('END is a reserved attribute; access is via record.stop') jpayne@69: jpayne@69: if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0: jpayne@69: raise ValueError('Error unpacking VariantRecord') jpayne@69: jpayne@69: bcf_info_set_value(self.record, key, value) jpayne@69: jpayne@69: def __delitem__(self, key): jpayne@69: cdef bcf_hdr_t *hdr = self.record.header.ptr jpayne@69: cdef bcf1_t *r = self.record.ptr jpayne@69: jpayne@69: cdef bytes bkey = force_bytes(key) jpayne@69: if strcmp(bkey, b'END') == 0: jpayne@69: raise KeyError('END is a reserved attribute; access is via record.stop') jpayne@69: jpayne@69: if bcf_unpack(r, BCF_UN_INFO) < 0: jpayne@69: raise ValueError('Error unpacking VariantRecord') jpayne@69: jpayne@69: cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) jpayne@69: jpayne@69: # Cannot stop here if info == NULL, since flags must return False jpayne@69: cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else 
info.key

        # Tail of __delitem__: validate the INFO id, treat deletion of an
        # absent FLAG as a no-op, and remove the field by updating it with
        # a NULL value via bcf_update_info().
        if info_id < 0:
            raise KeyError('Unknown INFO field: {}'.format(key))

        if not check_header_id(hdr, BCF_HL_INFO, info_id):
            raise ValueError('Invalid header')

        # Handle flags
        if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
            return

        if not info or not info.vptr:
            raise KeyError('Unknown INFO field: {}'.format(key))

        if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
            raise ValueError('Unable to delete INFO')

    def clear(self):
        """Clear all info data"""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_info_t *info
        cdef const char *key
        cdef int i

        if bcf_unpack(r, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')

        for i in range(r.n_info):
            info = &r.d.info[i]
            if info and info.vptr:
                key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
                # END is deliberately excluded from the mapping interface
                # (this class consistently skips it everywhere).
                if strcmp(key, b'END') == 0:
                    continue
                # Deleting is done by updating the field with a NULL value.
                if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
                    raise ValueError('Unable to delete INFO')

    def __iter__(self):
        """Iterate over the names of INFO fields present in this record."""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_info_t *info
        cdef const char *key
        cdef int i

        if bcf_unpack(r, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')

        for i in range(r.n_info):
            info = &r.d.info[i]
            # Only fields with a live value pointer are present.
            if info and info.vptr:
                key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
                if strcmp(key, b'END') != 0:
                    yield bcf_str_cache_get_charptr(key)


    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr

        if bcf_unpack(r, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')

        cdef bytes bkey = force_bytes(key)

        # END is not exposed through the mapping interface.
        if strcmp(bkey, b'END') == 0:
            return default

        cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)

        # Cannot stop here if info == NULL, since flags must return False
        cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key

        if not check_header_id(hdr, BCF_HL_INFO, info_id):
            raise ValueError('Invalid header')

        # Handle flags
        if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
            return info != NULL and info.vptr != NULL

        if not info or not info.vptr:
            return default

        return bcf_info_get_value(self.record, info)

    def __contains__(self, key):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr

        if bcf_unpack(r, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')

        cdef bytes bkey = force_bytes(key)

        # END is never reported as a member of the mapping.
        if strcmp(bkey, b'END') == 0:
            return False

        cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)

        return info != NULL and info.vptr != NULL

    def iterkeys(self):
        """D.iterkeys() -> an iterator over the keys of D"""
        return iter(self)

    def itervalues(self):
        """D.itervalues() -> an iterator over the values of D"""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef
bcf1_t *r = self.record.ptr
        cdef bcf_info_t *info
        cdef const char *key
        cdef int i

        if bcf_unpack(r, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')

        for i in range(r.n_info):
            info = &r.d.info[i]
            if info and info.vptr:
                key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
                # END is excluded from the mapping interface.
                if strcmp(key, b'END') != 0:
                    yield bcf_info_get_value(self.record, info)

    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_info_t *info
        cdef const char *key
        cdef int i

        if bcf_unpack(r, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')

        for i in range(r.n_info):
            info = &r.d.info[i]
            if info and info.vptr:
                key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
                if strcmp(key, b'END') != 0:
                    value = bcf_info_get_value(self.record, info)
                    yield bcf_str_cache_get_charptr(key), value

    def keys(self):
        """D.keys() -> list of D's keys"""
        return list(self)

    def items(self):
        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
        return list(self.iteritems())

    def values(self):
        """D.values() -> list of D's values"""
        return list(self.itervalues())

    def update(self, items=None, **kwargs):
        """D.update([E, ]**F) -> None.

        Update D from dict/iterable E and F.
        """
        # BUG FIX: the previous implementation unconditionally called
        # items.items(), which raised AttributeError when update() was
        # invoked with keyword arguments only (items defaults to None) or
        # with an iterable of key/value pairs as the docstring advertises.
        if items is not None:
            # Accept either a mapping or an iterable of key/value pairs.
            pairs = items.items() if hasattr(items, 'items') else items
            for k, v in pairs:
                # END is managed through record.stop/rlen, not as INFO data.
                if k != 'END':
                    self[k] = v

        if kwargs:
            kwargs.pop('END', None)
            for k, v in kwargs.items():
                self[k] = v

    def pop(self, key, default=_nothing):
        """D.pop(k[,d]) -> v, remove specified key and return its value.

        If key is not found, d is returned if given, otherwise KeyError
        is raised.
        """
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr

        if bcf_unpack(r, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')

        cdef bytes bkey = force_bytes(key)
        cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)

        # Cannot stop here if info == NULL, since flags must return False
        cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key

        if info_id < 0:
            if default is _nothing:
                raise KeyError('Unknown INFO field: {}'.format(key))
            return default

        if not check_header_id(hdr, BCF_HL_INFO, info_id):
            raise ValueError('Invalid header')

        # Handle flags: popping an absent flag is a no-op returning None.
        if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
            return

        if not info or not info.vptr:
            if default is _nothing:
                raise KeyError('Unknown INFO field: {}'.format(key))
            return default

        value = bcf_info_get_value(self.record, info)

        # Remove the field by updating it with a NULL value.
        if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
            raise ValueError('Unable to delete INFO')

        return value

    def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op):
        # Only == (op 2) and != (op 3) are supported.
        if op != 2 and op != 3:
            return NotImplemented

        cdef bcf1_t *s = self.record.ptr
        cdef bcf1_t *o = other.record.ptr

        # Cannot use n_info as shortcut
        # logic, since null values may remain
        cdef bint cmp = dict(self) == dict(other)

        if op == 3:
            cmp = not cmp

        return cmp

    # Mappings are not hashable by default, but subclasses can change this
    __hash__ = None


cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
    # Internal constructor: bind a VariantRecordInfo view to a record,
    # bypassing __init__ (which is blocked for Python callers).
    if not record:
        raise ValueError('invalid VariantRecord')

    cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
    info.record = record

    return info


cdef class VariantRecordSamples(object):
    """mapping from sample index or name to :class:`VariantRecordSample` object."""
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    def __len__(self):
        return self.record.ptr.n_sample  # bcf_hdr_nsamples(self.record.header.ptr)

    def __bool__(self):
        return self.record.ptr.n_sample != 0  # bcf_hdr_nsamples(self.record.header.ptr) != 0

    def __getitem__(self, key):
        """Return the sample for an integer index or a sample-name string."""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int n = self.record.ptr.n_sample
        cdef int sample_index
        cdef vdict_t *d
        cdef khiter_t k

        if isinstance(key, int):
            sample_index = key
        else:
            # Resolve a sample name to its index via the header dictionary.
            bkey = force_bytes(key)
            sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
            if sample_index < 0:
                raise KeyError('invalid sample name: {}'.format(key))

        if sample_index < 0 or sample_index >= n:
            raise IndexError('invalid sample index')

        return makeVariantRecordSample(self.record, sample_index)

    def __iter__(self):
        """Iterate over sample names in header order."""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int32_t i, n = self.record.ptr.n_sample

        for i in range(n):
            yield charptr_to_str(hdr.samples[i])

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
        try:
            return self[key]
        except KeyError:
            return default

    def __contains__(self, key):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int n = self.record.ptr.n_sample
        cdef int sample_index
        cdef vdict_t *d
        cdef khiter_t k

        if isinstance(key, int):
            sample_index = key
        else:
            bkey = force_bytes(key)
            sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
            # NOTE(review): an unknown sample name raises KeyError here
            # rather than returning False — confirm this is intentional.
            if sample_index < 0:
                raise KeyError('invalid sample name: {}'.format(key))

        return 0 <= sample_index < n

    def iterkeys(self):
        """D.iterkeys() -> an iterator over the keys of D"""
        return iter(self)

    def itervalues(self):
        """D.itervalues() -> an iterator over the values of D"""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int32_t i, n = self.record.ptr.n_sample

        for i in range(n):
            yield makeVariantRecordSample(self.record, i)

    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int32_t i, n = self.record.ptr.n_sample

        for i in range(n):
            yield (charptr_to_str(hdr.samples[i]),
makeVariantRecordSample(self.record, i))

    def keys(self):
        """D.keys() -> list of D's keys"""
        return list(self)

    def items(self):
        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
        return list(self.iteritems())

    def values(self):
        """D.values() -> list of D's values"""
        return list(self.itervalues())

    def update(self, items=None, **kwargs):
        """D.update([E, ]**F) -> None.

        Update D from dict/iterable E and F.
        """
        # BUG FIX: the previous implementation unconditionally called
        # items.items(), which raised AttributeError when update() was
        # called with keyword arguments only (items defaults to None) or
        # with an iterable of key/value pairs as the docstring advertises.
        if items is not None:
            # Accept either a mapping or an iterable of key/value pairs.
            pairs = items.items() if hasattr(items, 'items') else items
            for k, v in pairs:
                self[k] = v

        if kwargs:
            for k, v in kwargs.items():
                self[k] = v

    def pop(self, key, default=_nothing):
        """D.pop(k[,d]) -> v, remove specified key and return its value.

        If key is not found, d is returned if given, otherwise KeyError
        is raised.
        """
        try:
            value = self[key]
            del self[key]
            return value
        except KeyError:
            if default is not _nothing:
                return default
            raise

    def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op):
        # Only == (op 2) and != (op 3) are supported.
        if op != 2 and op != 3:
            return NotImplemented

        cdef bcf1_t *s = self.record.ptr
        cdef bcf1_t *o = other.record.ptr

        cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values())

        if op == 3:
            cmp = not cmp

        return cmp

    # Mappings are not hashable by default, but subclasses can change this
    __hash__ = None


cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
    # Internal constructor: bind a VariantRecordSamples view to a record,
    # bypassing __init__ (which is blocked for Python callers).
    if not record:
        raise ValueError('invalid VariantRecord')

    cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
        VariantRecordSamples)
    samples.record = record

    return samples


cdef class VariantRecord(object):
    """Variant record"""
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    def __dealloc__(self):
        # This object owns the underlying bcf1_t; free it exactly once.
        if self.ptr:
            bcf_destroy1(self.ptr)
            self.ptr = NULL

    def copy(self):
        """return a copy of this VariantRecord object"""
        return makeVariantRecord(self.header, bcf_dup(self.ptr))

    def translate(self, VariantHeader dst_header):
        """Translate this record (in place) to be valid for dst_header."""
        if dst_header is None:
            raise ValueError('dst_header must not be None')

        cdef bcf_hdr_t *src_hdr = self.header.ptr
        cdef bcf_hdr_t *dst_hdr = dst_header.ptr

        if src_hdr != dst_hdr:
            if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
                msg = 'Cannot translate record. Number of samples does not match header ({} vs {})'
                raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))

            bcf_translate(dst_hdr, src_hdr, self.ptr)
            # The record now belongs to the destination header.
            self.header = dst_header

    @property
    def rid(self):
        """internal reference id number"""
        return self.ptr.rid

    @rid.setter
    def rid(self, value):
        cdef bcf_hdr_t *hdr = self.header.ptr
        cdef int r = value
        # Reject ids outside the header's contig table or without a value.
        if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
            raise ValueError('invalid reference id')
        self.ptr.rid = r

    @property
    def chrom(self):
        """chromosome/contig name"""
        cdef bcf_hdr_t *hdr = self.header.ptr
        cdef int rid = self.ptr.rid
        if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
            raise ValueError('Invalid header')
        return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))

    @chrom.setter
    def chrom(self, value):
        # Look the name up in the header's contig dictionary.
        cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG]
        bchrom = force_bytes(value)
        cdef khint_t k = kh_get_vdict(d, bchrom)
        if k == kh_end(d):
            raise ValueError('Invalid chromosome/contig')
        self.ptr.rid = kh_val_vdict(d, k).id

    @property
    def contig(self):
        """chromosome/contig name (alias of chrom)"""
        cdef bcf_hdr_t *hdr = self.header.ptr
        cdef int rid = self.ptr.rid
        if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
            raise ValueError('Invalid header')
        return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))

    @contig.setter
    def contig(self, value):
        cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG]
        bchrom = force_bytes(value)
        cdef khint_t k = kh_get_vdict(d, bchrom)
        if k == kh_end(d):
            raise ValueError('Invalid chromosome/contig')
        self.ptr.rid = kh_val_vdict(d, k).id

    @property
    def pos(self):
        """record start position on chrom/contig (1-based inclusive)"""
        return self.ptr.pos + 1

    @pos.setter
    def pos(self, value):
        cdef int p = value
        if p < 1:
            raise ValueError('Position must be positive')
        # htslib stores positions 0-based.
        self.ptr.pos = p - 1
        bcf_sync_end(self)

    @property
    def start(self):
        """record start position on chrom/contig (0-based inclusive)"""
        return self.ptr.pos

    @start.setter
    def start(self, value):
        cdef int s = value
        if s < 0:
            raise ValueError('Start coordinate must be non-negative')
        self.ptr.pos = s
        bcf_sync_end(self)

    @property
    def stop(self):
        """record stop position on chrom/contig (0-based exclusive)"""
        return self.ptr.pos + self.ptr.rlen

    @stop.setter
    def stop(self, value):
        cdef int s = value
        if s < 0:
            raise ValueError('Stop coordinate must be non-negative')
        self.ptr.rlen = s - self.ptr.pos
        bcf_sync_end(self)

    @property
    def rlen(self):
        """record length on chrom/contig (aka rec.stop - rec.start)"""
        return self.ptr.rlen

    @rlen.setter
    def rlen(self, value):
        cdef int r = value
        self.ptr.rlen = r
        # Keep the INFO END field consistent with the new length.
        bcf_sync_end(self)

    @property
    def qual(self):
        """phred scaled quality score or None if not available"""
        return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None

    @qual.setter
    def qual(self, value):
        if value is not None:
            self.ptr.qual = value
        else:
            # None maps to the htslib missing-float sentinel.
            bcf_float_set(&self.ptr.qual, bcf_float_missing)


#    @property
#    def n_allele(self):
#        return self.ptr.n_allele

#    @property
#    def n_sample(self):
#        return self.ptr.n_sample

    @property
    def id(self):
        """record identifier or None if not available"""
        cdef bcf1_t *r = self.ptr
        if bcf_unpack(r, BCF_UN_STR) < 0:
            raise ValueError('Error unpacking VariantRecord')
        # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
        # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
        if (r.d.m_id == 0):
            raise ValueError('Error extracting ID')
        return charptr_to_str(r.d.id) if r.d.id != b'.' else None

    @id.setter
    def id(self, value):
        cdef bcf1_t *r = self.ptr
        if bcf_unpack(r, BCF_UN_STR) < 0:
            raise ValueError('Error unpacking VariantRecord')
        # NULL clears the ID (renders as '.').
        cdef char *idstr = NULL
        if value is not None:
            bid = force_bytes(value)
            idstr = bid
        if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
            raise ValueError('Error updating id')

    @property
    def ref(self):
        """reference allele"""
        cdef bcf1_t *r = self.ptr
        if bcf_unpack(r, BCF_UN_STR) < 0:
            raise ValueError('Error unpacking VariantRecord')
        return charptr_to_str(r.d.allele[0]) if r.d.allele else None

    @ref.setter
    def ref(self, value):
        cdef bcf1_t *r = self.ptr
        if bcf_unpack(r, BCF_UN_STR) < 0:
            raise ValueError('Error unpacking VariantRecord')
        #FIXME: Set alleles directly -- this is stupid
        if not value:
            raise ValueError('ref allele must not be null')
        value = force_bytes(value)
        # Replace allele 0, preserving existing ALTs if any.
        if r.d.allele and r.n_allele:
            alleles = [r.d.allele[i] for i in range(r.n_allele)]
            alleles[0] = value
        else:
            alleles = [value, '']
        self.alleles = alleles
        bcf_sync_end(self)

    @property
    def alleles(self):
        """tuple of reference allele followed by alt alleles"""
        cdef bcf1_t *r = self.ptr
        if bcf_unpack(r, BCF_UN_STR) < 0:
            raise ValueError('Error unpacking VariantRecord')
        if not r.d.allele:
            return None
        cdef tuple res = PyTuple_New(r.n_allele)
        for i in range(r.n_allele):
            a = charptr_to_str(r.d.allele[i])
            # PyTuple_SET_ITEM steals a reference, so INCREF explicitly.
            PyTuple_SET_ITEM(res, i, a)
            Py_INCREF(a)
        return res

    @alleles.setter
    def alleles(self, values):
        cdef bcf1_t *r = self.ptr

        # Cache rlen of symbolic alleles before call to bcf_update_alleles_str
        cdef int rlen = r.rlen

        if bcf_unpack(r, BCF_UN_STR) < 0:
            raise ValueError('Error unpacking VariantRecord')

        values = [force_bytes(v) for v in values]

        if len(values) < 2:
            raise ValueError('must set at least 2 alleles')

        if b'' in values:
            raise ValueError('cannot set null allele')

        # htslib takes the alleles as a single comma-separated string.
        value = b','.join(values)

        if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
            raise ValueError('Error updating alleles')

        # Reset rlen if alternate allele isn't symbolic, otherwise used cached
        if has_symbolic_allele(self):
            self.ptr.rlen = rlen
        else:
            self.ptr.rlen = len(values[0])
        # Invalidate the cached variant type so it is recomputed on demand.
        r.d.var_type = -1
        bcf_sync_end(self)

    @property
    def alts(self):
        """tuple of alt alleles"""
        cdef bcf1_t *r = self.ptr
        if bcf_unpack(r, BCF_UN_STR) < 0:
            raise ValueError('Error unpacking VariantRecord')
        if r.n_allele < 2 or not r.d.allele:
            return None
        cdef tuple res = PyTuple_New(r.n_allele - 1)
        for i in range(1, r.n_allele):
            a = charptr_to_str(r.d.allele[i])
            # PyTuple_SET_ITEM steals a reference, so INCREF explicitly.
            PyTuple_SET_ITEM(res, i - 1, a)
            Py_INCREF(a)
        return res

    @alts.setter
    def alts(self, value):
        #FIXME: Set alleles directly -- this is stupid
        cdef bcf1_t *r = self.ptr
        if bcf_unpack(r, BCF_UN_STR) < 0:
            raise ValueError('Error unpacking VariantRecord')
        value = [force_bytes(v) for v in value]
        if b'' in value:
            raise ValueError('cannot set null alt allele')
        # Keep the current REF (or '.' placeholder) and replace the ALTs.
        ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
        self.alleles = ref + value
        r.d.var_type = -1

    @property
    def filter(self):
        """filter information (see :class:`VariantRecordFilter`)"""
        if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
            raise ValueError('Error unpacking VariantRecord')
        return makeVariantRecordFilter(self)

    @property
    def info(self):
        """info data (see :class:`VariantRecordInfo`)"""
        if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')
        return makeVariantRecordInfo(self)

    @property
    def format(self):
        """sample format metadata (see :class:`VariantRecordFormat`)"""
        if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
            raise ValueError('Error unpacking VariantRecord')
        return makeVariantRecordFormat(self)

    @property
    def samples(self):
        """sample data (see :class:`VariantRecordSamples`)"""
        if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
            raise ValueError('Error unpacking VariantRecord')
        return makeVariantRecordSamples(self)

    property alleles_variant_types:
        # Tuple of variant-type strings, one per allele (REF first),
        # as classified by htslib's bcf_get_variant_type().
        def __get__(self):
            cdef bcf1_t *r = self.ptr
            cdef tuple result = PyTuple_New(r.n_allele)

            for i in range(r.n_allele):
                tp = bcf_get_variant_type(r, i)

                if tp == VCF_REF:
                    v_type = "REF"
                elif tp == VCF_SNP:
                    v_type = "SNP"
                elif tp == VCF_MNP:
                    v_type = "MNP"
                elif tp == VCF_INDEL:
                    v_type = "INDEL"
                elif tp == VCF_BND:
                    v_type = "BND"
                elif tp == VCF_OVERLAP:
                    v_type = "OVERLAP"
                else:
                    v_type = "OTHER"

                # PyTuple_SET_ITEM steals a reference, so INCREF explicitly.
                PyTuple_SET_ITEM(result, i, v_type)
                Py_INCREF(v_type)

            return result

    def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op):
        # Only == (op 2) and != (op 3) are supported.
        if op != 2 and op != 3:
            return NotImplemented

        cdef bcf1_t *s = self.ptr
        cdef bcf1_t *o = other.ptr

        # Compare cheap scalar fields first, then the richer views.
        cdef bint cmp = self is other or (
            s.pos == o.pos
            and s.rlen == o.rlen
            and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual))
                 or s.qual == o.qual)
            and s.n_sample == o.n_sample
            and s.n_allele == o.n_allele
            and self.contig == other.contig
            and self.alleles == other.alleles
            and self.id == other.id
            and self.info == other.info
            and self.filter == other.filter
            and self.samples == other.samples)

        if op == 3:
            cmp = not cmp

        return cmp

    def __str__(self):
        """Render this record as a VCF text line via vcf_format()."""
        cdef kstring_t line
        cdef char c

        line.l = line.m = 0
        line.s = NULL

        if vcf_format(self.header.ptr, self.ptr, &line) < 0:
            if line.m:
                free(line.s)
            raise ValueError('vcf_format failed')

        # Strip CR/LF?
        #while line.l:
        #    c = line.s[line.l - 1]
        #    if c != b'\n' and c != b'\r':
        #        break
        #    line.l -= 1

        ret = charptr_to_str_w_len(line.s, line.l)

        # Free the buffer allocated by vcf_format.
        if line.m:
            free(line.s)

        return ret


cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
    # Internal constructor: wrap an htslib bcf1_t (taking ownership) after
    # surfacing any parse errors htslib recorded in errcode.
    if not header:
        raise ValueError('invalid VariantHeader')

    if not r:
        raise ValueError('cannot create VariantRecord')

    if r.errcode:
        msg = []
        #if r.errcode & BCF_ERR_CTG_UNDEF:
        #    msg.append('undefined contig')
        #if r.errcode & BCF_ERR_TAG_UNDEF:
        #    msg.append('undefined tag')
        if r.errcode & BCF_ERR_NCOLS:
            msg.append('invalid number of columns')
        if r.errcode & BCF_ERR_LIMITS:
            msg.append('limits violated')
        if r.errcode & BCF_ERR_CHAR:
            msg.append('invalid character found')
        if r.errcode & BCF_ERR_CTG_INVALID:
            msg.append('invalid contig')
        if r.errcode & BCF_ERR_TAG_INVALID:
            msg.append('invalid tag')

        if msg:
            msg = ', '.join(msg)
            raise ValueError('Error(s) reading record: {}'.format(msg))

    cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
    record.header = header
    record.ptr = r

    return record


########################################################################
########################################################################
## Variant Sampletype object
########################################################################


cdef class VariantRecordSample(object):
    """Data for a single sample from a
 :class:`VariantRecord` object.

    Provides data accessors for genotypes and a mapping interface
    from format name to values.
    """
    def __init__(self, *args, **kwargs):
        raise TypeError('this class cannot be instantiated from Python')

    @property
    def name(self):
        """sample name"""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int32_t n = r.n_sample

        if self.index < 0 or self.index >= n:
            raise ValueError('invalid sample index')

        return charptr_to_str(hdr.samples[self.index])

    @property
    def allele_indices(self):
        """allele indices for called genotype, if present. Otherwise None"""
        return bcf_format_get_allele_indices(self)

    @allele_indices.setter
    def allele_indices(self, value):
        self['GT'] = value

    @allele_indices.deleter
    def allele_indices(self):
        # An empty tuple clears the genotype.
        self['GT'] = ()

    @property
    def alleles(self):
        """alleles for called genotype, if present. Otherwise None"""
        return bcf_format_get_alleles(self)

    @alleles.setter
    def alleles(self, value):
        # Sets the genotype, supply a tuple of alleles to set.
        # The supplied alleles need to be defined in the corresponding pysam.libcbcf.VariantRecord
        # The genotype is reset when an empty tuple, None or (None,) is supplied

        if value==(None,) or value==tuple() or value is None:
            self['GT'] = ()
            return

        # Integer indices belong to .allele_indices, not here.
        if any((type(x) == int for x in value)):
            raise ValueError('Use .allele_indices to set integer allele indices')

        # determine and set allele indices:
        try:
            self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) )
        except ValueError:
            raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord."
                             "First set the .alleles of this record to define the alleles")

    @alleles.deleter
    def alleles(self):
        self['GT'] = ()

    @property
    def phased(self):
        """False if genotype is missing or any allele is unphased. Otherwise True."""
        return bcf_sample_get_phased(self)

    @phased.setter
    def phased(self, value):
        bcf_sample_set_phased(self, value)

    def __len__(self):
        """Number of format fields present for this sample."""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int i, n = 0

        if bcf_unpack(r, BCF_UN_FMT) < 0:
            raise ValueError('Error unpacking VariantRecord')

        for i in range(r.n_fmt):
            # Only fields with a live data pointer count as present.
            if r.d.fmt[i].p:
                n += 1
        return n

    def __bool__(self):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int i

        if bcf_unpack(r, BCF_UN_FMT) < 0:
            raise ValueError('Error unpacking VariantRecord')

        for i in range(r.n_fmt):
            if r.d.fmt[i].p:
                return True
        return False

    def __getitem__(self, key):
        return bcf_format_get_value(self, key)

    def __setitem__(self, key, value):
        bcf_format_set_value(self, key, value)

    def __delitem__(self, key):
        bcf_format_del_value(self, key)

    def clear(self):
        """Clear all format data (including genotype) for this sample"""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_fmt_t *fmt
        cdef int i

        for i in range(r.n_fmt):
            fmt = &r.d.fmt[i]
            if fmt.p:
                bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))

    def __iter__(self):
        """Iterate over the names of format fields present for this sample."""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_fmt_t *fmt
        cdef int i

        for i in range(r.n_fmt):
            fmt = &r.d.fmt[i]
            if r.d.fmt[i].p:
                yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
        try:
            return self[key]
        except KeyError:
            return default

    def __contains__(self, key):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bytes bkey = force_bytes(key)
        cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
        return fmt != NULL and fmt.p != NULL

    def iterkeys(self):
        """D.iterkeys() -> an iterator over the keys of D"""
        return iter(self)

    def itervalues(self):
        """D.itervalues() -> an iterator over the values of D"""
        for key in self:
            yield self[key]

    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        for key in self:
            yield (key, self[key])

    def keys(self):
        """D.keys() -> list of D's keys"""
        return list(self)

    def items(self):
        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
        return list(self.iteritems())

    def values(self):
        """D.values() -> list of D's values"""
        return list(self.itervalues())

    def update(self, items=None, **kwargs):
        """D.update([E, ]**F) -> None.

        Update D from dict/iterable E and F.
jpayne@69: """ jpayne@69: for k, v in items.items(): jpayne@69: self[k] = v jpayne@69: jpayne@69: if kwargs: jpayne@69: for k, v in kwargs.items(): jpayne@69: self[k] = v jpayne@69: jpayne@69: def pop(self, key, default=_nothing): jpayne@69: try: jpayne@69: value = self[key] jpayne@69: del self[key] jpayne@69: return value jpayne@69: except KeyError: jpayne@69: if default is not _nothing: jpayne@69: return default jpayne@69: raise jpayne@69: jpayne@69: def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op): jpayne@69: if op != 2 and op != 3: jpayne@69: return NotImplemented jpayne@69: jpayne@69: cdef bint cmp = dict(self) == dict(other) jpayne@69: jpayne@69: if op == 3: jpayne@69: cmp = not cmp jpayne@69: jpayne@69: return cmp jpayne@69: jpayne@69: # Mappings are not hashable by default, but subclasses can change this jpayne@69: __hash__ = None jpayne@69: jpayne@69: jpayne@69: cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index): jpayne@69: if not record or sample_index < 0: jpayne@69: raise ValueError('cannot create VariantRecordSample') jpayne@69: jpayne@69: cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample) jpayne@69: sample.record = record jpayne@69: sample.index = sample_index jpayne@69: jpayne@69: return sample jpayne@69: jpayne@69: jpayne@69: ######################################################################## jpayne@69: ######################################################################## jpayne@69: ## Index objects jpayne@69: ######################################################################## jpayne@69: jpayne@69: jpayne@69: cdef class BaseIndex(object): jpayne@69: def __init__(self): jpayne@69: self.refs = () jpayne@69: self.remap = {} jpayne@69: jpayne@69: def __len__(self): jpayne@69: return len(self.refs) jpayne@69: jpayne@69: def __bool__(self): jpayne@69: return len(self.refs) != 0 jpayne@69: jpayne@69: def __getitem__(self, 
key): jpayne@69: if isinstance(key, int): jpayne@69: return self.refs[key] jpayne@69: else: jpayne@69: return self.refmap[key] jpayne@69: jpayne@69: def __iter__(self): jpayne@69: return iter(self.refs) jpayne@69: jpayne@69: def get(self, key, default=None): jpayne@69: """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" jpayne@69: try: jpayne@69: return self[key] jpayne@69: except KeyError: jpayne@69: return default jpayne@69: jpayne@69: def __contains__(self, key): jpayne@69: try: jpayne@69: self[key] jpayne@69: except KeyError: jpayne@69: return False jpayne@69: else: jpayne@69: return True jpayne@69: jpayne@69: def iterkeys(self): jpayne@69: """D.iterkeys() -> an iterator over the keys of D""" jpayne@69: return iter(self) jpayne@69: jpayne@69: def itervalues(self): jpayne@69: """D.itervalues() -> an iterator over the values of D""" jpayne@69: for key in self: jpayne@69: yield self[key] jpayne@69: jpayne@69: def iteritems(self): jpayne@69: """D.iteritems() -> an iterator over the (key, value) items of D""" jpayne@69: for key in self: jpayne@69: yield (key, self[key]) jpayne@69: jpayne@69: def keys(self): jpayne@69: """D.keys() -> list of D's keys""" jpayne@69: return list(self) jpayne@69: jpayne@69: def items(self): jpayne@69: """D.items() -> list of D's (key, value) pairs, as 2-tuples""" jpayne@69: return list(self.iteritems()) jpayne@69: jpayne@69: def values(self): jpayne@69: """D.values() -> list of D's values""" jpayne@69: return list(self.itervalues()) jpayne@69: jpayne@69: def update(self, items=None, **kwargs): jpayne@69: """D.update([E, ]**F) -> None. jpayne@69: jpayne@69: Update D from dict/iterable E and F. 
jpayne@69: """ jpayne@69: for k, v in items.items(): jpayne@69: self[k] = v jpayne@69: jpayne@69: if kwargs: jpayne@69: for k, v in kwargs.items(): jpayne@69: self[k] = v jpayne@69: jpayne@69: def pop(self, key, default=_nothing): jpayne@69: try: jpayne@69: value = self[key] jpayne@69: del self[key] jpayne@69: return value jpayne@69: except KeyError: jpayne@69: if default is not _nothing: jpayne@69: return default jpayne@69: raise jpayne@69: jpayne@69: # Mappings are not hashable by default, but subclasses can change this jpayne@69: __hash__ = None jpayne@69: jpayne@69: #TODO: implement __richcmp__ jpayne@69: jpayne@69: jpayne@69: cdef class BCFIndex(object): jpayne@69: """CSI index data structure for BCF files""" jpayne@69: def __init__(self): jpayne@69: self.refs = () jpayne@69: self.refmap = {} jpayne@69: jpayne@69: if not self.ptr: jpayne@69: raise ValueError('Invalid index object') jpayne@69: jpayne@69: cdef int n jpayne@69: cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n) jpayne@69: jpayne@69: self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else () jpayne@69: self.refmap = { r:i for i,r in enumerate(self.refs) } jpayne@69: jpayne@69: def __dealloc__(self): jpayne@69: if self.ptr: jpayne@69: hts_idx_destroy(self.ptr) jpayne@69: self.ptr = NULL jpayne@69: jpayne@69: def fetch(self, bcf, contig, start, stop, reopen): jpayne@69: return BCFIterator(bcf, contig, start, stop, reopen) jpayne@69: jpayne@69: jpayne@69: cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx): jpayne@69: if not idx: jpayne@69: return None jpayne@69: jpayne@69: if not header: jpayne@69: raise ValueError('invalid VariantHeader') jpayne@69: jpayne@69: cdef BCFIndex index = BCFIndex.__new__(BCFIndex) jpayne@69: index.header = header jpayne@69: index.ptr = idx jpayne@69: index.__init__() jpayne@69: jpayne@69: return index jpayne@69: jpayne@69: jpayne@69: cdef class TabixIndex(BaseIndex): jpayne@69: """Tabix index data structure for VCF 
files""" jpayne@69: def __init__(self): jpayne@69: self.refs = () jpayne@69: self.refmap = {} jpayne@69: jpayne@69: if not self.ptr: jpayne@69: raise ValueError('Invalid index object') jpayne@69: jpayne@69: cdef int n jpayne@69: cdef const char **refs = tbx_seqnames(self.ptr, &n) jpayne@69: jpayne@69: self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else () jpayne@69: self.refmap = { r:i for i,r in enumerate(self.refs) } jpayne@69: jpayne@69: def __dealloc__(self): jpayne@69: if self.ptr: jpayne@69: tbx_destroy(self.ptr) jpayne@69: self.ptr = NULL jpayne@69: jpayne@69: def fetch(self, bcf, contig, start, stop, reopen): jpayne@69: return TabixIterator(bcf, contig, start, stop, reopen) jpayne@69: jpayne@69: jpayne@69: cdef TabixIndex makeTabixIndex(tbx_t *idx): jpayne@69: if not idx: jpayne@69: return None jpayne@69: jpayne@69: cdef TabixIndex index = TabixIndex.__new__(TabixIndex) jpayne@69: index.ptr = idx jpayne@69: index.__init__() jpayne@69: jpayne@69: return index jpayne@69: jpayne@69: jpayne@69: ######################################################################## jpayne@69: ######################################################################## jpayne@69: ## Iterators jpayne@69: ######################################################################## jpayne@69: jpayne@69: jpayne@69: cdef class BaseIterator(object): jpayne@69: pass jpayne@69: jpayne@69: jpayne@69: # Internal function to clean up after iteration stop or failure. jpayne@69: # This would be a nested function if it weren't a cdef function. 
cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
    """Free *record* and tear down *self*'s htslib iterator."""
    bcf_destroy1(record)

    # destroy iter so future calls to __next__ raise StopIteration
    bcf_itr_destroy(self.iter)
    self.iter = NULL


cdef class BCFIterator(BaseIterator):
    """Iterator over records of a region query on a CSI-indexed BCF file."""
    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
        if bcf is None:
            raise ValueError('bcf must not be None')

        if contig is None:
            raise ValueError('contig must be specified')

        if not isinstance(bcf.index, BCFIndex):
            raise ValueError('bcf index required')

        cdef BCFIndex index = bcf.index

        self.bcf = bcf
        self.index = index

        cdef int rid, cstart, cstop

        try:
            rid = index.refmap[contig]
        except KeyError:
            # A query for a non-existent contig yields an empty iterator, does not raise an error
            self.iter = NULL
            return

        if reopen:
            self.bcf = self.bcf.copy()

        # Default to the full contig when start/stop are omitted.
        cstart = start if start is not None else 0
        cstop = stop if stop is not None else MAX_POS

        with nogil:
            self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)

        if not self.iter:
            if errno:
                raise IOError(errno, strerror(errno))
            else:
                # BUGFIX: use cstart/cstop -- `start+1` raised TypeError on
                # this error path when start was None.
                raise IOError('unable to fetch {}:{}-{}'.format(contig, cstart+1, cstop))

    def __dealloc__(self):
        if self.iter:
            bcf_itr_destroy(self.iter)
            self.iter = NULL

    def __iter__(self):
        return self

    def __next__(self):
        if not self.iter:
            raise StopIteration

        cdef bcf1_t *record = bcf_init1()

        if not record:
            raise MemoryError('unable to allocate BCF record')

        record.pos = -1
        if self.bcf.drop_samples:
            record.max_unpack = BCF_UN_SHR

        cdef int ret

        with nogil:
            ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)

        if ret < 0:
            # htslib conventions: -1 is normal EOF, -2 is truncation.
            _stop_BCFIterator(self, record)
            if ret == -1:
                raise StopIteration
            elif ret == -2:
                raise IOError('truncated file')
            elif errno:
                raise IOError(errno, strerror(errno))
            else:
                raise IOError('unable to fetch next record')

        # Restrict the record to the (possibly subset) samples of the header.
        ret = bcf_subset_format(self.bcf.header.ptr, record)

        if ret < 0:
            _stop_BCFIterator(self, record)
            raise ValueError('error in bcf_subset_format')

        return makeVariantRecord(self.bcf.header, record)


cdef class TabixIterator(BaseIterator):
    """Iterator over records of a region query on a tabix-indexed VCF file."""
    def __cinit__(self, *args, **kwargs):
        # Zero the kstring line buffer before __init__ can fail.
        self.line_buffer.l = 0
        self.line_buffer.m = 0
        self.line_buffer.s = NULL

    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
        if bcf is None:
            raise ValueError('bcf must not be None')

        if not isinstance(bcf.index, TabixIndex):
            raise ValueError('tabix index required')

        cdef TabixIndex index = bcf.index

        self.bcf = bcf
        self.index = index

        cdef int rid, cstart, cstop

        try:
            rid = index.refmap[contig]
        except KeyError:
            # A query for a non-existent contig yields an empty iterator, does not raise an error
            self.iter = NULL
            return

        if reopen:
            self.bcf = self.bcf.copy()

        # Default to the full contig when start/stop are omitted.
        cstart = start if start is not None else 0
        cstop = stop if stop is not None else MAX_POS

        # BUGFIX: was tbx_itr_queryi(index.ptr, rid, start, stop), which
        # ignored the defaults computed above and failed to convert None.
        self.iter = tbx_itr_queryi(index.ptr, rid, cstart, cstop)

        if not self.iter:
            if errno:
                raise IOError(errno, strerror(errno))
            else:
                # BUGFIX: use cstart/cstop -- `start+1` raised TypeError on
                # this error path when start was None.
                raise IOError('unable to fetch {}:{}-{}'.format(contig, cstart+1, cstop))

    def __dealloc__(self):
        if self.iter:
            tbx_itr_destroy(self.iter)
            self.iter = NULL

        if self.line_buffer.m:
            free(self.line_buffer.s)

        self.line_buffer.l = 0
        self.line_buffer.m = 0
        self.line_buffer.s = NULL

    def __iter__(self):
        return self

    def __next__(self):
        if not self.iter:
            raise StopIteration

        cdef int ret

        with nogil:
            ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)

        if ret < 0:
            # htslib conventions: -1 is normal EOF, -2 is truncation.
            tbx_itr_destroy(self.iter)
            self.iter = NULL
            if ret == -1:
                raise StopIteration
            elif ret == -2:
                raise IOError('truncated file')
            elif errno:
                raise IOError(errno, strerror(errno))
            else:
                raise IOError('unable to fetch next record')

        cdef bcf1_t *record = bcf_init1()

        if not record:
            raise MemoryError('unable to allocate BCF record')

        record.pos = -1
        if self.bcf.drop_samples:
            record.max_unpack = BCF_UN_SHR

        # Parse the raw VCF text line fetched above into the BCF record.
        ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)

        # FIXME: stop iteration on parse failure?
        if ret < 0:
            bcf_destroy1(record)
            raise ValueError('error in vcf_parse')

        return makeVariantRecord(self.bcf.header, record)


########################################################################
########################################################################
## Variant File
########################################################################


cdef class VariantFile(HTSFile):
    """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
    duplicate_filehandle=True, ignore_truncation=False, threads=1)*

    A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
    opened.

    If an index for a variant file exists (.csi or .tbi), it will be
    opened automatically. Without an index random access to records
    via :meth:`fetch` is disabled.

    For writing, a :class:`VariantHeader` object must be provided,
    typically obtained from another :term:`VCF` file/:term:`BCF`
    file.

    Parameters
    ----------
    mode : string
        *mode* should be ``r`` for reading or ``w`` for writing. The default is
        text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
        ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.

        If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
        modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
        For instance, to open a :term:`BCF` formatted file for reading, type::

            f = pysam.VariantFile('ex1.bcf','r')

        If mode is not specified, we will try to auto-detect the file type. All
        of the following should work::

            f1 = pysam.VariantFile('ex1.bcf')
            f2 = pysam.VariantFile('ex1.vcf')
            f3 = pysam.VariantFile('ex1.vcf.gz')

    index_filename : string
        Explicit path to an index file.

    header : VariantHeader
        :class:`VariantHeader` object required for writing.

    drop_samples: bool
        Ignore sample information when reading.

    duplicate_filehandle: bool
        By default, file handles passed either directly or through
        File-like objects will be duplicated before passing them to
        htslib. The duplication prevents issues where the same stream
        will be closed by htslib and through destruction of the
        high-level python object. Set to False to turn off
        duplication.

    ignore_truncation: bool
        Issue a warning, instead of raising an error if the current file
        appears to be truncated due to a missing EOF marker. Only applies
        to bgzipped formats. (Default=False)

    threads: integer
        Number of threads to use for compressing/decompressing VCF/BCF files.
        Setting threads to > 1 cannot be combined with `ignore_truncation`.
        (Default=1)

    """
    def __cinit__(self, *args, **kwargs):
        self.htsfile = NULL

    def __init__(self, *args, **kwargs):
        self.header = None
        self.index = None
        self.filename = None
        self.mode = None
        self.threads = 1
        self.index_filename = None
        self.is_stream = False
        self.is_remote = False
        self.is_reading = False
        self.drop_samples = False
        self.header_written = False
        self.start_offset = -1

        self.open(*args, **kwargs)

    def __dealloc__(self):
        # Mirrors close(); kept separate because __dealloc__ may run with a
        # partially torn-down object and must not call Python-level close().
        if not self.htsfile or not self.header:
            return

        # Write header if no records were written
        if self.htsfile.is_write and not self.header_written:
            with nogil:
                bcf_hdr_write(self.htsfile, self.header.ptr)

        cdef int ret = hts_close(self.htsfile)
        self.htsfile = NULL
        self.header = self.index = None

        if ret < 0:
            global errno
            if errno == EPIPE:
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))

    def close(self):
        """closes the :class:`pysam.VariantFile`."""
        if not self.htsfile:
            return

        # Write header if no records were written
        if self.htsfile.is_write and not self.header_written:
            with nogil:
                bcf_hdr_write(self.htsfile, self.header.ptr)

        cdef int ret = hts_close(self.htsfile)
        self.htsfile = NULL
        self.header = self.index = None

        if ret < 0:
            global errno
            # A broken pipe on close is expected when piping into e.g. head.
            if errno == EPIPE:
                errno = 0
            else:
                raise IOError(errno, force_str(strerror(errno)))

    def __iter__(self):
        if not self.is_open:
            raise ValueError('I/O operation on closed file')

        if self.htsfile.is_write:
            raise ValueError('cannot iterate over Variantfile opened for writing')

        self.is_reading = 1
        return self

    def __next__(self):
        cdef int ret
        cdef int errcode
        cdef bcf1_t *record = bcf_init1()

        if not record:
            raise MemoryError('unable to allocate BCF record')

        record.pos = -1
        if self.drop_samples:
            record.max_unpack = BCF_UN_SHR

        with nogil:
            ret = bcf_read1(self.htsfile, self.header.ptr, record)

        if ret < 0:
            # htslib conventions: -1 is normal EOF, -2 is truncation.
            errcode = record.errcode
            bcf_destroy1(record)
            if errcode:
                raise IOError('unable to parse next record')
            if ret == -1:
                raise StopIteration
            elif ret == -2:
                raise IOError('truncated file')
            elif errno:
                raise IOError(errno, strerror(errno))
            else:
                raise IOError('unable to fetch next record')

        return makeVariantRecord(self.header, record)

    def copy(self):
        """Return a new VariantFile handle on the same file, sharing this
        handle's header and index."""
        if not self.is_open:
            raise ValueError

        cdef VariantFile vars = VariantFile.__new__(VariantFile)
        cdef bcf_hdr_t *hdr

        # FIXME: re-open using fd or else header and index could be invalid
        vars.htsfile = self._open_htsfile()

        if not vars.htsfile:
            raise ValueError('Cannot re-open htsfile')

        # minimize overhead by re-using header and index. This approach is
        # currently risky, but see above for how this can be mitigated.
        vars.header = self.header
        vars.index = self.index

        vars.filename = self.filename
        vars.mode = self.mode
        vars.threads = self.threads
        vars.index_filename = self.index_filename
        vars.drop_samples = self.drop_samples
        vars.is_stream = self.is_stream
        vars.is_remote = self.is_remote
        vars.is_reading = self.is_reading
        vars.start_offset = self.start_offset
        vars.header_written = self.header_written

        if self.htsfile.is_bin:
            vars.seek(self.tell())
        else:
            # Text mode: advance the new handle past the header by reading
            # (and discarding) it; makeVariantHeader takes ownership of hdr.
            with nogil:
                hdr = bcf_hdr_read(vars.htsfile)
            makeVariantHeader(hdr)

        return vars

    def open(self, filename, mode='r',
             index_filename=None,
             VariantHeader header=None,
             drop_samples=False,
             duplicate_filehandle=True,
             ignore_truncation=False,
             threads=1):
        """open a vcf/bcf file.

        If open is called on an existing VariantFile, the current file will be
        closed and a new file will be opened.
        """
        cdef bcf_hdr_t *hdr
        cdef BGZF *bgzfp
        cdef hts_idx_t *idx
        cdef tbx_t *tidx
        cdef char *cfilename
        cdef char *cindex_filename = NULL
        cdef char *cmode

        if threads > 1 and ignore_truncation:
            # This won't raise errors if reaching a truncated alignment,
            # because bgzf_mt_reader in htslib does not deal with
            # bgzf_mt_read_block returning non-zero values, contrary
            # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
            # Better to avoid this (for now) than to produce seemingly correct results.
            raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
        self.threads = threads

        # close a previously opened file
        if self.is_open:
            self.close()

        if not mode or mode[0] not in 'rwa':
            raise ValueError('mode must begin with r, w or a')

        self.duplicate_filehandle = duplicate_filehandle

        # At most one of b/c/g/u/z may appear after the r/w/a prefix.
        format_modes = [m for m in mode[1:] if m in 'bcguz']
        if len(format_modes) > 1:
            raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))

        invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
        if invalid_modes:
            raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))

        # Autodetect mode from filename
        if mode == 'w' and isinstance(filename, str):
            if filename.endswith('.gz'):
                mode = 'wz'
            elif filename.endswith('.bcf'):
                mode = 'wb'

        # for htslib, wbu seems to not work
        if mode == 'wbu':
            mode = 'wb0'

        self.mode = mode = force_bytes(mode)
        try:
            filename = encode_filename(filename)
            self.is_remote = hisremote(filename)
            self.is_stream = filename == b'-'
        except TypeError:
            # Non-path input (e.g. a file-like object): treat as a stream.
            filename = filename
            self.is_remote = False
            self.is_stream = True

        self.filename = filename

        if index_filename is not None:
            self.index_filename = index_filename = encode_filename(index_filename)
        else:
            self.index_filename = None

        self.drop_samples = bool(drop_samples)
        self.header = None

        self.header_written = False

        if mode.startswith(b'w'):
            # open file for writing
            if index_filename is not None:
                raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')

            # header structure (used for writing)
            if header:
                self.header = header.copy()
            else:
                self.header = VariantHeader()
                #raise ValueError('a VariantHeader must be specified')

            # Header is not written until the first write or on close
            self.htsfile = self._open_htsfile()

            if not self.htsfile:
                raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))

        elif mode.startswith(b'r'):
            # open file for reading
            self.htsfile = self._open_htsfile()

            if not self.htsfile:
                if errno:
                    raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno))))
                else:
                    raise ValueError('could not open variant file `{}`'.format(filename))

            if self.htsfile.format.format not in (bcf, vcf):
                raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))

            self.check_truncation(ignore_truncation)

            with nogil:
                hdr = bcf_hdr_read(self.htsfile)

            try:
                self.header = makeVariantHeader(hdr)
            except ValueError:
                raise ValueError('file `{}` does not have valid header (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))

            if isinstance(self.filename, bytes):
                cfilename = self.filename
            else:
                cfilename = NULL

            # check for index and open if present
            if self.htsfile.format.format == bcf and cfilename:
                if index_filename is not None:
                    cindex_filename = index_filename
                with nogil:
                    idx = bcf_index_load2(cfilename, cindex_filename)
                self.index = makeBCFIndex(self.header, idx)

            elif self.htsfile.format.compression == bgzf and cfilename:
                if index_filename is not None:
                    cindex_filename = index_filename
                with nogil:
                    tidx = tbx_index_load2(cfilename, cindex_filename)
                self.index = makeTabixIndex(tidx)

            if not self.is_stream:
                # Remember where the records start so reset() can seek back.
                self.start_offset = self.tell()
        else:
            raise ValueError('unknown mode {}'.format(mode))

    def reset(self):
        """reset file position to beginning of file just after the header."""
        return self.seek(self.start_offset)

    def is_valid_tid(self, tid):
        """
        return True if the numerical :term:`tid` is valid; False otherwise.
        """
        if not self.is_open:
            raise ValueError('I/O operation on closed file')

        cdef bcf_hdr_t *hdr = self.header.ptr
        cdef int rid = tid
        return 0 <= rid < hdr.n[BCF_DT_CTG]

    def get_tid(self, reference):
        """
        return the numerical :term:`tid` corresponding to
        :term:`reference`

        returns -1 if reference is not known.
        """
        if not self.is_open:
            raise ValueError('I/O operation on closed file')

        cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG]
        reference = force_bytes(reference)
        cdef khint_t k = kh_get_vdict(d, reference)
        return kh_val_vdict(d, k).id if k != kh_end(d) else -1

    def get_reference_name(self, tid):
        """
        return :term:`reference` name corresponding to numerical :term:`tid`
        """
        if not self.is_open:
            raise ValueError('I/O operation on closed file')

        cdef bcf_hdr_t *hdr = self.header.ptr
        cdef int rid = tid
        if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
            raise ValueError('Invalid tid')
        return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))

    def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
        """fetch records in a :term:`region`, specified either by
        :term:`contig`, *start*, and *end* (which are 0-based, half-open);
        or alternatively by a samtools :term:`region` string (which is
        1-based inclusive).

        Without *contig* or *region* all mapped records will be fetched. The
        records will be returned ordered by contig, which will not necessarily
        be the order within the file.

        Set *reopen* to true if you will be using multiple iterators on the
        same file at the same time. The iterator returned will receive its
        own copy of a filehandle to the file effectively re-opening the
        file. Re-opening a file incurrs some overhead, so use with care.

        If only *contig* is set, all records on *contig* will be fetched.
        If both *region* and *contig* are given, an exception is raised.

        Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
        (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
        read sequentially.
        """
        if not self.is_open:
            raise ValueError('I/O operation on closed file')

        if self.htsfile.is_write:
            raise ValueError('cannot fetch from Variantfile opened for writing')

        if contig is None and region is None:
            # No region: iterate sequentially from just after the header.
            self.is_reading = 1
            bcf = self.copy() if reopen else self
            bcf.seek(self.start_offset)
            return iter(bcf)

        if self.index is None:
            raise ValueError('fetch requires an index')

        _, tid, start, stop = self.parse_region(contig, start, stop, region,
                                                None, end=end, reference=reference)

        if contig is None:
            contig = self.get_reference_name(tid)

        self.is_reading = 1
        return self.index.fetch(self, contig, start, stop, reopen)

    def new_record(self, *args, **kwargs):
        """Create a new empty :class:`VariantRecord`.

        See :meth:`VariantHeader.new_record`
        """
        return self.header.new_record(*args, **kwargs)

    cpdef int write(self, VariantRecord record) except -1:
        """
        write a single :class:`pysam.VariantRecord` to disk.

        returns the number of bytes written.
        """
        if record is None:
            raise ValueError('record must not be None')

        if not self.is_open:
            # BUGFIX: was `return ValueError(...)`, which returned the
            # exception object instead of raising it (and violated the
            # declared int return type).
            raise ValueError('I/O operation on closed file')

        if not self.htsfile.is_write:
            raise ValueError('cannot write to a Variantfile opened for reading')

        # The header is written lazily on the first record.
        if not self.header_written:
            self.header_written = True
            with nogil:
                bcf_hdr_write(self.htsfile, self.header.ptr)

        #if record.header is not self.header:
        #    record.translate(self.header)
        #    raise ValueError('Writing records from a different VariantFile is not yet supported')

        if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
            msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
            raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))

        # Sync END annotation before writing
        bcf_sync_end(record)

        cdef int ret

        with nogil:
            ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)

        if ret < 0:
            raise IOError(errno, strerror(errno))

        return ret

    def subset_samples(self, include_samples):
        """
        Read only a subset of samples to reduce processing time and memory.
        Must be called prior to retrieving records.
        """
        if not self.is_open:
            raise ValueError('I/O operation on closed file')

        if self.htsfile.is_write:
            raise ValueError('cannot subset samples from Variantfile opened for writing')

        if self.is_reading:
            raise ValueError('cannot subset samples after fetching records')

        self.header._subset_samples(include_samples)

        # potentially unnecessary optimization that also sets max_unpack
        if not include_samples:
            self.drop_samples = True