Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcbcf.pyx @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcbcf.pyx Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,4538 @@ +# cython: language_level=3 +# cython: embedsignature=True +# cython: profile=True +############################################################################### +############################################################################### +## Cython wrapper for htslib VCF/BCF reader/writer +############################################################################### +# +# NOTICE: This code is incomplete and preliminary. It offers a nearly +# complete Pythonic interface to VCF/BCF metadata and data with +# reading and writing capability. Documentation and a unit test suite +# are in the works. The code is best tested under Python 2, but +# should also work with Python 3. Please report any remaining +# str/bytes issues on the github site when using Python 3 and I'll +# fix them promptly. +# +# Here is a minimal example of how to use the API: +# +# $ cat bcfview.py +# import sys +# from pysam import VariantFile +# +# bcf_in = VariantFile(sys.argv[1]) # auto-detect input format +# bcf_out = VariantFile('-', 'w', header=bcf_in.header) +# +# for rec in bcf_in: +# bcf_out.write(rec) +# +# Performance is fairly close to that of bcftools view. 
Here is an example +# using some 1k Genomes data: +# +# $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l +# 1103799 +# +# real 0m56.114s +# user 1m4.489s +# sys 0m3.102s +# +# $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l +# 1103800 # bcftools adds an extra header +# +# real 0m55.126s +# user 1m3.502s +# sys 0m3.459s +# +############################################################################### +# +# TODO list: +# +# * more genotype methods +# * unit test suite (perhaps py.test based) +# * documentation +# * pickle support +# * left/right locus normalization +# * fix reopen to re-use fd +# +############################################################################### +# +# The MIT License +# +# Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com) +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# +############################################################################### + +from __future__ import division, print_function + +import os +import sys + +from libc.errno cimport errno, EPIPE +from libc.string cimport strcmp, strpbrk, strerror +from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX + +cimport cython + +from cpython.object cimport PyObject +from cpython.ref cimport Py_INCREF +from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString +from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM +from cpython.bytes cimport PyBytes_FromStringAndSize +from cpython.unicode cimport PyUnicode_DecodeUTF8 + +from pysam.libchtslib cimport HTSFile, hisremote + +from pysam.utils import unquoted_str + + +__all__ = ['VariantFile', + 'VariantHeader', + 'VariantHeaderRecord', + 'VariantHeaderRecords', + 'VariantMetadata', + 'VariantHeaderMetadata', + 'VariantContig', + 'VariantHeaderContigs', + 'VariantHeaderSamples', + 'VariantRecordFilter', + 'VariantRecordFormat', + 'VariantRecordInfo', + 'VariantRecordSamples', + 'VariantRecord', + 'VariantRecordSample', + 'BaseIndex', + 'BCFIndex', + 'TabixIndex', + 'BaseIterator', + 'BCFIterator', + 'TabixIterator', + 'VariantRecord'] + +######################################################################## +######################################################################## +## Constants +######################################################################## + +cdef int MAX_POS = (1 << 31) - 1 +cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String') +cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC') +cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R') + + +######################################################################## +######################################################################## +## Python 3 compatibility functions +######################################################################## + +from pysam.libcutils 
cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len +from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes + + +######################################################################## +######################################################################## +## Sentinel object +######################################################################## + +cdef object _nothing = object() + +######################################################################## +######################################################################## +## VCF/BCF string intern system +######################################################################## + +cdef dict bcf_str_cache = {} + +cdef inline bcf_str_cache_get_charptr(const char* s): + if s == NULL: + return None + + cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s) + if pystr: + return <object>pystr + + val = PyUnicode_DecodeUTF8(s, strlen(s), NULL) + + PyDict_SetItemString(bcf_str_cache, s, val) + + return val + + +######################################################################## +######################################################################## +## Genotype math +######################################################################## + +cdef int comb(int n, int k) except -1: + """Return binomial coefficient: n choose k + + >>> comb(5, 1) + 5 + >>> comb(5, 2) + 10 + >>> comb(2, 2) + 1 + >>> comb(100, 2) + 4950 + """ + if k > n: + return 0 + elif k == n: + return 1 + elif k > n // 2: + k = n - k + + cdef d, result + + d = result = n - k + 1 + for i in range(2, k + 1): + d += 1 + result *= d + result //= i + return result + + +cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1: + """Return the count of genotypes expected for the given ploidy and number of alleles. 
+ + >>> bcf_geno_combinations(1, 2) + 2 + >>> bcf_geno_combinations(2, 2) + 3 + >>> bcf_geno_combinations(2, 3) + 6 + >>> bcf_geno_combinations(3, 2) + 4 + """ + return comb(alleles + ploidy - 1, ploidy) + + +######################################################################## +######################################################################## +## Low level type conversion helpers +######################################################################## + + +cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id): + return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id) + + +cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id): + return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0 + + +cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1: + + if sample < 0: + raise ValueError('genotype is only valid as a format field') + + cdef int32_t *gt_arr = NULL + cdef int ngt = 0 + ngt = bcf_get_genotypes(hdr, rec, >_arr, &ngt) + + if ngt <= 0 or not gt_arr: + return 0 + + assert ngt % rec.n_sample == 0 + cdef int max_ploidy = ngt // rec.n_sample + cdef int32_t *gt = gt_arr + sample * max_ploidy + cdef int ploidy = 0 + + while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end: + gt += 1 + ploidy += 1 + + free(<void*>gt_arr) + + return bcf_geno_combinations(ploidy, rec.n_allele) + + +cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0): + if not a: + return None + try: + return tuple(charptr_to_str(a[i]) for i in range(n)) + finally: + if free_after and a: + free(a) + + +cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar): + cdef char *datac + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + cdef float *dataf + cdef int i + cdef bytes b + + if not data or n <= 0: + return None + + if type == BCF_BT_CHAR: + datac = <char *>data + + if not n: + value = () + else: + # Check if at least one null 
terminator is present + if datac[n-1] == bcf_str_vector_end: + # If so, create a string up to the first null terminator + b = datac + else: + # Otherwise, copy the entire block + b = datac[:n] + value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b',')) + else: + value = [] + if type == BCF_BT_INT8: + data8 = <int8_t *>data + for i in range(n): + if data8[i] == bcf_int8_vector_end: + break + value.append(data8[i] if data8[i] != bcf_int8_missing else None) + elif type == BCF_BT_INT16: + data16 = <int16_t *>data + for i in range(n): + if data16[i] == bcf_int16_vector_end: + break + value.append(data16[i] if data16[i] != bcf_int16_missing else None) + elif type == BCF_BT_INT32: + data32 = <int32_t *>data + for i in range(n): + if data32[i] == bcf_int32_vector_end: + break + value.append(data32[i] if data32[i] != bcf_int32_missing else None) + elif type == BCF_BT_FLOAT: + dataf = <float *>data + for i in range(n): + if bcf_float_is_vector_end(dataf[i]): + break + value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None) + else: + raise TypeError('unsupported info type code') + + # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do. 
+ if not value: + if scalar: + value = None + elif count <= 0: + value = () + else: + value = (None,)*count + elif scalar and len(value) == 1: + value = value[0] + else: + value = tuple(value) + + return value + + +cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen): + cdef char *datac + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + cdef float *dataf + cdef ssize_t i, value_count = len(values) + + assert value_count <= n + + if bt_type == BCF_BT_CHAR: + if not isinstance(values, (str, bytes)): + values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values) + value_count = len(values) + assert value_count <= n + datac = <char *>data + memcpy(datac, <char *>values, value_count) + for i in range(value_count, n): + datac[i] = 0 + elif bt_type == BCF_BT_INT8: + datai8 = <int8_t *>data + for i in range(value_count): + val = values[i] + datai8[i] = val if val is not None else bcf_int8_missing + for i in range(value_count, n): + datai8[i] = bcf_int8_vector_end + elif bt_type == BCF_BT_INT16: + datai16 = <int16_t *>data + for i in range(value_count): + val = values[i] + datai16[i] = val if val is not None else bcf_int16_missing + for i in range(value_count, n): + datai16[i] = bcf_int16_vector_end + elif bt_type == BCF_BT_INT32: + datai32 = <int32_t *>data + for i in range(value_count): + val = values[i] + datai32[i] = val if val is not None else bcf_int32_missing + for i in range(value_count, n): + datai32[i] = bcf_int32_vector_end + elif bt_type == BCF_BT_FLOAT: + dataf = <float *>data + for i in range(value_count): + val = values[i] + if val is None: + bcf_float_set(dataf + i, bcf_float_missing) + else: + dataf[i] = val + for i in range(value_count, n): + bcf_float_set(dataf + i, bcf_float_vector_end) + else: + raise TypeError('unsupported type') + + +cdef bcf_empty_array(int type, ssize_t n, int vlen): + cdef char *datac + cdef int32_t *data32 + cdef float *dataf + cdef int i + + if n <= 0: + raise 
ValueError('Cannot create empty array') + + if type == BCF_HT_STR: + value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n) + datac = <char *>value + for i in range(n): + datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end + elif type == BCF_HT_INT: + value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n) + data32 = <int32_t *><char *>value + for i in range(n): + data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end + elif type == BCF_HT_REAL: + value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n) + dataf = <float *><char *>value + for i in range(n): + bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end) + else: + raise TypeError('unsupported header type code') + + return value + + +cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values, + void *dst_data, int dst_type, size_t dst_values, + int vlen): + """copy data from src to dest where the size of the elements (src_type/dst_type) differ + as well as the number of elements (src_values/dst_values). 
+ """ + + cdef char *src_datac + cdef char *dst_datac + cdef int8_t *src_datai8 + cdef int16_t *src_datai16 + cdef int32_t *src_datai32 + cdef int32_t *dst_datai + cdef float *src_dataf + cdef float *dst_dataf + cdef ssize_t src_size, dst_size, i, j + cdef int val + + if src_values > dst_values: + raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values)) + + if src_type == dst_type == BCF_BT_CHAR: + src_datac = <char *>src_data + dst_datac = <char *>dst_data + memcpy(dst_datac, src_datac, src_values) + for i in range(src_values, dst_values): + dst_datac[i] = 0 + elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32: + src_datai8 = <int8_t *>src_data + dst_datai = <int32_t *>dst_data + for i in range(src_values): + val = src_datai8[i] + if val == bcf_int8_missing: + val = bcf_int32_missing + elif val == bcf_int8_vector_end: + val = bcf_int32_vector_end + dst_datai[i] = val + for i in range(src_values, dst_values): + dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end + elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32: + src_datai16 = <int16_t *>src_data + dst_datai = <int32_t *>dst_data + for i in range(src_values): + val = src_datai16[i] + if val == bcf_int16_missing: + val = bcf_int32_missing + elif val == bcf_int16_vector_end: + val = bcf_int32_vector_end + dst_datai[i] = val + for i in range(src_values, dst_values): + dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end + elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32: + src_datai32 = <int32_t *>src_data + dst_datai = <int32_t *>dst_data + for i in range(src_values): + dst_datai[i] = src_datai32[i] + for i in range(src_values, dst_values): + dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end + elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT: + src_dataf = <float *>src_data + dst_dataf = <float *>dst_data + for i in range(src_values): + dst_dataf[i] = src_dataf[i] + for i in 
range(src_values, dst_values): + bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end) + else: + raise TypeError('unsupported types') + + +cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample): + if record is None: + raise ValueError('record must not be None') + + cdef bcf_hdr_t *hdr = record.header.ptr + cdef bcf1_t *r = record.ptr + + if not check_header_id(hdr, hl_type, id): + raise ValueError('Invalid header') + + cdef int length = bcf_hdr_id2length(hdr, hl_type, id) + cdef int number = bcf_hdr_id2number(hdr, hl_type, id) + + scalar[0] = 0 + + if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id): + count[0] = number + elif length == BCF_VL_FIXED: + if number == 1: + scalar[0] = 1 + count[0] = number + elif length == BCF_VL_R: + count[0] = r.n_allele + elif length == BCF_VL_A: + count[0] = r.n_allele - 1 + elif length == BCF_VL_G: + count[0] = bcf_genotype_count(hdr, r, sample) + elif length == BCF_VL_VAR: + count[0] = -1 + else: + raise ValueError('Unknown format length') + + +cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z): + if record is None: + raise ValueError('record must not be None') + + cdef bcf_hdr_t *hdr = record.header.ptr + + cdef char *s + cdef ssize_t count + cdef int scalar + + bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1) + + if z.len == 0: + if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG: + value = True + elif scalar: + value = None + else: + value = () + elif z.len == 1: + if z.type == BCF_BT_INT8: + value = z.v1.i if z.v1.i != bcf_int8_missing else None + elif z.type == BCF_BT_INT16: + value = z.v1.i if z.v1.i != bcf_int16_missing else None + elif z.type == BCF_BT_INT32: + value = z.v1.i if z.v1.i != bcf_int32_missing else None + elif z.type == BCF_BT_FLOAT: + value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None + elif z.type == BCF_BT_CHAR: + value = force_str(chr(z.v1.i)) + else: + raise 
TypeError('unsupported info type code') + + if not scalar and value != (): + value = (value,) + else: + value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar) + + return value + + +cdef object bcf_check_values(VariantRecord record, value, int sample, + int hl_type, int ht_type, + int id, int bt_type, ssize_t bt_len, + ssize_t *value_count, int *scalar, int *realloc): + + if record is None: + raise ValueError('record must not be None') + + bcf_get_value_count(record, hl_type, id, value_count, scalar, sample) + + # Validate values now that we know the type and size + values = (value,) if not isinstance(value, (list, tuple)) else value + + # Validate values now that we know the type and size + if ht_type == BCF_HT_FLAG: + value_count[0] = 1 + elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id): + # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1) + value_count[0] = -1 + + cdef int given = len(values) + if value_count[0] != -1 and value_count[0] != given: + if scalar[0]: + raise TypeError('value expected to be scalar, given len={}'.format(given)) + else: + raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given)) + + if ht_type == BCF_HT_REAL: + for v in values: + if not(v is None or isinstance(v, (float, int))): + raise TypeError('invalid value for Float format') + elif ht_type == BCF_HT_INT: + for v in values: + if not(v is None or (isinstance(v, (float, int)) and int(v) == v)): + raise TypeError('invalid value for Integer format') + for v in values: + if not(v is None or bcf_int32_missing < v <= INT32_MAX): + raise ValueError('Integer value too small/large to store in VCF/BCF') + elif ht_type == BCF_HT_STR: + values = b','.join(force_bytes(v) if v is not None else b'' for v in values) + elif ht_type == BCF_HT_FLAG: + if values[0] not in (True, False, None, 1, 0): + raise ValueError('Flag values must be: True, False, None, 1, 0') + else: + raise TypeError('unsupported type') + + 
realloc[0] = 0 + if len(values) <= 1 and hl_type == BCF_HL_INFO: + realloc[0] = 0 + elif len(values) > bt_len: + realloc[0] = 1 + elif bt_type == BCF_BT_INT8: + for v in values: + if v is not None and not(bcf_int8_missing < v <= INT8_MAX): + realloc[0] = 1 + break + elif bt_type == BCF_BT_INT16: + for v in values: + if v is not None and not(bcf_int16_missing < v <= INT16_MAX): + realloc[0] = 1 + break + + return values + + +cdef bcf_encode_alleles(VariantRecord record, values): + if record is None: + raise ValueError('record must not be None') + + cdef bcf1_t *r = record.ptr + cdef int32_t nalleles = r.n_allele + cdef list gt_values = [] + cdef char *s + cdef int i + + if values is None: + return () + + if not isinstance(values, (list, tuple)): + values = (values,) + + for value in values: + if value is None: + gt_values.append(bcf_gt_missing) + elif isinstance(value, (str, bytes)): + bvalue = force_bytes(value) + s = bvalue + for i in range(r.n_allele): + if strcmp(r.d.allele[i], s) != 0: + gt_values.append(bcf_gt_unphased(i)) + break + else: + raise ValueError('Unknown allele') + else: + i = value + if not (0 <= i < nalleles): + raise ValueError('Invalid allele index') + gt_values.append(bcf_gt_unphased(i)) + + return gt_values + + +cdef bcf_info_set_value(VariantRecord record, key, value): + if record is None: + raise ValueError('record must not be None') + + cdef bcf_hdr_t *hdr = record.header.ptr + cdef bcf1_t *r = record.ptr + cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0 + cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + if info: + info_id = info.key + else: + info_id = bcf_header_get_info_id(hdr, bkey) + + if info_id < 0: + raise KeyError('unknown INFO: {}'.format(key)) + + if not check_header_id(hdr, BCF_HL_INFO, info_id): + raise 
ValueError('Invalid header') + + info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) + values = bcf_check_values(record, value, -1, + BCF_HL_INFO, info_type, info_id, + info.type if info else -1, + info.len if info else -1, + &value_count, &scalar, &realloc) + + if info_type == BCF_HT_FLAG: + if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0: + raise ValueError('Unable to update INFO values') + return + + vlen = value_count < 0 + value_count = len(values) + + # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS + # If we can, write updated values to existing allocated storage + if 0 and info and not realloc: + r.d.shared_dirty |= BCF1_DIRTY_INF + + if value_count == 0: + info.len = 0 + if not info.vptr: + info.vptr = <uint8_t *>&info.v1.i + + elif value_count == 1: + # FIXME: Check if need to free vptr if info.len > 0? + if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32: + bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen) + elif info.type == BCF_BT_FLOAT: + bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen) + else: + raise TypeError('unsupported info type code') + + info.len = 1 + if not info.vptr: + info.vptr = <uint8_t *>&info.v1.i + else: + bcf_object_to_array(values, info.vptr, info.type, info.len, vlen) + + return + + alloc_len = max(1, value_count) + if info and info.len > alloc_len: + alloc_len = info.len + + new_values = bcf_empty_array(info_type, alloc_len, vlen) + cdef char *valp = <char *>new_values + + if info_type == BCF_HT_INT: + dst_type = BCF_BT_INT32 + elif info_type == BCF_HT_REAL: + dst_type = BCF_BT_FLOAT + elif info_type == BCF_HT_STR: + dst_type = BCF_BT_CHAR + else: + raise ValueError('Unsupported INFO type') + + bcf_object_to_array(values, valp, dst_type, alloc_len, vlen) + + if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0: + raise ValueError('Unable to update INFO values') + + +cdef bcf_info_del_value(VariantRecord record, key): + if 
record is None: + raise ValueError('record must not be None') + + cdef bcf_hdr_t *hdr = record.header.ptr + cdef bcf1_t *r = record.ptr + cdef ssize_t value_count + cdef int scalar + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + if not info: + raise KeyError(key) + + bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1) + + if value_count <= 0: + null_value = () + elif scalar: + null_value = None + else: + null_value = (None,)*value_count + + bcf_info_set_value(record, bkey, null_value) + + +cdef bcf_format_get_value(VariantRecordSample sample, key): + if sample is None: + raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef ssize_t count + cdef int scalar + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + + if not fmt or not fmt.p: + raise KeyError('invalid FORMAT: {}'.format(key)) + + if is_gt_fmt(hdr, fmt.id): + return bcf_format_get_allele_indices(sample) + + bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index) + + if fmt.p and fmt.n and fmt.size: + return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar) + elif scalar: + return None + elif count <= 0: + return () + else: + return (None,)*count + + +cdef bcf_format_set_value(VariantRecordSample sample, key, value): + if sample is None: + raise ValueError('sample must not be None') + + if key == 'phased': + sample.phased = bool(value) + return + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef int fmt_id + cdef vdict_t *d + cdef khiter_t k + cdef int fmt_type, scalar, realloc, dst_type, vlen = 0 + cdef ssize_t i, nsamples, 
value_count, alloc_size, alloc_len, dst_size + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + + if fmt: + fmt_id = fmt.id + else: + d = <vdict_t *>hdr.dict[BCF_DT_ID] + k = kh_get_vdict(d, bkey) + + if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF: + raise KeyError('unknown format: {}'.format(key)) + + fmt_id = kh_val_vdict(d, k).id + + if not check_header_id(hdr, BCF_HL_FMT, fmt_id): + raise ValueError('Invalid header') + + fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id) + + if fmt_type == BCF_HT_FLAG: + raise ValueError('Flag types are not allowed on FORMATs') + + if is_gt_fmt(hdr, fmt_id): + value = bcf_encode_alleles(sample.record, value) + # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT. + fmt_type = BCF_HT_INT + + values = bcf_check_values(sample.record, value, sample.index, + BCF_HL_FMT, fmt_type, fmt_id, + fmt.type if fmt else -1, + fmt.n if fmt else -1, + &value_count, &scalar, &realloc) + vlen = value_count < 0 + value_count = len(values) + + # If we can, write updated values to existing allocated storage. 
+ if fmt and not realloc: + r.d.indiv_dirty = 1 + bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen) + return + + alloc_len = max(1, value_count) + if fmt and fmt.n > alloc_len: + alloc_len = fmt.n + + nsamples = r.n_sample + new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen) + cdef char *new_values_p = <char *>new_values + + if fmt_type == BCF_HT_INT: + dst_type = BCF_BT_INT32 + dst_size = sizeof(int32_t) * alloc_len + elif fmt_type == BCF_HT_REAL: + dst_type = BCF_BT_FLOAT + dst_size = sizeof(float) * alloc_len + elif fmt_type == BCF_HT_STR: + dst_type = BCF_BT_CHAR + dst_size = sizeof(char) * alloc_len + else: + raise ValueError('Unsupported FORMAT type') + + if fmt and nsamples > 1: + for i in range(nsamples): + bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n, + new_values_p + i * dst_size, dst_type, alloc_len, + vlen) + + bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen) + + if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0: + raise ValueError('Unable to update format values') + + +cdef bcf_format_del_value(VariantRecordSample sample, key): + if sample is None: + raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef ssize_t value_count + cdef int scalar + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + + if not fmt or not fmt.p: + raise KeyError(key) + + bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index) + + if value_count <= 0: + null_value = () + elif scalar: + null_value = None + else: + null_value = (None,)*value_count + + bcf_format_set_value(sample, bkey, null_value) + + +cdef bcf_format_get_allele_indices(VariantRecordSample sample): + if sample is None: + 
raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef int32_t n = r.n_sample + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + if sample.index < 0 or sample.index >= n or not r.n_fmt: + return () + + cdef bcf_fmt_t *fmt0 = r.d.fmt + cdef int gt0 = is_gt_fmt(hdr, fmt0.id) + + if not gt0 or not fmt0.n: + return () + + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + cdef int32_t a, nalleles = r.n_allele + cdef list alleles = [] + + if fmt0.type == BCF_BT_INT8: + data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data8[i] == bcf_int8_vector_end: + break + elif data8[i] == bcf_gt_missing: + a = -1 + else: + a = bcf_gt_allele(data8[i]) + alleles.append(a if 0 <= a < nalleles else None) + elif fmt0.type == BCF_BT_INT16: + data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data16[i] == bcf_int16_vector_end: + break + elif data16[i] == bcf_gt_missing: + a = -1 + else: + a = bcf_gt_allele(data16[i]) + alleles.append(a if 0 <= a < nalleles else None) + elif fmt0.type == BCF_BT_INT32: + data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data32[i] == bcf_int32_vector_end: + break + elif data32[i] == bcf_gt_missing: + a = -1 + else: + a = bcf_gt_allele(data32[i]) + alleles.append(a if 0 <= a < nalleles else None) + + return tuple(alleles) + + +cdef bcf_format_get_alleles(VariantRecordSample sample): + if sample is None: + raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef int32_t nsamples = r.n_sample + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef int32_t nalleles = r.n_allele + + if sample.index < 0 or sample.index >= nsamples or not r.n_fmt: + return () + + cdef bcf_fmt_t *fmt0 = r.d.fmt + 
cdef int gt0 = is_gt_fmt(hdr, fmt0.id) + + if not gt0 or not fmt0.n: + return () + + cdef int32_t a + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + alleles = [] + if fmt0.type == BCF_BT_INT8: + data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data8[i] == bcf_int8_vector_end: + break + a = bcf_gt_allele(data8[i]) + alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) + elif fmt0.type == BCF_BT_INT16: + data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data16[i] == bcf_int16_vector_end: + break + a = bcf_gt_allele(data16[i]) + alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) + elif fmt0.type == BCF_BT_INT32: + data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data32[i] == bcf_int32_vector_end: + break + a = bcf_gt_allele(data32[i]) + alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) + return tuple(alleles) + + +cdef bint bcf_sample_get_phased(VariantRecordSample sample): + if sample is None: + raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef int32_t n = r.n_sample + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + if sample.index < 0 or sample.index >= n or not r.n_fmt: + return False + + cdef bcf_fmt_t *fmt0 = r.d.fmt + cdef int gt0 = is_gt_fmt(hdr, fmt0.id) + + if not gt0 or not fmt0.n: + return False + + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + + cdef bint phased = False + + if fmt0.type == BCF_BT_INT8: + data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data8[i] == bcf_int8_vector_end: + break + elif data8[i] == bcf_int8_missing: + continue + elif i and not bcf_gt_is_phased(data8[i]): + return False + else: + phased = True + elif fmt0.type == BCF_BT_INT16: + data16 = 
cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
    """Set or clear the phase flag on every allele after the first.

    No-op for out-of-range sample indices or records without FORMAT
    data.  Raises ValueError if *sample* is None, the record cannot be
    unpacked, or no genotype has been stored yet.
    """
    if sample is None:
        raise ValueError('sample must not be None')

    cdef bcf_hdr_t *hdr = sample.record.header.ptr
    cdef bcf1_t *r = sample.record.ptr
    cdef int32_t n = r.n_sample

    if bcf_unpack(r, BCF_UN_ALL) < 0:
        raise ValueError('Error unpacking VariantRecord')

    if sample.index < 0 or sample.index >= n or not r.n_fmt:
        return

    # only the first FORMAT field is considered; it must be GT
    cdef bcf_fmt_t *fmt0 = r.d.fmt
    cdef int gt0 = is_gt_fmt(hdr, fmt0.id)

    if not gt0 or not fmt0.n:
        raise ValueError('Cannot set phased before genotype is set')

    cdef int8_t *data8
    cdef int16_t *data16
    cdef int32_t *data32

    # The low bit of each encoded allele carries the phase flag (cf.
    # bcf_gt_is_phased in the getter); rewrite it in place for whichever
    # integer width this record stores, skipping sentinels/missing and
    # the first allele (i == 0), whose separator does not exist.
    if fmt0.type == BCF_BT_INT8:
        data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data8[i] == bcf_int8_vector_end:
                break
            elif data8[i] == bcf_int8_missing:
                continue
            elif i:
                data8[i] = (data8[i] & 0xFE) | phased
    elif fmt0.type == BCF_BT_INT16:
        data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data16[i] == bcf_int16_vector_end:
                break
            elif data16[i] == bcf_int16_missing:
                continue
            elif i:
                data16[i] = (data16[i] & 0xFFFE) | phased
    elif fmt0.type == BCF_BT_INT32:
        data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
        for i in range(fmt0.n):
            if data32[i] == bcf_int32_vector_end:
                break
            elif data32[i] == bcf_int32_missing:
                continue
            elif i:
                data32[i] = (data32[i] & 0xFFFFFFFE) | phased
cdef inline bcf_sync_end(VariantRecord record):
    """Keep INFO/END consistent with the record's stop position.

    INFO/END is deleted when it is redundant — no alleles present, or
    rlen equals the reference-allele length — and (re)written otherwise.
    Records containing a symbolic allele (e.g. <DEL>) always keep END.
    Raises ValueError if an existing END cannot be deleted.
    """
    cdef bcf_hdr_t *hdr = record.header.ptr
    cdef bcf_info_t *info
    cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
    cdef int ref_len

    # allow missing ref when instantiating a new record
    if record.ref is not None:
        ref_len = len(record.ref)
    else:
        ref_len = 0

    # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
    # Always keep END for symbolic alleles
    if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len):
        # If INFO/END is not defined in the header, it doesn't exist in the record
        if end_id >= 0:
            info = bcf_get_info(hdr, record.ptr, b'END')
            # deleting means updating with a NULL payload of length 0
            if info and info.vptr:
                if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
                    raise ValueError('Unable to delete END')
    else:
        # Create END header, if not present
        if end_id < 0:
            record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')

        # Update to reflect stop position
        bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)
cdef inline int has_symbolic_allele(VariantRecord record):
    """Return index of first symbolic allele. 0 if no symbolic alleles.

    A symbolic allele is one wrapped in angle brackets (e.g. <DEL>).
    Index 0 (the reference allele) is never scanned, so a return of 0
    unambiguously means "none found".
    """

    for i in range(1, record.ptr.n_allele):
        alt = record.ptr.d.allele[i]
        # first and last characters of the C string must be '<' and '>'
        if alt[0] == b'<' and alt[len(alt) - 1] == b'>':
            return i

    return 0
    def __getitem__(self, key):
        """get attribute value

        Looks *key* up among the record's structured attributes (the
        key=value pairs of a FILTER/INFO/FORMAT/contig line).  Raises
        KeyError when the record is detached (NULL ptr) or the key is
        absent; a present key with no value yields None.
        """
        cdef bcf_hrec_t *r = self.ptr
        cdef int i
        if r:
            bkey = force_bytes(key)
            for i in range(r.nkeys):
                # NULL key slots are skipped; comparison is against the raw C string
                if r.keys[i] and r.keys[i] == bkey:
                    return charptr_to_str(r.vals[i]) if r.vals[i] else None
        raise KeyError('cannot find metadata key')
    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        cdef bcf_hrec_t *r = self.ptr
        # a detached record (NULL ptr) yields nothing rather than raising
        if not r:
            return
        cdef int i
        for i in range(r.nkeys):
            # NULL key slots are skipped; a NULL value maps to None
            if r.keys[i]:
                yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
+ """ + for k, v in items.items(): + self[k] = v + + if kwargs: + for k, v in kwargs.items(): + self[k] = v + + def pop(self, key, default=_nothing): + try: + value = self[key] + del self[key] + return value + except KeyError: + if default is not _nothing: + return default + raise + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + def __str__(self): + cdef bcf_hrec_t *r = self.ptr + + if not r: + raise ValueError('cannot convert deleted record to str') + + cdef kstring_t hrec_str + hrec_str.l = hrec_str.m = 0 + hrec_str.s = NULL + + bcf_hrec_format(r, &hrec_str) + + ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l) + + if hrec_str.m: + free(hrec_str.s) + + return ret + + # FIXME: Not safe -- causes trivial segfaults at the moment + def remove(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef bcf_hrec_t *r = self.ptr + if not r: + return + assert r.key + cdef char *key = r.key if r.type == BCF_HL_GEN else r.value + bcf_hdr_remove(hdr, r.type, key) + self.ptr = NULL + + +cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr): + if not header: + raise ValueError('invalid VariantHeader') + + if not hdr: + return None + + cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord) + record.header = header + record.ptr = hdr + + return record + + +cdef class VariantHeaderRecords(object): + """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): + return self.header.ptr.nhrec + + def __bool__(self): + return self.header.ptr.nhrec != 0 + + def __getitem__(self, index): + cdef int32_t i = index + if i < 0 or i >= self.header.ptr.nhrec: + raise IndexError('invalid header record index') + return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i]) + + def 
cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
    """Internal constructor: expose *header*'s records as a sequence view."""
    if not header:
        raise ValueError('invalid VariantHeader')

    cdef VariantHeaderRecords recs = VariantHeaderRecords.__new__(VariantHeaderRecords)
    recs.header = header
    return recs
cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
    """Internal constructor: build a VariantMetadata proxy for the header
    dictionary entry *id* of metadata class *type* (FILTER/INFO/FORMAT)."""
    if not header:
        raise ValueError('invalid VariantHeader')

    if type not in (BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT):
        raise ValueError('invalid metadata type')

    if not (0 <= id < header.ptr.n[BCF_DT_ID]):
        raise ValueError('invalid metadata id')

    cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
    meta.header = header
    meta.type = type
    meta.id = id
    return meta
    def __getitem__(self, key):
        """Return the :class:`VariantMetadata` for *key*.

        Raises KeyError when the key is absent from the shared ID
        dictionary or carries no definition for this metadata type.
        """
        cdef bcf_hdr_t *hdr = self.header.ptr
        cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]

        cdef bytes bkey = force_bytes(key)
        cdef khiter_t k = kh_get_vdict(d, bkey)

        # info[self.type] & 0xF == 0xF: the id exists in the shared
        # dictionary but has no definition for this type (same sentinel
        # test used by __len__/__iter__ above)
        if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
            raise KeyError('invalid key: {}'.format(key))

        return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
    def clear_header(self):
        """Remove all header records of this metadata type.

        NOTE(review): passes a NULL key to bcf_hdr_remove, which per
        htslib convention removes every record of self.type — confirm
        against the linked htslib version.
        """
        cdef bcf_hdr_t *hdr = self.header.ptr
        bcf_hdr_remove(hdr, self.type, NULL)
        #bcf_hdr_sync(hdr)
cdef VariantContig makeVariantContig(VariantHeader header, int id):
    """Internal constructor: build a VariantContig proxy for contig *id*."""
    if not header:
        raise ValueError('invalid VariantHeader')

    if not (0 <= id < header.ptr.n[BCF_DT_CTG]):
        raise ValueError('invalid contig id')

    cdef VariantContig ctg = VariantContig.__new__(VariantContig)
    ctg.header = header
    ctg.id = id
    return ctg
index >= hdr.n[BCF_DT_CTG]: + raise IndexError('invalid contig index') + return makeVariantContig(self.header, index) + + cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG] + cdef bytes bkey = force_bytes(key) + cdef khiter_t k = kh_get_vdict(d, bkey) + + if k == kh_end(d): + raise KeyError('invalid contig: {}'.format(key)) + + cdef int id = kh_val_vdict(d, k).id + + return makeVariantContig(self.header, id) + + def remove_header(self, key): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int index + cdef const char *ckey + cdef vdict_t *d + cdef khiter_t k + + if isinstance(key, int): + index = key + if index < 0 or index >= hdr.n[BCF_DT_CTG]: + raise IndexError('invalid contig index') + ckey = hdr.id[BCF_DT_CTG][self.id].key + else: + d = <vdict_t *>hdr.dict[BCF_DT_CTG] + key = force_bytes(key) + if kh_get_vdict(d, key) == kh_end(d): + raise KeyError('invalid contig: {}'.format(key)) + ckey = key + + bcf_hdr_remove(hdr, BCF_HL_CTG, ckey) + + def clear_header(self): + cdef bcf_hdr_t *hdr = self.header.ptr + bcf_hdr_remove(hdr, BCF_HL_CTG, NULL) + #bcf_hdr_sync(hdr) + + def __iter__(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG] + cdef uint32_t n = kh_size(d) + + assert n == hdr.n[BCF_DT_CTG] + + for i in range(n): + yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i)) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + try: + self[key] + except KeyError: + return False + else: + return True + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + def add(self, id, length=None, **kwargs): + """Add a new contig record""" + if id in self: + raise ValueError('Header already exists for contig {}'.format(id)) + + items = [('ID', unquoted_str(id))] + if length is not None: + items.append(("length", unquoted_str(length))) + items += kwargs.items() + self.header.add_meta('contig', items=items) + + +cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header): + if not header: + raise ValueError('invalid VariantHeader') + + cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs) + contigs.header = header + + return contigs + + +cdef class VariantHeaderSamples(object): + """sequence of sample names from a :class:`VariantHeader` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): + return bcf_hdr_nsamples(self.header.ptr) + + def __bool__(self): + return bcf_hdr_nsamples(self.header.ptr) != 0 + + def __getitem__(self, index): + cdef 
    def __contains__(self, key):
        """True if *key* names a sample present in this header."""
        cdef bcf_hdr_t *hdr = self.header.ptr
        # membership is a direct lookup in the header's sample dictionary
        cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
        cdef bytes bkey = force_bytes(key)
        cdef khiter_t k = kh_get_vdict(d, bkey)

        return k != kh_end(d)
samples(self): + """samples (:class:`VariantHeaderSamples`)""" + return makeVariantHeaderSamples(self) + + @property + def records(self): + """header records (:class:`VariantHeaderRecords`)""" + return makeVariantHeaderRecords(self) + + @property + def contigs(self): + """contig information (:class:`VariantHeaderContigs`)""" + return makeVariantHeaderContigs(self) + + @property + def filters(self): + """filter metadata (:class:`VariantHeaderMetadata`)""" + return makeVariantHeaderMetadata(self, BCF_HL_FLT) + + @property + def info(self): + """info metadata (:class:`VariantHeaderMetadata`)""" + return makeVariantHeaderMetadata(self, BCF_HL_INFO) + + @property + def formats(self): + """format metadata (:class:`VariantHeaderMetadata`)""" + return makeVariantHeaderMetadata(self, BCF_HL_FMT) + + @property + def alts(self): + """alt metadata (:class:`dict` ID->record). + + The data returned just a snapshot of alt records, is created + every time the property is requested, and modifications will + not be reflected in the header metadata and vice versa. + + i.e. it is just a dict that reflects the state of alt records + at the time it is created. 
    # only safe to do when opening an htsfile
    cdef _subset_samples(self, include_samples):
        """Restrict this header to the samples in *include_samples*.

        Raises ValueError if any requested sample is missing from the
        header or if htslib rejects the subset.
        """
        keep_samples = set(self.samples)
        include_samples = set(include_samples)
        missing_samples = include_samples - keep_samples
        keep_samples &= include_samples

        if missing_samples:
            # FIXME: add specialized exception with payload
            raise ValueError(
                'missing {:d} requested samples'.format(
                    len(missing_samples)))

        # htslib takes a comma-separated sample list; NULL means "none"
        keep_samples = force_bytes(','.join(keep_samples))
        cdef char *keep = <char *>keep_samples if keep_samples else NULL
        cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)

        if ret != 0:
            raise ValueError(
                'bcf_hdr_set_samples failed: ret = {}'.format(ret))
    def add_meta(self, key, value=None, items=None):
        """Add metadata to this header.

        Exactly one of *value* (a generic ``##key=value`` line) or
        *items* (an iterable of (key, value) attribute pairs for a
        structured line) must be given; otherwise ValueError is raised.
        """
        if not ((value is not None) ^ (items is not None)):
            raise ValueError('either value or items must be specified')

        # the hrec is built manually and handed to htslib on success
        cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
        cdef int quoted

        try:
            key = force_bytes(key)
            hrec.key = strdup(key)

            if value is not None:
                hrec.value = strdup(force_bytes(value))
            else:
                for key, value in items:
                    # ID/Number/Type and unquoted_str values are written
                    # bare; all other values are quoted in the header line
                    quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type")

                    key = force_bytes(key)
                    bcf_hrec_add_key(hrec, key, <int>len(key))

                    value = force_bytes(str(value))
                    bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
        except:
            # free the partially-built record before propagating
            bcf_hrec_destroy(hrec)
            raise

        bcf_hdr_add_hrec(self.ptr, hrec)

        self._hdr_sync()
+ + self._hdr_sync() + + cdef _add_sample(self, name): + bname = force_bytes(name) + if bcf_hdr_add_sample(self.ptr, bname) < 0: + raise ValueError('Duplicated sample name: {}'.format(name)) + + cdef _hdr_sync(self): + cdef bcf_hdr_t *hdr = self.ptr + if hdr.dirty: + if bcf_hdr_sync(hdr) < 0: + raise MemoryError('unable to reallocate VariantHeader') + + def add_sample(self, name): + """Add a new sample to this header""" + self._add_sample(name) + self._hdr_sync() + + def add_samples(self, *args): + """Add several new samples to this header. + This function takes multiple arguments, each of which may + be either a sample name or an iterable returning sample names + (e.g., a list of sample names). + """ + for arg in args: + if isinstance(arg, str): + self._add_sample(arg) + else: + for name in arg: + self._add_sample(name) + self._hdr_sync() + + +cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr): + if not hdr: + raise ValueError('cannot create VariantHeader') + + cdef VariantHeader header = VariantHeader.__new__(VariantHeader) + header.ptr = hdr + + return header + + +cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? 
cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2:
    """Return the header id of INFO field *key*, or -1 if it has no INFO
    definition.  (-2 with an exception set signals a conversion error.)
    """
    cdef vdict_t *d
    cdef khiter_t k
    cdef int info_id

    if isinstance(key, str):
        key = force_bytes(key)

    d = <vdict_t *>hdr.dict[BCF_DT_ID]
    k = kh_get_vdict(d, key)

    # 0xF nibble: the id exists in the shared dictionary but carries no
    # INFO definition (same sentinel used by VariantHeaderMetadata)
    if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
        return -1

    return kh_val_vdict(d, k).id
    def __delitem__(self, key):
        """Remove a filter, addressed by integer index or by name.

        '.' is treated as an alias for PASS.  Raises IndexError for an
        out-of-range index and KeyError for a name not set on the record.
        """
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int index, id
        cdef int n = r.d.n_flt

        if isinstance(key, int):
            index = key

            if index < 0 or index >= n:
                raise IndexError('invalid filter index')

            id = r.d.flt[index]
        else:
            if key == '.':
                key = 'PASS'

            bkey = force_bytes(key)
            id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)

            # the name must both be a declared FILTER and be set on this record
            if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
                raise KeyError('Invalid filter: {}'.format(key))

        bcf_remove_filter(hdr, r, id, 0)
d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bytes bkey = force_bytes(key) + return bcf_has_filter(hdr, r, bkey) == 1 + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op): + if op != 2 and op != 3: + return NotImplemented + + cdef bcf1_t *s = self.record.ptr + cdef bcf1_t *o = other.record.ptr + + cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other)) + + if op == 3: + cmp = not cmp + + return cmp + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record): + if not record: + raise ValueError('invalid VariantRecord') + + cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter) + filter.record = record + + return filter + + +cdef class VariantRecordFormat(object): + """Format data present for each sample in a :class:`VariantRecord` object, + presented as mapping from format name to :class:`VariantMetadata` object.""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): 
+ cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int i, n = 0 + + for i in range(r.n_fmt): + if r.d.fmt[i].p: + n += 1 + return n + + def __bool__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int i + + for i in range(r.n_fmt): + if r.d.fmt[i].p: + return True + return False + + def __getitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + cdef bytes bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + + if not fmt or not fmt.p: + raise KeyError('unknown format: {}'.format(key)) + + return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id) + + def __delitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + cdef bytes bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + + if not fmt or not fmt.p: + raise KeyError('unknown format: {}'.format(key)) + + if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0: + raise ValueError('Unable to delete FORMAT') + + def clear(self): + """Clear all formats for all samples within the associated + :class:`VariantRecord` instance""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_fmt_t *fmt + cdef const char *key + cdef int i + + for i in reversed(range(r.n_fmt)): + fmt = &r.d.fmt[i] + if fmt.p: + key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id) + if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0: + raise ValueError('Unable to delete FORMAT') + + def __iter__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_fmt_t *fmt + cdef int i + + for i in range(r.n_fmt): + fmt = &r.d.fmt[i] + if fmt.p: + yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bytes bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + return fmt != NULL and fmt.p != NULL + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record): + if not record: + raise ValueError('invalid VariantRecord') + + cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat) + format.record = record + + return format + + +#TODO: Add a getmeta method to return the corresponding VariantMetadata? 
+cdef class VariantRecordInfo(object): + """Info data stored in a :class:`VariantRecord` object, presented as a + mapping from info metadata name to value.""" + + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef const char *key + cdef int i, count = 0 + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + for i in range(r.n_info): + info = &r.d.info[i] + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0: + count += 1 + + return count + + def __bool__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef const char *key + cdef int i + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + for i in range(r.n_info): + info = &r.d.info[i] + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0: + return True + + return False + + def __getitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + + if strcmp(bkey, b'END') == 0: + raise KeyError('END is a reserved attribute; access is via record.stop') + + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + # Cannot stop here if info == NULL, since flags must return False + cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key + + if info_id < 0: + raise KeyError('Unknown INFO field: {}'.format(key)) + + if not check_header_id(hdr, BCF_HL_INFO, info_id): + raise ValueError('Invalid header') + + # Handle type=Flag values + if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) 
                == BCF_HT_FLAG:
            # INFO flags carry no value: report presence as a boolean
            # instead of raising KeyError for an unset flag.
            return info != NULL and info.vptr != NULL

        if not info or not info.vptr:
            raise KeyError('Invalid INFO field: {}'.format(key))

        return bcf_info_get_value(self.record, info)

    def __setitem__(self, key, value):
        # END is stored as an INFO field in the underlying record but is
        # exposed through record.stop, so direct assignment is refused.
        cdef bytes bkey = force_bytes(key)

        if strcmp(bkey, b'END') == 0:
            raise KeyError('END is a reserved attribute; access is via record.stop')

        if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')

        bcf_info_set_value(self.record, key, value)

    def __delitem__(self, key):
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr

        cdef bytes bkey = force_bytes(key)
        if strcmp(bkey, b'END') == 0:
            raise KeyError('END is a reserved attribute; access is via record.stop')

        if bcf_unpack(r, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')

        cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)

        # Cannot stop here if info == NULL, since flags must return False
        cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key

        if info_id < 0:
            raise KeyError('Unknown INFO field: {}'.format(key))

        if not check_header_id(hdr, BCF_HL_INFO, info_id):
            raise ValueError('Invalid header')

        # Deleting an unset flag is a no-op rather than an error
        if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
            return

        if not info or not info.vptr:
            raise KeyError('Unknown INFO field: {}'.format(key))

        # Passing NULL/0 values to bcf_update_info removes the field
        if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
            raise ValueError('Unable to delete INFO')

    def clear(self):
        """Clear all info data"""
        cdef bcf_hdr_t *hdr = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_info_t *info
        cdef const char *key
        cdef int i

        if bcf_unpack(r, BCF_UN_INFO) < 0:
            raise ValueError('Error unpacking VariantRecord')

        for i in range(r.n_info):
            info = &r.d.info[i]
            if info and info.vptr:
                key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
                # END is preserved here; it is managed via record.stop
                if
strcmp(key, b'END') == 0: + continue + if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0: + raise ValueError('Unable to delete INFO') + + def __iter__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef const char *key + cdef int i + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + for i in range(r.n_info): + info = &r.d.info[i] + if info and info.vptr: + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if strcmp(key, b'END') != 0: + yield bcf_str_cache_get_charptr(key) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + + if strcmp(bkey, b'END') == 0: + return default + + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + # Cannot stop here if info == NULL, since flags must return False + cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key + + if not check_header_id(hdr, BCF_HL_INFO, info_id): + raise ValueError('Invalid header') + + # Handle flags + if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG: + return info != NULL and info.vptr != NULL + + if not info or not info.vptr: + return default + + return bcf_info_get_value(self.record, info) + + def __contains__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + + if strcmp(bkey, b'END') == 0: + return False + + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + return info != NULL and info.vptr != NULL + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def 
itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef const char *key + cdef int i + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + for i in range(r.n_info): + info = &r.d.info[i] + if info and info.vptr: + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if strcmp(key, b'END') != 0: + yield bcf_info_get_value(self.record, info) + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef const char *key + cdef int i + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + for i in range(r.n_info): + info = &r.d.info[i] + if info and info.vptr: + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if strcmp(key, b'END') != 0: + value = bcf_info_get_value(self.record, info) + yield bcf_str_cache_get_charptr(key), value + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + def update(self, items=None, **kwargs): + """D.update([E, ]**F) -> None. + + Update D from dict/iterable E and F. 
+ """ + for k, v in items.items(): + if k != 'END': + self[k] = v + + if kwargs: + kwargs.pop('END', None) + for k, v in kwargs.items(): + self[k] = v + + def pop(self, key, default=_nothing): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef bytes bkey = force_bytes(key) + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + # Cannot stop here if info == NULL, since flags must return False + cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key + + if info_id < 0: + if default is _nothing: + raise KeyError('Unknown INFO field: {}'.format(key)) + return default + + if not check_header_id(hdr, BCF_HL_INFO, info_id): + raise ValueError('Invalid header') + + # Handle flags + if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr): + return + + if not info or not info.vptr: + if default is _nothing: + raise KeyError('Unknown INFO field: {}'.format(key)) + return default + + value = bcf_info_get_value(self.record, info) + + if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0: + raise ValueError('Unable to delete INFO') + + return value + + def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op): + if op != 2 and op != 3: + return NotImplemented + + cdef bcf1_t *s = self.record.ptr + cdef bcf1_t *o = other.record.ptr + + # Cannot use n_info as shortcut logic, since null values may remain + cdef bint cmp = dict(self) == dict(other) + + if op == 3: + cmp = not cmp + + return cmp + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + +cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record): + if not record: + raise ValueError('invalid VariantRecord') + + cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo) + info.record = record + + return info + + +cdef class 
VariantRecordSamples(object): + """mapping from sample index or name to :class:`VariantRecordSample` object.""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): + return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr) + + def __bool__(self): + return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0 + + def __getitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int n = self.record.ptr.n_sample + cdef int sample_index + cdef vdict_t *d + cdef khiter_t k + + if isinstance(key, int): + sample_index = key + else: + bkey = force_bytes(key) + sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey) + if sample_index < 0: + raise KeyError('invalid sample name: {}'.format(key)) + + if sample_index < 0 or sample_index >= n: + raise IndexError('invalid sample index') + + return makeVariantRecordSample(self.record, sample_index) + + def __iter__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int32_t i, n = self.record.ptr.n_sample + + for i in range(n): + yield charptr_to_str(hdr.samples[i]) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int n = self.record.ptr.n_sample + cdef int sample_index + cdef vdict_t *d + cdef khiter_t k + + if isinstance(key, int): + sample_index = key + else: + bkey = force_bytes(key) + sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey) + if sample_index < 0: + raise KeyError('invalid sample name: {}'.format(key)) + + return 0 <= sample_index < n + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int32_t i, n = self.record.ptr.n_sample + + for i in range(n): + yield makeVariantRecordSample(self.record, i) + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int32_t i, n = self.record.ptr.n_sample + + for i in range(n): + yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i)) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + def update(self, items=None, **kwargs): + """D.update([E, ]**F) -> None. + + Update D from dict/iterable E and F. 
+ """ + for k, v in items.items(): + self[k] = v + + if kwargs: + for k, v in kwargs.items(): + self[k] = v + + def pop(self, key, default=_nothing): + try: + value = self[key] + del self[key] + return value + except KeyError: + if default is not _nothing: + return default + raise + + def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op): + if op != 2 and op != 3: + return NotImplemented + + cdef bcf1_t *s = self.record.ptr + cdef bcf1_t *o = other.record.ptr + + cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values()) + + if op == 3: + cmp = not cmp + + return cmp + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + +cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record): + if not record: + raise ValueError('invalid VariantRecord') + + cdef VariantRecordSamples samples = VariantRecordSamples.__new__( + VariantRecordSamples) + samples.record = record + + return samples + + +cdef class VariantRecord(object): + """Variant record""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __dealloc__(self): + if self.ptr: + bcf_destroy1(self.ptr) + self.ptr = NULL + + def copy(self): + """return a copy of this VariantRecord object""" + return makeVariantRecord(self.header, bcf_dup(self.ptr)) + + def translate(self, VariantHeader dst_header): + if dst_header is None: + raise ValueError('dst_header must not be None') + + cdef bcf_hdr_t *src_hdr = self.header.ptr + cdef bcf_hdr_t *dst_hdr = dst_header.ptr + + if src_hdr != dst_hdr: + if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr): + msg = 'Cannot translate record. 
Number of samples does not match header ({} vs {})' + raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr))) + + bcf_translate(dst_hdr, src_hdr, self.ptr) + self.header = dst_header + + @property + def rid(self): + """internal reference id number""" + return self.ptr.rid + + @rid.setter + def rid(self, value): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int r = value + if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val: + raise ValueError('invalid reference id') + self.ptr.rid = r + + @property + def chrom(self): + """chromosome/contig name""" + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int rid = self.ptr.rid + if rid < 0 or rid >= hdr.n[BCF_DT_CTG]: + raise ValueError('Invalid header') + return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid)) + + @chrom.setter + def chrom(self, value): + cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG] + bchrom = force_bytes(value) + cdef khint_t k = kh_get_vdict(d, bchrom) + if k == kh_end(d): + raise ValueError('Invalid chromosome/contig') + self.ptr.rid = kh_val_vdict(d, k).id + + @property + def contig(self): + """chromosome/contig name""" + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int rid = self.ptr.rid + if rid < 0 or rid >= hdr.n[BCF_DT_CTG]: + raise ValueError('Invalid header') + return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid)) + + @contig.setter + def contig(self, value): + cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG] + bchrom = force_bytes(value) + cdef khint_t k = kh_get_vdict(d, bchrom) + if k == kh_end(d): + raise ValueError('Invalid chromosome/contig') + self.ptr.rid = kh_val_vdict(d, k).id + + @property + def pos(self): + """record start position on chrom/contig (1-based inclusive)""" + return self.ptr.pos + 1 + + @pos.setter + def pos(self, value): + cdef int p = value + if p < 1: + raise ValueError('Position must be positive') + self.ptr.pos = p - 1 + bcf_sync_end(self) + + @property + def start(self): + """record start 
position on chrom/contig (0-based inclusive)"""
        return self.ptr.pos

    @start.setter
    def start(self, value):
        cdef int s = value
        if s < 0:
            raise ValueError('Start coordinate must be non-negative')
        self.ptr.pos = s
        # Keep the INFO END field consistent with the new coordinates
        bcf_sync_end(self)

    @property
    def stop(self):
        """record stop position on chrom/contig (0-based exclusive)"""
        return self.ptr.pos + self.ptr.rlen

    @stop.setter
    def stop(self, value):
        cdef int s = value
        if s < 0:
            raise ValueError('Stop coordinate must be non-negative')
        # stop is derived storage: only rlen (= stop - pos) is kept
        self.ptr.rlen = s - self.ptr.pos
        bcf_sync_end(self)

    @property
    def rlen(self):
        """record length on chrom/contig (aka rec.stop - rec.start)"""
        return self.ptr.rlen

    @rlen.setter
    def rlen(self, value):
        cdef int r = value
        self.ptr.rlen = r
        bcf_sync_end(self)

    @property
    def qual(self):
        """phred scaled quality score or None if not available"""
        # htslib encodes a missing QUAL as a float sentinel (NaN pattern)
        return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None

    @qual.setter
    def qual(self, value):
        if value is not None:
            self.ptr.qual = value
        else:
            bcf_float_set(&self.ptr.qual, bcf_float_missing)


#    @property
#    def n_allele(self):
#        return self.ptr.n_allele

#    @property
#    def n_sample(self):
#        return self.ptr.n_sample

    @property
    def id(self):
        """record identifier or None if not available"""
        cdef bcf1_t *r = self.ptr
        if bcf_unpack(r, BCF_UN_STR) < 0:
            raise ValueError('Error unpacking VariantRecord')
        # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
        # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
        if (r.d.m_id == 0):
            raise ValueError('Error extracting ID')
        return charptr_to_str(r.d.id) if r.d.id != b'.'
else None + + @id.setter + def id(self, value): + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + cdef char *idstr = NULL + if value is not None: + bid = force_bytes(value) + idstr = bid + if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0: + raise ValueError('Error updating id') + + @property + def ref(self): + """reference allele""" + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + return charptr_to_str(r.d.allele[0]) if r.d.allele else None + + @ref.setter + def ref(self, value): + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + #FIXME: Set alleles directly -- this is stupid + if not value: + raise ValueError('ref allele must not be null') + value = force_bytes(value) + if r.d.allele and r.n_allele: + alleles = [r.d.allele[i] for i in range(r.n_allele)] + alleles[0] = value + else: + alleles = [value, '<NON_REF>'] + self.alleles = alleles + bcf_sync_end(self) + + @property + def alleles(self): + """tuple of reference allele followed by alt alleles""" + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + if not r.d.allele: + return None + cdef tuple res = PyTuple_New(r.n_allele) + for i in range(r.n_allele): + a = charptr_to_str(r.d.allele[i]) + PyTuple_SET_ITEM(res, i, a) + Py_INCREF(a) + return res + + @alleles.setter + def alleles(self, values): + cdef bcf1_t *r = self.ptr + + # Cache rlen of symbolic alleles before call to bcf_update_alleles_str + cdef int rlen = r.rlen + + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + + values = [force_bytes(v) for v in values] + + if len(values) < 2: + raise ValueError('must set at least 2 alleles') + + if b'' in values: + raise ValueError('cannot set null allele') + + value = b','.join(values) + + if 
bcf_update_alleles_str(self.header.ptr, r, value) < 0: + raise ValueError('Error updating alleles') + + # Reset rlen if alternate allele isn't symbolic, otherwise used cached + if has_symbolic_allele(self): + self.ptr.rlen = rlen + else: + self.ptr.rlen = len(values[0]) + r.d.var_type = -1 + bcf_sync_end(self) + + @property + def alts(self): + """tuple of alt alleles""" + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + if r.n_allele < 2 or not r.d.allele: + return None + cdef tuple res = PyTuple_New(r.n_allele - 1) + for i in range(1, r.n_allele): + a = charptr_to_str(r.d.allele[i]) + PyTuple_SET_ITEM(res, i - 1, a) + Py_INCREF(a) + return res + + @alts.setter + def alts(self, value): + #FIXME: Set alleles directly -- this is stupid + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + value = [force_bytes(v) for v in value] + if b'' in value: + raise ValueError('cannot set null alt allele') + ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.'] + self.alleles = ref + value + r.d.var_type = -1 + + @property + def filter(self): + """filter information (see :class:`VariantRecordFilter`)""" + if bcf_unpack(self.ptr, BCF_UN_FLT) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordFilter(self) + + @property + def info(self): + """info data (see :class:`VariantRecordInfo`)""" + if bcf_unpack(self.ptr, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordInfo(self) + + @property + def format(self): + """sample format metadata (see :class:`VariantRecordFormat`)""" + if bcf_unpack(self.ptr, BCF_UN_FMT) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordFormat(self) + + @property + def samples(self): + """sample data (see :class:`VariantRecordSamples`)""" + if bcf_unpack(self.ptr, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking 
VariantRecord') + return makeVariantRecordSamples(self) + + property alleles_variant_types: + def __get__(self): + cdef bcf1_t *r = self.ptr + cdef tuple result = PyTuple_New(r.n_allele) + + for i in range(r.n_allele): + tp = bcf_get_variant_type(r, i) + + if tp == VCF_REF: + v_type = "REF" + elif tp == VCF_SNP: + v_type = "SNP" + elif tp == VCF_MNP: + v_type = "MNP" + elif tp == VCF_INDEL: + v_type = "INDEL" + elif tp == VCF_BND: + v_type = "BND" + elif tp == VCF_OVERLAP: + v_type = "OVERLAP" + else: + v_type = "OTHER" + + PyTuple_SET_ITEM(result, i, v_type) + Py_INCREF(v_type) + + return result + + def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op): + if op != 2 and op != 3: + return NotImplemented + + cdef bcf1_t *s = self.ptr + cdef bcf1_t *o = other.ptr + + cdef bint cmp = self is other or ( + s.pos == o.pos + and s.rlen == o.rlen + and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual)) + or s.qual == o.qual) + and s.n_sample == o.n_sample + and s.n_allele == o.n_allele + and self.contig == other.contig + and self.alleles == other.alleles + and self.id == other.id + and self.info == other.info + and self.filter == other.filter + and self.samples == other.samples) + + if op == 3: + cmp = not cmp + + return cmp + + def __str__(self): + cdef kstring_t line + cdef char c + + line.l = line.m = 0 + line.s = NULL + + if vcf_format(self.header.ptr, self.ptr, &line) < 0: + if line.m: + free(line.s) + raise ValueError('vcf_format failed') + + # Strip CR/LF? 
+ #while line.l: + # c = line.s[line.l - 1] + # if c != b'\n' and c != b'\r': + # break + # line.l -= 1 + + ret = charptr_to_str_w_len(line.s, line.l) + + if line.m: + free(line.s) + + return ret + + +cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r): + if not header: + raise ValueError('invalid VariantHeader') + + if not r: + raise ValueError('cannot create VariantRecord') + + if r.errcode: + msg = [] + #if r.errcode & BCF_ERR_CTG_UNDEF: + # msg.append('undefined contig') + #if r.errcode & BCF_ERR_TAG_UNDEF: + # msg.append('undefined tag') + if r.errcode & BCF_ERR_NCOLS: + msg.append('invalid number of columns') + if r.errcode & BCF_ERR_LIMITS: + msg.append('limits violated') + if r.errcode & BCF_ERR_CHAR: + msg.append('invalid character found') + if r.errcode & BCF_ERR_CTG_INVALID: + msg.append('invalid contig') + if r.errcode & BCF_ERR_TAG_INVALID: + msg.append('invalid tag') + + if msg: + msg = ', '.join(msg) + raise ValueError('Error(s) reading record: {}'.format(msg)) + + cdef VariantRecord record = VariantRecord.__new__(VariantRecord) + record.header = header + record.ptr = r + + return record + + +######################################################################## +######################################################################## +## Variant Sampletype object +######################################################################## + + +cdef class VariantRecordSample(object): + """Data for a single sample from a :class:`VariantRecord` object. + Provides data accessors for genotypes and a mapping interface + from format name to values. 
+ """ + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + @property + def name(self): + """sample name""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int32_t n = r.n_sample + + if self.index < 0 or self.index >= n: + raise ValueError('invalid sample index') + + return charptr_to_str(hdr.samples[self.index]) + + @property + def allele_indices(self): + """allele indices for called genotype, if present. Otherwise None""" + return bcf_format_get_allele_indices(self) + + @allele_indices.setter + def allele_indices(self, value): + self['GT'] = value + + @allele_indices.deleter + def allele_indices(self): + self['GT'] = () + + @property + def alleles(self): + """alleles for called genotype, if present. Otherwise None""" + return bcf_format_get_alleles(self) + + @alleles.setter + def alleles(self, value): + # Sets the genotype, supply a tuple of alleles to set. + # The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord + # The genotype is reset when an empty tuple, None or (None,) is supplied + + if value==(None,) or value==tuple() or value is None: + self['GT'] = () + return + + if any((type(x) == int for x in value)): + raise ValueError('Use .allele_indices to set integer allele indices') + + # determine and set allele indices: + try: + self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) ) + except ValueError: + raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord." + "First set the .alleles of this record to define the alleles") + + @alleles.deleter + def alleles(self): + self['GT'] = () + + @property + def phased(self): + """False if genotype is missing or any allele is unphased. 
Otherwise True.""" + return bcf_sample_get_phased(self) + + @phased.setter + def phased(self, value): + bcf_sample_set_phased(self, value) + + def __len__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int i, n = 0 + + if bcf_unpack(r, BCF_UN_FMT) < 0: + raise ValueError('Error unpacking VariantRecord') + + for i in range(r.n_fmt): + if r.d.fmt[i].p: + n += 1 + return n + + def __bool__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int i + + if bcf_unpack(r, BCF_UN_FMT) < 0: + raise ValueError('Error unpacking VariantRecord') + + for i in range(r.n_fmt): + if r.d.fmt[i].p: + return True + return False + + def __getitem__(self, key): + return bcf_format_get_value(self, key) + + def __setitem__(self, key, value): + bcf_format_set_value(self, key, value) + + def __delitem__(self, key): + bcf_format_del_value(self, key) + + def clear(self): + """Clear all format data (including genotype) for this sample""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_fmt_t *fmt + cdef int i + + for i in range(r.n_fmt): + fmt = &r.d.fmt[i] + if fmt.p: + bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)) + + def __iter__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_fmt_t *fmt + cdef int i + + for i in range(r.n_fmt): + fmt = &r.d.fmt[i] + if r.d.fmt[i].p: + yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bytes bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + return fmt != NULL and fmt.p != NULL + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + def update(self, items=None, **kwargs): + """D.update([E, ]**F) -> None. + + Update D from dict/iterable E and F. 
+ """ + for k, v in items.items(): + self[k] = v + + if kwargs: + for k, v in kwargs.items(): + self[k] = v + + def pop(self, key, default=_nothing): + try: + value = self[key] + del self[key] + return value + except KeyError: + if default is not _nothing: + return default + raise + + def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op): + if op != 2 and op != 3: + return NotImplemented + + cdef bint cmp = dict(self) == dict(other) + + if op == 3: + cmp = not cmp + + return cmp + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + +cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index): + if not record or sample_index < 0: + raise ValueError('cannot create VariantRecordSample') + + cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample) + sample.record = record + sample.index = sample_index + + return sample + + +######################################################################## +######################################################################## +## Index objects +######################################################################## + + +cdef class BaseIndex(object): + def __init__(self): + self.refs = () + self.remap = {} + + def __len__(self): + return len(self.refs) + + def __bool__(self): + return len(self.refs) != 0 + + def __getitem__(self, key): + if isinstance(key, int): + return self.refs[key] + else: + return self.refmap[key] + + def __iter__(self): + return iter(self.refs) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + try: + self[key] + except KeyError: + return False + else: + return True + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + def update(self, items=None, **kwargs): + """D.update([E, ]**F) -> None. + + Update D from dict/iterable E and F. + """ + for k, v in items.items(): + self[k] = v + + if kwargs: + for k, v in kwargs.items(): + self[k] = v + + def pop(self, key, default=_nothing): + try: + value = self[key] + del self[key] + return value + except KeyError: + if default is not _nothing: + return default + raise + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef class BCFIndex(object): + """CSI index data structure for BCF files""" + def __init__(self): + self.refs = () + self.refmap = {} + + if not self.ptr: + raise ValueError('Invalid index object') + + cdef int n + cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n) + + self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else () + self.refmap = { r:i for i,r in enumerate(self.refs) } + + def __dealloc__(self): + if self.ptr: + hts_idx_destroy(self.ptr) + self.ptr = NULL + + def fetch(self, bcf, contig, start, stop, reopen): + return BCFIterator(bcf, contig, start, stop, reopen) + + 
cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
    """Wrap a raw hts_idx_t in a BCFIndex, or return None if *idx* is NULL.

    Takes ownership of *idx* (freed in BCFIndex.__dealloc__).
    """
    if not idx:
        return None

    if not header:
        raise ValueError('invalid VariantHeader')

    cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
    index.header = header
    index.ptr = idx
    # __init__ is called explicitly because __new__ bypassed it; it needs
    # header/ptr set first in order to read the sequence names
    index.__init__()

    return index


cdef class TabixIndex(BaseIndex):
    """Tabix index data structure for VCF files"""
    def __init__(self):
        self.refs = ()
        self.refmap = {}

        if not self.ptr:
            raise ValueError('Invalid index object')

        cdef int n
        cdef const char **refs = tbx_seqnames(self.ptr, &n)

        # char_array_to_tuple(free_after=1) frees the htslib-allocated array
        self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
        self.refmap = { r:i for i,r in enumerate(self.refs) }

    def __dealloc__(self):
        if self.ptr:
            tbx_destroy(self.ptr)
            self.ptr = NULL

    def fetch(self, bcf, contig, start, stop, reopen):
        return TabixIterator(bcf, contig, start, stop, reopen)


cdef TabixIndex makeTabixIndex(tbx_t *idx):
    """Wrap a raw tbx_t in a TabixIndex, or return None if *idx* is NULL.

    Takes ownership of *idx* (freed in TabixIndex.__dealloc__).
    """
    if not idx:
        return None

    cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
    index.ptr = idx
    index.__init__()

    return index


########################################################################
########################################################################
## Iterators
########################################################################


cdef class BaseIterator(object):
    # Marker base class shared by BCFIterator and TabixIterator
    pass


# Internal function to clean up after iteration stop or failure.
# This would be a nested function if it weren't a cdef function.
cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
    """Free *record* and tear down the iterator after stop or failure."""
    bcf_destroy1(record)

    # destroy iter so future calls to __next__ raise StopIteration
    bcf_itr_destroy(self.iter)
    self.iter = NULL


cdef class BCFIterator(BaseIterator):
    """Iterator over records in a region of an indexed BCF file."""
    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
        if bcf is None:
            raise ValueError('bcf must not be None')

        if contig is None:
            raise ValueError('contig must be specified')

        if not isinstance(bcf.index, BCFIndex):
            raise ValueError('bcf index required')

        cdef BCFIndex index = bcf.index

        self.bcf = bcf
        self.index = index

        cdef int rid, cstart, cstop

        try:
            rid = index.refmap[contig]
        except KeyError:
            # A query for a non-existent contig yields an empty iterator, does not raise an error
            self.iter = NULL
            return

        if reopen:
            self.bcf = self.bcf.copy()

        # Normalize missing bounds to the full contig range
        cstart = start if start is not None else 0
        cstop = stop if stop is not None else MAX_POS

        with nogil:
            self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)

        if not self.iter:
            if errno:
                raise IOError(errno, strerror(errno))
            else:
                # Fix: use the normalized bounds — 'start + 1' raised
                # TypeError when start was None
                raise IOError('unable to fetch {}:{}-{}'.format(contig, cstart + 1, cstop))

    def __dealloc__(self):
        if self.iter:
            bcf_itr_destroy(self.iter)
            self.iter = NULL

    def __iter__(self):
        return self

    def __next__(self):
        if not self.iter:
            raise StopIteration

        cdef bcf1_t *record = bcf_init1()

        if not record:
            raise MemoryError('unable to allocate BCF record')

        record.pos = -1
        if self.bcf.drop_samples:
            # skip unpacking of per-sample data we will never look at
            record.max_unpack = BCF_UN_SHR

        cdef int ret

        with nogil:
            ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)

        if ret < 0:
            _stop_BCFIterator(self, record)
            if ret == -1:
                raise StopIteration
            elif ret == -2:
                raise IOError('truncated file')
            elif errno:
                raise IOError(errno, strerror(errno))
            else:
                raise IOError('unable to fetch next record')

        # Apply any sample subsetting requested on the header
        ret = bcf_subset_format(self.bcf.header.ptr, record)

        if ret < 0:
            _stop_BCFIterator(self, record)
            raise ValueError('error in bcf_subset_format')

        return makeVariantRecord(self.bcf.header, record)


cdef class TabixIterator(BaseIterator):
    """Iterator over records in a region of a tabix-indexed VCF file."""
    def __cinit__(self, *args, **kwargs):
        # zero-initialize the reusable kstring line buffer
        self.line_buffer.l = 0
        self.line_buffer.m = 0
        self.line_buffer.s = NULL

    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
        if bcf is None:
            raise ValueError('bcf must not be None')

        if not isinstance(bcf.index, TabixIndex):
            raise ValueError('tabix index required')

        cdef TabixIndex index = bcf.index

        self.bcf = bcf
        self.index = index

        cdef int rid, cstart, cstop

        try:
            rid = index.refmap[contig]
        except KeyError:
            # A query for a non-existent contig yields an empty iterator, does not raise an error
            self.iter = NULL
            return

        if reopen:
            self.bcf = self.bcf.copy()

        # Normalize missing bounds to the full contig range
        cstart = start if start is not None else 0
        cstop = stop if stop is not None else MAX_POS

        # Fix: pass the normalized cstart/cstop — the raw start/stop could
        # be None, which cannot be converted to the C int arguments
        self.iter = tbx_itr_queryi(index.ptr, rid, cstart, cstop)

        if not self.iter:
            if errno:
                raise IOError(errno, strerror(errno))
            else:
                # Fix: use the normalized bounds — 'start + 1' raised
                # TypeError when start was None
                raise IOError('unable to fetch {}:{}-{}'.format(contig, cstart + 1, cstop))

    def __dealloc__(self):
        if self.iter:
            tbx_itr_destroy(self.iter)
            self.iter = NULL

        if self.line_buffer.m:
            free(self.line_buffer.s)

        self.line_buffer.l = 0
        self.line_buffer.m = 0
        self.line_buffer.s = NULL

    def __iter__(self):
        return self

    def __next__(self):
        if not self.iter:
            raise StopIteration

        cdef int ret

        with nogil:
            ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)

        if ret < 0:
            tbx_itr_destroy(self.iter)
            self.iter = NULL
            if ret == -1:
                raise StopIteration
            elif ret == -2:
                raise IOError('truncated file')
            elif errno:
                raise IOError(errno, strerror(errno))
            else:
                raise IOError('unable to fetch next record')

        cdef bcf1_t *record = bcf_init1()

        if not record:
            raise MemoryError('unable to allocate BCF record')

        record.pos = -1
        if self.bcf.drop_samples:
            # skip unpacking of per-sample data we will never look at
            record.max_unpack = BCF_UN_SHR

        ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)

        # FIXME: stop iteration on parse failure?
        if ret < 0:
            bcf_destroy1(record)
            raise ValueError('error in vcf_parse')

        return makeVariantRecord(self.bcf.header, record)


########################################################################
########################################################################
## Variant File
########################################################################


cdef class VariantFile(HTSFile):
    """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
    duplicate_filehandle=True, ignore_truncation=False, threads=1)*

    A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
    opened.

    If an index for a variant file exists (.csi or .tbi), it will be
    opened automatically. Without an index random access to records
    via :meth:`fetch` is disabled.

    For writing, a :class:`VariantHeader` object must be provided,
    typically obtained from another :term:`VCF` file/:term:`BCF`
    file.

    Parameters
    ----------
    mode : string
        *mode* should be ``r`` for reading or ``w`` for writing. The default is
        text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
        ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.

        If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
        modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
        For instance, to open a :term:`BCF` formatted file for reading, type::

            f = pysam.VariantFile('ex1.bcf','r')

        If mode is not specified, we will try to auto-detect the file type. All
        of the following should work::

            f1 = pysam.VariantFile('ex1.bcf')
            f2 = pysam.VariantFile('ex1.vcf')
            f3 = pysam.VariantFile('ex1.vcf.gz')

    index_filename : string
        Explicit path to an index file.
+ + header : VariantHeader + :class:`VariantHeader` object required for writing. + + drop_samples: bool + Ignore sample information when reading. + + duplicate_filehandle: bool + By default, file handles passed either directly or through + File-like objects will be duplicated before passing them to + htslib. The duplication prevents issues where the same stream + will be closed by htslib and through destruction of the + high-level python object. Set to False to turn off + duplication. + + ignore_truncation: bool + Issue a warning, instead of raising an error if the current file + appears to be truncated due to a missing EOF marker. Only applies + to bgzipped formats. (Default=False) + + threads: integer + Number of threads to use for compressing/decompressing VCF/BCF files. + Setting threads to > 1 cannot be combined with `ignore_truncation`. + (Default=1) + + """ + def __cinit__(self, *args, **kwargs): + self.htsfile = NULL + + def __init__(self, *args, **kwargs): + self.header = None + self.index = None + self.filename = None + self.mode = None + self.threads = 1 + self.index_filename = None + self.is_stream = False + self.is_remote = False + self.is_reading = False + self.drop_samples = False + self.header_written = False + self.start_offset = -1 + + self.open(*args, **kwargs) + + def __dealloc__(self): + if not self.htsfile or not self.header: + return + + # Write header if no records were written + if self.htsfile.is_write and not self.header_written: + with nogil: + bcf_hdr_write(self.htsfile, self.header.ptr) + + cdef int ret = hts_close(self.htsfile) + self.htsfile = NULL + self.header = self.index = None + + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise IOError(errno, force_str(strerror(errno))) + + def close(self): + """closes the :class:`pysam.VariantFile`.""" + if not self.htsfile: + return + + # Write header if no records were written + if self.htsfile.is_write and not self.header_written: + with nogil: + 
bcf_hdr_write(self.htsfile, self.header.ptr) + + cdef int ret = hts_close(self.htsfile) + self.htsfile = NULL + self.header = self.index = None + + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise IOError(errno, force_str(strerror(errno))) + + def __iter__(self): + if not self.is_open: + raise ValueError('I/O operation on closed file') + + if self.htsfile.is_write: + raise ValueError('cannot iterate over Variantfile opened for writing') + + self.is_reading = 1 + return self + + def __next__(self): + cdef int ret + cdef int errcode + cdef bcf1_t *record = bcf_init1() + + if not record: + raise MemoryError('unable to allocate BCF record') + + record.pos = -1 + if self.drop_samples: + record.max_unpack = BCF_UN_SHR + + with nogil: + ret = bcf_read1(self.htsfile, self.header.ptr, record) + + if ret < 0: + errcode = record.errcode + bcf_destroy1(record) + if errcode: + raise IOError('unable to parse next record') + if ret == -1: + raise StopIteration + elif ret == -2: + raise IOError('truncated file') + elif errno: + raise IOError(errno, strerror(errno)) + else: + raise IOError('unable to fetch next record') + + return makeVariantRecord(self.header, record) + + def copy(self): + if not self.is_open: + raise ValueError + + cdef VariantFile vars = VariantFile.__new__(VariantFile) + cdef bcf_hdr_t *hdr + + # FIXME: re-open using fd or else header and index could be invalid + vars.htsfile = self._open_htsfile() + + if not vars.htsfile: + raise ValueError('Cannot re-open htsfile') + + # minimize overhead by re-using header and index. This approach is + # currently risky, but see above for how this can be mitigated. 
+ vars.header = self.header + vars.index = self.index + + vars.filename = self.filename + vars.mode = self.mode + vars.threads = self.threads + vars.index_filename = self.index_filename + vars.drop_samples = self.drop_samples + vars.is_stream = self.is_stream + vars.is_remote = self.is_remote + vars.is_reading = self.is_reading + vars.start_offset = self.start_offset + vars.header_written = self.header_written + + if self.htsfile.is_bin: + vars.seek(self.tell()) + else: + with nogil: + hdr = bcf_hdr_read(vars.htsfile) + makeVariantHeader(hdr) + + return vars + + def open(self, filename, mode='r', + index_filename=None, + VariantHeader header=None, + drop_samples=False, + duplicate_filehandle=True, + ignore_truncation=False, + threads=1): + """open a vcf/bcf file. + + If open is called on an existing VariantFile, the current file will be + closed and a new file will be opened. + """ + cdef bcf_hdr_t *hdr + cdef BGZF *bgzfp + cdef hts_idx_t *idx + cdef tbx_t *tidx + cdef char *cfilename + cdef char *cindex_filename = NULL + cdef char *cmode + + if threads > 1 and ignore_truncation: + # This won't raise errors if reaching a truncated alignment, + # because bgzf_mt_reader in htslib does not deal with + # bgzf_mt_read_block returning non-zero values, contrary + # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888) + # Better to avoid this (for now) than to produce seemingly correct results. 
+ raise ValueError('Cannot add extra threads when "ignore_truncation" is True') + self.threads = threads + + # close a previously opened file + if self.is_open: + self.close() + + if not mode or mode[0] not in 'rwa': + raise ValueError('mode must begin with r, w or a') + + self.duplicate_filehandle = duplicate_filehandle + + format_modes = [m for m in mode[1:] if m in 'bcguz'] + if len(format_modes) > 1: + raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes))) + + invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex'] + if invalid_modes: + raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes))) + + # Autodetect mode from filename + if mode == 'w' and isinstance(filename, str): + if filename.endswith('.gz'): + mode = 'wz' + elif filename.endswith('.bcf'): + mode = 'wb' + + # for htslib, wbu seems to not work + if mode == 'wbu': + mode = 'wb0' + + self.mode = mode = force_bytes(mode) + try: + filename = encode_filename(filename) + self.is_remote = hisremote(filename) + self.is_stream = filename == b'-' + except TypeError: + filename = filename + self.is_remote = False + self.is_stream = True + + self.filename = filename + + if index_filename is not None: + self.index_filename = index_filename = encode_filename(index_filename) + else: + self.index_filename = None + + self.drop_samples = bool(drop_samples) + self.header = None + + self.header_written = False + + if mode.startswith(b'w'): + # open file for writing + if index_filename is not None: + raise ValueError('Cannot specify an index filename when writing a VCF/BCF file') + + # header structure (used for writing) + if header: + self.header = header.copy() + else: + self.header = VariantHeader() + #raise ValueError('a VariantHeader must be specified') + + # Header is not written until the first write or on close + self.htsfile = self._open_htsfile() + + if not self.htsfile: + raise ValueError("could not open file `{}` 
(mode='{}')".format(filename, mode)) + + elif mode.startswith(b'r'): + # open file for reading + self.htsfile = self._open_htsfile() + + if not self.htsfile: + if errno: + raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno)))) + else: + raise ValueError('could not open variant file `{}`'.format(filename)) + + if self.htsfile.format.format not in (bcf, vcf): + raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode)) + + self.check_truncation(ignore_truncation) + + with nogil: + hdr = bcf_hdr_read(self.htsfile) + + try: + self.header = makeVariantHeader(hdr) + except ValueError: + raise ValueError('file `{}` does not have valid header (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode)) + + if isinstance(self.filename, bytes): + cfilename = self.filename + else: + cfilename = NULL + + # check for index and open if present + if self.htsfile.format.format == bcf and cfilename: + if index_filename is not None: + cindex_filename = index_filename + with nogil: + idx = bcf_index_load2(cfilename, cindex_filename) + self.index = makeBCFIndex(self.header, idx) + + elif self.htsfile.format.compression == bgzf and cfilename: + if index_filename is not None: + cindex_filename = index_filename + with nogil: + tidx = tbx_index_load2(cfilename, cindex_filename) + self.index = makeTabixIndex(tidx) + + if not self.is_stream: + self.start_offset = self.tell() + else: + raise ValueError('unknown mode {}'.format(mode)) + + def reset(self): + """reset file position to beginning of file just after the header.""" + return self.seek(self.start_offset) + + def is_valid_tid(self, tid): + """ + return True if the numerical :term:`tid` is valid; False otherwise. + + returns -1 if reference is not known. 
+ """ + if not self.is_open: + raise ValueError('I/O operation on closed file') + + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int rid = tid + return 0 <= rid < hdr.n[BCF_DT_CTG] + + def get_tid(self, reference): + """ + return the numerical :term:`tid` corresponding to + :term:`reference` + + returns -1 if reference is not known. + """ + if not self.is_open: + raise ValueError('I/O operation on closed file') + + cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG] + reference = force_bytes(reference) + cdef khint_t k = kh_get_vdict(d, reference) + return kh_val_vdict(d, k).id if k != kh_end(d) else -1 + + def get_reference_name(self, tid): + """ + return :term:`reference` name corresponding to numerical :term:`tid` + """ + if not self.is_open: + raise ValueError('I/O operation on closed file') + + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int rid = tid + if rid < 0 or rid >= hdr.n[BCF_DT_CTG]: + raise ValueError('Invalid tid') + return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid)) + + def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None): + """fetch records in a :term:`region`, specified either by + :term:`contig`, *start*, and *end* (which are 0-based, half-open); + or alternatively by a samtools :term:`region` string (which is + 1-based inclusive). + + Without *contig* or *region* all mapped records will be fetched. The + records will be returned ordered by contig, which will not necessarily + be the order within the file. + + Set *reopen* to true if you will be using multiple iterators on the + same file at the same time. The iterator returned will receive its + own copy of a filehandle to the file effectively re-opening the + file. Re-opening a file incurrs some overhead, so use with care. + + If only *contig* is set, all records on *contig* will be fetched. + If both *region* and *contig* are given, an exception is raised. 
+ + Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index + (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be + read sequentially. + """ + if not self.is_open: + raise ValueError('I/O operation on closed file') + + if self.htsfile.is_write: + raise ValueError('cannot fetch from Variantfile opened for writing') + + if contig is None and region is None: + self.is_reading = 1 + bcf = self.copy() if reopen else self + bcf.seek(self.start_offset) + return iter(bcf) + + if self.index is None: + raise ValueError('fetch requires an index') + + _, tid, start, stop = self.parse_region(contig, start, stop, region, + None, end=end, reference=reference) + + if contig is None: + contig = self.get_reference_name(tid) + + self.is_reading = 1 + return self.index.fetch(self, contig, start, stop, reopen) + + def new_record(self, *args, **kwargs): + """Create a new empty :class:`VariantRecord`. + + See :meth:`VariantHeader.new_record` + """ + return self.header.new_record(*args, **kwargs) + + cpdef int write(self, VariantRecord record) except -1: + """ + write a single :class:`pysam.VariantRecord` to disk. + + returns the number of bytes written. + """ + if record is None: + raise ValueError('record must not be None') + + if not self.is_open: + return ValueError('I/O operation on closed file') + + if not self.htsfile.is_write: + raise ValueError('cannot write to a Variantfile opened for reading') + + if not self.header_written: + self.header_written = True + with nogil: + bcf_hdr_write(self.htsfile, self.header.ptr) + + #if record.header is not self.header: + # record.translate(self.header) + # raise ValueError('Writing records from a different VariantFile is not yet supported') + + if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr): + msg = 'Invalid VariantRecord. 
Number of samples does not match header ({} vs {})' + raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr))) + + # Sync END annotation before writing + bcf_sync_end(record) + + cdef int ret + + with nogil: + ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr) + + if ret < 0: + raise IOError(errno, strerror(errno)) + + return ret + + def subset_samples(self, include_samples): + """ + Read only a subset of samples to reduce processing time and memory. + Must be called prior to retrieving records. + """ + if not self.is_open: + raise ValueError('I/O operation on closed file') + + if self.htsfile.is_write: + raise ValueError('cannot subset samples from Variantfile opened for writing') + + if self.is_reading: + raise ValueError('cannot subset samples after fetching records') + + self.header._subset_samples(include_samples) + + # potentially unnecessary optimization that also sets max_unpack + if not include_samples: + self.drop_samples = True +