jpayne@68: # cython: language_level=3
jpayne@68: # cython: embedsignature=True
jpayne@68: # cython: profile=True
jpayne@68: ###############################################################################
jpayne@68: ###############################################################################
jpayne@68: ## Cython wrapper for htslib VCF/BCF reader/writer
jpayne@68: ###############################################################################
jpayne@68: #
jpayne@68: # NOTICE: This code is incomplete and preliminary.  It offers a nearly
jpayne@68: #         complete Pythonic interface to VCF/BCF metadata and data with
jpayne@68: #         reading and writing capability.  Documentation and a unit test suite
jpayne@68: #         are in the works.  The code is best tested under Python 2, but
jpayne@68: #         should also work with Python 3.  Please report any remaining
jpayne@68: #         str/bytes issues on the github site when using Python 3 and I'll
jpayne@68: #         fix them promptly.
jpayne@68: #
jpayne@68: # Here is a minimal example of how to use the API:
jpayne@68: #
jpayne@68: #     $ cat bcfview.py
jpayne@68: #     import sys
jpayne@68: #     from pysam import VariantFile
jpayne@68: #
jpayne@68: #     bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
jpayne@68: #     bcf_out = VariantFile('-', 'w', header=bcf_in.header)
jpayne@68: #
jpayne@68: #     for rec in bcf_in:
jpayne@68: #         bcf_out.write(rec)
jpayne@68: #
jpayne@68: # Performance is fairly close to that of bcftools view.  Here is an example
jpayne@68: # using some 1k Genomes data:
jpayne@68: #
jpayne@68: #     $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
jpayne@68: #      1103799
jpayne@68: #
jpayne@68: #     real	0m56.114s
jpayne@68: #     user	1m4.489s
jpayne@68: #     sys	0m3.102s
jpayne@68: #
jpayne@68: #     $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
jpayne@68: #      1103800  # bcftools adds an extra header
jpayne@68: #
jpayne@68: #     real	0m55.126s
jpayne@68: #     user	1m3.502s
jpayne@68: #     sys	0m3.459s
jpayne@68: #
jpayne@68: ###############################################################################
jpayne@68: #
jpayne@68: # TODO list:
jpayne@68: #
jpayne@68: #   * more genotype methods
jpayne@68: #   * unit test suite (perhaps py.test based)
jpayne@68: #   * documentation
jpayne@68: #   * pickle support
jpayne@68: #   * left/right locus normalization
jpayne@68: #   * fix reopen to re-use fd
jpayne@68: #
jpayne@68: ###############################################################################
jpayne@68: #
jpayne@68: # The MIT License
jpayne@68: #
jpayne@68: # Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com)
jpayne@68: #
jpayne@68: # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@68: # copy of this software and associated documentation files (the "Software"),
jpayne@68: # to deal in the Software without restriction, including without limitation
jpayne@68: # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@68: # and/or sell copies of the Software, and to permit persons to whom the
jpayne@68: # Software is furnished to do so, subject to the following conditions:
jpayne@68: #
jpayne@68: # The above copyright notice and this permission notice shall be included in
jpayne@68: # all copies or substantial portions of the Software.
jpayne@68: #
jpayne@68: # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@68: # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@68: # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
jpayne@68: # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@68: # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@68: # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@68: # DEALINGS IN THE SOFTWARE.
jpayne@68: #
jpayne@68: ###############################################################################
jpayne@68: 
jpayne@68: from __future__ import division, print_function
jpayne@68: 
jpayne@68: import os
jpayne@68: import sys
jpayne@68: 
jpayne@68: from libc.errno  cimport errno, EPIPE
jpayne@68: from libc.string cimport strcmp, strpbrk, strerror
jpayne@68: from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
jpayne@68: 
jpayne@68: cimport cython
jpayne@68: 
jpayne@68: from cpython.object  cimport PyObject
jpayne@68: from cpython.ref     cimport Py_INCREF
jpayne@68: from cpython.dict    cimport PyDict_GetItemString, PyDict_SetItemString
jpayne@68: from cpython.tuple   cimport PyTuple_New, PyTuple_SET_ITEM
jpayne@68: from cpython.bytes   cimport PyBytes_FromStringAndSize
jpayne@68: from cpython.unicode cimport PyUnicode_DecodeUTF8
jpayne@68: 
jpayne@68: from pysam.libchtslib cimport HTSFile, hisremote
jpayne@68: 
jpayne@68: from pysam.utils import unquoted_str
jpayne@68: 
jpayne@68: 
jpayne@68: __all__ = ['VariantFile',
jpayne@68:            'VariantHeader',
jpayne@68:            'VariantHeaderRecord',
jpayne@68:            'VariantHeaderRecords',
jpayne@68:            'VariantMetadata',
jpayne@68:            'VariantHeaderMetadata',
jpayne@68:            'VariantContig',
jpayne@68:            'VariantHeaderContigs',
jpayne@68:            'VariantHeaderSamples',
jpayne@68:            'VariantRecordFilter',
jpayne@68:            'VariantRecordFormat',
jpayne@68:            'VariantRecordInfo',
jpayne@68:            'VariantRecordSamples',
jpayne@68:            'VariantRecord',
jpayne@68:            'VariantRecordSample',
jpayne@68:            'BaseIndex',
jpayne@68:            'BCFIndex',
jpayne@68:            'TabixIndex',
jpayne@68:            'BaseIterator',
jpayne@68:            'BCFIterator',
jpayne@68:            'TabixIterator',
jpayne@68:            'VariantRecord']
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Constants
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: cdef int MAX_POS = (1 << 31) - 1
jpayne@68: cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
jpayne@68: cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
jpayne@68: cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Python 3 compatibility functions
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
jpayne@68: from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes
jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Sentinel object
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: cdef object _nothing = object()
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## VCF/BCF string intern system
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: cdef dict bcf_str_cache = {}
jpayne@68: 
jpayne@68: cdef inline bcf_str_cache_get_charptr(const char* s):
jpayne@68:     if s == NULL:
jpayne@68:         return None
jpayne@68: 
jpayne@68:     cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
jpayne@68:     if pystr:
jpayne@68:         return <object>pystr
jpayne@68: 
jpayne@68:     val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
jpayne@68: 
jpayne@68:     PyDict_SetItemString(bcf_str_cache, s, val)
jpayne@68: 
jpayne@68:     return val
jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Genotype math
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: cdef int comb(int n, int k) except -1:
jpayne@68:     """Return binomial coefficient: n choose k
jpayne@68: 
jpayne@68:     >>> comb(5, 1)
jpayne@68:     5
jpayne@68:     >>> comb(5, 2)
jpayne@68:     10
jpayne@68:     >>> comb(2, 2)
jpayne@68:     1
jpayne@68:     >>> comb(100, 2)
jpayne@68:     4950
jpayne@68:     """
jpayne@68:     if k > n:
jpayne@68:         return 0
jpayne@68:     elif k == n:
jpayne@68:         return 1
jpayne@68:     elif k > n // 2:
jpayne@68:         k = n - k
jpayne@68: 
jpayne@68:     cdef d, result
jpayne@68: 
jpayne@68:     d = result = n - k + 1
jpayne@68:     for i in range(2, k + 1):
jpayne@68:         d += 1
jpayne@68:         result  *= d
jpayne@68:         result //= i
jpayne@68:     return result
jpayne@68: 
jpayne@68: 
jpayne@68: cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
jpayne@68:     """Return the count of genotypes expected for the given ploidy and number of alleles.
jpayne@68: 
jpayne@68:     >>> bcf_geno_combinations(1, 2)
jpayne@68:     2
jpayne@68:     >>> bcf_geno_combinations(2, 2)
jpayne@68:     3
jpayne@68:     >>> bcf_geno_combinations(2, 3)
jpayne@68:     6
jpayne@68:     >>> bcf_geno_combinations(3, 2)
jpayne@68:     4
jpayne@68:     """
jpayne@68:     return comb(alleles + ploidy - 1, ploidy)
jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Low level type conversion helpers
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: 
jpayne@68: cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
jpayne@68:     return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
jpayne@68:     return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0
jpayne@68: 
jpayne@68: 
jpayne@68: cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:
jpayne@68: 
jpayne@68:     if sample < 0:
jpayne@68:         raise ValueError('genotype is only valid as a format field')
jpayne@68: 
jpayne@68:     cdef int32_t *gt_arr = NULL
jpayne@68:     cdef int ngt = 0
jpayne@68:     ngt = bcf_get_genotypes(hdr, rec, &gt_arr, &ngt)
jpayne@68: 
jpayne@68:     if ngt <= 0 or not gt_arr:
jpayne@68:         return 0
jpayne@68: 
jpayne@68:     assert ngt % rec.n_sample == 0
jpayne@68:     cdef int max_ploidy = ngt // rec.n_sample
jpayne@68:     cdef int32_t *gt = gt_arr + sample * max_ploidy
jpayne@68:     cdef int ploidy = 0
jpayne@68: 
jpayne@68:     while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
jpayne@68:         gt += 1
jpayne@68:         ploidy += 1
jpayne@68: 
jpayne@68:     free(<void*>gt_arr)
jpayne@68: 
jpayne@68:     return bcf_geno_combinations(ploidy, rec.n_allele)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
jpayne@68:     if not a:
jpayne@68:         return None
jpayne@68:     try:
jpayne@68:          return tuple(charptr_to_str(a[i]) for i in range(n))
jpayne@68:     finally:
jpayne@68:         if free_after and a:
jpayne@68:             free(a)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
jpayne@68:     cdef char    *datac
jpayne@68:     cdef int8_t  *data8
jpayne@68:     cdef int16_t *data16
jpayne@68:     cdef int32_t *data32
jpayne@68:     cdef float   *dataf
jpayne@68:     cdef int      i
jpayne@68:     cdef bytes    b
jpayne@68: 
jpayne@68:     if not data or n <= 0:
jpayne@68:         return None
jpayne@68: 
jpayne@68:     if type == BCF_BT_CHAR:
jpayne@68:         datac = <char *>data
jpayne@68: 
jpayne@68:         if not n:
jpayne@68:             value = ()
jpayne@68:         else:
jpayne@68:             # Check if at least one null terminator is present
jpayne@68:             if datac[n-1] == bcf_str_vector_end:
jpayne@68:                 # If so, create a string up to the first null terminator
jpayne@68:                 b = datac
jpayne@68:             else:
jpayne@68:                 # Otherwise, copy the entire block
jpayne@68:                 b = datac[:n]
jpayne@68:             value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
jpayne@68:     else:
jpayne@68:         value = []
jpayne@68:         if type == BCF_BT_INT8:
jpayne@68:             data8 = <int8_t *>data
jpayne@68:             for i in range(n):
jpayne@68:                 if data8[i] == bcf_int8_vector_end:
jpayne@68:                     break
jpayne@68:                 value.append(data8[i] if data8[i] != bcf_int8_missing else None)
jpayne@68:         elif type == BCF_BT_INT16:
jpayne@68:             data16 = <int16_t *>data
jpayne@68:             for i in range(n):
jpayne@68:                 if data16[i] == bcf_int16_vector_end:
jpayne@68:                     break
jpayne@68:                 value.append(data16[i] if data16[i] != bcf_int16_missing else None)
jpayne@68:         elif type == BCF_BT_INT32:
jpayne@68:             data32 = <int32_t *>data
jpayne@68:             for i in range(n):
jpayne@68:                 if data32[i] == bcf_int32_vector_end:
jpayne@68:                     break
jpayne@68:                 value.append(data32[i] if data32[i] != bcf_int32_missing else None)
jpayne@68:         elif type == BCF_BT_FLOAT:
jpayne@68:             dataf = <float *>data
jpayne@68:             for i in range(n):
jpayne@68:                 if bcf_float_is_vector_end(dataf[i]):
jpayne@68:                     break
jpayne@68:                 value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
jpayne@68:         else:
jpayne@68:             raise TypeError('unsupported info type code')
jpayne@68: 
jpayne@68:     # FIXME: Need to know length?  Report errors?  Pad with missing values?  Not clear what to do.
jpayne@68:     if not value:
jpayne@68:         if scalar:
jpayne@68:             value = None
jpayne@68:         elif count <= 0:
jpayne@68:             value = ()
jpayne@68:         else:
jpayne@68:             value = (None,)*count
jpayne@68:     elif scalar and len(value) == 1:
jpayne@68:         value = value[0]
jpayne@68:     else:
jpayne@68:         value = tuple(value)
jpayne@68: 
jpayne@68:     return value
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
jpayne@68:     cdef char    *datac
jpayne@68:     cdef int8_t  *data8
jpayne@68:     cdef int16_t *data16
jpayne@68:     cdef int32_t *data32
jpayne@68:     cdef float   *dataf
jpayne@68:     cdef ssize_t  i, value_count = len(values)
jpayne@68: 
jpayne@68:     assert value_count <= n
jpayne@68: 
jpayne@68:     if bt_type == BCF_BT_CHAR:
jpayne@68:         if not isinstance(values, (str, bytes)):
jpayne@68:             values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
jpayne@68:             value_count = len(values)
jpayne@68:         assert value_count <= n
jpayne@68:         datac = <char *>data
jpayne@68:         memcpy(datac, <char *>values, value_count)
jpayne@68:         for i in range(value_count, n):
jpayne@68:             datac[i] = 0
jpayne@68:     elif bt_type == BCF_BT_INT8:
jpayne@68:         datai8 = <int8_t *>data
jpayne@68:         for i in range(value_count):
jpayne@68:             val = values[i]
jpayne@68:             datai8[i] = val if val is not None else bcf_int8_missing
jpayne@68:         for i in range(value_count, n):
jpayne@68:             datai8[i] = bcf_int8_vector_end
jpayne@68:     elif bt_type == BCF_BT_INT16:
jpayne@68:         datai16 = <int16_t *>data
jpayne@68:         for i in range(value_count):
jpayne@68:             val = values[i]
jpayne@68:             datai16[i] = val if val is not None else bcf_int16_missing
jpayne@68:         for i in range(value_count, n):
jpayne@68:             datai16[i] = bcf_int16_vector_end
jpayne@68:     elif bt_type == BCF_BT_INT32:
jpayne@68:         datai32 = <int32_t *>data
jpayne@68:         for i in range(value_count):
jpayne@68:             val = values[i]
jpayne@68:             datai32[i] = val if val is not None else bcf_int32_missing
jpayne@68:         for i in range(value_count, n):
jpayne@68:             datai32[i] = bcf_int32_vector_end
jpayne@68:     elif bt_type == BCF_BT_FLOAT:
jpayne@68:         dataf = <float *>data
jpayne@68:         for i in range(value_count):
jpayne@68:             val = values[i]
jpayne@68:             if val is None:
jpayne@68:                 bcf_float_set(dataf + i, bcf_float_missing)
jpayne@68:             else:
jpayne@68:                 dataf[i] = val
jpayne@68:         for i in range(value_count, n):
jpayne@68:             bcf_float_set(dataf + i, bcf_float_vector_end)
jpayne@68:     else:
jpayne@68:         raise TypeError('unsupported type')
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_empty_array(int type, ssize_t n, int vlen):
jpayne@68:     cdef char    *datac
jpayne@68:     cdef int32_t *data32
jpayne@68:     cdef float   *dataf
jpayne@68:     cdef int      i
jpayne@68: 
jpayne@68:     if n <= 0:
jpayne@68:         raise ValueError('Cannot create empty array')
jpayne@68: 
jpayne@68:     if type == BCF_HT_STR:
jpayne@68:         value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
jpayne@68:         datac = <char *>value
jpayne@68:         for i in range(n):
jpayne@68:             datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
jpayne@68:     elif type == BCF_HT_INT:
jpayne@68:         value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
jpayne@68:         data32 = <int32_t *><char *>value
jpayne@68:         for i in range(n):
jpayne@68:             data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@68:     elif type == BCF_HT_REAL:
jpayne@68:         value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
jpayne@68:         dataf = <float *><char *>value
jpayne@68:         for i in range(n):
jpayne@68:             bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
jpayne@68:     else:
jpayne@68:         raise TypeError('unsupported header type code')
jpayne@68: 
jpayne@68:     return value
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
jpayne@68:                            void *dst_data, int dst_type, size_t dst_values,
jpayne@68:                            int vlen):
jpayne@68:     """copy data from src to dest where the size of the elements (src_type/dst_type) differ
jpayne@68:     as well as the number of elements (src_values/dst_values).
jpayne@68:     """
jpayne@68: 
jpayne@68:     cdef char    *src_datac
jpayne@68:     cdef char    *dst_datac
jpayne@68:     cdef int8_t  *src_datai8
jpayne@68:     cdef int16_t *src_datai16
jpayne@68:     cdef int32_t *src_datai32
jpayne@68:     cdef int32_t *dst_datai
jpayne@68:     cdef float   *src_dataf
jpayne@68:     cdef float   *dst_dataf
jpayne@68:     cdef ssize_t src_size, dst_size, i, j
jpayne@68:     cdef int val
jpayne@68: 
jpayne@68:     if src_values > dst_values:
jpayne@68:         raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
jpayne@68: 
jpayne@68:     if src_type == dst_type == BCF_BT_CHAR:
jpayne@68:         src_datac = <char *>src_data
jpayne@68:         dst_datac = <char *>dst_data
jpayne@68:         memcpy(dst_datac, src_datac, src_values)
jpayne@68:         for i in range(src_values, dst_values):
jpayne@68:             dst_datac[i] = 0
jpayne@68:     elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
jpayne@68:         src_datai8 = <int8_t *>src_data
jpayne@68:         dst_datai  = <int32_t *>dst_data
jpayne@68:         for i in range(src_values):
jpayne@68:             val = src_datai8[i]
jpayne@68:             if val == bcf_int8_missing:
jpayne@68:                 val = bcf_int32_missing
jpayne@68:             elif val == bcf_int8_vector_end:
jpayne@68:                 val = bcf_int32_vector_end
jpayne@68:             dst_datai[i] = val
jpayne@68:         for i in range(src_values, dst_values):
jpayne@68:             dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@68:     elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
jpayne@68:         src_datai16 = <int16_t *>src_data
jpayne@68:         dst_datai   = <int32_t *>dst_data
jpayne@68:         for i in range(src_values):
jpayne@68:             val = src_datai16[i]
jpayne@68:             if val == bcf_int16_missing:
jpayne@68:                 val = bcf_int32_missing
jpayne@68:             elif val == bcf_int16_vector_end:
jpayne@68:                 val = bcf_int32_vector_end
jpayne@68:             dst_datai[i] = val
jpayne@68:         for i in range(src_values, dst_values):
jpayne@68:             dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@68:     elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
jpayne@68:         src_datai32 = <int32_t *>src_data
jpayne@68:         dst_datai   = <int32_t *>dst_data
jpayne@68:         for i in range(src_values):
jpayne@68:             dst_datai[i] = src_datai32[i]
jpayne@68:         for i in range(src_values, dst_values):
jpayne@68:             dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
jpayne@68:     elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
jpayne@68:         src_dataf = <float *>src_data
jpayne@68:         dst_dataf = <float *>dst_data
jpayne@68:         for i in range(src_values):
jpayne@68:             dst_dataf[i] = src_dataf[i]
jpayne@68:         for i in range(src_values, dst_values):
jpayne@68:             bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
jpayne@68:     else:
jpayne@68:         raise TypeError('unsupported types')
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
jpayne@68:     if record is None:
jpayne@68:         raise ValueError('record must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@68:     cdef bcf1_t *r = record.ptr
jpayne@68: 
jpayne@68:     if not check_header_id(hdr, hl_type, id):
jpayne@68:         raise ValueError('Invalid header')
jpayne@68: 
jpayne@68:     cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
jpayne@68:     cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
jpayne@68: 
jpayne@68:     scalar[0] = 0
jpayne@68: 
jpayne@68:     if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
jpayne@68:         count[0] = number
jpayne@68:     elif length == BCF_VL_FIXED:
jpayne@68:         if number == 1:
jpayne@68:             scalar[0] = 1
jpayne@68:         count[0] = number
jpayne@68:     elif length == BCF_VL_R:
jpayne@68:         count[0] = r.n_allele
jpayne@68:     elif length == BCF_VL_A:
jpayne@68:         count[0] = r.n_allele - 1
jpayne@68:     elif length == BCF_VL_G:
jpayne@68:         count[0] = bcf_genotype_count(hdr, r, sample)
jpayne@68:     elif length == BCF_VL_VAR:
jpayne@68:         count[0] = -1
jpayne@68:     else:
jpayne@68:         raise ValueError('Unknown format length')
jpayne@68: 
jpayne@68: 
jpayne@68: cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
jpayne@68:     if record is None:
jpayne@68:         raise ValueError('record must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@68: 
jpayne@68:     cdef char *s
jpayne@68:     cdef ssize_t count
jpayne@68:     cdef int scalar
jpayne@68: 
jpayne@68:     bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)
jpayne@68: 
jpayne@68:     if z.len == 0:
jpayne@68:         if  bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
jpayne@68:             value = True
jpayne@68:         elif scalar:
jpayne@68:             value = None
jpayne@68:         else:
jpayne@68:             value = ()
jpayne@68:     elif z.len == 1:
jpayne@68:         if z.type == BCF_BT_INT8:
jpayne@68:             value = z.v1.i if z.v1.i != bcf_int8_missing else None
jpayne@68:         elif z.type == BCF_BT_INT16:
jpayne@68:             value = z.v1.i if z.v1.i != bcf_int16_missing else None
jpayne@68:         elif z.type == BCF_BT_INT32:
jpayne@68:             value = z.v1.i if z.v1.i != bcf_int32_missing else None
jpayne@68:         elif z.type == BCF_BT_FLOAT:
jpayne@68:             value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
jpayne@68:         elif z.type == BCF_BT_CHAR:
jpayne@68:             value = force_str(chr(z.v1.i))
jpayne@68:         else:
jpayne@68:             raise TypeError('unsupported info type code')
jpayne@68: 
jpayne@68:         if not scalar and value != ():
jpayne@68:             value = (value,)
jpayne@68:     else:
jpayne@68:         value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
jpayne@68: 
jpayne@68:     return value
jpayne@68: 
jpayne@68: 
jpayne@68: cdef object bcf_check_values(VariantRecord record, value, int sample,
jpayne@68:                              int hl_type, int ht_type,
jpayne@68:                              int id, int bt_type, ssize_t bt_len,
jpayne@68:                              ssize_t *value_count, int *scalar, int *realloc):
jpayne@68: 
jpayne@68:     if record is None:
jpayne@68:         raise ValueError('record must not be None')
jpayne@68: 
jpayne@68:     bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)
jpayne@68: 
jpayne@68:     # Validate values now that we know the type and size
jpayne@68:     values = (value,) if not isinstance(value, (list, tuple)) else value
jpayne@68: 
jpayne@68:     # Validate values now that we know the type and size
jpayne@68:     if ht_type == BCF_HT_FLAG:
jpayne@68:         value_count[0] = 1
jpayne@68:     elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
jpayne@68:         # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
jpayne@68:         value_count[0] = -1
jpayne@68: 
jpayne@68:     cdef int given = len(values)
jpayne@68:     if value_count[0] != -1 and value_count[0] != given:
jpayne@68:         if scalar[0]:
jpayne@68:             raise TypeError('value expected to be scalar, given len={}'.format(given))
jpayne@68:         else:
jpayne@68:             raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
jpayne@68: 
jpayne@68:     if ht_type == BCF_HT_REAL:
jpayne@68:         for v in values:
jpayne@68:             if not(v is None or isinstance(v, (float, int))):
jpayne@68:                 raise TypeError('invalid value for Float format')
jpayne@68:     elif ht_type == BCF_HT_INT:
jpayne@68:         for v in values:
jpayne@68:             if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
jpayne@68:                 raise TypeError('invalid value for Integer format')
jpayne@68:         for v in values:
jpayne@68:             if not(v is None or bcf_int32_missing < v <= INT32_MAX):
jpayne@68:                 raise ValueError('Integer value too small/large to store in VCF/BCF')
jpayne@68:     elif ht_type == BCF_HT_STR:
jpayne@68:         values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
jpayne@68:     elif ht_type == BCF_HT_FLAG:
jpayne@68:         if values[0] not in (True, False, None, 1, 0):
jpayne@68:             raise ValueError('Flag values must be: True, False, None, 1, 0')
jpayne@68:     else:
jpayne@68:         raise TypeError('unsupported type')
jpayne@68: 
jpayne@68:     realloc[0] = 0
jpayne@68:     if len(values) <= 1 and hl_type == BCF_HL_INFO:
jpayne@68:         realloc[0] = 0
jpayne@68:     elif len(values) > bt_len:
jpayne@68:         realloc[0] = 1
jpayne@68:     elif bt_type == BCF_BT_INT8:
jpayne@68:         for v in values:
jpayne@68:             if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
jpayne@68:                 realloc[0] = 1
jpayne@68:                 break
jpayne@68:     elif bt_type == BCF_BT_INT16:
jpayne@68:         for v in values:
jpayne@68:             if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
jpayne@68:                 realloc[0] = 1
jpayne@68:                 break
jpayne@68: 
jpayne@68:     return values
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_encode_alleles(VariantRecord record, values):
jpayne@68:     if record is None:
jpayne@68:         raise ValueError('record must not be None')
jpayne@68: 
jpayne@68:     cdef bcf1_t *r = record.ptr
jpayne@68:     cdef int32_t nalleles = r.n_allele
jpayne@68:     cdef list gt_values = []
jpayne@68:     cdef char *s
jpayne@68:     cdef int i
jpayne@68: 
jpayne@68:     if values is None:
jpayne@68:         return ()
jpayne@68: 
jpayne@68:     if not isinstance(values, (list, tuple)):
jpayne@68:         values = (values,)
jpayne@68: 
jpayne@68:     for value in values:
jpayne@68:         if value is None:
jpayne@68:             gt_values.append(bcf_gt_missing)
jpayne@68:         elif isinstance(value, (str, bytes)):
jpayne@68:             bvalue = force_bytes(value)
jpayne@68:             s = bvalue
jpayne@68:             for i in range(r.n_allele):
jpayne@68:                 if strcmp(r.d.allele[i], s) != 0:
jpayne@68:                     gt_values.append(bcf_gt_unphased(i))
jpayne@68:                     break
jpayne@68:             else:
jpayne@68:                 raise ValueError('Unknown allele')
jpayne@68:         else:
jpayne@68:             i = value
jpayne@68:             if not (0 <= i < nalleles):
jpayne@68:                 raise ValueError('Invalid allele index')
jpayne@68:             gt_values.append(bcf_gt_unphased(i))
jpayne@68: 
jpayne@68:     return gt_values
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_info_set_value(VariantRecord record, key, value):
jpayne@68:     if record is None:
jpayne@68:         raise ValueError('record must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@68:     cdef bcf1_t *r = record.ptr
jpayne@68:     cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
jpayne@68:     cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
jpayne@68: 
jpayne@68:     if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:         raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:     cdef bytes bkey = force_bytes(key)
jpayne@68:     cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68: 
jpayne@68:     if info:
jpayne@68:         info_id = info.key
jpayne@68:     else:
jpayne@68:         info_id = bcf_header_get_info_id(hdr, bkey)
jpayne@68: 
jpayne@68:     if info_id < 0:
jpayne@68:         raise KeyError('unknown INFO: {}'.format(key))
jpayne@68: 
jpayne@68:     if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@68:         raise ValueError('Invalid header')
jpayne@68: 
jpayne@68:     info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
jpayne@68:     values = bcf_check_values(record, value, -1,
jpayne@68:                               BCF_HL_INFO, info_type, info_id,
jpayne@68:                               info.type if info else -1,
jpayne@68:                               info.len  if info else -1,
jpayne@68:                               &value_count, &scalar, &realloc)
jpayne@68: 
jpayne@68:     if info_type == BCF_HT_FLAG:
jpayne@68:         if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
jpayne@68:             raise ValueError('Unable to update INFO values')
jpayne@68:         return
jpayne@68: 
jpayne@68:     vlen = value_count < 0
jpayne@68:     value_count = len(values)
jpayne@68: 
jpayne@68:     # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
jpayne@68:     # If we can, write updated values to existing allocated storage
jpayne@68:     if 0 and info and not realloc:
jpayne@68:         r.d.shared_dirty |= BCF1_DIRTY_INF
jpayne@68: 
jpayne@68:         if value_count == 0:
jpayne@68:             info.len = 0
jpayne@68:             if not info.vptr:
jpayne@68:                 info.vptr = <uint8_t *>&info.v1.i
jpayne@68: 
jpayne@68:         elif value_count == 1:
jpayne@68:             # FIXME: Check if need to free vptr if info.len > 0?
jpayne@68:             if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
jpayne@68:                 bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
jpayne@68:             elif info.type == BCF_BT_FLOAT:
jpayne@68:                 bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
jpayne@68:             else:
jpayne@68:                 raise TypeError('unsupported info type code')
jpayne@68: 
jpayne@68:             info.len = 1
jpayne@68:             if not info.vptr:
jpayne@68:                 info.vptr = <uint8_t *>&info.v1.i
jpayne@68:         else:
jpayne@68:             bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
jpayne@68: 
jpayne@68:         return
jpayne@68: 
jpayne@68:     alloc_len = max(1, value_count)
jpayne@68:     if info and info.len > alloc_len:
jpayne@68:         alloc_len = info.len
jpayne@68: 
jpayne@68:     new_values = bcf_empty_array(info_type, alloc_len, vlen)
jpayne@68:     cdef char *valp = <char *>new_values
jpayne@68: 
jpayne@68:     if info_type == BCF_HT_INT:
jpayne@68:         dst_type = BCF_BT_INT32
jpayne@68:     elif info_type == BCF_HT_REAL:
jpayne@68:         dst_type = BCF_BT_FLOAT
jpayne@68:     elif info_type == BCF_HT_STR:
jpayne@68:         dst_type = BCF_BT_CHAR
jpayne@68:     else:
jpayne@68:         raise ValueError('Unsupported INFO type')
jpayne@68: 
jpayne@68:     bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
jpayne@68: 
jpayne@68:     if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
jpayne@68:         raise ValueError('Unable to update INFO values')
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_info_del_value(VariantRecord record, key):
jpayne@68:     if record is None:
jpayne@68:         raise ValueError('record must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@68:     cdef bcf1_t *r = record.ptr
jpayne@68:     cdef ssize_t value_count
jpayne@68:     cdef int scalar
jpayne@68: 
jpayne@68:     if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:         raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:     cdef bytes bkey = force_bytes(key)
jpayne@68:     cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68: 
jpayne@68:     if not info:
jpayne@68:         raise KeyError(key)
jpayne@68: 
jpayne@68:     bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)
jpayne@68: 
jpayne@68:     if value_count <= 0:
jpayne@68:         null_value = ()
jpayne@68:     elif scalar:
jpayne@68:         null_value = None
jpayne@68:     else:
jpayne@68:         null_value = (None,)*value_count
jpayne@68: 
jpayne@68:     bcf_info_set_value(record, bkey, null_value)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_format_get_value(VariantRecordSample sample, key):
jpayne@68:     if sample is None:
jpayne@68:         raise ValueError('sample must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68:     cdef bcf1_t *r = sample.record.ptr
jpayne@68:     cdef ssize_t count
jpayne@68:     cdef int scalar
jpayne@68: 
jpayne@68:     if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68:         raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:     cdef bytes bkey = force_bytes(key)
jpayne@68:     cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68: 
jpayne@68:     if not fmt or not fmt.p:
jpayne@68:         raise KeyError('invalid FORMAT: {}'.format(key))
jpayne@68: 
jpayne@68:     if is_gt_fmt(hdr, fmt.id):
jpayne@68:         return bcf_format_get_allele_indices(sample)
jpayne@68: 
jpayne@68:     bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)
jpayne@68: 
jpayne@68:     if fmt.p and fmt.n and fmt.size:
jpayne@68:         return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
jpayne@68:     elif scalar:
jpayne@68:         return None
jpayne@68:     elif count <= 0:
jpayne@68:         return ()
jpayne@68:     else:
jpayne@68:         return (None,)*count
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_format_set_value(VariantRecordSample sample, key, value):
jpayne@68:     if sample is None:
jpayne@68:         raise ValueError('sample must not be None')
jpayne@68: 
jpayne@68:     if key == 'phased':
jpayne@68:         sample.phased = bool(value)
jpayne@68:         return
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68:     cdef bcf1_t *r = sample.record.ptr
jpayne@68:     cdef int fmt_id
jpayne@68:     cdef vdict_t *d
jpayne@68:     cdef khiter_t k
jpayne@68:     cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
jpayne@68:     cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size
jpayne@68: 
jpayne@68:     if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68:         raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:     cdef bytes bkey = force_bytes(key)
jpayne@68:     cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68: 
jpayne@68:     if fmt:
jpayne@68:         fmt_id = fmt.id
jpayne@68:     else:
jpayne@68:         d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@68:         k = kh_get_vdict(d, bkey)
jpayne@68: 
jpayne@68:         if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
jpayne@68:             raise KeyError('unknown format: {}'.format(key))
jpayne@68: 
jpayne@68:         fmt_id = kh_val_vdict(d, k).id
jpayne@68: 
jpayne@68:     if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
jpayne@68:         raise ValueError('Invalid header')
jpayne@68: 
jpayne@68:     fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
jpayne@68: 
jpayne@68:     if fmt_type == BCF_HT_FLAG:
jpayne@68:         raise ValueError('Flag types are not allowed on FORMATs')
jpayne@68: 
jpayne@68:     if is_gt_fmt(hdr, fmt_id):
jpayne@68:         value = bcf_encode_alleles(sample.record, value)
jpayne@68:         # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
jpayne@68:         fmt_type = BCF_HT_INT
jpayne@68: 
jpayne@68:     values = bcf_check_values(sample.record, value, sample.index,
jpayne@68:                               BCF_HL_FMT, fmt_type, fmt_id,
jpayne@68:                               fmt.type if fmt else -1,
jpayne@68:                               fmt.n    if fmt else -1,
jpayne@68:                               &value_count, &scalar, &realloc)
jpayne@68:     vlen = value_count < 0
jpayne@68:     value_count = len(values)
jpayne@68: 
jpayne@68:     # If we can, write updated values to existing allocated storage.
jpayne@68:     if fmt and not realloc:
jpayne@68:         r.d.indiv_dirty = 1
jpayne@68:         bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
jpayne@68:         return
jpayne@68: 
jpayne@68:     alloc_len = max(1, value_count)
jpayne@68:     if fmt and fmt.n > alloc_len:
jpayne@68:         alloc_len = fmt.n
jpayne@68: 
jpayne@68:     nsamples = r.n_sample
jpayne@68:     new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
jpayne@68:     cdef char *new_values_p = <char *>new_values
jpayne@68: 
jpayne@68:     if fmt_type == BCF_HT_INT:
jpayne@68:         dst_type = BCF_BT_INT32
jpayne@68:         dst_size = sizeof(int32_t) * alloc_len
jpayne@68:     elif fmt_type == BCF_HT_REAL:
jpayne@68:         dst_type = BCF_BT_FLOAT
jpayne@68:         dst_size = sizeof(float) * alloc_len
jpayne@68:     elif fmt_type == BCF_HT_STR:
jpayne@68:         dst_type = BCF_BT_CHAR
jpayne@68:         dst_size = sizeof(char) * alloc_len
jpayne@68:     else:
jpayne@68:         raise ValueError('Unsupported FORMAT type')
jpayne@68: 
jpayne@68:     if fmt and nsamples > 1:
jpayne@68:         for i in range(nsamples):
jpayne@68:             bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
jpayne@68:                                   new_values_p  + i * dst_size, dst_type, alloc_len,
jpayne@68:                                   vlen)
jpayne@68: 
jpayne@68:     bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)
jpayne@68: 
jpayne@68:     if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
jpayne@68:         raise ValueError('Unable to update format values')
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_format_del_value(VariantRecordSample sample, key):
jpayne@68:     if sample is None:
jpayne@68:         raise ValueError('sample must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68:     cdef bcf1_t *r = sample.record.ptr
jpayne@68:     cdef ssize_t value_count
jpayne@68:     cdef int scalar
jpayne@68: 
jpayne@68:     if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68:         raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:     cdef bytes bkey = force_bytes(key)
jpayne@68:     cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68: 
jpayne@68:     if not fmt or not fmt.p:
jpayne@68:         raise KeyError(key)
jpayne@68: 
jpayne@68:     bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index)
jpayne@68: 
jpayne@68:     if value_count <= 0:
jpayne@68:         null_value = ()
jpayne@68:     elif scalar:
jpayne@68:         null_value = None
jpayne@68:     else:
jpayne@68:         null_value = (None,)*value_count
jpayne@68: 
jpayne@68:     bcf_format_set_value(sample, bkey, null_value)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_format_get_allele_indices(VariantRecordSample sample):
jpayne@68:     if sample is None:
jpayne@68:         raise ValueError('sample must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68:     cdef bcf1_t *r = sample.record.ptr
jpayne@68:     cdef int32_t n = r.n_sample
jpayne@68: 
jpayne@68:     if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68:         raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:     if sample.index < 0 or sample.index >= n or not r.n_fmt:
jpayne@68:         return ()
jpayne@68: 
jpayne@68:     cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@68:     cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@68: 
jpayne@68:     if not gt0 or not fmt0.n:
jpayne@68:         return ()
jpayne@68: 
jpayne@68:     cdef int8_t  *data8
jpayne@68:     cdef int16_t *data16
jpayne@68:     cdef int32_t *data32
jpayne@68:     cdef int32_t a, nalleles = r.n_allele
jpayne@68:     cdef list alleles = []
jpayne@68: 
jpayne@68:     if fmt0.type == BCF_BT_INT8:
jpayne@68:         data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data8[i] == bcf_int8_vector_end:
jpayne@68:                 break
jpayne@68:             elif data8[i] == bcf_gt_missing:
jpayne@68:                 a = -1
jpayne@68:             else:
jpayne@68:                 a = bcf_gt_allele(data8[i])
jpayne@68:             alleles.append(a if 0 <= a < nalleles else None)
jpayne@68:     elif fmt0.type == BCF_BT_INT16:
jpayne@68:         data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data16[i] == bcf_int16_vector_end:
jpayne@68:                 break
jpayne@68:             elif data16[i] == bcf_gt_missing:
jpayne@68:                 a = -1
jpayne@68:             else:
jpayne@68:                 a = bcf_gt_allele(data16[i])
jpayne@68:             alleles.append(a if 0 <= a < nalleles else None)
jpayne@68:     elif fmt0.type == BCF_BT_INT32:
jpayne@68:         data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data32[i] == bcf_int32_vector_end:
jpayne@68:                 break
jpayne@68:             elif data32[i] == bcf_gt_missing:
jpayne@68:                 a = -1
jpayne@68:             else:
jpayne@68:                 a = bcf_gt_allele(data32[i])
jpayne@68:             alleles.append(a if 0 <= a < nalleles else None)
jpayne@68: 
jpayne@68:     return tuple(alleles)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_format_get_alleles(VariantRecordSample sample):
jpayne@68:     if sample is None:
jpayne@68:         raise ValueError('sample must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68:     cdef bcf1_t *r = sample.record.ptr
jpayne@68:     cdef int32_t nsamples = r.n_sample
jpayne@68: 
jpayne@68:     if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68:         raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:     cdef int32_t nalleles = r.n_allele
jpayne@68: 
jpayne@68:     if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
jpayne@68:         return ()
jpayne@68: 
jpayne@68:     cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@68:     cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@68: 
jpayne@68:     if not gt0 or not fmt0.n:
jpayne@68:         return ()
jpayne@68: 
jpayne@68:     cdef int32_t  a
jpayne@68:     cdef int8_t  *data8
jpayne@68:     cdef int16_t *data16
jpayne@68:     cdef int32_t *data32
jpayne@68:     alleles = []
jpayne@68:     if fmt0.type == BCF_BT_INT8:
jpayne@68:         data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data8[i] == bcf_int8_vector_end:
jpayne@68:                 break
jpayne@68:             a = bcf_gt_allele(data8[i])
jpayne@68:             alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
jpayne@68:     elif fmt0.type == BCF_BT_INT16:
jpayne@68:         data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data16[i] == bcf_int16_vector_end:
jpayne@68:                 break
jpayne@68:             a = bcf_gt_allele(data16[i])
jpayne@68:             alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
jpayne@68:     elif fmt0.type == BCF_BT_INT32:
jpayne@68:         data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data32[i] == bcf_int32_vector_end:
jpayne@68:                 break
jpayne@68:             a = bcf_gt_allele(data32[i])
jpayne@68:             alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
jpayne@68:     return tuple(alleles)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bint bcf_sample_get_phased(VariantRecordSample sample):
jpayne@68:     if sample is None:
jpayne@68:         raise ValueError('sample must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68:     cdef bcf1_t *r = sample.record.ptr
jpayne@68:     cdef int32_t n = r.n_sample
jpayne@68: 
jpayne@68:     if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68:         raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:     if sample.index < 0 or sample.index >= n or not r.n_fmt:
jpayne@68:         return False
jpayne@68: 
jpayne@68:     cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@68:     cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@68: 
jpayne@68:     if not gt0 or not fmt0.n:
jpayne@68:         return False
jpayne@68: 
jpayne@68:     cdef int8_t  *data8
jpayne@68:     cdef int16_t *data16
jpayne@68:     cdef int32_t *data32
jpayne@68: 
jpayne@68:     cdef bint phased = False
jpayne@68: 
jpayne@68:     if fmt0.type == BCF_BT_INT8:
jpayne@68:         data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data8[i] == bcf_int8_vector_end:
jpayne@68:                 break
jpayne@68:             elif data8[i] == bcf_int8_missing:
jpayne@68:                 continue
jpayne@68:             elif i and not bcf_gt_is_phased(data8[i]):
jpayne@68:                 return False
jpayne@68:             else:
jpayne@68:                 phased = True
jpayne@68:     elif fmt0.type == BCF_BT_INT16:
jpayne@68:         data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data16[i] == bcf_int16_vector_end:
jpayne@68:                 break
jpayne@68:             elif data16[i] == bcf_int16_missing:
jpayne@68:                 continue
jpayne@68:             elif i and not bcf_gt_is_phased(data16[i]):
jpayne@68:                 return False
jpayne@68:             else:
jpayne@68:                 phased = True
jpayne@68:     elif fmt0.type == BCF_BT_INT32:
jpayne@68:         data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data32[i] == bcf_int32_vector_end:
jpayne@68:                 break
jpayne@68:             elif data32[i] == bcf_int32_missing:
jpayne@68:                 continue
jpayne@68:             elif i and not bcf_gt_is_phased(data32[i]):
jpayne@68:                 return False
jpayne@68:             else:
jpayne@68:                 phased = True
jpayne@68: 
jpayne@68:     return phased
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
jpayne@68:     if sample is None:
jpayne@68:         raise ValueError('sample must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = sample.record.header.ptr
jpayne@68:     cdef bcf1_t *r = sample.record.ptr
jpayne@68:     cdef int32_t n = r.n_sample
jpayne@68: 
jpayne@68:     if bcf_unpack(r, BCF_UN_ALL) < 0:
jpayne@68:         raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:     if sample.index < 0 or sample.index >= n or not r.n_fmt:
jpayne@68:         return
jpayne@68: 
jpayne@68:     cdef bcf_fmt_t *fmt0 = r.d.fmt
jpayne@68:     cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
jpayne@68: 
jpayne@68:     if not gt0 or not fmt0.n:
jpayne@68:         raise ValueError('Cannot set phased before genotype is set')
jpayne@68: 
jpayne@68:     cdef int8_t  *data8
jpayne@68:     cdef int16_t *data16
jpayne@68:     cdef int32_t *data32
jpayne@68: 
jpayne@68:     if fmt0.type == BCF_BT_INT8:
jpayne@68:         data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data8[i] == bcf_int8_vector_end:
jpayne@68:                 break
jpayne@68:             elif data8[i] == bcf_int8_missing:
jpayne@68:                 continue
jpayne@68:             elif i:
jpayne@68:                 data8[i] = (data8[i] & 0xFE) | phased
jpayne@68:     elif fmt0.type == BCF_BT_INT16:
jpayne@68:         data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data16[i] == bcf_int16_vector_end:
jpayne@68:                 break
jpayne@68:             elif data16[i] == bcf_int16_missing:
jpayne@68:                 continue
jpayne@68:             elif i:
jpayne@68:                 data16[i] = (data16[i] & 0xFFFE) | phased
jpayne@68:     elif fmt0.type == BCF_BT_INT32:
jpayne@68:         data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
jpayne@68:         for i in range(fmt0.n):
jpayne@68:             if data32[i] == bcf_int32_vector_end:
jpayne@68:                 break
jpayne@68:             elif data32[i] == bcf_int32_missing:
jpayne@68:                 continue
jpayne@68:             elif i:
jpayne@68:                 data32[i] = (data32[i] & 0xFFFFFFFE) | phased
jpayne@68: 
jpayne@68: 
jpayne@68: cdef inline bcf_sync_end(VariantRecord record):
jpayne@68:     cdef bcf_hdr_t *hdr = record.header.ptr
jpayne@68:     cdef bcf_info_t *info
jpayne@68:     cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
jpayne@68:     cdef int ref_len
jpayne@68: 
jpayne@68:     # allow missing ref when instantiating a new record
jpayne@68:     if record.ref is not None:
jpayne@68:         ref_len = len(record.ref)
jpayne@68:     else:
jpayne@68:         ref_len = 0
jpayne@68: 
jpayne@68:     # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
jpayne@68:     # Always keep END for symbolic alleles
jpayne@68:     if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len):
jpayne@68:         # If INFO/END is not defined in the header, it doesn't exist in the record
jpayne@68:         if end_id >= 0:
jpayne@68:             info = bcf_get_info(hdr, record.ptr, b'END')
jpayne@68:             if info and info.vptr:
jpayne@68:                 if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
jpayne@68:                     raise ValueError('Unable to delete END')
jpayne@68:     else:
jpayne@68:         # Create END header, if not present
jpayne@68:         if end_id < 0:
jpayne@68:             record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')
jpayne@68: 
jpayne@68:         # Update to reflect stop position
jpayne@68:         bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef inline int has_symbolic_allele(VariantRecord record):
jpayne@68:     """Return index of first symbolic allele. 0 if no symbolic alleles."""
jpayne@68: 
jpayne@68:     for i in range(1, record.ptr.n_allele):
jpayne@68:         alt = record.ptr.d.allele[i]
jpayne@68:         if alt[0] == b'<' and alt[len(alt) - 1] == b'>':
jpayne@68:             return i
jpayne@68: 
jpayne@68:     return 0
jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Variant Header objects
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: 
jpayne@68: cdef bcf_header_remove_hrec(VariantHeader header, int i):
jpayne@68:     if header is None:
jpayne@68:         raise ValueError('header must not be None')
jpayne@68: 
jpayne@68:     cdef bcf_hdr_t *hdr = header.ptr
jpayne@68: 
jpayne@68:     if i < 0 or i >= hdr.nhrec:
jpayne@68:         raise ValueError('Invalid header record index')
jpayne@68: 
jpayne@68:     cdef bcf_hrec_t *hrec = hdr.hrec[i]
jpayne@68:     hdr.nhrec -= 1
jpayne@68: 
jpayne@68:     if i < hdr.nhrec:
jpayne@68:         memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))
jpayne@68: 
jpayne@68:     bcf_hrec_destroy(hrec)
jpayne@68:     hdr.hrec[hdr.nhrec] = NULL
jpayne@68:     hdr.dirty = 1
jpayne@68: 
jpayne@68: 
jpayne@68: #FIXME: implement a full mapping interface
jpayne@68: #FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
jpayne@68: #       object lifetime.
jpayne@68: cdef class VariantHeaderRecord(object):
jpayne@68:     """header record from a :class:`VariantHeader` object"""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def type(self):
jpayne@68:         """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         if not r:
jpayne@68:             return None
jpayne@68:         return METADATA_TYPES[r.type]
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def key(self):
jpayne@68:         """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         return bcf_str_cache_get_charptr(r.key) if r and r.key else None
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def value(self):
jpayne@68:         """header value.  Set only for generic lines, None for FILTER/INFO, etc."""
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         return charptr_to_str(r.value) if r and r.value else None
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def attrs(self):
jpayne@68:         """sequence of additional header attributes"""
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         if not r:
jpayne@68:             return ()
jpayne@68:         cdef int i
jpayne@68:         return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
jpayne@68:                       charptr_to_str(r.vals[i]) if r.vals[i] else None)
jpayne@68:                      for i in range(r.nkeys))
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         return r.nkeys if r else 0
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         return r != NULL and r.nkeys != 0
jpayne@68: 
jpayne@68:     def __getitem__(self, key):
jpayne@68:         """get attribute value"""
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         cdef int i
jpayne@68:         if r:
jpayne@68:             bkey = force_bytes(key)
jpayne@68:             for i in range(r.nkeys):
jpayne@68:                 if r.keys[i] and r.keys[i] == bkey:
jpayne@68:                     return charptr_to_str(r.vals[i]) if r.vals[i] else None
jpayne@68:         raise KeyError('cannot find metadata key')
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         if not r:
jpayne@68:             return
jpayne@68:         cdef int i
jpayne@68:         for i in range(r.nkeys):
jpayne@68:             if r.keys[i]:
jpayne@68:                 yield bcf_str_cache_get_charptr(r.keys[i])
jpayne@68: 
jpayne@68:     def get(self, key, default=None):
jpayne@68:         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
jpayne@68:         try:
jpayne@68:             return self[key]
jpayne@68:         except KeyError:
jpayne@68:             return default
jpayne@68: 
jpayne@68:     def __contains__(self, key):
jpayne@68:         try:
jpayne@68:             self[key]
jpayne@68:         except KeyError:
jpayne@68:             return False
jpayne@68:         else:
jpayne@68:             return True
jpayne@68: 
jpayne@68:     def iterkeys(self):
jpayne@68:         """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68:         return iter(self)
jpayne@68: 
jpayne@68:     def itervalues(self):
jpayne@68:         """D.itervalues() -> an iterator over the values of D"""
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         if not r:
jpayne@68:             return
jpayne@68:         cdef int i
jpayne@68:         for i in range(r.nkeys):
jpayne@68:             if r.keys[i]:
jpayne@68:                 yield charptr_to_str(r.vals[i]) if r.vals[i] else None
jpayne@68: 
jpayne@68:     def iteritems(self):
jpayne@68:         """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         if not r:
jpayne@68:             return
jpayne@68:         cdef int i
jpayne@68:         for i in range(r.nkeys):
jpayne@68:             if r.keys[i]:
jpayne@68:                 yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """D.keys() -> list of D's keys"""
jpayne@68:         return list(self)
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68:         return list(self.iteritems())
jpayne@68: 
jpayne@68:     def values(self):
jpayne@68:         """D.values() -> list of D's values"""
jpayne@68:         return list(self.itervalues())
jpayne@68: 
jpayne@68:     def update(self, items=None, **kwargs):
jpayne@68:         """D.update([E, ]**F) -> None.
jpayne@68: 
jpayne@68:         Update D from dict/iterable E and F.
jpayne@68:         """
jpayne@68:         for k, v in items.items():
jpayne@68:             self[k] = v
jpayne@68: 
jpayne@68:         if kwargs:
jpayne@68:             for k, v in kwargs.items():
jpayne@68:                 self[k] = v
jpayne@68: 
jpayne@68:     def pop(self, key, default=_nothing):
jpayne@68:         try:
jpayne@68:             value = self[key]
jpayne@68:             del self[key]
jpayne@68:             return value
jpayne@68:         except KeyError:
jpayne@68:             if default is not _nothing:
jpayne@68:                 return default
jpayne@68:             raise
jpayne@68: 
jpayne@68:     # Mappings are not hashable by default, but subclasses can change this
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68:     #TODO: implement __richcmp__
jpayne@68: 
jpayne@68:     def __str__(self):
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68: 
jpayne@68:         if not r:
jpayne@68:             raise ValueError('cannot convert deleted record to str')
jpayne@68: 
jpayne@68:         cdef kstring_t hrec_str
jpayne@68:         hrec_str.l = hrec_str.m = 0
jpayne@68:         hrec_str.s = NULL
jpayne@68: 
jpayne@68:         bcf_hrec_format(r, &hrec_str)
jpayne@68: 
jpayne@68:         ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)
jpayne@68: 
jpayne@68:         if hrec_str.m:
jpayne@68:             free(hrec_str.s)
jpayne@68: 
jpayne@68:         return ret
jpayne@68: 
jpayne@68:     # FIXME: Not safe -- causes trivial segfaults at the moment
jpayne@68:     def remove(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef bcf_hrec_t *r = self.ptr
jpayne@68:         if not r:
jpayne@68:             return
jpayne@68:         assert r.key
jpayne@68:         cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
jpayne@68:         bcf_hdr_remove(hdr, r.type, key)
jpayne@68:         self.ptr = NULL
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
jpayne@68:     if not header:
jpayne@68:         raise ValueError('invalid VariantHeader')
jpayne@68: 
jpayne@68:     if not hdr:
jpayne@68:         return None
jpayne@68: 
jpayne@68:     cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
jpayne@68:     record.header = header
jpayne@68:     record.ptr = hdr
jpayne@68: 
jpayne@68:     return record
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantHeaderRecords(object):
jpayne@68:     """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         return self.header.ptr.nhrec
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         return self.header.ptr.nhrec != 0
jpayne@68: 
jpayne@68:     def __getitem__(self, index):
jpayne@68:         cdef int32_t i = index
jpayne@68:         if i < 0 or i >= self.header.ptr.nhrec:
jpayne@68:             raise IndexError('invalid header record index')
jpayne@68:         return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         cdef int32_t i
jpayne@68:         for i in range(self.header.ptr.nhrec):
jpayne@68:             yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
jpayne@68: 
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
jpayne@68:     if not header:
jpayne@68:         raise ValueError('invalid VariantHeader')
jpayne@68: 
jpayne@68:     cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
jpayne@68:     records.header = header
jpayne@68:     return records
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantMetadata(object):
jpayne@68:     """filter, info or format metadata record from a :class:`VariantHeader` object"""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def name(self):
jpayne@68:         """metadata name"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
jpayne@68: 
jpayne@68:     # Q: Should this be exposed?
jpayne@68:     @property
jpayne@68:     def id(self):
jpayne@68:         """metadata internal header id number"""
jpayne@68:         return self.id
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def number(self):
jpayne@68:         """metadata number (i.e. cardinality)"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68: 
jpayne@68:         if not check_header_id(hdr, self.type, self.id):
jpayne@68:             raise ValueError('Invalid header id')
jpayne@68: 
jpayne@68:         if self.type == BCF_HL_FLT:
jpayne@68:             return None
jpayne@68: 
jpayne@68:         cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
jpayne@68:         if l == BCF_VL_FIXED:
jpayne@68:             return bcf_hdr_id2number(hdr, self.type, self.id)
jpayne@68:         elif l == BCF_VL_VAR:
jpayne@68:             return '.'
jpayne@68:         else:
jpayne@68:             return METADATA_LENGTHS[l]
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def type(self):
jpayne@68:         """metadata value type"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         if not check_header_id(hdr, self.type, self.id):
jpayne@68:             raise ValueError('Invalid header id')
jpayne@68: 
jpayne@68:         if self.type == BCF_HL_FLT:
jpayne@68:             return None
jpayne@68:         return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def description(self):
jpayne@68:         """metadata description (or None if not set)"""
jpayne@68:         descr = self.record.get('Description')
jpayne@68:         if descr:
jpayne@68:             descr = descr.strip('"')
jpayne@68:         return force_str(descr)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def record(self):
jpayne@68:         """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         if not check_header_id(hdr, self.type, self.id):
jpayne@68:             raise ValueError('Invalid header id')
jpayne@68:         cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
jpayne@68:         if not hrec:
jpayne@68:             return None
jpayne@68:         return makeVariantHeaderRecord(self.header, hrec)
jpayne@68: 
jpayne@68:     def remove_header(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
jpayne@68:         bcf_hdr_remove(hdr, self.type, key)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
jpayne@68:     if not header:
jpayne@68:         raise ValueError('invalid VariantHeader')
jpayne@68: 
jpayne@68:     if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
jpayne@68:         raise ValueError('invalid metadata type')
jpayne@68: 
jpayne@68:     if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
jpayne@68:         raise ValueError('invalid metadata id')
jpayne@68: 
jpayne@68:     cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
jpayne@68:     meta.header = header
jpayne@68:     meta.type = type
jpayne@68:     meta.id = id
jpayne@68: 
jpayne@68:     return meta
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantHeaderMetadata(object):
jpayne@68:     """mapping from filter, info or format name to :class:`VariantMetadata` object"""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     def add(self, id, number, type, description, **kwargs):
jpayne@68:         """Add a new filter, info or format record"""
jpayne@68:         if id in self:
jpayne@68:             raise ValueError('Header already exists for id={}'.format(id))
jpayne@68: 
jpayne@68:         if self.type == BCF_HL_FLT:
jpayne@68:             if number is not None:
jpayne@68:                 raise ValueError('Number must be None when adding a filter')
jpayne@68:             if type is not None:
jpayne@68:                 raise ValueError('Type must be None when adding a filter')
jpayne@68: 
jpayne@68:             items = [('ID', unquoted_str(id)), ('Description', description)]
jpayne@68:         else:
jpayne@68:             if type not in VALUE_TYPES:
jpayne@68:                 raise ValueError('unknown type specified: {}'.format(type))
jpayne@68:             if number is None:
jpayne@68:                 number = '.'
jpayne@68: 
jpayne@68:             items = [('ID', unquoted_str(id)),
jpayne@68:                      ('Number', unquoted_str(number)),
jpayne@68:                      ('Type', unquoted_str(type)),
jpayne@68:                      ('Description', description)]
jpayne@68: 
jpayne@68:         items += kwargs.items()
jpayne@68:         self.header.add_meta(METADATA_TYPES[self.type], items=items)
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef bcf_idpair_t *idpair
jpayne@68:         cdef int32_t i, n = 0
jpayne@68: 
jpayne@68:         for i in range(hdr.n[BCF_DT_ID]):
jpayne@68:             idpair = hdr.id[BCF_DT_ID] + i
jpayne@68:             if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
jpayne@68:                 n += 1
jpayne@68:         return n
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef bcf_idpair_t *idpair
jpayne@68:         cdef int32_t i
jpayne@68: 
jpayne@68:         for i in range(hdr.n[BCF_DT_ID]):
jpayne@68:             idpair = hdr.id[BCF_DT_ID] + i
jpayne@68:             if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
jpayne@68:                 return True
jpayne@68:         return False
jpayne@68: 
jpayne@68:     def __getitem__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@68: 
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@68: 
jpayne@68:         if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
jpayne@68:             raise KeyError('invalid key: {}'.format(key))
jpayne@68: 
jpayne@68:         return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
jpayne@68: 
jpayne@68:     def remove_header(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@68: 
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@68: 
jpayne@68:         if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
jpayne@68:             raise KeyError('invalid key: {}'.format(key))
jpayne@68: 
jpayne@68:         bcf_hdr_remove(hdr, self.type, bkey)
jpayne@68:         #bcf_hdr_sync(hdr)
jpayne@68: 
jpayne@68:     def clear_header(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         bcf_hdr_remove(hdr, self.type, NULL)
jpayne@68:         #bcf_hdr_sync(hdr)
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef bcf_idpair_t *idpair
jpayne@68:         cdef int32_t i
jpayne@68: 
jpayne@68:         for i in range(hdr.n[BCF_DT_ID]):
jpayne@68:             idpair = hdr.id[BCF_DT_ID] + i
jpayne@68:             if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
jpayne@68:                 yield bcf_str_cache_get_charptr(idpair.key)
jpayne@68: 
jpayne@68:     def get(self, key, default=None):
jpayne@68:         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
jpayne@68:         try:
jpayne@68:             return self[key]
jpayne@68:         except KeyError:
jpayne@68:             return default
jpayne@68: 
jpayne@68:     def __contains__(self, key):
jpayne@68:         try:
jpayne@68:             self[key]
jpayne@68:         except KeyError:
jpayne@68:             return False
jpayne@68:         else:
jpayne@68:             return True
jpayne@68: 
jpayne@68:     def iterkeys(self):
jpayne@68:         """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68:         return iter(self)
jpayne@68: 
jpayne@68:     def itervalues(self):
jpayne@68:         """D.itervalues() -> an iterator over the values of D"""
jpayne@68:         for key in self:
jpayne@68:             yield self[key]
jpayne@68: 
jpayne@68:     def iteritems(self):
jpayne@68:         """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68:         for key in self:
jpayne@68:             yield (key, self[key])
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """D.keys() -> list of D's keys"""
jpayne@68:         return list(self)
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68:         return list(self.iteritems())
jpayne@68: 
jpayne@68:     def values(self):
jpayne@68:         """D.values() -> list of D's values"""
jpayne@68:         return list(self.itervalues())
jpayne@68: 
jpayne@68:     # Mappings are not hashable by default, but subclasses can change this
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68:     #TODO: implement __richcmp__
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
jpayne@68:     if not header:
jpayne@68:         raise ValueError('invalid VariantHeader')
jpayne@68: 
jpayne@68:     cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
jpayne@68:     meta.header = header
jpayne@68:     meta.type = type
jpayne@68: 
jpayne@68:     return meta
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantContig(object):
jpayne@68:     """contig metadata from a :class:`VariantHeader`"""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def name(self):
jpayne@68:         """contig name"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def id(self):
jpayne@68:         """contig internal id number"""
jpayne@68:         return self.id
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def length(self):
jpayne@68:         """contig length or None if not available"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
jpayne@68:         return length if length else None
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def header_record(self):
jpayne@68:         """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
jpayne@68:         return makeVariantHeaderRecord(self.header, hrec)
jpayne@68: 
jpayne@68:     def remove_header(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key
jpayne@68:         bcf_hdr_remove(hdr, BCF_HL_CTG, key)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantContig makeVariantContig(VariantHeader header, int id):
jpayne@68:     if not header:
jpayne@68:         raise ValueError('invalid VariantHeader')
jpayne@68: 
jpayne@68:     if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
jpayne@68:         raise ValueError('invalid contig id')
jpayne@68: 
jpayne@68:     cdef VariantContig contig = VariantContig.__new__(VariantContig)
jpayne@68:     contig.header = header
jpayne@68:     contig.id = id
jpayne@68: 
jpayne@68:     return contig
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantHeaderContigs(object):
jpayne@68:     """mapping from contig name or index to :class:`VariantContig` object."""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
jpayne@68:         return hdr.n[BCF_DT_CTG]
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
jpayne@68:         return hdr.n[BCF_DT_CTG] != 0
jpayne@68: 
jpayne@68:     def __getitem__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int index
jpayne@68: 
jpayne@68:         if isinstance(key, int):
jpayne@68:             index = key
jpayne@68:             if index < 0 or index >= hdr.n[BCF_DT_CTG]:
jpayne@68:                 raise IndexError('invalid contig index')
jpayne@68:             return makeVariantContig(self.header, index)
jpayne@68: 
jpayne@68:         cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@68: 
jpayne@68:         if k == kh_end(d):
jpayne@68:             raise KeyError('invalid contig: {}'.format(key))
jpayne@68: 
jpayne@68:         cdef int id = kh_val_vdict(d, k).id
jpayne@68: 
jpayne@68:         return makeVariantContig(self.header, id)
jpayne@68: 
jpayne@68:     def remove_header(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int index
jpayne@68:         cdef const char *ckey
jpayne@68:         cdef vdict_t *d
jpayne@68:         cdef khiter_t k
jpayne@68: 
jpayne@68:         if isinstance(key, int):
jpayne@68:             index = key
jpayne@68:             if index < 0 or index >= hdr.n[BCF_DT_CTG]:
jpayne@68:                 raise IndexError('invalid contig index')
jpayne@68:             ckey = hdr.id[BCF_DT_CTG][self.id].key
jpayne@68:         else:
jpayne@68:             d = <vdict_t *>hdr.dict[BCF_DT_CTG]
jpayne@68:             key = force_bytes(key)
jpayne@68:             if kh_get_vdict(d, key) == kh_end(d):
jpayne@68:                 raise KeyError('invalid contig: {}'.format(key))
jpayne@68:             ckey = key
jpayne@68: 
jpayne@68:         bcf_hdr_remove(hdr, BCF_HL_CTG, ckey)
jpayne@68: 
jpayne@68:     def clear_header(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
jpayne@68:         #bcf_hdr_sync(hdr)
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
jpayne@68:         cdef uint32_t n = kh_size(d)
jpayne@68: 
jpayne@68:         assert n == hdr.n[BCF_DT_CTG]
jpayne@68: 
jpayne@68:         for i in range(n):
jpayne@68:             yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
jpayne@68: 
jpayne@68:     def get(self, key, default=None):
jpayne@68:         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
jpayne@68:         try:
jpayne@68:             return self[key]
jpayne@68:         except KeyError:
jpayne@68:             return default
jpayne@68: 
jpayne@68:     def __contains__(self, key):
jpayne@68:         try:
jpayne@68:             self[key]
jpayne@68:         except KeyError:
jpayne@68:             return False
jpayne@68:         else:
jpayne@68:             return True
jpayne@68: 
jpayne@68:     def iterkeys(self):
jpayne@68:         """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68:         return iter(self)
jpayne@68: 
jpayne@68:     def itervalues(self):
jpayne@68:         """D.itervalues() -> an iterator over the values of D"""
jpayne@68:         for key in self:
jpayne@68:             yield self[key]
jpayne@68: 
jpayne@68:     def iteritems(self):
jpayne@68:         """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68:         for key in self:
jpayne@68:             yield (key, self[key])
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """D.keys() -> list of D's keys"""
jpayne@68:         return list(self)
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68:         return list(self.iteritems())
jpayne@68: 
jpayne@68:     def values(self):
jpayne@68:         """D.values() -> list of D's values"""
jpayne@68:         return list(self.itervalues())
jpayne@68: 
jpayne@68:     # Mappings are not hashable by default, but subclasses can change this
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68:     #TODO: implement __richcmp__
jpayne@68: 
jpayne@68:     def add(self, id, length=None, **kwargs):
jpayne@68:         """Add a new contig record"""
jpayne@68:         if id in self:
jpayne@68:             raise ValueError('Header already exists for contig {}'.format(id))
jpayne@68: 
jpayne@68:         items = [('ID', unquoted_str(id))]
jpayne@68:         if length is not None:
jpayne@68:             items.append(("length", unquoted_str(length)))
jpayne@68:         items += kwargs.items()
jpayne@68:         self.header.add_meta('contig', items=items)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
jpayne@68:     if not header:
jpayne@68:         raise ValueError('invalid VariantHeader')
jpayne@68: 
jpayne@68:     cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
jpayne@68:     contigs.header = header
jpayne@68: 
jpayne@68:     return contigs
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantHeaderSamples(object):
jpayne@68:     """sequence of sample names from a :class:`VariantHeader` object"""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         return bcf_hdr_nsamples(self.header.ptr)
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         return bcf_hdr_nsamples(self.header.ptr) != 0
jpayne@68: 
jpayne@68:     def __getitem__(self, index):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int32_t n = bcf_hdr_nsamples(hdr)
jpayne@68:         cdef int32_t i = index
jpayne@68: 
jpayne@68:         if i < 0 or i >= n:
jpayne@68:             raise IndexError('invalid sample index')
jpayne@68: 
jpayne@68:         return charptr_to_str(hdr.samples[i])
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int32_t i, n = bcf_hdr_nsamples(hdr)
jpayne@68: 
jpayne@68:         for i in range(n):
jpayne@68:             yield charptr_to_str(hdr.samples[i])
jpayne@68: 
jpayne@68:     def __contains__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         cdef khiter_t k = kh_get_vdict(d, bkey)
jpayne@68: 
jpayne@68:         return k != kh_end(d)
jpayne@68: 
jpayne@68:     # Mappings are not hashable by default, but subclasses can change this
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68:     #TODO: implement __richcmp__
jpayne@68: 
jpayne@68:     def add(self, name):
jpayne@68:         """Add a new sample"""
jpayne@68:         self.header.add_sample(name)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
jpayne@68:     if not header:
jpayne@68:         raise ValueError('invalid VariantHeader')
jpayne@68: 
jpayne@68:     cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
jpayne@68:     samples.header = header
jpayne@68: 
jpayne@68:     return samples
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantHeader(object):
jpayne@68:     """header information for a :class:`VariantFile` object"""
jpayne@68:     #FIXME: Add structured proxy
jpayne@68:     #FIXME: Add generic proxy
jpayne@68:     #FIXME: Add mutable methods
jpayne@68: 
jpayne@68:     # See makeVariantHeader for C constructor
jpayne@68:     def __cinit__(self):
jpayne@68:         self.ptr = NULL
jpayne@68: 
jpayne@68:     # Python constructor
jpayne@68:     def __init__(self):
jpayne@68:         self.ptr = bcf_hdr_init(b'w')
jpayne@68:         if not self.ptr:
jpayne@68:             raise ValueError('cannot create VariantHeader')
jpayne@68: 
jpayne@68:     def __dealloc__(self):
jpayne@68:         if self.ptr:
jpayne@68:             bcf_hdr_destroy(self.ptr)
jpayne@68:             self.ptr = NULL
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         return self.ptr != NULL
jpayne@68: 
jpayne@68:     def copy(self):
jpayne@68:         return makeVariantHeader(bcf_hdr_dup(self.ptr))
jpayne@68: 
jpayne@68:     def merge(self, VariantHeader header):
jpayne@68:         if header is None:
jpayne@68:             raise ValueError('header must not be None')
jpayne@68:         bcf_hdr_merge(self.ptr, header.ptr)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def version(self):
jpayne@68:         """VCF version"""
jpayne@68:         return force_str(bcf_hdr_get_version(self.ptr))
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def samples(self):
jpayne@68:         """samples (:class:`VariantHeaderSamples`)"""
jpayne@68:         return makeVariantHeaderSamples(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def records(self):
jpayne@68:         """header records (:class:`VariantHeaderRecords`)"""
jpayne@68:         return makeVariantHeaderRecords(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def contigs(self):
jpayne@68:         """contig information (:class:`VariantHeaderContigs`)"""
jpayne@68:         return makeVariantHeaderContigs(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def filters(self):
jpayne@68:         """filter metadata (:class:`VariantHeaderMetadata`)"""
jpayne@68:         return makeVariantHeaderMetadata(self, BCF_HL_FLT)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def info(self):
jpayne@68:         """info metadata (:class:`VariantHeaderMetadata`)"""
jpayne@68:         return makeVariantHeaderMetadata(self, BCF_HL_INFO)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def formats(self):
jpayne@68:         """format metadata (:class:`VariantHeaderMetadata`)"""
jpayne@68:         return makeVariantHeaderMetadata(self, BCF_HL_FMT)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def alts(self):
jpayne@68:         """alt metadata (:class:`dict` ID->record).
jpayne@68: 
jpayne@68:         The data returned just a snapshot of alt records, is created
jpayne@68:         every time the property is requested, and modifications will
jpayne@68:         not be reflected in the header metadata and vice versa.
jpayne@68: 
jpayne@68:         i.e. it is just a dict that reflects the state of alt records
jpayne@68:         at the time it is created.
jpayne@68:         """
jpayne@68:         return {record['ID']:record for record in self.records
jpayne@68:                 if record.key.upper() == 'ALT' }
jpayne@68: 
jpayne@68:     # only safe to do when opening an htsfile
jpayne@68:     cdef _subset_samples(self, include_samples):
jpayne@68:         keep_samples    = set(self.samples)
jpayne@68:         include_samples = set(include_samples)
jpayne@68:         missing_samples = include_samples - keep_samples
jpayne@68:         keep_samples   &= include_samples
jpayne@68: 
jpayne@68:         if missing_samples:
jpayne@68:             # FIXME: add specialized exception with payload
jpayne@68:             raise ValueError(
jpayne@68:                 'missing {:d} requested samples'.format(
jpayne@68:                     len(missing_samples)))
jpayne@68: 
jpayne@68:         keep_samples = force_bytes(','.join(keep_samples))
jpayne@68:         cdef char *keep = <char *>keep_samples if keep_samples else NULL
jpayne@68:         cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
jpayne@68: 
jpayne@68:         if ret != 0:
jpayne@68:             raise ValueError(
jpayne@68:                 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
jpayne@68: 
jpayne@68:     def __str__(self):
jpayne@68:         cdef int hlen
jpayne@68:         cdef kstring_t line
jpayne@68:         line.l = line.m = 0
jpayne@68:         line.s = NULL
jpayne@68: 
jpayne@68:         if bcf_hdr_format(self.ptr, 0, &line) < 0:
jpayne@68:             if line.m:
jpayne@68:                 free(line.s)
jpayne@68:             raise ValueError('bcf_hdr_format failed')
jpayne@68: 
jpayne@68:         ret = charptr_to_str_w_len(line.s, line.l)
jpayne@68: 
jpayne@68:         if line.m:
jpayne@68:             free(line.s)
jpayne@68:         return ret
jpayne@68: 
jpayne@68:     def new_record(self, contig=None, start=0, stop=0, alleles=None,
jpayne@68:                          id=None, qual=None, filter=None, info=None, samples=None,
jpayne@68:                          **kwargs):
jpayne@68:         """Create a new empty VariantRecord.
jpayne@68: 
jpayne@68:         Arguments are currently experimental.  Use with caution and expect
jpayne@68:         changes in upcoming releases.
jpayne@68: 
jpayne@68:         """
jpayne@68:         rec = makeVariantRecord(self, bcf_init())
jpayne@68: 
jpayne@68:         if not rec:
jpayne@68:             raise MemoryError('unable to allocate BCF record')
jpayne@68: 
jpayne@68:         rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
jpayne@68: 
jpayne@68:         if contig is not None:
jpayne@68:             rec.contig  = contig
jpayne@68: 
jpayne@68:         rec.start = start
jpayne@68:         rec.stop  = stop
jpayne@68:         rec.id    = id
jpayne@68:         rec.qual  = qual
jpayne@68: 
jpayne@68:         if alleles is not None:
jpayne@68:             rec.alleles = alleles
jpayne@68: 
jpayne@68:         if filter is not None:
jpayne@68:             if isinstance(filter, (list, tuple, VariantRecordFilter)):
jpayne@68:                 for f in filter:
jpayne@68:                     rec.filter.add(f)
jpayne@68:             else:
jpayne@68:                 rec.filter.add(filter)
jpayne@68: 
jpayne@68:         if info:
jpayne@68:             rec.info.update(info)
jpayne@68: 
jpayne@68:         if kwargs:
jpayne@68:             rec.samples[0].update(kwargs)
jpayne@68: 
jpayne@68:         if samples:
jpayne@68:             for i, sample in enumerate(samples):
jpayne@68:                 rec.samples[i].update(sample)
jpayne@68: 
jpayne@68:         return rec
jpayne@68: 
jpayne@68:     def add_record(self, VariantHeaderRecord record):
jpayne@68:         """Add an existing :class:`VariantHeaderRecord` to this header"""
jpayne@68:         if record is None:
jpayne@68:             raise ValueError('record must not be None')
jpayne@68: 
jpayne@68:         cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr)
jpayne@68: 
jpayne@68:         bcf_hdr_add_hrec(self.ptr, hrec)
jpayne@68: 
jpayne@68:         self._hdr_sync()
jpayne@68: 
jpayne@68:     def add_line(self, line):
jpayne@68:         """Add a metadata line to this header"""
jpayne@68:         bline = force_bytes(line)
jpayne@68:         if bcf_hdr_append(self.ptr, bline) < 0:
jpayne@68:             raise ValueError('invalid header line')
jpayne@68: 
jpayne@68:         self._hdr_sync()
jpayne@68: 
jpayne@68: 
jpayne@68:     def add_meta(self, key, value=None, items=None):
jpayne@68:         """Add metadata to this header"""
jpayne@68:         if not ((value is not None) ^ (items is not None)):
jpayne@68:             raise ValueError('either value or items must be specified')
jpayne@68: 
jpayne@68:         cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
jpayne@68:         cdef int quoted
jpayne@68: 
jpayne@68:         try:
jpayne@68:             key = force_bytes(key)
jpayne@68:             hrec.key = strdup(key)
jpayne@68: 
jpayne@68:             if value is not None:
jpayne@68:                 hrec.value = strdup(force_bytes(value))
jpayne@68:             else:
jpayne@68:                 for key, value in items:
jpayne@68:                     quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type")
jpayne@68: 
jpayne@68:                     key = force_bytes(key)
jpayne@68:                     bcf_hrec_add_key(hrec, key, <int>len(key))
jpayne@68: 
jpayne@68:                     value = force_bytes(str(value))
jpayne@68:                     bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
jpayne@68:         except:
jpayne@68:             bcf_hrec_destroy(hrec)
jpayne@68:             raise
jpayne@68: 
jpayne@68:         bcf_hdr_add_hrec(self.ptr, hrec)
jpayne@68: 
jpayne@68:         self._hdr_sync()
jpayne@68: 
jpayne@68:     cdef _add_sample(self, name):
jpayne@68:         bname = force_bytes(name)
jpayne@68:         if bcf_hdr_add_sample(self.ptr, bname) < 0:
jpayne@68:             raise ValueError('Duplicated sample name: {}'.format(name))
jpayne@68: 
jpayne@68:     cdef _hdr_sync(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.ptr
jpayne@68:         if hdr.dirty:
jpayne@68:             if bcf_hdr_sync(hdr) < 0:
jpayne@68:                 raise MemoryError('unable to reallocate VariantHeader')
jpayne@68: 
jpayne@68:     def add_sample(self, name):
jpayne@68:         """Add a new sample to this header"""
jpayne@68:         self._add_sample(name)
jpayne@68:         self._hdr_sync()
jpayne@68: 
jpayne@68:     def add_samples(self, *args):
jpayne@68:         """Add several new samples to this header.
jpayne@68:         This function takes multiple arguments, each of which may
jpayne@68:         be either a sample name or an iterable returning sample names
jpayne@68:         (e.g., a list of sample names).
jpayne@68:         """
jpayne@68:         for arg in args:
jpayne@68:             if isinstance(arg, str):
jpayne@68:                 self._add_sample(arg)
jpayne@68:             else:
jpayne@68:                 for name in arg:
jpayne@68:                     self._add_sample(name)
jpayne@68:         self._hdr_sync()
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
jpayne@68:     if not hdr:
jpayne@68:         raise ValueError('cannot create VariantHeader')
jpayne@68: 
jpayne@68:     cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
jpayne@68:     header.ptr = hdr
jpayne@68: 
jpayne@68:     return header
jpayne@68: 
jpayne@68: 
jpayne@68: cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2:
jpayne@68:     cdef vdict_t *d
jpayne@68:     cdef khiter_t k
jpayne@68:     cdef int info_id
jpayne@68: 
jpayne@68:     if isinstance(key, str):
jpayne@68:         key = force_bytes(key)
jpayne@68: 
jpayne@68:     d = <vdict_t *>hdr.dict[BCF_DT_ID]
jpayne@68:     k = kh_get_vdict(d, key)
jpayne@68: 
jpayne@68:     if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
jpayne@68:         return -1
jpayne@68: 
jpayne@68:     return kh_val_vdict(d, k).id
jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Variant Record objects
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: cdef class VariantRecordFilter(object):
jpayne@68:     """Filters set on a :class:`VariantRecord` object, presented as a mapping from
jpayne@68:        filter index or name to :class:`VariantMetadata` object"""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         return self.record.ptr.d.n_flt
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         return self.record.ptr.d.n_flt != 0
jpayne@68: 
jpayne@68:     def __getitem__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int index, id
jpayne@68:         cdef int n = r.d.n_flt
jpayne@68: 
jpayne@68:         if isinstance(key, int):
jpayne@68:             index = key
jpayne@68: 
jpayne@68:             if index < 0 or index >= n:
jpayne@68:                 raise IndexError('invalid filter index')
jpayne@68: 
jpayne@68:             id = r.d.flt[index]
jpayne@68:         else:
jpayne@68:             if key == '.':
jpayne@68:                 key = 'PASS'
jpayne@68: 
jpayne@68:             bkey = force_bytes(key)
jpayne@68:             id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
jpayne@68: 
jpayne@68:             if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
jpayne@68:                 raise KeyError('Invalid filter: {}'.format(key))
jpayne@68: 
jpayne@68:         return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
jpayne@68: 
jpayne@68:     def add(self, key):
jpayne@68:         """Add a new filter"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int id
jpayne@68: 
jpayne@68:         if key == '.':
jpayne@68:             key = 'PASS'
jpayne@68: 
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
jpayne@68: 
jpayne@68:         if not check_header_id(hdr, BCF_HL_FLT, id):
jpayne@68:             raise KeyError('Invalid filter: {}'.format(key))
jpayne@68: 
jpayne@68:         bcf_add_filter(hdr, r, id)
jpayne@68: 
jpayne@68:     def __delitem__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int index, id
jpayne@68:         cdef int n = r.d.n_flt
jpayne@68: 
jpayne@68:         if isinstance(key, int):
jpayne@68:             index = key
jpayne@68: 
jpayne@68:             if index < 0 or index >= n:
jpayne@68:                 raise IndexError('invalid filter index')
jpayne@68: 
jpayne@68:             id = r.d.flt[index]
jpayne@68:         else:
jpayne@68:             if key == '.':
jpayne@68:                 key = 'PASS'
jpayne@68: 
jpayne@68:             bkey = force_bytes(key)
jpayne@68:             id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
jpayne@68: 
jpayne@68:             if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
jpayne@68:                 raise KeyError('Invalid filter: {}'.format(key))
jpayne@68: 
jpayne@68:         bcf_remove_filter(hdr, r, id, 0)
jpayne@68: 
jpayne@68:     def clear(self):
jpayne@68:         """Clear all filters"""
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         r.d.shared_dirty |= BCF1_DIRTY_FLT
jpayne@68:         r.d.n_flt = 0
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         for i in range(r.d.n_flt):
jpayne@68:             yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
jpayne@68: 
jpayne@68:     def get(self, key, default=None):
jpayne@68:         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
jpayne@68:         try:
jpayne@68:             return self[key]
jpayne@68:         except KeyError:
jpayne@68:             return default
jpayne@68: 
jpayne@68:     def __contains__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         return bcf_has_filter(hdr, r, bkey) == 1
jpayne@68: 
jpayne@68:     def iterkeys(self):
jpayne@68:         """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68:         return iter(self)
jpayne@68: 
jpayne@68:     def itervalues(self):
jpayne@68:         """D.itervalues() -> an iterator over the values of D"""
jpayne@68:         for key in self:
jpayne@68:             yield self[key]
jpayne@68: 
jpayne@68:     def iteritems(self):
jpayne@68:         """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68:         for key in self:
jpayne@68:             yield (key, self[key])
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """D.keys() -> list of D's keys"""
jpayne@68:         return list(self)
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68:         return list(self.iteritems())
jpayne@68: 
jpayne@68:     def values(self):
jpayne@68:         """D.values() -> list of D's values"""
jpayne@68:         return list(self.itervalues())
jpayne@68: 
jpayne@68:     def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op):
jpayne@68:         if op != 2 and op != 3:
jpayne@68:             return NotImplemented
jpayne@68: 
jpayne@68:         cdef bcf1_t *s = self.record.ptr
jpayne@68:         cdef bcf1_t *o = other.record.ptr
jpayne@68: 
jpayne@68:         cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other))
jpayne@68: 
jpayne@68:         if op == 3:
jpayne@68:             cmp = not cmp
jpayne@68: 
jpayne@68:         return cmp
jpayne@68: 
jpayne@68:     # Mappings are not hashable by default, but subclasses can change this
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68:     #TODO: implement __richcmp__
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
jpayne@68:     if not record:
jpayne@68:         raise ValueError('invalid VariantRecord')
jpayne@68: 
jpayne@68:     cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
jpayne@68:     filter.record = record
jpayne@68: 
jpayne@68:     return filter
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantRecordFormat(object):
jpayne@68:     """Format data present for each sample in a :class:`VariantRecord` object,
jpayne@68:        presented as mapping from format name to :class:`VariantMetadata` object."""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int i, n = 0
jpayne@68: 
jpayne@68:         for i in range(r.n_fmt):
jpayne@68:             if r.d.fmt[i].p:
jpayne@68:                 n += 1
jpayne@68:         return n
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         for i in range(r.n_fmt):
jpayne@68:             if r.d.fmt[i].p:
jpayne@68:                 return True
jpayne@68:         return False
jpayne@68: 
jpayne@68:     def __getitem__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68: 
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68: 
jpayne@68:         if not fmt or not fmt.p:
jpayne@68:             raise KeyError('unknown format: {}'.format(key))
jpayne@68: 
jpayne@68:         return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
jpayne@68: 
jpayne@68:     def __delitem__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68: 
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68: 
jpayne@68:         if not fmt or not fmt.p:
jpayne@68:             raise KeyError('unknown format: {}'.format(key))
jpayne@68: 
jpayne@68:         if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
jpayne@68:             raise ValueError('Unable to delete FORMAT')
jpayne@68: 
jpayne@68:     def clear(self):
jpayne@68:         """Clear all formats for all samples within the associated
jpayne@68:            :class:`VariantRecord` instance"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bcf_fmt_t *fmt
jpayne@68:         cdef const char *key
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         for i in reversed(range(r.n_fmt)):
jpayne@68:             fmt = &r.d.fmt[i]
jpayne@68:             if fmt.p:
jpayne@68:                 key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
jpayne@68:                 if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
jpayne@68:                     raise ValueError('Unable to delete FORMAT')
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bcf_fmt_t *fmt
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         for i in range(r.n_fmt):
jpayne@68:             fmt = &r.d.fmt[i]
jpayne@68:             if fmt.p:
jpayne@68:                 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
jpayne@68: 
jpayne@68:     def get(self, key, default=None):
jpayne@68:         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
jpayne@68:         try:
jpayne@68:             return self[key]
jpayne@68:         except KeyError:
jpayne@68:             return default
jpayne@68: 
jpayne@68:     def __contains__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68:         return fmt != NULL and fmt.p != NULL
jpayne@68: 
jpayne@68:     def iterkeys(self):
jpayne@68:         """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68:         return iter(self)
jpayne@68: 
jpayne@68:     def itervalues(self):
jpayne@68:         """D.itervalues() -> an iterator over the values of D"""
jpayne@68:         for key in self:
jpayne@68:             yield self[key]
jpayne@68: 
jpayne@68:     def iteritems(self):
jpayne@68:         """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68:         for key in self:
jpayne@68:             yield (key, self[key])
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """D.keys() -> list of D's keys"""
jpayne@68:         return list(self)
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68:         return list(self.iteritems())
jpayne@68: 
jpayne@68:     def values(self):
jpayne@68:         """D.values() -> list of D's values"""
jpayne@68:         return list(self.itervalues())
jpayne@68: 
jpayne@68:     # Mappings are not hashable by default, but subclasses can change this
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68:     #TODO: implement __richcmp__
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
jpayne@68:     if not record:
jpayne@68:         raise ValueError('invalid VariantRecord')
jpayne@68: 
jpayne@68:     cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
jpayne@68:     format.record = record
jpayne@68: 
jpayne@68:     return format
jpayne@68: 
jpayne@68: 
jpayne@68: #TODO: Add a getmeta method to return the corresponding VariantMetadata?
jpayne@68: cdef class VariantRecordInfo(object):
jpayne@68:     """Info data stored in a :class:`VariantRecord` object, presented as a
jpayne@68:        mapping from info metadata name to value."""
jpayne@68: 
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bcf_info_t *info
jpayne@68:         cdef const char *key
jpayne@68:         cdef int i, count = 0
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         for i in range(r.n_info):
jpayne@68:             info = &r.d.info[i]
jpayne@68:             key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68:             if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
jpayne@68:                 count += 1
jpayne@68: 
jpayne@68:         return count
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bcf_info_t *info
jpayne@68:         cdef const char *key
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         for i in range(r.n_info):
jpayne@68:             info = &r.d.info[i]
jpayne@68:             key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68:             if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
jpayne@68:                 return True
jpayne@68: 
jpayne@68:         return False
jpayne@68: 
jpayne@68:     def __getitem__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68: 
jpayne@68:         if strcmp(bkey, b'END') == 0:
jpayne@68:             raise KeyError('END is a reserved attribute; access is via record.stop')
jpayne@68: 
jpayne@68:         cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68: 
jpayne@68:         # Cannot stop here if info == NULL, since flags must return False
jpayne@68:         cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@68: 
jpayne@68:         if info_id < 0:
jpayne@68:             raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@68: 
jpayne@68:         if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@68:             raise ValueError('Invalid header')
jpayne@68: 
jpayne@68:         # Handle type=Flag values
jpayne@68:         if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
jpayne@68:             return info != NULL and info.vptr != NULL
jpayne@68: 
jpayne@68:         if not info or not info.vptr:
jpayne@68:             raise KeyError('Invalid INFO field: {}'.format(key))
jpayne@68: 
jpayne@68:         return bcf_info_get_value(self.record, info)
jpayne@68: 
jpayne@68:     def __setitem__(self, key, value):
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68: 
jpayne@68:         if strcmp(bkey, b'END') == 0:
jpayne@68:             raise KeyError('END is a reserved attribute; access is via record.stop')
jpayne@68: 
jpayne@68:         if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         bcf_info_set_value(self.record, key, value)
jpayne@68: 
jpayne@68:     def __delitem__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68: 
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         if strcmp(bkey, b'END') == 0:
jpayne@68:             raise KeyError('END is a reserved attribute; access is via record.stop')
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68: 
jpayne@68:         # Cannot stop here if info == NULL, since flags must return False
jpayne@68:         cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@68: 
jpayne@68:         if info_id < 0:
jpayne@68:             raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@68: 
jpayne@68:         if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@68:             raise ValueError('Invalid header')
jpayne@68: 
jpayne@68:         # Handle flags
jpayne@68:         if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
jpayne@68:             return
jpayne@68: 
jpayne@68:         if not info or not info.vptr:
jpayne@68:             raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@68: 
jpayne@68:         if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
jpayne@68:             raise ValueError('Unable to delete INFO')
jpayne@68: 
jpayne@68:     def clear(self):
jpayne@68:         """Clear all info data"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bcf_info_t *info
jpayne@68:         cdef const char *key
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         for i in range(r.n_info):
jpayne@68:             info = &r.d.info[i]
jpayne@68:             if info and info.vptr:
jpayne@68:                 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68:                 if strcmp(key, b'END') == 0:
jpayne@68:                     continue
jpayne@68:                 if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
jpayne@68:                     raise ValueError('Unable to delete INFO')
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bcf_info_t *info
jpayne@68:         cdef const char *key
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         for i in range(r.n_info):
jpayne@68:             info = &r.d.info[i]
jpayne@68:             if info and info.vptr:
jpayne@68:                 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68:                 if strcmp(key, b'END') != 0:
jpayne@68:                     yield bcf_str_cache_get_charptr(key)
jpayne@68: 
jpayne@68:     def get(self, key, default=None):
jpayne@68:         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68: 
jpayne@68:         if strcmp(bkey, b'END') == 0:
jpayne@68:             return default
jpayne@68: 
jpayne@68:         cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68: 
jpayne@68:         # Cannot stop here if info == NULL, since flags must return False
jpayne@68:         cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@68: 
jpayne@68:         if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@68:             raise ValueError('Invalid header')
jpayne@68: 
jpayne@68:         # Handle flags
jpayne@68:         if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
jpayne@68:             return info != NULL and info.vptr != NULL
jpayne@68: 
jpayne@68:         if not info or not info.vptr:
jpayne@68:             return default
jpayne@68: 
jpayne@68:         return bcf_info_get_value(self.record, info)
jpayne@68: 
jpayne@68:     def __contains__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68: 
jpayne@68:         if strcmp(bkey, b'END') == 0:
jpayne@68:             return False
jpayne@68: 
jpayne@68:         cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68: 
jpayne@68:         return info != NULL and info.vptr != NULL
jpayne@68: 
jpayne@68:     def iterkeys(self):
jpayne@68:         """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68:         return iter(self)
jpayne@68: 
jpayne@68:     def itervalues(self):
jpayne@68:         """D.itervalues() -> an iterator over the values of D"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bcf_info_t *info
jpayne@68:         cdef const char *key
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         for i in range(r.n_info):
jpayne@68:             info = &r.d.info[i]
jpayne@68:             if info and info.vptr:
jpayne@68:                 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68:                 if strcmp(key, b'END') != 0:
jpayne@68:                     yield bcf_info_get_value(self.record, info)
jpayne@68: 
jpayne@68:     def iteritems(self):
jpayne@68:         """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bcf_info_t *info
jpayne@68:         cdef const char *key
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         for i in range(r.n_info):
jpayne@68:             info = &r.d.info[i]
jpayne@68:             if info and info.vptr:
jpayne@68:                 key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
jpayne@68:                 if strcmp(key, b'END') != 0:
jpayne@68:                     value = bcf_info_get_value(self.record, info)
jpayne@68:                     yield bcf_str_cache_get_charptr(key), value
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """D.keys() -> list of D's keys"""
jpayne@68:         return list(self)
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68:         return list(self.iteritems())
jpayne@68: 
jpayne@68:     def values(self):
jpayne@68:         """D.values() -> list of D's values"""
jpayne@68:         return list(self.itervalues())
jpayne@68: 
jpayne@68:     def update(self, items=None, **kwargs):
jpayne@68:         """D.update([E, ]**F) -> None.
jpayne@68: 
jpayne@68:         Update D from dict/iterable E and F.
jpayne@68:         """
jpayne@68:         for k, v in items.items():
jpayne@68:             if k != 'END':
jpayne@68:                 self[k] = v
jpayne@68: 
jpayne@68:         if kwargs:
jpayne@68:             kwargs.pop('END', None)
jpayne@68:             for k, v in kwargs.items():
jpayne@68:                 self[k] = v
jpayne@68: 
jpayne@68:     def pop(self, key, default=_nothing):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
jpayne@68: 
jpayne@68:         # Cannot stop here if info == NULL, since flags must return False
jpayne@68:         cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
jpayne@68: 
jpayne@68:         if info_id < 0:
jpayne@68:             if default is _nothing:
jpayne@68:                 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@68:             return default
jpayne@68: 
jpayne@68:         if not check_header_id(hdr, BCF_HL_INFO, info_id):
jpayne@68:             raise ValueError('Invalid header')
jpayne@68: 
jpayne@68:         # Handle flags
jpayne@68:         if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
jpayne@68:             return
jpayne@68: 
jpayne@68:         if not info or not info.vptr:
jpayne@68:             if default is _nothing:
jpayne@68:                 raise KeyError('Unknown INFO field: {}'.format(key))
jpayne@68:             return default
jpayne@68: 
jpayne@68:         value = bcf_info_get_value(self.record, info)
jpayne@68: 
jpayne@68:         if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
jpayne@68:             raise ValueError('Unable to delete INFO')
jpayne@68: 
jpayne@68:         return value
jpayne@68: 
jpayne@68:     def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op):
jpayne@68:         if op != 2 and op != 3:
jpayne@68:             return NotImplemented
jpayne@68: 
jpayne@68:         cdef bcf1_t *s = self.record.ptr
jpayne@68:         cdef bcf1_t *o = other.record.ptr
jpayne@68: 
jpayne@68:         # Cannot use n_info as shortcut logic, since null values may remain
jpayne@68:         cdef bint cmp = dict(self) == dict(other)
jpayne@68: 
jpayne@68:         if op == 3:
jpayne@68:             cmp = not cmp
jpayne@68: 
jpayne@68:         return cmp
jpayne@68: 
jpayne@68:     # Mappings are not hashable by default, but subclasses can change this
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
jpayne@68:     if not record:
jpayne@68:         raise ValueError('invalid VariantRecord')
jpayne@68: 
jpayne@68:     cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
jpayne@68:     info.record = record
jpayne@68: 
jpayne@68:     return info
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantRecordSamples(object):
jpayne@68:     """mapping from sample index or name to :class:`VariantRecordSample` object."""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         return self.record.ptr.n_sample  # bcf_hdr_nsamples(self.record.header.ptr)
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         return self.record.ptr.n_sample != 0  # bcf_hdr_nsamples(self.record.header.ptr) != 0
jpayne@68: 
jpayne@68:     def __getitem__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int n = self.record.ptr.n_sample
jpayne@68:         cdef int sample_index
jpayne@68:         cdef vdict_t *d
jpayne@68:         cdef khiter_t k
jpayne@68: 
jpayne@68:         if isinstance(key, int):
jpayne@68:             sample_index = key
jpayne@68:         else:
jpayne@68:             bkey = force_bytes(key)
jpayne@68:             sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
jpayne@68:             if sample_index < 0:
jpayne@68:                 raise KeyError('invalid sample name: {}'.format(key))
jpayne@68: 
jpayne@68:         if sample_index < 0 or sample_index >= n:
jpayne@68:             raise IndexError('invalid sample index')
jpayne@68: 
jpayne@68:         return makeVariantRecordSample(self.record, sample_index)
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int32_t i, n = self.record.ptr.n_sample
jpayne@68: 
jpayne@68:         for i in range(n):
jpayne@68:             yield charptr_to_str(hdr.samples[i])
jpayne@68: 
jpayne@68:     def get(self, key, default=None):
jpayne@68:         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
jpayne@68:         try:
jpayne@68:             return self[key]
jpayne@68:         except KeyError:
jpayne@68:             return default
jpayne@68: 
jpayne@68:     def __contains__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int n = self.record.ptr.n_sample
jpayne@68:         cdef int sample_index
jpayne@68:         cdef vdict_t *d
jpayne@68:         cdef khiter_t k
jpayne@68: 
jpayne@68:         if isinstance(key, int):
jpayne@68:             sample_index = key
jpayne@68:         else:
jpayne@68:             bkey = force_bytes(key)
jpayne@68:             sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
jpayne@68:             if sample_index < 0:
jpayne@68:                 raise KeyError('invalid sample name: {}'.format(key))
jpayne@68: 
jpayne@68:         return 0 <= sample_index < n
jpayne@68: 
jpayne@68:     def iterkeys(self):
jpayne@68:         """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68:         return iter(self)
jpayne@68: 
jpayne@68:     def itervalues(self):
jpayne@68:         """D.itervalues() -> an iterator over the values of D"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int32_t i, n = self.record.ptr.n_sample
jpayne@68: 
jpayne@68:         for i in range(n):
jpayne@68:             yield makeVariantRecordSample(self.record, i)
jpayne@68: 
jpayne@68:     def iteritems(self):
jpayne@68:         """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int32_t i, n = self.record.ptr.n_sample
jpayne@68: 
jpayne@68:         for i in range(n):
jpayne@68:             yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """D.keys() -> list of D's keys"""
jpayne@68:         return list(self)
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68:         return list(self.iteritems())
jpayne@68: 
jpayne@68:     def values(self):
jpayne@68:         """D.values() -> list of D's values"""
jpayne@68:         return list(self.itervalues())
jpayne@68: 
jpayne@68:     def update(self, items=None, **kwargs):
jpayne@68:         """D.update([E, ]**F) -> None.
jpayne@68: 
jpayne@68:         Update D from dict/iterable E and F.
jpayne@68:         """
jpayne@68:         for k, v in items.items():
jpayne@68:             self[k] = v
jpayne@68: 
jpayne@68:         if kwargs:
jpayne@68:             for k, v in kwargs.items():
jpayne@68:                 self[k] = v
jpayne@68: 
jpayne@68:     def pop(self, key, default=_nothing):
jpayne@68:         try:
jpayne@68:             value = self[key]
jpayne@68:             del self[key]
jpayne@68:             return value
jpayne@68:         except KeyError:
jpayne@68:             if default is not _nothing:
jpayne@68:                 return default
jpayne@68:             raise
jpayne@68: 
jpayne@68:     def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op):
jpayne@68:         if op != 2 and op != 3:
jpayne@68:             return NotImplemented
jpayne@68: 
jpayne@68:         cdef bcf1_t *s = self.record.ptr
jpayne@68:         cdef bcf1_t *o = other.record.ptr
jpayne@68: 
jpayne@68:         cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values())
jpayne@68: 
jpayne@68:         if op == 3:
jpayne@68:             cmp = not cmp
jpayne@68: 
jpayne@68:         return cmp
jpayne@68: 
jpayne@68:     # Mappings are not hashable by default, but subclasses can change this
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
jpayne@68:     if not record:
jpayne@68:         raise ValueError('invalid VariantRecord')
jpayne@68: 
jpayne@68:     cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
jpayne@68:         VariantRecordSamples)
jpayne@68:     samples.record = record
jpayne@68: 
jpayne@68:     return samples
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantRecord(object):
jpayne@68:     """Variant record"""
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     def __dealloc__(self):
jpayne@68:         if self.ptr:
jpayne@68:             bcf_destroy1(self.ptr)
jpayne@68:             self.ptr = NULL
jpayne@68: 
jpayne@68:     def copy(self):
jpayne@68:         """return a copy of this VariantRecord object"""
jpayne@68:         return makeVariantRecord(self.header, bcf_dup(self.ptr))
jpayne@68: 
jpayne@68:     def translate(self, VariantHeader dst_header):
jpayne@68:         if dst_header is None:
jpayne@68:             raise ValueError('dst_header must not be None')
jpayne@68: 
jpayne@68:         cdef bcf_hdr_t *src_hdr = self.header.ptr
jpayne@68:         cdef bcf_hdr_t *dst_hdr = dst_header.ptr
jpayne@68: 
jpayne@68:         if src_hdr != dst_hdr:
jpayne@68:             if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
jpayne@68:                 msg = 'Cannot translate record.  Number of samples does not match header ({} vs {})'
jpayne@68:                 raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
jpayne@68: 
jpayne@68:             bcf_translate(dst_hdr, src_hdr, self.ptr)
jpayne@68:             self.header = dst_header
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def rid(self):
jpayne@68:         """internal reference id number"""
jpayne@68:         return self.ptr.rid
jpayne@68: 
jpayne@68:     @rid.setter
jpayne@68:     def rid(self, value):
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int r = value
jpayne@68:         if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
jpayne@68:             raise ValueError('invalid reference id')
jpayne@68:         self.ptr.rid = r
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def chrom(self):
jpayne@68:         """chromosome/contig name"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int rid = self.ptr.rid
jpayne@68:         if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
jpayne@68:             raise ValueError('Invalid header')
jpayne@68:         return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
jpayne@68: 
jpayne@68:     @chrom.setter
jpayne@68:     def chrom(self, value):
jpayne@68:         cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
jpayne@68:         bchrom = force_bytes(value)
jpayne@68:         cdef khint_t k = kh_get_vdict(d, bchrom)
jpayne@68:         if k == kh_end(d):
jpayne@68:             raise ValueError('Invalid chromosome/contig')
jpayne@68:         self.ptr.rid = kh_val_vdict(d, k).id
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def contig(self):
jpayne@68:         """chromosome/contig name"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int rid = self.ptr.rid
jpayne@68:         if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
jpayne@68:             raise ValueError('Invalid header')
jpayne@68:         return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
jpayne@68: 
jpayne@68:     @contig.setter
jpayne@68:     def contig(self, value):
jpayne@68:         cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
jpayne@68:         bchrom = force_bytes(value)
jpayne@68:         cdef khint_t k = kh_get_vdict(d, bchrom)
jpayne@68:         if k == kh_end(d):
jpayne@68:             raise ValueError('Invalid chromosome/contig')
jpayne@68:         self.ptr.rid = kh_val_vdict(d, k).id
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def pos(self):
jpayne@68:         """record start position on chrom/contig (1-based inclusive)"""
jpayne@68:         return self.ptr.pos + 1
jpayne@68: 
jpayne@68:     @pos.setter
jpayne@68:     def pos(self, value):
jpayne@68:         cdef int p = value
jpayne@68:         if p < 1:
jpayne@68:             raise ValueError('Position must be positive')
jpayne@68:         self.ptr.pos = p - 1
jpayne@68:         bcf_sync_end(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def start(self):
jpayne@68:         """record start position on chrom/contig (0-based inclusive)"""
jpayne@68:         return self.ptr.pos
jpayne@68: 
jpayne@68:     @start.setter
jpayne@68:     def start(self, value):
jpayne@68:         cdef int s = value
jpayne@68:         if s < 0:
jpayne@68:             raise ValueError('Start coordinate must be non-negative')
jpayne@68:         self.ptr.pos = s
jpayne@68:         bcf_sync_end(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def stop(self):
jpayne@68:         """record stop position on chrom/contig (0-based exclusive)"""
jpayne@68:         return self.ptr.pos + self.ptr.rlen
jpayne@68: 
jpayne@68:     @stop.setter
jpayne@68:     def stop(self, value):
jpayne@68:         cdef int s = value
jpayne@68:         if s < 0:
jpayne@68:             raise ValueError('Stop coordinate must be non-negative')
jpayne@68:         self.ptr.rlen = s - self.ptr.pos
jpayne@68:         bcf_sync_end(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def rlen(self):
jpayne@68:         """record length on chrom/contig (aka rec.stop - rec.start)"""
jpayne@68:         return self.ptr.rlen
jpayne@68: 
jpayne@68:     @rlen.setter
jpayne@68:     def rlen(self, value):
jpayne@68:         cdef int r = value
jpayne@68:         self.ptr.rlen = r
jpayne@68:         bcf_sync_end(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def qual(self):
jpayne@68:         """phred scaled quality score or None if not available"""
jpayne@68:         return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
jpayne@68: 
jpayne@68:     @qual.setter
jpayne@68:     def qual(self, value):
jpayne@68:         if value is not None:
jpayne@68:             self.ptr.qual = value
jpayne@68:         else:
jpayne@68:             bcf_float_set(&self.ptr.qual, bcf_float_missing)
jpayne@68: 
jpayne@68: 
jpayne@68: #   @property
jpayne@68: #   def n_allele(self):
jpayne@68: #       return self.ptr.n_allele
jpayne@68: 
jpayne@68: #   @property
jpayne@68: #   def n_sample(self):
jpayne@68: #       return self.ptr.n_sample
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def id(self):
jpayne@68:         """record identifier or None if not available"""
jpayne@68:         cdef bcf1_t *r = self.ptr
jpayne@68:         if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
jpayne@68:         # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
jpayne@68:         if (r.d.m_id == 0):
jpayne@68:             raise ValueError('Error extracting ID')
jpayne@68:         return charptr_to_str(r.d.id) if r.d.id != b'.' else None
jpayne@68: 
jpayne@68:     @id.setter
jpayne@68:     def id(self, value):
jpayne@68:         cdef bcf1_t *r = self.ptr
jpayne@68:         if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         cdef char *idstr = NULL
jpayne@68:         if value is not None:
jpayne@68:             bid = force_bytes(value)
jpayne@68:             idstr = bid
jpayne@68:         if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
jpayne@68:             raise ValueError('Error updating id')
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def ref(self):
jpayne@68:         """reference allele"""
jpayne@68:         cdef bcf1_t *r = self.ptr
jpayne@68:         if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         return charptr_to_str(r.d.allele[0]) if r.d.allele else None
jpayne@68: 
jpayne@68:     @ref.setter
jpayne@68:     def ref(self, value):
jpayne@68:         cdef bcf1_t *r = self.ptr
jpayne@68:         if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         #FIXME: Set alleles directly -- this is stupid
jpayne@68:         if not value:
jpayne@68:             raise ValueError('ref allele must not be null')
jpayne@68:         value = force_bytes(value)
jpayne@68:         if r.d.allele and r.n_allele:
jpayne@68:             alleles = [r.d.allele[i] for i in range(r.n_allele)]
jpayne@68:             alleles[0] = value
jpayne@68:         else:
jpayne@68:             alleles = [value, '<NON_REF>']
jpayne@68:         self.alleles = alleles
jpayne@68:         bcf_sync_end(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def alleles(self):
jpayne@68:         """tuple of reference allele followed by alt alleles"""
jpayne@68:         cdef bcf1_t *r = self.ptr
jpayne@68:         if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         if not r.d.allele:
jpayne@68:             return None
jpayne@68:         cdef tuple res = PyTuple_New(r.n_allele)
jpayne@68:         for i in range(r.n_allele):
jpayne@68:             a = charptr_to_str(r.d.allele[i])
jpayne@68:             PyTuple_SET_ITEM(res, i, a)
jpayne@68:             Py_INCREF(a)
jpayne@68:         return res
jpayne@68: 
jpayne@68:     @alleles.setter
jpayne@68:     def alleles(self, values):
jpayne@68:         cdef bcf1_t *r = self.ptr
jpayne@68: 
jpayne@68:         # Cache rlen of symbolic alleles before call to bcf_update_alleles_str
jpayne@68:         cdef int rlen = r.rlen
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         values = [force_bytes(v) for v in values]
jpayne@68: 
jpayne@68:         if len(values) < 2:
jpayne@68:             raise ValueError('must set at least 2 alleles')
jpayne@68: 
jpayne@68:         if b'' in values:
jpayne@68:             raise ValueError('cannot set null allele')
jpayne@68: 
jpayne@68:         value = b','.join(values)
jpayne@68: 
jpayne@68:         if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
jpayne@68:             raise ValueError('Error updating alleles')
jpayne@68: 
jpayne@68:         # Reset rlen if alternate allele isn't symbolic, otherwise used cached
jpayne@68:         if has_symbolic_allele(self):
jpayne@68:             self.ptr.rlen = rlen
jpayne@68:         else:
jpayne@68:             self.ptr.rlen = len(values[0])
jpayne@68:         r.d.var_type = -1
jpayne@68:         bcf_sync_end(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def alts(self):
jpayne@68:         """tuple of alt alleles"""
jpayne@68:         cdef bcf1_t *r = self.ptr
jpayne@68:         if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         if r.n_allele < 2 or not r.d.allele:
jpayne@68:             return None
jpayne@68:         cdef tuple res = PyTuple_New(r.n_allele - 1)
jpayne@68:         for i in range(1, r.n_allele):
jpayne@68:             a = charptr_to_str(r.d.allele[i])
jpayne@68:             PyTuple_SET_ITEM(res, i - 1, a)
jpayne@68:             Py_INCREF(a)
jpayne@68:         return res
jpayne@68: 
jpayne@68:     @alts.setter
jpayne@68:     def alts(self, value):
jpayne@68:         #FIXME: Set alleles directly -- this is stupid
jpayne@68:         cdef bcf1_t *r = self.ptr
jpayne@68:         if bcf_unpack(r, BCF_UN_STR) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         value = [force_bytes(v) for v in value]
jpayne@68:         if b'' in value:
jpayne@68:             raise ValueError('cannot set null alt allele')
jpayne@68:         ref  = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
jpayne@68:         self.alleles = ref + value
jpayne@68:         r.d.var_type = -1
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def filter(self):
jpayne@68:         """filter information (see :class:`VariantRecordFilter`)"""
jpayne@68:         if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         return makeVariantRecordFilter(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def info(self):
jpayne@68:         """info data (see :class:`VariantRecordInfo`)"""
jpayne@68:         if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         return makeVariantRecordInfo(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def format(self):
jpayne@68:         """sample format metadata (see :class:`VariantRecordFormat`)"""
jpayne@68:         if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         return makeVariantRecordFormat(self)
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def samples(self):
jpayne@68:         """sample data (see :class:`VariantRecordSamples`)"""
jpayne@68:         if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68:         return makeVariantRecordSamples(self)
jpayne@68: 
jpayne@68:     property alleles_variant_types:
jpayne@68:         def __get__(self):
jpayne@68:             cdef bcf1_t *r = self.ptr
jpayne@68:             cdef tuple result = PyTuple_New(r.n_allele)
jpayne@68: 
jpayne@68:             for i in range(r.n_allele):
jpayne@68:                 tp = bcf_get_variant_type(r, i)
jpayne@68: 
jpayne@68:                 if tp == VCF_REF:
jpayne@68:                     v_type = "REF"
jpayne@68:                 elif tp == VCF_SNP:
jpayne@68:                     v_type = "SNP"
jpayne@68:                 elif tp == VCF_MNP:
jpayne@68:                     v_type = "MNP"
jpayne@68:                 elif tp == VCF_INDEL:
jpayne@68:                     v_type = "INDEL"
jpayne@68:                 elif tp == VCF_BND:
jpayne@68:                     v_type = "BND"
jpayne@68:                 elif tp == VCF_OVERLAP:
jpayne@68:                     v_type = "OVERLAP"
jpayne@68:                 else:
jpayne@68:                     v_type = "OTHER"
jpayne@68: 
jpayne@68:                 PyTuple_SET_ITEM(result, i, v_type)
jpayne@68:                 Py_INCREF(v_type)
jpayne@68: 
jpayne@68:             return result
jpayne@68: 
jpayne@68:     def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op):
jpayne@68:         if op != 2 and op != 3:
jpayne@68:             return NotImplemented
jpayne@68: 
jpayne@68:         cdef bcf1_t *s = self.ptr
jpayne@68:         cdef bcf1_t *o = other.ptr
jpayne@68: 
jpayne@68:         cdef bint cmp = self is other or (
jpayne@68:                              s.pos        == o.pos
jpayne@68:                         and  s.rlen       == o.rlen
jpayne@68:                         and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual))
jpayne@68:                           or s.qual       == o.qual)
jpayne@68:                         and  s.n_sample   == o.n_sample
jpayne@68:                         and  s.n_allele   == o.n_allele
jpayne@68:                         and  self.contig  == other.contig
jpayne@68:                         and  self.alleles == other.alleles
jpayne@68:                         and  self.id      == other.id
jpayne@68:                         and  self.info    == other.info
jpayne@68:                         and  self.filter  == other.filter
jpayne@68:                         and  self.samples == other.samples)
jpayne@68: 
jpayne@68:         if op == 3:
jpayne@68:             cmp = not cmp
jpayne@68: 
jpayne@68:         return cmp
jpayne@68: 
jpayne@68:     def __str__(self):
jpayne@68:         cdef kstring_t line
jpayne@68:         cdef char c
jpayne@68: 
jpayne@68:         line.l = line.m = 0
jpayne@68:         line.s = NULL
jpayne@68: 
jpayne@68:         if vcf_format(self.header.ptr, self.ptr, &line) < 0:
jpayne@68:             if line.m:
jpayne@68:                 free(line.s)
jpayne@68:             raise ValueError('vcf_format failed')
jpayne@68: 
jpayne@68:         # Strip CR/LF?
jpayne@68:         #while line.l:
jpayne@68:         #    c = line.s[line.l - 1]
jpayne@68:         #    if c != b'\n' and c != b'\r':
jpayne@68:         #        break
jpayne@68:         #    line.l -= 1
jpayne@68: 
jpayne@68:         ret = charptr_to_str_w_len(line.s, line.l)
jpayne@68: 
jpayne@68:         if line.m:
jpayne@68:             free(line.s)
jpayne@68: 
jpayne@68:         return ret
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
jpayne@68:     if not header:
jpayne@68:         raise ValueError('invalid VariantHeader')
jpayne@68: 
jpayne@68:     if not r:
jpayne@68:         raise ValueError('cannot create VariantRecord')
jpayne@68: 
jpayne@68:     if r.errcode:
jpayne@68:         msg = []
jpayne@68:         #if r.errcode & BCF_ERR_CTG_UNDEF:
jpayne@68:         #    msg.append('undefined contig')
jpayne@68:         #if r.errcode & BCF_ERR_TAG_UNDEF:
jpayne@68:         #    msg.append('undefined tag')
jpayne@68:         if r.errcode & BCF_ERR_NCOLS:
jpayne@68:             msg.append('invalid number of columns')
jpayne@68:         if r.errcode & BCF_ERR_LIMITS:
jpayne@68:             msg.append('limits violated')
jpayne@68:         if r.errcode & BCF_ERR_CHAR:
jpayne@68:             msg.append('invalid character found')
jpayne@68:         if r.errcode & BCF_ERR_CTG_INVALID:
jpayne@68:             msg.append('invalid contig')
jpayne@68:         if r.errcode & BCF_ERR_TAG_INVALID:
jpayne@68:             msg.append('invalid tag')
jpayne@68: 
jpayne@68:         if msg:
jpayne@68:             msg = ', '.join(msg)
jpayne@68:             raise ValueError('Error(s) reading record: {}'.format(msg))
jpayne@68: 
jpayne@68:     cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
jpayne@68:     record.header = header
jpayne@68:     record.ptr = r
jpayne@68: 
jpayne@68:     return record
jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Variant Sampletype object
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantRecordSample(object):
jpayne@68:     """Data for a single sample from a :class:`VariantRecord` object.
jpayne@68:        Provides data accessors for genotypes and a mapping interface
jpayne@68:        from format name to values.
jpayne@68:     """
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         raise TypeError('this class cannot be instantiated from Python')
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def name(self):
jpayne@68:         """sample name"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int32_t n = r.n_sample
jpayne@68: 
jpayne@68:         if self.index < 0 or self.index >= n:
jpayne@68:             raise ValueError('invalid sample index')
jpayne@68: 
jpayne@68:         return charptr_to_str(hdr.samples[self.index])
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def allele_indices(self):
jpayne@68:         """allele indices for called genotype, if present.  Otherwise None"""
jpayne@68:         return bcf_format_get_allele_indices(self)
jpayne@68: 
jpayne@68:     @allele_indices.setter
jpayne@68:     def allele_indices(self, value):
jpayne@68:         self['GT'] = value
jpayne@68: 
jpayne@68:     @allele_indices.deleter
jpayne@68:     def allele_indices(self):
jpayne@68:         self['GT'] = ()
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def alleles(self):
jpayne@68:         """alleles for called genotype, if present.  Otherwise None"""
jpayne@68:         return bcf_format_get_alleles(self)
jpayne@68: 
jpayne@68:     @alleles.setter
jpayne@68:     def alleles(self, value):
jpayne@68:         # Sets the genotype, supply a tuple of alleles to set.
jpayne@68:         # The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord
jpayne@68:         # The genotype is reset when an empty tuple, None or (None,) is supplied
jpayne@68: 
jpayne@68:         if value==(None,) or value==tuple() or value is None:
jpayne@68:             self['GT'] = ()
jpayne@68:             return
jpayne@68: 
jpayne@68:         if any((type(x) == int for x in value)):
jpayne@68:             raise ValueError('Use .allele_indices to set integer allele indices')
jpayne@68: 
jpayne@68:         # determine and set allele indices:    
jpayne@68:         try:
jpayne@68:             self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) )
jpayne@68:         except ValueError:
jpayne@68:             raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord."
jpayne@68:                              "First set the .alleles of this record to define the alleles")
jpayne@68: 
jpayne@68:     @alleles.deleter
jpayne@68:     def alleles(self):
jpayne@68:         self['GT'] = ()
jpayne@68: 
jpayne@68:     @property
jpayne@68:     def phased(self):
jpayne@68:         """False if genotype is missing or any allele is unphased.  Otherwise True."""
jpayne@68:         return bcf_sample_get_phased(self)
jpayne@68: 
jpayne@68:     @phased.setter
jpayne@68:     def phased(self, value):
jpayne@68:         bcf_sample_set_phased(self, value)
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int i, n = 0
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_FMT) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         for i in range(r.n_fmt):
jpayne@68:             if r.d.fmt[i].p:
jpayne@68:                 n += 1
jpayne@68:         return n
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         if bcf_unpack(r, BCF_UN_FMT) < 0:
jpayne@68:             raise ValueError('Error unpacking VariantRecord')
jpayne@68: 
jpayne@68:         for i in range(r.n_fmt):
jpayne@68:             if r.d.fmt[i].p:
jpayne@68:                 return True
jpayne@68:         return False
jpayne@68: 
jpayne@68:     def __getitem__(self, key):
jpayne@68:         return bcf_format_get_value(self, key)
jpayne@68: 
jpayne@68:     def __setitem__(self, key, value):
jpayne@68:         bcf_format_set_value(self, key, value)
jpayne@68: 
jpayne@68:     def __delitem__(self, key):
jpayne@68:         bcf_format_del_value(self, key)
jpayne@68: 
jpayne@68:     def clear(self):
jpayne@68:         """Clear all format data (including genotype) for this sample"""
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bcf_fmt_t *fmt
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         for i in range(r.n_fmt):
jpayne@68:             fmt = &r.d.fmt[i]
jpayne@68:             if fmt.p:
jpayne@68:                 bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bcf_fmt_t *fmt
jpayne@68:         cdef int i
jpayne@68: 
jpayne@68:         for i in range(r.n_fmt):
jpayne@68:             fmt = &r.d.fmt[i]
jpayne@68:             if r.d.fmt[i].p:
jpayne@68:                 yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
jpayne@68: 
jpayne@68:     def get(self, key, default=None):
jpayne@68:         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
jpayne@68:         try:
jpayne@68:             return self[key]
jpayne@68:         except KeyError:
jpayne@68:             return default
jpayne@68: 
jpayne@68:     def __contains__(self, key):
jpayne@68:         cdef bcf_hdr_t *hdr = self.record.header.ptr
jpayne@68:         cdef bcf1_t *r = self.record.ptr
jpayne@68:         cdef bytes bkey = force_bytes(key)
jpayne@68:         cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
jpayne@68:         return fmt != NULL and fmt.p != NULL
jpayne@68: 
jpayne@68:     def iterkeys(self):
jpayne@68:         """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68:         return iter(self)
jpayne@68: 
jpayne@68:     def itervalues(self):
jpayne@68:         """D.itervalues() -> an iterator over the values of D"""
jpayne@68:         for key in self:
jpayne@68:             yield self[key]
jpayne@68: 
jpayne@68:     def iteritems(self):
jpayne@68:         """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68:         for key in self:
jpayne@68:             yield (key, self[key])
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """D.keys() -> list of D's keys"""
jpayne@68:         return list(self)
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68:         return list(self.iteritems())
jpayne@68: 
jpayne@68:     def values(self):
jpayne@68:         """D.values() -> list of D's values"""
jpayne@68:         return list(self.itervalues())
jpayne@68: 
jpayne@68:     def update(self, items=None, **kwargs):
jpayne@68:         """D.update([E, ]**F) -> None.
jpayne@68: 
jpayne@68:         Update D from dict/iterable E and F.
jpayne@68:         """
jpayne@68:         for k, v in items.items():
jpayne@68:             self[k] = v
jpayne@68: 
jpayne@68:         if kwargs:
jpayne@68:             for k, v in kwargs.items():
jpayne@68:                 self[k] = v
jpayne@68: 
jpayne@68:     def pop(self, key, default=_nothing):
jpayne@68:         try:
jpayne@68:             value = self[key]
jpayne@68:             del self[key]
jpayne@68:             return value
jpayne@68:         except KeyError:
jpayne@68:             if default is not _nothing:
jpayne@68:                 return default
jpayne@68:             raise
jpayne@68: 
jpayne@68:     def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op):
jpayne@68:         if op != 2 and op != 3:
jpayne@68:             return NotImplemented
jpayne@68: 
jpayne@68:         cdef bint cmp = dict(self) == dict(other)
jpayne@68: 
jpayne@68:         if op == 3:
jpayne@68:             cmp = not cmp
jpayne@68: 
jpayne@68:         return cmp
jpayne@68: 
jpayne@68:     # Mappings are not hashable by default, but subclasses can change this
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68: 
jpayne@68: cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
jpayne@68:     if not record or sample_index < 0:
jpayne@68:         raise ValueError('cannot create VariantRecordSample')
jpayne@68: 
jpayne@68:     cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
jpayne@68:     sample.record = record
jpayne@68:     sample.index = sample_index
jpayne@68: 
jpayne@68:     return sample
jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Index objects
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class BaseIndex(object):
jpayne@68:     def __init__(self):
jpayne@68:         self.refs = ()
jpayne@68:         self.remap = {}
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         return len(self.refs)
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         return len(self.refs) != 0
jpayne@68: 
jpayne@68:     def __getitem__(self, key):
jpayne@68:         if isinstance(key, int):
jpayne@68:             return self.refs[key]
jpayne@68:         else:
jpayne@68:             return self.refmap[key]
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         return iter(self.refs)
jpayne@68: 
jpayne@68:     def get(self, key, default=None):
jpayne@68:         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
jpayne@68:         try:
jpayne@68:             return self[key]
jpayne@68:         except KeyError:
jpayne@68:             return default
jpayne@68: 
jpayne@68:     def __contains__(self, key):
jpayne@68:         try:
jpayne@68:             self[key]
jpayne@68:         except KeyError:
jpayne@68:             return False
jpayne@68:         else:
jpayne@68:             return True
jpayne@68: 
jpayne@68:     def iterkeys(self):
jpayne@68:         """D.iterkeys() -> an iterator over the keys of D"""
jpayne@68:         return iter(self)
jpayne@68: 
jpayne@68:     def itervalues(self):
jpayne@68:         """D.itervalues() -> an iterator over the values of D"""
jpayne@68:         for key in self:
jpayne@68:             yield self[key]
jpayne@68: 
jpayne@68:     def iteritems(self):
jpayne@68:         """D.iteritems() -> an iterator over the (key, value) items of D"""
jpayne@68:         for key in self:
jpayne@68:             yield (key, self[key])
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """D.keys() -> list of D's keys"""
jpayne@68:         return list(self)
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
jpayne@68:         return list(self.iteritems())
jpayne@68: 
jpayne@68:     def values(self):
jpayne@68:         """D.values() -> list of D's values"""
jpayne@68:         return list(self.itervalues())
jpayne@68: 
jpayne@68:     def update(self, items=None, **kwargs):
jpayne@68:         """D.update([E, ]**F) -> None.
jpayne@68: 
jpayne@68:         Update D from dict/iterable E and F.
jpayne@68:         """
jpayne@68:         for k, v in items.items():
jpayne@68:             self[k] = v
jpayne@68: 
jpayne@68:         if kwargs:
jpayne@68:             for k, v in kwargs.items():
jpayne@68:                 self[k] = v
jpayne@68: 
jpayne@68:     def pop(self, key, default=_nothing):
jpayne@68:         try:
jpayne@68:             value = self[key]
jpayne@68:             del self[key]
jpayne@68:             return value
jpayne@68:         except KeyError:
jpayne@68:             if default is not _nothing:
jpayne@68:                 return default
jpayne@68:             raise
jpayne@68: 
jpayne@68:     # Mappings are not hashable by default, but subclasses can change this
jpayne@68:     __hash__ = None
jpayne@68: 
jpayne@68:     #TODO: implement __richcmp__
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class BCFIndex(object):
jpayne@68:     """CSI index data structure for BCF files"""
jpayne@68:     def __init__(self):
jpayne@68:         self.refs = ()
jpayne@68:         self.refmap = {}
jpayne@68: 
jpayne@68:         if not self.ptr:
jpayne@68:             raise ValueError('Invalid index object')
jpayne@68: 
jpayne@68:         cdef int n
jpayne@68:         cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
jpayne@68: 
jpayne@68:         self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
jpayne@68:         self.refmap = { r:i for i,r in enumerate(self.refs) }
jpayne@68: 
jpayne@68:     def __dealloc__(self):
jpayne@68:         if self.ptr:
jpayne@68:             hts_idx_destroy(self.ptr)
jpayne@68:             self.ptr = NULL
jpayne@68: 
jpayne@68:     def fetch(self, bcf, contig, start, stop, reopen):
jpayne@68:         return BCFIterator(bcf, contig, start, stop, reopen)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
jpayne@68:     if not idx:
jpayne@68:         return None
jpayne@68: 
jpayne@68:     if not header:
jpayne@68:         raise ValueError('invalid VariantHeader')
jpayne@68: 
jpayne@68:     cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
jpayne@68:     index.header = header
jpayne@68:     index.ptr = idx
jpayne@68:     index.__init__()
jpayne@68: 
jpayne@68:     return index
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class TabixIndex(BaseIndex):
jpayne@68:     """Tabix index data structure for VCF files"""
jpayne@68:     def __init__(self):
jpayne@68:         self.refs = ()
jpayne@68:         self.refmap = {}
jpayne@68: 
jpayne@68:         if not self.ptr:
jpayne@68:             raise ValueError('Invalid index object')
jpayne@68: 
jpayne@68:         cdef int n
jpayne@68:         cdef const char **refs = tbx_seqnames(self.ptr, &n)
jpayne@68: 
jpayne@68:         self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
jpayne@68:         self.refmap = { r:i for i,r in enumerate(self.refs) }
jpayne@68: 
jpayne@68:     def __dealloc__(self):
jpayne@68:         if self.ptr:
jpayne@68:             tbx_destroy(self.ptr)
jpayne@68:             self.ptr = NULL
jpayne@68: 
jpayne@68:     def fetch(self, bcf, contig, start, stop, reopen):
jpayne@68:         return TabixIterator(bcf, contig, start, stop, reopen)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef TabixIndex makeTabixIndex(tbx_t *idx):
jpayne@68:     if not idx:
jpayne@68:         return None
jpayne@68: 
jpayne@68:     cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
jpayne@68:     index.ptr = idx
jpayne@68:     index.__init__()
jpayne@68: 
jpayne@68:     return index
jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Iterators
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class BaseIterator(object):
jpayne@68:     pass
jpayne@68: 
jpayne@68: 
jpayne@68: # Internal function to clean up after iteration stop or failure.
jpayne@68: # This would be a nested function if it weren't a cdef function.
jpayne@68: cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
jpayne@68:     bcf_destroy1(record)
jpayne@68: 
jpayne@68:     # destroy iter so future calls to __next__ raise StopIteration
jpayne@68:     bcf_itr_destroy(self.iter)
jpayne@68:     self.iter = NULL
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class BCFIterator(BaseIterator):
jpayne@68:     def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
jpayne@68:         if bcf is None:
jpayne@68:             raise ValueError('bcf must not be None')
jpayne@68: 
jpayne@68:         if contig is None:
jpayne@68:             raise ValueError('contig must be specified')
jpayne@68: 
jpayne@68:         if not isinstance(bcf.index, BCFIndex):
jpayne@68:             raise ValueError('bcf index required')
jpayne@68: 
jpayne@68:         cdef BCFIndex index = bcf.index
jpayne@68: 
jpayne@68:         self.bcf = bcf
jpayne@68:         self.index = index
jpayne@68: 
jpayne@68:         cdef int rid, cstart, cstop
jpayne@68: 
jpayne@68:         try:
jpayne@68:             rid = index.refmap[contig]
jpayne@68:         except KeyError:
jpayne@68:             # A query for a non-existent contig yields an empty iterator, does not raise an error
jpayne@68:             self.iter = NULL
jpayne@68:             return
jpayne@68: 
jpayne@68:         if reopen:
jpayne@68:             self.bcf = self.bcf.copy()
jpayne@68: 
jpayne@68:         cstart = start if start is not None else 0
jpayne@68:         cstop  = stop  if stop  is not None else MAX_POS
jpayne@68: 
jpayne@68:         with nogil:
jpayne@68:             self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
jpayne@68: 
jpayne@68:         if not self.iter:
jpayne@68:             if errno:
jpayne@68:                 raise IOError(errno, strerror(errno))
jpayne@68:             else:
jpayne@68:                 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
jpayne@68: 
jpayne@68:     def __dealloc__(self):
jpayne@68:         if self.iter:
jpayne@68:             bcf_itr_destroy(self.iter)
jpayne@68:             self.iter = NULL
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         return self
jpayne@68: 
jpayne@68:     def __next__(self):
jpayne@68:         if not self.iter:
jpayne@68:             raise StopIteration
jpayne@68: 
jpayne@68:         cdef bcf1_t *record = bcf_init1()
jpayne@68: 
jpayne@68:         if not record:
jpayne@68:             raise MemoryError('unable to allocate BCF record')
jpayne@68: 
jpayne@68:         record.pos = -1
jpayne@68:         if self.bcf.drop_samples:
jpayne@68:             record.max_unpack = BCF_UN_SHR
jpayne@68: 
jpayne@68:         cdef int ret
jpayne@68: 
jpayne@68:         with nogil:
jpayne@68:             ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             _stop_BCFIterator(self, record)
jpayne@68:             if ret == -1:
jpayne@68:                 raise StopIteration
jpayne@68:             elif ret == -2:
jpayne@68:                 raise IOError('truncated file')
jpayne@68:             elif errno:
jpayne@68:                 raise IOError(errno, strerror(errno))
jpayne@68:             else:
jpayne@68:                 raise IOError('unable to fetch next record')
jpayne@68: 
jpayne@68:         ret = bcf_subset_format(self.bcf.header.ptr, record)
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             _stop_BCFIterator(self, record)
jpayne@68:             raise ValueError('error in bcf_subset_format')
jpayne@68: 
jpayne@68:         return makeVariantRecord(self.bcf.header, record)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class TabixIterator(BaseIterator):
jpayne@68:     def __cinit__(self, *args, **kwargs):
jpayne@68:         self.line_buffer.l = 0
jpayne@68:         self.line_buffer.m = 0
jpayne@68:         self.line_buffer.s = NULL
jpayne@68: 
jpayne@68:     def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
jpayne@68:         if bcf is None:
jpayne@68:             raise ValueError('bcf must not be None')
jpayne@68: 
jpayne@68:         if not isinstance(bcf.index, TabixIndex):
jpayne@68:             raise ValueError('tabix index required')
jpayne@68: 
jpayne@68:         cdef TabixIndex index = bcf.index
jpayne@68: 
jpayne@68:         self.bcf = bcf
jpayne@68:         self.index = index
jpayne@68: 
jpayne@68:         cdef int rid, cstart, cstop
jpayne@68: 
jpayne@68:         try:
jpayne@68:             rid = index.refmap[contig]
jpayne@68:         except KeyError:
jpayne@68:             # A query for a non-existent contig yields an empty iterator, does not raise an error
jpayne@68:             self.iter = NULL
jpayne@68:             return
jpayne@68: 
jpayne@68:         if reopen:
jpayne@68:             self.bcf = self.bcf.copy()
jpayne@68: 
jpayne@68:         cstart = start if start is not None else 0
jpayne@68:         cstop  = stop  if stop  is not None else MAX_POS
jpayne@68: 
jpayne@68:         self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
jpayne@68: 
jpayne@68:         if not self.iter:
jpayne@68:             if errno:
jpayne@68:                 raise IOError(errno, strerror(errno))
jpayne@68:             else:
jpayne@68:                 raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
jpayne@68: 
jpayne@68:     def __dealloc__(self):
jpayne@68:         if self.iter:
jpayne@68:             tbx_itr_destroy(self.iter)
jpayne@68:             self.iter = NULL
jpayne@68: 
jpayne@68:         if self.line_buffer.m:
jpayne@68:             free(self.line_buffer.s)
jpayne@68: 
jpayne@68:         self.line_buffer.l = 0
jpayne@68:         self.line_buffer.m = 0
jpayne@68:         self.line_buffer.s = NULL
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         return self
jpayne@68: 
jpayne@68:     def __next__(self):
jpayne@68:         if not self.iter:
jpayne@68:             raise StopIteration
jpayne@68: 
jpayne@68:         cdef int ret
jpayne@68: 
jpayne@68:         with nogil:
jpayne@68:             ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             tbx_itr_destroy(self.iter)
jpayne@68:             self.iter = NULL
jpayne@68:             if ret == -1:
jpayne@68:                 raise StopIteration
jpayne@68:             elif ret == -2:
jpayne@68:                 raise IOError('truncated file')
jpayne@68:             elif errno:
jpayne@68:                 raise IOError(errno, strerror(errno))
jpayne@68:             else:
jpayne@68:                 raise IOError('unable to fetch next record')
jpayne@68: 
jpayne@68:         cdef bcf1_t *record = bcf_init1()
jpayne@68: 
jpayne@68:         if not record:
jpayne@68:             raise MemoryError('unable to allocate BCF record')
jpayne@68: 
jpayne@68:         record.pos = -1
jpayne@68:         if self.bcf.drop_samples:
jpayne@68:             record.max_unpack = BCF_UN_SHR
jpayne@68: 
jpayne@68:         ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
jpayne@68: 
jpayne@68:         # FIXME: stop iteration on parse failure?
jpayne@68:         if ret < 0:
jpayne@68:             bcf_destroy1(record)
jpayne@68:             raise ValueError('error in vcf_parse')
jpayne@68: 
jpayne@68:         return makeVariantRecord(self.bcf.header, record)
jpayne@68: 
jpayne@68: 
jpayne@68: ########################################################################
jpayne@68: ########################################################################
jpayne@68: ## Variant File
jpayne@68: ########################################################################
jpayne@68: 
jpayne@68: 
jpayne@68: cdef class VariantFile(HTSFile):
jpayne@68:     """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
jpayne@68:     duplicate_filehandle=True, ignore_truncation=False, threads=1)*
jpayne@68: 
jpayne@68:     A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
jpayne@68:     opened.
jpayne@68: 
jpayne@68:     If an index for a variant file exists (.csi or .tbi), it will be
jpayne@68:     opened automatically.  Without an index random access to records
jpayne@68:     via :meth:`fetch` is disabled.
jpayne@68: 
jpayne@68:     For writing, a :class:`VariantHeader` object must be provided,
jpayne@68:     typically obtained from another :term:`VCF` file/:term:`BCF`
jpayne@68:     file.
jpayne@68: 
jpayne@68:     Parameters
jpayne@68:     ----------
jpayne@68:     mode : string
jpayne@68:         *mode* should be ``r`` for reading or ``w`` for writing. The default is
jpayne@68:         text mode (:term:`VCF`).  For binary (:term:`BCF`) I/O you should append
jpayne@68:         ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
jpayne@68: 
jpayne@68:         If ``b`` is present, it must immediately follow ``r`` or ``w``.  Valid
jpayne@68:         modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
jpayne@68:         For instance, to open a :term:`BCF` formatted file for reading, type::
jpayne@68: 
jpayne@68:             f = pysam.VariantFile('ex1.bcf','r')
jpayne@68: 
jpayne@68:         If mode is not specified, we will try to auto-detect the file type.  All
jpayne@68:         of the following should work::
jpayne@68: 
jpayne@68:             f1 = pysam.VariantFile('ex1.bcf')
jpayne@68:             f2 = pysam.VariantFile('ex1.vcf')
jpayne@68:             f3 = pysam.VariantFile('ex1.vcf.gz')
jpayne@68: 
jpayne@68:     index_filename : string
jpayne@68:         Explicit path to an index file.
jpayne@68: 
jpayne@68:     header : VariantHeader
jpayne@68:         :class:`VariantHeader` object required for writing.
jpayne@68: 
jpayne@68:     drop_samples: bool
jpayne@68:         Ignore sample information when reading.
jpayne@68: 
jpayne@68:     duplicate_filehandle: bool
jpayne@68:         By default, file handles passed either directly or through
jpayne@68:         File-like objects will be duplicated before passing them to
jpayne@68:         htslib. The duplication prevents issues where the same stream
jpayne@68:         will be closed by htslib and through destruction of the
jpayne@68:         high-level python object. Set to False to turn off
jpayne@68:         duplication.
jpayne@68: 
jpayne@68:     ignore_truncation: bool
jpayne@68:         Issue a warning, instead of raising an error if the current file
jpayne@68:         appears to be truncated due to a missing EOF marker.  Only applies
jpayne@68:         to bgzipped formats. (Default=False)
jpayne@68: 
jpayne@68:     threads: integer
jpayne@68:         Number of threads to use for compressing/decompressing VCF/BCF files.
jpayne@68:         Setting threads to > 1 cannot be combined with `ignore_truncation`.
jpayne@68:         (Default=1)
jpayne@68: 
jpayne@68:     """
jpayne@68:     def __cinit__(self, *args, **kwargs):
jpayne@68:         self.htsfile = NULL
jpayne@68: 
jpayne@68:     def __init__(self, *args, **kwargs):
jpayne@68:         self.header         = None
jpayne@68:         self.index          = None
jpayne@68:         self.filename       = None
jpayne@68:         self.mode           = None
jpayne@68:         self.threads        = 1
jpayne@68:         self.index_filename = None
jpayne@68:         self.is_stream      = False
jpayne@68:         self.is_remote      = False
jpayne@68:         self.is_reading     = False
jpayne@68:         self.drop_samples   = False
jpayne@68:         self.header_written = False
jpayne@68:         self.start_offset   = -1
jpayne@68: 
jpayne@68:         self.open(*args, **kwargs)
jpayne@68: 
jpayne@68:     def __dealloc__(self):
jpayne@68:         if not self.htsfile or not self.header:
jpayne@68:             return
jpayne@68: 
jpayne@68:         # Write header if no records were written
jpayne@68:         if self.htsfile.is_write and not self.header_written:
jpayne@68:             with nogil:
jpayne@68:                 bcf_hdr_write(self.htsfile, self.header.ptr)
jpayne@68: 
jpayne@68:         cdef int ret = hts_close(self.htsfile)
jpayne@68:         self.htsfile = NULL
jpayne@68:         self.header = self.index = None
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             global errno
jpayne@68:             if errno == EPIPE:
jpayne@68:                 errno = 0
jpayne@68:             else:
jpayne@68:                 raise IOError(errno, force_str(strerror(errno)))
jpayne@68: 
jpayne@68:     def close(self):
jpayne@68:         """closes the :class:`pysam.VariantFile`."""
jpayne@68:         if not self.htsfile:
jpayne@68:             return
jpayne@68: 
jpayne@68:         # Write header if no records were written
jpayne@68:         if self.htsfile.is_write and not self.header_written:
jpayne@68:             with nogil:
jpayne@68:                 bcf_hdr_write(self.htsfile, self.header.ptr)
jpayne@68: 
jpayne@68:         cdef int ret = hts_close(self.htsfile)
jpayne@68:         self.htsfile = NULL
jpayne@68:         self.header = self.index = None
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             global errno
jpayne@68:             if errno == EPIPE:
jpayne@68:                 errno = 0
jpayne@68:             else:
jpayne@68:                 raise IOError(errno, force_str(strerror(errno)))
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         if self.htsfile.is_write:
jpayne@68:             raise ValueError('cannot iterate over Variantfile opened for writing')
jpayne@68: 
jpayne@68:         self.is_reading = 1
jpayne@68:         return self
jpayne@68: 
jpayne@68:     def __next__(self):
jpayne@68:         cdef int ret
jpayne@68:         cdef int errcode
jpayne@68:         cdef bcf1_t *record = bcf_init1()
jpayne@68: 
jpayne@68:         if not record:
jpayne@68:             raise MemoryError('unable to allocate BCF record')
jpayne@68: 
jpayne@68:         record.pos = -1
jpayne@68:         if self.drop_samples:
jpayne@68:             record.max_unpack = BCF_UN_SHR
jpayne@68: 
jpayne@68:         with nogil:
jpayne@68:             ret = bcf_read1(self.htsfile, self.header.ptr, record)
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             errcode = record.errcode
jpayne@68:             bcf_destroy1(record)
jpayne@68:             if errcode:
jpayne@68:                 raise IOError('unable to parse next record')
jpayne@68:             if ret == -1:
jpayne@68:                 raise StopIteration
jpayne@68:             elif ret == -2:
jpayne@68:                 raise IOError('truncated file')
jpayne@68:             elif errno:
jpayne@68:                 raise IOError(errno, strerror(errno))
jpayne@68:             else:
jpayne@68:                 raise IOError('unable to fetch next record')
jpayne@68: 
jpayne@68:         return makeVariantRecord(self.header, record)
jpayne@68: 
jpayne@68:     def copy(self):
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError
jpayne@68: 
jpayne@68:         cdef VariantFile vars = VariantFile.__new__(VariantFile)
jpayne@68:         cdef bcf_hdr_t *hdr
jpayne@68: 
jpayne@68:         # FIXME: re-open using fd or else header and index could be invalid
jpayne@68:         vars.htsfile = self._open_htsfile()
jpayne@68: 
jpayne@68:         if not vars.htsfile:
jpayne@68:             raise ValueError('Cannot re-open htsfile')
jpayne@68: 
jpayne@68:         # minimize overhead by re-using header and index.  This approach is
jpayne@68:         # currently risky, but see above for how this can be mitigated.
jpayne@68:         vars.header         = self.header
jpayne@68:         vars.index          = self.index
jpayne@68: 
jpayne@68:         vars.filename       = self.filename
jpayne@68:         vars.mode           = self.mode
jpayne@68:         vars.threads        = self.threads
jpayne@68:         vars.index_filename = self.index_filename
jpayne@68:         vars.drop_samples   = self.drop_samples
jpayne@68:         vars.is_stream      = self.is_stream
jpayne@68:         vars.is_remote      = self.is_remote
jpayne@68:         vars.is_reading     = self.is_reading
jpayne@68:         vars.start_offset   = self.start_offset
jpayne@68:         vars.header_written = self.header_written
jpayne@68: 
jpayne@68:         if self.htsfile.is_bin:
jpayne@68:             vars.seek(self.tell())
jpayne@68:         else:
jpayne@68:             with nogil:
jpayne@68:                 hdr = bcf_hdr_read(vars.htsfile)
jpayne@68:             makeVariantHeader(hdr)
jpayne@68: 
jpayne@68:         return vars
jpayne@68: 
jpayne@68:     def open(self, filename, mode='r',
jpayne@68:              index_filename=None,
jpayne@68:              VariantHeader header=None,
jpayne@68:              drop_samples=False,
jpayne@68:              duplicate_filehandle=True,
jpayne@68:              ignore_truncation=False,
jpayne@68:              threads=1):
jpayne@68:         """open a vcf/bcf file.
jpayne@68: 
jpayne@68:         If open is called on an existing VariantFile, the current file will be
jpayne@68:         closed and a new file will be opened.
jpayne@68:         """
jpayne@68:         cdef bcf_hdr_t *hdr
jpayne@68:         cdef BGZF *bgzfp
jpayne@68:         cdef hts_idx_t *idx
jpayne@68:         cdef tbx_t *tidx
jpayne@68:         cdef char *cfilename
jpayne@68:         cdef char *cindex_filename = NULL
jpayne@68:         cdef char *cmode
jpayne@68: 
jpayne@68:         if threads > 1 and ignore_truncation:
jpayne@68:             # This won't raise errors if reaching a truncated alignment,
jpayne@68:             # because bgzf_mt_reader in htslib does not deal with
jpayne@68:             # bgzf_mt_read_block returning non-zero values, contrary
jpayne@68:             # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
jpayne@68:             # Better to avoid this (for now) than to produce seemingly correct results.
jpayne@68:             raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
jpayne@68:         self.threads = threads
jpayne@68: 
jpayne@68:         # close a previously opened file
jpayne@68:         if self.is_open:
jpayne@68:             self.close()
jpayne@68: 
jpayne@68:         if not mode or mode[0] not in 'rwa':
jpayne@68:             raise ValueError('mode must begin with r, w or a')
jpayne@68: 
jpayne@68:         self.duplicate_filehandle = duplicate_filehandle
jpayne@68: 
jpayne@68:         format_modes = [m for m in mode[1:] if m in 'bcguz']
jpayne@68:         if len(format_modes) > 1:
jpayne@68:             raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
jpayne@68: 
jpayne@68:         invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
jpayne@68:         if invalid_modes:
jpayne@68:             raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
jpayne@68: 
jpayne@68:         # Autodetect mode from filename
jpayne@68:         if mode == 'w' and isinstance(filename, str):
jpayne@68:             if filename.endswith('.gz'):
jpayne@68:                 mode = 'wz'
jpayne@68:             elif filename.endswith('.bcf'):
jpayne@68:                 mode = 'wb'
jpayne@68: 
jpayne@68:         # for htslib, wbu seems to not work
jpayne@68:         if mode == 'wbu':
jpayne@68:             mode = 'wb0'
jpayne@68: 
jpayne@68:         self.mode = mode = force_bytes(mode)
jpayne@68:         try:
jpayne@68:             filename = encode_filename(filename)
jpayne@68:             self.is_remote = hisremote(filename)
jpayne@68:             self.is_stream = filename == b'-'
jpayne@68:         except TypeError:
jpayne@68:             filename = filename
jpayne@68:             self.is_remote = False
jpayne@68:             self.is_stream = True
jpayne@68: 
jpayne@68:         self.filename = filename
jpayne@68: 
jpayne@68:         if index_filename is not None:
jpayne@68:             self.index_filename = index_filename = encode_filename(index_filename)
jpayne@68:         else:
jpayne@68:             self.index_filename = None
jpayne@68: 
jpayne@68:         self.drop_samples = bool(drop_samples)
jpayne@68:         self.header = None
jpayne@68: 
jpayne@68:         self.header_written = False
jpayne@68: 
jpayne@68:         if mode.startswith(b'w'):
jpayne@68:             # open file for writing
jpayne@68:             if index_filename is not None:
jpayne@68:                 raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
jpayne@68: 
jpayne@68:             # header structure (used for writing)
jpayne@68:             if header:
jpayne@68:                 self.header = header.copy()
jpayne@68:             else:
jpayne@68:                 self.header = VariantHeader()
jpayne@68:                 #raise ValueError('a VariantHeader must be specified')
jpayne@68: 
jpayne@68:             # Header is not written until the first write or on close
jpayne@68:             self.htsfile = self._open_htsfile()
jpayne@68: 
jpayne@68:             if not self.htsfile:
jpayne@68:                 raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
jpayne@68: 
jpayne@68:         elif mode.startswith(b'r'):
jpayne@68:             # open file for reading
jpayne@68:             self.htsfile = self._open_htsfile()
jpayne@68: 
jpayne@68:             if not self.htsfile:
jpayne@68:                 if errno:
jpayne@68:                     raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno))))
jpayne@68:                 else:
jpayne@68:                     raise ValueError('could not open variant file `{}`'.format(filename))
jpayne@68: 
jpayne@68:             if self.htsfile.format.format not in (bcf, vcf):
jpayne@68:                 raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
jpayne@68: 
jpayne@68:             self.check_truncation(ignore_truncation)
jpayne@68: 
jpayne@68:             with nogil:
jpayne@68:                 hdr = bcf_hdr_read(self.htsfile)
jpayne@68: 
jpayne@68:             try:
jpayne@68:                 self.header = makeVariantHeader(hdr)
jpayne@68:             except ValueError:
jpayne@68:                 raise ValueError('file `{}` does not have valid header (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
jpayne@68: 
jpayne@68:             if isinstance(self.filename, bytes):
jpayne@68:                 cfilename = self.filename
jpayne@68:             else:
jpayne@68:                 cfilename = NULL
jpayne@68: 
jpayne@68:             # check for index and open if present
jpayne@68:             if self.htsfile.format.format == bcf and cfilename:
jpayne@68:                 if index_filename is not None:
jpayne@68:                     cindex_filename = index_filename
jpayne@68:                 with nogil:
jpayne@68:                     idx = bcf_index_load2(cfilename, cindex_filename)
jpayne@68:                 self.index = makeBCFIndex(self.header, idx)
jpayne@68: 
jpayne@68:             elif self.htsfile.format.compression == bgzf and cfilename:
jpayne@68:                 if index_filename is not None:
jpayne@68:                     cindex_filename = index_filename
jpayne@68:                 with nogil:
jpayne@68:                     tidx = tbx_index_load2(cfilename, cindex_filename)
jpayne@68:                 self.index = makeTabixIndex(tidx)
jpayne@68: 
jpayne@68:             if not self.is_stream:
jpayne@68:                 self.start_offset = self.tell()
jpayne@68:         else:
jpayne@68:             raise ValueError('unknown mode {}'.format(mode))
jpayne@68: 
jpayne@68:     def reset(self):
jpayne@68:         """reset file position to beginning of file just after the header."""
jpayne@68:         return self.seek(self.start_offset)
jpayne@68: 
jpayne@68:     def is_valid_tid(self, tid):
jpayne@68:         """
jpayne@68:         return True if the numerical :term:`tid` is valid; False otherwise.
jpayne@68: 
jpayne@68:         returns -1 if reference is not known.
jpayne@68:         """
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int rid = tid
jpayne@68:         return 0 <= rid < hdr.n[BCF_DT_CTG]
jpayne@68: 
jpayne@68:     def get_tid(self, reference):
jpayne@68:         """
jpayne@68:         return the numerical :term:`tid` corresponding to
jpayne@68:         :term:`reference`
jpayne@68: 
jpayne@68:         returns -1 if reference is not known.
jpayne@68:         """
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
jpayne@68:         reference = force_bytes(reference)
jpayne@68:         cdef khint_t k = kh_get_vdict(d, reference)
jpayne@68:         return kh_val_vdict(d, k).id if k != kh_end(d) else -1
jpayne@68: 
jpayne@68:     def get_reference_name(self, tid):
jpayne@68:         """
jpayne@68:         return :term:`reference` name corresponding to numerical :term:`tid`
jpayne@68:         """
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         cdef bcf_hdr_t *hdr = self.header.ptr
jpayne@68:         cdef int rid = tid
jpayne@68:         if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
jpayne@68:             raise ValueError('Invalid tid')
jpayne@68:         return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
jpayne@68: 
jpayne@68:     def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
jpayne@68:         """fetch records in a :term:`region`, specified either by
jpayne@68:         :term:`contig`, *start*, and *end* (which are 0-based, half-open);
jpayne@68:         or alternatively by a samtools :term:`region` string (which is
jpayne@68:         1-based inclusive).
jpayne@68: 
jpayne@68:         Without *contig* or *region* all mapped records will be fetched.  The
jpayne@68:         records will be returned ordered by contig, which will not necessarily
jpayne@68:         be the order within the file.
jpayne@68: 
jpayne@68:         Set *reopen* to true if you will be using multiple iterators on the
jpayne@68:         same file at the same time.  The iterator returned will receive its
jpayne@68:         own copy of a filehandle to the file effectively re-opening the
jpayne@68:         file.  Re-opening a file incurrs some overhead, so use with care.
jpayne@68: 
jpayne@68:         If only *contig* is set, all records on *contig* will be fetched.
jpayne@68:         If both *region* and *contig* are given, an exception is raised.
jpayne@68: 
jpayne@68:         Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
jpayne@68:         (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
jpayne@68:         read sequentially.
jpayne@68:         """
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         if self.htsfile.is_write:
jpayne@68:             raise ValueError('cannot fetch from Variantfile opened for writing')
jpayne@68: 
jpayne@68:         if contig is None and region is None:
jpayne@68:             self.is_reading = 1
jpayne@68:             bcf = self.copy() if reopen else self
jpayne@68:             bcf.seek(self.start_offset)
jpayne@68:             return iter(bcf)
jpayne@68: 
jpayne@68:         if self.index is None:
jpayne@68:             raise ValueError('fetch requires an index')
jpayne@68: 
jpayne@68:         _, tid, start, stop = self.parse_region(contig, start, stop, region,
jpayne@68:                                                 None, end=end, reference=reference)
jpayne@68: 
jpayne@68:         if contig is None:
jpayne@68:             contig = self.get_reference_name(tid)
jpayne@68: 
jpayne@68:         self.is_reading = 1
jpayne@68:         return self.index.fetch(self, contig, start, stop, reopen)
jpayne@68: 
jpayne@68:     def new_record(self, *args, **kwargs):
jpayne@68:         """Create a new empty :class:`VariantRecord`.
jpayne@68: 
jpayne@68:         See :meth:`VariantHeader.new_record`
jpayne@68:         """
jpayne@68:         return self.header.new_record(*args, **kwargs)
jpayne@68: 
jpayne@68:     cpdef int write(self, VariantRecord record) except -1:
jpayne@68:         """
jpayne@68:         write a single :class:`pysam.VariantRecord` to disk.
jpayne@68: 
jpayne@68:         returns the number of bytes written.
jpayne@68:         """
jpayne@68:         if record is None:
jpayne@68:             raise ValueError('record must not be None')
jpayne@68: 
jpayne@68:         if not self.is_open:
jpayne@68:             return ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         if not self.htsfile.is_write:
jpayne@68:             raise ValueError('cannot write to a Variantfile opened for reading')
jpayne@68: 
jpayne@68:         if not self.header_written:
jpayne@68:             self.header_written = True
jpayne@68:             with nogil:
jpayne@68:                 bcf_hdr_write(self.htsfile, self.header.ptr)
jpayne@68: 
jpayne@68:         #if record.header is not self.header:
jpayne@68:         #    record.translate(self.header)
jpayne@68:         #    raise ValueError('Writing records from a different VariantFile is not yet supported')
jpayne@68: 
jpayne@68:         if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
jpayne@68:             msg = 'Invalid VariantRecord.  Number of samples does not match header ({} vs {})'
jpayne@68:             raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
jpayne@68: 
jpayne@68:         # Sync END annotation before writing
jpayne@68:         bcf_sync_end(record)
jpayne@68: 
jpayne@68:         cdef int ret
jpayne@68: 
jpayne@68:         with nogil:
jpayne@68:             ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
jpayne@68: 
jpayne@68:         if ret < 0:
jpayne@68:             raise IOError(errno, strerror(errno))
jpayne@68: 
jpayne@68:         return ret
jpayne@68: 
jpayne@68:     def subset_samples(self, include_samples):
jpayne@68:         """
jpayne@68:         Read only a subset of samples to reduce processing time and memory.
jpayne@68:         Must be called prior to retrieving records.
jpayne@68:         """
jpayne@68:         if not self.is_open:
jpayne@68:             raise ValueError('I/O operation on closed file')
jpayne@68: 
jpayne@68:         if self.htsfile.is_write:
jpayne@68:             raise ValueError('cannot subset samples from Variantfile opened for writing')
jpayne@68: 
jpayne@68:         if self.is_reading:
jpayne@68:             raise ValueError('cannot subset samples after fetching records')
jpayne@68: 
jpayne@68:         self.header._subset_samples(include_samples)
jpayne@68: 
jpayne@68:         # potentially unnecessary optimization that also sets max_unpack
jpayne@68:         if not include_samples:
jpayne@68:             self.drop_samples = True
jpayne@68: