jpayne@68: # cython: language_level=3 jpayne@68: from libc.stdint cimport int8_t, int16_t, int32_t, int64_t jpayne@68: from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t jpayne@68: from libc.stdlib cimport malloc, calloc, realloc, free jpayne@68: from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup jpayne@68: from libc.stdio cimport FILE, printf jpayne@68: from posix.types cimport off_t jpayne@68: jpayne@68: cdef extern from "Python.h": jpayne@68: FILE* PyFile_AsFile(object) jpayne@68: jpayne@68: jpayne@68: # cython does not wrap stdarg jpayne@68: cdef extern from "stdarg.h": jpayne@68: ctypedef struct va_list: jpayne@68: pass jpayne@68: jpayne@68: jpayne@68: cdef extern from "htslib/kstring.h" nogil: jpayne@68: ctypedef struct kstring_t: jpayne@68: size_t l, m jpayne@68: char *s jpayne@68: jpayne@68: int kputc(int c, kstring_t *s) jpayne@68: int kputw(int c, kstring_t *s) jpayne@68: int kputl(long c, kstring_t *s) jpayne@68: int ksprintf(kstring_t *s, const char *fmt, ...) jpayne@68: jpayne@68: jpayne@68: cdef extern from "htslib_util.h" nogil: jpayne@68: int hts_set_verbosity(int verbosity) jpayne@68: int hts_get_verbosity() jpayne@68: jpayne@68: ctypedef uint32_t khint32_t jpayne@68: ctypedef uint32_t khint_t jpayne@68: ctypedef khint_t khiter_t jpayne@68: jpayne@68: # Used to manage BCF Header info jpayne@68: ctypedef struct vdict_t: jpayne@68: khint_t n_buckets, size, n_occupied, upper_bound jpayne@68: khint32_t *flags jpayne@68: const char *keys jpayne@68: bcf_idinfo_t *vals jpayne@68: jpayne@68: # Used to manage indexed contigs in Tabix jpayne@68: ctypedef struct s2i_t: jpayne@68: khint_t n_buckets, size, n_occupied, upper_bound jpayne@68: khint32_t *flags jpayne@68: const char *keys jpayne@68: int64_t *vals jpayne@68: jpayne@68: # Generic khash methods jpayne@68: khint_t kh_size(void *d) jpayne@68: khint_t kh_begin(void *d) jpayne@68: khint_t kh_end(void *d) jpayne@68: int kh_exist(void *d, khiter_t i) jpayne@68: jpayne@68: # Specialized khash methods for vdict jpayne@68: khint_t kh_get_vdict(vdict_t *d, const char *key) jpayne@68: const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i) jpayne@68: bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i) jpayne@68: jpayne@68: jpayne@68: cdef extern from "htslib/hfile.h" nogil: jpayne@68: ctypedef struct hFILE jpayne@68: jpayne@68: # @abstract Open the named file or URL as a stream jpayne@68: # @return An hFILE pointer, or NULL (with errno set) if an error occurred. jpayne@68: hFILE *hopen(const char *filename, const char *mode, ...) jpayne@68: jpayne@68: # @abstract Associate a stream with an existing open file descriptor jpayne@68: # @return An hFILE pointer, or NULL (with errno set) if an error occurred. jpayne@68: # @notes For socket descriptors (on Windows), mode should contain 's'. jpayne@68: hFILE *hdopen(int fd, const char *mode) jpayne@68: jpayne@68: # @abstract Report whether the file name or URL denotes remote storage jpayne@68: # @return 0 if local, 1 if remote. jpayne@68: # @notes "Remote" means involving e.g. explicit network access, with the jpayne@68: # implication that callers may wish to cache such files' contents locally. jpayne@68: int hisremote(const char *filename) jpayne@68: jpayne@68: # @abstract Flush (for output streams) and close the stream jpayne@68: # @return 0 if successful, or EOF (with errno set) if an error occurred. jpayne@68: int hclose(hFILE *fp) jpayne@68: jpayne@68: # @abstract Close the stream, without flushing or propagating errors jpayne@68: # @notes For use while cleaning up after an error only. Preserves errno. jpayne@68: void hclose_abruptly(hFILE *fp) jpayne@68: jpayne@68: # @abstract Return the stream's error indicator jpayne@68: # @return Non-zero (in fact, an errno value) if an error has occurred. jpayne@68: # @notes This would be called herror() and return true/false to parallel jpayne@68: # ferror(3), but a networking-related herror(3) function already exists. */ jpayne@68: int herrno(hFILE *fp) jpayne@68: jpayne@68: # @abstract Clear the stream's error indicator jpayne@68: void hclearerr(hFILE *fp) jpayne@68: jpayne@68: # @abstract Reposition the read/write stream offset jpayne@68: # @return The resulting offset within the stream (as per lseek(2)), jpayne@68: # or negative if an error occurred. jpayne@68: off_t hseek(hFILE *fp, off_t offset, int whence) jpayne@68: jpayne@68: # @abstract Report the current stream offset jpayne@68: # @return The offset within the stream, starting from zero. jpayne@68: off_t htell(hFILE *fp) jpayne@68: jpayne@68: # @abstract Read one character from the stream jpayne@68: # @return The character read, or EOF on end-of-file or error jpayne@68: int hgetc(hFILE *fp) jpayne@68: jpayne@68: # Read from the stream until the delimiter, up to a maximum length jpayne@68: # @param buffer The buffer into which bytes will be written jpayne@68: # @param size The size of the buffer jpayne@68: # @param delim The delimiter (interpreted as an `unsigned char`) jpayne@68: # @param fp The file stream jpayne@68: # @return The number of bytes read, or negative on error. jpayne@68: # @since 1.4 jpayne@68: # jpayne@68: # Bytes will be read into the buffer up to and including a delimiter, until jpayne@68: # EOF is reached, or _size-1_ bytes have been written, whichever comes first. jpayne@68: # The string will then be terminated with a NUL byte (`\0`). jpayne@68: ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp) jpayne@68: jpayne@68: # Read a line from the stream, up to a maximum length jpayne@68: # @param buffer The buffer into which bytes will be written jpayne@68: # @param size The size of the buffer jpayne@68: # @param fp The file stream jpayne@68: # @return The number of bytes read, or negative on error. jpayne@68: # @since 1.4 jpayne@68: # jpayne@68: # Specialization of hgetdelim() for a `\n` delimiter. jpayne@68: ssize_t hgetln(char *buffer, size_t size, hFILE *fp) jpayne@68: jpayne@68: # Read a line from the stream, up to a maximum length jpayne@68: # @param buffer The buffer into which bytes will be written jpayne@68: # @param size The size of the buffer (must be > 1 to be useful) jpayne@68: # @param fp The file stream jpayne@68: # @return _buffer_ on success, or `NULL` if an error occurred. jpayne@68: # @since 1.4 jpayne@68: # jpayne@68: # This function can be used as a replacement for `fgets(3)`, or together with jpayne@68: # kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_. jpayne@68: char *hgets(char *buffer, int size, hFILE *fp) jpayne@68: jpayne@68: # @abstract Peek at characters to be read without removing them from buffers jpayne@68: # @param fp The file stream jpayne@68: # @param buffer The buffer to which the peeked bytes will be written jpayne@68: # @param nbytes The number of bytes to peek at; limited by the size of the jpayne@68: # internal buffer, which could be as small as 4K. jpayne@68: # @return The number of bytes peeked, which may be less than nbytes if EOF jpayne@68: # is encountered; or negative, if there was an I/O error. jpayne@68: # @notes The characters peeked at remain in the stream's internal buffer, jpayne@68: # and will be returned by later hread() etc calls. jpayne@68: ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) jpayne@68: jpayne@68: # @abstract Read a block of characters from the file jpayne@68: # @return The number of bytes read, or negative if an error occurred. jpayne@68: # @notes The full nbytes requested will be returned, except as limited jpayne@68: # by EOF or I/O errors. jpayne@68: ssize_t hread(hFILE *fp, void *buffer, size_t nbytes) jpayne@68: jpayne@68: # @abstract Write a character to the stream jpayne@68: # @return The character written, or EOF if an error occurred. jpayne@68: int hputc(int c, hFILE *fp) jpayne@68: jpayne@68: # @abstract Write a string to the stream jpayne@68: # @return 0 if successful, or EOF if an error occurred. jpayne@68: int hputs(const char *text, hFILE *fp) jpayne@68: jpayne@68: # @abstract Write a block of characters to the file jpayne@68: # @return Either nbytes, or negative if an error occurred. jpayne@68: # @notes In the absence of I/O errors, the full nbytes will be written. jpayne@68: ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes) jpayne@68: jpayne@68: # @abstract For writing streams, flush buffered output to the underlying stream jpayne@68: # @return 0 if successful, or EOF if an error occurred. jpayne@68: int hflush(hFILE *fp) jpayne@68: jpayne@68: jpayne@68: cdef extern from "htslib/bgzf.h" nogil: jpayne@68: ctypedef struct bgzf_mtaux_t jpayne@68: ctypedef struct bgzidx_t jpayne@68: ctypedef struct z_stream jpayne@68: jpayne@68: ctypedef struct BGZF: jpayne@68: unsigned errcode jpayne@68: unsigned is_write jpayne@68: int is_be jpayne@68: int compress_level jpayne@68: int is_compressed jpayne@68: int is_gzip jpayne@68: int cache_size jpayne@68: int64_t block_address jpayne@68: int64_t uncompressed_address jpayne@68: void *uncompressed_block jpayne@68: void *compressed_block jpayne@68: void *cache jpayne@68: hFILE *fp jpayne@68: bgzf_mtaux_t *mt jpayne@68: bgzidx_t *idx jpayne@68: int idx_build_otf jpayne@68: z_stream *gz_stream jpayne@68: jpayne@68: #***************** jpayne@68: # Basic routines * jpayne@68: # *****************/ jpayne@68: jpayne@68: # Open an existing file descriptor for reading or writing. jpayne@68: # jpayne@68: # @param fd file descriptor jpayne@68: # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for jpayne@68: # writing, 'a' for appending, 'g' for gzip rather than BGZF jpayne@68: # compression (with 'w' only), and digit specifies the zlib jpayne@68: # compression level. jpayne@68: # Note that there is a distinction between 'u' and '0': the jpayne@68: # first yields plain uncompressed output whereas the latter jpayne@68: # outputs uncompressed data wrapped in the zlib format. jpayne@68: # @return BGZF file handler; 0 on error jpayne@68: jpayne@68: BGZF* bgzf_dopen(int fd, const char *mode) jpayne@68: BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility jpayne@68: jpayne@68: # Open the specified file for reading or writing. jpayne@68: BGZF* bgzf_open(const char* path, const char *mode) jpayne@68: jpayne@68: # Open an existing hFILE stream for reading or writing. jpayne@68: BGZF* bgzf_hopen(hFILE *fp, const char *mode) jpayne@68: jpayne@68: # Close the BGZF and free all associated resources. jpayne@68: # jpayne@68: # @param fp BGZF file handler jpayne@68: # @return 0 on success and -1 on error jpayne@68: int bgzf_close(BGZF *fp) jpayne@68: jpayne@68: # Read up to _length_ bytes from the file storing into _data_. jpayne@68: # jpayne@68: # @param fp BGZF file handler jpayne@68: # @param data data array to read into jpayne@68: # @param length size of data to read jpayne@68: # @return number of bytes actually read; 0 on end-of-file and -1 on error jpayne@68: ssize_t bgzf_read(BGZF *fp, void *data, size_t length) jpayne@68: jpayne@68: # Write _length_ bytes from _data_ to the file. If no I/O errors occur, jpayne@68: # the complete _length_ bytes will be written (or queued for writing). jpayne@68: # jpayne@68: # @param fp BGZF file handler jpayne@68: # @param data data array to write jpayne@68: # @param length size of data to write jpayne@68: # @return number of bytes written (i.e., _length_); negative on error jpayne@68: ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) jpayne@68: jpayne@68: # Read up to _length_ bytes directly from the underlying stream without jpayne@68: # decompressing. Bypasses BGZF blocking, so must be used with care in jpayne@68: # specialised circumstances only. jpayne@68: # jpayne@68: # @param fp BGZF file handler jpayne@68: # @param data data array to read into jpayne@68: # @param length number of raw bytes to read jpayne@68: # @return number of bytes actually read; 0 on end-of-file and -1 on error jpayne@68: ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) jpayne@68: jpayne@68: # Write _length_ bytes directly to the underlying stream without jpayne@68: # compressing. Bypasses BGZF blocking, so must be used with care jpayne@68: # in specialised circumstances only. jpayne@68: # jpayne@68: # @param fp BGZF file handler jpayne@68: # @param data data array to write jpayne@68: # @param length number of raw bytes to write jpayne@68: # @return number of bytes actually written; -1 on error jpayne@68: ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length) jpayne@68: jpayne@68: # Write the data in the buffer to the file. jpayne@68: int bgzf_flush(BGZF *fp) jpayne@68: jpayne@68: # Return a virtual file pointer to the current location in the file. jpayne@68: # No interpretation of the value should be made, other than a subsequent jpayne@68: # call to bgzf_seek can be used to position the file at the same point. jpayne@68: # Return value is non-negative on success. jpayne@68: int64_t bgzf_tell(BGZF *fp) jpayne@68: jpayne@68: # Set the file to read from the location specified by _pos_. jpayne@68: # jpayne@68: # @param fp BGZF file handler jpayne@68: # @param pos virtual file offset returned by bgzf_tell() jpayne@68: # @param whence must be SEEK_SET (cimported from libc.stdio / posix.unistd) jpayne@68: # @return 0 on success and -1 on error jpayne@68: # / jpayne@68: int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence) jpayne@68: jpayne@68: # Check if the BGZF end-of-file (EOF) marker is present jpayne@68: # jpayne@68: # @param fp BGZF file handler opened for reading jpayne@68: # @return 1 if the EOF marker is present and correct jpayne@68: # 2 if it can't be checked, e.g., because fp isn't seekable jpayne@68: # 0 if the EOF marker is absent jpayne@68: # -1 (with errno set) on error jpayne@68: int bgzf_check_EOF(BGZF *fp) jpayne@68: jpayne@68: # Check if a file is in the BGZF format jpayne@68: # jpayne@68: # @param fn file name jpayne@68: # @return 1 if _fn_ is BGZF; 0 if not or on I/O error jpayne@68: int bgzf_is_bgzf(const char *fn) jpayne@68: jpayne@68: #********************* jpayne@68: # Advanced routines * jpayne@68: #********************* jpayne@68: jpayne@68: # Set the cache size. Only effective when compiled with -DBGZF_CACHE. jpayne@68: # jpayne@68: # @param fp BGZF file handler jpayne@68: # @param size size of cache in bytes; 0 to disable caching (default) jpayne@68: void bgzf_set_cache_size(BGZF *fp, int size) jpayne@68: jpayne@68: # Flush the file if the remaining buffer size is smaller than _size_ jpayne@68: # @return 0 if flushing succeeded or was not needed; negative on error jpayne@68: int bgzf_flush_try(BGZF *fp, ssize_t size) jpayne@68: jpayne@68: # Read one byte from a BGZF file. It is faster than bgzf_read() jpayne@68: # @param fp BGZF file handler jpayne@68: # @return byte read; -1 on end-of-file or error jpayne@68: int bgzf_getc(BGZF *fp) jpayne@68: jpayne@68: # Read one line from a BGZF file. It is faster than bgzf_getc() jpayne@68: # jpayne@68: # @param fp BGZF file handler jpayne@68: # @param delim delimiter jpayne@68: # @param str string to write to; must be initialized jpayne@68: # @return length of the string; 0 on end-of-file; negative on error jpayne@68: int bgzf_getline(BGZF *fp, int delim, kstring_t *str) jpayne@68: jpayne@68: # Read the next BGZF block. jpayne@68: int bgzf_read_block(BGZF *fp) jpayne@68: jpayne@68: # Enable multi-threading (only effective on writing and when the jpayne@68: # library was compiled with -DBGZF_MT) jpayne@68: # jpayne@68: # @param fp BGZF file handler; must be opened for writing jpayne@68: # @param n_threads #threads used for writing jpayne@68: # @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended jpayne@68: int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) jpayne@68: jpayne@68: jpayne@68: # Compress a single BGZF block. jpayne@68: # jpayne@68: # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE) jpayne@68: # @param dlen size of output buffer; updated on return to the number jpayne@68: # of bytes actually written to dst jpayne@68: # @param src buffer to be compressed jpayne@68: # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE) jpayne@68: # @param level compression level jpayne@68: # @return 0 on success and negative on error jpayne@68: # jpayne@68: int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level) jpayne@68: jpayne@68: #******************* jpayne@68: # bgzidx routines * jpayne@68: # BGZF at the uncompressed offset jpayne@68: # jpayne@68: # @param fp BGZF file handler; must be opened for reading jpayne@68: # @param uoffset file offset in the uncompressed data jpayne@68: # @param where SEEK_SET (cimported from libc.stdio) supported atm jpayne@68: # jpayne@68: # Returns 0 on success and -1 on error. jpayne@68: int bgzf_useek(BGZF *fp, long uoffset, int where) jpayne@68: jpayne@68: # Position in uncompressed BGZF jpayne@68: # jpayne@68: # @param fp BGZF file handler; must be opened for reading jpayne@68: # jpayne@68: # Returns the current offset on success and -1 on error. jpayne@68: long bgzf_utell(BGZF *fp) jpayne@68: jpayne@68: # Tell BGZF to build index while compressing. jpayne@68: # jpayne@68: # @param fp BGZF file handler; can be opened for reading or writing. jpayne@68: # jpayne@68: # Returns 0 on success and -1 on error. jpayne@68: int bgzf_index_build_init(BGZF *fp) jpayne@68: jpayne@68: # Load BGZF index jpayne@68: # jpayne@68: # @param fp BGZF file handler jpayne@68: # @param bname base name jpayne@68: # @param suffix suffix to add to bname (can be NULL) jpayne@68: # jpayne@68: # Returns 0 on success and -1 on error. jpayne@68: int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix) jpayne@68: jpayne@68: # Save BGZF index jpayne@68: # jpayne@68: # @param fp BGZF file handler jpayne@68: # @param bname base name jpayne@68: # @param suffix suffix to add to bname (can be NULL) jpayne@68: # jpayne@68: # Returns 0 on success and -1 on error. jpayne@68: int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix) jpayne@68: jpayne@68: jpayne@68: cdef extern from "htslib/hts.h" nogil: jpayne@68: uint32_t kroundup32(uint32_t x) jpayne@68: jpayne@68: ctypedef struct cram_fd jpayne@68: jpayne@68: union FilePointerUnion: jpayne@68: BGZF *bgzf jpayne@68: cram_fd *cram jpayne@68: hFILE *hfile jpayne@68: void *voidp jpayne@68: jpayne@68: enum htsFormatCategory: jpayne@68: unknown_category jpayne@68: sequence_data # Sequence data -- SAM, BAM, CRAM, etc jpayne@68: variant_data # Variant calling data -- VCF, BCF, etc jpayne@68: index_file # Index file associated with some data file jpayne@68: region_list # Coordinate intervals or regions -- BED, etc jpayne@68: category_maximum jpayne@68: jpayne@68: enum htsExactFormat: jpayne@68: unknown_format jpayne@68: binary_format jpayne@68: text_format jpayne@68: sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed jpayne@68: format_maximum jpayne@68: jpayne@68: enum htsCompression: jpayne@68: no_compression, gzip, bgzf, custom jpayne@68: compression_maximum jpayne@68: jpayne@68: cdef enum hts_fmt_option: jpayne@68: CRAM_OPT_DECODE_MD, jpayne@68: CRAM_OPT_PREFIX, jpayne@68: CRAM_OPT_VERBOSITY, jpayne@68: CRAM_OPT_SEQS_PER_SLICE, jpayne@68: CRAM_OPT_SLICES_PER_CONTAINER, jpayne@68: CRAM_OPT_RANGE, jpayne@68: CRAM_OPT_VERSION, jpayne@68: CRAM_OPT_EMBED_REF, jpayne@68: CRAM_OPT_IGNORE_MD5, jpayne@68: CRAM_OPT_REFERENCE, jpayne@68: CRAM_OPT_MULTI_SEQ_PER_SLICE, jpayne@68: CRAM_OPT_NO_REF, jpayne@68: CRAM_OPT_USE_BZIP2, jpayne@68: CRAM_OPT_SHARED_REF, jpayne@68: CRAM_OPT_NTHREADS, jpayne@68: CRAM_OPT_THREAD_POOL, jpayne@68: CRAM_OPT_USE_LZMA, jpayne@68: CRAM_OPT_USE_RANS, jpayne@68: CRAM_OPT_REQUIRED_FIELDS, jpayne@68: HTS_OPT_COMPRESSION_LEVEL, jpayne@68: HTS_OPT_NTHREADS, jpayne@68: jpayne@68: ctypedef struct htsVersion: jpayne@68: short major, minor jpayne@68: jpayne@68: ctypedef struct htsFormat: jpayne@68: htsFormatCategory category jpayne@68: htsExactFormat format jpayne@68: htsVersion version jpayne@68: htsCompression compression jpayne@68: short compression_level jpayne@68: void *specific jpayne@68: jpayne@68: ctypedef struct htsFile: jpayne@68: uint8_t is_bin jpayne@68: uint8_t is_write jpayne@68: uint8_t is_be jpayne@68: uint8_t is_cram jpayne@68: int64_t lineno jpayne@68: kstring_t line jpayne@68: char *fn jpayne@68: char *fn_aux jpayne@68: FilePointerUnion fp jpayne@68: htsFormat format jpayne@68: jpayne@68: int hts_verbose jpayne@68: jpayne@68: cdef union hts_opt_val_union: jpayne@68: int i jpayne@68: char *s jpayne@68: jpayne@68: ctypedef struct hts_opt: jpayne@68: char *arg jpayne@68: hts_fmt_option opt jpayne@68: hts_opt_val_union val jpayne@68: void *next jpayne@68: jpayne@68: # @abstract Parses arg and appends it to the option list. jpayne@68: # @return 0 on success and -1 on failure jpayne@68: int hts_opt_add(hts_opt **opts, const char *c_arg) jpayne@68: jpayne@68: # @abstract Applies an hts_opt option list to a given htsFile. jpayne@68: # @return 0 on success and -1 on failure jpayne@68: int hts_opt_apply(htsFile *fp, hts_opt *opts) jpayne@68: jpayne@68: # @abstract Frees an hts_opt list. jpayne@68: void hts_opt_free(hts_opt *opts) jpayne@68: jpayne@68: # @abstract Table for converting a nucleotide character to 4-bit encoding. jpayne@68: # The input character may be either an IUPAC ambiguity code, '=' for 0, or jpayne@68: # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 jpayne@68: # for A/C/G/T or combinations of these bits for ambiguous bases. jpayne@68: const unsigned char *seq_nt16_table jpayne@68: jpayne@68: # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC jpayne@68: # ambiguity code letter (or '=' when given 0). jpayne@68: const char *seq_nt16_str jpayne@68: jpayne@68: # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. jpayne@68: # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). jpayne@68: const int *seq_nt16_int jpayne@68: jpayne@68: # @abstract Get the htslib version number jpayne@68: # @return For released versions, a string like "N.N[.N]"; or git describe jpayne@68: # output if using a library built within a Git repository. jpayne@68: const char *hts_version() jpayne@68: jpayne@68: # @abstract Determine format by peeking at the start of a file jpayne@68: # @param fp File opened for reading, positioned at the beginning jpayne@68: # @param fmt Format structure that will be filled out on return jpayne@68: # @return 0 for success, or negative if an error occurred. jpayne@68: int hts_detect_format(hFILE *fp, htsFormat *fmt) jpayne@68: jpayne@68: # @abstract Get a human-readable description of the file format jpayne@68: # @return Description string, to be freed by the caller after use. jpayne@68: char *hts_format_description(const htsFormat *format) jpayne@68: jpayne@68: # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file jpayne@68: # @param fn The file name or "-" for stdin/stdout jpayne@68: # @param mode Mode matching / [rwa][bceguxz0-9]* / jpayne@68: # @discussion jpayne@68: # With 'r' opens for reading; any further format mode letters are ignored jpayne@68: # as the format is detected by checking the first few bytes or BGZF blocks jpayne@68: # of the file. With 'w' or 'a' opens for writing or appending, with format jpayne@68: # specifier letters: jpayne@68: # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) jpayne@68: # c CRAM format jpayne@68: # g gzip compressed jpayne@68: # u uncompressed jpayne@68: # z bgzf compressed jpayne@68: # [0-9] zlib compression level jpayne@68: # and with non-format option letters (for any of 'r'/'w'/'a'): jpayne@68: # e close the file on exec(2) (opens with O_CLOEXEC, where supported) jpayne@68: # x create the file exclusively (opens with O_EXCL, where supported) jpayne@68: # Note that there is a distinction between 'u' and '0': the first yields jpayne@68: # plain uncompressed output whereas the latter outputs uncompressed data jpayne@68: # wrapped in the zlib format. jpayne@68: # @example jpayne@68: # [rw]b .. compressed BCF, BAM, FAI jpayne@68: # [rw]bu .. uncompressed BCF jpayne@68: # [rw]z .. compressed VCF jpayne@68: # [rw] .. uncompressed VCF jpayne@68: htsFile *hts_open(const char *fn, const char *mode) jpayne@68: jpayne@68: # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file jpayne@68: # @param fn The file name or "-" for stdin/stdout jpayne@68: # @param mode Open mode, as per hts_open() jpayne@68: # @param fmt Optional format specific parameters jpayne@68: # @discussion jpayne@68: # See hts_open() for description of fn and mode. jpayne@68: # // TODO Update documentation for s/opts/fmt/ jpayne@68: # Opts contains a format string (sam, bam, cram, vcf, bcf) which will, jpayne@68: # if defined, override mode. Opts also contains a linked list of hts_opt jpayne@68: # structures to apply to the open file handle. These can contain things jpayne@68: # like pointers to the reference or information on compression levels, jpayne@68: # block sizes, etc. jpayne@68: htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) jpayne@68: jpayne@68: # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file jpayne@68: # @param fp The already-open file handle jpayne@68: # @param fn The file name or "-" for stdin/stdout jpayne@68: # @param mode Open mode, as per hts_open() jpayne@68: htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode) jpayne@68: jpayne@68: # @abstract For output streams, flush any buffered data jpayne@68: # @param fp The file handle to be flushed jpayne@68: # @return 0 for success, or negative if an error occurred. jpayne@68: # @since 1.14 jpayne@68: int hts_flush(htsFile *fp) jpayne@68: jpayne@68: # @abstract Close a file handle, flushing buffered data for output streams jpayne@68: # @param fp The file handle to be closed jpayne@68: # @return 0 for success, or negative if an error occurred. jpayne@68: int hts_close(htsFile *fp) jpayne@68: jpayne@68: # @abstract Returns the file's format information jpayne@68: # @param fp The file handle jpayne@68: # @return Read-only pointer to the file's htsFormat. jpayne@68: const htsFormat *hts_get_format(htsFile *fp) jpayne@68: jpayne@68: # @ abstract Returns a string containing the file format extension. jpayne@68: # @ param format Format structure containing the file type. jpayne@68: # @ return A string ("sam", "bam", etc) or "?" for unknown formats. jpayne@68: const char *hts_format_file_extension(const htsFormat *format) jpayne@68: jpayne@68: # @abstract Sets a specified CRAM option on the open file handle. jpayne@68: # @param fp The file handle open the open file. jpayne@68: # @param opt The CRAM_OPT_* option. jpayne@68: # @param ... Optional arguments, dependent on the option used. jpayne@68: # @return 0 for success, or negative if an error occurred. jpayne@68: int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...) jpayne@68: jpayne@68: int hts_getline(htsFile *fp, int delimiter, kstring_t *str) jpayne@68: char **hts_readlines(const char *fn, int *_n) jpayne@68: jpayne@68: # @abstract Parse comma-separated list or read list from a file jpayne@68: # @param list File name or comma-separated list jpayne@68: # @param is_file jpayne@68: # @param _n Size of the output array (number of items read) jpayne@68: # @return NULL on failure or pointer to newly allocated array of jpayne@68: # strings jpayne@68: char **hts_readlist(const char *fn, int is_file, int *_n) jpayne@68: jpayne@68: # @abstract Create extra threads to aid compress/decompression for this file jpayne@68: # @param fp The file handle jpayne@68: # @param n The number of worker threads to create jpayne@68: # @return 0 for success, or negative if an error occurred. jpayne@68: # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE. jpayne@68: int hts_set_threads(htsFile *fp, int n) jpayne@68: jpayne@68: # @abstract Set .fai filename for a file opened for reading jpayne@68: # @return 0 for success, negative on failure jpayne@68: # @discussion jpayne@68: # Called before *_hdr_read(), this provides the name of a .fai file jpayne@68: # used to provide a reference list if the htsFile contains no @SQ headers. jpayne@68: int hts_set_fai_filename(htsFile *fp, const char *fn_aux) jpayne@68: jpayne@68: int8_t HTS_IDX_NOCOOR jpayne@68: int8_t HTS_IDX_START jpayne@68: int8_t HTS_IDX_REST jpayne@68: int8_t HTS_IDX_NONE jpayne@68: jpayne@68: int8_t HTS_FMT_CSI jpayne@68: int8_t HTS_FMT_BAI jpayne@68: int8_t HTS_FMT_TBI jpayne@68: int8_t HTS_FMT_CRAI jpayne@68: jpayne@68: BGZF *hts_get_bgzfp(htsFile *fp) jpayne@68: jpayne@68: ctypedef struct hts_idx_t jpayne@68: jpayne@68: ctypedef struct hts_pair64_t: jpayne@68: uint64_t u, v jpayne@68: jpayne@68: ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end) jpayne@68: jpayne@68: ctypedef struct hts_bins_t: jpayne@68: int n, m jpayne@68: int *a jpayne@68: jpayne@68: ctypedef struct hts_itr_t: jpayne@68: uint32_t read_rest jpayne@68: uint32_t finished jpayne@68: int tid, bed, end, n_off, i jpayne@68: int curr_tid, curr_beg, curr_end jpayne@68: uint64_t curr_off jpayne@68: hts_pair64_t *off jpayne@68: hts_readrec_func *readfunc jpayne@68: hts_bins_t bins jpayne@68: jpayne@68: hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls) jpayne@68: void hts_idx_destroy(hts_idx_t *idx) jpayne@68: int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) jpayne@68: void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) jpayne@68: jpayne@68: #### Save an index to a file jpayne@68: # @param idx Index to be written jpayne@68: # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added jpayne@68: # @param fmt One of the HTS_FMT_* index formats jpayne@68: # @return 0 if successful, or negative if an error occurred. jpayne@68: int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) jpayne@68: jpayne@68: #### Save an index to a specific file jpayne@68: # @param idx Index to be written jpayne@68: # @param fn Input BAM/BCF/etc filename jpayne@68: # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn jpayne@68: # @param fmt One of the HTS_FMT_* index formats jpayne@68: # @return 0 if successful, or negative if an error occurred. jpayne@68: int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt) jpayne@68: jpayne@68: #### Load an index file jpayne@68: # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or jpayne@68: # the extension substituted, to search for an existing index file jpayne@68: # @param fmt One of the HTS_FMT_* index formats jpayne@68: # @return The index, or NULL if an error occurred. jpayne@68: hts_idx_t *hts_idx_load(const char *fn, int fmt) jpayne@68: jpayne@68: #### Load a specific index file jpayne@68: # @param fn Input BAM/BCF/etc filename jpayne@68: # @param fnidx The input index filename jpayne@68: # @return The index, or NULL if an error occurred. jpayne@68: hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx) jpayne@68: jpayne@68: #### Load a specific index file jpayne@68: # @param fn Input BAM/BCF/etc filename jpayne@68: # @param fnidx The input index filename jpayne@68: # @param fmt One of the HTS_FMT_* index formats jpayne@68: # @param flags Flags to alter behaviour (see description) jpayne@68: # @return The index, or NULL if an error occurred. jpayne@68: hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags) jpayne@68: jpayne@68: int HTS_IDX_SAVE_REMOTE jpayne@68: int HTS_IDX_SILENT_FAIL jpayne@68: jpayne@68: uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta) jpayne@68: void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) jpayne@68: jpayne@68: int hts_idx_get_stat(const hts_idx_t* idx, int tid, jpayne@68: uint64_t* mapped, uint64_t* unmapped) jpayne@68: jpayne@68: uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) jpayne@68: jpayne@68: int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers jpayne@68: jpayne@68: # Parse a numeric string jpayne@68: # The number may be expressed in scientific notation, and optionally may jpayne@68: # contain commas in the integer part (before any decimal point or E notation). jpayne@68: # @param str String to be parsed jpayne@68: # @param strend If non-NULL, set on return to point to the first character jpayne@68: # in @a str after those forming the parsed number jpayne@68: # @param flags Or'ed-together combination of HTS_PARSE_* flags jpayne@68: # @return Converted value of the parsed number. jpayne@68: # jpayne@68: # When @a strend is NULL, a warning will be printed (if hts_verbose is 2 jpayne@68: # or more) if there are any trailing characters after the number. jpayne@68: long long hts_parse_decimal(const char *str, char **strend, int flags) jpayne@68: jpayne@68: # Parse a "CHR:START-END"-style region string jpayne@68: # @param str String to be parsed jpayne@68: # @param beg Set on return to the 0-based start of the region jpayne@68: # @param end Set on return to the 1-based end of the region jpayne@68: # @return Pointer to the colon or '\0' after the reference sequence name, jpayne@68: # or NULL if @a str could not be parsed. jpayne@68: const char *hts_parse_reg(const char *str, int *beg, int *end) jpayne@68: jpayne@68: hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) jpayne@68: void hts_itr_destroy(hts_itr_t *iter) jpayne@68: jpayne@68: ctypedef int (*hts_name2id_f)(void*, const char*) jpayne@68: ctypedef const char *(*hts_id2name_f)(void*, int) jpayne@68: ctypedef hts_itr_t *hts_itr_query_func( jpayne@68: const hts_idx_t *idx, jpayne@68: int tid, jpayne@68: int beg, jpayne@68: int end, jpayne@68: hts_readrec_func *readrec) jpayne@68: jpayne@68: hts_itr_t *hts_itr_querys( jpayne@68: const hts_idx_t *idx, jpayne@68: const char *reg, jpayne@68: hts_name2id_f getid, jpayne@68: void *hdr, jpayne@68: hts_itr_query_func *itr_query, jpayne@68: hts_readrec_func *readrec) jpayne@68: jpayne@68: int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) jpayne@68: const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values jpayne@68: jpayne@68: # hts_file_type() - Convenience function to determine file type jpayne@68: # @fname: the file name jpayne@68: # jpayne@68: # Returns one of the FT_* defines. jpayne@68: # jpayne@68: # DEPRECATED: This function has been replaced by hts_detect_format(). jpayne@68: # It and these FT_* macros will be removed in a future HTSlib release. jpayne@68: int FT_UNKN jpayne@68: int FT_GZ jpayne@68: int FT_VCF jpayne@68: int FT_VCF_GZ jpayne@68: int FT_BCF jpayne@68: int FT_BCF_GZ jpayne@68: int FT_STDIN jpayne@68: jpayne@68: int hts_file_type(const char *fname) jpayne@68: jpayne@68: # /*************************** jpayne@68: # * Revised MAQ error model * jpayne@68: # ***************************/ jpayne@68: jpayne@68: ctypedef struct errmod_t jpayne@68: jpayne@68: errmod_t *errmod_init(double depcorr) jpayne@68: void errmod_destroy(errmod_t *em) jpayne@68: jpayne@68: # /* jpayne@68: # n: number of bases jpayne@68: # m: maximum base jpayne@68: # bases[i]: qual:6, strand:1, base:4 jpayne@68: # q[i*m+j]: phred-scaled likelihood of (i,j) jpayne@68: # */ jpayne@68: int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *Probabilistic) jpayne@68: jpayne@68: # /***************************************** jpayne@68: # * q banded glocal alignment * jpayne@68: # *****************************************/ jpayne@68: jpayne@68: ctypedef struct probaln_par_t: jpayne@68: float d, e jpayne@68: int bw jpayne@68: jpayne@68: int probaln_glocal(const uint8_t *ref, jpayne@68: int l_ref, jpayne@68: const uint8_t *query, jpayne@68: int l_query, const uint8_t *iqual, jpayne@68: const probaln_par_t *c, jpayne@68: int *state, uint8_t *q) jpayne@68: jpayne@68: # /********************** jpayne@68: # * MD5 implementation * jpayne@68: # **********************/ jpayne@68: jpayne@68: ctypedef struct hts_md5_context jpayne@68: jpayne@68: # /*! @abstract Initialises an MD5 context. jpayne@68: # * @discussion jpayne@68: # * The expected use is to allocate an hts_md5_context using jpayne@68: # * hts_md5_init(). This pointer is then passed into one or more calls jpayne@68: # * of hts_md5_update() to compute successive internal portions of the jpayne@68: # * MD5 sum, which can then be externalised as a full 16-byte MD5sum jpayne@68: # * calculation by calling hts_md5_final(). This can then be turned jpayne@68: # * into ASCII via hts_md5_hex(). jpayne@68: # * jpayne@68: # * To dealloate any resources created by hts_md5_init() call the jpayne@68: # * hts_md5_destroy() function. jpayne@68: # * jpayne@68: # * @return hts_md5_context pointer on success, NULL otherwise. jpayne@68: # */ jpayne@68: hts_md5_context *hts_md5_init() jpayne@68: jpayne@68: # /*! @abstract Updates the context with the MD5 of the data. */ jpayne@68: void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size) jpayne@68: jpayne@68: # /*! @abstract Computes the final 128-bit MD5 hash from the given context */ jpayne@68: void hts_md5_final(unsigned char *digest, hts_md5_context *ctx) jpayne@68: jpayne@68: # /*! @abstract Resets an md5_context to the initial state, as returned jpayne@68: # * by hts_md5_init(). jpayne@68: # */ jpayne@68: void hts_md5_reset(hts_md5_context *ctx) jpayne@68: jpayne@68: # /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated jpayne@68: # * hex string. jpayne@68: # */ jpayne@68: void hts_md5_hex(char *hex, const unsigned char *digest) jpayne@68: jpayne@68: # /*! @abstract Deallocates any memory allocated by hts_md5_init. */ jpayne@68: void hts_md5_destroy(hts_md5_context *ctx) jpayne@68: jpayne@68: int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) jpayne@68: int hts_bin_bot(int bin, int n_lvls) jpayne@68: jpayne@68: # * Endianness * jpayne@68: int ed_is_big() jpayne@68: uint16_t ed_swap_2(uint16_t v) jpayne@68: void *ed_swap_2p(void *x) jpayne@68: uint32_t ed_swap_4(uint32_t v) jpayne@68: void *ed_swap_4p(void *x) jpayne@68: uint64_t ed_swap_8(uint64_t v) jpayne@68: void *ed_swap_8p(void *x) jpayne@68: jpayne@68: jpayne@68: cdef extern from "htslib/sam.h" nogil: jpayne@68: #********************** jpayne@68: #*** SAM/BAM header *** jpayne@68: #********************** jpayne@68: jpayne@68: # @abstract Structure for the alignment header. jpayne@68: # @field n_targets number of reference sequences jpayne@68: # @field l_text length of the plain text in the header jpayne@68: # @field target_len lengths of the reference sequences jpayne@68: # @field target_name names of the reference sequences jpayne@68: # @field text plain text jpayne@68: # @field sdict header dictionary jpayne@68: jpayne@68: ctypedef struct bam_hdr_t: jpayne@68: int32_t n_targets, ignore_sam_err jpayne@68: uint32_t l_text jpayne@68: uint32_t *target_len jpayne@68: uint8_t *cigar_tab jpayne@68: char **target_name jpayne@68: char *text jpayne@68: void *sdict jpayne@68: jpayne@68: #**************************** jpayne@68: #*** CIGAR related macros *** jpayne@68: #**************************** jpayne@68: jpayne@68: int BAM_CMATCH jpayne@68: int BAM_CINS jpayne@68: int BAM_CDEL jpayne@68: int BAM_CREF_SKIP jpayne@68: int BAM_CSOFT_CLIP jpayne@68: int BAM_CHARD_CLIP jpayne@68: int BAM_CPAD jpayne@68: int BAM_CEQUAL jpayne@68: int BAM_CDIFF jpayne@68: int BAM_CBACK jpayne@68: jpayne@68: char *BAM_CIGAR_STR jpayne@68: int BAM_CIGAR_SHIFT jpayne@68: uint32_t BAM_CIGAR_MASK jpayne@68: uint32_t BAM_CIGAR_TYPE jpayne@68: jpayne@68: char bam_cigar_op(uint32_t c) jpayne@68: uint32_t bam_cigar_oplen(uint32_t c) jpayne@68: char bam_cigar_opchr(uint32_t) jpayne@68: uint32_t bam_cigar_gen(char, uint32_t) jpayne@68: int bam_cigar_type(char o) jpayne@68: jpayne@68: # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair jpayne@68: int BAM_FPAIRED jpayne@68: # @abstract the read is mapped in a proper pair jpayne@68: int BAM_FPROPER_PAIR jpayne@68: # @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR jpayne@68: int BAM_FUNMAP jpayne@68: # @abstract the mate is unmapped jpayne@68: int BAM_FMUNMAP jpayne@68: # @abstract the read is mapped to the reverse strand jpayne@68: int BAM_FREVERSE jpayne@68: # @abstract the mate is mapped to the reverse strand jpayne@68: int BAM_FMREVERSE jpayne@68: # @abstract this is read1 jpayne@68: int BAM_FREAD1 jpayne@68: # @abstract this is read2 jpayne@68: int BAM_FREAD2 jpayne@68: # @abstract not primary alignment jpayne@68: int BAM_FSECONDARY jpayne@68: # @abstract QC failure jpayne@68: int BAM_FQCFAIL jpayne@68: # @abstract optical or PCR duplicate jpayne@68: int BAM_FDUP jpayne@68: # @abstract supplementary alignment jpayne@68: int BAM_FSUPPLEMENTARY jpayne@68: jpayne@68: #************************* jpayne@68: #*** Alignment records *** jpayne@68: #************************* jpayne@68: jpayne@68: # @abstract Structure for core alignment information. jpayne@68: # @field tid chromosome ID, defined by bam_hdr_t jpayne@68: # @field pos 0-based leftmost coordinate jpayne@68: # @field bin bin calculated by bam_reg2bin() jpayne@68: # @field qual mapping quality jpayne@68: # @field l_qname length of the query name jpayne@68: # @field flag bitwise flag jpayne@68: # @field n_cigar number of CIGAR operations jpayne@68: # @field l_qseq length of the query sequence (read) jpayne@68: # @field mtid chromosome ID of next read in template, defined by bam_hdr_t jpayne@68: # @field mpos 0-based leftmost coordinate of next read in template jpayne@68: jpayne@68: ctypedef struct bam1_core_t: jpayne@68: int32_t tid jpayne@68: int32_t pos jpayne@68: uint16_t bin jpayne@68: uint8_t qual jpayne@68: uint8_t l_qname jpayne@68: uint16_t flag jpayne@68: uint8_t unused1 jpayne@68: uint8_t l_extranul jpayne@68: uint32_t n_cigar jpayne@68: int32_t l_qseq jpayne@68: int32_t mtid jpayne@68: int32_t mpos jpayne@68: int32_t isize jpayne@68: jpayne@68: # @abstract Structure for one alignment. jpayne@68: # @field core core information about the alignment jpayne@68: # @field l_data current length of bam1_t::data jpayne@68: # @field m_data maximum length of bam1_t::data jpayne@68: # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux jpayne@68: # jpayne@68: # @discussion Notes: jpayne@68: # jpayne@68: # 1. qname is zero tailing and core.l_qname includes the tailing '\0'. jpayne@68: # 2. l_qseq is calculated from the total length of an alignment block jpayne@68: # on reading or from CIGAR. jpayne@68: # 3. cigar data is encoded 4 bytes per CIGAR operation. jpayne@68: # 4. seq is nybble-encoded according to seq_nt16_table. jpayne@68: ctypedef struct bam1_t: jpayne@68: bam1_core_t core jpayne@68: int l_data jpayne@68: uint32_t m_data jpayne@68: uint8_t *data jpayne@68: uint64_t id jpayne@68: jpayne@68: # @abstract Get whether the query is on the reverse strand jpayne@68: # @param b pointer to an alignment jpayne@68: # @return boolean true if query is on the reverse strand jpayne@68: int bam_is_rev(bam1_t *b) jpayne@68: jpayne@68: # @abstract Get whether the query's mate is on the reverse strand jpayne@68: # @param b pointer to an alignment jpayne@68: # @return boolean true if query's mate on the reverse strand jpayne@68: int bam_is_mrev(bam1_t *b) jpayne@68: jpayne@68: # @abstract Get the name of the query jpayne@68: # @param b pointer to an alignment jpayne@68: # @return pointer to the name string, null terminated jpayne@68: char *bam_get_qname(bam1_t *b) jpayne@68: jpayne@68: # @abstract Get the CIGAR array jpayne@68: # @param b pointer to an alignment jpayne@68: # @return pointer to the CIGAR array jpayne@68: # jpayne@68: # @discussion In the CIGAR array, each element is a 32-bit integer. The jpayne@68: # lower 4 bits gives a CIGAR operation and the higher 28 bits keep the jpayne@68: # length of a CIGAR. jpayne@68: uint32_t *bam_get_cigar(bam1_t *b) jpayne@68: jpayne@68: # @abstract Get query sequence jpayne@68: # @param b pointer to an alignment jpayne@68: # @return pointer to sequence jpayne@68: # jpayne@68: # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, jpayne@68: # 8 for T and 15 for N. Two bases are packed in one byte with the base jpayne@68: # at the higher 4 bits having smaller coordinate on the read. It is jpayne@68: # recommended to use bam_seqi() macro to get the base. jpayne@68: char *bam_get_seq(bam1_t *b) jpayne@68: jpayne@68: # @abstract Get query quality jpayne@68: # @param b pointer to an alignment jpayne@68: # @return pointer to quality string jpayne@68: uint8_t *bam_get_qual(bam1_t *b) jpayne@68: jpayne@68: # @abstract Get auxiliary data jpayne@68: # @param b pointer to an alignment jpayne@68: # @return pointer to the concatenated auxiliary data jpayne@68: uint8_t *bam_get_aux(bam1_t *b) jpayne@68: jpayne@68: # @abstract Get length of auxiliary data jpayne@68: # @param b pointer to an alignment jpayne@68: # @return length of the concatenated auxiliary data jpayne@68: int bam_get_l_aux(bam1_t *b) jpayne@68: jpayne@68: # @abstract Get a base on read jpayne@68: # @param s Query sequence returned by bam1_seq() jpayne@68: # @param i The i-th position, 0-based jpayne@68: # @return 4-bit integer representing the base. jpayne@68: char bam_seqi(char *s, int i) jpayne@68: jpayne@68: #************************** jpayne@68: #*** Exported functions *** jpayne@68: #************************** jpayne@68: jpayne@68: #*************** jpayne@68: #*** BAM I/O *** jpayne@68: #*************** jpayne@68: jpayne@68: bam_hdr_t *bam_hdr_init() jpayne@68: bam_hdr_t *bam_hdr_read(BGZF *fp) jpayne@68: int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) jpayne@68: void bam_hdr_destroy(bam_hdr_t *h) jpayne@68: int bam_name2id(bam_hdr_t *h, const char *ref) jpayne@68: bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0) jpayne@68: jpayne@68: bam1_t *bam_init1() jpayne@68: void bam_destroy1(bam1_t *b) jpayne@68: int bam_read1(BGZF *fp, bam1_t *b) jpayne@68: int bam_write1(BGZF *fp, const bam1_t *b) jpayne@68: bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) jpayne@68: bam1_t *bam_dup1(const bam1_t *bsrc) jpayne@68: jpayne@68: int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) jpayne@68: int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) jpayne@68: jpayne@68: # @abstract Calculate the rightmost base position of an alignment on the jpayne@68: # reference genome. jpayne@68: jpayne@68: # @param b pointer to an alignment jpayne@68: # @return the coordinate of the first base after the alignment, 0-based jpayne@68: jpayne@68: # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. jpayne@68: # For an unmapped read (either according to its flags or if it has no cigar jpayne@68: # string), we return b->core.pos + 1 by convention. jpayne@68: int32_t bam_endpos(const bam1_t *b) jpayne@68: jpayne@68: int bam_str2flag(const char *str) # returns negative value on error jpayne@68: char *bam_flag2str(int flag) # The string must be freed by the user jpayne@68: jpayne@68: #************************* jpayne@68: #*** BAM/CRAM indexing *** jpayne@68: #************************* jpayne@68: jpayne@68: # These BAM iterator functions work only on BAM files. To work with either jpayne@68: # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. jpayne@68: void bam_itr_destroy(hts_itr_t *iter) jpayne@68: hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) jpayne@68: hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) jpayne@68: int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) jpayne@68: jpayne@68: # Load/build .csi or .bai BAM index file. Does not work with CRAM. jpayne@68: # It is recommended to use the sam_index_* functions below instead. jpayne@68: hts_idx_t *bam_index_load(const char *fn) jpayne@68: int bam_index_build(const char *fn, int min_shift) jpayne@68: jpayne@68: # Load a BAM (.csi or .bai) or CRAM (.crai) index file jpayne@68: # @param fp File handle of the data file whose index is being opened jpayne@68: # @param fn BAM/CRAM/etc filename to search alongside for the index file jpayne@68: # @return The index, or NULL if an error occurred. jpayne@68: hts_idx_t *sam_index_load(htsFile *fp, const char *fn) jpayne@68: jpayne@68: # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file jpayne@68: # @param fp File handle of the data file whose index is being opened jpayne@68: # @param fn BAM/CRAM/etc data file filename jpayne@68: # @param fnidx Index filename, or NULL to search alongside @a fn jpayne@68: # @return The index, or NULL if an error occurred. jpayne@68: hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) jpayne@68: jpayne@68: # Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file jpayne@68: # @param fp File handle of the data file whose index is being opened jpayne@68: # @param fn BAM/CRAM/etc data file filename jpayne@68: # @param fnidx Index filename, or NULL to search alongside @a fn jpayne@68: # @param flags Flags to alter behaviour jpayne@68: # @return The index, or NULL if an error occurred. jpayne@68: hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags) jpayne@68: jpayne@68: # Generate and save an index file jpayne@68: # @param fn Input BAM/etc filename, to which .csi/etc will be added jpayne@68: # @param min_shift Positive to generate CSI, or 0 to generate BAI jpayne@68: # @return 0 if successful, or negative if an error occurred (usually -1; or jpayne@68: # -2: opening fn failed; -3: format not indexable) jpayne@68: int sam_index_build(const char *fn, int min_shift) jpayne@68: jpayne@68: # Generate and save an index to a specific file jpayne@68: # @param fn Input BAM/CRAM/etc filename jpayne@68: # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn jpayne@68: # @param min_shift Positive to generate CSI, or 0 to generate BAI jpayne@68: # @return 0 if successful, or negative if an error occurred. jpayne@68: int sam_index_build2(const char *fn, const char *fnidx, int min_shift) jpayne@68: jpayne@68: void sam_itr_destroy(hts_itr_t *iter) jpayne@68: hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) jpayne@68: hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) jpayne@68: int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) jpayne@68: jpayne@68: #*************** jpayne@68: #*** SAM I/O *** jpayne@68: #*************** jpayne@68: jpayne@68: htsFile *sam_open(const char *fn, const char *mode) jpayne@68: htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt) jpayne@68: int sam_close(htsFile *fp) jpayne@68: jpayne@68: int sam_open_mode(char *mode, const char *fn, const char *format) jpayne@68: jpayne@68: # A version of sam_open_mode that can handle ,key=value options. jpayne@68: # The format string is allocated and returned, to be freed by the caller. jpayne@68: # Prefix should be "r" or "w", jpayne@68: char *sam_open_mode_opts(const char *fn, const char *mode, const char *format) jpayne@68: jpayne@68: bam_hdr_t *sam_hdr_parse(int l_text, const char *text) jpayne@68: bam_hdr_t *sam_hdr_read(htsFile *fp) jpayne@68: int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) jpayne@68: jpayne@68: int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) jpayne@68: int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) jpayne@68: int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) jpayne@68: int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) jpayne@68: jpayne@68: #************************************* jpayne@68: #*** Manipulating auxiliary fields *** jpayne@68: #************************************* jpayne@68: jpayne@68: uint8_t *bam_aux_get(const bam1_t *b, const char *tag) jpayne@68: int64_t bam_aux2i(const uint8_t *s) jpayne@68: double bam_aux2f(const uint8_t *s) jpayne@68: char bam_aux2A(const uint8_t *s) jpayne@68: char *bam_aux2Z(const uint8_t *s) jpayne@68: jpayne@68: void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data) jpayne@68: int bam_aux_del(bam1_t *b, uint8_t *s) jpayne@68: jpayne@68: #************************** jpayne@68: #*** Pileup and Mpileup *** jpayne@68: #************************** jpayne@68: jpayne@68: # @abstract Generic pileup 'client data'. jpayne@68: # @discussion The pileup iterator allows setting a constructor and jpayne@68: # destructor function, which will be called every time a sequence is jpayne@68: # fetched and discarded. This permits caching of per-sequence data in jpayne@68: # a tidy manner during the pileup process. This union is the cached jpayne@68: # data to be manipulated by the "client" (the caller of pileup). jpayne@68: # jpayne@68: union bam_pileup_cd: jpayne@68: void *p jpayne@68: int64_t i jpayne@68: double f jpayne@68: jpayne@68: # @abstract Structure for one alignment covering the pileup position. jpayne@68: # @field b pointer to the alignment jpayne@68: # @field qpos position of the read base at the pileup site, 0-based jpayne@68: # @field indel indel length; 0 for no indel, positive for ins and negative for del jpayne@68: # @field level the level of the read in the "viewer" mode jpayne@68: # @field is_del 1 iff the base on the padded read is a deletion jpayne@68: # @field is_head ??? jpayne@68: # @field is_tail ??? jpayne@68: # @field is_refskip ??? jpayne@68: # @field aux ??? jpayne@68: # jpayne@68: # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The jpayne@68: # difference between the two functions is that the former does not jpayne@68: # set bam_pileup1_t::level, while the later does. Level helps the jpayne@68: # implementation of alignment viewers, but calculating this has some jpayne@68: # overhead. jpayne@68: # jpayne@68: # is_del, is_head, etc are a bit field, declaring as below should jpayne@68: # work as expected, see jpayne@68: # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J jpayne@68: jpayne@68: ctypedef struct bam_pileup1_t: jpayne@68: bam1_t *b jpayne@68: int32_t qpos jpayne@68: int indel, level jpayne@68: uint32_t is_del jpayne@68: uint32_t is_head jpayne@68: uint32_t is_tail jpayne@68: uint32_t is_refskip jpayne@68: uint32_t aux jpayne@68: bam_pileup_cd cd jpayne@68: jpayne@68: ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b) jpayne@68: ctypedef int (*bam_test_f)() jpayne@68: jpayne@68: ctypedef struct __bam_plp_t jpayne@68: ctypedef __bam_plp_t *bam_plp_t jpayne@68: jpayne@68: ctypedef struct __bam_mplp_t jpayne@68: ctypedef __bam_mplp_t *bam_mplp_t jpayne@68: jpayne@68: # bam_plp_init() - sets an iterator over multiple jpayne@68: # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return jpayne@68: # status: 0 on success, -1 on end, < -1 on non-recoverable errors jpayne@68: # @data: user data to pass to @func jpayne@68: bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) jpayne@68: void bam_plp_destroy(bam_plp_t iter) jpayne@68: int bam_plp_push(bam_plp_t iter, const bam1_t *b) jpayne@68: const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) jpayne@68: const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) jpayne@68: void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) jpayne@68: void bam_plp_reset(bam_plp_t iter) jpayne@68: jpayne@68: bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) jpayne@68: jpayne@68: # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping jpayne@68: # read pairs and for each base pair set the base quality of the jpayne@68: # lower-quality base to zero, thus effectively discarding it from jpayne@68: # calling. If the two bases are identical, the quality of the other base jpayne@68: # is increased to the sum of their qualities (capped at 200), otherwise jpayne@68: # it is multiplied by 0.8. jpayne@68: void bam_mplp_init_overlaps(bam_mplp_t iter) jpayne@68: void bam_mplp_destroy(bam_mplp_t iter) jpayne@68: void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) jpayne@68: int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) jpayne@68: void bam_mplp_reset(bam_mplp_t iter) jpayne@68: void bam_mplp_constructor(bam_mplp_t iter, jpayne@68: int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) jpayne@68: void bam_mplp_destructor(bam_mplp_t iter, jpayne@68: int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) jpayne@68: jpayne@68: # Added by AH jpayne@68: # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *" jpayne@68: jpayne@68: jpayne@68: jpayne@68: jpayne@68: # // --------------------------- jpayne@68: # // Base modification retrieval jpayne@68: jpayne@68: # /*! @typedef jpayne@68: # @abstract Holds a single base modification. jpayne@68: # @field modified_base The short base code (m, h, etc) or -ChEBI (negative) jpayne@68: # @field canonical_base The canonical base referred to in the MM tag. jpayne@68: # One of A, C, G, T or N. Note this may not be the jpayne@68: # explicit base recorded in the SEQ column (esp. if N). jpayne@68: # @field strand 0 or 1, indicating + or - strand from MM tag. jpayne@68: # @field qual Quality code (256*probability), or -1 if unknown jpayne@68: jpayne@68: # @discussion jpayne@68: # Note this doesn't hold any location data or information on which other jpayne@68: # modifications may be possible at this site. jpayne@68: ctypedef struct hts_base_mod: jpayne@68: int modified_base jpayne@68: int canonical_base jpayne@68: int strand jpayne@68: int qual jpayne@68: jpayne@68: # /// Allocates an hts_base_mode_state. jpayne@68: # /** jpayne@68: # * @return An hts_base_mode_state pointer on success, jpayne@68: # * NULL on failure. jpayne@68: # * jpayne@68: # * This just allocates the memory. The initialisation of the contents is jpayne@68: # * done using bam_parse_basemod. Successive calls may be made to that jpayne@68: # * without the need to free and allocate a new state. jpayne@68: # * jpayne@68: # * The state be destroyed using the hts_base_mode_state_free function. jpayne@68: # */ jpayne@68: ctypedef struct hts_base_mod_state jpayne@68: hts_base_mod_state *hts_base_mod_state_alloc() jpayne@68: jpayne@68: jpayne@68: # /// Destroys an hts_base_mode_state. jpayne@68: # /** jpayne@68: # * @param state The base modification state pointer. jpayne@68: # * jpayne@68: # * The should have previously been created by hts_base_mode_state_alloc. jpayne@68: # */ jpayne@68: void hts_base_mod_state_free(hts_base_mod_state *state) jpayne@68: jpayne@68: # /// Parses the Mm and Ml tags out of a bam record. jpayne@68: # /** jpayne@68: # * @param b BAM alignment record jpayne@68: # * @param state The base modification state pointer. jpayne@68: # * @return 0 on success, jpayne@68: # * -1 on failure. jpayne@68: # * jpayne@68: # * This fills out the contents of the modification state, resetting the jpayne@68: # * iterator location to the first sequence base. jpayne@68: # */ jpayne@68: int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) jpayne@68: jpayne@68: # /// Finds the next location containing base modifications and returns them jpayne@68: # /** jpayne@68: # * @param b BAM alignment record jpayne@68: # * @param state The base modification state pointer. jpayne@68: # * @param mods A supplied array for returning base modifications jpayne@68: # * @param n_mods The size of the mods array jpayne@68: # * @return The number of modifications found on success, jpayne@68: # * 0 if no more modifications are present, jpayne@68: # * -1 on failure. jpayne@68: # * jpayne@68: # * Unlike bam_mods_at_next_pos this skips ahead to the next site jpayne@68: # * with modifications. jpayne@68: # * jpayne@68: # * If more than n_mods modifications are found, the total found is returned. jpayne@68: # * Note this means the caller needs to check whether this is higher than jpayne@68: # * n_mods. jpayne@68: # */ jpayne@68: jpayne@68: int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,hts_base_mod *mods, int n_mods, int *pos) jpayne@68: jpayne@68: # *********************************** jpayne@68: # * BAQ calculation and realignment * jpayne@68: # ***********************************/ jpayne@68: int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres) jpayne@68: int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag) jpayne@68: jpayne@68: jpayne@68: cdef extern from "htslib/faidx.h" nogil: jpayne@68: jpayne@68: ctypedef struct faidx_t: jpayne@68: pass jpayne@68: jpayne@68: # /// Build index for a FASTA or bgzip-compressed FASTA file. jpayne@68: # /** @param fn FASTA file name jpayne@68: # @param fnfai Name of .fai file to build. jpayne@68: # @param fngzi Name of .gzi file to build (if fn is bgzip-compressed). jpayne@68: # @return 0 on success; or -1 on failure jpayne@68: jpayne@68: # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. jpayne@68: # If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI jpayne@68: # file will only be built if fn is bgzip-compressed. jpayne@68: # */ jpayne@68: int fai_build3(const char *fn, jpayne@68: const char *fnfai, jpayne@68: const char *fngzi) jpayne@68: jpayne@68: # /// Build index for a FASTA or bgzip-compressed FASTA file. jpayne@68: # /** @param fn FASTA file name jpayne@68: # @return 0 on success; or -1 on failure jpayne@68: # jpayne@68: # File "fn.fai" will be generated. This function is equivalent to jpayne@68: # fai_build3(fn, NULL, NULL); jpayne@68: # */ jpayne@68: int fai_build(char *fn) jpayne@68: jpayne@68: # /// Destroy a faidx_t struct jpayne@68: void fai_destroy(faidx_t *fai) jpayne@68: jpayne@68: # /// Load FASTA indexes. jpayne@68: # /** @param fn File name of the FASTA file (can be compressed with bgzip). jpayne@68: # @param fnfai File name of the FASTA index. jpayne@68: # @param fngzi File name of the bgzip index. jpayne@68: # @param flags Option flags to control index file caching and creation. jpayne@68: # @return Pointer to a faidx_t struct on success, NULL on failure. jpayne@68: jpayne@68: # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. jpayne@68: # If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name. jpayne@68: # The bgzip index is only needed if fn is compressed. jpayne@68: jpayne@68: # If (flags & FAI_CREATE) is true, the index files will be built using jpayne@68: # fai_build3() if they are not already present. jpayne@68: # */ jpayne@68: faidx_t *fai_load3(const char *fn, jpayne@68: const char *fnfai, jpayne@68: const char *fngzi, jpayne@68: int flags) jpayne@68: jpayne@68: # /// Load index from "fn.fai". jpayne@68: # /** @param fn File name of the FASTA file jpayne@68: # @return Pointer to a faidx_t struct on success, NULL on failure. jpayne@68: # This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE); jpayne@68: # */ jpayne@68: faidx_t *fai_load(char *fn) jpayne@68: jpayne@68: # /// Fetch the sequence in a region jpayne@68: # /** @param fai Pointer to the faidx_t struct jpayne@68: # @param reg Region in the format "chr2:20,000-30,000" jpayne@68: # @param len Length of the region; -2 if seq not present, -1 general error jpayne@68: # @return Pointer to the sequence; `NULL` on failure jpayne@68: # The returned sequence is allocated by `malloc()` family and should be destroyed jpayne@68: # by end users by calling `free()` on it. jpayne@68: # */ jpayne@68: char *fai_fetch(faidx_t *fai, jpayne@68: char *reg, jpayne@68: int *len) jpayne@68: jpayne@68: # /// Fetch the sequence in a region jpayne@68: # /** @param fai Pointer to the faidx_t struct jpayne@68: # @param c_name Region name jpayne@68: # @param p_beg_i Beginning position number (zero-based) jpayne@68: # @param p_end_i End position number (zero-based) jpayne@68: # @param len Length of the region; -2 if c_name not present, -1 general error jpayne@68: # @return Pointer to the sequence; null on failure jpayne@68: # The returned sequence is allocated by `malloc()` family and should be destroyed jpayne@68: # by end users by calling `free()` on it. jpayne@68: # */ jpayne@68: char *faidx_fetch_seq(faidx_t *fai, jpayne@68: char *c_name, jpayne@68: int p_beg_i, jpayne@68: int p_end_i, jpayne@68: int *len) jpayne@68: jpayne@68: # /// Query if sequence is present jpayne@68: # /** @param fai Pointer to the faidx_t struct jpayne@68: # @param seq Sequence name jpayne@68: # @return 1 if present or 0 if absent jpayne@68: # */ jpayne@68: int faidx_has_seq(faidx_t *fai, const char *seq) jpayne@68: jpayne@68: # /// Fetch the number of sequences jpayne@68: # /** @param fai Pointer to the faidx_t struct jpayne@68: # @return The number of sequences jpayne@68: # */ jpayne@68: int faidx_nseq(const faidx_t *fai) jpayne@68: jpayne@68: # /// Return name of i-th sequence jpayne@68: const char *faidx_iseq(const faidx_t *fai, int i) jpayne@68: jpayne@68: # /// Return sequence length, -1 if not present jpayne@68: int faidx_seq_len(faidx_t *fai, const char *seq) jpayne@68: jpayne@68: # tabix support jpayne@68: cdef extern from "htslib/tbx.h" nogil: jpayne@68: jpayne@68: # tbx.h definitions jpayne@68: int8_t TBX_MAX_SHIFT jpayne@68: int32_t TBX_GENERIC jpayne@68: int32_t TBX_SAM jpayne@68: int32_t TBX_VCF jpayne@68: int32_t TBX_UCSC jpayne@68: jpayne@68: ctypedef struct tbx_conf_t: jpayne@68: int32_t preset jpayne@68: int32_t sc, bc, ec # seq col., beg col. and end col. jpayne@68: int32_t meta_char, line_skip jpayne@68: jpayne@68: ctypedef struct tbx_t: jpayne@68: tbx_conf_t conf jpayne@68: hts_idx_t *idx jpayne@68: void * dict jpayne@68: jpayne@68: tbx_conf_t tbx_conf_gff jpayne@68: tbx_conf_t tbx_conf_bed jpayne@68: tbx_conf_t tbx_conf_psltbl jpayne@68: tbx_conf_t tbx_conf_sam jpayne@68: tbx_conf_t tbx_conf_vcf jpayne@68: jpayne@68: void tbx_itr_destroy(hts_itr_t * iter) jpayne@68: hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int bed, int end) jpayne@68: hts_itr_t * tbx_itr_querys(tbx_t * t, char * s) jpayne@68: int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data) jpayne@68: jpayne@68: int tbx_name2id(tbx_t *tbx, char *ss) jpayne@68: jpayne@68: int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf) jpayne@68: int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf) jpayne@68: jpayne@68: tbx_t * tbx_index_load(char *fn) jpayne@68: tbx_t *tbx_index_load2(const char *fn, const char *fnidx) jpayne@68: tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags) jpayne@68: jpayne@68: # free the array but not the values jpayne@68: char **tbx_seqnames(tbx_t *tbx, int *n) jpayne@68: jpayne@68: void tbx_destroy(tbx_t *tbx) jpayne@68: jpayne@68: jpayne@68: # VCF/BCF API jpayne@68: cdef extern from "htslib/vcf.h" nogil: jpayne@68: jpayne@68: # Header struct jpayne@68: jpayne@68: uint8_t BCF_HL_FLT # header line jpayne@68: uint8_t BCF_HL_INFO jpayne@68: uint8_t BCF_HL_FMT jpayne@68: uint8_t BCF_HL_CTG jpayne@68: uint8_t BCF_HL_STR # structured header line TAG= jpayne@68: uint8_t BCF_HL_GEN # generic header line jpayne@68: jpayne@68: uint8_t BCF_HT_FLAG # header type jpayne@68: uint8_t BCF_HT_INT jpayne@68: uint8_t BCF_HT_REAL jpayne@68: uint8_t BCF_HT_STR jpayne@68: jpayne@68: uint8_t BCF_VL_FIXED # variable length jpayne@68: uint8_t BCF_VL_VAR jpayne@68: uint8_t BCF_VL_A jpayne@68: uint8_t BCF_VL_G jpayne@68: uint8_t BCF_VL_R jpayne@68: jpayne@68: # === Dictionary === jpayne@68: # jpayne@68: # The header keeps three dictionaries. The first keeps IDs in the jpayne@68: # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths jpayne@68: # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[] jpayne@68: # is the actual hash table, which is opaque to the end users. In the hash jpayne@68: # table, the key is the ID or sample name as a C string and the value is a jpayne@68: # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash jpayne@68: # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the jpayne@68: # size of the hash table or, equivalently, the length of the id[] arrays. jpayne@68: jpayne@68: uint8_t BCF_DT_ID # dictionary type jpayne@68: uint8_t BCF_DT_CTG jpayne@68: uint8_t BCF_DT_SAMPLE jpayne@68: jpayne@68: # Complete textual representation of a header line jpayne@68: ctypedef struct bcf_hrec_t: jpayne@68: int type # One of the BCF_HL_* type jpayne@68: char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc. jpayne@68: char *value # Set only for generic lines, NULL for FILTER/INFO, etc. jpayne@68: int nkeys # Number of structured fields jpayne@68: char **keys # The key=value pairs jpayne@68: char **vals jpayne@68: jpayne@68: ctypedef struct bcf_idinfo_t: jpayne@68: uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2] jpayne@68: bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG jpayne@68: int id jpayne@68: jpayne@68: ctypedef struct bcf_idpair_t: jpayne@68: const char *key jpayne@68: const bcf_idinfo_t *val jpayne@68: jpayne@68: ctypedef struct bcf_hdr_t: jpayne@68: int32_t n[3] # n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI) jpayne@68: bcf_idpair_t *id[3] jpayne@68: void *dict[3] # ID dictionary, contig dict and sample dict jpayne@68: char **samples jpayne@68: bcf_hrec_t **hrec jpayne@68: int nhrec, dirty jpayne@68: int ntransl jpayne@68: int *transl[2] # for bcf_translate() jpayne@68: int nsamples_ori # for bcf_hdr_set_samples() jpayne@68: uint8_t *keep_samples jpayne@68: kstring_t mem jpayne@68: int32_t m[3] # m: allocated size of the dictionary block in use (see n above) jpayne@68: jpayne@68: uint8_t bcf_type_shift[] jpayne@68: jpayne@68: # * VCF record * jpayne@68: jpayne@68: uint8_t BCF_BT_NULL jpayne@68: uint8_t BCF_BT_INT8 jpayne@68: uint8_t BCF_BT_INT16 jpayne@68: uint8_t BCF_BT_INT32 jpayne@68: uint8_t BCF_BT_FLOAT jpayne@68: uint8_t BCF_BT_CHAR jpayne@68: jpayne@68: uint8_t VCF_REF jpayne@68: uint8_t VCF_SNP jpayne@68: uint8_t VCF_MNP jpayne@68: uint8_t VCF_INDEL jpayne@68: uint8_t VCF_OTHER jpayne@68: uint8_t VCF_BND jpayne@68: uint8_t VCF_OVERLAP jpayne@68: jpayne@68: jpayne@68: ctypedef struct variant_t: jpayne@68: int type, n # variant type and the number of bases affected, negative for deletions jpayne@68: jpayne@68: ctypedef struct bcf_fmt_t: jpayne@68: int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key jpayne@68: int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types jpayne@68: uint8_t *p # same as vptr and vptr_* in bcf_info_t below jpayne@68: uint32_t p_len jpayne@68: uint32_t p_off jpayne@68: uint8_t p_free jpayne@68: jpayne@68: union bcf_info_union_t: jpayne@68: int32_t i # integer value jpayne@68: float f # float value jpayne@68: jpayne@68: ctypedef struct bcf_info_t: jpayne@68: int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key jpayne@68: int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars jpayne@68: jpayne@68: # v1 union only set if $len==1; for easier access jpayne@68: bcf_info_union_t v1 jpayne@68: uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes jpayne@68: uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset jpayne@68: uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes jpayne@68: uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new jpayne@68: # data block is bigger than the original jpayne@68: jpayne@68: uint8_t BCF1_DIRTY_ID jpayne@68: uint8_t BCF1_DIRTY_ALS jpayne@68: uint8_t BCF1_DIRTY_FLT jpayne@68: uint8_t BCF1_DIRTY_INF jpayne@68: jpayne@68: ctypedef struct bcf_dec_t: jpayne@68: int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change jpayne@68: int n_flt # Number of FILTER fields jpayne@68: int *flt # FILTER keys in the dictionary jpayne@68: char *id # ID jpayne@68: char *als # REF+ALT block (\0-seperated) jpayne@68: char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated jpayne@68: bcf_info_t *info # INFO jpayne@68: bcf_fmt_t *fmt # FORMAT and individual sample jpayne@68: variant_t *var # $var and $var_type set only when set_variant_types called jpayne@68: int n_var, var_type jpayne@68: int shared_dirty # if set, shared.s must be recreated on BCF output jpayne@68: int indiv_dirty # if set, indiv.s must be recreated on BCF output jpayne@68: jpayne@68: uint8_t BCF_ERR_CTG_UNDEF jpayne@68: uint8_t BCF_ERR_TAG_UNDEF jpayne@68: uint8_t BCF_ERR_NCOLS jpayne@68: uint8_t BCF_ERR_LIMITS jpayne@68: uint8_t BCF_ERR_CHAR jpayne@68: uint8_t BCF_ERR_CTG_INVALID jpayne@68: uint8_t BCF_ERR_TAG_INVALID jpayne@68: jpayne@68: # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file jpayne@68: # is slower because the string is first to be parsed, packed into BCF line jpayne@68: # (done in vcf_parse), then unpacked into internal bcf1_t structure. If it jpayne@68: # is known in advance that some of the fields will not be required (notably jpayne@68: # the sample columns), parsing of these can be skipped by setting max_unpack jpayne@68: # appropriately. jpayne@68: # Similarly, it is fast to output a BCF line because the columns (kept in jpayne@68: # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF jpayne@68: # line must be formatted in vcf_format. jpayne@68: jpayne@68: ctypedef struct bcf1_t: jpayne@68: int32_t rid # CHROM jpayne@68: int32_t pos # POS jpayne@68: int32_t rlen # length of REF jpayne@68: float qual # QUAL jpayne@68: uint32_t n_info, n_allele jpayne@68: uint32_t n_fmt, n_sample jpayne@68: kstring_t shared, indiv jpayne@68: bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() jpayne@68: int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed jpayne@68: int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work jpayne@68: int unpack_size[3] # the original block size of ID, REF+ALT and FILTER jpayne@68: int errcode # one of BCF_ERR_* codes jpayne@68: jpayne@68: ####### API ####### jpayne@68: jpayne@68: # BCF and VCF I/O jpayne@68: # jpayne@68: # A note about naming conventions: htslib internally represents VCF jpayne@68: # records as bcf1_t data structures, therefore most functions are jpayne@68: # prefixed with bcf_. There are a few exceptions where the functions must jpayne@68: # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In jpayne@68: # these cases, functions prefixed with bcf_ are more general and work jpayne@68: # with both BCF and VCF. jpayne@68: jpayne@68: # bcf_hdr_init() - create an empty BCF header. jpayne@68: # @param mode "r" or "w" jpayne@68: # jpayne@68: # When opened for writing, the mandatory fileFormat and jpayne@68: # FILTER=PASS lines are added automatically. jpayne@68: bcf_hdr_t *bcf_hdr_init(const char *mode) jpayne@68: jpayne@68: # Destroy a BCF header struct jpayne@68: void bcf_hdr_destroy(bcf_hdr_t *h) jpayne@68: jpayne@68: # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) jpayne@68: bcf1_t *bcf_init() jpayne@68: jpayne@68: # Deallocate a bcf1_t object jpayne@68: void bcf_destroy(bcf1_t *v) jpayne@68: jpayne@68: # Same as bcf_destroy() but frees only the memory allocated by bcf1_t, jpayne@68: # not the bcf1_t object itself. jpayne@68: void bcf_empty(bcf1_t *v) jpayne@68: jpayne@68: # Make the bcf1_t object ready for next read. Intended mostly for jpayne@68: # internal use, the user should rarely need to call this function jpayne@68: # directly. jpayne@68: void bcf_clear(bcf1_t *v) jpayne@68: jpayne@68: # Reads VCF or BCF header jpayne@68: bcf_hdr_t *bcf_hdr_read(htsFile *fp) jpayne@68: jpayne@68: # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed jpayne@68: # @samples: samples to include or exclude from file or as a comma-separated string. jpayne@68: # LIST|FILE .. select samples in list/file jpayne@68: # ^LIST|FILE .. exclude samples from list/file jpayne@68: # - .. include all samples jpayne@68: # NULL .. exclude all samples jpayne@68: # @is_file: @samples is a file (1) or a comma-separated list (0) jpayne@68: # jpayne@68: # The bottleneck of VCF reading is parsing of genotype fields. If the jpayne@68: # reader knows in advance that only subset of samples is needed (possibly jpayne@68: # no samples at all), the performance of bcf_read() can be significantly jpayne@68: # improved by calling bcf_hdr_set_samples after bcf_hdr_read(). jpayne@68: # The function bcf_read() will subset the VCF/BCF records automatically jpayne@68: # with the notable exception when reading records via bcf_itr_next(). jpayne@68: # In this case, bcf_subset_format() must be called explicitly, because jpayne@68: # bcf_readrec() does not see the header. jpayne@68: # jpayne@68: # Returns 0 on success, -1 on error or a positive integer if the list jpayne@68: # contains samples not present in the VCF header. In such a case, the jpayne@68: # return value is the index of the offending sample. jpayne@68: # jpayne@68: int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) jpayne@68: int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) jpayne@68: jpayne@68: # Writes VCF or BCF header jpayne@68: int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) jpayne@68: jpayne@68: # Parse VCF line contained in kstring and populate the bcf1_t struct jpayne@68: int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) jpayne@68: jpayne@68: # The opposite of vcf_parse. It should rarely be called directly, see vcf_write jpayne@68: int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) jpayne@68: jpayne@68: # bcf_read() - read next VCF or BCF record jpayne@68: # jpayne@68: # Returns -1 on critical errors, 0 otherwise. On errors which are not jpayne@68: # critical for reading, such as missing header definitions, v->errcode is jpayne@68: # set to one of BCF_ERR* code and must be checked before calling jpayne@68: # vcf_write(). jpayne@68: int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) jpayne@68: jpayne@68: # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field) jpayne@68: # jpayne@68: # Note that bcf_unpack() must be called even when reading VCF. It is safe jpayne@68: # to call the function repeatedly, it will not unpack the same field jpayne@68: # twice. jpayne@68: uint8_t BCF_UN_STR # up to ALT inclusive jpayne@68: uint8_t BCF_UN_FLT # up to FILTER jpayne@68: uint8_t BCF_UN_INFO # up to INFO jpayne@68: uint8_t BCF_UN_SHR # all shared information jpayne@68: uint8_t BCF_UN_FMT # unpack format and each sample jpayne@68: uint8_t BCF_UN_IND # a synonymo of BCF_UN_FMT jpayne@68: uint8_t BCF_UN_ALL # everything jpayne@68: jpayne@68: int bcf_unpack(bcf1_t *b, int which) jpayne@68: jpayne@68: # bcf_dup() - create a copy of BCF record. jpayne@68: # jpayne@68: # Note that bcf_unpack() must be called on the returned copy as if it was jpayne@68: # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src) jpayne@68: # internally to reflect any changes made by bcf_update_* functions. jpayne@68: bcf1_t *bcf_dup(bcf1_t *src) jpayne@68: bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) jpayne@68: jpayne@68: # bcf_write() - write one VCF or BCF record. The type is determined at the open() call. jpayne@68: int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v) jpayne@68: jpayne@68: # The following functions work only with VCFs and should rarely be called jpayne@68: # directly. Usually one wants to use their bcf_* alternatives, which work jpayne@68: # transparently with both VCFs and BCFs. jpayne@68: bcf_hdr_t *vcf_hdr_read(htsFile *fp) jpayne@68: int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) jpayne@68: int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) jpayne@68: int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) jpayne@68: jpayne@68: #************************************************************************ jpayne@68: # Header querying and manipulation routines jpayne@68: #************************************************************************ jpayne@68: jpayne@68: # Create a new header using the supplied template jpayne@68: bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr) jpayne@68: jpayne@68: # Copy header lines from src to dst if not already present in dst. See also bcf_translate(). jpayne@68: # Returns 0 on success or sets a bit on error: jpayne@68: # 1 .. conflicting definitions of tag length jpayne@68: # # todo jpayne@68: int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) jpayne@68: jpayne@68: # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate() jpayne@68: # @param dst: the destination header to be merged into, NULL on the first pass jpayne@68: # @param src: the source header jpayne@68: # jpayne@68: # Notes: jpayne@68: # - use as: jpayne@68: # bcf_hdr_t *dst = NULL; jpayne@68: # for (i=0; i 0 ) jpayne@68: # for (i=0; i=0 jpayne@68: # jpayne@68: # The returned values are: jpayne@68: # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_* jpayne@68: # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields jpayne@68: # bcf_hdr_id2type .. the field type, one of BCF_HT_* jpayne@68: # bcf_hdr_id2coltype .. the column type, one of BCF_HL_* jpayne@68: # jpayne@68: # Notes: Prior to using the macros, the presence of the info should be jpayne@68: # tested with bcf_hdr_idinfo_exists(). jpayne@68: # jpayne@68: int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id) jpayne@68: int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id) jpayne@68: int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id) jpayne@68: int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id) jpayne@68: int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id) jpayne@68: bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id) jpayne@68: jpayne@68: void bcf_fmt_array(kstring_t *s, int n, int type, void *data) jpayne@68: uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) jpayne@68: jpayne@68: void bcf_enc_vchar(kstring_t *s, int l, const char *a) jpayne@68: void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) jpayne@68: void bcf_enc_vfloat(kstring_t *s, int n, float *a) jpayne@68: jpayne@68: #************************************************************************ jpayne@68: # BCF index jpayne@68: # jpayne@68: # Note that these functions work with BCFs only. See synced_bcf_reader.h jpayne@68: # which provides (amongst other things) an API to work transparently with jpayne@68: # both indexed BCFs and VCFs. jpayne@68: #************************************************************************ jpayne@68: jpayne@68: hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx) jpayne@68: hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags) jpayne@68: int bcf_index_build(const char *fn, int min_shift) jpayne@68: int bcf_index_build2(const char *fn, const char *fnidx, int min_shift) jpayne@68: jpayne@68: #******************* jpayne@68: # Typed value I/O * jpayne@68: #****************** jpayne@68: jpayne@68: # Note that in contrast with BCFv2.1 specification, HTSlib implementation jpayne@68: # allows missing values in vectors. For integer types, the values 0x80, jpayne@68: # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001, jpayne@68: # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of jpayne@68: # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an jpayne@68: # end-of-vector indicator. jpayne@68: # Note that the end-of-vector byte is not part of the vector. jpayne@68: jpayne@68: # This trial BCF version (v2.2) is compatible with the VCF specification and jpayne@68: # enables to handle correctly vectors with different ploidy in presence of jpayne@68: # missing values. jpayne@68: jpayne@68: int32_t bcf_int8_vector_end jpayne@68: int32_t bcf_int16_vector_end jpayne@68: int32_t bcf_int32_vector_end jpayne@68: int32_t bcf_str_vector_end jpayne@68: int32_t bcf_int8_missing jpayne@68: int32_t bcf_int16_missing jpayne@68: int32_t bcf_int32_missing jpayne@68: int32_t bcf_str_missing jpayne@68: jpayne@68: uint32_t bcf_float_vector_end jpayne@68: uint32_t bcf_float_missing jpayne@68: jpayne@68: void bcf_float_set(float *ptr, uint32_t value) jpayne@68: void bcf_float_set_vector_end(float *x) jpayne@68: void bcf_float_set_missing(float *x) jpayne@68: jpayne@68: int bcf_float_is_missing(float f) jpayne@68: int bcf_float_is_vector_end(float f) jpayne@68: void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) jpayne@68: void bcf_enc_size(kstring_t *s, int size, int type) jpayne@68: int bcf_enc_inttype(long x) jpayne@68: void bcf_enc_int1(kstring_t *s, int32_t x) jpayne@68: int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) jpayne@68: int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) jpayne@68: int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type) jpayne@68: jpayne@68: # These trivial wrappers are defined only for consistency with other parts of htslib jpayne@68: bcf1_t *bcf_init1() jpayne@68: int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) jpayne@68: int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) jpayne@68: int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) jpayne@68: int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) jpayne@68: void bcf_destroy1(bcf1_t *v) jpayne@68: void bcf_empty1(bcf1_t *v) jpayne@68: int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) jpayne@68: void bcf_clear1(bcf1_t *v) jpayne@68: int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) jpayne@68: jpayne@68: # Other nice wrappers jpayne@68: void bcf_itr_destroy(hts_itr_t *iter) jpayne@68: hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) jpayne@68: hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s) jpayne@68: int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r) jpayne@68: hts_idx_t *bcf_index_load(const char *fn) jpayne@68: const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr) jpayne@68: jpayne@68: jpayne@68: # VCF/BCF utility functions jpayne@68: cdef extern from "htslib/vcfutils.h" nogil: jpayne@68: struct kbitset_t jpayne@68: jpayne@68: # bcf_trim_alleles() - remove ALT alleles unused in genotype fields jpayne@68: # @header: for access to BCF_DT_ID dictionary jpayne@68: # @line: VCF line obtain from vcf_parse1 jpayne@68: # jpayne@68: # Returns the number of removed alleles on success or negative jpayne@68: # on error: jpayne@68: # -1 .. some allele index is out of bounds jpayne@68: int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) jpayne@68: jpayne@68: # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask jpayne@68: # @header: for access to BCF_DT_ID dictionary jpayne@68: # @line: VCF line obtained from vcf_parse1 jpayne@68: # @mask: alleles to remove jpayne@68: # jpayne@68: # If you have more than 31 alleles, then the integer bit mask will jpayne@68: # overflow, so use bcf_remove_allele_set instead jpayne@68: void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask) jpayne@68: jpayne@68: # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set jpayne@68: # @header: for access to BCF_DT_ID dictionary jpayne@68: # @line: VCF line obtained from vcf_parse1 jpayne@68: # @rm_set: pointer to kbitset_t object with bits set for allele jpayne@68: # indexes to remove jpayne@68: # jpayne@68: # Number=A,R,G INFO and FORMAT fields will be updated accordingly. jpayne@68: void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set) jpayne@68: jpayne@68: # bcf_calc_ac() - calculate the number of REF and ALT alleles jpayne@68: # @header: for access to BCF_DT_ID dictionary jpayne@68: # @line: VCF line obtained from vcf_parse1 jpayne@68: # @ac: array of length line->n_allele jpayne@68: # @which: determine if INFO/AN,AC and indv fields be used jpayne@68: # jpayne@68: # Returns 1 if the call succeeded, or 0 if the value could not jpayne@68: # be determined. jpayne@68: # jpayne@68: # The value of @which determines if existing INFO/AC,AN can be jpayne@68: # used (BCF_UN_INFO) and and if indv fields can be split (BCF_UN_FMT). jpayne@68: int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) jpayne@68: jpayne@68: # bcf_gt_type() - determines type of the genotype jpayne@68: # @fmt_ptr: the GT format field as set for example by set_fmt_ptr jpayne@68: # @isample: sample index (starting from 0) jpayne@68: # @ial: index of the 1st non-reference allele (starting from 1) jpayne@68: # @jal: index of the 2nd non-reference allele (starting from 1) jpayne@68: # jpayne@68: # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA, jpayne@68: # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial jpayne@68: # is not NULL and the genotype has one or more non-reference jpayne@68: # alleles, $ial will be set. In case of GT_HET_AA, $ial is the jpayne@68: # position of the allele which appeared first in ALT. If $jal is jpayne@68: # not null and the genotype is GT_HET_AA, $jal will be set and is jpayne@68: # the position of the second allele in ALT. jpayne@68: uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation jpayne@68: uint8_t GT_HOM_AA jpayne@68: uint8_t GT_HET_RA jpayne@68: uint8_t GT_HET_AA jpayne@68: uint8_t GT_HAPL_R jpayne@68: uint8_t GT_HAPL_A jpayne@68: uint8_t GT_UNKN jpayne@68: int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal) jpayne@68: jpayne@68: int bcf_acgt2int(char c) jpayne@68: char bcf_int2acgt(int i) jpayne@68: jpayne@68: # bcf_ij2G() - common task: allele indexes to Number=G index (diploid) jpayne@68: # @i,j: allele indexes, 0-based, i<=j jpayne@68: # Returns index to the Number=G diploid array jpayne@68: uint32_t bcf_ij2G(uint32_t i, uint32_t j) jpayne@68: jpayne@68: jpayne@68: cdef extern from "htslib/cram.h" nogil: jpayne@68: jpayne@68: enum cram_block_method: jpayne@68: ERROR jpayne@68: RAW jpayne@68: GZIP jpayne@68: BZIP2 jpayne@68: LZMA jpayne@68: RANS jpayne@68: RANS0 jpayne@68: RANS1 jpayne@68: GZIP_RLE jpayne@68: jpayne@68: enum cram_content_type: jpayne@68: CT_ERROR jpayne@68: FILE_HEADER jpayne@68: COMPRESSION_HEADER jpayne@68: MAPPED_SLICE jpayne@68: UNMAPPED_SLICE jpayne@68: EXTERNAL jpayne@68: CORE jpayne@68: jpayne@68: # Opaque data types, see cram_structs for the fully fledged versions. jpayne@68: ctypedef struct SAM_hdr jpayne@68: ctypedef struct cram_file_def jpayne@68: ctypedef struct cram_fd jpayne@68: ctypedef struct cram_container jpayne@68: ctypedef struct cram_block jpayne@68: ctypedef struct cram_slice jpayne@68: ctypedef struct cram_metrics jpayne@68: ctypedef struct cram_block_slice_hdr jpayne@68: ctypedef struct cram_block_compression_hdr jpayne@68: ctypedef struct refs_t jpayne@68: jpayne@68: # Accessor functions jpayne@68: jpayne@68: # jpayne@68: #----------------------------------------------------------------------------- jpayne@68: # cram_fd jpayne@68: # jpayne@68: SAM_hdr *cram_fd_get_header(cram_fd *fd) jpayne@68: void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr) jpayne@68: jpayne@68: int cram_fd_get_version(cram_fd *fd) jpayne@68: void cram_fd_set_version(cram_fd *fd, int vers) jpayne@68: jpayne@68: int cram_major_vers(cram_fd *fd) jpayne@68: int cram_minor_vers(cram_fd *fd) jpayne@68: jpayne@68: hFILE *cram_fd_get_fp(cram_fd *fd) jpayne@68: void cram_fd_set_fp(cram_fd *fd, hFILE *fp) jpayne@68: jpayne@68: # jpayne@68: #----------------------------------------------------------------------------- jpayne@68: # cram_container jpayne@68: # jpayne@68: int32_t cram_container_get_length(cram_container *c) jpayne@68: void cram_container_set_length(cram_container *c, int32_t length) jpayne@68: int32_t cram_container_get_num_blocks(cram_container *c) jpayne@68: void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) jpayne@68: int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) jpayne@68: void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks, jpayne@68: int32_t *landmarks) jpayne@68: jpayne@68: # Returns true if the container is empty (EOF marker) */ jpayne@68: int cram_container_is_empty(cram_fd *fd) jpayne@68: jpayne@68: jpayne@68: # jpayne@68: #----------------------------------------------------------------------------- jpayne@68: # cram_block jpayne@68: # jpayne@68: int32_t cram_block_get_content_id(cram_block *b) jpayne@68: int32_t cram_block_get_comp_size(cram_block *b) jpayne@68: int32_t cram_block_get_uncomp_size(cram_block *b) jpayne@68: int32_t cram_block_get_crc32(cram_block *b) jpayne@68: void * cram_block_get_data(cram_block *b) jpayne@68: jpayne@68: cram_content_type cram_block_get_content_type(cram_block *b) jpayne@68: jpayne@68: void cram_block_set_content_id(cram_block *b, int32_t id) jpayne@68: void cram_block_set_comp_size(cram_block *b, int32_t size) jpayne@68: void cram_block_set_uncomp_size(cram_block *b, int32_t size) jpayne@68: void cram_block_set_crc32(cram_block *b, int32_t crc) jpayne@68: void cram_block_set_data(cram_block *b, void *data) jpayne@68: jpayne@68: int cram_block_append(cram_block *b, void *data, int size) jpayne@68: void cram_block_update_size(cram_block *b) jpayne@68: jpayne@68: # Offset is known as "size" internally, but it can be confusing. jpayne@68: size_t cram_block_get_offset(cram_block *b) jpayne@68: void cram_block_set_offset(cram_block *b, size_t offset) jpayne@68: jpayne@68: # jpayne@68: # Computes the size of a cram block, including the block jpayne@68: # header itself. jpayne@68: # jpayne@68: uint32_t cram_block_size(cram_block *b) jpayne@68: jpayne@68: # jpayne@68: # Renumbers RG numbers in a cram compression header. jpayne@68: # jpayne@68: # CRAM stores RG as the Nth number in the header, rather than a jpayne@68: # string holding the ID: tag. This is smaller in space, but means jpayne@68: # "samtools cat" to join files together that contain single but jpayne@68: # different RG lines needs a way of renumbering them. jpayne@68: # jpayne@68: # The file descriptor is expected to be immediately after the jpayne@68: # cram_container structure (ie before the cram compression header). jpayne@68: # Due to the nature of the CRAM format, this needs to read and write jpayne@68: # the blocks itself. Note that there may be multiple slices within jpayne@68: # the container, meaning multiple compression headers to manipulate. jpayne@68: # Changing RG may change the size of the compression header and jpayne@68: # therefore the length field in the container. Hence we rewrite all jpayne@68: # blocks just in case and also emit the adjusted container. jpayne@68: # jpayne@68: # The current implementation can only cope with renumbering a single jpayne@68: # RG (and only then if it is using HUFFMAN or BETA codecs). In jpayne@68: # theory it *may* be possible to renumber multiple RGs if they use jpayne@68: # HUFFMAN to the CORE block or use an external block unshared by any jpayne@68: # other data series. So we have an API that can be upgraded to jpayne@68: # support this, but do not implement it for now. An example jpayne@68: # implementation of RG as an EXTERNAL block would be to find that jpayne@68: # block and rewrite it, returning the number of blocks consumed. jpayne@68: # jpayne@68: # Returns 0 on success; jpayne@68: # -1 if unable to edit; jpayne@68: # -2 on other errors (eg I/O). jpayne@68: # jpayne@68: int cram_transcode_rg(cram_fd *input, cram_fd *output, jpayne@68: cram_container *c, jpayne@68: int nrg, int *in_rg, int *out_rg) jpayne@68: jpayne@68: # jpayne@68: # Copies the blocks representing the next num_slice slices from a jpayne@68: # container from 'in' to 'out'. It is expected that the file pointer jpayne@68: # is just after the read of the cram_container and cram compression jpayne@68: # header. jpayne@68: # jpayne@68: # Returns 0 on success jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice) jpayne@68: jpayne@68: # jpayne@68: #----------------------------------------------------------------------------- jpayne@68: # SAM_hdr jpayne@68: # jpayne@68: jpayne@68: # Tokenises a SAM header into a hash table. jpayne@68: # jpayne@68: # Also extracts a few bits on specific data types, such as @RG lines. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns a SAM_hdr struct on success (free with sam_hdr_free()) jpayne@68: # NULL on failure jpayne@68: # jpayne@68: SAM_hdr *sam_hdr_parse_(const char *hdr, int len) jpayne@68: jpayne@68: jpayne@68: # jpayne@68: #----------------------------------------------------------------------------- jpayne@68: # cram_io basics jpayne@68: # jpayne@68: jpayne@68: # CRAM blocks - the dynamically growable data block. We have code to jpayne@68: # create, update, (un)compress and read/write. jpayne@68: # jpayne@68: # These are derived from the deflate_interlaced.c blocks, but with the jpayne@68: # CRAM extension of content types and IDs. jpayne@68: # jpayne@68: jpayne@68: # Allocates a new cram_block structure with a specified content_type and jpayne@68: # id. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns block pointer on success; jpayne@68: # NULL on failure jpayne@68: # jpayne@68: cram_block *cram_new_block(cram_content_type content_type, jpayne@68: int content_id) jpayne@68: jpayne@68: # Reads a block from a cram file. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns cram_block pointer on success; jpayne@68: # NULL on failure jpayne@68: # jpayne@68: cram_block *cram_read_block(cram_fd *fd) jpayne@68: jpayne@68: # Writes a CRAM block. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_write_block(cram_fd *fd, cram_block *b) jpayne@68: jpayne@68: # Frees a CRAM block, deallocating internal data too. jpayne@68: # jpayne@68: void cram_free_block(cram_block *b) jpayne@68: jpayne@68: # Uncompresses a CRAM block, if compressed. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_uncompress_block(cram_block *b) jpayne@68: jpayne@68: # Compresses a block. jpayne@68: # jpayne@68: # Compresses a block using one of two different zlib strategies. If we only jpayne@68: # want one choice set strat2 to be -1. jpayne@68: # jpayne@68: # The logic here is that sometimes Z_RLE does a better job than Z_FILTERED jpayne@68: # or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is jpayne@68: # significantly faster. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, jpayne@68: int method, int level) jpayne@68: jpayne@68: # Containers jpayne@68: # jpayne@68: jpayne@68: # Creates a new container, specifying the maximum number of slices jpayne@68: # and records permitted. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns cram_container ptr on success; jpayne@68: # NULL on failure jpayne@68: # jpayne@68: cram_container *cram_new_container(int nrec, int nslice) jpayne@68: void cram_free_container(cram_container *c) jpayne@68: jpayne@68: # Reads a container header. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns cram_container on success; jpayne@68: # NULL on failure or no container left (fd->err == 0). jpayne@68: # jpayne@68: cram_container *cram_read_container(cram_fd *fd) jpayne@68: jpayne@68: # Writes a container structure. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_write_container(cram_fd *fd, cram_container *h) jpayne@68: jpayne@68: # jpayne@68: # Stores the container structure in dat and returns *size as the jpayne@68: # number of bytes written to dat[]. The input size of dat is also jpayne@68: # held in *size and should be initialised to cram_container_size(c). jpayne@68: # jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) jpayne@68: jpayne@68: int cram_container_size(cram_container *c) jpayne@68: jpayne@68: # The top-level cram opening, closing and option handling jpayne@68: # jpayne@68: jpayne@68: # Opens a CRAM file for read (mode "rb") or write ("wb"). jpayne@68: # jpayne@68: # The filename may be "-" to indicate stdin or stdout. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns file handle on success; jpayne@68: # NULL on failure. jpayne@68: # jpayne@68: cram_fd *cram_open(const char *filename, const char *mode) jpayne@68: jpayne@68: # Opens an existing stream for reading or writing. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns file handle on success; jpayne@68: # NULL on failure. jpayne@68: # jpayne@68: cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) jpayne@68: jpayne@68: # Closes a CRAM file. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_close(cram_fd *fd) jpayne@68: jpayne@68: # jpayne@68: # Seek within a CRAM file. jpayne@68: # jpayne@68: # Returns 0 on success jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_seek(cram_fd *fd, off_t offset, int whence) jpayne@68: jpayne@68: # jpayne@68: # Flushes a CRAM file. jpayne@68: # Useful for when writing to stdout without wishing to close the stream. jpayne@68: # jpayne@68: # Returns 0 on success jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_flush(cram_fd *fd) jpayne@68: jpayne@68: # Checks for end of file on a cram_fd stream. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 if not at end of file jpayne@68: # 1 if we hit an expected EOF (end of range or EOF block) jpayne@68: # 2 for other EOF (end of stream without EOF block) jpayne@68: # jpayne@68: int cram_eof(cram_fd *fd) jpayne@68: jpayne@68: # Sets options on the cram_fd. jpayne@68: # jpayne@68: # See CRAM_OPT_* definitions in hts.h. jpayne@68: # Use this immediately after opening. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...) jpayne@68: jpayne@68: # Sets options on the cram_fd. jpayne@68: # jpayne@68: # See CRAM_OPT_* definitions in hts.h. jpayne@68: # Use this immediately after opening. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args) jpayne@68: jpayne@68: # jpayne@68: # Attaches a header to a cram_fd. jpayne@68: # jpayne@68: # This should be used when creating a new cram_fd for writing where jpayne@68: # we have an SAM_hdr already constructed (eg from a file we've read jpayne@68: # in). jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int cram_set_header(cram_fd *fd, SAM_hdr *hdr) jpayne@68: jpayne@68: # Check if this file has a proper EOF block jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 3 if the file is a version of CRAM that does not contain EOF blocks jpayne@68: # 2 if the file is a stream and thus unseekable jpayne@68: # 1 if the file contains an EOF block jpayne@68: # 0 if the file does not contain an EOF block jpayne@68: # -1 if an error occurred whilst reading the file or we could not seek back to where we were jpayne@68: # jpayne@68: # jpayne@68: int cram_check_EOF(cram_fd *fd) jpayne@68: jpayne@68: # As int32_decoded/encode, but from/to blocks instead of cram_fd */ jpayne@68: int int32_put_blk(cram_block *b, int32_t val) jpayne@68: jpayne@68: # Deallocates all storage used by a SAM_hdr struct. jpayne@68: # jpayne@68: # This also decrements the header reference count. If after decrementing jpayne@68: # it is still non-zero then the header is assumed to be in use by another jpayne@68: # caller and the free is not done. jpayne@68: # jpayne@68: # This is a synonym for sam_hdr_dec_ref(). jpayne@68: # jpayne@68: void sam_hdr_free(SAM_hdr *hdr) jpayne@68: jpayne@68: # Returns the current length of the SAM_hdr in text form. jpayne@68: # jpayne@68: # Call sam_hdr_rebuild() first if editing has taken place. jpayne@68: # jpayne@68: int sam_hdr_length(SAM_hdr *hdr) jpayne@68: jpayne@68: # Returns the string form of the SAM_hdr. jpayne@68: # jpayne@68: # Call sam_hdr_rebuild() first if editing has taken place. jpayne@68: # jpayne@68: char *sam_hdr_str(SAM_hdr *hdr) jpayne@68: jpayne@68: # Appends a formatted line to an existing SAM header. jpayne@68: # jpayne@68: # Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with jpayne@68: # optional new-line. If it contains more than 1 line then multiple lines jpayne@68: # will be added in order. jpayne@68: # jpayne@68: # Len is the length of the text data, or 0 if unknown (in which case jpayne@68: # it should be null terminated). jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: jpayne@68: # Add an @PG line. jpayne@68: # jpayne@68: # If we wish complete control over this use sam_hdr_add() directly. This jpayne@68: # function uses that, but attempts to do a lot of tedious house work for jpayne@68: # you too. jpayne@68: # jpayne@68: # - It will generate a suitable ID if the supplied one clashes. jpayne@68: # - It will generate multiple @PG records if we have multiple PG chains. jpayne@68: # jpayne@68: # Call it as per sam_hdr_add() with a series of key,value pairs ending jpayne@68: # in NULL. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns 0 on success; jpayne@68: # -1 on failure jpayne@68: # jpayne@68: int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) jpayne@68: jpayne@68: # jpayne@68: # A function to help with construction of CL tags in @PG records. jpayne@68: # Takes an argc, argv pair and returns a single space-separated string. jpayne@68: # This string should be deallocated by the calling function. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns malloced char * on success; jpayne@68: # NULL on failure jpayne@68: # jpayne@68: char *stringify_argv(int argc, char *argv[]) jpayne@68: jpayne@68: # jpayne@68: # Returns the refs_t structure used by a cram file handle. jpayne@68: # jpayne@68: # This may be used in conjunction with option CRAM_OPT_SHARED_REF to jpayne@68: # share reference memory between multiple file handles. jpayne@68: # jpayne@68: # @return jpayne@68: # Returns NULL if none exists or the file handle is not a CRAM file. jpayne@68: # jpayne@68: refs_t *cram_get_refs(htsFile *fd) jpayne@68: jpayne@68: jpayne@68: cdef class HTSFile(object): jpayne@68: cdef htsFile *htsfile # pointer to htsFile structure jpayne@68: cdef int64_t start_offset # BGZF offset of first record jpayne@68: jpayne@68: cdef readonly object filename # filename as supplied by user jpayne@68: cdef readonly object mode # file opening mode jpayne@68: cdef readonly object threads # number of threads to use jpayne@68: cdef readonly object index_filename # filename of index, if supplied by user jpayne@68: jpayne@68: cdef readonly bint is_stream # Is htsfile a non-seekable stream jpayne@68: cdef readonly bint is_remote # Is htsfile a remote stream jpayne@68: cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh jpayne@68: jpayne@68: cdef htsFile *_open_htsfile(self) except? NULL