annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libchtslib.pxd @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 # cython: language_level=3
jpayne@69 2 from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
jpayne@69 3 from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
jpayne@69 4 from libc.stdlib cimport malloc, calloc, realloc, free
jpayne@69 5 from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
jpayne@69 6 from libc.stdio cimport FILE, printf
jpayne@69 7 from posix.types cimport off_t
jpayne@69 8
jpayne@69 9 cdef extern from "Python.h":  # declaration taken from the CPython C-API header
jpayne@69 10 FILE* PyFile_AsFile(object)  # NOTE(review): PyFile_AsFile is a Python 2 C-API function and this copy is installed under python3.8 -- confirm nothing calls it
jpayne@69 11
jpayne@69 12
jpayne@69 13 # cython does not wrap stdarg
jpayne@69 14 cdef extern from "stdarg.h":
jpayne@69 15 ctypedef struct va_list:
jpayne@69 16 pass
jpayne@69 17
jpayne@69 18
jpayne@69 19 cdef extern from "htslib/kstring.h" nogil:
jpayne@69 20 ctypedef struct kstring_t:
jpayne@69 21 size_t l, m
jpayne@69 22 char *s
jpayne@69 23
jpayne@69 24 int kputc(int c, kstring_t *s)
jpayne@69 25 int kputw(int c, kstring_t *s)
jpayne@69 26 int kputl(long c, kstring_t *s)
jpayne@69 27 int ksprintf(kstring_t *s, const char *fmt, ...)
jpayne@69 28
jpayne@69 29
jpayne@69 30 cdef extern from "htslib_util.h" nogil:
jpayne@69 31 int hts_set_verbosity(int verbosity)
jpayne@69 32 int hts_get_verbosity()
jpayne@69 33
jpayne@69 34 ctypedef uint32_t khint32_t
jpayne@69 35 ctypedef uint32_t khint_t
jpayne@69 36 ctypedef khint_t khiter_t
jpayne@69 37
jpayne@69 38 # Used to manage BCF Header info
jpayne@69 39 ctypedef struct vdict_t:
jpayne@69 40 khint_t n_buckets, size, n_occupied, upper_bound
jpayne@69 41 khint32_t *flags
jpayne@69 42 const char *keys
jpayne@69 43 bcf_idinfo_t *vals
jpayne@69 44
jpayne@69 45 # Used to manage indexed contigs in Tabix
jpayne@69 46 ctypedef struct s2i_t:
jpayne@69 47 khint_t n_buckets, size, n_occupied, upper_bound
jpayne@69 48 khint32_t *flags
jpayne@69 49 const char *keys
jpayne@69 50 int64_t *vals
jpayne@69 51
jpayne@69 52 # Generic khash methods
jpayne@69 53 khint_t kh_size(void *d)
jpayne@69 54 khint_t kh_begin(void *d)
jpayne@69 55 khint_t kh_end(void *d)
jpayne@69 56 int kh_exist(void *d, khiter_t i)
jpayne@69 57
jpayne@69 58 # Specialized khash methods for vdict
jpayne@69 59 khint_t kh_get_vdict(vdict_t *d, const char *key)
jpayne@69 60 const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i)
jpayne@69 61 bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i)
jpayne@69 62
jpayne@69 63
jpayne@69 64 cdef extern from "htslib/hfile.h" nogil:
jpayne@69 65 ctypedef struct hFILE
jpayne@69 66
jpayne@69 67 # @abstract Open the named file or URL as a stream
jpayne@69 68 # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
jpayne@69 69 hFILE *hopen(const char *filename, const char *mode, ...)
jpayne@69 70
jpayne@69 71 # @abstract Associate a stream with an existing open file descriptor
jpayne@69 72 # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
jpayne@69 73 # @notes For socket descriptors (on Windows), mode should contain 's'.
jpayne@69 74 hFILE *hdopen(int fd, const char *mode)
jpayne@69 75
jpayne@69 76 # @abstract Report whether the file name or URL denotes remote storage
jpayne@69 77 # @return 0 if local, 1 if remote.
jpayne@69 78 # @notes "Remote" means involving e.g. explicit network access, with the
jpayne@69 79 # implication that callers may wish to cache such files' contents locally.
jpayne@69 80 int hisremote(const char *filename)
jpayne@69 81
jpayne@69 82 # @abstract Flush (for output streams) and close the stream
jpayne@69 83 # @return 0 if successful, or EOF (with errno set) if an error occurred.
jpayne@69 84 int hclose(hFILE *fp)
jpayne@69 85
jpayne@69 86 # @abstract Close the stream, without flushing or propagating errors
jpayne@69 87 # @notes For use while cleaning up after an error only. Preserves errno.
jpayne@69 88 void hclose_abruptly(hFILE *fp)
jpayne@69 89
jpayne@69 90 # @abstract Return the stream's error indicator
jpayne@69 91 # @return Non-zero (in fact, an errno value) if an error has occurred.
jpayne@69 92 # @notes This would be called herror() and return true/false to parallel
jpayne@69 93 # ferror(3), but a networking-related herror(3) function already exists.
jpayne@69 94 int herrno(hFILE *fp)
jpayne@69 95
jpayne@69 96 # @abstract Clear the stream's error indicator
jpayne@69 97 void hclearerr(hFILE *fp)
jpayne@69 98
jpayne@69 99 # @abstract Reposition the read/write stream offset
jpayne@69 100 # @return The resulting offset within the stream (as per lseek(2)),
jpayne@69 101 # or negative if an error occurred.
jpayne@69 102 off_t hseek(hFILE *fp, off_t offset, int whence)
jpayne@69 103
jpayne@69 104 # @abstract Report the current stream offset
jpayne@69 105 # @return The offset within the stream, starting from zero.
jpayne@69 106 off_t htell(hFILE *fp)
jpayne@69 107
jpayne@69 108 # @abstract Read one character from the stream
jpayne@69 109 # @return The character read, or EOF on end-of-file or error
jpayne@69 110 int hgetc(hFILE *fp)
jpayne@69 111
jpayne@69 112 # Read from the stream until the delimiter, up to a maximum length
jpayne@69 113 # @param buffer The buffer into which bytes will be written
jpayne@69 114 # @param size The size of the buffer
jpayne@69 115 # @param delim The delimiter (interpreted as an `unsigned char`)
jpayne@69 116 # @param fp The file stream
jpayne@69 117 # @return The number of bytes read, or negative on error.
jpayne@69 118 # @since 1.4
jpayne@69 119 #
jpayne@69 120 # Bytes will be read into the buffer up to and including a delimiter, until
jpayne@69 121 # EOF is reached, or _size-1_ bytes have been written, whichever comes first.
jpayne@69 122 # The string will then be terminated with a NUL byte (`\0`).
jpayne@69 123 ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp)
jpayne@69 124
jpayne@69 125 # Read a line from the stream, up to a maximum length
jpayne@69 126 # @param buffer The buffer into which bytes will be written
jpayne@69 127 # @param size The size of the buffer
jpayne@69 128 # @param fp The file stream
jpayne@69 129 # @return The number of bytes read, or negative on error.
jpayne@69 130 # @since 1.4
jpayne@69 131 #
jpayne@69 132 # Specialization of hgetdelim() for a `\n` delimiter.
jpayne@69 133 ssize_t hgetln(char *buffer, size_t size, hFILE *fp)
jpayne@69 134
jpayne@69 135 # Read a line from the stream, up to a maximum length
jpayne@69 136 # @param buffer The buffer into which bytes will be written
jpayne@69 137 # @param size The size of the buffer (must be > 1 to be useful)
jpayne@69 138 # @param fp The file stream
jpayne@69 139 # @return _buffer_ on success, or `NULL` if an error occurred.
jpayne@69 140 # @since 1.4
jpayne@69 141 #
jpayne@69 142 # This function can be used as a replacement for `fgets(3)`, or together with
jpayne@69 143 # kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_.
jpayne@69 144 char *hgets(char *buffer, int size, hFILE *fp)
jpayne@69 145
jpayne@69 146 # @abstract Peek at characters to be read without removing them from buffers
jpayne@69 147 # @param fp The file stream
jpayne@69 148 # @param buffer The buffer to which the peeked bytes will be written
jpayne@69 149 # @param nbytes The number of bytes to peek at; limited by the size of the
jpayne@69 150 # internal buffer, which could be as small as 4K.
jpayne@69 151 # @return The number of bytes peeked, which may be less than nbytes if EOF
jpayne@69 152 # is encountered; or negative, if there was an I/O error.
jpayne@69 153 # @notes The characters peeked at remain in the stream's internal buffer,
jpayne@69 154 # and will be returned by later hread() etc calls.
jpayne@69 155 ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
jpayne@69 156
jpayne@69 157 # @abstract Read a block of characters from the file
jpayne@69 158 # @return The number of bytes read, or negative if an error occurred.
jpayne@69 159 # @notes The full nbytes requested will be returned, except as limited
jpayne@69 160 # by EOF or I/O errors.
jpayne@69 161 ssize_t hread(hFILE *fp, void *buffer, size_t nbytes)
jpayne@69 162
jpayne@69 163 # @abstract Write a character to the stream
jpayne@69 164 # @return The character written, or EOF if an error occurred.
jpayne@69 165 int hputc(int c, hFILE *fp)
jpayne@69 166
jpayne@69 167 # @abstract Write a string to the stream
jpayne@69 168 # @return 0 if successful, or EOF if an error occurred.
jpayne@69 169 int hputs(const char *text, hFILE *fp)
jpayne@69 170
jpayne@69 171 # @abstract Write a block of characters to the file
jpayne@69 172 # @return Either nbytes, or negative if an error occurred.
jpayne@69 173 # @notes In the absence of I/O errors, the full nbytes will be written.
jpayne@69 174 ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes)
jpayne@69 175
jpayne@69 176 # @abstract For writing streams, flush buffered output to the underlying stream
jpayne@69 177 # @return 0 if successful, or EOF if an error occurred.
jpayne@69 178 int hflush(hFILE *fp)
jpayne@69 179
jpayne@69 180
jpayne@69 181 cdef extern from "htslib/bgzf.h" nogil:
jpayne@69 182 ctypedef struct bgzf_mtaux_t
jpayne@69 183 ctypedef struct bgzidx_t
jpayne@69 184 ctypedef struct z_stream
jpayne@69 185
jpayne@69 186 ctypedef struct BGZF:
jpayne@69 187 unsigned errcode
jpayne@69 188 unsigned is_write
jpayne@69 189 int is_be
jpayne@69 190 int compress_level
jpayne@69 191 int is_compressed
jpayne@69 192 int is_gzip
jpayne@69 193 int cache_size
jpayne@69 194 int64_t block_address
jpayne@69 195 int64_t uncompressed_address
jpayne@69 196 void *uncompressed_block
jpayne@69 197 void *compressed_block
jpayne@69 198 void *cache
jpayne@69 199 hFILE *fp
jpayne@69 200 bgzf_mtaux_t *mt
jpayne@69 201 bgzidx_t *idx
jpayne@69 202 int idx_build_otf
jpayne@69 203 z_stream *gz_stream
jpayne@69 204
jpayne@69 205 #*****************
jpayne@69 206 # Basic routines *
jpayne@69 207 #*****************
jpayne@69 208
jpayne@69 209 # Open an existing file descriptor for reading or writing.
jpayne@69 210 #
jpayne@69 211 # @param fd file descriptor
jpayne@69 212 # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
jpayne@69 213 # writing, 'a' for appending, 'g' for gzip rather than BGZF
jpayne@69 214 # compression (with 'w' only), and digit specifies the zlib
jpayne@69 215 # compression level.
jpayne@69 216 # Note that there is a distinction between 'u' and '0': the
jpayne@69 217 # first yields plain uncompressed output whereas the latter
jpayne@69 218 # outputs uncompressed data wrapped in the zlib format.
jpayne@69 219 # @return BGZF file handler; 0 on error
jpayne@69 220
jpayne@69 221 BGZF* bgzf_dopen(int fd, const char *mode)
jpayne@69 222 BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility
jpayne@69 223
jpayne@69 224 # Open the specified file for reading or writing.
jpayne@69 225 BGZF* bgzf_open(const char* path, const char *mode)
jpayne@69 226
jpayne@69 227 # Open an existing hFILE stream for reading or writing.
jpayne@69 228 BGZF* bgzf_hopen(hFILE *fp, const char *mode)
jpayne@69 229
jpayne@69 230 # Close the BGZF and free all associated resources.
jpayne@69 231 #
jpayne@69 232 # @param fp BGZF file handler
jpayne@69 233 # @return 0 on success and -1 on error
jpayne@69 234 int bgzf_close(BGZF *fp)
jpayne@69 235
jpayne@69 236 # Read up to _length_ bytes from the file storing into _data_.
jpayne@69 237 #
jpayne@69 238 # @param fp BGZF file handler
jpayne@69 239 # @param data data array to read into
jpayne@69 240 # @param length size of data to read
jpayne@69 241 # @return number of bytes actually read; 0 on end-of-file and -1 on error
jpayne@69 242 ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
jpayne@69 243
jpayne@69 244 # Write _length_ bytes from _data_ to the file. If no I/O errors occur,
jpayne@69 245 # the complete _length_ bytes will be written (or queued for writing).
jpayne@69 246 #
jpayne@69 247 # @param fp BGZF file handler
jpayne@69 248 # @param data data array to write
jpayne@69 249 # @param length size of data to write
jpayne@69 250 # @return number of bytes written (i.e., _length_); negative on error
jpayne@69 251 ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
jpayne@69 252
jpayne@69 253 # Read up to _length_ bytes directly from the underlying stream without
jpayne@69 254 # decompressing. Bypasses BGZF blocking, so must be used with care in
jpayne@69 255 # specialised circumstances only.
jpayne@69 256 #
jpayne@69 257 # @param fp BGZF file handler
jpayne@69 258 # @param data data array to read into
jpayne@69 259 # @param length number of raw bytes to read
jpayne@69 260 # @return number of bytes actually read; 0 on end-of-file and -1 on error
jpayne@69 261 ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
jpayne@69 262
jpayne@69 263 # Write _length_ bytes directly to the underlying stream without
jpayne@69 264 # compressing. Bypasses BGZF blocking, so must be used with care
jpayne@69 265 # in specialised circumstances only.
jpayne@69 266 #
jpayne@69 267 # @param fp BGZF file handler
jpayne@69 268 # @param data data array to write
jpayne@69 269 # @param length number of raw bytes to write
jpayne@69 270 # @return number of bytes actually written; -1 on error
jpayne@69 271 ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
jpayne@69 272
jpayne@69 273 # Write the data in the buffer to the file.
jpayne@69 274 int bgzf_flush(BGZF *fp)
jpayne@69 275
jpayne@69 276 # Return a virtual file pointer to the current location in the file.
jpayne@69 277 # No interpretation of the value should be made, other than a subsequent
jpayne@69 278 # call to bgzf_seek can be used to position the file at the same point.
jpayne@69 279 # Return value is non-negative on success.
jpayne@69 280 int64_t bgzf_tell(BGZF *fp)
jpayne@69 281
jpayne@69 282 # Set the file to read from the location specified by _pos_.
jpayne@69 283 #
jpayne@69 284 # @param fp BGZF file handler
jpayne@69 285 # @param pos virtual file offset returned by bgzf_tell()
jpayne@69 286 # @param whence must be SEEK_SET (cimported from libc.stdio / posix.unistd)
jpayne@69 287 # @return 0 on success and -1 on error
jpayne@69 288 #
jpayne@69 289 int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence)
jpayne@69 290
jpayne@69 291 # Check if the BGZF end-of-file (EOF) marker is present
jpayne@69 292 #
jpayne@69 293 # @param fp BGZF file handler opened for reading
jpayne@69 294 # @return 1 if the EOF marker is present and correct
jpayne@69 295 # 2 if it can't be checked, e.g., because fp isn't seekable
jpayne@69 296 # 0 if the EOF marker is absent
jpayne@69 297 # -1 (with errno set) on error
jpayne@69 298 int bgzf_check_EOF(BGZF *fp)
jpayne@69 299
jpayne@69 300 # Check if a file is in the BGZF format
jpayne@69 301 #
jpayne@69 302 # @param fn file name
jpayne@69 303 # @return 1 if _fn_ is BGZF; 0 if not or on I/O error
jpayne@69 304 int bgzf_is_bgzf(const char *fn)
jpayne@69 305
jpayne@69 306 #*********************
jpayne@69 307 # Advanced routines *
jpayne@69 308 #*********************
jpayne@69 309
jpayne@69 310 # Set the cache size. Only effective when compiled with -DBGZF_CACHE.
jpayne@69 311 #
jpayne@69 312 # @param fp BGZF file handler
jpayne@69 313 # @param size size of cache in bytes; 0 to disable caching (default)
jpayne@69 314 void bgzf_set_cache_size(BGZF *fp, int size)
jpayne@69 315
jpayne@69 316 # Flush the file if the remaining buffer size is smaller than _size_
jpayne@69 317 # @return 0 if flushing succeeded or was not needed; negative on error
jpayne@69 318 int bgzf_flush_try(BGZF *fp, ssize_t size)
jpayne@69 319
jpayne@69 320 # Read one byte from a BGZF file. It is faster than bgzf_read()
jpayne@69 321 # @param fp BGZF file handler
jpayne@69 322 # @return byte read; -1 on end-of-file or error
jpayne@69 323 int bgzf_getc(BGZF *fp)
jpayne@69 324
jpayne@69 325 # Read one line from a BGZF file. It is faster than bgzf_getc()
jpayne@69 326 #
jpayne@69 327 # @param fp BGZF file handler
jpayne@69 328 # @param delim delimiter
jpayne@69 329 # @param str string to write to; must be initialized
jpayne@69 330 # @return length of the string; 0 on end-of-file; negative on error
jpayne@69 331 int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
jpayne@69 332
jpayne@69 333 # Read the next BGZF block.
jpayne@69 334 int bgzf_read_block(BGZF *fp)
jpayne@69 335
jpayne@69 336 # Enable multi-threading (only effective on writing and when the
jpayne@69 337 # library was compiled with -DBGZF_MT)
jpayne@69 338 #
jpayne@69 339 # @param fp BGZF file handler; must be opened for writing
jpayne@69 340 # @param n_threads #threads used for writing
jpayne@69 341 # @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended
jpayne@69 342 int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
jpayne@69 343
jpayne@69 344
jpayne@69 345 # Compress a single BGZF block.
jpayne@69 346 #
jpayne@69 347 # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
jpayne@69 348 # @param dlen size of output buffer; updated on return to the number
jpayne@69 349 # of bytes actually written to dst
jpayne@69 350 # @param src buffer to be compressed
jpayne@69 351 # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE)
jpayne@69 352 # @param level compression level
jpayne@69 353 # @return 0 on success and negative on error
jpayne@69 354 #
jpayne@69 355 int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level)
jpayne@69 356
jpayne@69 357 #*******************
jpayne@69 358 # bgzidx routines *
jpayne@69 359 # Position BGZF at the uncompressed offset
jpayne@69 360 #
jpayne@69 361 # @param fp BGZF file handler; must be opened for reading
jpayne@69 362 # @param uoffset file offset in the uncompressed data
jpayne@69 363 # @param where only SEEK_SET (cimported from libc.stdio) is supported at the moment
jpayne@69 364 #
jpayne@69 365 # Returns 0 on success and -1 on error.
jpayne@69 366 int bgzf_useek(BGZF *fp, long uoffset, int where)
jpayne@69 367
jpayne@69 368 # Position in uncompressed BGZF
jpayne@69 369 #
jpayne@69 370 # @param fp BGZF file handler; must be opened for reading
jpayne@69 371 #
jpayne@69 372 # Returns the current offset on success and -1 on error.
jpayne@69 373 long bgzf_utell(BGZF *fp)
jpayne@69 374
jpayne@69 375 # Tell BGZF to build index while compressing.
jpayne@69 376 #
jpayne@69 377 # @param fp BGZF file handler; can be opened for reading or writing.
jpayne@69 378 #
jpayne@69 379 # Returns 0 on success and -1 on error.
jpayne@69 380 int bgzf_index_build_init(BGZF *fp)
jpayne@69 381
jpayne@69 382 # Load BGZF index
jpayne@69 383 #
jpayne@69 384 # @param fp BGZF file handler
jpayne@69 385 # @param bname base name
jpayne@69 386 # @param suffix suffix to add to bname (can be NULL)
jpayne@69 387 #
jpayne@69 388 # Returns 0 on success and -1 on error.
jpayne@69 389 int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
jpayne@69 390
jpayne@69 391 # Save BGZF index
jpayne@69 392 #
jpayne@69 393 # @param fp BGZF file handler
jpayne@69 394 # @param bname base name
jpayne@69 395 # @param suffix suffix to add to bname (can be NULL)
jpayne@69 396 #
jpayne@69 397 # Returns 0 on success and -1 on error.
jpayne@69 398 int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
jpayne@69 399
jpayne@69 400
jpayne@69 401 cdef extern from "htslib/hts.h" nogil:
jpayne@69 402 uint32_t kroundup32(uint32_t x)
jpayne@69 403
jpayne@69 404 ctypedef struct cram_fd
jpayne@69 405
jpayne@69 406 union FilePointerUnion:
jpayne@69 407 BGZF *bgzf
jpayne@69 408 cram_fd *cram
jpayne@69 409 hFILE *hfile
jpayne@69 410 void *voidp
jpayne@69 411
jpayne@69 412 enum htsFormatCategory:
jpayne@69 413 unknown_category
jpayne@69 414 sequence_data # Sequence data -- SAM, BAM, CRAM, etc
jpayne@69 415 variant_data # Variant calling data -- VCF, BCF, etc
jpayne@69 416 index_file # Index file associated with some data file
jpayne@69 417 region_list # Coordinate intervals or regions -- BED, etc
jpayne@69 418 category_maximum
jpayne@69 419
jpayne@69 420 enum htsExactFormat:
jpayne@69 421 unknown_format
jpayne@69 422 binary_format
jpayne@69 423 text_format
jpayne@69 424 sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed
jpayne@69 425 format_maximum
jpayne@69 426
jpayne@69 427 enum htsCompression:
jpayne@69 428 no_compression, gzip, bgzf, custom
jpayne@69 429 compression_maximum
jpayne@69 430
jpayne@69 431 cdef enum hts_fmt_option:
jpayne@69 432 CRAM_OPT_DECODE_MD,
jpayne@69 433 CRAM_OPT_PREFIX,
jpayne@69 434 CRAM_OPT_VERBOSITY,
jpayne@69 435 CRAM_OPT_SEQS_PER_SLICE,
jpayne@69 436 CRAM_OPT_SLICES_PER_CONTAINER,
jpayne@69 437 CRAM_OPT_RANGE,
jpayne@69 438 CRAM_OPT_VERSION,
jpayne@69 439 CRAM_OPT_EMBED_REF,
jpayne@69 440 CRAM_OPT_IGNORE_MD5,
jpayne@69 441 CRAM_OPT_REFERENCE,
jpayne@69 442 CRAM_OPT_MULTI_SEQ_PER_SLICE,
jpayne@69 443 CRAM_OPT_NO_REF,
jpayne@69 444 CRAM_OPT_USE_BZIP2,
jpayne@69 445 CRAM_OPT_SHARED_REF,
jpayne@69 446 CRAM_OPT_NTHREADS,
jpayne@69 447 CRAM_OPT_THREAD_POOL,
jpayne@69 448 CRAM_OPT_USE_LZMA,
jpayne@69 449 CRAM_OPT_USE_RANS,
jpayne@69 450 CRAM_OPT_REQUIRED_FIELDS,
jpayne@69 451 HTS_OPT_COMPRESSION_LEVEL,
jpayne@69 452 HTS_OPT_NTHREADS,
jpayne@69 453
jpayne@69 454 ctypedef struct htsVersion:
jpayne@69 455 short major, minor
jpayne@69 456
jpayne@69 457 ctypedef struct htsFormat:
jpayne@69 458 htsFormatCategory category
jpayne@69 459 htsExactFormat format
jpayne@69 460 htsVersion version
jpayne@69 461 htsCompression compression
jpayne@69 462 short compression_level
jpayne@69 463 void *specific
jpayne@69 464
jpayne@69 465 ctypedef struct htsFile:
jpayne@69 466 uint8_t is_bin
jpayne@69 467 uint8_t is_write
jpayne@69 468 uint8_t is_be
jpayne@69 469 uint8_t is_cram
jpayne@69 470 int64_t lineno
jpayne@69 471 kstring_t line
jpayne@69 472 char *fn
jpayne@69 473 char *fn_aux
jpayne@69 474 FilePointerUnion fp
jpayne@69 475 htsFormat format
jpayne@69 476
jpayne@69 477 int hts_verbose
jpayne@69 478
jpayne@69 479 cdef union hts_opt_val_union:
jpayne@69 480 int i
jpayne@69 481 char *s
jpayne@69 482
jpayne@69 483 ctypedef struct hts_opt:
jpayne@69 484 char *arg
jpayne@69 485 hts_fmt_option opt
jpayne@69 486 hts_opt_val_union val
jpayne@69 487 void *next
jpayne@69 488
jpayne@69 489 # @abstract Parses arg and appends it to the option list.
jpayne@69 490 # @return 0 on success and -1 on failure
jpayne@69 491 int hts_opt_add(hts_opt **opts, const char *c_arg)
jpayne@69 492
jpayne@69 493 # @abstract Applies an hts_opt option list to a given htsFile.
jpayne@69 494 # @return 0 on success and -1 on failure
jpayne@69 495 int hts_opt_apply(htsFile *fp, hts_opt *opts)
jpayne@69 496
jpayne@69 497 # @abstract Frees an hts_opt list.
jpayne@69 498 void hts_opt_free(hts_opt *opts)
jpayne@69 499
jpayne@69 500 # @abstract Table for converting a nucleotide character to 4-bit encoding.
jpayne@69 501 # The input character may be either an IUPAC ambiguity code, '=' for 0, or
jpayne@69 502 # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
jpayne@69 503 # for A/C/G/T or combinations of these bits for ambiguous bases.
jpayne@69 504 const unsigned char *seq_nt16_table
jpayne@69 505
jpayne@69 506 # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
jpayne@69 507 # ambiguity code letter (or '=' when given 0).
jpayne@69 508 const char *seq_nt16_str
jpayne@69 509
jpayne@69 510 # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
jpayne@69 511 # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
jpayne@69 512 const int *seq_nt16_int
jpayne@69 513
jpayne@69 514 # @abstract Get the htslib version number
jpayne@69 515 # @return For released versions, a string like "N.N[.N]"; or git describe
jpayne@69 516 # output if using a library built within a Git repository.
jpayne@69 517 const char *hts_version()
jpayne@69 518
jpayne@69 519 # @abstract Determine format by peeking at the start of a file
jpayne@69 520 # @param fp File opened for reading, positioned at the beginning
jpayne@69 521 # @param fmt Format structure that will be filled out on return
jpayne@69 522 # @return 0 for success, or negative if an error occurred.
jpayne@69 523 int hts_detect_format(hFILE *fp, htsFormat *fmt)
jpayne@69 524
jpayne@69 525 # @abstract Get a human-readable description of the file format
jpayne@69 526 # @return Description string, to be freed by the caller after use.
jpayne@69 527 char *hts_format_description(const htsFormat *format)
jpayne@69 528
jpayne@69 529 # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
jpayne@69 530 # @param fn The file name or "-" for stdin/stdout
jpayne@69 531 # @param mode Mode matching / [rwa][bceguxz0-9]* /
jpayne@69 532 # @discussion
jpayne@69 533 # With 'r' opens for reading; any further format mode letters are ignored
jpayne@69 534 # as the format is detected by checking the first few bytes or BGZF blocks
jpayne@69 535 # of the file. With 'w' or 'a' opens for writing or appending, with format
jpayne@69 536 # specifier letters:
jpayne@69 537 # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
jpayne@69 538 # c CRAM format
jpayne@69 539 # g gzip compressed
jpayne@69 540 # u uncompressed
jpayne@69 541 # z bgzf compressed
jpayne@69 542 # [0-9] zlib compression level
jpayne@69 543 # and with non-format option letters (for any of 'r'/'w'/'a'):
jpayne@69 544 # e close the file on exec(2) (opens with O_CLOEXEC, where supported)
jpayne@69 545 # x create the file exclusively (opens with O_EXCL, where supported)
jpayne@69 546 # Note that there is a distinction between 'u' and '0': the first yields
jpayne@69 547 # plain uncompressed output whereas the latter outputs uncompressed data
jpayne@69 548 # wrapped in the zlib format.
jpayne@69 549 # @example
jpayne@69 550 # [rw]b .. compressed BCF, BAM, FAI
jpayne@69 551 # [rw]bu .. uncompressed BCF
jpayne@69 552 # [rw]z .. compressed VCF
jpayne@69 553 # [rw] .. uncompressed VCF
jpayne@69 554 htsFile *hts_open(const char *fn, const char *mode)
jpayne@69 555
jpayne@69 556 # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
jpayne@69 557 # @param fn The file name or "-" for stdin/stdout
jpayne@69 558 # @param mode Open mode, as per hts_open()
jpayne@69 559 # @param fmt Optional format specific parameters
jpayne@69 560 # @discussion
jpayne@69 561 # See hts_open() for description of fn and mode.
jpayne@69 562 # TODO: Update documentation for s/opts/fmt/
jpayne@69 563 # Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
jpayne@69 564 # if defined, override mode. Opts also contains a linked list of hts_opt
jpayne@69 565 # structures to apply to the open file handle. These can contain things
jpayne@69 566 # like pointers to the reference or information on compression levels,
jpayne@69 567 # block sizes, etc.
jpayne@69 568 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
jpayne@69 569
jpayne@69 570 # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
jpayne@69 571 # @param fp The already-open file handle
jpayne@69 572 # @param fn The file name or "-" for stdin/stdout
jpayne@69 573 # @param mode Open mode, as per hts_open()
jpayne@69 574 htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode)
jpayne@69 575
jpayne@69 576 # @abstract For output streams, flush any buffered data
jpayne@69 577 # @param fp The file handle to be flushed
jpayne@69 578 # @return 0 for success, or negative if an error occurred.
jpayne@69 579 # @since 1.14
jpayne@69 580 int hts_flush(htsFile *fp)
jpayne@69 581
jpayne@69 582 # @abstract Close a file handle, flushing buffered data for output streams
jpayne@69 583 # @param fp The file handle to be closed
jpayne@69 584 # @return 0 for success, or negative if an error occurred.
jpayne@69 585 int hts_close(htsFile *fp)
jpayne@69 586
jpayne@69 587 # @abstract Returns the file's format information
jpayne@69 588 # @param fp The file handle
jpayne@69 589 # @return Read-only pointer to the file's htsFormat.
jpayne@69 590 const htsFormat *hts_get_format(htsFile *fp)
jpayne@69 591
jpayne@69 592 # @abstract Returns a string containing the file format extension.
jpayne@69 593 # @param format Format structure containing the file type.
jpayne@69 594 # @return A string ("sam", "bam", etc) or "?" for unknown formats.
jpayne@69 595 const char *hts_format_file_extension(const htsFormat *format)
jpayne@69 596
jpayne@69 597 # @abstract Sets a specified CRAM option on the open file handle.
jpayne@69 598 # @param fp The file handle of the open file.
jpayne@69 599 # @param opt The CRAM_OPT_* option.
jpayne@69 600 # @param ... Optional arguments, dependent on the option used.
jpayne@69 601 # @return 0 for success, or negative if an error occurred.
jpayne@69 602 int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
jpayne@69 603
jpayne@69 604 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
jpayne@69 605 char **hts_readlines(const char *fn, int *_n)
jpayne@69 606
jpayne@69 607 # @abstract Parse comma-separated list or read list from a file
jpayne@69 608 # @param fn File name or comma-separated list
jpayne@69 609 # @param is_file
jpayne@69 610 # @param _n Size of the output array (number of items read)
jpayne@69 611 # @return NULL on failure or pointer to newly allocated array of
jpayne@69 612 # strings
jpayne@69 613 char **hts_readlist(const char *fn, int is_file, int *_n)
jpayne@69 614
jpayne@69 615 # @abstract Create extra threads to aid compress/decompression for this file
jpayne@69 616 # @param fp The file handle
jpayne@69 617 # @param n The number of worker threads to create
jpayne@69 618 # @return 0 for success, or negative if an error occurred.
jpayne@69 619 # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
jpayne@69 620 int hts_set_threads(htsFile *fp, int n)
jpayne@69 621
jpayne@69 622 # @abstract Set .fai filename for a file opened for reading
jpayne@69 623 # @return 0 for success, negative on failure
jpayne@69 624 # @discussion
jpayne@69 625 # Called before *_hdr_read(), this provides the name of a .fai file
jpayne@69 626 # used to provide a reference list if the htsFile contains no @SQ headers.
jpayne@69 627 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
jpayne@69 628
jpayne@69 629 int8_t HTS_IDX_NOCOOR
jpayne@69 630 int8_t HTS_IDX_START
jpayne@69 631 int8_t HTS_IDX_REST
jpayne@69 632 int8_t HTS_IDX_NONE
jpayne@69 633
jpayne@69 634 int8_t HTS_FMT_CSI
jpayne@69 635 int8_t HTS_FMT_BAI
jpayne@69 636 int8_t HTS_FMT_TBI
jpayne@69 637 int8_t HTS_FMT_CRAI
jpayne@69 638
jpayne@69 639 BGZF *hts_get_bgzfp(htsFile *fp)
jpayne@69 640
jpayne@69 641 ctypedef struct hts_idx_t
jpayne@69 642
jpayne@69 643 ctypedef struct hts_pair64_t:
jpayne@69 644 uint64_t u, v
jpayne@69 645
jpayne@69 646 ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end)
jpayne@69 647
jpayne@69 648 ctypedef struct hts_bins_t:
jpayne@69 649 int n, m
jpayne@69 650 int *a
jpayne@69 651
jpayne@69 652 ctypedef struct hts_itr_t:  # iterator struct from htslib/hts.h; Cython can only access the members listed here
jpayne@69 653 uint32_t read_rest
jpayne@69 654 uint32_t finished
jpayne@69 655 int tid, beg, end, n_off, i  # `beg` was misspelled `bed`; it pairs with `end`, like curr_beg/curr_end on the next line
jpayne@69 656 int curr_tid, curr_beg, curr_end
jpayne@69 657 uint64_t curr_off
jpayne@69 658 hts_pair64_t *off
jpayne@69 659 hts_readrec_func *readfunc  # NOTE(review): htslib's header appears to name this member `readrec` -- confirm before accessing it
jpayne@69 660 hts_bins_t bins
jpayne@69 661
jpayne@69 662 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
jpayne@69 663 void hts_idx_destroy(hts_idx_t *idx)
jpayne@69 664 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
jpayne@69 665 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
jpayne@69 666
jpayne@69 667 #### Save an index to a file
jpayne@69 668 # @param idx Index to be written
jpayne@69 669 # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
jpayne@69 670 # @param fmt One of the HTS_FMT_* index formats
jpayne@69 671 # @return 0 if successful, or negative if an error occurred.
jpayne@69 672 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
jpayne@69 673
jpayne@69 674 #### Save an index to a specific file
jpayne@69 675 # @param idx Index to be written
jpayne@69 676 # @param fn Input BAM/BCF/etc filename
jpayne@69 677 # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
jpayne@69 678 # @param fmt One of the HTS_FMT_* index formats
jpayne@69 679 # @return 0 if successful, or negative if an error occurred.
jpayne@69 680 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
jpayne@69 681
jpayne@69 682 #### Load an index file
jpayne@69 683 # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
jpayne@69 684 # the extension substituted, to search for an existing index file
jpayne@69 685 # @param fmt One of the HTS_FMT_* index formats
jpayne@69 686 # @return The index, or NULL if an error occurred.
jpayne@69 687 hts_idx_t *hts_idx_load(const char *fn, int fmt)
jpayne@69 688
jpayne@69 689 #### Load a specific index file
jpayne@69 690 # @param fn Input BAM/BCF/etc filename
jpayne@69 691 # @param fnidx The input index filename
jpayne@69 692 # @return The index, or NULL if an error occurred.
jpayne@69 693 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
jpayne@69 694
jpayne@69 695 #### Load a specific index file
jpayne@69 696 # @param fn Input BAM/BCF/etc filename
jpayne@69 697 # @param fnidx The input index filename
jpayne@69 698 # @param fmt One of the HTS_FMT_* index formats
jpayne@69 699 # @param flags Flags to alter behaviour (see description)
jpayne@69 700 # @return The index, or NULL if an error occurred.
jpayne@69 701 hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
jpayne@69 702
jpayne@69 703 int HTS_IDX_SAVE_REMOTE
jpayne@69 704 int HTS_IDX_SILENT_FAIL
jpayne@69 705
jpayne@69 706 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
jpayne@69 707 void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
jpayne@69 708
jpayne@69 709 int hts_idx_get_stat(const hts_idx_t* idx, int tid,
jpayne@69 710 uint64_t* mapped, uint64_t* unmapped)
jpayne@69 711
jpayne@69 712 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
jpayne@69 713
jpayne@69 714 int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers
jpayne@69 715
jpayne@69 716 # Parse a numeric string
jpayne@69 717 # The number may be expressed in scientific notation, and optionally may
jpayne@69 718 # contain commas in the integer part (before any decimal point or E notation).
jpayne@69 719 # @param str String to be parsed
jpayne@69 720 # @param strend If non-NULL, set on return to point to the first character
jpayne@69 721 # in @a str after those forming the parsed number
jpayne@69 722 # @param flags Or'ed-together combination of HTS_PARSE_* flags
jpayne@69 723 # @return Converted value of the parsed number.
jpayne@69 724 #
jpayne@69 725 # When @a strend is NULL, a warning will be printed (if hts_verbose is 2
jpayne@69 726 # or more) if there are any trailing characters after the number.
jpayne@69 727 long long hts_parse_decimal(const char *str, char **strend, int flags)
jpayne@69 728
jpayne@69 729 # Parse a "CHR:START-END"-style region string
jpayne@69 730 # @param str String to be parsed
jpayne@69 731 # @param beg Set on return to the 0-based start of the region
jpayne@69 732 # @param end Set on return to the 1-based end of the region
jpayne@69 733 # @return Pointer to the colon or '\0' after the reference sequence name,
jpayne@69 734 # or NULL if @a str could not be parsed.
jpayne@69 735 const char *hts_parse_reg(const char *str, int *beg, int *end)
jpayne@69 736
jpayne@69 737 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
jpayne@69 738 void hts_itr_destroy(hts_itr_t *iter)
jpayne@69 739
jpayne@69 740 ctypedef int (*hts_name2id_f)(void*, const char*)
jpayne@69 741 ctypedef const char *(*hts_id2name_f)(void*, int)
jpayne@69 742 ctypedef hts_itr_t *hts_itr_query_func(
jpayne@69 743 const hts_idx_t *idx,
jpayne@69 744 int tid,
jpayne@69 745 int beg,
jpayne@69 746 int end,
jpayne@69 747 hts_readrec_func *readrec)
jpayne@69 748
jpayne@69 749 hts_itr_t *hts_itr_querys(
jpayne@69 750 const hts_idx_t *idx,
jpayne@69 751 const char *reg,
jpayne@69 752 hts_name2id_f getid,
jpayne@69 753 void *hdr,
jpayne@69 754 hts_itr_query_func *itr_query,
jpayne@69 755 hts_readrec_func *readrec)
jpayne@69 756
jpayne@69 757 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
jpayne@69 758 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values
jpayne@69 759
jpayne@69 760 # hts_file_type() - Convenience function to determine file type
jpayne@69 761 # @fname: the file name
jpayne@69 762 #
jpayne@69 763 # Returns one of the FT_* defines.
jpayne@69 764 #
jpayne@69 765 # DEPRECATED: This function has been replaced by hts_detect_format().
jpayne@69 766 # It and these FT_* macros will be removed in a future HTSlib release.
jpayne@69 767 int FT_UNKN
jpayne@69 768 int FT_GZ
jpayne@69 769 int FT_VCF
jpayne@69 770 int FT_VCF_GZ
jpayne@69 771 int FT_BCF
jpayne@69 772 int FT_BCF_GZ
jpayne@69 773 int FT_STDIN
jpayne@69 774
jpayne@69 775 int hts_file_type(const char *fname)
jpayne@69 776
jpayne@69 777 # /***************************
jpayne@69 778 # * Revised MAQ error model *
jpayne@69 779 # ***************************/
jpayne@69 780
jpayne@69 781 ctypedef struct errmod_t
jpayne@69 782
jpayne@69 783 errmod_t *errmod_init(double depcorr)
jpayne@69 784 void errmod_destroy(errmod_t *em)
jpayne@69 785
jpayne@69 786 # /*
jpayne@69 787 # n: number of bases
jpayne@69 788 # m: maximum base
jpayne@69 789 # bases[i]: qual:6, strand:1, base:4
jpayne@69 790 # q[i*m+j]: phred-scaled likelihood of (i,j)
jpayne@69 791 # */
jpayne@69 792 int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *Probabilistic)
jpayne@69 793
jpayne@69 794 # /*****************************************
jpayne@69 795 # * Probabilistic banded glocal alignment *
jpayne@69 796 # *****************************************/
jpayne@69 797
jpayne@69 798 ctypedef struct probaln_par_t:
jpayne@69 799 float d, e
jpayne@69 800 int bw
jpayne@69 801
jpayne@69 802 int probaln_glocal(const uint8_t *ref,
jpayne@69 803 int l_ref,
jpayne@69 804 const uint8_t *query,
jpayne@69 805 int l_query, const uint8_t *iqual,
jpayne@69 806 const probaln_par_t *c,
jpayne@69 807 int *state, uint8_t *q)
jpayne@69 808
jpayne@69 809 # /**********************
jpayne@69 810 # * MD5 implementation *
jpayne@69 811 # **********************/
jpayne@69 812
jpayne@69 813 ctypedef struct hts_md5_context
jpayne@69 814
jpayne@69 815 # /*! @abstract Initialises an MD5 context.
jpayne@69 816 # * @discussion
jpayne@69 817 # * The expected use is to allocate an hts_md5_context using
jpayne@69 818 # * hts_md5_init(). This pointer is then passed into one or more calls
jpayne@69 819 # * of hts_md5_update() to compute successive internal portions of the
jpayne@69 820 # * MD5 sum, which can then be externalised as a full 16-byte MD5sum
jpayne@69 821 # * calculation by calling hts_md5_final(). This can then be turned
jpayne@69 822 # * into ASCII via hts_md5_hex().
jpayne@69 823 # *
jpayne@69 824 # * To deallocate any resources created by hts_md5_init() call the
jpayne@69 825 # * hts_md5_destroy() function.
jpayne@69 826 # *
jpayne@69 827 # * @return hts_md5_context pointer on success, NULL otherwise.
jpayne@69 828 # */
jpayne@69 829 hts_md5_context *hts_md5_init()
jpayne@69 830
jpayne@69 831 # /*! @abstract Updates the context with the MD5 of the data. */
jpayne@69 832 void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size)
jpayne@69 833
jpayne@69 834 # /*! @abstract Computes the final 128-bit MD5 hash from the given context */
jpayne@69 835 void hts_md5_final(unsigned char *digest, hts_md5_context *ctx)
jpayne@69 836
jpayne@69 837 # /*! @abstract Resets an md5_context to the initial state, as returned
jpayne@69 838 # * by hts_md5_init().
jpayne@69 839 # */
jpayne@69 840 void hts_md5_reset(hts_md5_context *ctx)
jpayne@69 841
jpayne@69 842 # /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-terminated
jpayne@69 843 # * hex string.
jpayne@69 844 # */
jpayne@69 845 void hts_md5_hex(char *hex, const unsigned char *digest)
jpayne@69 846
jpayne@69 847 # /*! @abstract Deallocates any memory allocated by hts_md5_init. */
jpayne@69 848 void hts_md5_destroy(hts_md5_context *ctx)
jpayne@69 849
jpayne@69 850 int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
jpayne@69 851 int hts_bin_bot(int bin, int n_lvls)
jpayne@69 852
jpayne@69 853 # * Endianness *
jpayne@69 854 int ed_is_big()
jpayne@69 855 uint16_t ed_swap_2(uint16_t v)
jpayne@69 856 void *ed_swap_2p(void *x)
jpayne@69 857 uint32_t ed_swap_4(uint32_t v)
jpayne@69 858 void *ed_swap_4p(void *x)
jpayne@69 859 uint64_t ed_swap_8(uint64_t v)
jpayne@69 860 void *ed_swap_8p(void *x)
jpayne@69 861
jpayne@69 862
jpayne@69 863 cdef extern from "htslib/sam.h" nogil:
jpayne@69 864 #**********************
jpayne@69 865 #*** SAM/BAM header ***
jpayne@69 866 #**********************
jpayne@69 867
jpayne@69 868 # @abstract Structure for the alignment header.
jpayne@69 869 # @field n_targets number of reference sequences
jpayne@69 870 # @field l_text length of the plain text in the header
jpayne@69 871 # @field target_len lengths of the reference sequences
jpayne@69 872 # @field target_name names of the reference sequences
jpayne@69 873 # @field text plain text
jpayne@69 874 # @field sdict header dictionary
jpayne@69 875
jpayne@69 876 ctypedef struct bam_hdr_t:  # alignment header: reference-sequence table plus the plain header text
jpayne@69 877 int32_t n_targets, ignore_sam_err  # n_targets: number of reference sequences; ignore_sam_err is not described in the @field list
jpayne@69 878 uint32_t l_text  # length of the plain text in the header
jpayne@69 879 uint32_t *target_len  # lengths of the reference sequences
jpayne@69 880 uint8_t *cigar_tab  # NOTE(review): not described in the @field list; confirm meaning against htslib sam.h
jpayne@69 881 char **target_name  # names of the reference sequences
jpayne@69 882 char *text  # plain text
jpayne@69 883 void *sdict  # header dictionary (opaque pointer)
jpayne@69 884
jpayne@69 885 #****************************
jpayne@69 886 #*** CIGAR related macros ***
jpayne@69 887 #****************************
jpayne@69 888
jpayne@69 889 int BAM_CMATCH
jpayne@69 890 int BAM_CINS
jpayne@69 891 int BAM_CDEL
jpayne@69 892 int BAM_CREF_SKIP
jpayne@69 893 int BAM_CSOFT_CLIP
jpayne@69 894 int BAM_CHARD_CLIP
jpayne@69 895 int BAM_CPAD
jpayne@69 896 int BAM_CEQUAL
jpayne@69 897 int BAM_CDIFF
jpayne@69 898 int BAM_CBACK
jpayne@69 899
jpayne@69 900 char *BAM_CIGAR_STR
jpayne@69 901 int BAM_CIGAR_SHIFT
jpayne@69 902 uint32_t BAM_CIGAR_MASK
jpayne@69 903 uint32_t BAM_CIGAR_TYPE
jpayne@69 904
jpayne@69 905 char bam_cigar_op(uint32_t c)
jpayne@69 906 uint32_t bam_cigar_oplen(uint32_t c)
jpayne@69 907 char bam_cigar_opchr(uint32_t)
jpayne@69 908 uint32_t bam_cigar_gen(char, uint32_t)
jpayne@69 909 int bam_cigar_type(char o)
jpayne@69 910
jpayne@69 911 # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair
jpayne@69 912 int BAM_FPAIRED
jpayne@69 913 # @abstract the read is mapped in a proper pair
jpayne@69 914 int BAM_FPROPER_PAIR
jpayne@69 915 # @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
jpayne@69 916 int BAM_FUNMAP
jpayne@69 917 # @abstract the mate is unmapped
jpayne@69 918 int BAM_FMUNMAP
jpayne@69 919 # @abstract the read is mapped to the reverse strand
jpayne@69 920 int BAM_FREVERSE
jpayne@69 921 # @abstract the mate is mapped to the reverse strand
jpayne@69 922 int BAM_FMREVERSE
jpayne@69 923 # @abstract this is read1
jpayne@69 924 int BAM_FREAD1
jpayne@69 925 # @abstract this is read2
jpayne@69 926 int BAM_FREAD2
jpayne@69 927 # @abstract not primary alignment
jpayne@69 928 int BAM_FSECONDARY
jpayne@69 929 # @abstract QC failure
jpayne@69 930 int BAM_FQCFAIL
jpayne@69 931 # @abstract optical or PCR duplicate
jpayne@69 932 int BAM_FDUP
jpayne@69 933 # @abstract supplementary alignment
jpayne@69 934 int BAM_FSUPPLEMENTARY
jpayne@69 935
jpayne@69 936 #*************************
jpayne@69 937 #*** Alignment records ***
jpayne@69 938 #*************************
jpayne@69 939
jpayne@69 940 # @abstract Structure for core alignment information.
jpayne@69 941 # @field tid chromosome ID, defined by bam_hdr_t
jpayne@69 942 # @field pos 0-based leftmost coordinate
jpayne@69 943 # @field bin bin calculated by bam_reg2bin()
jpayne@69 944 # @field qual mapping quality
jpayne@69 945 # @field l_qname length of the query name
jpayne@69 946 # @field flag bitwise flag
jpayne@69 947 # @field n_cigar number of CIGAR operations
jpayne@69 948 # @field l_qseq length of the query sequence (read)
jpayne@69 949 # @field mtid chromosome ID of next read in template, defined by bam_hdr_t
jpayne@69 950 # @field mpos 0-based leftmost coordinate of next read in template
jpayne@69 951
jpayne@69 952 ctypedef struct bam1_core_t:  # fixed-size core fields of one alignment record
jpayne@69 953 int32_t tid  # chromosome ID, defined by bam_hdr_t
jpayne@69 954 int32_t pos  # 0-based leftmost coordinate
jpayne@69 955 uint16_t bin  # index bin for the record
jpayne@69 956 uint8_t qual  # mapping quality
jpayne@69 957 uint8_t l_qname  # length of the query name
jpayne@69 958 uint16_t flag  # bitwise flag
jpayne@69 959 uint8_t unused1  # NOTE(review): not described in the @field list; name suggests padding - confirm
jpayne@69 960 uint8_t l_extranul  # NOTE(review): not described in the @field list; confirm against htslib sam.h
jpayne@69 961 uint32_t n_cigar  # number of CIGAR operations
jpayne@69 962 int32_t l_qseq  # length of the query sequence (read)
jpayne@69 963 int32_t mtid  # chromosome ID of next read in template, defined by bam_hdr_t
jpayne@69 964 int32_t mpos  # 0-based leftmost coordinate of next read in template
jpayne@69 965 int32_t isize  # NOTE(review): not described in the @field list; presumably the template/insert size - confirm
jpayne@69 966
jpayne@69 967 # @abstract Structure for one alignment.
jpayne@69 968 # @field core core information about the alignment
jpayne@69 969 # @field l_data current length of bam1_t::data
jpayne@69 970 # @field m_data maximum length of bam1_t::data
jpayne@69 971 # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
jpayne@69 972 #
jpayne@69 973 # @discussion Notes:
jpayne@69 974 #
jpayne@69 975 # 1. qname is NUL-terminated and core.l_qname includes the trailing '\0'.
jpayne@69 976 # 2. l_qseq is calculated from the total length of an alignment block
jpayne@69 977 # on reading or from CIGAR.
jpayne@69 978 # 3. cigar data is encoded 4 bytes per CIGAR operation.
jpayne@69 979 # 4. seq is nybble-encoded according to seq_nt16_table.
jpayne@69 980 ctypedef struct bam1_t:  # one alignment: fixed core fields plus a variable-length data buffer
jpayne@69 981 bam1_core_t core  # core information about the alignment
jpayne@69 982 int l_data  # current length of data
jpayne@69 983 uint32_t m_data  # maximum length of data
jpayne@69 984 uint8_t *data  # all variable-length data, concatenated: qname-cigar-seq-qual-aux
jpayne@69 985 uint64_t id  # NOTE(review): not described in the @field list; confirm against htslib sam.h
jpayne@69 986
jpayne@69 987 # @abstract Get whether the query is on the reverse strand
jpayne@69 988 # @param b pointer to an alignment
jpayne@69 989 # @return boolean true if query is on the reverse strand
jpayne@69 990 int bam_is_rev(bam1_t *b)
jpayne@69 991
jpayne@69 992 # @abstract Get whether the query's mate is on the reverse strand
jpayne@69 993 # @param b pointer to an alignment
jpayne@69 994 # @return boolean true if query's mate on the reverse strand
jpayne@69 995 int bam_is_mrev(bam1_t *b)
jpayne@69 996
jpayne@69 997 # @abstract Get the name of the query
jpayne@69 998 # @param b pointer to an alignment
jpayne@69 999 # @return pointer to the name string, null terminated
jpayne@69 1000 char *bam_get_qname(bam1_t *b)
jpayne@69 1001
jpayne@69 1002 # @abstract Get the CIGAR array
jpayne@69 1003 # @param b pointer to an alignment
jpayne@69 1004 # @return pointer to the CIGAR array
jpayne@69 1005 #
jpayne@69 1006 # @discussion In the CIGAR array, each element is a 32-bit integer. The
jpayne@69 1007 # lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
jpayne@69 1008 # length of a CIGAR.
jpayne@69 1009 uint32_t *bam_get_cigar(bam1_t *b)
jpayne@69 1010
jpayne@69 1011 # @abstract Get query sequence
jpayne@69 1012 # @param b pointer to an alignment
jpayne@69 1013 # @return pointer to sequence
jpayne@69 1014 #
jpayne@69 1015 # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
jpayne@69 1016 # 8 for T and 15 for N. Two bases are packed in one byte with the base
jpayne@69 1017 # at the higher 4 bits having smaller coordinate on the read. It is
jpayne@69 1018 # recommended to use bam_seqi() macro to get the base.
jpayne@69 1019 char *bam_get_seq(bam1_t *b)
jpayne@69 1020
jpayne@69 1021 # @abstract Get query quality
jpayne@69 1022 # @param b pointer to an alignment
jpayne@69 1023 # @return pointer to quality string
jpayne@69 1024 uint8_t *bam_get_qual(bam1_t *b)
jpayne@69 1025
jpayne@69 1026 # @abstract Get auxiliary data
jpayne@69 1027 # @param b pointer to an alignment
jpayne@69 1028 # @return pointer to the concatenated auxiliary data
jpayne@69 1029 uint8_t *bam_get_aux(bam1_t *b)
jpayne@69 1030
jpayne@69 1031 # @abstract Get length of auxiliary data
jpayne@69 1032 # @param b pointer to an alignment
jpayne@69 1033 # @return length of the concatenated auxiliary data
jpayne@69 1034 int bam_get_l_aux(bam1_t *b)
jpayne@69 1035
jpayne@69 1036 # @abstract Get a base on read
jpayne@69 1037 # @param s Query sequence returned by bam_get_seq()
jpayne@69 1038 # @param i The i-th position, 0-based
jpayne@69 1039 # @return 4-bit integer representing the base.
jpayne@69 1040 char bam_seqi(char *s, int i)
jpayne@69 1041
jpayne@69 1042 #**************************
jpayne@69 1043 #*** Exported functions ***
jpayne@69 1044 #**************************
jpayne@69 1045
jpayne@69 1046 #***************
jpayne@69 1047 #*** BAM I/O ***
jpayne@69 1048 #***************
jpayne@69 1049
jpayne@69 1050 bam_hdr_t *bam_hdr_init()
jpayne@69 1051 bam_hdr_t *bam_hdr_read(BGZF *fp)
jpayne@69 1052 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
jpayne@69 1053 void bam_hdr_destroy(bam_hdr_t *h)
jpayne@69 1054 int bam_name2id(bam_hdr_t *h, const char *ref)
jpayne@69 1055 bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0)
jpayne@69 1056
jpayne@69 1057 bam1_t *bam_init1()
jpayne@69 1058 void bam_destroy1(bam1_t *b)
jpayne@69 1059 int bam_read1(BGZF *fp, bam1_t *b)
jpayne@69 1060 int bam_write1(BGZF *fp, const bam1_t *b)
jpayne@69 1061 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
jpayne@69 1062 bam1_t *bam_dup1(const bam1_t *bsrc)
jpayne@69 1063
jpayne@69 1064 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
jpayne@69 1065 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
jpayne@69 1066
jpayne@69 1067 # @abstract Calculate the rightmost base position of an alignment on the
jpayne@69 1068 # reference genome.
jpayne@69 1069
jpayne@69 1070 # @param b pointer to an alignment
jpayne@69 1071 # @return the coordinate of the first base after the alignment, 0-based
jpayne@69 1072
jpayne@69 1073 # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen.
jpayne@69 1074 # For an unmapped read (either according to its flags or if it has no cigar
jpayne@69 1075 # string), we return b->core.pos + 1 by convention.
jpayne@69 1076 int32_t bam_endpos(const bam1_t *b)
jpayne@69 1077
jpayne@69 1078 int bam_str2flag(const char *str) # returns negative value on error
jpayne@69 1079 char *bam_flag2str(int flag) # The string must be freed by the user
jpayne@69 1080
jpayne@69 1081 #*************************
jpayne@69 1082 #*** BAM/CRAM indexing ***
jpayne@69 1083 #*************************
jpayne@69 1084
jpayne@69 1085 # These BAM iterator functions work only on BAM files. To work with either
jpayne@69 1086 # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.
jpayne@69 1087 void bam_itr_destroy(hts_itr_t *iter)
jpayne@69 1088 hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
jpayne@69 1089 hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
jpayne@69 1090 int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
jpayne@69 1091
jpayne@69 1092 # Load/build .csi or .bai BAM index file. Does not work with CRAM.
jpayne@69 1093 # It is recommended to use the sam_index_* functions below instead.
jpayne@69 1094 hts_idx_t *bam_index_load(const char *fn)
jpayne@69 1095 int bam_index_build(const char *fn, int min_shift)
jpayne@69 1096
jpayne@69 1097 # Load a BAM (.csi or .bai) or CRAM (.crai) index file
jpayne@69 1098 # @param fp File handle of the data file whose index is being opened
jpayne@69 1099 # @param fn BAM/CRAM/etc filename to search alongside for the index file
jpayne@69 1100 # @return The index, or NULL if an error occurred.
jpayne@69 1101 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
jpayne@69 1102
jpayne@69 1103 # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
jpayne@69 1104 # @param fp File handle of the data file whose index is being opened
jpayne@69 1105 # @param fn BAM/CRAM/etc data file filename
jpayne@69 1106 # @param fnidx Index filename, or NULL to search alongside @a fn
jpayne@69 1107 # @return The index, or NULL if an error occurred.
jpayne@69 1108 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
jpayne@69 1109
jpayne@69 1110 # Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file
jpayne@69 1111 # @param fp File handle of the data file whose index is being opened
jpayne@69 1112 # @param fn BAM/CRAM/etc data file filename
jpayne@69 1113 # @param fnidx Index filename, or NULL to search alongside @a fn
jpayne@69 1114 # @param flags Flags to alter behaviour
jpayne@69 1115 # @return The index, or NULL if an error occurred.
jpayne@69 1116 hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
jpayne@69 1117
jpayne@69 1118 # Generate and save an index file
jpayne@69 1119 # @param fn Input BAM/etc filename, to which .csi/etc will be added
jpayne@69 1120 # @param min_shift Positive to generate CSI, or 0 to generate BAI
jpayne@69 1121 # @return 0 if successful, or negative if an error occurred (usually -1; or
jpayne@69 1122 # -2: opening fn failed; -3: format not indexable)
jpayne@69 1123 int sam_index_build(const char *fn, int min_shift)
jpayne@69 1124
jpayne@69 1125 # Generate and save an index to a specific file
jpayne@69 1126 # @param fn Input BAM/CRAM/etc filename
jpayne@69 1127 # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
jpayne@69 1128 # @param min_shift Positive to generate CSI, or 0 to generate BAI
jpayne@69 1129 # @return 0 if successful, or negative if an error occurred.
jpayne@69 1130 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
jpayne@69 1131
jpayne@69 1132 void sam_itr_destroy(hts_itr_t *iter)
jpayne@69 1133 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
jpayne@69 1134 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
jpayne@69 1135 int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
jpayne@69 1136
jpayne@69 1137 #***************
jpayne@69 1138 #*** SAM I/O ***
jpayne@69 1139 #***************
jpayne@69 1140
jpayne@69 1141 htsFile *sam_open(const char *fn, const char *mode)
jpayne@69 1142 htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt)
jpayne@69 1143 int sam_close(htsFile *fp)
jpayne@69 1144
jpayne@69 1145 int sam_open_mode(char *mode, const char *fn, const char *format)
jpayne@69 1146
jpayne@69 1147 # A version of sam_open_mode that can handle ,key=value options.
jpayne@69 1148 # The format string is allocated and returned, to be freed by the caller.
jpayne@69 1149 # Prefix should be "r" or "w",
jpayne@69 1150 char *sam_open_mode_opts(const char *fn, const char *mode, const char *format)
jpayne@69 1151
jpayne@69 1152 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
jpayne@69 1153 bam_hdr_t *sam_hdr_read(htsFile *fp)
jpayne@69 1154 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
jpayne@69 1155
jpayne@69 1156 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
jpayne@69 1157 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
jpayne@69 1158 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
jpayne@69 1159 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
jpayne@69 1160
jpayne@69 1161 #*************************************
jpayne@69 1162 #*** Manipulating auxiliary fields ***
jpayne@69 1163 #*************************************
jpayne@69 1164
jpayne@69 1165 uint8_t *bam_aux_get(const bam1_t *b, const char *tag)
jpayne@69 1166 int64_t bam_aux2i(const uint8_t *s)
jpayne@69 1167 double bam_aux2f(const uint8_t *s)
jpayne@69 1168 char bam_aux2A(const uint8_t *s)
jpayne@69 1169 char *bam_aux2Z(const uint8_t *s)
jpayne@69 1170
jpayne@69 1171 void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data)
jpayne@69 1172 int bam_aux_del(bam1_t *b, uint8_t *s)
jpayne@69 1173
jpayne@69 1174 #**************************
jpayne@69 1175 #*** Pileup and Mpileup ***
jpayne@69 1176 #**************************
jpayne@69 1177
jpayne@69 1178 # @abstract Generic pileup 'client data'.
jpayne@69 1179 # @discussion The pileup iterator allows setting a constructor and
jpayne@69 1180 # destructor function, which will be called every time a sequence is
jpayne@69 1181 # fetched and discarded. This permits caching of per-sequence data in
jpayne@69 1182 # a tidy manner during the pileup process. This union is the cached
jpayne@69 1183 # data to be manipulated by the "client" (the caller of pileup).
jpayne@69 1184 #
jpayne@69 1185 union bam_pileup_cd:  # per-sequence cached "client data" for the pileup caller; one of the members below
jpayne@69 1186 void *p  # client data viewed as an untyped pointer
jpayne@69 1187 int64_t i  # client data viewed as a 64-bit signed integer
jpayne@69 1188 double f  # client data viewed as a double
jpayne@69 1189
jpayne@69 1190 # @abstract Structure for one alignment covering the pileup position.
jpayne@69 1191 # @field b pointer to the alignment
jpayne@69 1192 # @field qpos position of the read base at the pileup site, 0-based
jpayne@69 1193 # @field indel indel length; 0 for no indel, positive for ins and negative for del
jpayne@69 1194 # @field level the level of the read in the "viewer" mode
jpayne@69 1195 # @field is_del 1 iff the base on the padded read is a deletion
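jpayne@69 1196 # @field is_head 1 iff this is the first base in the query sequence
jpayne@69 1197 # @field is_tail 1 iff this is the last base in the query sequence
jpayne@69 1198 # @field is_refskip 1 iff the base on the padded read is part of a CIGAR N op
jpayne@69 1199 # @field aux (used by bcf_call_gap_prep() in htslib)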
jpayne@69 1200 #
jpayne@69 1201 # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
jpayne@69 1202 # difference between the two functions is that the former does not
jpayne@69 1203 # set bam_pileup1_t::level, while the latter does. Level helps the
jpayne@69 1204 # implementation of alignment viewers, but calculating this has some
jpayne@69 1205 # overhead.
jpayne@69 1206 #
jpayne@69 1207 # is_del, is_head, etc are a bit field, declaring as below should
jpayne@69 1208 # work as expected, see
jpayne@69 1209 # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J
jpayne@69 1210
jpayne@69 1211 ctypedef struct bam_pileup1_t:  # one alignment covering the current pileup position
jpayne@69 1212 bam1_t *b  # pointer to the alignment
jpayne@69 1213 int32_t qpos  # position of the read base at the pileup site, 0-based
jpayne@69 1214 int indel, level  # indel: 0 for none, positive for ins, negative for del; level: row of the read in "viewer" mode
jpayne@69 1215 uint32_t is_del  # 1 iff the base on the padded read is a deletion; a bit field in C, declared as a plain integer here
jpayne@69 1216 uint32_t is_head  # bit field in C, declared as a plain integer here
jpayne@69 1217 uint32_t is_tail  # bit field in C, declared as a plain integer here
jpayne@69 1218 uint32_t is_refskip  # bit field in C, declared as a plain integer here
jpayne@69 1219 uint32_t aux  # NOTE(review): purpose not documented in this file; confirm against htslib sam.h
jpayne@69 1220 bam_pileup_cd cd  # per-alignment client data (see union bam_pileup_cd)
jpayne@69 1221
jpayne@69 1222 ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
jpayne@69 1223 ctypedef int (*bam_test_f)()
jpayne@69 1224
jpayne@69 1225 ctypedef struct __bam_plp_t
jpayne@69 1226 ctypedef __bam_plp_t *bam_plp_t
jpayne@69 1227
jpayne@69 1228 ctypedef struct __bam_mplp_t
jpayne@69 1229 ctypedef __bam_mplp_t *bam_mplp_t
jpayne@69 1230
jpayne@69 1231 # bam_plp_init() - sets an iterator over multiple
jpayne@69 1232 # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return
jpayne@69 1233 # status: 0 on success, -1 on end, < -1 on non-recoverable errors
jpayne@69 1234 # @data: user data to pass to @func
jpayne@69 1235 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
jpayne@69 1236 void bam_plp_destroy(bam_plp_t iter)
jpayne@69 1237 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
jpayne@69 1238 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
jpayne@69 1239 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
jpayne@69 1240 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
jpayne@69 1241 void bam_plp_reset(bam_plp_t iter)
jpayne@69 1242
jpayne@69 1243 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
jpayne@69 1244
jpayne@69 1245 # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping
jpayne@69 1246 # read pairs and for each base pair set the base quality of the
jpayne@69 1247 # lower-quality base to zero, thus effectively discarding it from
jpayne@69 1248 # calling. If the two bases are identical, the quality of the other base
jpayne@69 1249 # is increased to the sum of their qualities (capped at 200), otherwise
jpayne@69 1250 # it is multiplied by 0.8.
jpayne@69 1251 void bam_mplp_init_overlaps(bam_mplp_t iter)
jpayne@69 1252 void bam_mplp_destroy(bam_mplp_t iter)
jpayne@69 1253 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
jpayne@69 1254 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
jpayne@69 1255 void bam_mplp_reset(bam_mplp_t iter)
jpayne@69 1256 void bam_mplp_constructor(bam_mplp_t iter,
jpayne@69 1257 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd))
jpayne@69 1258 void bam_mplp_destructor(bam_mplp_t iter,
jpayne@69 1259 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd))
jpayne@69 1260
jpayne@69 1261 # Added by AH
jpayne@69 1262 # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
jpayne@69 1263
jpayne@69 1264
jpayne@69 1265
jpayne@69 1266
jpayne@69 1267 # // ---------------------------
jpayne@69 1268 # // Base modification retrieval
jpayne@69 1269
jpayne@69 1270 # /*! @typedef
jpayne@69 1271 # @abstract Holds a single base modification.
jpayne@69 1272 # @field modified_base The short base code (m, h, etc) or -ChEBI (negative)
jpayne@69 1273 # @field canonical_base The canonical base referred to in the MM tag.
jpayne@69 1274 # One of A, C, G, T or N. Note this may not be the
jpayne@69 1275 # explicit base recorded in the SEQ column (esp. if N).
jpayne@69 1276 # @field strand 0 or 1, indicating + or - strand from MM tag.
jpayne@69 1277 # @field qual Quality code (256*probability), or -1 if unknown
jpayne@69 1278
jpayne@69 1279 # @discussion
jpayne@69 1280 # Note this doesn't hold any location data or information on which other
jpayne@69 1281 # modifications may be possible at this site.
jpayne@69 1282 ctypedef struct hts_base_mod:  # a single base modification; carries no location data
jpayne@69 1283 int modified_base  # short base code (m, h, etc) or -ChEBI (negative)
jpayne@69 1284 int canonical_base  # canonical base referred to in the MM tag: one of A, C, G, T or N
jpayne@69 1285 int strand  # 0 or 1, indicating + or - strand from the MM tag
jpayne@69 1286 int qual  # quality code (256*probability), or -1 if unknown
jpayne@69 1287
jpayne@69 1288 # /// Allocates an hts_base_mod_state.
jpayne@69 1289 # /**
jpayne@69 1290 # * @return An hts_base_mod_state pointer on success,
jpayne@69 1291 # * NULL on failure.
jpayne@69 1292 # *
jpayne@69 1293 # * This just allocates the memory. The initialisation of the contents is
jpayne@69 1294 # * done using bam_parse_basemod. Successive calls may be made to that
jpayne@69 1295 # * without the need to free and allocate a new state.
jpayne@69 1296 # *
jpayne@69 1297 # * The state should be destroyed using the hts_base_mod_state_free function.
jpayne@69 1298 # */
jpayne@69 1299 ctypedef struct hts_base_mod_state
jpayne@69 1300 hts_base_mod_state *hts_base_mod_state_alloc()
jpayne@69 1301
jpayne@69 1302
jpayne@69 1303 # /// Destroys an hts_base_mod_state.
jpayne@69 1304 # /**
jpayne@69 1305 # * @param state The base modification state pointer.
jpayne@69 1306 # *
jpayne@69 1307 # * The state should have previously been created by hts_base_mod_state_alloc.
jpayne@69 1308 # */
jpayne@69 1309 void hts_base_mod_state_free(hts_base_mod_state *state)
jpayne@69 1310
jpayne@69 1311 # /// Parses the Mm and Ml tags out of a bam record.
jpayne@69 1312 # /**
jpayne@69 1313 # * @param b BAM alignment record
jpayne@69 1314 # * @param state The base modification state pointer.
jpayne@69 1315 # * @return 0 on success,
jpayne@69 1316 # * -1 on failure.
jpayne@69 1317 # *
jpayne@69 1318 # * This fills out the contents of the modification state, resetting the
jpayne@69 1319 # * iterator location to the first sequence base.
jpayne@69 1320 # */
jpayne@69 1321 int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state)
jpayne@69 1322
jpayne@69 1323 # /// Finds the next location containing base modifications and returns them
jpayne@69 1324 # /**
jpayne@69 1325 # * @param b BAM alignment record
jpayne@69 1326 # * @param state The base modification state pointer.
jpayne@69 1327 # * @param mods A supplied array for returning base modifications
jpayne@69 1328 # * @param n_mods The size of the mods array
jpayne@69 1329 # * @return The number of modifications found on success,
jpayne@69 1330 # * 0 if no more modifications are present,
jpayne@69 1331 # * -1 on failure.
jpayne@69 1332 # *
jpayne@69 1333 # * Unlike bam_mods_at_next_pos this skips ahead to the next site
jpayne@69 1334 # * with modifications.
jpayne@69 1335 # *
jpayne@69 1336 # * If more than n_mods modifications are found, the total found is returned.
jpayne@69 1337 # * Note this means the caller needs to check whether this is higher than
jpayne@69 1338 # * n_mods.
jpayne@69 1339 # */
jpayne@69 1340
jpayne@69 1341 int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,hts_base_mod *mods, int n_mods, int *pos)
jpayne@69 1342
jpayne@69 1343 # ***********************************
jpayne@69 1344 # * BAQ calculation and realignment *
jpayne@69 1345 # ***********************************/
jpayne@69 1346 int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres)
jpayne@69 1347 int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag)
jpayne@69 1348
jpayne@69 1349
jpayne@69 1350 cdef extern from "htslib/faidx.h" nogil:
jpayne@69 1351 # FASTA index (faidx) API: build, load and destroy an index, fetch sequence, and query sequence names and lengths.
jpayne@69 1352 ctypedef struct faidx_t:
jpayne@69 1353 pass  # no fields are declared here, so faidx_t members are not accessible from Cython
jpayne@69 1354
jpayne@69 1355 # /// Build index for a FASTA or bgzip-compressed FASTA file.
jpayne@69 1356 # /** @param fn FASTA file name
jpayne@69 1357 # @param fnfai Name of .fai file to build.
jpayne@69 1358 # @param fngzi Name of .gzi file to build (if fn is bgzip-compressed).
jpayne@69 1359 # @return 0 on success; or -1 on failure
jpayne@69 1360
jpayne@69 1361 # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
jpayne@69 1362 # If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI
jpayne@69 1363 # file will only be built if fn is bgzip-compressed.
jpayne@69 1364 # */
jpayne@69 1365 int fai_build3(const char *fn,
jpayne@69 1366 const char *fnfai,
jpayne@69 1367 const char *fngzi)
jpayne@69 1368
jpayne@69 1369 # /// Build index for a FASTA or bgzip-compressed FASTA file.
jpayne@69 1370 # /** @param fn FASTA file name
jpayne@69 1371 # @return 0 on success; or -1 on failure
jpayne@69 1372 #
jpayne@69 1373 # File "fn.fai" will be generated. This function is equivalent to
jpayne@69 1374 # fai_build3(fn, NULL, NULL);
jpayne@69 1375 # */
jpayne@69 1376 int fai_build(char *fn)
jpayne@69 1377
jpayne@69 1378 # /// Destroy a faidx_t struct
jpayne@69 1379 void fai_destroy(faidx_t *fai)
jpayne@69 1380
jpayne@69 1381 # /// Load FASTA indexes.
jpayne@69 1382 # /** @param fn File name of the FASTA file (can be compressed with bgzip).
jpayne@69 1383 # @param fnfai File name of the FASTA index.
jpayne@69 1384 # @param fngzi File name of the bgzip index.
jpayne@69 1385 # @param flags Option flags to control index file caching and creation.
jpayne@69 1386 # @return Pointer to a faidx_t struct on success, NULL on failure.
jpayne@69 1387
jpayne@69 1388 # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
jpayne@69 1389 # If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name.
jpayne@69 1390 # The bgzip index is only needed if fn is compressed.
jpayne@69 1391
jpayne@69 1392 # If (flags & FAI_CREATE) is true, the index files will be built using
jpayne@69 1393 # fai_build3() if they are not already present.
jpayne@69 1394 # */
jpayne@69 1395 faidx_t *fai_load3(const char *fn,
jpayne@69 1396 const char *fnfai,
jpayne@69 1397 const char *fngzi,
jpayne@69 1398 int flags)
jpayne@69 1399
jpayne@69 1400 # /// Load index from "fn.fai".
jpayne@69 1401 # /** @param fn File name of the FASTA file
jpayne@69 1402 # @return Pointer to a faidx_t struct on success, NULL on failure.
jpayne@69 1403 # This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE);
jpayne@69 1404 # */
jpayne@69 1405 faidx_t *fai_load(char *fn)
jpayne@69 1406
jpayne@69 1407 # /// Fetch the sequence in a region
jpayne@69 1408 # /** @param fai Pointer to the faidx_t struct
jpayne@69 1409 # @param reg Region in the format "chr2:20,000-30,000"
jpayne@69 1410 # @param len Output: length of the region; -2 if seq not present, -1 general error
jpayne@69 1411 # @return Pointer to the sequence; `NULL` on failure
jpayne@69 1412 # The returned sequence is allocated by `malloc()` family and should be destroyed
jpayne@69 1413 # by end users by calling `free()` on it.
jpayne@69 1414 # */
jpayne@69 1415 char *fai_fetch(faidx_t *fai,
jpayne@69 1416 char *reg,
jpayne@69 1417 int *len)
jpayne@69 1418
jpayne@69 1419 # /// Fetch the sequence in a region
jpayne@69 1420 # /** @param fai Pointer to the faidx_t struct
jpayne@69 1421 # @param c_name Region name
jpayne@69 1422 # @param p_beg_i Beginning position number (zero-based)
jpayne@69 1423 # @param p_end_i End position number (zero-based)
jpayne@69 1424 # @param len Output: length of the region; -2 if c_name not present, -1 general error
jpayne@69 1425 # @return Pointer to the sequence; null on failure
jpayne@69 1426 # The returned sequence is allocated by `malloc()` family and should be destroyed
jpayne@69 1427 # by end users by calling `free()` on it.
jpayne@69 1428 # */
jpayne@69 1429 char *faidx_fetch_seq(faidx_t *fai,
jpayne@69 1430 char *c_name,
jpayne@69 1431 int p_beg_i,
jpayne@69 1432 int p_end_i,
jpayne@69 1433 int *len)
jpayne@69 1434
jpayne@69 1435 # /// Query if sequence is present
jpayne@69 1436 # /** @param fai Pointer to the faidx_t struct
jpayne@69 1437 # @param seq Sequence name
jpayne@69 1438 # @return 1 if present or 0 if absent
jpayne@69 1439 # */
jpayne@69 1440 int faidx_has_seq(faidx_t *fai, const char *seq)
jpayne@69 1441
jpayne@69 1442 # /// Fetch the number of sequences
jpayne@69 1443 # /** @param fai Pointer to the faidx_t struct
jpayne@69 1444 # @return The number of sequences
jpayne@69 1445 # */
jpayne@69 1446 int faidx_nseq(const faidx_t *fai)
jpayne@69 1447
jpayne@69 1448 # /// Return name of i-th sequence
jpayne@69 1449 const char *faidx_iseq(const faidx_t *fai, int i)
jpayne@69 1450
jpayne@69 1451 # /// Return sequence length, -1 if not present
jpayne@69 1452 int faidx_seq_len(faidx_t *fai, const char *seq)
jpayne@69 1453
jpayne@69 1454 # tabix support
jpayne@69 1455 cdef extern from "htslib/tbx.h" nogil:
jpayne@69 1456 # Tabix index API: preset constants and configurations, index build/load/destroy, and region iterators.
jpayne@69 1457 # tbx.h definitions
jpayne@69 1458 int8_t TBX_MAX_SHIFT
jpayne@69 1459 int32_t TBX_GENERIC
jpayne@69 1460 int32_t TBX_SAM
jpayne@69 1461 int32_t TBX_VCF
jpayne@69 1462 int32_t TBX_UCSC
jpayne@69 1463
jpayne@69 1464 ctypedef struct tbx_conf_t:
jpayne@69 1465 int32_t preset
jpayne@69 1466 int32_t sc, bc, ec # seq col., beg col. and end col.
jpayne@69 1467 int32_t meta_char, line_skip
jpayne@69 1468 # Index handle returned by the tbx_index_load*() functions below.
jpayne@69 1469 ctypedef struct tbx_t:
jpayne@69 1470 tbx_conf_t conf
jpayne@69 1471 hts_idx_t *idx
jpayne@69 1472 void * dict
jpayne@69 1473 # Predefined tbx_conf_t configurations exported by tbx.h.
jpayne@69 1474 tbx_conf_t tbx_conf_gff
jpayne@69 1475 tbx_conf_t tbx_conf_bed
jpayne@69 1476 tbx_conf_t tbx_conf_psltbl
jpayne@69 1477 tbx_conf_t tbx_conf_sam
jpayne@69 1478 tbx_conf_t tbx_conf_vcf
jpayne@69 1479
jpayne@69 1480 void tbx_itr_destroy(hts_itr_t * iter)
jpayne@69 1481 hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int bed, int end)  # NOTE(review): `bed` is presumably a typo for `beg` (region start) - confirm against tbx.h
jpayne@69 1482 hts_itr_t * tbx_itr_querys(tbx_t * t, char * s)
jpayne@69 1483 int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data)
jpayne@69 1484
jpayne@69 1485 int tbx_name2id(tbx_t *tbx, char *ss)
jpayne@69 1486
jpayne@69 1487 int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf)
jpayne@69 1488 int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
jpayne@69 1489
jpayne@69 1490 tbx_t * tbx_index_load(char *fn)
jpayne@69 1491 tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
jpayne@69 1492 tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags)
jpayne@69 1493
jpayne@69 1494 # The caller should free the returned array, but not the values it points to
jpayne@69 1495 char **tbx_seqnames(tbx_t *tbx, int *n)
jpayne@69 1496
jpayne@69 1497 void tbx_destroy(tbx_t *tbx)
jpayne@69 1498
jpayne@69 1499
jpayne@69 1500 # VCF/BCF API
jpayne@69 1501 cdef extern from "htslib/vcf.h" nogil:
jpayne@69 1502
jpayne@69 1503 # Header struct
jpayne@69 1504
jpayne@69 1505 uint8_t BCF_HL_FLT # header line
jpayne@69 1506 uint8_t BCF_HL_INFO
jpayne@69 1507 uint8_t BCF_HL_FMT
jpayne@69 1508 uint8_t BCF_HL_CTG
jpayne@69 1509 uint8_t BCF_HL_STR # structured header line TAG=<A=..,B=..>
jpayne@69 1510 uint8_t BCF_HL_GEN # generic header line
jpayne@69 1511
jpayne@69 1512 uint8_t BCF_HT_FLAG # header type
jpayne@69 1513 uint8_t BCF_HT_INT
jpayne@69 1514 uint8_t BCF_HT_REAL
jpayne@69 1515 uint8_t BCF_HT_STR
jpayne@69 1516
jpayne@69 1517 uint8_t BCF_VL_FIXED # variable length
jpayne@69 1518 uint8_t BCF_VL_VAR
jpayne@69 1519 uint8_t BCF_VL_A
jpayne@69 1520 uint8_t BCF_VL_G
jpayne@69 1521 uint8_t BCF_VL_R
jpayne@69 1522
jpayne@69 1523 # === Dictionary ===
jpayne@69 1524 #
jpayne@69 1525 # The header keeps three dictionaries. The first keeps IDs in the
jpayne@69 1526 # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
jpayne@69 1527 # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
jpayne@69 1528 # is the actual hash table, which is opaque to the end users. In the hash
jpayne@69 1529 # table, the key is the ID or sample name as a C string and the value is a
jpayne@69 1530 # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
jpayne@69 1531 # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
jpayne@69 1532 # size of the hash table or, equivalently, the length of the id[] arrays.
jpayne@69 1533
jpayne@69 1534 uint8_t BCF_DT_ID # dictionary type
jpayne@69 1535 uint8_t BCF_DT_CTG
jpayne@69 1536 uint8_t BCF_DT_SAMPLE
jpayne@69 1537
jpayne@69 1538 # Complete textual representation of a header line
jpayne@69 1539 ctypedef struct bcf_hrec_t:
jpayne@69 1540 int type # One of the BCF_HL_* type
jpayne@69 1541 char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
jpayne@69 1542 char *value # Set only for generic lines, NULL for FILTER/INFO, etc.
jpayne@69 1543 int nkeys # Number of structured fields
jpayne@69 1544 char **keys # The key=value pairs
jpayne@69 1545 char **vals
jpayne@69 1546
jpayne@69 1547 ctypedef struct bcf_idinfo_t:
jpayne@69 1548 uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
jpayne@69 1549 bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
jpayne@69 1550 int id
jpayne@69 1551
jpayne@69 1552 ctypedef struct bcf_idpair_t:
jpayne@69 1553 const char *key
jpayne@69 1554 const bcf_idinfo_t *val
jpayne@69 1555
jpayne@69 1556 ctypedef struct bcf_hdr_t:
jpayne@69 1557 int32_t n[3] # n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI)
jpayne@69 1558 bcf_idpair_t *id[3]
jpayne@69 1559 void *dict[3] # ID dictionary, contig dict and sample dict
jpayne@69 1560 char **samples
jpayne@69 1561 bcf_hrec_t **hrec
jpayne@69 1562 int nhrec, dirty
jpayne@69 1563 int ntransl
jpayne@69 1564 int *transl[2] # for bcf_translate()
jpayne@69 1565 int nsamples_ori # for bcf_hdr_set_samples()
jpayne@69 1566 uint8_t *keep_samples
jpayne@69 1567 kstring_t mem
jpayne@69 1568 int32_t m[3] # m: allocated size of the dictionary block in use (see n above)
jpayne@69 1569
jpayne@69 1570 uint8_t bcf_type_shift[]
jpayne@69 1571
jpayne@69 1572 # * VCF record *
jpayne@69 1573
jpayne@69 1574 uint8_t BCF_BT_NULL
jpayne@69 1575 uint8_t BCF_BT_INT8
jpayne@69 1576 uint8_t BCF_BT_INT16
jpayne@69 1577 uint8_t BCF_BT_INT32
jpayne@69 1578 uint8_t BCF_BT_FLOAT
jpayne@69 1579 uint8_t BCF_BT_CHAR
jpayne@69 1580
jpayne@69 1581 uint8_t VCF_REF
jpayne@69 1582 uint8_t VCF_SNP
jpayne@69 1583 uint8_t VCF_MNP
jpayne@69 1584 uint8_t VCF_INDEL
jpayne@69 1585 uint8_t VCF_OTHER
jpayne@69 1586 uint8_t VCF_BND
jpayne@69 1587 uint8_t VCF_OVERLAP
jpayne@69 1588
jpayne@69 1589
jpayne@69 1590 ctypedef struct variant_t:
jpayne@69 1591 int type, n # variant type and the number of bases affected, negative for deletions
jpayne@69 1592
jpayne@69 1593 ctypedef struct bcf_fmt_t:
jpayne@69 1594 int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
jpayne@69 1595 int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
jpayne@69 1596 uint8_t *p # same as vptr and vptr_* in bcf_info_t below
jpayne@69 1597 uint32_t p_len
jpayne@69 1598 uint32_t p_off
jpayne@69 1599 uint8_t p_free
jpayne@69 1600
jpayne@69 1601 union bcf_info_union_t:
jpayne@69 1602 int32_t i # integer value
jpayne@69 1603 float f # float value
jpayne@69 1604
jpayne@69 1605 ctypedef struct bcf_info_t:
jpayne@69 1606 int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
jpayne@69 1607 int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars
jpayne@69 1608
jpayne@69 1609 # v1 union only set if $len==1; for easier access
jpayne@69 1610 bcf_info_union_t v1
jpayne@69 1611 uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
jpayne@69 1612 uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset
jpayne@69 1613 uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes
jpayne@69 1614 uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new
jpayne@69 1615 # data block is bigger than the original
jpayne@69 1616
jpayne@69 1617 uint8_t BCF1_DIRTY_ID
jpayne@69 1618 uint8_t BCF1_DIRTY_ALS
jpayne@69 1619 uint8_t BCF1_DIRTY_FLT
jpayne@69 1620 uint8_t BCF1_DIRTY_INF
jpayne@69 1621
jpayne@69 1622 ctypedef struct bcf_dec_t:
jpayne@69 1623 int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change
jpayne@69 1624 int n_flt # Number of FILTER fields
jpayne@69 1625 int *flt # FILTER keys in the dictionary
jpayne@69 1626 char *id # ID
jpayne@69 1627 char *als # REF+ALT block (\0-separated)
jpayne@69 1628 char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated
jpayne@69 1629 bcf_info_t *info # INFO
jpayne@69 1630 bcf_fmt_t *fmt # FORMAT and individual sample
jpayne@69 1631 variant_t *var # $var and $var_type set only when set_variant_types called
jpayne@69 1632 int n_var, var_type
jpayne@69 1633 int shared_dirty # if set, shared.s must be recreated on BCF output
jpayne@69 1634 int indiv_dirty # if set, indiv.s must be recreated on BCF output
jpayne@69 1635
jpayne@69 1636 uint8_t BCF_ERR_CTG_UNDEF
jpayne@69 1637 uint8_t BCF_ERR_TAG_UNDEF
jpayne@69 1638 uint8_t BCF_ERR_NCOLS
jpayne@69 1639 uint8_t BCF_ERR_LIMITS
jpayne@69 1640 uint8_t BCF_ERR_CHAR
jpayne@69 1641 uint8_t BCF_ERR_CTG_INVALID
jpayne@69 1642 uint8_t BCF_ERR_TAG_INVALID
jpayne@69 1643
jpayne@69 1644 # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
jpayne@69 1645 # is slower because the string is first to be parsed, packed into BCF line
jpayne@69 1646 # (done in vcf_parse), then unpacked into internal bcf1_t structure. If it
jpayne@69 1647 # is known in advance that some of the fields will not be required (notably
jpayne@69 1648 # the sample columns), parsing of these can be skipped by setting max_unpack
jpayne@69 1649 # appropriately.
jpayne@69 1650 # Similarly, it is fast to output a BCF line because the columns (kept in
jpayne@69 1651 # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
jpayne@69 1652 # line must be formatted in vcf_format.
jpayne@69 1653
jpayne@69 1654 ctypedef struct bcf1_t:
jpayne@69 1655 int32_t rid # CHROM
jpayne@69 1656 int32_t pos # POS
jpayne@69 1657 int32_t rlen # length of REF
jpayne@69 1658 float qual # QUAL
jpayne@69 1659 uint32_t n_info, n_allele
jpayne@69 1660 uint32_t n_fmt, n_sample
jpayne@69 1661 kstring_t shared, indiv
jpayne@69 1662 bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
jpayne@69 1663 int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
jpayne@69 1664 int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
jpayne@69 1665 int unpack_size[3] # the original block size of ID, REF+ALT and FILTER
jpayne@69 1666 int errcode # one of BCF_ERR_* codes
jpayne@69 1667
jpayne@69 1668 ####### API #######
jpayne@69 1669
jpayne@69 1670 # BCF and VCF I/O
jpayne@69 1671 #
jpayne@69 1672 # A note about naming conventions: htslib internally represents VCF
jpayne@69 1673 # records as bcf1_t data structures, therefore most functions are
jpayne@69 1674 # prefixed with bcf_. There are a few exceptions where the functions must
jpayne@69 1675 # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
jpayne@69 1676 # these cases, functions prefixed with bcf_ are more general and work
jpayne@69 1677 # with both BCF and VCF.
jpayne@69 1678
jpayne@69 1679 # bcf_hdr_init() - create an empty BCF header.
jpayne@69 1680 # @param mode "r" or "w"
jpayne@69 1681 #
jpayne@69 1682 # When opened for writing, the mandatory fileFormat and
jpayne@69 1683 # FILTER=PASS lines are added automatically.
jpayne@69 1684 bcf_hdr_t *bcf_hdr_init(const char *mode)
jpayne@69 1685
jpayne@69 1686 # Destroy a BCF header struct
jpayne@69 1687 void bcf_hdr_destroy(bcf_hdr_t *h)
jpayne@69 1688
jpayne@69 1689 # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t))
jpayne@69 1690 bcf1_t *bcf_init()
jpayne@69 1691
jpayne@69 1692 # Deallocate a bcf1_t object
jpayne@69 1693 void bcf_destroy(bcf1_t *v)
jpayne@69 1694
jpayne@69 1695 # Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
jpayne@69 1696 # not the bcf1_t object itself.
jpayne@69 1697 void bcf_empty(bcf1_t *v)
jpayne@69 1698
jpayne@69 1699 # Make the bcf1_t object ready for next read. Intended mostly for
jpayne@69 1700 # internal use, the user should rarely need to call this function
jpayne@69 1701 # directly.
jpayne@69 1702 void bcf_clear(bcf1_t *v)
jpayne@69 1703
jpayne@69 1704 # Reads VCF or BCF header
jpayne@69 1705 bcf_hdr_t *bcf_hdr_read(htsFile *fp)
jpayne@69 1706
jpayne@69 1707 # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
jpayne@69 1708 # @samples: samples to include or exclude from file or as a comma-separated string.
jpayne@69 1709 # LIST|FILE .. select samples in list/file
jpayne@69 1710 # ^LIST|FILE .. exclude samples from list/file
jpayne@69 1711 # - .. include all samples
jpayne@69 1712 # NULL .. exclude all samples
jpayne@69 1713 # @is_file: @samples is a file (1) or a comma-separated list (0)
jpayne@69 1714 #
jpayne@69 1715 # The bottleneck of VCF reading is parsing of genotype fields. If the
jpayne@69 1716 # reader knows in advance that only subset of samples is needed (possibly
jpayne@69 1717 # no samples at all), the performance of bcf_read() can be significantly
jpayne@69 1718 # improved by calling bcf_hdr_set_samples after bcf_hdr_read().
jpayne@69 1719 # The function bcf_read() will subset the VCF/BCF records automatically
jpayne@69 1720 # with the notable exception when reading records via bcf_itr_next().
jpayne@69 1721 # In this case, bcf_subset_format() must be called explicitly, because
jpayne@69 1722 # bcf_readrec() does not see the header.
jpayne@69 1723 #
jpayne@69 1724 # Returns 0 on success, -1 on error or a positive integer if the list
jpayne@69 1725 # contains samples not present in the VCF header. In such a case, the
jpayne@69 1726 # return value is the index of the offending sample.
jpayne@69 1727 #
jpayne@69 1728 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
jpayne@69 1729 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
jpayne@69 1730
jpayne@69 1731 # Writes VCF or BCF header
jpayne@69 1732 int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
jpayne@69 1733
jpayne@69 1734 # Parse VCF line contained in kstring and populate the bcf1_t struct
jpayne@69 1735 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
jpayne@69 1736
jpayne@69 1737 # The opposite of vcf_parse. It should rarely be called directly, see vcf_write
jpayne@69 1738 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
jpayne@69 1739
jpayne@69 1740 # bcf_read() - read next VCF or BCF record
jpayne@69 1741 #
jpayne@69 1742 # Returns -1 on critical errors, 0 otherwise. On errors which are not
jpayne@69 1743 # critical for reading, such as missing header definitions, v->errcode is
jpayne@69 1744 # set to one of BCF_ERR* code and must be checked before calling
jpayne@69 1745 # vcf_write().
jpayne@69 1746 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
jpayne@69 1747
jpayne@69 1748 # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
jpayne@69 1749 #
jpayne@69 1750 # Note that bcf_unpack() must be called even when reading VCF. It is safe
jpayne@69 1751 # to call the function repeatedly, it will not unpack the same field
jpayne@69 1752 # twice.
jpayne@69 1753 uint8_t BCF_UN_STR # up to ALT inclusive
jpayne@69 1754 uint8_t BCF_UN_FLT # up to FILTER
jpayne@69 1755 uint8_t BCF_UN_INFO # up to INFO
jpayne@69 1756 uint8_t BCF_UN_SHR # all shared information
jpayne@69 1757 uint8_t BCF_UN_FMT # unpack format and each sample
jpayne@69 1758 uint8_t BCF_UN_IND # a synonym of BCF_UN_FMT
jpayne@69 1759 uint8_t BCF_UN_ALL # everything
jpayne@69 1760
jpayne@69 1761 int bcf_unpack(bcf1_t *b, int which)
jpayne@69 1762
jpayne@69 1763 # bcf_dup() - create a copy of BCF record.
jpayne@69 1764 #
jpayne@69 1765 # Note that bcf_unpack() must be called on the returned copy as if it was
jpayne@69 1766 # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
jpayne@69 1767 # internally to reflect any changes made by bcf_update_* functions.
jpayne@69 1768 bcf1_t *bcf_dup(bcf1_t *src)
jpayne@69 1769 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
jpayne@69 1770
jpayne@69 1771 # bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
jpayne@69 1772 int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v)
jpayne@69 1773
jpayne@69 1774 # The following functions work only with VCFs and should rarely be called
jpayne@69 1775 # directly. Usually one wants to use their bcf_* alternatives, which work
jpayne@69 1776 # transparently with both VCFs and BCFs.
jpayne@69 1777 bcf_hdr_t *vcf_hdr_read(htsFile *fp)
jpayne@69 1778 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
jpayne@69 1779 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
jpayne@69 1780 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
jpayne@69 1781
jpayne@69 1782 #************************************************************************
jpayne@69 1783 # Header querying and manipulation routines
jpayne@69 1784 #************************************************************************
jpayne@69 1785
jpayne@69 1786 # Create a new header using the supplied template
jpayne@69 1787 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
jpayne@69 1788
jpayne@69 1789 # Copy header lines from src to dst if not already present in dst. See also bcf_translate().
jpayne@69 1790 # Returns 0 on success or sets a bit on error:
jpayne@69 1791 # 1 .. conflicting definitions of tag length
jpayne@69 1792 # # todo
jpayne@69 1793 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
jpayne@69 1794
jpayne@69 1795 # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
jpayne@69 1796 # @param dst: the destination header to be merged into, NULL on the first pass
jpayne@69 1797 # @param src: the source header
jpayne@69 1798 #
jpayne@69 1799 # Notes:
jpayne@69 1800 # - use as:
jpayne@69 1801 # bcf_hdr_t *dst = NULL;
jpayne@69 1802 # for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]);
jpayne@69 1803 #
jpayne@69 1804 # - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when
jpayne@69 1805 # combining multiple BCF headers. The current bcf_hdr_combine()
jpayne@69 1806 # does not have this problem, but became slow when used for many files.
jpayne@69 1807 bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
jpayne@69 1808
jpayne@69 1809 # bcf_hdr_add_sample() - add a new sample.
jpayne@69 1810 # @param sample: sample name to be added
jpayne@69 1811 int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample)
jpayne@69 1812
jpayne@69 1813 # Read VCF header from a file and update the header
jpayne@69 1814 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
jpayne@69 1815
jpayne@69 1816 # Appends formatted header text to _str_.
jpayne@69 1817 # If _is_bcf_ is zero, `IDX` fields are discarded.
jpayne@69 1818 # @return 0 if successful, or negative if an error occurred
jpayne@69 1819 # @since 1.4
jpayne@69 1820 int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str);
jpayne@69 1821
jpayne@69 1822 # Returns formatted header (newly allocated string) and its length,
jpayne@69 1823 # excluding the terminating \0. If is_bcf parameter is unset, IDX
jpayne@69 1824 # fields are discarded.
jpayne@69 1825 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
jpayne@69 1826
jpayne@69 1827 # Append new VCF header line, returns 0 on success
jpayne@69 1828 int bcf_hdr_append(bcf_hdr_t *h, const char *line)
jpayne@69 1829 int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
jpayne@69 1830
jpayne@69 1831 # VCF version, e.g. VCFv4.2
jpayne@69 1832 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
jpayne@69 1833 void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
jpayne@69 1834
jpayne@69 1835 # bcf_hdr_remove() - remove VCF header tag
jpayne@69 1836 # @param type: one of BCF_HL_*
jpayne@69 1837 # @param key: tag name or NULL to remove all tags of the given type
jpayne@69 1838 void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key)
jpayne@69 1839
jpayne@69 1840 # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
jpayne@69 1841 # @param n: number of samples to keep
jpayne@69 1842 # @param samples: names of the samples to keep
jpayne@69 1843 # @param imap: mapping from index in @samples to the sample index in the original file
jpayne@69 1844 #
jpayne@69 1845 # Sample names not present in h0 are ignored. The number of unmatched samples can be checked
jpayne@69 1846 # by comparing n and bcf_hdr_nsamples(out_hdr).
jpayne@69 1847 # This function can be used to reorder samples.
jpayne@69 1848 # See also bcf_subset() which subsets individual records.
jpayne@69 1849 #
jpayne@69 1850 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
jpayne@69 1851
jpayne@69 1852 # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names)
jpayne@69 1853 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs)
jpayne@69 1854
jpayne@69 1855 # Get number of samples
jpayne@69 1856 int32_t bcf_hdr_nsamples(const bcf_hdr_t *h)
jpayne@69 1857
jpayne@69 1858 # The following functions are for internal use and should rarely be called directly
jpayne@69 1859 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
jpayne@69 1860 int bcf_hdr_sync(bcf_hdr_t *h)
jpayne@69 1861 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
jpayne@69 1862 void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
jpayne@69 1863 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
jpayne@69 1864
jpayne@69 1865 # bcf_hdr_get_hrec() - get header line info
jpayne@69 1866 # @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN
jpayne@69 1867 # @param key: the header key for generic lines (e.g. "fileformat"), any field
jpayne@69 1868 # for structured lines, typically "ID".
jpayne@69 1869 # @param value: the value which pairs with key. Can be NULL for BCF_HL_GEN
jpayne@69 1870 # @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL
jpayne@69 1871 #
jpayne@69 1872 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
jpayne@69 1873 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
jpayne@69 1874 void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
jpayne@69 1875 void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
jpayne@69 1876 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
jpayne@69 1877 void hrec_add_idx(bcf_hrec_t *hrec, int idx)
jpayne@69 1878 void bcf_hrec_destroy(bcf_hrec_t *hrec)
jpayne@69 1879
jpayne@69 1880 #************************************************************************
jpayne@69 1881 # Individual record querying and manipulation routines
jpayne@69 1882 #************************************************************************
jpayne@69 1883
jpayne@69 1884 # See the description of bcf_hdr_subset()
jpayne@69 1885 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
jpayne@69 1886
jpayne@69 1887 # bcf_translate() - translate tags ids to be consistent with different header. This function
jpayne@69 1888 # is useful when lines from multiple VCF need to be combined.
jpayne@69 1889 # @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine()
jpayne@69 1890 # @src_hdr: the source header, used in bcf_read()
jpayne@69 1891 # @src_line: line obtained by bcf_read()
jpayne@69 1892 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line)
jpayne@69 1893
jpayne@69 1894 # bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc
jpayne@69 1895 int bcf_get_variant_types(bcf1_t *rec)
jpayne@69 1896 int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
jpayne@69 1897 int bcf_is_snp(bcf1_t *v)
jpayne@69 1898
jpayne@69 1899 # bcf_update_filter() - sets the FILTER column
jpayne@69 1900 # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
jpayne@69 1901 # @n: Number of filters. If n==0, all filters are removed
jpayne@69 1902 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
jpayne@69 1903
jpayne@69 1904 # bcf_add_filter() - adds to the FILTER column
jpayne@69 1905 # @flt_id: The filter IDs to add, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
jpayne@69 1906 #
jpayne@69 1907 # If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
jpayne@69 1908 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
jpayne@69 1909
jpayne@69 1910 # bcf_remove_filter() - removes from the FILTER column
jpayne@69 1911 # @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
jpayne@69 1912 # @set_pass: when set to 1 and no filters are present, set to PASS
jpayne@69 1913 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass)
jpayne@69 1914
jpayne@69 1915 # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably.
jpayne@69 1916 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
jpayne@69 1917
jpayne@69 1918 # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column
jpayne@69 1919 # @alleles: Array of alleles
jpayne@69 1920 # @nals: Number of alleles
jpayne@69 1921 # @alleles_string: Comma-separated alleles, starting with the REF allele
jpayne@69 1922 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
jpayne@69 1923 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
jpayne@69 1924
jpayne@69 1925 # bcf_update_id() - sets new ID string
jpayne@69 1926 # bcf_add_id() - adds to the ID string checking for duplicates
jpayne@69 1927 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
jpayne@69 1928 int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
jpayne@69 1929
jpayne@69 1930 # bcf_update_info_*() - functions for updating INFO fields
jpayne@69 1931 # @hdr: the BCF header
jpayne@69 1932 # @line: VCF line to be edited
jpayne@69 1933 # @key: the INFO tag to be updated
jpayne@69 1934 # @values: pointer to the array of values. Pass NULL to remove the tag.
jpayne@69 1935 # @n: number of values in the array. When set to 0, the INFO tag is removed
jpayne@69 1936 #
jpayne@69 1937 # The @string in bcf_update_info_flag() is optional, @n indicates whether
jpayne@69 1938 # the flag is set or removed.
jpayne@69 1939 #
jpayne@69 1940 # Returns 0 on success or negative value on error.
jpayne@69 1941 #
jpayne@69 1942 int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
jpayne@69 1943 int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
jpayne@69 1944 int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
jpayne@69 1945 int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
jpayne@69 1946 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
jpayne@69 1947
jpayne@69 1948 # bcf_update_format_*() - functions for updating FORMAT fields
jpayne@69 1949 # @values: pointer to the array of values, the same number of elements
jpayne@69 1950 # is expected for each sample. Missing values must be padded
jpayne@69 1951 # with bcf_*_missing or bcf_*_vector_end values.
jpayne@69 1952 # @n: number of values in the array. If n==0, existing tag is removed.
jpayne@69 1953 #
jpayne@69 1954 # The function bcf_update_format_string() is a higher-level (slower) variant of
jpayne@69 1955 # bcf_update_format_char(). The former accepts array of \0-terminated strings
jpayne@69 1956 # whereas the latter requires that the strings are collapsed into a single array
jpayne@69 1957 # of fixed-length strings. In case of strings with variable length, shorter strings
jpayne@69 1958 # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
jpayne@69 1959 # are not \0-terminated.
jpayne@69 1960 #
jpayne@69 1961 # Returns 0 on success or negative value on error.
jpayne@69 1962 #
jpayne@69 1963 int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
jpayne@69 1964 int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
jpayne@69 1965 int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
jpayne@69 1966 int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n)
jpayne@69 1967 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
jpayne@69 1968 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
jpayne@69 1969
jpayne@69 1970 # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
jpayne@69 1971 # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
jpayne@69 1972 # from bcf_get_genotypes() below.
jpayne@69 1973 uint32_t bcf_gt_phased(uint32_t idx)
jpayne@69 1974 uint32_t bcf_gt_unphased(uint32_t idx)
jpayne@69 1975 uint32_t bcf_gt_missing
jpayne@69 1976 uint32_t bcf_gt_is_missing(uint32_t val)
jpayne@69 1977 uint32_t bcf_gt_is_phased(uint32_t idx)
jpayne@69 1978 uint32_t bcf_gt_allele(uint32_t val)
jpayne@69 1979
jpayne@69 1980 # Conversion between allele indexes and Number=G genotype index (assuming diploid, all 0-based)
jpayne@69 1981 uint32_t bcf_alleles2gt(uint32_t a, uint32_t b)
jpayne@69 1982 void bcf_gt2alleles(int igt, int *a, int *b)
jpayne@69 1983
jpayne@69 1984 # bcf_get_fmt() - returns pointer to FORMAT's field data
jpayne@69 1985 # @header: for access to BCF_DT_ID dictionary
jpayne@69 1986 # @line: VCF line obtained from vcf_parse1
jpayne@69 1987 # @fmt: one of GT,PL,...
jpayne@69 1988 #
jpayne@69 1989 # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field
jpayne@69 1990 # is not available.
jpayne@69 1991 #
jpayne@69 1992 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
jpayne@69 1993 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
jpayne@69 1994
jpayne@69 1995 # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID
jpayne@69 1996 # @line: VCF line obtained from vcf_parse1
jpayne@69 1997 # @id: The header index for the tag, obtained from bcf_hdr_id2int()
jpayne@69 1998 #
jpayne@69 1999 # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid
jpayne@69 2000 # as their goal is to avoid the header lookup.
jpayne@69 2001 #
jpayne@69 2002 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
jpayne@69 2003 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
jpayne@69 2004
jpayne@69 2005 # bcf_get_info_*() - get INFO values, integers or floats
jpayne@69 2006 # @hdr: BCF header
jpayne@69 2007 # @line: BCF record
jpayne@69 2008 # @tag: INFO tag to retrieve
jpayne@69 2009 # @dst: *dst is pointer to a memory location, can point to NULL
jpayne@69 2010 # @ndst: pointer to the size of allocated memory
jpayne@69 2011 #
jpayne@69 2012 # Returns negative value on error or the number of written values on
jpayne@69 2013 # success. bcf_get_info_string() returns on success the number of
jpayne@69 2014 # characters written excluding the null-terminating byte. bcf_get_info_flag()
jpayne@69 2015 # returns 1 when flag is set or 0 if not.
jpayne@69 2016 #
jpayne@69 2017 # List of return codes:
jpayne@69 2018 # -1 .. no such INFO tag defined in the header
jpayne@69 2019 # -2 .. clash between types defined in the header and encountered in the VCF record
jpayne@69 2020 # -3 .. tag is not present in the VCF record
jpayne@69 2021 #
jpayne@69 2022 int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
jpayne@69 2023 int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
jpayne@69 2024 int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
jpayne@69 2025 int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst)
jpayne@69 2026 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
jpayne@69 2027
jpayne@69 2028 # bcf_get_format_*() - same as bcf_get_info*() above
jpayne@69 2029 #
jpayne@69 2030 # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
jpayne@69 2031 # see the description of bcf_update_format_string() and bcf_update_format_char() above.
jpayne@69 2032 # Unlike other bcf_get_format_*() functions, bcf_get_format_string() allocates two arrays:
jpayne@69 2033 # a single block of \0-terminated strings collapsed into a single array and an array of pointers
jpayne@69 2034 # to these strings. Both arrays must be cleaned by the user.
jpayne@69 2035 #
jpayne@69 2036 # Returns negative value on error or the number of written values on success.
jpayne@69 2037 #
jpayne@69 2038 # Example:
jpayne@69 2039 # int ndst = 0; char **dst = NULL
jpayne@69 2040 # if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
jpayne@69 2041 # for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i])
jpayne@69 2042 # free(dst[0]); free(dst)
jpayne@69 2043 #
jpayne@69 2044 # Example:
jpayne@69 2045 # int ngt, *gt_arr = NULL, ngt_arr = 0
jpayne@69 2046 # ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr)
jpayne@69 2047 #
jpayne@69 2048 int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
jpayne@69 2049 int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
jpayne@69 2050 int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
jpayne@69 2051 int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int32_t **dst, int *ndst)
jpayne@69 2052 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
jpayne@69 2053 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
jpayne@69 2054
jpayne@69 2055 #************************************************************************
jpayne@69 2056 # Helper functions
jpayne@69 2057 #************************************************************************
jpayne@69 2058
jpayne@69 2059 #
jpayne@69 2060 # bcf_hdr_id2int() - Translates string into numeric ID
jpayne@69 2061 # bcf_hdr_int2id() - Translates numeric ID into string
jpayne@69 2062 # @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE
jpayne@69 2063 # @id: tag name, such as: PL, DP, GT, etc.
jpayne@69 2064 #
jpayne@69 2065 # Returns -1 if string is not in dictionary, otherwise numeric ID which identifies
jpayne@69 2066 # fields in BCF records.
jpayne@69 2067 #
jpayne@69 2068 int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id)
jpayne@69 2069 const char *bcf_hdr_int2id(const bcf_hdr_t *hdr, int type, int int_id)
jpayne@69 2070
jpayne@69 2071 # bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID
jpayne@69 2072 # bcf_hdr_id2name() - Translates numeric ID to sequence name
jpayne@69 2073 #
jpayne@69 2074 int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id)
jpayne@69 2075 const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid)
jpayne@69 2076 const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec)
jpayne@69 2077
jpayne@69 2078 #
jpayne@69 2079 # bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
jpayne@69 2080 # @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
jpayne@69 2081 # @int_id: return value of bcf_hdr_id2int, must be >=0
jpayne@69 2082 #
jpayne@69 2083 # The returned values are:
jpayne@69 2084 # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_*
jpayne@69 2085 # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields
jpayne@69 2086 # bcf_hdr_id2type .. the field type, one of BCF_HT_*
jpayne@69 2087 # bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
jpayne@69 2088 #
jpayne@69 2089 # Notes: Prior to using the macros, the presence of the info should be
jpayne@69 2090 # tested with bcf_hdr_idinfo_exists().
jpayne@69 2091 #
jpayne@69 2092 int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id)
jpayne@69 2093 int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id)
jpayne@69 2094 int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id)
jpayne@69 2095 int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id)
jpayne@69 2096 int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id)
jpayne@69 2097 bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id)
jpayne@69 2098
jpayne@69 2099 void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
jpayne@69 2100 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
jpayne@69 2101
jpayne@69 2102 void bcf_enc_vchar(kstring_t *s, int l, const char *a)
jpayne@69 2103 void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
jpayne@69 2104 void bcf_enc_vfloat(kstring_t *s, int n, float *a)
jpayne@69 2105
jpayne@69 2106 #************************************************************************
jpayne@69 2107 # BCF index
jpayne@69 2108 #
jpayne@69 2109 # Note that these functions work with BCFs only. See synced_bcf_reader.h
jpayne@69 2110 # which provides (amongst other things) an API to work transparently with
jpayne@69 2111 # both indexed BCFs and VCFs.
jpayne@69 2112 #************************************************************************
jpayne@69 2113
jpayne@69 2114 hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
jpayne@69 2115 hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
jpayne@69 2116 int bcf_index_build(const char *fn, int min_shift)
jpayne@69 2117 int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
jpayne@69 2118
jpayne@69 2119 #*******************
jpayne@69 2120 # Typed value I/O *
jpayne@69 2121 #******************
jpayne@69 2122
jpayne@69 2123 # Note that in contrast with BCFv2.1 specification, HTSlib implementation
jpayne@69 2124 # allows missing values in vectors. For integer types, the values 0x80,
jpayne@69 2125 # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
jpayne@69 2126 # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
jpayne@69 2127 # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
jpayne@69 2128 # end-of-vector indicator.
jpayne@69 2129 # Note that the end-of-vector byte is not part of the vector.
jpayne@69 2130
jpayne@69 2131 # This trial BCF version (v2.2) is compatible with the VCF specification and
jpayne@69 2132 # makes it possible to correctly handle vectors with different ploidy in the
jpayne@69 2133 # presence of missing values.
jpayne@69 2134
jpayne@69 2135 int32_t bcf_int8_vector_end
jpayne@69 2136 int32_t bcf_int16_vector_end
jpayne@69 2137 int32_t bcf_int32_vector_end
jpayne@69 2138 int32_t bcf_str_vector_end
jpayne@69 2139 int32_t bcf_int8_missing
jpayne@69 2140 int32_t bcf_int16_missing
jpayne@69 2141 int32_t bcf_int32_missing
jpayne@69 2142 int32_t bcf_str_missing
jpayne@69 2143
jpayne@69 2144 uint32_t bcf_float_vector_end
jpayne@69 2145 uint32_t bcf_float_missing
jpayne@69 2146
jpayne@69 2147 void bcf_float_set(float *ptr, uint32_t value)
jpayne@69 2148 void bcf_float_set_vector_end(float *x)
jpayne@69 2149 void bcf_float_set_missing(float *x)
jpayne@69 2150
jpayne@69 2151 int bcf_float_is_missing(float f)
jpayne@69 2152 int bcf_float_is_vector_end(float f)
jpayne@69 2153 void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
jpayne@69 2154 void bcf_enc_size(kstring_t *s, int size, int type)
jpayne@69 2155 int bcf_enc_inttype(long x)
jpayne@69 2156 void bcf_enc_int1(kstring_t *s, int32_t x)
jpayne@69 2157 int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
jpayne@69 2158 int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
jpayne@69 2159 int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
jpayne@69 2160
jpayne@69 2161 # These trivial wrappers are defined only for consistency with other parts of htslib
jpayne@69 2162 bcf1_t *bcf_init1()
jpayne@69 2163 int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
jpayne@69 2164 int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
jpayne@69 2165 int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
jpayne@69 2166 int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
jpayne@69 2167 void bcf_destroy1(bcf1_t *v)
jpayne@69 2168 void bcf_empty1(bcf1_t *v)
jpayne@69 2169 int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
jpayne@69 2170 void bcf_clear1(bcf1_t *v)
jpayne@69 2171 int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
jpayne@69 2172
jpayne@69 2173 # Other nice wrappers
jpayne@69 2174 void bcf_itr_destroy(hts_itr_t *iter)
jpayne@69 2175 hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
jpayne@69 2176 hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s)
jpayne@69 2177 int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r)
jpayne@69 2178 hts_idx_t *bcf_index_load(const char *fn)
jpayne@69 2179 const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr)
jpayne@69 2180
jpayne@69 2181
jpayne@69 2182 # VCF/BCF utility functions
jpayne@69 2183 cdef extern from "htslib/vcfutils.h" nogil:
jpayne@69 2184 struct kbitset_t  # declared without fields here; passed by pointer to bcf_remove_allele_set()
jpayne@69 2185
jpayne@69 2186 # bcf_trim_alleles() - remove ALT alleles unused in genotype fields
jpayne@69 2187 # @header: for access to BCF_DT_ID dictionary
jpayne@69 2188 # @line: VCF line obtained from vcf_parse1
jpayne@69 2189 #
jpayne@69 2190 # Returns the number of removed alleles on success or negative
jpayne@69 2191 # on error:
jpayne@69 2192 # -1 .. some allele index is out of bounds
jpayne@69 2193 int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
jpayne@69 2194
jpayne@69 2195 # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask
jpayne@69 2196 # @header: for access to BCF_DT_ID dictionary
jpayne@69 2197 # @line: VCF line obtained from vcf_parse1
jpayne@69 2198 # @mask: alleles to remove
jpayne@69 2199 #
jpayne@69 2200 # If you have more than 31 alleles, then the integer bit mask will
jpayne@69 2201 # overflow, so use bcf_remove_allele_set instead
jpayne@69 2202 void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask)
jpayne@69 2203
jpayne@69 2204 # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set
jpayne@69 2205 # @header: for access to BCF_DT_ID dictionary
jpayne@69 2206 # @line: VCF line obtained from vcf_parse1
jpayne@69 2207 # @rm_set: pointer to kbitset_t object with bits set for allele
jpayne@69 2208 # indexes to remove
jpayne@69 2209 #
jpayne@69 2210 # Number=A,R,G INFO and FORMAT fields will be updated accordingly.
jpayne@69 2211 void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set)
jpayne@69 2212
jpayne@69 2213 # bcf_calc_ac() - calculate the number of REF and ALT alleles
jpayne@69 2214 # @header: for access to BCF_DT_ID dictionary
jpayne@69 2215 # @line: VCF line obtained from vcf_parse1
jpayne@69 2216 # @ac: array of length line->n_allele
jpayne@69 2217 # @which: determine if INFO/AN,AC and indv fields be used
jpayne@69 2218 #
jpayne@69 2219 # Returns 1 if the call succeeded, or 0 if the value could not
jpayne@69 2220 # be determined.
jpayne@69 2221 #
jpayne@69 2222 # The value of @which determines if existing INFO/AC,AN can be
jpayne@69 2223 # used (BCF_UN_INFO) and if indv fields can be split (BCF_UN_FMT).
jpayne@69 2224 int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
jpayne@69 2225
jpayne@69 2226 # bcf_gt_type() - determines type of the genotype
jpayne@69 2227 # @fmt_ptr: the GT format field as set for example by set_fmt_ptr
jpayne@69 2228 # @isample: sample index (starting from 0)
jpayne@69 2229 # @ial: index of the 1st non-reference allele (starting from 1)
jpayne@69 2230 # @jal: index of the 2nd non-reference allele (starting from 1)
jpayne@69 2231 #
jpayne@69 2232 # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
jpayne@69 2233 # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
jpayne@69 2234 # is not NULL and the genotype has one or more non-reference
jpayne@69 2235 # alleles, $ial will be set. In case of GT_HET_AA, $ial is the
jpayne@69 2236 # position of the allele which appeared first in ALT. If $jal is
jpayne@69 2237 # not NULL and the genotype is GT_HET_AA, $jal will be set and is
jpayne@69 2238 # the position of the second allele in ALT.
jpayne@69 2239 uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation
jpayne@69 2240 uint8_t GT_HOM_AA
jpayne@69 2241 uint8_t GT_HET_RA
jpayne@69 2242 uint8_t GT_HET_AA
jpayne@69 2243 uint8_t GT_HAPL_R
jpayne@69 2244 uint8_t GT_HAPL_A
jpayne@69 2245 uint8_t GT_UNKN
jpayne@69 2246 int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal)
jpayne@69 2247
jpayne@69 2248 int bcf_acgt2int(char c)
jpayne@69 2249 char bcf_int2acgt(int i)
jpayne@69 2250
jpayne@69 2251 # bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
jpayne@69 2252 # @i,j: allele indexes, 0-based, i<=j
jpayne@69 2253 # Returns index to the Number=G diploid array
jpayne@69 2254 uint32_t bcf_ij2G(uint32_t i, uint32_t j)
jpayne@69 2255
jpayne@69 2256
jpayne@69 2257 cdef extern from "htslib/cram.h" nogil:
jpayne@69 2258
jpayne@69 2259 enum cram_block_method:
jpayne@69 2260 ERROR
jpayne@69 2261 RAW
jpayne@69 2262 GZIP
jpayne@69 2263 BZIP2
jpayne@69 2264 LZMA
jpayne@69 2265 RANS
jpayne@69 2266 RANS0
jpayne@69 2267 RANS1
jpayne@69 2268 GZIP_RLE
jpayne@69 2269
jpayne@69 2270 enum cram_content_type:
jpayne@69 2271 CT_ERROR
jpayne@69 2272 FILE_HEADER
jpayne@69 2273 COMPRESSION_HEADER
jpayne@69 2274 MAPPED_SLICE
jpayne@69 2275 UNMAPPED_SLICE
jpayne@69 2276 EXTERNAL
jpayne@69 2277 CORE
jpayne@69 2278
jpayne@69 2279 # Opaque data types, see cram_structs for the fully fledged versions.
jpayne@69 2280 ctypedef struct SAM_hdr
jpayne@69 2281 ctypedef struct cram_file_def
jpayne@69 2282 ctypedef struct cram_fd
jpayne@69 2283 ctypedef struct cram_container
jpayne@69 2284 ctypedef struct cram_block
jpayne@69 2285 ctypedef struct cram_slice
jpayne@69 2286 ctypedef struct cram_metrics
jpayne@69 2287 ctypedef struct cram_block_slice_hdr
jpayne@69 2288 ctypedef struct cram_block_compression_hdr
jpayne@69 2289 ctypedef struct refs_t
jpayne@69 2290
jpayne@69 2291 # Accessor functions
jpayne@69 2292
jpayne@69 2293 #
jpayne@69 2294 #-----------------------------------------------------------------------------
jpayne@69 2295 # cram_fd
jpayne@69 2296 #
jpayne@69 2297 SAM_hdr *cram_fd_get_header(cram_fd *fd)
jpayne@69 2298 void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr)
jpayne@69 2299
jpayne@69 2300 int cram_fd_get_version(cram_fd *fd)
jpayne@69 2301 void cram_fd_set_version(cram_fd *fd, int vers)
jpayne@69 2302
jpayne@69 2303 int cram_major_vers(cram_fd *fd)
jpayne@69 2304 int cram_minor_vers(cram_fd *fd)
jpayne@69 2305
jpayne@69 2306 hFILE *cram_fd_get_fp(cram_fd *fd)
jpayne@69 2307 void cram_fd_set_fp(cram_fd *fd, hFILE *fp)
jpayne@69 2308
jpayne@69 2309 #
jpayne@69 2310 #-----------------------------------------------------------------------------
jpayne@69 2311 # cram_container
jpayne@69 2312 #
jpayne@69 2313 int32_t cram_container_get_length(cram_container *c)
jpayne@69 2314 void cram_container_set_length(cram_container *c, int32_t length)
jpayne@69 2315 int32_t cram_container_get_num_blocks(cram_container *c)
jpayne@69 2316 void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks)
jpayne@69 2317 int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks)
jpayne@69 2318 void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
jpayne@69 2319 int32_t *landmarks)
jpayne@69 2320
jpayne@69 2321 # Returns true if the container is empty (EOF marker)
jpayne@69 2322 int cram_container_is_empty(cram_fd *fd)
jpayne@69 2323
jpayne@69 2324
jpayne@69 2325 #
jpayne@69 2326 #-----------------------------------------------------------------------------
jpayne@69 2327 # cram_block
jpayne@69 2328 #
jpayne@69 2329 int32_t cram_block_get_content_id(cram_block *b)
jpayne@69 2330 int32_t cram_block_get_comp_size(cram_block *b)
jpayne@69 2331 int32_t cram_block_get_uncomp_size(cram_block *b)
jpayne@69 2332 int32_t cram_block_get_crc32(cram_block *b)
jpayne@69 2333 void * cram_block_get_data(cram_block *b)
jpayne@69 2334
jpayne@69 2335 cram_content_type cram_block_get_content_type(cram_block *b)
jpayne@69 2336
jpayne@69 2337 void cram_block_set_content_id(cram_block *b, int32_t id)
jpayne@69 2338 void cram_block_set_comp_size(cram_block *b, int32_t size)
jpayne@69 2339 void cram_block_set_uncomp_size(cram_block *b, int32_t size)
jpayne@69 2340 void cram_block_set_crc32(cram_block *b, int32_t crc)
jpayne@69 2341 void cram_block_set_data(cram_block *b, void *data)
jpayne@69 2342
jpayne@69 2343 int cram_block_append(cram_block *b, void *data, int size)
jpayne@69 2344 void cram_block_update_size(cram_block *b)
jpayne@69 2345
jpayne@69 2346 # Offset is known as "size" internally, but it can be confusing.
jpayne@69 2347 size_t cram_block_get_offset(cram_block *b)
jpayne@69 2348 void cram_block_set_offset(cram_block *b, size_t offset)
jpayne@69 2349
jpayne@69 2350 #
jpayne@69 2351 # Computes the size of a cram block, including the block
jpayne@69 2352 # header itself.
jpayne@69 2353 #
jpayne@69 2354 uint32_t cram_block_size(cram_block *b)
jpayne@69 2355
jpayne@69 2356 #
jpayne@69 2357 # Renumbers RG numbers in a cram compression header.
jpayne@69 2358 #
jpayne@69 2359 # CRAM stores RG as the Nth number in the header, rather than a
jpayne@69 2360 # string holding the ID: tag. This is smaller in space, but means
jpayne@69 2361 # "samtools cat" to join files together that contain single but
jpayne@69 2362 # different RG lines needs a way of renumbering them.
jpayne@69 2363 #
jpayne@69 2364 # The file descriptor is expected to be immediately after the
jpayne@69 2365 # cram_container structure (ie before the cram compression header).
jpayne@69 2366 # Due to the nature of the CRAM format, this needs to read and write
jpayne@69 2367 # the blocks itself. Note that there may be multiple slices within
jpayne@69 2368 # the container, meaning multiple compression headers to manipulate.
jpayne@69 2369 # Changing RG may change the size of the compression header and
jpayne@69 2370 # therefore the length field in the container. Hence we rewrite all
jpayne@69 2371 # blocks just in case and also emit the adjusted container.
jpayne@69 2372 #
jpayne@69 2373 # The current implementation can only cope with renumbering a single
jpayne@69 2374 # RG (and only then if it is using HUFFMAN or BETA codecs). In
jpayne@69 2375 # theory it *may* be possible to renumber multiple RGs if they use
jpayne@69 2376 # HUFFMAN to the CORE block or use an external block unshared by any
jpayne@69 2377 # other data series. So we have an API that can be upgraded to
jpayne@69 2378 # support this, but do not implement it for now. An example
jpayne@69 2379 # implementation of RG as an EXTERNAL block would be to find that
jpayne@69 2380 # block and rewrite it, returning the number of blocks consumed.
jpayne@69 2381 #
jpayne@69 2382 # Returns 0 on success;
jpayne@69 2383 # -1 if unable to edit;
jpayne@69 2384 # -2 on other errors (eg I/O).
jpayne@69 2385 #
jpayne@69 2386 int cram_transcode_rg(cram_fd *input, cram_fd *output,
jpayne@69 2387 cram_container *c,
jpayne@69 2388 int nrg, int *in_rg, int *out_rg)
jpayne@69 2389
jpayne@69 2390 #
jpayne@69 2391 # Copies the blocks representing the next num_slice slices from a
jpayne@69 2392 # container from 'in' to 'out'. It is expected that the file pointer
jpayne@69 2393 # is just after the read of the cram_container and cram compression
jpayne@69 2394 # header.
jpayne@69 2395 #
jpayne@69 2396 # Returns 0 on success
jpayne@69 2397 # -1 on failure
jpayne@69 2398 #
jpayne@69 2399 int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice)
jpayne@69 2400
jpayne@69 2401 #
jpayne@69 2402 #-----------------------------------------------------------------------------
jpayne@69 2403 # SAM_hdr
jpayne@69 2404 #
jpayne@69 2405
jpayne@69 2406 # Tokenises a SAM header into a hash table.
jpayne@69 2407 #
jpayne@69 2408 # Also extracts a few bits on specific data types, such as @RG lines.
jpayne@69 2409 #
jpayne@69 2410 # @return
jpayne@69 2411 # Returns a SAM_hdr struct on success (free with sam_hdr_free())
jpayne@69 2412 # NULL on failure
jpayne@69 2413 #
jpayne@69 2414 SAM_hdr *sam_hdr_parse_(const char *hdr, int len)
jpayne@69 2415
jpayne@69 2416
jpayne@69 2417 #
jpayne@69 2418 #-----------------------------------------------------------------------------
jpayne@69 2419 # cram_io basics
jpayne@69 2420 #
jpayne@69 2421
jpayne@69 2422 # CRAM blocks - the dynamically growable data block. We have code to
jpayne@69 2423 # create, update, (un)compress and read/write.
jpayne@69 2424 #
jpayne@69 2425 # These are derived from the deflate_interlaced.c blocks, but with the
jpayne@69 2426 # CRAM extension of content types and IDs.
jpayne@69 2427 #
jpayne@69 2428
jpayne@69 2429 # Allocates a new cram_block structure with a specified content_type and
jpayne@69 2430 # id.
jpayne@69 2431 #
jpayne@69 2432 # @return
jpayne@69 2433 # Returns block pointer on success;
jpayne@69 2434 # NULL on failure
jpayne@69 2435 #
jpayne@69 2436 cram_block *cram_new_block(cram_content_type content_type,
jpayne@69 2437 int content_id)
jpayne@69 2438
jpayne@69 2439 # Reads a block from a cram file.
jpayne@69 2440 #
jpayne@69 2441 # @return
jpayne@69 2442 # Returns cram_block pointer on success;
jpayne@69 2443 # NULL on failure
jpayne@69 2444 #
jpayne@69 2445 cram_block *cram_read_block(cram_fd *fd)
jpayne@69 2446
jpayne@69 2447 # Writes a CRAM block.
jpayne@69 2448 #
jpayne@69 2449 # @return
jpayne@69 2450 # Returns 0 on success;
jpayne@69 2451 # -1 on failure
jpayne@69 2452 #
jpayne@69 2453 int cram_write_block(cram_fd *fd, cram_block *b)
jpayne@69 2454
jpayne@69 2455 # Frees a CRAM block, deallocating internal data too.
jpayne@69 2456 #
jpayne@69 2457 void cram_free_block(cram_block *b)
jpayne@69 2458
jpayne@69 2459 # Uncompresses a CRAM block, if compressed.
jpayne@69 2460 #
jpayne@69 2461 # @return
jpayne@69 2462 # Returns 0 on success;
jpayne@69 2463 # -1 on failure
jpayne@69 2464 #
jpayne@69 2465 int cram_uncompress_block(cram_block *b)
jpayne@69 2466
jpayne@69 2467 # Compresses a block.
jpayne@69 2468 #
jpayne@69 2469 # Compresses a block using one of two different zlib strategies. If we only
jpayne@69 2470 # want one choice set strat2 to be -1.
jpayne@69 2471 #
jpayne@69 2472 # The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
jpayne@69 2473 # or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
jpayne@69 2474 # significantly faster.
jpayne@69 2475 #
jpayne@69 2476 # @return
jpayne@69 2477 # Returns 0 on success;
jpayne@69 2478 # -1 on failure
jpayne@69 2479 #
jpayne@69 2480 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
jpayne@69 2481 int method, int level)
jpayne@69 2482
jpayne@69 2483 # Containers
jpayne@69 2484 #
jpayne@69 2485
jpayne@69 2486 # Creates a new container, specifying the maximum number of slices
jpayne@69 2487 # and records permitted.
jpayne@69 2488 #
jpayne@69 2489 # @return
jpayne@69 2490 # Returns cram_container ptr on success;
jpayne@69 2491 # NULL on failure
jpayne@69 2492 #
jpayne@69 2493 cram_container *cram_new_container(int nrec, int nslice)
jpayne@69 2494 void cram_free_container(cram_container *c)
jpayne@69 2495
jpayne@69 2496 # Reads a container header.
jpayne@69 2497 #
jpayne@69 2498 # @return
jpayne@69 2499 # Returns cram_container on success;
jpayne@69 2500 # NULL on failure or no container left (fd->err == 0).
jpayne@69 2501 #
jpayne@69 2502 cram_container *cram_read_container(cram_fd *fd)
jpayne@69 2503
jpayne@69 2504 # Writes a container structure.
jpayne@69 2505 #
jpayne@69 2506 # @return
jpayne@69 2507 # Returns 0 on success;
jpayne@69 2508 # -1 on failure
jpayne@69 2509 #
jpayne@69 2510 int cram_write_container(cram_fd *fd, cram_container *h)
jpayne@69 2511
jpayne@69 2512 #
jpayne@69 2513 # Stores the container structure in dat and returns *size as the
jpayne@69 2514 # number of bytes written to dat[]. The input size of dat is also
jpayne@69 2515 # held in *size and should be initialised to cram_container_size(c).
jpayne@69 2516 #
jpayne@69 2517 # Returns 0 on success;
jpayne@69 2518 # -1 on failure
jpayne@69 2519 #
jpayne@69 2520 int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
jpayne@69 2521
jpayne@69 2522 int cram_container_size(cram_container *c)
jpayne@69 2523
jpayne@69 2524 # The top-level cram opening, closing and option handling
jpayne@69 2525 #
jpayne@69 2526
jpayne@69 2527 # Opens a CRAM file for read (mode "rb") or write ("wb").
jpayne@69 2528 #
jpayne@69 2529 # The filename may be "-" to indicate stdin or stdout.
jpayne@69 2530 #
jpayne@69 2531 # @return
jpayne@69 2532 # Returns file handle on success;
jpayne@69 2533 # NULL on failure.
jpayne@69 2534 #
jpayne@69 2535 cram_fd *cram_open(const char *filename, const char *mode)
jpayne@69 2536
jpayne@69 2537 # Opens an existing stream for reading or writing.
jpayne@69 2538 #
jpayne@69 2539 # @return
jpayne@69 2540 # Returns file handle on success;
jpayne@69 2541 # NULL on failure.
jpayne@69 2542 #
jpayne@69 2543 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode)
jpayne@69 2544
jpayne@69 2545 # Closes a CRAM file.
jpayne@69 2546 #
jpayne@69 2547 # @return
jpayne@69 2548 # Returns 0 on success;
jpayne@69 2549 # -1 on failure
jpayne@69 2550 #
jpayne@69 2551 int cram_close(cram_fd *fd)
jpayne@69 2552
jpayne@69 2553 #
jpayne@69 2554 # Seek within a CRAM file.
jpayne@69 2555 #
jpayne@69 2556 # Returns 0 on success
jpayne@69 2557 # -1 on failure
jpayne@69 2558 #
jpayne@69 2559 int cram_seek(cram_fd *fd, off_t offset, int whence)
jpayne@69 2560
jpayne@69 2561 #
jpayne@69 2562 # Flushes a CRAM file.
jpayne@69 2563 # Useful for when writing to stdout without wishing to close the stream.
jpayne@69 2564 #
jpayne@69 2565 # Returns 0 on success
jpayne@69 2566 # -1 on failure
jpayne@69 2567 #
jpayne@69 2568 int cram_flush(cram_fd *fd)
jpayne@69 2569
jpayne@69 2570 # Checks for end of file on a cram_fd stream.
jpayne@69 2571 #
jpayne@69 2572 # @return
jpayne@69 2573 # Returns 0 if not at end of file
jpayne@69 2574 # 1 if we hit an expected EOF (end of range or EOF block)
jpayne@69 2575 # 2 for other EOF (end of stream without EOF block)
jpayne@69 2576 #
jpayne@69 2577 int cram_eof(cram_fd *fd)
jpayne@69 2578
jpayne@69 2579 # Sets options on the cram_fd.
jpayne@69 2580 #
jpayne@69 2581 # See CRAM_OPT_* definitions in hts.h.
jpayne@69 2582 # Use this immediately after opening.
jpayne@69 2583 #
jpayne@69 2584 # @return
jpayne@69 2585 # Returns 0 on success;
jpayne@69 2586 # -1 on failure
jpayne@69 2587 #
jpayne@69 2588 int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...)
jpayne@69 2589
jpayne@69 2590 # Sets options on the cram_fd; as cram_set_option(), but takes a va_list instead of variadic arguments.
jpayne@69 2591 #
jpayne@69 2592 # See CRAM_OPT_* definitions in hts.h.
jpayne@69 2593 # Use this immediately after opening.
jpayne@69 2594 #
jpayne@69 2595 # @return
jpayne@69 2596 # Returns 0 on success;
jpayne@69 2597 # -1 on failure
jpayne@69 2598 #
jpayne@69 2599 int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args)
jpayne@69 2600
jpayne@69 2601 #
jpayne@69 2602 # Attaches a header to a cram_fd.
jpayne@69 2603 #
jpayne@69 2604 # This should be used when creating a new cram_fd for writing where
jpayne@69 2605 # we have a SAM_hdr already constructed (e.g. from a file we've read
jpayne@69 2606 # in).
jpayne@69 2607 #
jpayne@69 2608 # @return
jpayne@69 2609 # Returns 0 on success;
jpayne@69 2610 # -1 on failure
jpayne@69 2611 #
jpayne@69 2612 int cram_set_header(cram_fd *fd, SAM_hdr *hdr)
jpayne@69 2613
jpayne@69 2614 # Check if this file has a proper EOF block
jpayne@69 2615 #
jpayne@69 2616 # @return
jpayne@69 2617 # Returns 3 if the file is a version of CRAM that does not contain EOF blocks
jpayne@69 2618 # 2 if the file is a stream and thus unseekable
jpayne@69 2619 # 1 if the file contains an EOF block
jpayne@69 2620 # 0 if the file does not contain an EOF block
jpayne@69 2621 # -1 if an error occurred whilst reading the file or we could not seek back to where we were
jpayne@69 2622 #
jpayne@69 2623 #
jpayne@69 2624 int cram_check_EOF(cram_fd *fd)
jpayne@69 2625
jpayne@69 2626 # As int32_decode/int32_encode, but from/to blocks instead of cram_fd
jpayne@69 2627 int int32_put_blk(cram_block *b, int32_t val)
jpayne@69 2628
jpayne@69 2629 # Deallocates all storage used by a SAM_hdr struct.
jpayne@69 2630 #
jpayne@69 2631 # This also decrements the header reference count. If after decrementing
jpayne@69 2632 # it is still non-zero then the header is assumed to be in use by another
jpayne@69 2633 # caller and the free is not done.
jpayne@69 2634 #
jpayne@69 2635 # This is a synonym for sam_hdr_dec_ref().
jpayne@69 2636 #
jpayne@69 2637 void sam_hdr_free(SAM_hdr *hdr)
jpayne@69 2638
jpayne@69 2639 # Returns the current length of the SAM_hdr in text form.
jpayne@69 2640 #
jpayne@69 2641 # Call sam_hdr_rebuild() first if editing has taken place.
jpayne@69 2642 #
jpayne@69 2643 int sam_hdr_length(SAM_hdr *hdr)
jpayne@69 2644
jpayne@69 2645 # Returns the string form of the SAM_hdr.
jpayne@69 2646 #
jpayne@69 2647 # Call sam_hdr_rebuild() first if editing has taken place.
jpayne@69 2648 #
jpayne@69 2649 char *sam_hdr_str(SAM_hdr *hdr)
jpayne@69 2650
jpayne@69 2651 # Appends a formatted line to an existing SAM header.
jpayne@69 2652 #
jpayne@69 2653 # Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
jpayne@69 2654 # optional new-line. If it contains more than 1 line then multiple lines
jpayne@69 2655 # will be added in order.
jpayne@69 2656 #
jpayne@69 2657 # Len is the length of the text data, or 0 if unknown (in which case
jpayne@69 2658 # it should be null terminated).
jpayne@69 2659 #
jpayne@69 2660 # @return
jpayne@69 2661 # Returns 0 on success;
jpayne@69 2662 # -1 on failure
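jpayne@69 2663 # NOTE(review): no declaration follows this comment; the function it documents is not declared in this file.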
jpayne@69 2664
jpayne@69 2665 # Add an @PG line.
jpayne@69 2666 #
jpayne@69 2667 # If you want complete control over this, use sam_hdr_add() directly. This
jpayne@69 2668 # function uses that, but also attempts to do a lot of tedious housekeeping
jpayne@69 2669 # for you.
jpayne@69 2670 #
jpayne@69 2671 # - It will generate a suitable ID if the supplied one clashes.
jpayne@69 2672 # - It will generate multiple @PG records if we have multiple PG chains.
jpayne@69 2673 #
jpayne@69 2674 # Call it as per sam_hdr_add() with a series of key,value pairs ending
jpayne@69 2675 # in NULL.
jpayne@69 2676 #
jpayne@69 2677 # @return
jpayne@69 2678 # Returns 0 on success;
jpayne@69 2679 # -1 on failure
jpayne@69 2680 #
jpayne@69 2681 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...)
jpayne@69 2682
jpayne@69 2683 #
jpayne@69 2684 # A function to help with construction of CL tags in @PG records.
jpayne@69 2685 # Takes an argc, argv pair and returns a single space-separated string.
jpayne@69 2686 # This string should be deallocated by the calling function.
jpayne@69 2687 #
jpayne@69 2688 # @return
jpayne@69 2689 # Returns malloced char * on success;
jpayne@69 2690 # NULL on failure
jpayne@69 2691 #
jpayne@69 2692 char *stringify_argv(int argc, char *argv[])
jpayne@69 2693
jpayne@69 2694 #
jpayne@69 2695 # Returns the refs_t structure used by a cram file handle.
jpayne@69 2696 #
jpayne@69 2697 # This may be used in conjunction with option CRAM_OPT_SHARED_REF to
jpayne@69 2698 # share reference memory between multiple file handles.
jpayne@69 2699 #
jpayne@69 2700 # @return
jpayne@69 2701 # Returns NULL if none exists or the file handle is not a CRAM file.
jpayne@69 2702 #
jpayne@69 2703 refs_t *cram_get_refs(htsFile *fd)
jpayne@69 2704
jpayne@69 2705
jpayne@69 2706 cdef class HTSFile(object): # C-level attribute and method declarations for the HTSFile extension type
jpayne@69 2707 cdef htsFile *htsfile # pointer to htsFile structure
jpayne@69 2708 cdef int64_t start_offset # BGZF offset of first record
jpayne@69 2709
jpayne@69 2710 cdef readonly object filename # filename as supplied by user
jpayne@69 2711 cdef readonly object mode # file opening mode
jpayne@69 2712 cdef readonly object threads # number of threads to use
jpayne@69 2713 cdef readonly object index_filename # filename of index, if supplied by user
jpayne@69 2714
jpayne@69 2715 cdef readonly bint is_stream # Is htsfile a non-seekable stream
jpayne@69 2716 cdef readonly bint is_remote # Is htsfile a remote stream
jpayne@69 2717 cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh
jpayne@69 2718
jpayne@69 2719 cdef htsFile *_open_htsfile(self) except? NULL # returns an htsFile pointer; `except? NULL` makes Cython check for a pending Python exception whenever NULL is returned