Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libchtslib.pxd @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 # cython: language_level=3 | |
2 from libc.stdint cimport int8_t, int16_t, int32_t, int64_t | |
3 from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t | |
4 from libc.stdlib cimport malloc, calloc, realloc, free | |
5 from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup | |
6 from libc.stdio cimport FILE, printf | |
7 from posix.types cimport off_t | |
8 | |
# Declaration taken from the CPython header: a function that accepts a Python object and returns a C FILE*. | |
# NOTE(review): PyFile_AsFile belongs to the Python 2 C API and is not provided by Python 3 - confirm nothing still calls it. | |
9 cdef extern from "Python.h": | |
10 FILE* PyFile_AsFile(object) | |
11 | |
12 | |
13 # cython does not wrap stdarg | |
# va_list is declared as a struct with no fields, so Cython code can name the type but not look inside it. | |
14 cdef extern from "stdarg.h": | |
15 ctypedef struct va_list: | |
16 pass | |
17 | |
18 | |
# kstring_t and the kputc/kputw/kputl/ksprintf functions that take a kstring_t*, declared from htslib/kstring.h (nogil). | |
# presumably l is the used length and m the allocated size of s - confirm against htslib/kstring.h. | |
19 cdef extern from "htslib/kstring.h" nogil: | |
20 ctypedef struct kstring_t: | |
21 size_t l, m | |
22 char *s | |
23 | |
24 int kputc(int c, kstring_t *s) | |
25 int kputw(int c, kstring_t *s) | |
26 int kputl(long c, kstring_t *s) | |
27 int ksprintf(kstring_t *s, const char *fmt, ...) | |
28 | |
29 | |
# Verbosity accessors plus the khash integer types, table layouts and lookup helpers declared from htslib_util.h (nogil). | |
30 cdef extern from "htslib_util.h" nogil: | |
31 int hts_set_verbosity(int verbosity) | |
32 int hts_get_verbosity() | |
33 | |
34 ctypedef uint32_t khint32_t | |
35 ctypedef uint32_t khint_t | |
36 ctypedef khint_t khiter_t | |
37 | |
38 # Used to manage BCF Header info | |
39 ctypedef struct vdict_t: | |
40 khint_t n_buckets, size, n_occupied, upper_bound | |
41 khint32_t *flags | |
42 const char *keys | |
43 bcf_idinfo_t *vals | |
44 | |
45 # Used to manage indexed contigs in Tabix | |
46 ctypedef struct s2i_t: | |
47 khint_t n_buckets, size, n_occupied, upper_bound | |
48 khint32_t *flags | |
49 const char *keys | |
50 int64_t *vals | |
51 | |
52 # Generic khash methods | |
53 khint_t kh_size(void *d) | |
54 khint_t kh_begin(void *d) | |
55 khint_t kh_end(void *d) | |
56 int kh_exist(void *d, khiter_t i) | |
57 | |
58 # Specialized khash methods for vdict | |
59 khint_t kh_get_vdict(vdict_t *d, const char *key) | |
# The quoted strings below are the C identifiers Cython emits (kh_key / kh_val); | |
# kh_key_vdict / kh_val_vdict are only the names used on the Cython side. | |
60 const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i) | |
61 bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i) | |
62 | |
63 | |
# hFILE stream API declared from htslib/hfile.h (nogil); hFILE is declared without fields and is only handled through pointers. | |
64 cdef extern from "htslib/hfile.h" nogil: | |
65 ctypedef struct hFILE | |
66 | |
67 # @abstract Open the named file or URL as a stream | |
68 # @return An hFILE pointer, or NULL (with errno set) if an error occurred. | |
69 hFILE *hopen(const char *filename, const char *mode, ...) | |
70 | |
71 # @abstract Associate a stream with an existing open file descriptor | |
72 # @return An hFILE pointer, or NULL (with errno set) if an error occurred. | |
73 # @notes For socket descriptors (on Windows), mode should contain 's'. | |
74 hFILE *hdopen(int fd, const char *mode) | |
75 | |
76 # @abstract Report whether the file name or URL denotes remote storage | |
77 # @return 0 if local, 1 if remote. | |
78 # @notes "Remote" means involving e.g. explicit network access, with the | |
79 # implication that callers may wish to cache such files' contents locally. | |
80 int hisremote(const char *filename) | |
81 | |
82 # @abstract Flush (for output streams) and close the stream | |
83 # @return 0 if successful, or EOF (with errno set) if an error occurred. | |
84 int hclose(hFILE *fp) | |
85 | |
86 # @abstract Close the stream, without flushing or propagating errors | |
87 # @notes For use while cleaning up after an error only. Preserves errno. | |
88 void hclose_abruptly(hFILE *fp) | |
89 | |
90 # @abstract Return the stream's error indicator | |
91 # @return Non-zero (in fact, an errno value) if an error has occurred. | |
92 # @notes This would be called herror() and return true/false to parallel | |
93 # ferror(3), but a networking-related herror(3) function already exists. | |
94 int herrno(hFILE *fp) | |
95 | |
96 # @abstract Clear the stream's error indicator | |
97 void hclearerr(hFILE *fp) | |
98 | |
99 # @abstract Reposition the read/write stream offset | |
100 # @return The resulting offset within the stream (as per lseek(2)), | |
101 # or negative if an error occurred. | |
102 off_t hseek(hFILE *fp, off_t offset, int whence) | |
103 | |
104 # @abstract Report the current stream offset | |
105 # @return The offset within the stream, starting from zero. | |
106 off_t htell(hFILE *fp) | |
107 | |
108 # @abstract Read one character from the stream | |
109 # @return The character read, or EOF on end-of-file or error | |
110 int hgetc(hFILE *fp) | |
111 | |
112 # Read from the stream until the delimiter, up to a maximum length | |
113 # @param buffer The buffer into which bytes will be written | |
114 # @param size The size of the buffer | |
115 # @param delim The delimiter (interpreted as an `unsigned char`) | |
116 # @param fp The file stream | |
117 # @return The number of bytes read, or negative on error. | |
118 # @since 1.4 | |
119 # | |
120 # Bytes will be read into the buffer up to and including a delimiter, until | |
121 # EOF is reached, or _size-1_ bytes have been written, whichever comes first. | |
122 # The string will then be terminated with a NUL byte (`\0`). | |
123 ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp) | |
124 | |
125 # Read a line from the stream, up to a maximum length | |
126 # @param buffer The buffer into which bytes will be written | |
127 # @param size The size of the buffer | |
128 # @param fp The file stream | |
129 # @return The number of bytes read, or negative on error. | |
130 # @since 1.4 | |
131 # | |
132 # Specialization of hgetdelim() for a `\n` delimiter. | |
133 ssize_t hgetln(char *buffer, size_t size, hFILE *fp) | |
134 | |
135 # Read a line from the stream, up to a maximum length | |
136 # @param buffer The buffer into which bytes will be written | |
137 # @param size The size of the buffer (must be > 1 to be useful) | |
138 # @param fp The file stream | |
139 # @return _buffer_ on success, or `NULL` if an error occurred. | |
140 # @since 1.4 | |
141 # | |
142 # This function can be used as a replacement for `fgets(3)`, or together with | |
143 # kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_. | |
144 char *hgets(char *buffer, int size, hFILE *fp) | |
145 | |
146 # @abstract Peek at characters to be read without removing them from buffers | |
147 # @param fp The file stream | |
148 # @param buffer The buffer to which the peeked bytes will be written | |
149 # @param nbytes The number of bytes to peek at; limited by the size of the | |
150 # internal buffer, which could be as small as 4K. | |
151 # @return The number of bytes peeked, which may be less than nbytes if EOF | |
152 # is encountered; or negative, if there was an I/O error. | |
153 # @notes The characters peeked at remain in the stream's internal buffer, | |
154 # and will be returned by later hread() etc calls. | |
155 ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) | |
156 | |
157 # @abstract Read a block of characters from the file | |
158 # @return The number of bytes read, or negative if an error occurred. | |
159 # @notes The full nbytes requested will be returned, except as limited | |
160 # by EOF or I/O errors. | |
161 ssize_t hread(hFILE *fp, void *buffer, size_t nbytes) | |
162 | |
163 # @abstract Write a character to the stream | |
164 # @return The character written, or EOF if an error occurred. | |
165 int hputc(int c, hFILE *fp) | |
166 | |
167 # @abstract Write a string to the stream | |
168 # @return 0 if successful, or EOF if an error occurred. | |
169 int hputs(const char *text, hFILE *fp) | |
170 | |
171 # @abstract Write a block of characters to the file | |
172 # @return Either nbytes, or negative if an error occurred. | |
173 # @notes In the absence of I/O errors, the full nbytes will be written. | |
174 ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes) | |
175 | |
176 # @abstract For writing streams, flush buffered output to the underlying stream | |
177 # @return 0 if successful, or EOF if an error occurred. | |
178 int hflush(hFILE *fp) | |
179 | |
180 | |
# BGZF handle layout and the bgzf_* routines declared from htslib/bgzf.h (nogil): | |
# open/close, read/write, seek/tell, and the bgzidx index helpers. | |
181 cdef extern from "htslib/bgzf.h" nogil: | |
182 ctypedef struct bgzf_mtaux_t | |
183 ctypedef struct bgzidx_t | |
184 ctypedef struct z_stream | |
185 | |
186 ctypedef struct BGZF: | |
187 unsigned errcode | |
188 unsigned is_write | |
189 int is_be | |
190 int compress_level | |
191 int is_compressed | |
192 int is_gzip | |
193 int cache_size | |
194 int64_t block_address | |
195 int64_t uncompressed_address | |
196 void *uncompressed_block | |
197 void *compressed_block | |
198 void *cache | |
199 hFILE *fp | |
200 bgzf_mtaux_t *mt | |
201 bgzidx_t *idx | |
202 int idx_build_otf | |
203 z_stream *gz_stream | |
204 | |
205 #***************** | |
206 # Basic routines * | |
207 #***************** | |
208 | |
209 # Open an existing file descriptor for reading or writing. | |
210 # | |
211 # @param fd file descriptor | |
212 # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for | |
213 # writing, 'a' for appending, 'g' for gzip rather than BGZF | |
214 # compression (with 'w' only), and digit specifies the zlib | |
215 # compression level. | |
216 # Note that there is a distinction between 'u' and '0': the | |
217 # first yields plain uncompressed output whereas the latter | |
218 # outputs uncompressed data wrapped in the zlib format. | |
219 # @return BGZF file handler; 0 on error | |
220 | |
221 BGZF* bgzf_dopen(int fd, const char *mode) | |
222 BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility | |
223 | |
224 # Open the specified file for reading or writing. | |
225 BGZF* bgzf_open(const char* path, const char *mode) | |
226 | |
227 # Open an existing hFILE stream for reading or writing. | |
228 BGZF* bgzf_hopen(hFILE *fp, const char *mode) | |
229 | |
230 # Close the BGZF and free all associated resources. | |
231 # | |
232 # @param fp BGZF file handler | |
233 # @return 0 on success and -1 on error | |
234 int bgzf_close(BGZF *fp) | |
235 | |
236 # Read up to _length_ bytes from the file storing into _data_. | |
237 # | |
238 # @param fp BGZF file handler | |
239 # @param data data array to read into | |
240 # @param length size of data to read | |
241 # @return number of bytes actually read; 0 on end-of-file and -1 on error | |
242 ssize_t bgzf_read(BGZF *fp, void *data, size_t length) | |
243 | |
244 # Write _length_ bytes from _data_ to the file. If no I/O errors occur, | |
245 # the complete _length_ bytes will be written (or queued for writing). | |
246 # | |
247 # @param fp BGZF file handler | |
248 # @param data data array to write | |
249 # @param length size of data to write | |
250 # @return number of bytes written (i.e., _length_); negative on error | |
251 ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) | |
252 | |
253 # Read up to _length_ bytes directly from the underlying stream without | |
254 # decompressing. Bypasses BGZF blocking, so must be used with care in | |
255 # specialised circumstances only. | |
256 # | |
257 # @param fp BGZF file handler | |
258 # @param data data array to read into | |
259 # @param length number of raw bytes to read | |
260 # @return number of bytes actually read; 0 on end-of-file and -1 on error | |
261 ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) | |
262 | |
263 # Write _length_ bytes directly to the underlying stream without | |
264 # compressing. Bypasses BGZF blocking, so must be used with care | |
265 # in specialised circumstances only. | |
266 # | |
267 # @param fp BGZF file handler | |
268 # @param data data array to write | |
269 # @param length number of raw bytes to write | |
270 # @return number of bytes actually written; -1 on error | |
271 ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length) | |
272 | |
273 # Write the data in the buffer to the file. | |
274 int bgzf_flush(BGZF *fp) | |
275 | |
276 # Return a virtual file pointer to the current location in the file. | |
277 # No interpretation of the value should be made, other than a subsequent | |
278 # call to bgzf_seek can be used to position the file at the same point. | |
279 # Return value is non-negative on success. | |
280 int64_t bgzf_tell(BGZF *fp) | |
281 | |
282 # Set the file to read from the location specified by _pos_. | |
283 # | |
284 # @param fp BGZF file handler | |
285 # @param pos virtual file offset returned by bgzf_tell() | |
286 # @param whence must be SEEK_SET (cimported from libc.stdio / posix.unistd) | |
287 # @return 0 on success and -1 on error | |
288 # | |
289 int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence) | |
290 | |
291 # Check if the BGZF end-of-file (EOF) marker is present | |
292 # | |
293 # @param fp BGZF file handler opened for reading | |
294 # @return 1 if the EOF marker is present and correct | |
295 # 2 if it can't be checked, e.g., because fp isn't seekable | |
296 # 0 if the EOF marker is absent | |
297 # -1 (with errno set) on error | |
298 int bgzf_check_EOF(BGZF *fp) | |
299 | |
300 # Check if a file is in the BGZF format | |
301 # | |
302 # @param fn file name | |
303 # @return 1 if _fn_ is BGZF; 0 if not or on I/O error | |
304 int bgzf_is_bgzf(const char *fn) | |
305 | |
306 #********************* | |
307 # Advanced routines * | |
308 #********************* | |
309 | |
310 # Set the cache size. Only effective when compiled with -DBGZF_CACHE. | |
311 # | |
312 # @param fp BGZF file handler | |
313 # @param size size of cache in bytes; 0 to disable caching (default) | |
314 void bgzf_set_cache_size(BGZF *fp, int size) | |
315 | |
316 # Flush the file if the remaining buffer size is smaller than _size_ | |
317 # @return 0 if flushing succeeded or was not needed; negative on error | |
318 int bgzf_flush_try(BGZF *fp, ssize_t size) | |
319 | |
320 # Read one byte from a BGZF file. It is faster than bgzf_read() | |
321 # @param fp BGZF file handler | |
322 # @return byte read; -1 on end-of-file or error | |
323 int bgzf_getc(BGZF *fp) | |
324 | |
325 # Read one line from a BGZF file. It is faster than bgzf_getc() | |
326 # | |
327 # @param fp BGZF file handler | |
328 # @param delim delimiter | |
329 # @param str string to write to; must be initialized | |
330 # @return length of the string; 0 on end-of-file; negative on error | |
331 int bgzf_getline(BGZF *fp, int delim, kstring_t *str) | |
332 | |
333 # Read the next BGZF block. | |
334 int bgzf_read_block(BGZF *fp) | |
335 | |
336 # Enable multi-threading (only effective on writing and when the | |
337 # library was compiled with -DBGZF_MT) | |
338 # | |
339 # @param fp BGZF file handler; must be opened for writing | |
340 # @param n_threads #threads used for writing | |
341 # @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended | |
342 int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) | |
343 | |
344 | |
345 # Compress a single BGZF block. | |
346 # | |
347 # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE) | |
348 # @param dlen size of output buffer; updated on return to the number | |
349 # of bytes actually written to dst | |
350 # @param src buffer to be compressed | |
351 # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE) | |
352 # @param level compression level | |
353 # @return 0 on success and negative on error | |
354 # | |
355 int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level) | |
356 | |
357 #******************* | |
358 # bgzidx routines * | |
#******************* | |
# | |
359 # Position BGZF at the uncompressed offset | |
360 # | |
361 # @param fp BGZF file handler; must be opened for reading | |
362 # @param uoffset file offset in the uncompressed data | |
363 # @param where SEEK_SET (cimported from libc.stdio) supported atm | |
364 # | |
365 # Returns 0 on success and -1 on error. | |
366 int bgzf_useek(BGZF *fp, long uoffset, int where) | |
367 | |
368 # Position in uncompressed BGZF | |
369 # | |
370 # @param fp BGZF file handler; must be opened for reading | |
371 # | |
372 # Returns the current offset on success and -1 on error. | |
373 long bgzf_utell(BGZF *fp) | |
374 | |
375 # Tell BGZF to build index while compressing. | |
376 # | |
377 # @param fp BGZF file handler; can be opened for reading or writing. | |
378 # | |
379 # Returns 0 on success and -1 on error. | |
380 int bgzf_index_build_init(BGZF *fp) | |
381 | |
382 # Load BGZF index | |
383 # | |
384 # @param fp BGZF file handler | |
385 # @param bname base name | |
386 # @param suffix suffix to add to bname (can be NULL) | |
387 # | |
388 # Returns 0 on success and -1 on error. | |
389 int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix) | |
390 | |
391 # Save BGZF index | |
392 # | |
393 # @param fp BGZF file handler | |
394 # @param bname base name | |
395 # @param suffix suffix to add to bname (can be NULL) | |
396 # | |
397 # Returns 0 on success and -1 on error. | |
398 int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix) | |
399 | |
400 | |
401 cdef extern from "htslib/hts.h" nogil: | |
402 uint32_t kroundup32(uint32_t x) | |
403 | |
404 ctypedef struct cram_fd | |
405 | |
406 union FilePointerUnion: | |
407 BGZF *bgzf | |
408 cram_fd *cram | |
409 hFILE *hfile | |
410 void *voidp | |
411 | |
412 enum htsFormatCategory: | |
413 unknown_category | |
414 sequence_data # Sequence data -- SAM, BAM, CRAM, etc | |
415 variant_data # Variant calling data -- VCF, BCF, etc | |
416 index_file # Index file associated with some data file | |
417 region_list # Coordinate intervals or regions -- BED, etc | |
418 category_maximum | |
419 | |
420 enum htsExactFormat: | |
421 unknown_format | |
422 binary_format | |
423 text_format | |
424 sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed | |
425 format_maximum | |
426 | |
427 enum htsCompression: | |
428 no_compression, gzip, bgzf, custom | |
429 compression_maximum | |
430 | |
431 cdef enum hts_fmt_option: | |
432 CRAM_OPT_DECODE_MD, | |
433 CRAM_OPT_PREFIX, | |
434 CRAM_OPT_VERBOSITY, | |
435 CRAM_OPT_SEQS_PER_SLICE, | |
436 CRAM_OPT_SLICES_PER_CONTAINER, | |
437 CRAM_OPT_RANGE, | |
438 CRAM_OPT_VERSION, | |
439 CRAM_OPT_EMBED_REF, | |
440 CRAM_OPT_IGNORE_MD5, | |
441 CRAM_OPT_REFERENCE, | |
442 CRAM_OPT_MULTI_SEQ_PER_SLICE, | |
443 CRAM_OPT_NO_REF, | |
444 CRAM_OPT_USE_BZIP2, | |
445 CRAM_OPT_SHARED_REF, | |
446 CRAM_OPT_NTHREADS, | |
447 CRAM_OPT_THREAD_POOL, | |
448 CRAM_OPT_USE_LZMA, | |
449 CRAM_OPT_USE_RANS, | |
450 CRAM_OPT_REQUIRED_FIELDS, | |
451 HTS_OPT_COMPRESSION_LEVEL, | |
452 HTS_OPT_NTHREADS, | |
453 | |
454 ctypedef struct htsVersion: | |
455 short major, minor | |
456 | |
457 ctypedef struct htsFormat: | |
458 htsFormatCategory category | |
459 htsExactFormat format | |
460 htsVersion version | |
461 htsCompression compression | |
462 short compression_level | |
463 void *specific | |
464 | |
465 ctypedef struct htsFile: | |
466 uint8_t is_bin | |
467 uint8_t is_write | |
468 uint8_t is_be | |
469 uint8_t is_cram | |
470 int64_t lineno | |
471 kstring_t line | |
472 char *fn | |
473 char *fn_aux | |
474 FilePointerUnion fp | |
475 htsFormat format | |
476 | |
477 int hts_verbose | |
478 | |
479 cdef union hts_opt_val_union: | |
480 int i | |
481 char *s | |
482 | |
483 ctypedef struct hts_opt: | |
484 char *arg | |
485 hts_fmt_option opt | |
486 hts_opt_val_union val | |
487 void *next | |
488 | |
489 # @abstract Parses arg and appends it to the option list. | |
490 # @return 0 on success and -1 on failure | |
491 int hts_opt_add(hts_opt **opts, const char *c_arg) | |
492 | |
493 # @abstract Applies an hts_opt option list to a given htsFile. | |
494 # @return 0 on success and -1 on failure | |
495 int hts_opt_apply(htsFile *fp, hts_opt *opts) | |
496 | |
497 # @abstract Frees an hts_opt list. | |
498 void hts_opt_free(hts_opt *opts) | |
499 | |
500 # @abstract Table for converting a nucleotide character to 4-bit encoding. | |
501 # The input character may be either an IUPAC ambiguity code, '=' for 0, or | |
502 # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 | |
503 # for A/C/G/T or combinations of these bits for ambiguous bases. | |
504 const unsigned char *seq_nt16_table | |
505 | |
506 # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC | |
507 # ambiguity code letter (or '=' when given 0). | |
508 const char *seq_nt16_str | |
509 | |
510 # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. | |
511 # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). | |
512 const int *seq_nt16_int | |
513 | |
514 # @abstract Get the htslib version number | |
515 # @return For released versions, a string like "N.N[.N]"; or git describe | |
516 # output if using a library built within a Git repository. | |
517 const char *hts_version() | |
518 | |
519 # @abstract Determine format by peeking at the start of a file | |
520 # @param fp File opened for reading, positioned at the beginning | |
521 # @param fmt Format structure that will be filled out on return | |
522 # @return 0 for success, or negative if an error occurred. | |
523 int hts_detect_format(hFILE *fp, htsFormat *fmt) | |
524 | |
525 # @abstract Get a human-readable description of the file format | |
526 # @return Description string, to be freed by the caller after use. | |
527 char *hts_format_description(const htsFormat *format) | |
528 | |
529 # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file | |
530 # @param fn The file name or "-" for stdin/stdout | |
531 # @param mode Mode matching / [rwa][bceguxz0-9]* / | |
532 # @discussion | |
533 # With 'r' opens for reading; any further format mode letters are ignored | |
534 # as the format is detected by checking the first few bytes or BGZF blocks | |
535 # of the file. With 'w' or 'a' opens for writing or appending, with format | |
536 # specifier letters: | |
537 # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) | |
538 # c CRAM format | |
539 # g gzip compressed | |
540 # u uncompressed | |
541 # z bgzf compressed | |
542 # [0-9] zlib compression level | |
543 # and with non-format option letters (for any of 'r'/'w'/'a'): | |
544 # e close the file on exec(2) (opens with O_CLOEXEC, where supported) | |
545 # x create the file exclusively (opens with O_EXCL, where supported) | |
546 # Note that there is a distinction between 'u' and '0': the first yields | |
547 # plain uncompressed output whereas the latter outputs uncompressed data | |
548 # wrapped in the zlib format. | |
549 # @example | |
550 # [rw]b .. compressed BCF, BAM, FAI | |
551 # [rw]bu .. uncompressed BCF | |
552 # [rw]z .. compressed VCF | |
553 # [rw] .. uncompressed VCF | |
554 htsFile *hts_open(const char *fn, const char *mode) | |
555 | |
556 # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file | |
557 # @param fn The file name or "-" for stdin/stdout | |
558 # @param mode Open mode, as per hts_open() | |
559 # @param fmt Optional format specific parameters | |
560 # @discussion | |
561 # See hts_open() for description of fn and mode. | |
562 # // TODO Update documentation for s/opts/fmt/ | |
563 # Opts contains a format string (sam, bam, cram, vcf, bcf) which will, | |
564 # if defined, override mode. Opts also contains a linked list of hts_opt | |
565 # structures to apply to the open file handle. These can contain things | |
566 # like pointers to the reference or information on compression levels, | |
567 # block sizes, etc. | |
568 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) | |
569 | |
570 # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file | |
571 # @param fp The already-open file handle | |
572 # @param fn The file name or "-" for stdin/stdout | |
573 # @param mode Open mode, as per hts_open() | |
574 htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode) | |
575 | |
576 # @abstract For output streams, flush any buffered data | |
577 # @param fp The file handle to be flushed | |
578 # @return 0 for success, or negative if an error occurred. | |
579 # @since 1.14 | |
580 int hts_flush(htsFile *fp) | |
581 | |
582 # @abstract Close a file handle, flushing buffered data for output streams | |
583 # @param fp The file handle to be closed | |
584 # @return 0 for success, or negative if an error occurred. | |
585 int hts_close(htsFile *fp) | |
586 | |
587 # @abstract Returns the file's format information | |
588 # @param fp The file handle | |
589 # @return Read-only pointer to the file's htsFormat. | |
590 const htsFormat *hts_get_format(htsFile *fp) | |
591 | |
592 # @abstract Returns a string containing the file format extension. | |
593 # @param format Format structure containing the file type. | |
594 # @return A string ("sam", "bam", etc) or "?" for unknown formats. | |
595 const char *hts_format_file_extension(const htsFormat *format) | |
596 | |
597 # @abstract Sets a specified CRAM option on the open file handle. | |
598 # @param fp The file handle of the open file. | |
599 # @param opt The CRAM_OPT_* option. | |
600 # @param ... Optional arguments, dependent on the option used. | |
601 # @return 0 for success, or negative if an error occurred. | |
602 int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...) | |
603 | |
604 int hts_getline(htsFile *fp, int delimiter, kstring_t *str) | |
605 char **hts_readlines(const char *fn, int *_n) | |
606 | |
607 # @abstract Parse comma-separated list or read list from a file | |
608 # @param fn File name or comma-separated list | |
609 # @param is_file Non-zero if fn names a file to read the list from; zero if fn is itself the comma-separated list | |
610 # @param _n Size of the output array (number of items read) | |
611 # @return NULL on failure or pointer to newly allocated array of | |
612 # strings | |
613 char **hts_readlist(const char *fn, int is_file, int *_n) | |
614 | |
615 # @abstract Create extra threads to aid compress/decompression for this file | |
616 # @param fp The file handle | |
617 # @param n The number of worker threads to create | |
618 # @return 0 for success, or negative if an error occurred. | |
619 # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE. | |
620 int hts_set_threads(htsFile *fp, int n) | |
621 | |
622 # @abstract Set .fai filename for a file opened for reading | |
623 # @return 0 for success, negative on failure | |
624 # @discussion | |
625 # Called before *_hdr_read(), this provides the name of a .fai file | |
626 # used to provide a reference list if the htsFile contains no @SQ headers. | |
627 int hts_set_fai_filename(htsFile *fp, const char *fn_aux) | |
628 | |
629 int8_t HTS_IDX_NOCOOR | |
630 int8_t HTS_IDX_START | |
631 int8_t HTS_IDX_REST | |
632 int8_t HTS_IDX_NONE | |
633 | |
634 int8_t HTS_FMT_CSI | |
635 int8_t HTS_FMT_BAI | |
636 int8_t HTS_FMT_TBI | |
637 int8_t HTS_FMT_CRAI | |
638 | |
639 BGZF *hts_get_bgzfp(htsFile *fp) | |
640 | |
641 ctypedef struct hts_idx_t | |
642 | |
643 ctypedef struct hts_pair64_t: | |
644 uint64_t u, v | |
645 | |
646 ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end) | |
647 | |
648 ctypedef struct hts_bins_t: | |
649 int n, m | |
650 int *a | |
651 | |
# Iterator state declared for the hts_itr_* functions of this extern block. | |
# The region-start field must be spelled `beg`, the member name in htslib's C struct, in line with | |
# curr_beg and the beg/end arguments of hts_itr_query; a misspelled member only fails once C code is | |
# generated for an access to it, so the error can go unnoticed in the declaration. | |
652 ctypedef struct hts_itr_t: | |
653 uint32_t read_rest | |
654 uint32_t finished | |
655 int tid, beg, end, n_off, i | |
656 int curr_tid, curr_beg, curr_end | |
657 uint64_t curr_off | |
658 hts_pair64_t *off | |
659 hts_readrec_func *readfunc | |
660 hts_bins_t bins | |
661 | |
662 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls) | |
663 void hts_idx_destroy(hts_idx_t *idx) | |
664 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) | |
665 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) | |
666 | |
667 #### Save an index to a file | |
668 # @param idx Index to be written | |
669 # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added | |
670 # @param fmt One of the HTS_FMT_* index formats | |
671 # @return 0 if successful, or negative if an error occurred. | |
672 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) | |
673 | |
674 #### Save an index to a specific file | |
675 # @param idx Index to be written | |
676 # @param fn Input BAM/BCF/etc filename | |
677 # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn | |
678 # @param fmt One of the HTS_FMT_* index formats | |
679 # @return 0 if successful, or negative if an error occurred. | |
680 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt) | |
681 | |
682 #### Load an index file | |
683 # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or | |
684 # the extension substituted, to search for an existing index file | |
685 # @param fmt One of the HTS_FMT_* index formats | |
686 # @return The index, or NULL if an error occurred. | |
687 hts_idx_t *hts_idx_load(const char *fn, int fmt) | |
688 | |
689 #### Load a specific index file | |
690 # @param fn Input BAM/BCF/etc filename | |
691 # @param fnidx The input index filename | |
692 # @return The index, or NULL if an error occurred. | |
693 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx) | |
694 | |
695 #### Load a specific index file | |
696 # @param fn Input BAM/BCF/etc filename | |
697 # @param fnidx The input index filename | |
698 # @param fmt One of the HTS_FMT_* index formats | |
699 # @param flags Flags to alter behaviour (see description) | |
700 # @return The index, or NULL if an error occurred. | |
701 hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags) | |
702 | |
703 int HTS_IDX_SAVE_REMOTE | |
704 int HTS_IDX_SILENT_FAIL | |
705 | |
706 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta) | |
707 void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) | |
708 | |
709 int hts_idx_get_stat(const hts_idx_t* idx, int tid, | |
710 uint64_t* mapped, uint64_t* unmapped) | |
711 | |
712 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) | |
713 | |
714 int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers | |
715 | |
716 # Parse a numeric string | |
717 # The number may be expressed in scientific notation, and optionally may | |
718 # contain commas in the integer part (before any decimal point or E notation). | |
719 # @param str String to be parsed | |
720 # @param strend If non-NULL, set on return to point to the first character | |
721 # in @a str after those forming the parsed number | |
722 # @param flags Or'ed-together combination of HTS_PARSE_* flags | |
723 # @return Converted value of the parsed number. | |
724 # | |
725 # When @a strend is NULL, a warning will be printed (if hts_verbose is 2 | |
726 # or more) if there are any trailing characters after the number. | |
727 long long hts_parse_decimal(const char *str, char **strend, int flags) | |
728 | |
729 # Parse a "CHR:START-END"-style region string | |
730 # @param str String to be parsed | |
731 # @param beg Set on return to the 0-based start of the region | |
732 # @param end Set on return to the 1-based end of the region | |
733 # @return Pointer to the colon or '\0' after the reference sequence name, | |
734 # or NULL if @a str could not be parsed. | |
735 const char *hts_parse_reg(const char *str, int *beg, int *end) | |
736 | |
737 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) | |
738 void hts_itr_destroy(hts_itr_t *iter) | |
739 | |
740 ctypedef int (*hts_name2id_f)(void*, const char*) | |
741 ctypedef const char *(*hts_id2name_f)(void*, int) | |
742 ctypedef hts_itr_t *hts_itr_query_func( | |
743 const hts_idx_t *idx, | |
744 int tid, | |
745 int beg, | |
746 int end, | |
747 hts_readrec_func *readrec) | |
748 | |
749 hts_itr_t *hts_itr_querys( | |
750 const hts_idx_t *idx, | |
751 const char *reg, | |
752 hts_name2id_f getid, | |
753 void *hdr, | |
754 hts_itr_query_func *itr_query, | |
755 hts_readrec_func *readrec) | |
756 | |
757 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) | |
758 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values | |
759 | |
760 # hts_file_type() - Convenience function to determine file type | |
761 # @fname: the file name | |
762 # | |
763 # Returns one of the FT_* defines. | |
764 # | |
765 # DEPRECATED: This function has been replaced by hts_detect_format(). | |
766 # It and these FT_* macros will be removed in a future HTSlib release. | |
767 int FT_UNKN | |
768 int FT_GZ | |
769 int FT_VCF | |
770 int FT_VCF_GZ | |
771 int FT_BCF | |
772 int FT_BCF_GZ | |
773 int FT_STDIN | |
774 | |
775 int hts_file_type(const char *fname) | |
776 | |
777 # /*************************** | |
778 # * Revised MAQ error model * | |
779 # ***************************/ | |
780 | |
781 ctypedef struct errmod_t | |
782 | |
783 errmod_t *errmod_init(double depcorr) | |
784 void errmod_destroy(errmod_t *em) | |
785 | |
786 # /* | |
787 # n: number of bases | |
788 # m: maximum base | |
789 # bases[i]: qual:6, strand:1, base:4 | |
790 # q[i*m+j]: phred-scaled likelihood of (i,j) | |
791 # */ | |
792 int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *Probabilistic) | |
793 | |
794 # /***************************************** | |
795 # * Probabilistic banded glocal alignment * | |
796 # *****************************************/ | |
797 | |
798 ctypedef struct probaln_par_t: | |
799 float d, e | |
800 int bw | |
801 | |
802 int probaln_glocal(const uint8_t *ref, | |
803 int l_ref, | |
804 const uint8_t *query, | |
805 int l_query, const uint8_t *iqual, | |
806 const probaln_par_t *c, | |
807 int *state, uint8_t *q) | |
808 | |
809 # /********************** | |
810 # * MD5 implementation * | |
811 # **********************/ | |
812 | |
813 ctypedef struct hts_md5_context | |
814 | |
815 # /*! @abstract Initialises an MD5 context. | |
816 # * @discussion | |
817 # * The expected use is to allocate an hts_md5_context using | |
818 # * hts_md5_init(). This pointer is then passed into one or more calls | |
819 # * of hts_md5_update() to compute successive internal portions of the | |
820 # * MD5 sum, which can then be externalised as a full 16-byte MD5sum | |
821 # * calculation by calling hts_md5_final(). This can then be turned | |
822 # * into ASCII via hts_md5_hex(). | |
823 # * | |
824 # * To deallocate any resources created by hts_md5_init() call the | |
825 # * hts_md5_destroy() function. | |
826 # * | |
827 # * @return hts_md5_context pointer on success, NULL otherwise. | |
828 # */ | |
829 hts_md5_context *hts_md5_init() | |
830 | |
831 # /*! @abstract Updates the context with the MD5 of the data. */ | |
832 void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size) | |
833 | |
834 # /*! @abstract Computes the final 128-bit MD5 hash from the given context */ | |
835 void hts_md5_final(unsigned char *digest, hts_md5_context *ctx) | |
836 | |
837 # /*! @abstract Resets an md5_context to the initial state, as returned | |
838 # * by hts_md5_init(). | |
839 # */ | |
840 void hts_md5_reset(hts_md5_context *ctx) | |
841 | |
842 # /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-terminated | |
843 # * hex string. | |
844 # */ | |
845 void hts_md5_hex(char *hex, const unsigned char *digest) | |
846 | |
847 # /*! @abstract Deallocates any memory allocated by hts_md5_init. */ | |
848 void hts_md5_destroy(hts_md5_context *ctx) | |
849 | |
850 int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) | |
851 int hts_bin_bot(int bin, int n_lvls) | |
852 | |
853 # * Endianness * | |
854 int ed_is_big() | |
855 uint16_t ed_swap_2(uint16_t v) | |
856 void *ed_swap_2p(void *x) | |
857 uint32_t ed_swap_4(uint32_t v) | |
858 void *ed_swap_4p(void *x) | |
859 uint64_t ed_swap_8(uint64_t v) | |
860 void *ed_swap_8p(void *x) | |
861 | |
862 | |
863 cdef extern from "htslib/sam.h" nogil: | |
864 #********************** | |
865 #*** SAM/BAM header *** | |
866 #********************** | |
867 | |
868 # @abstract Structure for the alignment header. | |
869 # @field n_targets number of reference sequences | |
870 # @field l_text length of the plain text in the header | |
871 # @field target_len lengths of the reference sequences | |
872 # @field target_name names of the reference sequences | |
873 # @field text plain text | |
874 # @field sdict header dictionary | |
875 | |
876 ctypedef struct bam_hdr_t: | |
877 int32_t n_targets, ignore_sam_err | |
878 uint32_t l_text | |
879 uint32_t *target_len | |
880 uint8_t *cigar_tab | |
881 char **target_name | |
882 char *text | |
883 void *sdict | |
884 | |
885 #**************************** | |
886 #*** CIGAR related macros *** | |
887 #**************************** | |
888 | |
889 int BAM_CMATCH | |
890 int BAM_CINS | |
891 int BAM_CDEL | |
892 int BAM_CREF_SKIP | |
893 int BAM_CSOFT_CLIP | |
894 int BAM_CHARD_CLIP | |
895 int BAM_CPAD | |
896 int BAM_CEQUAL | |
897 int BAM_CDIFF | |
898 int BAM_CBACK | |
899 | |
900 char *BAM_CIGAR_STR | |
901 int BAM_CIGAR_SHIFT | |
902 uint32_t BAM_CIGAR_MASK | |
903 uint32_t BAM_CIGAR_TYPE | |
904 | |
905 char bam_cigar_op(uint32_t c) | |
906 uint32_t bam_cigar_oplen(uint32_t c) | |
907 char bam_cigar_opchr(uint32_t) | |
908 uint32_t bam_cigar_gen(char, uint32_t) | |
909 int bam_cigar_type(char o) | |
910 | |
911 # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair | |
912 int BAM_FPAIRED | |
913 # @abstract the read is mapped in a proper pair | |
914 int BAM_FPROPER_PAIR | |
915 # @abstract the read itself is unmapped; conflicts with BAM_FPROPER_PAIR | |
916 int BAM_FUNMAP | |
917 # @abstract the mate is unmapped | |
918 int BAM_FMUNMAP | |
919 # @abstract the read is mapped to the reverse strand | |
920 int BAM_FREVERSE | |
921 # @abstract the mate is mapped to the reverse strand | |
922 int BAM_FMREVERSE | |
923 # @abstract this is read1 | |
924 int BAM_FREAD1 | |
925 # @abstract this is read2 | |
926 int BAM_FREAD2 | |
927 # @abstract not primary alignment | |
928 int BAM_FSECONDARY | |
929 # @abstract QC failure | |
930 int BAM_FQCFAIL | |
931 # @abstract optical or PCR duplicate | |
932 int BAM_FDUP | |
933 # @abstract supplementary alignment | |
934 int BAM_FSUPPLEMENTARY | |
935 | |
936 #************************* | |
937 #*** Alignment records *** | |
938 #************************* | |
939 | |
940 # @abstract Structure for core alignment information. | |
941 # @field tid chromosome ID, defined by bam_hdr_t | |
942 # @field pos 0-based leftmost coordinate | |
943 # @field bin bin calculated by bam_reg2bin() | |
944 # @field qual mapping quality | |
945 # @field l_qname length of the query name | |
946 # @field flag bitwise flag | |
947 # @field n_cigar number of CIGAR operations | |
948 # @field l_qseq length of the query sequence (read) | |
949 # @field mtid chromosome ID of next read in template, defined by bam_hdr_t | |
950 # @field mpos 0-based leftmost coordinate of next read in template | |
951 | |
952 ctypedef struct bam1_core_t: | |
953 int32_t tid | |
954 int32_t pos | |
955 uint16_t bin | |
956 uint8_t qual | |
957 uint8_t l_qname | |
958 uint16_t flag | |
959 uint8_t unused1 | |
960 uint8_t l_extranul | |
961 uint32_t n_cigar | |
962 int32_t l_qseq | |
963 int32_t mtid | |
964 int32_t mpos | |
965 int32_t isize | |
966 | |
967 # @abstract Structure for one alignment. | |
968 # @field core core information about the alignment | |
969 # @field l_data current length of bam1_t::data | |
970 # @field m_data maximum length of bam1_t::data | |
971 # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux | |
972 # | |
973 # @discussion Notes: | |
974 # | |
975 # 1. qname is NUL-terminated and core.l_qname includes the trailing '\0'. | |
976 # 2. l_qseq is calculated from the total length of an alignment block | |
977 # on reading or from CIGAR. | |
978 # 3. cigar data is encoded 4 bytes per CIGAR operation. | |
979 # 4. seq is nybble-encoded according to seq_nt16_table. | |
980 ctypedef struct bam1_t: | |
981 bam1_core_t core | |
982 int l_data | |
983 uint32_t m_data | |
984 uint8_t *data | |
985 uint64_t id | |
986 | |
987 # @abstract Get whether the query is on the reverse strand | |
988 # @param b pointer to an alignment | |
989 # @return boolean true if query is on the reverse strand | |
990 int bam_is_rev(bam1_t *b) | |
991 | |
992 # @abstract Get whether the query's mate is on the reverse strand | |
993 # @param b pointer to an alignment | |
994 # @return boolean true if query's mate on the reverse strand | |
995 int bam_is_mrev(bam1_t *b) | |
996 | |
997 # @abstract Get the name of the query | |
998 # @param b pointer to an alignment | |
999 # @return pointer to the name string, null terminated | |
1000 char *bam_get_qname(bam1_t *b) | |
1001 | |
1002 # @abstract Get the CIGAR array | |
1003 # @param b pointer to an alignment | |
1004 # @return pointer to the CIGAR array | |
1005 # | |
1006 # @discussion In the CIGAR array, each element is a 32-bit integer. The | |
1007 # lower 4 bits give a CIGAR operation and the higher 28 bits keep the | |
1008 # length of a CIGAR. | |
1009 uint32_t *bam_get_cigar(bam1_t *b) | |
1010 | |
1011 # @abstract Get query sequence | |
1012 # @param b pointer to an alignment | |
1013 # @return pointer to sequence | |
1014 # | |
1015 # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, | |
1016 # 8 for T and 15 for N. Two bases are packed in one byte with the base | |
1017 # at the higher 4 bits having smaller coordinate on the read. It is | |
1018 # recommended to use bam_seqi() macro to get the base. | |
1019 char *bam_get_seq(bam1_t *b) | |
1020 | |
1021 # @abstract Get query quality | |
1022 # @param b pointer to an alignment | |
1023 # @return pointer to quality string | |
1024 uint8_t *bam_get_qual(bam1_t *b) | |
1025 | |
1026 # @abstract Get auxiliary data | |
1027 # @param b pointer to an alignment | |
1028 # @return pointer to the concatenated auxiliary data | |
1029 uint8_t *bam_get_aux(bam1_t *b) | |
1030 | |
1031 # @abstract Get length of auxiliary data | |
1032 # @param b pointer to an alignment | |
1033 # @return length of the concatenated auxiliary data | |
1034 int bam_get_l_aux(bam1_t *b) | |
1035 | |
1036 # @abstract Get a base on read | |
1037 # @param s Query sequence returned by bam_get_seq() | |
1038 # @param i The i-th position, 0-based | |
1039 # @return 4-bit integer representing the base. | |
1040 char bam_seqi(char *s, int i) | |
1041 | |
1042 #************************** | |
1043 #*** Exported functions *** | |
1044 #************************** | |
1045 | |
1046 #*************** | |
1047 #*** BAM I/O *** | |
1048 #*************** | |
1049 | |
1050 bam_hdr_t *bam_hdr_init() | |
1051 bam_hdr_t *bam_hdr_read(BGZF *fp) | |
1052 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) | |
1053 void bam_hdr_destroy(bam_hdr_t *h) | |
1054 int bam_name2id(bam_hdr_t *h, const char *ref) | |
1055 bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0) | |
1056 | |
1057 bam1_t *bam_init1() | |
1058 void bam_destroy1(bam1_t *b) | |
1059 int bam_read1(BGZF *fp, bam1_t *b) | |
1060 int bam_write1(BGZF *fp, const bam1_t *b) | |
1061 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) | |
1062 bam1_t *bam_dup1(const bam1_t *bsrc) | |
1063 | |
1064 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) | |
1065 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) | |
1066 | |
1067 # @abstract Calculate the rightmost base position of an alignment on the | |
1068 # reference genome. | |
1069 | |
1070 # @param b pointer to an alignment | |
1071 # @return the coordinate of the first base after the alignment, 0-based | |
1072 | |
1073 # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. | |
1074 # For an unmapped read (either according to its flags or if it has no cigar | |
1075 # string), we return b->core.pos + 1 by convention. | |
1076 int32_t bam_endpos(const bam1_t *b) | |
1077 | |
1078 int bam_str2flag(const char *str) # returns negative value on error | |
1079 char *bam_flag2str(int flag) # The string must be freed by the user | |
1080 | |
1081 #************************* | |
1082 #*** BAM/CRAM indexing *** | |
1083 #************************* | |
1084 | |
1085 # These BAM iterator functions work only on BAM files. To work with either | |
1086 # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. | |
1087 void bam_itr_destroy(hts_itr_t *iter) | |
1088 hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) | |
1089 hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) | |
1090 int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) | |
1091 | |
1092 # Load/build .csi or .bai BAM index file. Does not work with CRAM. | |
1093 # It is recommended to use the sam_index_* functions below instead. | |
1094 hts_idx_t *bam_index_load(const char *fn) | |
1095 int bam_index_build(const char *fn, int min_shift) | |
1096 | |
1097 # Load a BAM (.csi or .bai) or CRAM (.crai) index file | |
1098 # @param fp File handle of the data file whose index is being opened | |
1099 # @param fn BAM/CRAM/etc filename to search alongside for the index file | |
1100 # @return The index, or NULL if an error occurred. | |
1101 hts_idx_t *sam_index_load(htsFile *fp, const char *fn) | |
1102 | |
1103 # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file | |
1104 # @param fp File handle of the data file whose index is being opened | |
1105 # @param fn BAM/CRAM/etc data file filename | |
1106 # @param fnidx Index filename, or NULL to search alongside @a fn | |
1107 # @return The index, or NULL if an error occurred. | |
1108 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) | |
1109 | |
1110 # Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file | |
1111 # @param fp File handle of the data file whose index is being opened | |
1112 # @param fn BAM/CRAM/etc data file filename | |
1113 # @param fnidx Index filename, or NULL to search alongside @a fn | |
1114 # @param flags Flags to alter behaviour | |
1115 # @return The index, or NULL if an error occurred. | |
1116 hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags) | |
1117 | |
1118 # Generate and save an index file | |
1119 # @param fn Input BAM/etc filename, to which .csi/etc will be added | |
1120 # @param min_shift Positive to generate CSI, or 0 to generate BAI | |
1121 # @return 0 if successful, or negative if an error occurred (usually -1; or | |
1122 # -2: opening fn failed; -3: format not indexable) | |
1123 int sam_index_build(const char *fn, int min_shift) | |
1124 | |
1125 # Generate and save an index to a specific file | |
1126 # @param fn Input BAM/CRAM/etc filename | |
1127 # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn | |
1128 # @param min_shift Positive to generate CSI, or 0 to generate BAI | |
1129 # @return 0 if successful, or negative if an error occurred. | |
1130 int sam_index_build2(const char *fn, const char *fnidx, int min_shift) | |
1131 | |
1132 void sam_itr_destroy(hts_itr_t *iter) | |
1133 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) | |
1134 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) | |
1135 int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) | |
1136 | |
1137 #*************** | |
1138 #*** SAM I/O *** | |
1139 #*************** | |
1140 | |
1141 htsFile *sam_open(const char *fn, const char *mode) | |
1142 htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt) | |
1143 int sam_close(htsFile *fp) | |
1144 | |
1145 int sam_open_mode(char *mode, const char *fn, const char *format) | |
1146 | |
1147 # A version of sam_open_mode that can handle ,key=value options. | |
1148 # The format string is allocated and returned, to be freed by the caller. | |
1149 # Prefix should be "r" or "w". | |
1150 char *sam_open_mode_opts(const char *fn, const char *mode, const char *format) | |
1151 | |
1152 bam_hdr_t *sam_hdr_parse(int l_text, const char *text) | |
1153 bam_hdr_t *sam_hdr_read(htsFile *fp) | |
1154 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) | |
1155 | |
1156 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) | |
1157 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) | |
1158 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) | |
1159 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) | |
1160 | |
1161 #************************************* | |
1162 #*** Manipulating auxiliary fields *** | |
1163 #************************************* | |
1164 | |
1165 uint8_t *bam_aux_get(const bam1_t *b, const char *tag) | |
1166 int64_t bam_aux2i(const uint8_t *s) | |
1167 double bam_aux2f(const uint8_t *s) | |
1168 char bam_aux2A(const uint8_t *s) | |
1169 char *bam_aux2Z(const uint8_t *s) | |
1170 | |
1171 void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data) | |
1172 int bam_aux_del(bam1_t *b, uint8_t *s) | |
1173 | |
1174 #************************** | |
1175 #*** Pileup and Mpileup *** | |
1176 #************************** | |
1177 | |
1178 # @abstract Generic pileup 'client data'. | |
1179 # @discussion The pileup iterator allows setting a constructor and | |
1180 # destructor function, which will be called every time a sequence is | |
1181 # fetched and discarded. This permits caching of per-sequence data in | |
1182 # a tidy manner during the pileup process. This union is the cached | |
1183 # data to be manipulated by the "client" (the caller of pileup). | |
1184 # | |
1185 union bam_pileup_cd: | |
1186 void *p | |
1187 int64_t i | |
1188 double f | |
1189 | |
1190 # @abstract Structure for one alignment covering the pileup position. | |
1191 # @field b pointer to the alignment | |
1192 # @field qpos position of the read base at the pileup site, 0-based | |
1193 # @field indel indel length; 0 for no indel, positive for ins and negative for del | |
1194 # @field level the level of the read in the "viewer" mode | |
1195 # @field is_del 1 iff the base on the padded read is a deletion | |
1196 # @field is_head ??? | |
1197 # @field is_tail ??? | |
1198 # @field is_refskip ??? | |
1199 # @field aux ??? | |
1200 # | |
1201 # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The | |
1202 # difference between the two functions is that the former does not | |
1203 # set bam_pileup1_t::level, while the latter does. Level helps the | |
1204 # implementation of alignment viewers, but calculating this has some | |
1205 # overhead. | |
1206 # | |
1207 # is_del, is_head, etc. are bit fields; declaring them as below should | |
1208 # work as expected, see | |
1209 # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J | |
1210 | |
1211 ctypedef struct bam_pileup1_t: | |
1212 bam1_t *b | |
1213 int32_t qpos | |
1214 int indel, level | |
1215 uint32_t is_del | |
1216 uint32_t is_head | |
1217 uint32_t is_tail | |
1218 uint32_t is_refskip | |
1219 uint32_t aux | |
1220 bam_pileup_cd cd | |
1221 | |
1222 ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b) | |
1223 ctypedef int (*bam_test_f)() | |
1224 | |
1225 ctypedef struct __bam_plp_t | |
1226 ctypedef __bam_plp_t *bam_plp_t | |
1227 | |
1228 ctypedef struct __bam_mplp_t | |
1229 ctypedef __bam_mplp_t *bam_mplp_t | |
1230 | |
1231 # bam_plp_init() - sets an iterator over multiple | |
1232 # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return | |
1233 # status: 0 on success, -1 on end, < -1 on non-recoverable errors | |
1234 # @data: user data to pass to @func | |
1235 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) | |
1236 void bam_plp_destroy(bam_plp_t iter) | |
1237 int bam_plp_push(bam_plp_t iter, const bam1_t *b) | |
1238 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) | |
1239 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) | |
1240 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) | |
1241 void bam_plp_reset(bam_plp_t iter) | |
1242 | |
1243 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) | |
1244 | |
1245 # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping | |
1246 # read pairs and for each base pair set the base quality of the | |
1247 # lower-quality base to zero, thus effectively discarding it from | |
1248 # calling. If the two bases are identical, the quality of the other base | |
1249 # is increased to the sum of their qualities (capped at 200), otherwise | |
1250 # it is multiplied by 0.8. | |
1251 void bam_mplp_init_overlaps(bam_mplp_t iter) | |
1252 void bam_mplp_destroy(bam_mplp_t iter) | |
1253 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) | |
1254 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) | |
1255 void bam_mplp_reset(bam_mplp_t iter) | |
1256 void bam_mplp_constructor(bam_mplp_t iter, | |
1257 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) | |
1258 void bam_mplp_destructor(bam_mplp_t iter, | |
1259 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) | |
1260 | |
1261 # Added by AH | |
1262 # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *" | |
1263 | |
1264 | |
1265 | |
1266 | |
1267 # // --------------------------- | |
1268 # // Base modification retrieval | |
1269 | |
1270 # /*! @typedef | |
1271 # @abstract Holds a single base modification. | |
1272 # @field modified_base The short base code (m, h, etc) or -ChEBI (negative) | |
1273 # @field canonical_base The canonical base referred to in the MM tag. | |
1274 # One of A, C, G, T or N. Note this may not be the | |
1275 # explicit base recorded in the SEQ column (esp. if N). | |
1276 # @field strand 0 or 1, indicating + or - strand from MM tag. | |
1277 # @field qual Quality code (256*probability), or -1 if unknown | |
1278 | |
1279 # @discussion | |
1280 # Note this doesn't hold any location data or information on which other | |
1281 # modifications may be possible at this site. | |
1282 ctypedef struct hts_base_mod: | |
1283 int modified_base | |
1284 int canonical_base | |
1285 int strand | |
1286 int qual | |
1287 | |
1288 # /// Allocates an hts_base_mod_state. | |
1289 # /** | |
1290 # * @return An hts_base_mod_state pointer on success, | |
1291 # * NULL on failure. | |
1292 # * | |
1293 # * This just allocates the memory. The initialisation of the contents is | |
1294 # * done using bam_parse_basemod. Successive calls may be made to that | |
1295 # * without the need to free and allocate a new state. | |
1296 # * | |
1297 # * The state should be destroyed using the hts_base_mod_state_free function. | |
1298 # */ | |
1299 ctypedef struct hts_base_mod_state | |
1300 hts_base_mod_state *hts_base_mod_state_alloc() | |
1301 | |
1302 | |
1303 # /// Destroys an hts_base_mod_state. | |
1304 # /** | |
1305 # * @param state The base modification state pointer. | |
1306 # * | |
1307 # * The state should have previously been created by hts_base_mod_state_alloc. | |
1308 # */ | |
1309 void hts_base_mod_state_free(hts_base_mod_state *state) | |
1310 | |
1311 # /// Parses the Mm and Ml tags out of a bam record. | |
1312 # /** | |
1313 # * @param b BAM alignment record | |
1314 # * @param state The base modification state pointer. | |
1315 # * @return 0 on success, | |
1316 # * -1 on failure. | |
1317 # * | |
1318 # * This fills out the contents of the modification state, resetting the | |
1319 # * iterator location to the first sequence base. | |
1320 # */ | |
1321 int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) | |
1322 | |
1323 # /// Finds the next location containing base modifications and returns them | |
1324 # /** | |
1325 # * @param b BAM alignment record | |
1326 # * @param state The base modification state pointer. | |
1327 # * @param mods A supplied array for returning base modifications | |
1328 # * @param n_mods The size of the mods array | |
1329 # * @return The number of modifications found on success, | |
1330 # * 0 if no more modifications are present, | |
1331 # * -1 on failure. | |
1332 # * | |
1333 # * Unlike bam_mods_at_next_pos this skips ahead to the next site | |
1334 # * with modifications. | |
1335 # * | |
1336 # * If more than n_mods modifications are found, the total found is returned. | |
1337 # * Note this means the caller needs to check whether this is higher than | |
1338 # * n_mods. | |
1339 # */ | |
1340 | |
1341 int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,hts_base_mod *mods, int n_mods, int *pos) | |
1342 | |
1343 # *********************************** | |
1344 # * BAQ calculation and realignment * | |
1345 # ***********************************/ | |
1346 int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres) | |
1347 int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag) | |
1348 | |
1349 | |
1350 cdef extern from "htslib/faidx.h" nogil: | |
1351 | |
1352 ctypedef struct faidx_t: | |
1353 pass | |
1354 | |
1355 # /// Build index for a FASTA or bgzip-compressed FASTA file. | |
1356 # /** @param fn FASTA file name | |
1357 # @param fnfai Name of .fai file to build. | |
1358 # @param fngzi Name of .gzi file to build (if fn is bgzip-compressed). | |
1359 # @return 0 on success; or -1 on failure | |
1360 | |
1361 # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. | |
1362 # If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI | |
1363 # file will only be built if fn is bgzip-compressed. | |
1364 # */ | |
1365 int fai_build3(const char *fn, | |
1366 const char *fnfai, | |
1367 const char *fngzi) | |
1368 | |
1369 # /// Build index for a FASTA or bgzip-compressed FASTA file. | |
1370 # /** @param fn FASTA file name | |
1371 # @return 0 on success; or -1 on failure | |
1372 # | |
1373 # File "fn.fai" will be generated. This function is equivalent to | |
1374 # fai_build3(fn, NULL, NULL); | |
1375 # */ | |
1376 int fai_build(char *fn) | |
1377 | |
1378 # /// Destroy a faidx_t struct | |
1379 void fai_destroy(faidx_t *fai) | |
1380 | |
1381 # /// Load FASTA indexes. | |
1382 # /** @param fn File name of the FASTA file (can be compressed with bgzip). | |
1383 # @param fnfai File name of the FASTA index. | |
1384 # @param fngzi File name of the bgzip index. | |
1385 # @param flags Option flags to control index file caching and creation. | |
1386 # @return Pointer to a faidx_t struct on success, NULL on failure. | |
1387 | |
1388 # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. | |
1389 # If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name. | |
1390 # The bgzip index is only needed if fn is compressed. | |
1391 | |
1392 # If (flags & FAI_CREATE) is true, the index files will be built using | |
1393 # fai_build3() if they are not already present. | |
1394 # */ | |
1395 faidx_t *fai_load3(const char *fn, | |
1396 const char *fnfai, | |
1397 const char *fngzi, | |
1398 int flags) | |
1399 | |
1400 # /// Load index from "fn.fai". | |
1401 # /** @param fn File name of the FASTA file | |
1402 # @return Pointer to a faidx_t struct on success, NULL on failure. | |
1403 # This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE); | |
1404 # */ | |
1405 faidx_t *fai_load(char *fn) | |
1406 | |
1407 # /// Fetch the sequence in a region | |
1408 # /** @param fai Pointer to the faidx_t struct | |
1409 # @param reg Region in the format "chr2:20,000-30,000" | |
1410 # @param len Length of the region; -2 if seq not present, -1 general error | |
1411 # @return Pointer to the sequence; `NULL` on failure | |
1412 # The returned sequence is allocated by `malloc()` family and should be destroyed | |
1413 # by end users by calling `free()` on it. | |
1414 # */ | |
1415 char *fai_fetch(faidx_t *fai, | |
1416 char *reg, | |
1417 int *len) | |
1418 | |
1419 # /// Fetch the sequence in a region | |
1420 # /** @param fai Pointer to the faidx_t struct | |
1421 # @param c_name Region name | |
1422 # @param p_beg_i Beginning position number (zero-based) | |
1423 # @param p_end_i End position number (zero-based) | |
1424 # @param len Length of the region; -2 if c_name not present, -1 general error | |
1425 # @return Pointer to the sequence; null on failure | |
1426 # The returned sequence is allocated by `malloc()` family and should be destroyed | |
1427 # by end users by calling `free()` on it. | |
1428 # */ | |
1429 char *faidx_fetch_seq(faidx_t *fai, | |
1430 char *c_name, | |
1431 int p_beg_i, | |
1432 int p_end_i, | |
1433 int *len) | |
1434 | |
1435 # /// Query if sequence is present | |
1436 # /** @param fai Pointer to the faidx_t struct | |
1437 # @param seq Sequence name | |
1438 # @return 1 if present or 0 if absent | |
1439 # */ | |
1440 int faidx_has_seq(faidx_t *fai, const char *seq) | |
1441 | |
1442 # /// Fetch the number of sequences | |
1443 # /** @param fai Pointer to the faidx_t struct | |
1444 # @return The number of sequences | |
1445 # */ | |
1446 int faidx_nseq(const faidx_t *fai) | |
1447 | |
1448 # /// Return name of i-th sequence | |
1449 const char *faidx_iseq(const faidx_t *fai, int i) | |
1450 | |
1451 # /// Return sequence length, -1 if not present | |
1452 int faidx_seq_len(faidx_t *fai, const char *seq) | |
1453 | |
1454 # tabix support | |
1455 cdef extern from "htslib/tbx.h" nogil: | |
1456 | |
1457 # tbx.h definitions | |
1458 int8_t TBX_MAX_SHIFT | |
1459 int32_t TBX_GENERIC | |
1460 int32_t TBX_SAM | |
1461 int32_t TBX_VCF | |
1462 int32_t TBX_UCSC | |
1463 | |
1464 ctypedef struct tbx_conf_t: | |
1465 int32_t preset | |
1466 int32_t sc, bc, ec # seq col., beg col. and end col. | |
1467 int32_t meta_char, line_skip | |
1468 | |
1469 ctypedef struct tbx_t: | |
1470 tbx_conf_t conf | |
1471 hts_idx_t *idx | |
1472 void * dict | |
1473 | |
1474 tbx_conf_t tbx_conf_gff | |
1475 tbx_conf_t tbx_conf_bed | |
1476 tbx_conf_t tbx_conf_psltbl | |
1477 tbx_conf_t tbx_conf_sam | |
1478 tbx_conf_t tbx_conf_vcf | |
1479 | |
1480 void tbx_itr_destroy(hts_itr_t * iter) | |
1481 hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int bed, int end) | |
1482 hts_itr_t * tbx_itr_querys(tbx_t * t, char * s) | |
1483 int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data) | |
1484 | |
1485 int tbx_name2id(tbx_t *tbx, char *ss) | |
1486 | |
1487 int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf) | |
1488 int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf) | |
1489 | |
1490 tbx_t * tbx_index_load(char *fn) | |
1491 tbx_t *tbx_index_load2(const char *fn, const char *fnidx) | |
1492 tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags) | |
1493 | |
1494 # free the array but not the values | |
1495 char **tbx_seqnames(tbx_t *tbx, int *n) | |
1496 | |
1497 void tbx_destroy(tbx_t *tbx) | |
1498 | |
1499 | |
1500 # VCF/BCF API | |
1501 cdef extern from "htslib/vcf.h" nogil: | |
1502 | |
1503 # Header struct | |
1504 | |
1505 uint8_t BCF_HL_FLT # header line | |
1506 uint8_t BCF_HL_INFO | |
1507 uint8_t BCF_HL_FMT | |
1508 uint8_t BCF_HL_CTG | |
1509 uint8_t BCF_HL_STR # structured header line TAG=<A=..,B=..> | |
1510 uint8_t BCF_HL_GEN # generic header line | |
1511 | |
1512 uint8_t BCF_HT_FLAG # header type | |
1513 uint8_t BCF_HT_INT | |
1514 uint8_t BCF_HT_REAL | |
1515 uint8_t BCF_HT_STR | |
1516 | |
1517 uint8_t BCF_VL_FIXED # variable length | |
1518 uint8_t BCF_VL_VAR | |
1519 uint8_t BCF_VL_A | |
1520 uint8_t BCF_VL_G | |
1521 uint8_t BCF_VL_R | |
1522 | |
1523 # === Dictionary === | |
1524 # | |
1525 # The header keeps three dictionaries. The first keeps IDs in the | |
1526 # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths | |
1527 # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[] | |
1528 # is the actual hash table, which is opaque to the end users. In the hash | |
1529 # table, the key is the ID or sample name as a C string and the value is a | |
1530 # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash | |
1531 # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the | |
1532 # size of the hash table or, equivalently, the length of the id[] arrays. | |
1533 | |
1534 uint8_t BCF_DT_ID # dictionary type | |
1535 uint8_t BCF_DT_CTG | |
1536 uint8_t BCF_DT_SAMPLE | |
1537 | |
1538 # Complete textual representation of a header line | |
1539 ctypedef struct bcf_hrec_t: | |
1540 int type # One of the BCF_HL_* types | |
1541 char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc. | |
1542 char *value # Set only for generic lines, NULL for FILTER/INFO, etc. | |
1543 int nkeys # Number of structured fields | |
1544 char **keys # The key=value pairs: keys here, values in vals (presumably nkeys entries each) | |
1545 char **vals | |
1546 | |
1547 ctypedef struct bcf_idinfo_t: | |
1548 uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2] | |
1549 bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG | |
1550 int id | |
1551 | |
1552 ctypedef struct bcf_idpair_t: | |
1553 const char *key | |
1554 const bcf_idinfo_t *val | |
1555 | |
1556 ctypedef struct bcf_hdr_t: | |
1557 int32_t n[3] # n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI) | |
1558 bcf_idpair_t *id[3] | |
1559 void *dict[3] # ID dictionary, contig dict and sample dict | |
1560 char **samples | |
1561 bcf_hrec_t **hrec | |
1562 int nhrec, dirty | |
1563 int ntransl | |
1564 int *transl[2] # for bcf_translate() | |
1565 int nsamples_ori # for bcf_hdr_set_samples() | |
1566 uint8_t *keep_samples | |
1567 kstring_t mem | |
1568 int32_t m[3] # m: allocated size of the dictionary block in use (see n above) | |
1569 | |
1570 uint8_t bcf_type_shift[] | |
1571 | |
1572 # * VCF record * | |
1573 | |
1574 uint8_t BCF_BT_NULL | |
1575 uint8_t BCF_BT_INT8 | |
1576 uint8_t BCF_BT_INT16 | |
1577 uint8_t BCF_BT_INT32 | |
1578 uint8_t BCF_BT_FLOAT | |
1579 uint8_t BCF_BT_CHAR | |
1580 | |
1581 uint8_t VCF_REF | |
1582 uint8_t VCF_SNP | |
1583 uint8_t VCF_MNP | |
1584 uint8_t VCF_INDEL | |
1585 uint8_t VCF_OTHER | |
1586 uint8_t VCF_BND | |
1587 uint8_t VCF_OVERLAP | |
1588 | |
1589 | |
1590 ctypedef struct variant_t: | |
1591 int type, n # variant type and the number of bases affected, negative for deletions | |
1592 | |
1593 ctypedef struct bcf_fmt_t: | |
1594 int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key | |
1595 int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types | |
1596 uint8_t *p # same as vptr and vptr_* in bcf_info_t below | |
1597 uint32_t p_len | |
1598 uint32_t p_off | |
1599 uint8_t p_free | |
1600 | |
1601 union bcf_info_union_t: | |
1602 int32_t i # integer value | |
1603 float f # float value | |
1604 | |
1605 ctypedef struct bcf_info_t: | |
1606 int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key | |
1607 int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars | |
1608 | |
1609 # v1 union only set if $len==1; for easier access | |
1610 bcf_info_union_t v1 | |
1611 uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes | |
1612 uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset | |
1613 uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes | |
1614 uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new | |
1615 # data block is bigger than the original | |
1616 | |
1617 uint8_t BCF1_DIRTY_ID | |
1618 uint8_t BCF1_DIRTY_ALS | |
1619 uint8_t BCF1_DIRTY_FLT | |
1620 uint8_t BCF1_DIRTY_INF | |
1621 | |
1622 ctypedef struct bcf_dec_t: | |
1623 int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change | |
1624 int n_flt # Number of FILTER fields | |
1625 int *flt # FILTER keys in the dictionary | |
1626 char *id # ID | |
1627 char *als # REF+ALT block (\0-separated) | |
1628 char **allele # allele[0] is the REF (allele[] entries point into the als block); all null terminated | |
1629 bcf_info_t *info # INFO | |
1630 bcf_fmt_t *fmt # FORMAT and individual sample | |
1631 variant_t *var # $var and $var_type set only when set_variant_types called | |
1632 int n_var, var_type | |
1633 int shared_dirty # if set, shared.s must be recreated on BCF output | |
1634 int indiv_dirty # if set, indiv.s must be recreated on BCF output | |
1635 | |
1636 uint8_t BCF_ERR_CTG_UNDEF | |
1637 uint8_t BCF_ERR_TAG_UNDEF | |
1638 uint8_t BCF_ERR_NCOLS | |
1639 uint8_t BCF_ERR_LIMITS | |
1640 uint8_t BCF_ERR_CHAR | |
1641 uint8_t BCF_ERR_CTG_INVALID | |
1642 uint8_t BCF_ERR_TAG_INVALID | |
1643 | |
1644 # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file | |
1645 # is slower because the string must first be parsed, packed into a BCF line | |
1646 # (done in vcf_parse), then unpacked into internal bcf1_t structure. If it | |
1647 # is known in advance that some of the fields will not be required (notably | |
1648 # the sample columns), parsing of these can be skipped by setting max_unpack | |
1649 # appropriately. | |
1650 # Similarly, it is fast to output a BCF line because the columns (kept in | |
1651 # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF | |
1652 # line must be formatted in vcf_format. | |
1653 | |
1654 ctypedef struct bcf1_t: | |
1655 int32_t rid # CHROM | |
1656 int32_t pos # POS | |
1657 int32_t rlen # length of REF | |
1658 float qual # QUAL | |
1659 uint32_t n_info, n_allele | |
1660 uint32_t n_fmt, n_sample | |
1661 kstring_t shared, indiv | |
1662 bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() | |
1663 int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed | |
1664 int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work | |
1665 int unpack_size[3] # the original block size of ID, REF+ALT and FILTER | |
1666 int errcode # one of BCF_ERR_* codes | |
1667 | |
1668 ####### API ####### | |
1669 | |
1670 # BCF and VCF I/O | |
1671 # | |
1672 # A note about naming conventions: htslib internally represents VCF | |
1673 # records as bcf1_t data structures, therefore most functions are | |
1674 # prefixed with bcf_. There are a few exceptions where the functions must | |
1675 # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In | |
1676 # these cases, functions prefixed with bcf_ are more general and work | |
1677 # with both BCF and VCF. | |
1678 | |
1679 # bcf_hdr_init() - create an empty BCF header. | |
1680 # @param mode "r" or "w" | |
1681 # | |
1682 # When opened for writing, the mandatory fileFormat and | |
1683 # FILTER=PASS lines are added automatically. | |
1684 bcf_hdr_t *bcf_hdr_init(const char *mode) | |
1685 | |
1686 # Destroy a BCF header struct | |
1687 void bcf_hdr_destroy(bcf_hdr_t *h) | |
1688 | |
1689 # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) | |
1690 bcf1_t *bcf_init() | |
1691 | |
1692 # Deallocate a bcf1_t object | |
1693 void bcf_destroy(bcf1_t *v) | |
1694 | |
1695 # Same as bcf_destroy() but frees only the memory allocated by bcf1_t, | |
1696 # not the bcf1_t object itself. | |
1697 void bcf_empty(bcf1_t *v) | |
1698 | |
1699 # Make the bcf1_t object ready for next read. Intended mostly for | |
1700 # internal use, the user should rarely need to call this function | |
1701 # directly. | |
1702 void bcf_clear(bcf1_t *v) | |
1703 | |
1704 # Reads VCF or BCF header | |
1705 bcf_hdr_t *bcf_hdr_read(htsFile *fp) | |
1706 | |
1707 # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed | |
1708 # @samples: samples to include or exclude from file or as a comma-separated string. | |
1709 # LIST|FILE .. select samples in list/file | |
1710 # ^LIST|FILE .. exclude samples from list/file | |
1711 # - .. include all samples | |
1712 # NULL .. exclude all samples | |
1713 # @is_file: @samples is a file (1) or a comma-separated list (0) | |
1714 # | |
1715 # The bottleneck of VCF reading is parsing of genotype fields. If the | |
1716 # reader knows in advance that only subset of samples is needed (possibly | |
1717 # no samples at all), the performance of bcf_read() can be significantly | |
1718 # improved by calling bcf_hdr_set_samples after bcf_hdr_read(). | |
1719 # The function bcf_read() will subset the VCF/BCF records automatically | |
1720 # with the notable exception when reading records via bcf_itr_next(). | |
1721 # In this case, bcf_subset_format() must be called explicitly, because | |
1722 # bcf_readrec() does not see the header. | |
1723 # | |
1724 # Returns 0 on success, -1 on error or a positive integer if the list | |
1725 # contains samples not present in the VCF header. In such a case, the | |
1726 # return value is the index of the offending sample. | |
1727 # | |
1728 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) | |
1729 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) | |
1730 | |
1731 # Writes VCF or BCF header | |
1732 int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) | |
1733 | |
1734 # Parse VCF line contained in kstring and populate the bcf1_t struct | |
1735 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) | |
1736 | |
1737 # The opposite of vcf_parse. It should rarely be called directly, see vcf_write | |
1738 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) | |
1739 | |
1740 # bcf_read() - read next VCF or BCF record | |
1741 # | |
1742 # Returns -1 on critical errors, 0 otherwise. On errors which are not | |
1743 # critical for reading, such as missing header definitions, v->errcode is | |
1744 # set to one of the BCF_ERR_* codes and must be checked before calling | |
1745 # vcf_write(). | |
1746 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) | |
1747 | |
1748 # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field) | |
1749 # | |
1750 # Note that bcf_unpack() must be called even when reading VCF. It is safe | |
1751 # to call the function repeatedly, it will not unpack the same field | |
1752 # twice. | |
1753 uint8_t BCF_UN_STR # up to ALT inclusive | |
1754 uint8_t BCF_UN_FLT # up to FILTER | |
1755 uint8_t BCF_UN_INFO # up to INFO | |
1756 uint8_t BCF_UN_SHR # all shared information | |
1757 uint8_t BCF_UN_FMT # unpack format and each sample | |
1758 uint8_t BCF_UN_IND # a synonym of BCF_UN_FMT | |
1759 uint8_t BCF_UN_ALL # everything | |
1760 | |
1761 int bcf_unpack(bcf1_t *b, int which) | |
1762 | |
1763 # bcf_dup() - create a copy of BCF record. | |
1764 # | |
1765 # Note that bcf_unpack() must be called on the returned copy as if it was | |
1766 # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src) | |
1767 # internally to reflect any changes made by bcf_update_* functions. | |
1768 bcf1_t *bcf_dup(bcf1_t *src) | |
1769 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) | |
1770 | |
1771 # bcf_write() - write one VCF or BCF record. The type is determined at the open() call. | |
1772 int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v) | |
1773 | |
1774 # The following functions work only with VCFs and should rarely be called | |
1775 # directly. Usually one wants to use their bcf_* alternatives, which work | |
1776 # transparently with both VCFs and BCFs. | |
1777 bcf_hdr_t *vcf_hdr_read(htsFile *fp) | |
1778 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) | |
1779 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) | |
1780 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) | |
1781 | |
1782 #************************************************************************ | |
1783 # Header querying and manipulation routines | |
1784 #************************************************************************ | |
1785 | |
1786 # Create a new header using the supplied template | |
1787 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr) | |
1788 | |
1789 # Copy header lines from src to dst if not already present in dst. See also bcf_translate(). | |
1790 # Returns 0 on success or sets a bit on error: | |
1791 # 1 .. conflicting definitions of tag length | |
1792 # # todo | |
1793 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) | |
1794 | |
1795 # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate() | |
1796 # @param dst: the destination header to be merged into, NULL on the first pass | |
1797 # @param src: the source header | |
1798 # | |
1799 # Notes: | |
1800 # - use as: | |
1801 # bcf_hdr_t *dst = NULL; | |
1802 # for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]); | |
1803 # | |
1804 # - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when | |
1805 # combining multiple BCF headers. The current bcf_hdr_combine() | |
1806 # does not have this problem, but became slow when used for many files. | |
1807 bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) | |
1808 | |
1809 # bcf_hdr_add_sample() - add a new sample. | |
1810 # @param sample: sample name to be added | |
1811 int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample) | |
1812 | |
1813 # Read VCF header from a file and update the header | |
1814 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname) | |
1815 | |
1816 # Appends formatted header text to _str_. | |
1817 # If _is_bcf_ is zero, `IDX` fields are discarded. | |
1818 # @return 0 if successful, or negative if an error occurred | |
1819 # @since 1.4 | |
1820 int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str); | |
1821 | |
1822 # Returns formatted header (newly allocated string) and its length, | |
1823 # excluding the terminating \0. If is_bcf parameter is unset, IDX | |
1824 # fields are discarded. | |
1825 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len) | |
1826 | |
1827 # Append new VCF header line, returns 0 on success | |
1828 int bcf_hdr_append(bcf_hdr_t *h, const char *line) | |
1829 int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...) | |
1830 | |
1831 # VCF version, e.g. VCFv4.2 | |
1832 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr) | |
1833 void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) | |
1834 | |
1835 # bcf_hdr_remove() - remove VCF header tag | |
1836 # @param type: one of BCF_HL_* | |
1837 # @param key: tag name or NULL to remove all tags of the given type | |
1838 void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key) | |
1839 | |
1840 # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples | |
1841 # @param n: number of samples to keep | |
1842 # @param samples: names of the samples to keep | |
1843 # @param imap: mapping from index in @samples to the sample index in the original file | |
1844 # | |
1845 # Sample names not present in h0 are ignored. The number of unmatched samples can be checked | |
1846 # by comparing n and bcf_hdr_nsamples(out_hdr). | |
1847 # This function can be used to reorder samples. | |
1848 # See also bcf_subset() which subsets individual records. | |
1849 # | |
1850 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap) | |
1851 | |
1852 # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) | |
1853 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs) | |
1854 | |
1855 # Get number of samples | |
1856 int32_t bcf_hdr_nsamples(const bcf_hdr_t *h) | |
1857 | |
1858 # The following functions are for internal use and should rarely be called directly | |
1859 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) | |
1860 int bcf_hdr_sync(bcf_hdr_t *h) | |
1861 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) | |
1862 void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str) | |
1863 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) | |
1864 | |
1865 # bcf_hdr_get_hrec() - get header line info | |
1866 # @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN | |
1867 # @param key: the header key for generic lines (e.g. "fileformat"), any field | |
1868 # for structured lines, typically "ID". | |
1869 # @param value: the value which pairs with key. Can be NULL for BCF_HL_GEN | |
1870 # @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL | |
1871 # | |
1872 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class) | |
1873 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec) | |
1874 void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len) | |
1875 void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted) | |
1876 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key) | |
1877 void hrec_add_idx(bcf_hrec_t *hrec, int idx) | |
1878 void bcf_hrec_destroy(bcf_hrec_t *hrec) | |
1879 | |
1880 #************************************************************************ | |
1881 # Individual record querying and manipulation routines | |
1882 #************************************************************************ | |
1883 | |
1884 # See the description of bcf_hdr_subset() | |
1885 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap) | |
1886 | |
1887 # bcf_translate() - translate tags ids to be consistent with different header. This function | |
1888 # is useful when lines from multiple VCF need to be combined. | |
1889 # @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine() | |
1890 # @src_hdr: the source header, used in bcf_read() | |
1891 # @src_line: line obtained by bcf_read() | |
1892 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line) | |
1893 | |
1894 # bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc | |
1895 int bcf_get_variant_types(bcf1_t *rec) | |
1896 int bcf_get_variant_type(bcf1_t *rec, int ith_allele) | |
1897 int bcf_is_snp(bcf1_t *v) | |
1898 | |
1899 # bcf_update_filter() - sets the FILTER column | |
1900 # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") | |
1901 # @n: Number of filters. If n==0, all filters are removed | |
1902 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n) | |
1903 | |
1904 # bcf_add_filter() - adds to the FILTER column | |
1905 # @flt_id: The filter ID to add, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") | |
1906 # | |
1907 # If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed. | |
1908 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id) | |
1909 | |
1910 # bcf_remove_filter() - removes from the FILTER column | |
1911 # @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") | |
1912 # @set_pass: when set to 1 and no filters are present, set to PASS | |
1913 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass) | |
1914 | |
1915 # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably. | |
1916 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter) | |
1917 | |
1918 # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column | |
1919 # @alleles: Array of alleles | |
1920 # @nals: Number of alleles | |
1921 # @alleles_string: Comma-separated alleles, starting with the REF allele | |
1922 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals) | |
1923 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string) | |
1924 | |
1925 # bcf_update_id() - sets new ID string | |
1926 # bcf_add_id() - adds to the ID string checking for duplicates | |
1927 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id) | |
1928 int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id) | |
1929 | |
1930 # bcf_update_info_*() - functions for updating INFO fields | |
1931 # @hdr: the BCF header | |
1932 # @line: VCF line to be edited | |
1933 # @key: the INFO tag to be updated | |
1934 # @values: pointer to the array of values. Pass NULL to remove the tag. | |
1935 # @n: number of values in the array. When set to 0, the INFO tag is removed | |
1936 # | |
1937 # The @values string in bcf_update_info_flag() is optional, @n indicates whether | |
1938 # the flag is set or removed. | |
1939 # | |
1940 # Returns 0 on success or negative value on error. | |
1941 # | |
1942 int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n) | |
1943 int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n) | |
1944 int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n) | |
1945 int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n) | |
1946 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) | |
1947 | |
1948 # bcf_update_format_*() - functions for updating FORMAT fields | |
1949 # @values: pointer to the array of values, the same number of elements | |
1950 # is expected for each sample. Missing values must be padded | |
1951 # with bcf_*_missing or bcf_*_vector_end values. | |
1952 # @n: number of values in the array. If n==0, existing tag is removed. | |
1953 # | |
1954 # The function bcf_update_format_string() is a higher-level (slower) variant of | |
1955 # bcf_update_format_char(). The former accepts array of \0-terminated strings | |
1956 # whereas the latter requires that the strings are collapsed into a single array | |
1957 # of fixed-length strings. In case of strings with variable length, shorter strings | |
1958 # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char() | |
1959 # are not \0-terminated. | |
1960 # | |
1961 # Returns 0 on success or negative value on error. | |
1962 # | |
1963 int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n) | |
1964 int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n) | |
1965 int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n) | |
1966 int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n) | |
1967 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n) | |
1968 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) | |
1969 | |
1970 # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds | |
1971 # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained | |
1972 # from bcf_get_genotypes() below. | |
1973 uint32_t bcf_gt_phased(uint32_t idx) | |
1974 uint32_t bcf_gt_unphased(uint32_t idx) | |
1975 uint32_t bcf_gt_missing | |
1976 uint32_t bcf_gt_is_missing(uint32_t val) | |
1977 uint32_t bcf_gt_is_phased(uint32_t idx) | |
1978 uint32_t bcf_gt_allele(uint32_t val) | |
1979 | |
1980 # Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based) | |
1981 uint32_t bcf_alleles2gt(uint32_t a, uint32_t b) | |
1982 void bcf_gt2alleles(int igt, int *a, int *b) | |
1983 | |
1984 # bcf_get_fmt() - returns pointer to FORMAT's field data | |
1985 # @hdr: for access to BCF_DT_ID dictionary | |
1986 # @line: VCF line obtained from vcf_parse1 | |
1987 # @key: one of GT,PL,... | |
1988 # | |
1989 # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field | |
1990 # is not available. | |
1991 # | |
1992 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) | |
1993 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) | |
1994 | |
1995 # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID | |
1996 # @line: VCF line obtained from vcf_parse1 | |
1997 # @id: The header index for the tag, obtained from bcf_hdr_id2int() | |
1998 # | |
1999 # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid | |
2000 # as their goal is to avoid the header lookup. | |
2001 # | |
2002 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id) | |
2003 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id) | |
2004 | |
2005 # bcf_get_info_*() - get INFO values, integers or floats | |
2006 # @hdr: BCF header | |
2007 # @line: BCF record | |
2008 # @tag: INFO tag to retrieve | |
2009 # @dst: *dst is pointer to a memory location, can point to NULL | |
2010 # @ndst: pointer to the size of allocated memory | |
2011 # | |
2012 # Returns negative value on error or the number of written values on | |
2013 # success. bcf_get_info_string() returns on success the number of | |
2014 # characters written excluding the null-terminating byte. bcf_get_info_flag() | |
2015 # returns 1 when flag is set or 0 if not. | |
2016 # | |
2017 # List of return codes: | |
2018 # -1 .. no such INFO tag defined in the header | |
2019 # -2 .. clash between types defined in the header and encountered in the VCF record | |
2020 # -3 .. tag is not present in the VCF record | |
2021 # | |
2022 int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst) | |
2023 int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst) | |
2024 int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst) | |
2025 int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst) | |
2026 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) | |
2027 | |
2028 # bcf_get_format_*() - same as bcf_get_info*() above | |
2029 # | |
2030 # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char(). | |
2031 # see the description of bcf_update_format_string() and bcf_update_format_char() above. | |
2032 # Unlike other bcf_get_format_*() functions, bcf_get_format_string() allocates two arrays: | |
2033 # a single block of \0-terminated strings collapsed into a single array and an array of pointers | |
2034 # to these strings. Both arrays must be cleaned by the user. | |
2035 # | |
2036 # Returns negative value on error or the number of written values on success. | |
2037 # | |
2038 # Example: | |
2039 # int ndst = 0; char **dst = NULL | |
2040 # if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 ) | |
2041 # for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i]) | |
2042 # free(dst[0]); free(dst) | |
2043 # | |
2044 # Example: | |
2045 # int ngt, *gt_arr = NULL, ngt_arr = 0 | |
2046 # ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr) | |
2047 # | |
2048 int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst) | |
2049 int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst) | |
2050 int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst) | |
2051 int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int32_t **dst, int *ndst) | |
2052 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst) | |
2053 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) | |
2054 | |
2055 #************************************************************************ | |
2056 # Helper functions | |
2057 #************************************************************************ | |
2058 | |
2059 # | |
2060 # bcf_hdr_id2int() - Translates string into numeric ID | |
2061 # bcf_hdr_int2id() - Translates numeric ID into string | |
2062 # @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE | |
2063 # @id: tag name, such as: PL, DP, GT, etc. | |
2064 # | |
2065 # Returns -1 if string is not in dictionary, otherwise numeric ID which identifies | |
2066 # fields in BCF records. | |
2067 # | |
2068 int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id) | |
2069 const char *bcf_hdr_int2id(const bcf_hdr_t *hdr, int type, int int_id) | |
2070 | |
2071 # bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID | |
2072 # bcf_hdr_id2name() - Translates numeric ID to sequence name | |
2073 # | |
2074 int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id) | |
2075 const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid) | |
2076 const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec) | |
2077 | |
2078 # | |
2079 # bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t | |
2080 # @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT | |
2081 # @int_id: return value of bcf_hdr_id2int, must be >=0 | |
2082 # | |
2083 # The returned values are: | |
2084 # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_* | |
2085 # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields | |
2086 # bcf_hdr_id2type .. the field type, one of BCF_HT_* | |
2087 # bcf_hdr_id2coltype .. the column type, one of BCF_HL_* | |
2088 # | |
2089 # Notes: Prior to using the macros, the presence of the info should be | |
2090 # tested with bcf_hdr_idinfo_exists(). | |
2091 # | |
2092 int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id) | |
2093 int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id) | |
2094 int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id) | |
2095 int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id) | |
2096 int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id) | |
2097 bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id) | |
2098 | |
2099 void bcf_fmt_array(kstring_t *s, int n, int type, void *data) | |
2100 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) | |
2101 | |
2102 void bcf_enc_vchar(kstring_t *s, int l, const char *a) | |
2103 void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) | |
2104 void bcf_enc_vfloat(kstring_t *s, int n, float *a) | |
2105 | |
2106 #************************************************************************ | |
2107 # BCF index | |
2108 # | |
2109 # Note that these functions work with BCFs only. See synced_bcf_reader.h | |
2110 # which provides (amongst other things) an API to work transparently with | |
2111 # both indexed BCFs and VCFs. | |
2112 #************************************************************************ | |
2113 | |
2114 hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx) | |
2115 hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags) | |
2116 int bcf_index_build(const char *fn, int min_shift) | |
2117 int bcf_index_build2(const char *fn, const char *fnidx, int min_shift) | |
2118 | |
2119 #******************* | |
2120 # Typed value I/O * | |
2121 #****************** | |
2122 | |
2123 # Note that in contrast with BCFv2.1 specification, HTSlib implementation | |
2124 # allows missing values in vectors. For integer types, the values 0x80, | |
2125 # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001, | |
2126 # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of | |
2127 # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an | |
2128 # end-of-vector indicator. | |
2129 # Note that the end-of-vector byte is not part of the vector. | |
2130 | |
2131 # This trial BCF version (v2.2) is compatible with the VCF specification and | |
2132 # makes it possible to correctly handle vectors with different ploidy in the | |
2133 # presence of missing values. | |
2134 | |
2135 int32_t bcf_int8_vector_end | |
2136 int32_t bcf_int16_vector_end | |
2137 int32_t bcf_int32_vector_end | |
2138 int32_t bcf_str_vector_end | |
2139 int32_t bcf_int8_missing | |
2140 int32_t bcf_int16_missing | |
2141 int32_t bcf_int32_missing | |
2142 int32_t bcf_str_missing | |
2143 | |
2144 uint32_t bcf_float_vector_end | |
2145 uint32_t bcf_float_missing | |
2146 | |
2147 void bcf_float_set(float *ptr, uint32_t value) | |
2148 void bcf_float_set_vector_end(float *x) | |
2149 void bcf_float_set_missing(float *x) | |
2150 | |
2151 int bcf_float_is_missing(float f) | |
2152 int bcf_float_is_vector_end(float f) | |
2153 void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) | |
2154 void bcf_enc_size(kstring_t *s, int size, int type) | |
2155 int bcf_enc_inttype(long x) | |
2156 void bcf_enc_int1(kstring_t *s, int32_t x) | |
2157 int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) | |
2158 int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) | |
2159 int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type) | |
2160 | |
2161 # These trivial wrappers are defined only for consistency with other parts of htslib | |
2162 bcf1_t *bcf_init1() | |
2163 int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) | |
2164 int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) | |
2165 int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) | |
2166 int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) | |
2167 void bcf_destroy1(bcf1_t *v) | |
2168 void bcf_empty1(bcf1_t *v) | |
2169 int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) | |
2170 void bcf_clear1(bcf1_t *v) | |
2171 int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) | |
2172 | |
2173 # Other nice wrappers | |
2174 void bcf_itr_destroy(hts_itr_t *iter) | |
2175 hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) | |
2176 hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s) | |
2177 int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r) | |
2178 hts_idx_t *bcf_index_load(const char *fn) | |
2179 const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr) | |
2180 | |
2181 | |
2182 # VCF/BCF utility functions | |
2183 cdef extern from "htslib/vcfutils.h" nogil: | |
2184 struct kbitset_t | |
2185 | |
2186 # bcf_trim_alleles() - remove ALT alleles unused in genotype fields | |
2187 # @header: for access to BCF_DT_ID dictionary | |
2188 # @line: VCF line obtained from vcf_parse1 | |
2189 # | |
2190 # Returns the number of removed alleles on success or negative | |
2191 # on error: | |
2192 # -1 .. some allele index is out of bounds | |
2193 int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) | |
2194 | |
2195 # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask | |
2196 # @header: for access to BCF_DT_ID dictionary | |
2197 # @line: VCF line obtained from vcf_parse1 | |
2198 # @mask: alleles to remove | |
2199 # | |
2200 # If you have more than 31 alleles, then the integer bit mask will | |
2201 # overflow, so use bcf_remove_allele_set instead | |
2202 void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask) | |
2203 | |
2204 # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set | |
2205 # @header: for access to BCF_DT_ID dictionary | |
2206 # @line: VCF line obtained from vcf_parse1 | |
2207 # @rm_set: pointer to kbitset_t object with bits set for allele | |
2208 # indexes to remove | |
2209 # | |
2210 # Number=A,R,G INFO and FORMAT fields will be updated accordingly. | |
2211 void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set) | |
2212 | |
2213 # bcf_calc_ac() - calculate the number of REF and ALT alleles | |
2214 # @header: for access to BCF_DT_ID dictionary | |
2215 # @line: VCF line obtained from vcf_parse1 | |
2216 # @ac: array of length line->n_allele | |
2217 # @which: determines whether INFO/AN,AC and indv fields are used | |
2218 # | |
2219 # Returns 1 if the call succeeded, or 0 if the value could not | |
2220 # be determined. | |
2221 # | |
2222 # The value of @which determines if existing INFO/AC,AN can be | |
2223 # used (BCF_UN_INFO) and if indv fields can be split (BCF_UN_FMT). | |
2224 int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) | |
2225 | |
2226 # bcf_gt_type() - determines type of the genotype | |
2227 # @fmt_ptr: the GT format field as set for example by set_fmt_ptr | |
2228 # @isample: sample index (starting from 0) | |
2229 # @ial: index of the 1st non-reference allele (starting from 1) | |
2230 # @jal: index of the 2nd non-reference allele (starting from 1) | |
2231 # | |
2232 # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA, | |
2233 # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial | |
2234 # is not NULL and the genotype has one or more non-reference | |
2235 # alleles, $ial will be set. In case of GT_HET_AA, $ial is the | |
2236 # position of the allele which appeared first in ALT. If $jal is | |
2237 # not null and the genotype is GT_HET_AA, $jal will be set and is | |
2238 # the position of the second allele in ALT. | |
2239 uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation | |
2240 uint8_t GT_HOM_AA | |
2241 uint8_t GT_HET_RA | |
2242 uint8_t GT_HET_AA | |
2243 uint8_t GT_HAPL_R | |
2244 uint8_t GT_HAPL_A | |
2245 uint8_t GT_UNKN | |
2246 int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal) | |
2247 | |
2248 int bcf_acgt2int(char c) | |
2249 char bcf_int2acgt(int i) | |
2250 | |
2251 # bcf_ij2G() - common task: allele indexes to Number=G index (diploid) | |
2252 # @i,j: allele indexes, 0-based, i<=j | |
2253 # Returns index to the Number=G diploid array | |
2254 uint32_t bcf_ij2G(uint32_t i, uint32_t j) | |
2255 | |
2256 | |
2257 cdef extern from "htslib/cram.h" nogil: | |
2258 | |
2259 enum cram_block_method: | |
2260 ERROR | |
2261 RAW | |
2262 GZIP | |
2263 BZIP2 | |
2264 LZMA | |
2265 RANS | |
2266 RANS0 | |
2267 RANS1 | |
2268 GZIP_RLE | |
2269 | |
2270 enum cram_content_type: | |
2271 CT_ERROR | |
2272 FILE_HEADER | |
2273 COMPRESSION_HEADER | |
2274 MAPPED_SLICE | |
2275 UNMAPPED_SLICE | |
2276 EXTERNAL | |
2277 CORE | |
2278 | |
2279 # Opaque data types, see cram_structs for the fully fledged versions. | |
2280 ctypedef struct SAM_hdr | |
2281 ctypedef struct cram_file_def | |
2282 ctypedef struct cram_fd | |
2283 ctypedef struct cram_container | |
2284 ctypedef struct cram_block | |
2285 ctypedef struct cram_slice | |
2286 ctypedef struct cram_metrics | |
2287 ctypedef struct cram_block_slice_hdr | |
2288 ctypedef struct cram_block_compression_hdr | |
2289 ctypedef struct refs_t | |
2290 | |
2291 # Accessor functions | |
2292 | |
2293 # | |
2294 #----------------------------------------------------------------------------- | |
2295 # cram_fd | |
2296 # | |
2297 SAM_hdr *cram_fd_get_header(cram_fd *fd) | |
2298 void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr) | |
2299 | |
2300 int cram_fd_get_version(cram_fd *fd) | |
2301 void cram_fd_set_version(cram_fd *fd, int vers) | |
2302 | |
2303 int cram_major_vers(cram_fd *fd) | |
2304 int cram_minor_vers(cram_fd *fd) | |
2305 | |
2306 hFILE *cram_fd_get_fp(cram_fd *fd) | |
2307 void cram_fd_set_fp(cram_fd *fd, hFILE *fp) | |
2308 | |
2309 # | |
2310 #----------------------------------------------------------------------------- | |
2311 # cram_container | |
2312 # | |
2313 int32_t cram_container_get_length(cram_container *c) | |
2314 void cram_container_set_length(cram_container *c, int32_t length) | |
2315 int32_t cram_container_get_num_blocks(cram_container *c) | |
2316 void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) | |
2317 int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) | |
2318 void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks, | |
2319 int32_t *landmarks) | |
2320 | |
2321 # Returns true if the container is empty (EOF marker) | |
2322 int cram_container_is_empty(cram_fd *fd) | |
2323 | |
2324 | |
2325 # | |
2326 #----------------------------------------------------------------------------- | |
2327 # cram_block | |
2328 # | |
2329 int32_t cram_block_get_content_id(cram_block *b) | |
2330 int32_t cram_block_get_comp_size(cram_block *b) | |
2331 int32_t cram_block_get_uncomp_size(cram_block *b) | |
2332 int32_t cram_block_get_crc32(cram_block *b) | |
2333 void * cram_block_get_data(cram_block *b) | |
2334 | |
2335 cram_content_type cram_block_get_content_type(cram_block *b) | |
2336 | |
2337 void cram_block_set_content_id(cram_block *b, int32_t id) | |
2338 void cram_block_set_comp_size(cram_block *b, int32_t size) | |
2339 void cram_block_set_uncomp_size(cram_block *b, int32_t size) | |
2340 void cram_block_set_crc32(cram_block *b, int32_t crc) | |
2341 void cram_block_set_data(cram_block *b, void *data) | |
2342 | |
2343 int cram_block_append(cram_block *b, void *data, int size) | |
2344 void cram_block_update_size(cram_block *b) | |
2345 | |
2346 # Offset is known as "size" internally, but it can be confusing. | |
2347 size_t cram_block_get_offset(cram_block *b) | |
2348 void cram_block_set_offset(cram_block *b, size_t offset) | |
2349 | |
2350 # | |
2351 # Computes the size of a cram block, including the block | |
2352 # header itself. | |
2353 # | |
2354 uint32_t cram_block_size(cram_block *b) | |
2355 | |
2356 # | |
2357 # Renumbers RG numbers in a cram compression header. | |
2358 # | |
2359 # CRAM stores RG as the Nth number in the header, rather than a | |
2360 # string holding the ID: tag. This is smaller in space, but means | |
2361 # "samtools cat" to join files together that contain single but | |
2362 # different RG lines needs a way of renumbering them. | |
2363 # | |
2364 # The file descriptor is expected to be immediately after the | |
2365 # cram_container structure (ie before the cram compression header). | |
2366 # Due to the nature of the CRAM format, this needs to read and write | |
2367 # the blocks itself. Note that there may be multiple slices within | |
2368 # the container, meaning multiple compression headers to manipulate. | |
2369 # Changing RG may change the size of the compression header and | |
2370 # therefore the length field in the container. Hence we rewrite all | |
2371 # blocks just in case and also emit the adjusted container. | |
2372 # | |
2373 # The current implementation can only cope with renumbering a single | |
2374 # RG (and only then if it is using HUFFMAN or BETA codecs). In | |
2375 # theory it *may* be possible to renumber multiple RGs if they use | |
2376 # HUFFMAN to the CORE block or use an external block unshared by any | |
2377 # other data series. So we have an API that can be upgraded to | |
2378 # support this, but do not implement it for now. An example | |
2379 # implementation of RG as an EXTERNAL block would be to find that | |
2380 # block and rewrite it, returning the number of blocks consumed. | |
2381 # | |
2382 # Returns 0 on success; | |
2383 # -1 if unable to edit; | |
2384 # -2 on other errors (eg I/O). | |
2385 # | |
2386 int cram_transcode_rg(cram_fd *input, cram_fd *output, | |
2387 cram_container *c, | |
2388 int nrg, int *in_rg, int *out_rg) | |
2389 | |
2390 # | |
2391 # Copies the blocks representing the next num_slice slices from a | |
2392 # container from 'in' to 'out'. It is expected that the file pointer | |
2393 # is just after the read of the cram_container and cram compression | |
2394 # header. | |
2395 # | |
2396 # Returns 0 on success | |
2397 # -1 on failure | |
2398 # | |
2399 int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice) | |
2400 | |
2401 # | |
2402 #----------------------------------------------------------------------------- | |
2403 # SAM_hdr | |
2404 # | |
2405 | |
2406 # Tokenises a SAM header into a hash table. | |
2407 # | |
2408 # Also extracts a few bits on specific data types, such as @RG lines. | |
2409 # | |
2410 # @return | |
2411 # Returns a SAM_hdr struct on success (free with sam_hdr_free()) | |
2412 # NULL on failure | |
2413 # | |
2414 SAM_hdr *sam_hdr_parse_(const char *hdr, int len) | |
2415 | |
2416 | |
2417 # | |
2418 #----------------------------------------------------------------------------- | |
2419 # cram_io basics | |
2420 # | |
2421 | |
2422 # CRAM blocks - the dynamically growable data block. We have code to | |
2423 # create, update, (un)compress and read/write. | |
2424 # | |
2425 # These are derived from the deflate_interlaced.c blocks, but with the | |
2426 # CRAM extension of content types and IDs. | |
2427 # | |
2428 | |
2429 # Allocates a new cram_block structure with a specified content_type and | |
2430 # id. | |
2431 # | |
2432 # @return | |
2433 # Returns block pointer on success; | |
2434 # NULL on failure | |
2435 # | |
2436 cram_block *cram_new_block(cram_content_type content_type, | |
2437 int content_id) | |
2438 | |
2439 # Reads a block from a cram file. | |
2440 # | |
2441 # @return | |
2442 # Returns cram_block pointer on success; | |
2443 # NULL on failure | |
2444 # | |
2445 cram_block *cram_read_block(cram_fd *fd) | |
2446 | |
2447 # Writes a CRAM block. | |
2448 # | |
2449 # @return | |
2450 # Returns 0 on success; | |
2451 # -1 on failure | |
2452 # | |
2453 int cram_write_block(cram_fd *fd, cram_block *b) | |
2454 | |
2455 # Frees a CRAM block, deallocating internal data too. | |
2456 # | |
2457 void cram_free_block(cram_block *b) | |
2458 | |
2459 # Uncompresses a CRAM block, if compressed. | |
2460 # | |
2461 # @return | |
2462 # Returns 0 on success; | |
2463 # -1 on failure | |
2464 # | |
2465 int cram_uncompress_block(cram_block *b) | |
2466 | |
2467 # Compresses a block. | |
2468 # | |
2469 # Compresses a block using one of two different zlib strategies. If we only | |
2470 # want one choice, set strat2 to be -1. NOTE(review): strat2 is not a parameter of the declaration below; confirm against htslib. | |
2471 # | |
2472 # The logic here is that sometimes Z_RLE does a better job than Z_FILTERED | |
2473 # or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is | |
2474 # significantly faster. | |
2475 # | |
2476 # @return | |
2477 # Returns 0 on success; | |
2478 # -1 on failure | |
2479 # | |
2480 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, | |
2481 int method, int level) | |
2482 | |
2483 # Containers | |
2484 # | |
2485 | |
2486 # Creates a new container, specifying the maximum number of slices | |
2487 # and records permitted. | |
2488 # | |
2489 # @return | |
2490 # Returns cram_container ptr on success; | |
2491 # NULL on failure | |
2492 # | |
2493 cram_container *cram_new_container(int nrec, int nslice) | |
2494 void cram_free_container(cram_container *c) | |
2495 | |
2496 # Reads a container header. | |
2497 # | |
2498 # @return | |
2499 # Returns cram_container on success; | |
2500 # NULL on failure or no container left (fd->err == 0). | |
2501 # | |
2502 cram_container *cram_read_container(cram_fd *fd) | |
2503 | |
2504 # Writes a container structure. | |
2505 # | |
2506 # @return | |
2507 # Returns 0 on success; | |
2508 # -1 on failure | |
2509 # | |
2510 int cram_write_container(cram_fd *fd, cram_container *h) | |
2511 | |
2512 # | |
2513 # Stores the container structure in dat and returns *size as the | |
2514 # number of bytes written to dat[]. The input size of dat is also | |
2515 # held in *size and should be initialised to cram_container_size(c). | |
2516 # | |
2517 # Returns 0 on success; | |
2518 # -1 on failure | |
2519 # | |
2520 int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) | |
2521 | |
2522 int cram_container_size(cram_container *c) | |
2523 | |
2524 # The top-level cram opening, closing and option handling | |
2525 # | |
2526 | |
2527 # Opens a CRAM file for read (mode "rb") or write ("wb"). | |
2528 # | |
2529 # The filename may be "-" to indicate stdin or stdout. | |
2530 # | |
2531 # @return | |
2532 # Returns file handle on success; | |
2533 # NULL on failure. | |
2534 # | |
2535 cram_fd *cram_open(const char *filename, const char *mode) | |
2536 | |
2537 # Opens an existing stream for reading or writing. | |
2538 # | |
2539 # @return | |
2540 # Returns file handle on success; | |
2541 # NULL on failure. | |
2542 # | |
2543 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) | |
2544 | |
2545 # Closes a CRAM file. | |
2546 # | |
2547 # @return | |
2548 # Returns 0 on success; | |
2549 # -1 on failure | |
2550 # | |
2551 int cram_close(cram_fd *fd) | |
2552 | |
2553 # | |
2554 # Seek within a CRAM file. | |
2555 # | |
2556 # Returns 0 on success | |
2557 # -1 on failure | |
2558 # | |
2559 int cram_seek(cram_fd *fd, off_t offset, int whence) | |
2560 | |
2561 # | |
2562 # Flushes a CRAM file. | |
2563 # Useful for when writing to stdout without wishing to close the stream. | |
2564 # | |
2565 # Returns 0 on success | |
2566 # -1 on failure | |
2567 # | |
2568 int cram_flush(cram_fd *fd) | |
2569 | |
2570 # Checks for end of file on a cram_fd stream. | |
2571 # | |
2572 # @return | |
2573 # Returns 0 if not at end of file | |
2574 # 1 if we hit an expected EOF (end of range or EOF block) | |
2575 # 2 for other EOF (end of stream without EOF block) | |
2576 # | |
2577 int cram_eof(cram_fd *fd) | |
2578 | |
2579 # Sets options on the cram_fd. | |
2580 # | |
2581 # See CRAM_OPT_* definitions in hts.h. | |
2582 # Use this immediately after opening. | |
2583 # | |
2584 # @return | |
2585 # Returns 0 on success; | |
2586 # -1 on failure | |
2587 # | |
2588 int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...) | |
2589 | |
2590 # Sets options on the cram_fd, taking the option arguments as a va_list. | |
2591 # | |
2592 # See CRAM_OPT_* definitions in hts.h. | |
2593 # Use this immediately after opening. | |
2594 # | |
2595 # @return | |
2596 # Returns 0 on success; | |
2597 # -1 on failure | |
2598 # | |
2599 int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args) | |
2600 | |
2601 # | |
2602 # Attaches a header to a cram_fd. | |
2603 # | |
2604 # This should be used when creating a new cram_fd for writing where | |
2605 # we have an SAM_hdr already constructed (eg from a file we've read | |
2606 # in). | |
2607 # | |
2608 # @return | |
2609 # Returns 0 on success; | |
2610 # -1 on failure | |
2611 # | |
2612 int cram_set_header(cram_fd *fd, SAM_hdr *hdr) | |
2613 | |
2614 # Check if this file has a proper EOF block | |
2615 # | |
2616 # @return | |
2617 # Returns 3 if the file is a version of CRAM that does not contain EOF blocks | |
2618 # 2 if the file is a stream and thus unseekable | |
2619 # 1 if the file contains an EOF block | |
2620 # 0 if the file does not contain an EOF block | |
2621 # -1 if an error occurred whilst reading the file or we could not seek back to where we were | |
2622 # | |
2623 # | |
2624 int cram_check_EOF(cram_fd *fd) | |
2625 | |
2626 # As int32_decoded/encode, but from/to blocks instead of cram_fd | |
2627 int int32_put_blk(cram_block *b, int32_t val) | |
2628 | |
2629 # Deallocates all storage used by a SAM_hdr struct. | |
2630 # | |
2631 # This also decrements the header reference count. If after decrementing | |
2632 # it is still non-zero then the header is assumed to be in use by another | |
2633 # caller and the free is not done. | |
2634 # | |
2635 # This is a synonym for sam_hdr_dec_ref(). | |
2636 # | |
2637 void sam_hdr_free(SAM_hdr *hdr) | |
2638 | |
2639 # Returns the current length of the SAM_hdr in text form. | |
2640 # | |
2641 # Call sam_hdr_rebuild() first if editing has taken place. | |
2642 # | |
2643 int sam_hdr_length(SAM_hdr *hdr) | |
2644 | |
2645 # Returns the string form of the SAM_hdr. | |
2646 # | |
2647 # Call sam_hdr_rebuild() first if editing has taken place. | |
2648 # | |
2649 char *sam_hdr_str(SAM_hdr *hdr) | |
2650 | |
2651 # Appends a formatted line to an existing SAM header. | |
2652 # | |
2653 # Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with | |
2654 # optional new-line. If it contains more than 1 line then multiple lines | |
2655 # will be added in order. | |
2656 # | |
2657 # Len is the length of the text data, or 0 if unknown (in which case | |
2658 # it should be null terminated). | |
2659 # | |
2660 # @return | |
2661 # Returns 0 on success; | |
2662 # -1 on failure | |
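2663 # NOTE(review): no declaration follows this comment here; presumably the wrapped prototype was omitted - confirm. | |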
2664 | |
2665 # Add an @PG line. | |
2666 # | |
2667 # If we wish complete control over this use sam_hdr_add() directly. This | |
2668 # function uses that, but attempts to do a lot of tedious house work for | |
2669 # you too. | |
2670 # | |
2671 # - It will generate a suitable ID if the supplied one clashes. | |
2672 # - It will generate multiple @PG records if we have multiple PG chains. | |
2673 # | |
2674 # Call it as per sam_hdr_add() with a series of key,value pairs ending | |
2675 # in NULL. | |
2676 # | |
2677 # @return | |
2678 # Returns 0 on success; | |
2679 # -1 on failure | |
2680 # | |
2681 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) | |
2682 | |
2683 # | |
2684 # A function to help with construction of CL tags in @PG records. | |
2685 # Takes an argc, argv pair and returns a single space-separated string. | |
2686 # This string should be deallocated by the calling function. | |
2687 # | |
2688 # @return | |
2689 # Returns malloced char * on success; | |
2690 # NULL on failure | |
2691 # | |
2692 char *stringify_argv(int argc, char *argv[]) | |
2693 | |
2694 # | |
2695 # Returns the refs_t structure used by a cram file handle. | |
2696 # | |
2697 # This may be used in conjunction with option CRAM_OPT_SHARED_REF to | |
2698 # share reference memory between multiple file handles. | |
2699 # | |
2700 # @return | |
2701 # Returns NULL if none exists or the file handle is not a CRAM file. | |
2702 # | |
2703 refs_t *cram_get_refs(htsFile *fd) | |
2704 | |
2705 | |
2706 cdef class HTSFile(object): | |
2707 cdef htsFile *htsfile # pointer to htsFile structure | |
2708 cdef int64_t start_offset # BGZF offset of first record | |
2709 | |
2710 cdef readonly object filename # filename as supplied by user | |
2711 cdef readonly object mode # file opening mode | |
2712 cdef readonly object threads # number of threads to use | |
2713 cdef readonly object index_filename # filename of index, if supplied by user | |
2714 | |
2715 cdef readonly bint is_stream # Is htsfile a non-seekable stream | |
2716 cdef readonly bint is_remote # Is htsfile a remote stream | |
2717 cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh | |
2718 | |
2719 cdef htsFile *_open_htsfile(self) except? NULL |