jpayne@69
|
1 # cython: language_level=3
|
jpayne@69
|
2 # cython: embedsignature=True
|
jpayne@69
|
3 # cython: profile=True
|
jpayne@69
|
4 ###############################################################################
|
jpayne@69
|
5 ###############################################################################
|
jpayne@69
|
6 # Cython wrapper for access to tabix indexed files in bgzf format
|
jpayne@69
|
7 ###############################################################################
|
jpayne@69
|
8 # The principal classes and functions defined in this module are:
|
jpayne@69
|
9 #
|
jpayne@69
|
10 # class TabixFile class wrapping tabix indexed files in bgzf format
|
jpayne@69
|
11 #
|
jpayne@69
|
12 # class asTuple Parser class for tuples
|
jpayne@69
|
13 # class asGTF Parser class for GTF formatted rows
|
jpayne@69
|
14 # class asGFF3 Parser class for GFF3 formatted rows
|
jpayne@69
|
15 # class asBed Parser class for Bed formatted rows
|
jpayne@69
|
16 # class asVCF Parser class for VCF formatted rows
|
jpayne@69
|
17 #
|
jpayne@69
|
18 # class tabix_generic_iterator Streamed iterator of bgzf formatted files
|
jpayne@69
|
19 #
|
jpayne@69
|
20 # Additionally this module defines several additional classes that are part
|
jpayne@69
|
21 # of the internal API. These are:
|
jpayne@69
|
22 #
|
jpayne@69
|
23 # class Parser base class for parsers of tab-separated rows
|
jpayne@69
|
24 # class tabix_file_iterator
|
jpayne@69
|
25 # class TabixIterator iterator class over rows in bgzf file
|
jpayne@69
|
26 # class EmptyIterator
|
jpayne@69
|
27 #
|
jpayne@69
|
28 # For backwards compatibility, the following classes are also defined:
|
jpayne@69
|
29 #
|
jpayne@69
|
30 # class Tabixfile equivalent to TabixFile
|
jpayne@69
|
31 #
|
jpayne@69
|
32 ###############################################################################
|
jpayne@69
|
33 #
|
jpayne@69
|
34 # The MIT License
|
jpayne@69
|
35 #
|
jpayne@69
|
36 # Copyright (c) 2015 Andreas Heger
|
jpayne@69
|
37 #
|
jpayne@69
|
38 # Permission is hereby granted, free of charge, to any person obtaining a
|
jpayne@69
|
39 # copy of this software and associated documentation files (the "Software"),
|
jpayne@69
|
40 # to deal in the Software without restriction, including without limitation
|
jpayne@69
|
41 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
jpayne@69
|
42 # and/or sell copies of the Software, and to permit persons to whom the
|
jpayne@69
|
43 # Software is furnished to do so, subject to the following conditions:
|
jpayne@69
|
44 #
|
jpayne@69
|
45 # The above copyright notice and this permission notice shall be included in
|
jpayne@69
|
46 # all copies or substantial portions of the Software.
|
jpayne@69
|
47 #
|
jpayne@69
|
48 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
jpayne@69
|
49 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
jpayne@69
|
50 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
jpayne@69
|
51 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
jpayne@69
|
52 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
jpayne@69
|
53 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
jpayne@69
|
54 # DEALINGS IN THE SOFTWARE.
|
jpayne@69
|
55 #
|
jpayne@69
|
56 ###############################################################################
|
jpayne@69
|
57 import os
|
jpayne@69
|
58 import sys
|
jpayne@69
|
59
|
jpayne@69
|
60 from libc.stdio cimport printf, fprintf, stderr
|
jpayne@69
|
61 from libc.string cimport strerror
|
jpayne@69
|
62 from libc.errno cimport errno
|
jpayne@69
|
63 from posix.unistd cimport dup
|
jpayne@69
|
64
|
jpayne@69
|
65 from cpython cimport PyErr_SetString, PyBytes_Check, \
|
jpayne@69
|
66 PyUnicode_Check, PyBytes_FromStringAndSize, \
|
jpayne@69
|
67 PyObject_AsFileDescriptor
|
jpayne@69
|
68
|
jpayne@69
|
69 cimport pysam.libctabixproxies as ctabixproxies
|
jpayne@69
|
70
|
jpayne@69
|
71 from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
|
jpayne@69
|
72 BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
|
jpayne@69
|
73 tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
|
jpayne@69
|
74 tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
|
jpayne@69
|
75 tbx_destroy, hisremote, region_list, hts_getline, \
|
jpayne@69
|
76 TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
|
jpayne@69
|
77 no_compression, bcf, bcf_index_build2
|
jpayne@69
|
78
|
jpayne@69
|
79 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
|
jpayne@69
|
80 from pysam.libcutils cimport encode_filename, from_string_and_size
|
jpayne@69
|
81
|
jpayne@69
|
82 cdef class Parser:
|
jpayne@69
|
83
|
jpayne@69
|
84 def __init__(self, encoding="ascii"):
|
jpayne@69
|
85 self.encoding = encoding
|
jpayne@69
|
86
|
jpayne@69
|
87 def set_encoding(self, encoding):
|
jpayne@69
|
88 self.encoding = encoding
|
jpayne@69
|
89
|
jpayne@69
|
90 def get_encoding(self):
|
jpayne@69
|
91 return self.encoding
|
jpayne@69
|
92
|
jpayne@69
|
93 cdef parse(self, char * buffer, int length):
|
jpayne@69
|
94 raise NotImplementedError(
|
jpayne@69
|
95 'parse method of %s not implemented' % str(self))
|
jpayne@69
|
96
|
jpayne@69
|
97 def __call__(self, char * buffer, int length):
|
jpayne@69
|
98 return self.parse(buffer, length)
|
jpayne@69
|
99
|
jpayne@69
|
100
|
jpayne@69
|
101 cdef class asTuple(Parser):
|
jpayne@69
|
102 '''converts a :term:`tabix row` into a python tuple.
|
jpayne@69
|
103
|
jpayne@69
|
104 A field in a row is accessed by numeric index.
|
jpayne@69
|
105 '''
|
jpayne@69
|
106 cdef parse(self, char * buffer, int len):
|
jpayne@69
|
107 cdef ctabixproxies.TupleProxy r
|
jpayne@69
|
108 r = ctabixproxies.TupleProxy(self.encoding)
|
jpayne@69
|
109 # need to copy - there were some
|
jpayne@69
|
110 # persistence issues with "present"
|
jpayne@69
|
111 r.copy(buffer, len)
|
jpayne@69
|
112 return r
|
jpayne@69
|
113
|
jpayne@69
|
114
|
jpayne@69
|
115 cdef class asGFF3(Parser):
|
jpayne@69
|
116 '''converts a :term:`tabix row` into a GFF record with the following
|
jpayne@69
|
117 fields:
|
jpayne@69
|
118
|
jpayne@69
|
119 +----------+----------+-------------------------------+
|
jpayne@69
|
120 |*Column* |*Name* |*Content* |
|
jpayne@69
|
121 +----------+----------+-------------------------------+
|
jpayne@69
|
122 |1 |contig |the chromosome name |
|
jpayne@69
|
123 +----------+----------+-------------------------------+
|
jpayne@69
|
124 |2 |feature |The feature type |
|
jpayne@69
|
125 +----------+----------+-------------------------------+
|
jpayne@69
|
126 |3 |source |The feature source |
|
jpayne@69
|
127 +----------+----------+-------------------------------+
|
jpayne@69
|
128 |4 |start |genomic start coordinate |
|
jpayne@69
|
129 | | |(0-based) |
|
jpayne@69
|
130 +----------+----------+-------------------------------+
|
jpayne@69
|
131 |5 |end |genomic end coordinate |
|
jpayne@69
|
132 | | |(0-based) |
|
jpayne@69
|
133 +----------+----------+-------------------------------+
|
jpayne@69
|
134 |6 |score |feature score |
|
jpayne@69
|
135 +----------+----------+-------------------------------+
|
jpayne@69
|
136 |7 |strand |strand |
|
jpayne@69
|
137 +----------+----------+-------------------------------+
|
jpayne@69
|
138 |8 |frame |frame |
|
jpayne@69
|
139 +----------+----------+-------------------------------+
|
jpayne@69
|
140 |9 |attributes|the attribute field |
|
jpayne@69
|
141 +----------+----------+-------------------------------+
|
jpayne@69
|
142
|
jpayne@69
|
143 '''
|
jpayne@69
|
144 cdef parse(self, char * buffer, int len):
|
jpayne@69
|
145 cdef ctabixproxies.GFF3Proxy r
|
jpayne@69
|
146 r = ctabixproxies.GFF3Proxy(self.encoding)
|
jpayne@69
|
147 r.copy(buffer, len)
|
jpayne@69
|
148 return r
|
jpayne@69
|
149
|
jpayne@69
|
150
|
jpayne@69
|
151 cdef class asGTF(Parser):
|
jpayne@69
|
152 '''converts a :term:`tabix row` into a GTF record with the following
|
jpayne@69
|
153 fields:
|
jpayne@69
|
154
|
jpayne@69
|
155 +----------+----------+-------------------------------+
|
jpayne@69
|
156 |*Column* |*Name* |*Content* |
|
jpayne@69
|
157 +----------+----------+-------------------------------+
|
jpayne@69
|
158 |1 |contig |the chromosome name |
|
jpayne@69
|
159 +----------+----------+-------------------------------+
|
jpayne@69
|
160 |2 |feature |The feature type |
|
jpayne@69
|
161 +----------+----------+-------------------------------+
|
jpayne@69
|
162 |3 |source |The feature source |
|
jpayne@69
|
163 +----------+----------+-------------------------------+
|
jpayne@69
|
164 |4 |start |genomic start coordinate |
|
jpayne@69
|
165 | | |(0-based) |
|
jpayne@69
|
166 +----------+----------+-------------------------------+
|
jpayne@69
|
167 |5 |end |genomic end coordinate |
|
jpayne@69
|
168 | | |(0-based) |
|
jpayne@69
|
169 +----------+----------+-------------------------------+
|
jpayne@69
|
170 |6 |score |feature score |
|
jpayne@69
|
171 +----------+----------+-------------------------------+
|
jpayne@69
|
172 |7 |strand |strand |
|
jpayne@69
|
173 +----------+----------+-------------------------------+
|
jpayne@69
|
174 |8 |frame |frame |
|
jpayne@69
|
175 +----------+----------+-------------------------------+
|
jpayne@69
|
176 |9 |attributes|the attribute field |
|
jpayne@69
|
177 +----------+----------+-------------------------------+
|
jpayne@69
|
178
|
jpayne@69
|
179 GTF formatted entries also define the following fields that
|
jpayne@69
|
180 are derived from the attributes field:
|
jpayne@69
|
181
|
jpayne@69
|
182 +--------------------+------------------------------+
|
jpayne@69
|
183 |*Name* |*Content* |
|
jpayne@69
|
184 +--------------------+------------------------------+
|
jpayne@69
|
185 |gene_id |the gene identifier |
|
jpayne@69
|
186 +--------------------+------------------------------+
|
jpayne@69
|
187 |transcript_id |the transcript identifier |
|
jpayne@69
|
188 +--------------------+------------------------------+
|
jpayne@69
|
189
|
jpayne@69
|
190 '''
|
jpayne@69
|
191 cdef parse(self, char * buffer, int len):
|
jpayne@69
|
192 cdef ctabixproxies.GTFProxy r
|
jpayne@69
|
193 r = ctabixproxies.GTFProxy(self.encoding)
|
jpayne@69
|
194 r.copy(buffer, len)
|
jpayne@69
|
195 return r
|
jpayne@69
|
196
|
jpayne@69
|
197
|
jpayne@69
|
198 cdef class asBed(Parser):
|
jpayne@69
|
199 '''converts a :term:`tabix row` into a bed record
|
jpayne@69
|
200 with the following fields:
|
jpayne@69
|
201
|
jpayne@69
|
202 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
203 |*Column* |*Field* |*Contents* |
|
jpayne@69
|
204 | | | |
|
jpayne@69
|
205 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
206 |1 |contig |contig |
|
jpayne@69
|
207 | | | |
|
jpayne@69
|
208 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
209 |2 |start |genomic start coordinate (zero-based) |
|
jpayne@69
|
210 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
211 |3 |end |genomic end coordinate plus one |
|
jpayne@69
|
212 | | |(zero-based) |
|
jpayne@69
|
213 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
214 |4 |name |name of feature. |
|
jpayne@69
|
215 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
216 |5 |score |score of feature |
|
jpayne@69
|
217 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
218 |6 |strand |strand of feature |
|
jpayne@69
|
219 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
220 |7 |thickStart |thickStart |
|
jpayne@69
|
221 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
222 |8 |thickEnd |thickEnd |
|
jpayne@69
|
223 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
224 |9 |itemRGB |itemRGB |
|
jpayne@69
|
225 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
226 |10 |blockCount |number of bocks |
|
jpayne@69
|
227 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
228 |11 |blockSizes |',' separated string of block sizes |
|
jpayne@69
|
229 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
230 |12 |blockStarts|',' separated string of block genomic |
|
jpayne@69
|
231 | | |start positions |
|
jpayne@69
|
232 +-----------+-----------+------------------------------------------+
|
jpayne@69
|
233
|
jpayne@69
|
234 Only the first three fields are required. Additional
|
jpayne@69
|
235 fields are optional, but if one is defined, all the preceding
|
jpayne@69
|
236 need to be defined as well.
|
jpayne@69
|
237
|
jpayne@69
|
238 '''
|
jpayne@69
|
239 cdef parse(self, char * buffer, int len):
|
jpayne@69
|
240 cdef ctabixproxies.BedProxy r
|
jpayne@69
|
241 r = ctabixproxies.BedProxy(self.encoding)
|
jpayne@69
|
242 r.copy(buffer, len)
|
jpayne@69
|
243 return r
|
jpayne@69
|
244
|
jpayne@69
|
245
|
jpayne@69
|
246 cdef class asVCF(Parser):
|
jpayne@69
|
247 '''converts a :term:`tabix row` into a VCF record with
|
jpayne@69
|
248 the following fields:
|
jpayne@69
|
249
|
jpayne@69
|
250 +----------+---------+------------------------------------+
|
jpayne@69
|
251 |*Column* |*Field* |*Contents* |
|
jpayne@69
|
252 | | | |
|
jpayne@69
|
253 +----------+---------+------------------------------------+
|
jpayne@69
|
254 |1 |contig |chromosome |
|
jpayne@69
|
255 +----------+---------+------------------------------------+
|
jpayne@69
|
256 |2 |pos |chromosomal position, zero-based |
|
jpayne@69
|
257 +----------+---------+------------------------------------+
|
jpayne@69
|
258 |3 |id |id |
|
jpayne@69
|
259 +----------+---------+------------------------------------+
|
jpayne@69
|
260 |4 |ref |reference allele |
|
jpayne@69
|
261 +----------+---------+------------------------------------+
|
jpayne@69
|
262 |5 |alt |alternate alleles |
|
jpayne@69
|
263 +----------+---------+------------------------------------+
|
jpayne@69
|
264 |6 |qual |quality |
|
jpayne@69
|
265 +----------+---------+------------------------------------+
|
jpayne@69
|
266 |7 |filter |filter |
|
jpayne@69
|
267 +----------+---------+------------------------------------+
|
jpayne@69
|
268 |8 |info |info |
|
jpayne@69
|
269 +----------+---------+------------------------------------+
|
jpayne@69
|
270 |9 |format |format specifier. |
|
jpayne@69
|
271 +----------+---------+------------------------------------+
|
jpayne@69
|
272
|
jpayne@69
|
273 Access to genotypes is via index::
|
jpayne@69
|
274
|
jpayne@69
|
275 contig = vcf.contig
|
jpayne@69
|
276 first_sample_genotype = vcf[0]
|
jpayne@69
|
277 second_sample_genotype = vcf[1]
|
jpayne@69
|
278
|
jpayne@69
|
279 '''
|
jpayne@69
|
280 cdef parse(self, char * buffer, int len):
|
jpayne@69
|
281 cdef ctabixproxies.VCFProxy r
|
jpayne@69
|
282 r = ctabixproxies.VCFProxy(self.encoding)
|
jpayne@69
|
283 r.copy(buffer, len)
|
jpayne@69
|
284 return r
|
jpayne@69
|
285
|
jpayne@69
|
286
|
jpayne@69
|
287 cdef class TabixFile:
|
jpayne@69
|
288 """Random access to bgzf formatted files that
|
jpayne@69
|
289 have been indexed by :term:`tabix`.
|
jpayne@69
|
290
|
jpayne@69
|
291 The file is automatically opened. The index file of file
|
jpayne@69
|
292 ``<filename>`` is expected to be called ``<filename>.tbi``
|
jpayne@69
|
293 by default (see parameter `index`).
|
jpayne@69
|
294
|
jpayne@69
|
295 Parameters
|
jpayne@69
|
296 ----------
|
jpayne@69
|
297
|
jpayne@69
|
298 filename : string
|
jpayne@69
|
299 Filename of bgzf file to be opened.
|
jpayne@69
|
300
|
jpayne@69
|
301 index : string
|
jpayne@69
|
302 The filename of the index. If not set, the default is to
|
jpayne@69
|
303 assume that the index is called ``filename.tbi``
|
jpayne@69
|
304
|
jpayne@69
|
305 mode : char
|
jpayne@69
|
306 The file opening mode. Currently, only ``r`` is permitted.
|
jpayne@69
|
307
|
jpayne@69
|
308 parser : :class:`pysam.Parser`
|
jpayne@69
|
309
|
jpayne@69
|
310 sets the default parser for this tabix file. If `parser`
|
jpayne@69
|
311 is None, the results are returned as an unparsed string.
|
jpayne@69
|
312 Otherwise, `parser` is assumed to be a functor that will return
|
jpayne@69
|
313 parsed data (see for example :class:`~pysam.asTuple` and
|
jpayne@69
|
314 :class:`~pysam.asGTF`).
|
jpayne@69
|
315
|
jpayne@69
|
316 encoding : string
|
jpayne@69
|
317
|
jpayne@69
|
318 The encoding passed to the parser
|
jpayne@69
|
319
|
jpayne@69
|
320 threads: integer
|
jpayne@69
|
321 Number of threads to use for decompressing Tabix files.
|
jpayne@69
|
322 (Default=1)
|
jpayne@69
|
323
|
jpayne@69
|
324
|
jpayne@69
|
325 Raises
|
jpayne@69
|
326 ------
|
jpayne@69
|
327
|
jpayne@69
|
328 ValueError
|
jpayne@69
|
329 if index file is missing.
|
jpayne@69
|
330
|
jpayne@69
|
331 IOError
|
jpayne@69
|
332 if file could not be opened
|
jpayne@69
|
333 """
|
jpayne@69
|
334 def __cinit__(self,
|
jpayne@69
|
335 filename,
|
jpayne@69
|
336 mode='r',
|
jpayne@69
|
337 parser=None,
|
jpayne@69
|
338 index=None,
|
jpayne@69
|
339 encoding="ascii",
|
jpayne@69
|
340 threads=1,
|
jpayne@69
|
341 *args,
|
jpayne@69
|
342 **kwargs ):
|
jpayne@69
|
343
|
jpayne@69
|
344 self.htsfile = NULL
|
jpayne@69
|
345 self.is_remote = False
|
jpayne@69
|
346 self.is_stream = False
|
jpayne@69
|
347 self.parser = parser
|
jpayne@69
|
348 self.threads = threads
|
jpayne@69
|
349 self._open(filename, mode, index, *args, **kwargs)
|
jpayne@69
|
350 self.encoding = encoding
|
jpayne@69
|
351
|
jpayne@69
|
352 def _open( self,
|
jpayne@69
|
353 filename,
|
jpayne@69
|
354 mode='r',
|
jpayne@69
|
355 index=None,
|
jpayne@69
|
356 threads=1,
|
jpayne@69
|
357 ):
|
jpayne@69
|
358 '''open a :term:`tabix file` for reading.'''
|
jpayne@69
|
359
|
jpayne@69
|
360 if mode != 'r':
|
jpayne@69
|
361 raise ValueError("invalid file opening mode `%s`" % mode)
|
jpayne@69
|
362
|
jpayne@69
|
363 if self.htsfile != NULL:
|
jpayne@69
|
364 self.close()
|
jpayne@69
|
365 self.htsfile = NULL
|
jpayne@69
|
366 self.threads=threads
|
jpayne@69
|
367
|
jpayne@69
|
368 filename_index = index or (filename + ".tbi")
|
jpayne@69
|
369 # encode all the strings to pass to tabix
|
jpayne@69
|
370 self.filename = encode_filename(filename)
|
jpayne@69
|
371 self.filename_index = encode_filename(filename_index)
|
jpayne@69
|
372
|
jpayne@69
|
373 self.is_stream = self.filename == b'-'
|
jpayne@69
|
374 self.is_remote = hisremote(self.filename)
|
jpayne@69
|
375
|
jpayne@69
|
376 if not self.is_remote:
|
jpayne@69
|
377 if not os.path.exists(filename):
|
jpayne@69
|
378 raise IOError("file `%s` not found" % filename)
|
jpayne@69
|
379
|
jpayne@69
|
380 if not os.path.exists(filename_index):
|
jpayne@69
|
381 raise IOError("index `%s` not found" % filename_index)
|
jpayne@69
|
382
|
jpayne@69
|
383 # open file
|
jpayne@69
|
384 cdef char *cfilename = self.filename
|
jpayne@69
|
385 cdef char *cfilename_index = self.filename_index
|
jpayne@69
|
386 with nogil:
|
jpayne@69
|
387 self.htsfile = hts_open(cfilename, 'r')
|
jpayne@69
|
388
|
jpayne@69
|
389 if self.htsfile == NULL:
|
jpayne@69
|
390 raise IOError("could not open file `%s`" % filename)
|
jpayne@69
|
391
|
jpayne@69
|
392 #if self.htsfile.format.category != region_list:
|
jpayne@69
|
393 # raise ValueError("file does not contain region data")
|
jpayne@69
|
394
|
jpayne@69
|
395 with nogil:
|
jpayne@69
|
396 self.index = tbx_index_load2(cfilename, cfilename_index)
|
jpayne@69
|
397
|
jpayne@69
|
398 if self.index == NULL:
|
jpayne@69
|
399 raise IOError("could not open index for `%s`" % filename)
|
jpayne@69
|
400
|
jpayne@69
|
401 if not self.is_stream:
|
jpayne@69
|
402 self.start_offset = self.tell()
|
jpayne@69
|
403
|
jpayne@69
|
404 def _dup(self):
|
jpayne@69
|
405 '''return a copy of this tabix file.
|
jpayne@69
|
406
|
jpayne@69
|
407 The file is being re-opened.
|
jpayne@69
|
408 '''
|
jpayne@69
|
409 return TabixFile(self.filename,
|
jpayne@69
|
410 mode="r",
|
jpayne@69
|
411 threads=self.threads,
|
jpayne@69
|
412 parser=self.parser,
|
jpayne@69
|
413 index=self.filename_index,
|
jpayne@69
|
414 encoding=self.encoding)
|
jpayne@69
|
415
|
jpayne@69
|
416 def fetch(self,
|
jpayne@69
|
417 reference=None,
|
jpayne@69
|
418 start=None,
|
jpayne@69
|
419 end=None,
|
jpayne@69
|
420 region=None,
|
jpayne@69
|
421 parser=None,
|
jpayne@69
|
422 multiple_iterators=False):
|
jpayne@69
|
423 '''fetch one or more rows in a :term:`region` using 0-based
|
jpayne@69
|
424 indexing. The region is specified by :term:`reference`,
|
jpayne@69
|
425 *start* and *end*. Alternatively, a samtools :term:`region`
|
jpayne@69
|
426 string can be supplied.
|
jpayne@69
|
427
|
jpayne@69
|
428 Without *reference* or *region* all entries will be fetched.
|
jpayne@69
|
429
|
jpayne@69
|
430 If only *reference* is set, all reads matching on *reference*
|
jpayne@69
|
431 will be fetched.
|
jpayne@69
|
432
|
jpayne@69
|
433 If *parser* is None, the default parser will be used for
|
jpayne@69
|
434 parsing.
|
jpayne@69
|
435
|
jpayne@69
|
436 Set *multiple_iterators* to true if you will be using multiple
|
jpayne@69
|
437 iterators on the same file at the same time. The iterator
|
jpayne@69
|
438 returned will receive its own copy of a filehandle to the file
|
jpayne@69
|
439 effectively re-opening the file. Re-opening a file creates
|
jpayne@69
|
440 some overhead, so beware.
|
jpayne@69
|
441
|
jpayne@69
|
442 '''
|
jpayne@69
|
443 if not self.is_open():
|
jpayne@69
|
444 raise ValueError("I/O operation on closed file")
|
jpayne@69
|
445
|
jpayne@69
|
446 # convert coordinates to region string, which is one-based
|
jpayne@69
|
447 if reference:
|
jpayne@69
|
448 if end is not None:
|
jpayne@69
|
449 if end < 0:
|
jpayne@69
|
450 raise ValueError("end out of range (%i)" % end)
|
jpayne@69
|
451 if start is None:
|
jpayne@69
|
452 start = 0
|
jpayne@69
|
453
|
jpayne@69
|
454 if start < 0:
|
jpayne@69
|
455 raise ValueError("start out of range (%i)" % end)
|
jpayne@69
|
456 elif start > end:
|
jpayne@69
|
457 raise ValueError(
|
jpayne@69
|
458 'start (%i) >= end (%i)' % (start, end))
|
jpayne@69
|
459 elif start == end:
|
jpayne@69
|
460 return EmptyIterator()
|
jpayne@69
|
461 else:
|
jpayne@69
|
462 region = '%s:%i-%i' % (reference, start + 1, end)
|
jpayne@69
|
463 elif start is not None:
|
jpayne@69
|
464 if start < 0:
|
jpayne@69
|
465 raise ValueError("start out of range (%i)" % end)
|
jpayne@69
|
466 region = '%s:%i' % (reference, start + 1)
|
jpayne@69
|
467 else:
|
jpayne@69
|
468 region = reference
|
jpayne@69
|
469
|
jpayne@69
|
470 # get iterator
|
jpayne@69
|
471 cdef hts_itr_t * itr
|
jpayne@69
|
472 cdef char *cstr
|
jpayne@69
|
473 cdef TabixFile fileobj
|
jpayne@69
|
474
|
jpayne@69
|
475 # reopen the same file if necessary
|
jpayne@69
|
476 if multiple_iterators:
|
jpayne@69
|
477 fileobj = self._dup()
|
jpayne@69
|
478 else:
|
jpayne@69
|
479 fileobj = self
|
jpayne@69
|
480
|
jpayne@69
|
481 if region is None:
|
jpayne@69
|
482 # without region or reference - iterate from start
|
jpayne@69
|
483 with nogil:
|
jpayne@69
|
484 itr = tbx_itr_queryi(fileobj.index,
|
jpayne@69
|
485 HTS_IDX_START,
|
jpayne@69
|
486 0,
|
jpayne@69
|
487 0)
|
jpayne@69
|
488 else:
|
jpayne@69
|
489 s = force_bytes(region, encoding=fileobj.encoding)
|
jpayne@69
|
490 cstr = s
|
jpayne@69
|
491 with nogil:
|
jpayne@69
|
492 itr = tbx_itr_querys(fileobj.index, cstr)
|
jpayne@69
|
493
|
jpayne@69
|
494 if itr == NULL:
|
jpayne@69
|
495 if region is None:
|
jpayne@69
|
496 if len(self.contigs) > 0:
|
jpayne@69
|
497 # when accessing a tabix file created prior tabix 1.0
|
jpayne@69
|
498 # the full-file iterator is empty.
|
jpayne@69
|
499 raise ValueError(
|
jpayne@69
|
500 "could not create iterator, possible "
|
jpayne@69
|
501 "tabix version mismatch")
|
jpayne@69
|
502 else:
|
jpayne@69
|
503 # possible reason is that the file is empty -
|
jpayne@69
|
504 # return an empty iterator
|
jpayne@69
|
505 return EmptyIterator()
|
jpayne@69
|
506 else:
|
jpayne@69
|
507 raise ValueError(
|
jpayne@69
|
508 "could not create iterator for region '%s'" %
|
jpayne@69
|
509 region)
|
jpayne@69
|
510
|
jpayne@69
|
511 # use default parser if no parser is specified
|
jpayne@69
|
512 if parser is None:
|
jpayne@69
|
513 parser = fileobj.parser
|
jpayne@69
|
514
|
jpayne@69
|
515 cdef TabixIterator a
|
jpayne@69
|
516 if parser is None:
|
jpayne@69
|
517 a = TabixIterator(encoding=fileobj.encoding)
|
jpayne@69
|
518 else:
|
jpayne@69
|
519 parser.set_encoding(fileobj.encoding)
|
jpayne@69
|
520 a = TabixIteratorParsed(parser)
|
jpayne@69
|
521
|
jpayne@69
|
522 a.tabixfile = fileobj
|
jpayne@69
|
523 a.iterator = itr
|
jpayne@69
|
524
|
jpayne@69
|
525 return a
|
jpayne@69
|
526
|
jpayne@69
|
527 ###############################################################
|
jpayne@69
|
528 ###############################################################
|
jpayne@69
|
529 ###############################################################
|
jpayne@69
|
530 ## properties
|
jpayne@69
|
531 ###############################################################
|
jpayne@69
|
532 property header:
|
jpayne@69
|
533 '''the file header.
|
jpayne@69
|
534
|
jpayne@69
|
535 The file header consists of the lines at the beginning of a
|
jpayne@69
|
536 file that are prefixed by the comment character ``#``.
|
jpayne@69
|
537
|
jpayne@69
|
538 .. note::
|
jpayne@69
|
539 The header is returned as an iterator presenting lines
|
jpayne@69
|
540 without the newline character.
|
jpayne@69
|
541 '''
|
jpayne@69
|
542
|
jpayne@69
|
543 def __get__(self):
|
jpayne@69
|
544
|
jpayne@69
|
545 cdef char *cfilename = self.filename
|
jpayne@69
|
546 cdef char *cfilename_index = self.filename_index
|
jpayne@69
|
547
|
jpayne@69
|
548 cdef kstring_t buffer
|
jpayne@69
|
549 buffer.l = buffer.m = 0
|
jpayne@69
|
550 buffer.s = NULL
|
jpayne@69
|
551
|
jpayne@69
|
552 cdef htsFile * fp = NULL
|
jpayne@69
|
553 cdef int KS_SEP_LINE = 2
|
jpayne@69
|
554 cdef tbx_t * tbx = NULL
|
jpayne@69
|
555 lines = []
|
jpayne@69
|
556 with nogil:
|
jpayne@69
|
557 fp = hts_open(cfilename, 'r')
|
jpayne@69
|
558
|
jpayne@69
|
559 if fp == NULL:
|
jpayne@69
|
560 raise OSError("could not open {} for reading header".format(self.filename))
|
jpayne@69
|
561
|
jpayne@69
|
562 with nogil:
|
jpayne@69
|
563 tbx = tbx_index_load2(cfilename, cfilename_index)
|
jpayne@69
|
564
|
jpayne@69
|
565 if tbx == NULL:
|
jpayne@69
|
566 raise OSError("could not load .tbi/.csi index of {}".format(self.filename))
|
jpayne@69
|
567
|
jpayne@69
|
568 while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
|
jpayne@69
|
569 if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
|
jpayne@69
|
570 break
|
jpayne@69
|
571 lines.append(force_str(buffer.s, self.encoding))
|
jpayne@69
|
572
|
jpayne@69
|
573 with nogil:
|
jpayne@69
|
574 hts_close(fp)
|
jpayne@69
|
575 free(buffer.s)
|
jpayne@69
|
576
|
jpayne@69
|
577 return lines
|
jpayne@69
|
578
|
jpayne@69
|
579 property contigs:
|
jpayne@69
|
580 '''list of chromosome names'''
|
jpayne@69
|
581 def __get__(self):
|
jpayne@69
|
582 cdef const char ** sequences
|
jpayne@69
|
583 cdef int nsequences
|
jpayne@69
|
584
|
jpayne@69
|
585 with nogil:
|
jpayne@69
|
586 sequences = tbx_seqnames(self.index, &nsequences)
|
jpayne@69
|
587 cdef int x
|
jpayne@69
|
588 result = []
|
jpayne@69
|
589 for x from 0 <= x < nsequences:
|
jpayne@69
|
590 result.append(force_str(sequences[x]))
|
jpayne@69
|
591
|
jpayne@69
|
592 # htslib instructions:
|
jpayne@69
|
593 # only free container, not the sequences themselves
|
jpayne@69
|
594 free(sequences)
|
jpayne@69
|
595
|
jpayne@69
|
596 return result
|
jpayne@69
|
597
|
jpayne@69
|
598 def close(self):
|
jpayne@69
|
599 '''
|
jpayne@69
|
600 closes the :class:`pysam.TabixFile`.'''
|
jpayne@69
|
601 if self.htsfile != NULL:
|
jpayne@69
|
602 hts_close(self.htsfile)
|
jpayne@69
|
603 self.htsfile = NULL
|
jpayne@69
|
604 if self.index != NULL:
|
jpayne@69
|
605 tbx_destroy(self.index)
|
jpayne@69
|
606 self.index = NULL
|
jpayne@69
|
607
|
jpayne@69
|
608 def __dealloc__( self ):
|
jpayne@69
|
609 # remember: dealloc cannot call other python methods
|
jpayne@69
|
610 # note: no doc string
|
jpayne@69
|
611 # note: __del__ is not called.
|
jpayne@69
|
612 if self.htsfile != NULL:
|
jpayne@69
|
613 hts_close(self.htsfile)
|
jpayne@69
|
614 self.htsfile = NULL
|
jpayne@69
|
615 if self.index != NULL:
|
jpayne@69
|
616 tbx_destroy(self.index)
|
jpayne@69
|
617
|
jpayne@69
|
618
|
jpayne@69
|
619 cdef class TabixIterator:
|
jpayne@69
|
620 """iterates over rows in *tabixfile* in region
|
jpayne@69
|
621 given by *tid*, *start* and *end*.
|
jpayne@69
|
622 """
|
jpayne@69
|
623
|
jpayne@69
|
624 def __init__(self, encoding="ascii"):
|
jpayne@69
|
625 self.encoding = encoding
|
jpayne@69
|
626
|
jpayne@69
|
627 def __iter__(self):
|
jpayne@69
|
628 self.buffer.s = NULL
|
jpayne@69
|
629 self.buffer.l = 0
|
jpayne@69
|
630 self.buffer.m = 0
|
jpayne@69
|
631
|
jpayne@69
|
632 return self
|
jpayne@69
|
633
|
jpayne@69
|
634 cdef int __cnext__(self):
|
jpayne@69
|
635 '''iterate to next element.
|
jpayne@69
|
636
|
jpayne@69
|
637 Return -5 if file has been closed when this function
|
jpayne@69
|
638 was called.
|
jpayne@69
|
639 '''
|
jpayne@69
|
640 if self.tabixfile.htsfile == NULL:
|
jpayne@69
|
641 return -5
|
jpayne@69
|
642
|
jpayne@69
|
643 cdef int retval
|
jpayne@69
|
644
|
jpayne@69
|
645 while 1:
|
jpayne@69
|
646 with nogil:
|
jpayne@69
|
647 retval = tbx_itr_next(
|
jpayne@69
|
648 self.tabixfile.htsfile,
|
jpayne@69
|
649 self.tabixfile.index,
|
jpayne@69
|
650 self.iterator,
|
jpayne@69
|
651 &self.buffer)
|
jpayne@69
|
652
|
jpayne@69
|
653 if retval < 0:
|
jpayne@69
|
654 break
|
jpayne@69
|
655
|
jpayne@69
|
656 if self.buffer.s[0] != b'#':
|
jpayne@69
|
657 break
|
jpayne@69
|
658
|
jpayne@69
|
659 return retval
|
jpayne@69
|
660
|
jpayne@69
|
661 def __next__(self):
|
jpayne@69
|
662 """python version of next().
|
jpayne@69
|
663
|
jpayne@69
|
664 pyrex uses this non-standard name instead of next()
|
jpayne@69
|
665 """
|
jpayne@69
|
666
|
jpayne@69
|
667 cdef int retval = self.__cnext__()
|
jpayne@69
|
668 if retval == -5:
|
jpayne@69
|
669 raise IOError("iteration on closed file")
|
jpayne@69
|
670 elif retval < 0:
|
jpayne@69
|
671 raise StopIteration
|
jpayne@69
|
672
|
jpayne@69
|
673 return charptr_to_str(self.buffer.s, self.encoding)
|
jpayne@69
|
674
|
jpayne@69
|
675 def __dealloc__(self):
|
jpayne@69
|
676 if <void*>self.iterator != NULL:
|
jpayne@69
|
677 tbx_itr_destroy(self.iterator)
|
jpayne@69
|
678 if self.buffer.s != NULL:
|
jpayne@69
|
679 free(self.buffer.s)
|
jpayne@69
|
680
|
jpayne@69
|
681
|
jpayne@69
|
682 class EmptyIterator:
|
jpayne@69
|
683 '''empty iterator'''
|
jpayne@69
|
684
|
jpayne@69
|
685 def __iter__(self):
|
jpayne@69
|
686 return self
|
jpayne@69
|
687
|
jpayne@69
|
688 def __next__(self):
|
jpayne@69
|
689 raise StopIteration()
|
jpayne@69
|
690
|
jpayne@69
|
691
|
jpayne@69
|
692 cdef class TabixIteratorParsed(TabixIterator):
|
jpayne@69
|
693 """iterates over mapped reads in a region.
|
jpayne@69
|
694
|
jpayne@69
|
695 The *parser* determines the encoding.
|
jpayne@69
|
696
|
jpayne@69
|
697 Returns parsed data.
|
jpayne@69
|
698 """
|
jpayne@69
|
699
|
jpayne@69
|
700 def __init__(self, Parser parser):
|
jpayne@69
|
701 super().__init__()
|
jpayne@69
|
702 self.parser = parser
|
jpayne@69
|
703
|
jpayne@69
|
704 def __next__(self):
|
jpayne@69
|
705 """python version of next().
|
jpayne@69
|
706
|
jpayne@69
|
707 pyrex uses this non-standard name instead of next()
|
jpayne@69
|
708 """
|
jpayne@69
|
709
|
jpayne@69
|
710 cdef int retval = self.__cnext__()
|
jpayne@69
|
711 if retval == -5:
|
jpayne@69
|
712 raise IOError("iteration on closed file")
|
jpayne@69
|
713 elif retval < 0:
|
jpayne@69
|
714 raise StopIteration
|
jpayne@69
|
715
|
jpayne@69
|
716 return self.parser.parse(self.buffer.s,
|
jpayne@69
|
717 self.buffer.l)
|
jpayne@69
|
718
|
jpayne@69
|
719
|
jpayne@69
|
720 cdef class GZIterator:
|
jpayne@69
|
721 def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
|
jpayne@69
|
722 '''iterate line-by-line through gzip (or bgzip)
|
jpayne@69
|
723 compressed file.
|
jpayne@69
|
724 '''
|
jpayne@69
|
725 if not os.path.exists(filename):
|
jpayne@69
|
726 raise IOError("No such file or directory: %s" % filename)
|
jpayne@69
|
727
|
jpayne@69
|
728 filename = encode_filename(filename)
|
jpayne@69
|
729 cdef char *cfilename = filename
|
jpayne@69
|
730 with nogil:
|
jpayne@69
|
731 self.gzipfile = bgzf_open(cfilename, "r")
|
jpayne@69
|
732 self._filename = filename
|
jpayne@69
|
733 self.kstream = ks_init(self.gzipfile)
|
jpayne@69
|
734 self.encoding = encoding
|
jpayne@69
|
735
|
jpayne@69
|
736 self.buffer.l = 0
|
jpayne@69
|
737 self.buffer.m = 0
|
jpayne@69
|
738 self.buffer.s = <char*>malloc(buffer_size)
|
jpayne@69
|
739
|
jpayne@69
|
740 def __dealloc__(self):
|
jpayne@69
|
741 '''close file.'''
|
jpayne@69
|
742 if self.gzipfile != NULL:
|
jpayne@69
|
743 bgzf_close(self.gzipfile)
|
jpayne@69
|
744 self.gzipfile = NULL
|
jpayne@69
|
745 if self.buffer.s != NULL:
|
jpayne@69
|
746 free(self.buffer.s)
|
jpayne@69
|
747 if self.kstream != NULL:
|
jpayne@69
|
748 ks_destroy(self.kstream)
|
jpayne@69
|
749
|
jpayne@69
|
750 def __iter__(self):
|
jpayne@69
|
751 return self
|
jpayne@69
|
752
|
jpayne@69
|
753 cdef int __cnext__(self):
|
jpayne@69
|
754 cdef int dret = 0
|
jpayne@69
|
755 cdef int retval = 0
|
jpayne@69
|
756 while 1:
|
jpayne@69
|
757 with nogil:
|
jpayne@69
|
758 retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
|
jpayne@69
|
759
|
jpayne@69
|
760 if retval < 0:
|
jpayne@69
|
761 break
|
jpayne@69
|
762
|
jpayne@69
|
763 return dret
|
jpayne@69
|
764 return -1
|
jpayne@69
|
765
|
jpayne@69
|
766 def __next__(self):
|
jpayne@69
|
767 """python version of next().
|
jpayne@69
|
768 """
|
jpayne@69
|
769 cdef int retval = self.__cnext__()
|
jpayne@69
|
770 if retval < 0:
|
jpayne@69
|
771 raise StopIteration
|
jpayne@69
|
772 return force_str(self.buffer.s, self.encoding)
|
jpayne@69
|
773
|
jpayne@69
|
774
|
jpayne@69
|
775 cdef class GZIteratorHead(GZIterator):
|
jpayne@69
|
776 '''iterate line-by-line through gzip (or bgzip)
|
jpayne@69
|
777 compressed file returning comments at top of file.
|
jpayne@69
|
778 '''
|
jpayne@69
|
779
|
jpayne@69
|
780 def __next__(self):
|
jpayne@69
|
781 """python version of next().
|
jpayne@69
|
782 """
|
jpayne@69
|
783 cdef int retval = self.__cnext__()
|
jpayne@69
|
784 if retval < 0:
|
jpayne@69
|
785 raise StopIteration
|
jpayne@69
|
786 if self.buffer.s[0] == b'#':
|
jpayne@69
|
787 return self.buffer.s
|
jpayne@69
|
788 else:
|
jpayne@69
|
789 raise StopIteration
|
jpayne@69
|
790
|
jpayne@69
|
791
|
jpayne@69
|
792 cdef class GZIteratorParsed(GZIterator):
|
jpayne@69
|
793 '''iterate line-by-line through gzip (or bgzip)
|
jpayne@69
|
794 compressed file returning comments at top of file.
|
jpayne@69
|
795 '''
|
jpayne@69
|
796
|
jpayne@69
|
797 def __init__(self, parser):
|
jpayne@69
|
798 self.parser = parser
|
jpayne@69
|
799
|
jpayne@69
|
800 def __next__(self):
|
jpayne@69
|
801 """python version of next().
|
jpayne@69
|
802 """
|
jpayne@69
|
803 cdef int retval = self.__cnext__()
|
jpayne@69
|
804 if retval < 0:
|
jpayne@69
|
805 raise StopIteration
|
jpayne@69
|
806
|
jpayne@69
|
807 return self.parser.parse(self.buffer.s,
|
jpayne@69
|
808 self.buffer.l)
|
jpayne@69
|
809
|
jpayne@69
|
810
|
jpayne@69
|
811 def tabix_compress(filename_in,
|
jpayne@69
|
812 filename_out,
|
jpayne@69
|
813 force=False):
|
jpayne@69
|
814 '''compress *filename_in* writing the output to *filename_out*.
|
jpayne@69
|
815
|
jpayne@69
|
816 Raise an IOError if *filename_out* already exists, unless *force*
|
jpayne@69
|
817 is set.
|
jpayne@69
|
818 '''
|
jpayne@69
|
819
|
jpayne@69
|
820 if not force and os.path.exists(filename_out):
|
jpayne@69
|
821 raise IOError(
|
jpayne@69
|
822 "Filename '%s' already exists, use *force* to "
|
jpayne@69
|
823 "overwrite" % filename_out)
|
jpayne@69
|
824
|
jpayne@69
|
825 cdef int WINDOW_SIZE
|
jpayne@69
|
826 cdef int c, r
|
jpayne@69
|
827 cdef void * buffer
|
jpayne@69
|
828 cdef BGZF * fp
|
jpayne@69
|
829 cdef int fd_src
|
jpayne@69
|
830 cdef bint is_empty = True
|
jpayne@69
|
831 cdef int O_RDONLY
|
jpayne@69
|
832 O_RDONLY = os.O_RDONLY
|
jpayne@69
|
833
|
jpayne@69
|
834 WINDOW_SIZE = 64 * 1024
|
jpayne@69
|
835
|
jpayne@69
|
836 fn = encode_filename(filename_out)
|
jpayne@69
|
837 cdef char *cfn = fn
|
jpayne@69
|
838 with nogil:
|
jpayne@69
|
839 fp = bgzf_open(cfn, "w")
|
jpayne@69
|
840 if fp == NULL:
|
jpayne@69
|
841 raise IOError("could not open '%s' for writing" % filename_out)
|
jpayne@69
|
842
|
jpayne@69
|
843 fn = encode_filename(filename_in)
|
jpayne@69
|
844 fd_src = open(fn, O_RDONLY)
|
jpayne@69
|
845 if fd_src == 0:
|
jpayne@69
|
846 raise IOError("could not open '%s' for reading" % filename_in)
|
jpayne@69
|
847
|
jpayne@69
|
848 buffer = malloc(WINDOW_SIZE)
|
jpayne@69
|
849 c = 1
|
jpayne@69
|
850
|
jpayne@69
|
851 while c > 0:
|
jpayne@69
|
852 with nogil:
|
jpayne@69
|
853 c = read(fd_src, buffer, WINDOW_SIZE)
|
jpayne@69
|
854 if c > 0:
|
jpayne@69
|
855 is_empty = False
|
jpayne@69
|
856 r = bgzf_write(fp, buffer, c)
|
jpayne@69
|
857 if r < 0:
|
jpayne@69
|
858 free(buffer)
|
jpayne@69
|
859 raise IOError("writing failed")
|
jpayne@69
|
860
|
jpayne@69
|
861 free(buffer)
|
jpayne@69
|
862 r = bgzf_close(fp)
|
jpayne@69
|
863 if r < 0:
|
jpayne@69
|
864 raise IOError("error %i when writing to file %s" % (r, filename_out))
|
jpayne@69
|
865
|
jpayne@69
|
866 r = close(fd_src)
|
jpayne@69
|
867 # an empty file will return with -1, thus ignore this.
|
jpayne@69
|
868 if r < 0:
|
jpayne@69
|
869 if not (r == -1 and is_empty):
|
jpayne@69
|
870 raise IOError("error %i when closing file %s" % (r, filename_in))
|
jpayne@69
|
871
|
jpayne@69
|
872
|
jpayne@69
|
873 def tabix_index(filename,
|
jpayne@69
|
874 force=False,
|
jpayne@69
|
875 seq_col=None,
|
jpayne@69
|
876 start_col=None,
|
jpayne@69
|
877 end_col=None,
|
jpayne@69
|
878 preset=None,
|
jpayne@69
|
879 meta_char="#",
|
jpayne@69
|
880 int line_skip=0,
|
jpayne@69
|
881 zerobased=False,
|
jpayne@69
|
882 int min_shift=-1,
|
jpayne@69
|
883 index=None,
|
jpayne@69
|
884 keep_original=False,
|
jpayne@69
|
885 csi=False,
|
jpayne@69
|
886 ):
|
jpayne@69
|
887 '''index tab-separated *filename* using tabix.
|
jpayne@69
|
888
|
jpayne@69
|
889 An existing index will not be overwritten unless *force* is set.
|
jpayne@69
|
890
|
jpayne@69
|
891 The index will be built from coordinates in columns *seq_col*,
|
jpayne@69
|
892 *start_col* and *end_col*.
|
jpayne@69
|
893
|
jpayne@69
|
894 The contents of *filename* have to be sorted by contig and
|
jpayne@69
|
895 position - the method does not check if the file is sorted.
|
jpayne@69
|
896
|
jpayne@69
|
897 Column indices are 0-based. Note that this is different from the
|
jpayne@69
|
898 tabix command line utility where column indices start at 1.
|
jpayne@69
|
899
|
jpayne@69
|
900 Coordinates in the file are assumed to be 1-based unless
|
jpayne@69
|
901 *zerobased* is set.
|
jpayne@69
|
902
|
jpayne@69
|
903 If *preset* is provided, the column coordinates are taken from a
|
jpayne@69
|
904 preset. Valid values for preset are "gff", "bed", "sam", "vcf",
|
jpayne@69
|
905 psltbl", "pileup".
|
jpayne@69
|
906
|
jpayne@69
|
907 Lines beginning with *meta_char* and the first *line_skip* lines
|
jpayne@69
|
908 will be skipped.
|
jpayne@69
|
909
|
jpayne@69
|
910 If *filename* is not detected as a gzip file it will be automatically
|
jpayne@69
|
911 compressed. The original file will be removed and only the compressed
|
jpayne@69
|
912 file will be retained.
|
jpayne@69
|
913
|
jpayne@69
|
914 By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
|
jpayne@69
|
915 is greater than zero and/or *csi* is True, creates a CSI index with a
|
jpayne@69
|
916 minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).
|
jpayne@69
|
917
|
jpayne@69
|
918 *index* controls the filename which should be used for creating the index.
|
jpayne@69
|
919 If not set, the default is to append ``.tbi`` to *filename*.
|
jpayne@69
|
920
|
jpayne@69
|
921 When automatically compressing files, if *keep_original* is set the
|
jpayne@69
|
922 uncompressed file will not be deleted.
|
jpayne@69
|
923
|
jpayne@69
|
924 returns the filename of the compressed data
|
jpayne@69
|
925
|
jpayne@69
|
926 '''
|
jpayne@69
|
927
|
jpayne@69
|
928 if preset is None and \
|
jpayne@69
|
929 (seq_col is None or start_col is None or end_col is None):
|
jpayne@69
|
930 raise ValueError(
|
jpayne@69
|
931 "neither preset nor seq_col,start_col and end_col given")
|
jpayne@69
|
932
|
jpayne@69
|
933 fn = encode_filename(filename)
|
jpayne@69
|
934 cdef char *cfn = fn
|
jpayne@69
|
935
|
jpayne@69
|
936 cdef htsFile *fp = hts_open(cfn, "r")
|
jpayne@69
|
937 if fp == NULL:
|
jpayne@69
|
938 raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))
|
jpayne@69
|
939
|
jpayne@69
|
940 cdef htsFormat fmt = hts_get_format(fp)[0]
|
jpayne@69
|
941 hts_close(fp)
|
jpayne@69
|
942
|
jpayne@69
|
943 if fmt.compression == no_compression:
|
jpayne@69
|
944 tabix_compress(filename, filename + ".gz", force=force)
|
jpayne@69
|
945 if not keep_original:
|
jpayne@69
|
946 os.unlink(filename)
|
jpayne@69
|
947 filename += ".gz"
|
jpayne@69
|
948 fn = encode_filename(filename)
|
jpayne@69
|
949 cfn = fn
|
jpayne@69
|
950
|
jpayne@69
|
951 # columns (1-based):
|
jpayne@69
|
952 # preset-code, contig, start, end, metachar for
|
jpayne@69
|
953 # comments, lines to ignore at beginning
|
jpayne@69
|
954 # 0 is a missing column
|
jpayne@69
|
955 preset2conf = {
|
jpayne@69
|
956 'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
|
jpayne@69
|
957 'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
|
jpayne@69
|
958 'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
|
jpayne@69
|
959 'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
|
jpayne@69
|
960 'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
|
jpayne@69
|
961 }
|
jpayne@69
|
962
|
jpayne@69
|
963 conf_data = None
|
jpayne@69
|
964 if preset == "bcf" or fmt.format == bcf:
|
jpayne@69
|
965 csi = True
|
jpayne@69
|
966 elif preset:
|
jpayne@69
|
967 try:
|
jpayne@69
|
968 conf_data = preset2conf[preset]
|
jpayne@69
|
969 except KeyError:
|
jpayne@69
|
970 raise KeyError(
|
jpayne@69
|
971 "unknown preset '%s', valid presets are '%s'" %
|
jpayne@69
|
972 (preset, ",".join(preset2conf.keys())))
|
jpayne@69
|
973 else:
|
jpayne@69
|
974 if end_col is None:
|
jpayne@69
|
975 end_col = -1
|
jpayne@69
|
976
|
jpayne@69
|
977 preset = 0
|
jpayne@69
|
978 # tabix internally works with 0-based coordinates and
|
jpayne@69
|
979 # open/closed intervals. When using a preset, conversion is
|
jpayne@69
|
980 # automatically taken care of. Otherwise, the coordinates are
|
jpayne@69
|
981 # assumed to be 1-based closed intervals and -1 is subtracted
|
jpayne@69
|
982 # from the start coordinate. To avoid doing this, set the
|
jpayne@69
|
983 # TI_FLAG_UCSC=0x10000 flag:
|
jpayne@69
|
984 if zerobased:
|
jpayne@69
|
985 preset = preset | TBX_UCSC
|
jpayne@69
|
986
|
jpayne@69
|
987 conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)
|
jpayne@69
|
988
|
jpayne@69
|
989 cdef tbx_conf_t conf
|
jpayne@69
|
990 if conf_data:
|
jpayne@69
|
991 conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
|
jpayne@69
|
992
|
jpayne@69
|
993 if csi or min_shift > 0:
|
jpayne@69
|
994 suffix = ".csi"
|
jpayne@69
|
995 if min_shift <= 0: min_shift = 14
|
jpayne@69
|
996 else:
|
jpayne@69
|
997 suffix = ".tbi"
|
jpayne@69
|
998 min_shift = 0
|
jpayne@69
|
999
|
jpayne@69
|
1000 index = index or filename + suffix
|
jpayne@69
|
1001 fn_index = encode_filename(index)
|
jpayne@69
|
1002
|
jpayne@69
|
1003 if not force and os.path.exists(index):
|
jpayne@69
|
1004 raise IOError(
|
jpayne@69
|
1005 "filename '%s' already exists, use *force* to overwrite" % index)
|
jpayne@69
|
1006
|
jpayne@69
|
1007 cdef char *fnidx = fn_index
|
jpayne@69
|
1008 cdef int retval = 0
|
jpayne@69
|
1009
|
jpayne@69
|
1010 if csi and fmt.format == bcf:
|
jpayne@69
|
1011 with nogil:
|
jpayne@69
|
1012 retval = bcf_index_build2(cfn, fnidx, min_shift)
|
jpayne@69
|
1013 else:
|
jpayne@69
|
1014 with nogil:
|
jpayne@69
|
1015 retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)
|
jpayne@69
|
1016
|
jpayne@69
|
1017 if retval != 0:
|
jpayne@69
|
1018 raise OSError("building of index for {} failed".format(filename))
|
jpayne@69
|
1019
|
jpayne@69
|
1020 return filename
|
jpayne@69
|
1021
|
jpayne@69
|
1022 # #########################################################
|
jpayne@69
|
1023 # cdef class tabix_file_iterator_old:
|
jpayne@69
|
1024 # '''iterate over ``infile``.
|
jpayne@69
|
1025
|
jpayne@69
|
1026 # This iterator is not safe. If the :meth:`__next__()` method is called
|
jpayne@69
|
1027 # after ``infile`` is closed, the result is undefined (see ``fclose()``).
|
jpayne@69
|
1028
|
jpayne@69
|
1029 # The iterator might either raise a StopIteration or segfault.
|
jpayne@69
|
1030 # '''
|
jpayne@69
|
1031
|
jpayne@69
|
1032
|
jpayne@69
|
1033 # def __cinit__(self,
|
jpayne@69
|
1034 # infile,
|
jpayne@69
|
1035 # Parser parser,
|
jpayne@69
|
1036 # int buffer_size = 65536 ):
|
jpayne@69
|
1037
|
jpayne@69
|
1038 # cdef int fd = PyObject_AsFileDescriptor( infile )
|
jpayne@69
|
1039 # if fd == -1: raise ValueError( "I/O operation on closed file." )
|
jpayne@69
|
1040 # self.infile = fdopen( fd, 'r')
|
jpayne@69
|
1041
|
jpayne@69
|
1042 # if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
|
jpayne@69
|
1043
|
jpayne@69
|
1044 # self.buffer = <char*>malloc( buffer_size )
|
jpayne@69
|
1045 # self.size = buffer_size
|
jpayne@69
|
1046 # self.parser = parser
|
jpayne@69
|
1047
|
jpayne@69
|
1048 # def __iter__(self):
|
jpayne@69
|
1049 # return self
|
jpayne@69
|
1050
|
jpayne@69
|
1051 # cdef __cnext__(self):
|
jpayne@69
|
1052
|
jpayne@69
|
1053 # cdef char * b
|
jpayne@69
|
1054 # cdef size_t nbytes
|
jpayne@69
|
1055 # b = self.buffer
|
jpayne@69
|
1056
|
jpayne@69
|
1057 # while not feof( self.infile ):
|
jpayne@69
|
1058 # nbytes = getline( &b, &self.size, self.infile)
|
jpayne@69
|
1059
|
jpayne@69
|
1060 # # stop at first error or eof
|
jpayne@69
|
1061 # if (nbytes == -1): break
|
jpayne@69
|
1062 # # skip comments
|
jpayne@69
|
1063 # if (b[0] == '#'): continue
|
jpayne@69
|
1064
|
jpayne@69
|
1065 # # skip empty lines
|
jpayne@69
|
1066 # if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
|
jpayne@69
|
1067
|
jpayne@69
|
1068 # # make sure that entry is complete
|
jpayne@69
|
1069 # if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
|
jpayne@69
|
1070 # result = b
|
jpayne@69
|
1071 # raise ValueError( "incomplete line at %s" % result )
|
jpayne@69
|
1072
|
jpayne@69
|
1073 # # make sure that this goes fully through C
|
jpayne@69
|
1074 # # otherwise buffer is copied to/from a
|
jpayne@69
|
1075 # # Python object causing segfaults as
|
jpayne@69
|
1076 # # the wrong memory is freed
|
jpayne@69
|
1077 # return self.parser.parse( b, nbytes )
|
jpayne@69
|
1078
|
jpayne@69
|
1079 # raise StopIteration
|
jpayne@69
|
1080
|
jpayne@69
|
1081 # def __dealloc__(self):
|
jpayne@69
|
1082 # free(self.buffer)
|
jpayne@69
|
1083
|
jpayne@69
|
1084 # def __next__(self):
|
jpayne@69
|
1085 # return self.__cnext__()
|
jpayne@69
|
1086
|
jpayne@69
|
1087 #########################################################
|
jpayne@69
|
1088 #########################################################
|
jpayne@69
|
1089 #########################################################
|
jpayne@69
|
1090 ## Iterators for parsing through unindexed files.
|
jpayne@69
|
1091 #########################################################
|
jpayne@69
|
1092 # cdef buildGzipError(void *gzfp):
|
jpayne@69
|
1093 # cdef int errnum = 0
|
jpayne@69
|
1094 # cdef char *s = gzerror(gzfp, &errnum)
|
jpayne@69
|
1095 # return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
|
jpayne@69
|
1096
|
jpayne@69
|
1097
|
jpayne@69
|
1098 cdef class tabix_file_iterator:
|
jpayne@69
|
1099 '''iterate over a compressed or uncompressed ``infile``.
|
jpayne@69
|
1100 '''
|
jpayne@69
|
1101
|
jpayne@69
|
1102 def __cinit__(self,
|
jpayne@69
|
1103 infile,
|
jpayne@69
|
1104 Parser parser,
|
jpayne@69
|
1105 int buffer_size=65536):
|
jpayne@69
|
1106
|
jpayne@69
|
1107 if infile.closed:
|
jpayne@69
|
1108 raise ValueError("I/O operation on closed file.")
|
jpayne@69
|
1109
|
jpayne@69
|
1110 self.infile = infile
|
jpayne@69
|
1111
|
jpayne@69
|
1112 cdef int fd = PyObject_AsFileDescriptor(infile)
|
jpayne@69
|
1113 if fd == -1:
|
jpayne@69
|
1114 raise ValueError("I/O operation on closed file.")
|
jpayne@69
|
1115
|
jpayne@69
|
1116 self.duplicated_fd = dup(fd)
|
jpayne@69
|
1117
|
jpayne@69
|
1118 # From the manual:
|
jpayne@69
|
1119 # gzopen can be used to read a file which is not in gzip format;
|
jpayne@69
|
1120 # in this case gzread will directly read from the file without decompression.
|
jpayne@69
|
1121 # When reading, this will be detected automatically by looking
|
jpayne@69
|
1122 # for the magic two-byte gzip header.
|
jpayne@69
|
1123 self.fh = bgzf_dopen(self.duplicated_fd, 'r')
|
jpayne@69
|
1124
|
jpayne@69
|
1125 if self.fh == NULL:
|
jpayne@69
|
1126 raise IOError('%s' % strerror(errno))
|
jpayne@69
|
1127
|
jpayne@69
|
1128 self.kstream = ks_init(self.fh)
|
jpayne@69
|
1129
|
jpayne@69
|
1130 self.buffer.s = <char*>malloc(buffer_size)
|
jpayne@69
|
1131 #if self.buffer == NULL:
|
jpayne@69
|
1132 # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
|
jpayne@69
|
1133 #self.size = buffer_size
|
jpayne@69
|
1134 self.parser = parser
|
jpayne@69
|
1135
|
jpayne@69
|
1136 def __iter__(self):
|
jpayne@69
|
1137 return self
|
jpayne@69
|
1138
|
jpayne@69
|
1139 cdef __cnext__(self):
|
jpayne@69
|
1140
|
jpayne@69
|
1141 cdef char * b
|
jpayne@69
|
1142 cdef int dret = 0
|
jpayne@69
|
1143 cdef int retval = 0
|
jpayne@69
|
1144 while 1:
|
jpayne@69
|
1145 with nogil:
|
jpayne@69
|
1146 retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
|
jpayne@69
|
1147
|
jpayne@69
|
1148 if retval < 0:
|
jpayne@69
|
1149 break
|
jpayne@69
|
1150 #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
|
jpayne@69
|
1151
|
jpayne@69
|
1152 b = self.buffer.s
|
jpayne@69
|
1153
|
jpayne@69
|
1154 # skip comments
|
jpayne@69
|
1155 if (b[0] == b'#'):
|
jpayne@69
|
1156 continue
|
jpayne@69
|
1157
|
jpayne@69
|
1158 # skip empty lines
|
jpayne@69
|
1159 if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
|
jpayne@69
|
1160 continue
|
jpayne@69
|
1161
|
jpayne@69
|
1162 # gzgets terminates at \n, no need to test
|
jpayne@69
|
1163
|
jpayne@69
|
1164 # parser creates a copy
|
jpayne@69
|
1165 return self.parser.parse(b, self.buffer.l)
|
jpayne@69
|
1166
|
jpayne@69
|
1167 raise StopIteration
|
jpayne@69
|
1168
|
jpayne@69
|
1169 def __dealloc__(self):
|
jpayne@69
|
1170 free(self.buffer.s)
|
jpayne@69
|
1171 ks_destroy(self.kstream)
|
jpayne@69
|
1172 bgzf_close(self.fh)
|
jpayne@69
|
1173
|
jpayne@69
|
1174 def __next__(self):
|
jpayne@69
|
1175 return self.__cnext__()
|
jpayne@69
|
1176
|
jpayne@69
|
1177
|
jpayne@69
|
1178 class tabix_generic_iterator:
|
jpayne@69
|
1179 '''iterate over ``infile``.
|
jpayne@69
|
1180
|
jpayne@69
|
1181 Permits the use of file-like objects for example from the gzip module.
|
jpayne@69
|
1182 '''
|
jpayne@69
|
1183 def __init__(self, infile, parser):
|
jpayne@69
|
1184
|
jpayne@69
|
1185 self.infile = infile
|
jpayne@69
|
1186 if self.infile.closed:
|
jpayne@69
|
1187 raise ValueError("I/O operation on closed file.")
|
jpayne@69
|
1188 self.parser = parser
|
jpayne@69
|
1189
|
jpayne@69
|
1190 def __iter__(self):
|
jpayne@69
|
1191 return self
|
jpayne@69
|
1192
|
jpayne@69
|
1193 # cython version - required for python 3
|
jpayne@69
|
1194 def __next__(self):
|
jpayne@69
|
1195
|
jpayne@69
|
1196 cdef char * b
|
jpayne@69
|
1197 cdef char * cpy
|
jpayne@69
|
1198 cdef size_t nbytes
|
jpayne@69
|
1199
|
jpayne@69
|
1200 encoding = self.parser.get_encoding()
|
jpayne@69
|
1201
|
jpayne@69
|
1202 # note that GzipFile.close() does not close the file
|
jpayne@69
|
1203 # reading is still possible.
|
jpayne@69
|
1204 if self.infile.closed:
|
jpayne@69
|
1205 raise ValueError("I/O operation on closed file.")
|
jpayne@69
|
1206
|
jpayne@69
|
1207 while 1:
|
jpayne@69
|
1208
|
jpayne@69
|
1209 line = self.infile.readline()
|
jpayne@69
|
1210 if not line:
|
jpayne@69
|
1211 break
|
jpayne@69
|
1212
|
jpayne@69
|
1213 s = force_bytes(line, encoding)
|
jpayne@69
|
1214 b = s
|
jpayne@69
|
1215 nbytes = len(line)
|
jpayne@69
|
1216 assert b[nbytes] == b'\0'
|
jpayne@69
|
1217
|
jpayne@69
|
1218 # skip comments
|
jpayne@69
|
1219 if b[0] == b'#':
|
jpayne@69
|
1220 continue
|
jpayne@69
|
1221
|
jpayne@69
|
1222 # skip empty lines
|
jpayne@69
|
1223 if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
|
jpayne@69
|
1224 continue
|
jpayne@69
|
1225
|
jpayne@69
|
1226 # make sure that entry is complete
|
jpayne@69
|
1227 if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
|
jpayne@69
|
1228 raise ValueError("incomplete line at %s" % line)
|
jpayne@69
|
1229
|
jpayne@69
|
1230 bytes_cpy = <bytes> b
|
jpayne@69
|
1231 cpy = <char *> bytes_cpy
|
jpayne@69
|
1232
|
jpayne@69
|
1233 return self.parser(cpy, nbytes)
|
jpayne@69
|
1234
|
jpayne@69
|
1235 raise StopIteration
|
jpayne@69
|
1236
|
jpayne@69
|
1237
|
jpayne@69
|
1238 def tabix_iterator(infile, parser):
|
jpayne@69
|
1239 """return an iterator over all entries in a file.
|
jpayne@69
|
1240
|
jpayne@69
|
1241 Results are returned parsed as specified by the *parser*. If
|
jpayne@69
|
1242 *parser* is None, the results are returned as an unparsed string.
|
jpayne@69
|
1243 Otherwise, *parser* is assumed to be a functor that will return
|
jpayne@69
|
1244 parsed data (see for example :class:`~pysam.asTuple` and
|
jpayne@69
|
1245 :class:`~pysam.asGTF`).
|
jpayne@69
|
1246
|
jpayne@69
|
1247 """
|
jpayne@69
|
1248 return tabix_generic_iterator(infile, parser)
|
jpayne@69
|
1249
|
jpayne@69
|
1250
|
jpayne@69
|
1251 cdef class Tabixfile(TabixFile):
|
jpayne@69
|
1252 """Tabixfile is deprecated: use TabixFile instead"""
|
jpayne@69
|
1253 pass
|
jpayne@69
|
1254
|
jpayne@69
|
1255
|
jpayne@69
|
1256 __all__ = [
|
jpayne@69
|
1257 "tabix_index",
|
jpayne@69
|
1258 "tabix_compress",
|
jpayne@69
|
1259 "TabixFile",
|
jpayne@69
|
1260 "Tabixfile",
|
jpayne@69
|
1261 "asTuple",
|
jpayne@69
|
1262 "asGTF",
|
jpayne@69
|
1263 "asGFF3",
|
jpayne@69
|
1264 "asVCF",
|
jpayne@69
|
1265 "asBed",
|
jpayne@69
|
1266 "GZIterator",
|
jpayne@69
|
1267 "GZIteratorHead",
|
jpayne@69
|
1268 "tabix_iterator",
|
jpayne@69
|
1269 "tabix_generic_iterator",
|
jpayne@69
|
1270 "tabix_file_iterator",
|
jpayne@69
|
1271 ]
|