jpayne@68
|
1 # cython: embedsignature=True
|
jpayne@68
|
2 # cython: profile=True
|
jpayne@68
|
3 ###############################################################################
|
jpayne@68
|
4 ###############################################################################
|
jpayne@68
|
5 # Cython wrapper for SAM/BAM/CRAM files based on htslib
|
jpayne@68
|
6 ###############################################################################
|
jpayne@68
|
7 # The principal classes defined in this module are:
|
jpayne@68
|
8 #
|
jpayne@68
|
9 # class FastaFile random read read/write access to faidx indexd files
|
jpayne@68
|
10 # class FastxFile streamed read/write access to fasta/fastq files
|
jpayne@68
|
11 #
|
jpayne@68
|
12 # Additionally this module defines several additional classes that are part
|
jpayne@68
|
13 # of the internal API. These are:
|
jpayne@68
|
14 #
|
jpayne@68
|
15 # class FastqProxy
|
jpayne@68
|
16 # class FastxRecord
|
jpayne@68
|
17 #
|
jpayne@68
|
18 # For backwards compatibility, the following classes are also defined:
|
jpayne@68
|
19 #
|
jpayne@68
|
20 # class Fastafile equivalent to FastaFile
|
jpayne@68
|
21 # class FastqFile equivalent to FastxFile
|
jpayne@68
|
22 #
|
jpayne@68
|
23 ###############################################################################
|
jpayne@68
|
24 #
|
jpayne@68
|
25 # The MIT License
|
jpayne@68
|
26 #
|
jpayne@68
|
27 # Copyright (c) 2015 Andreas Heger
|
jpayne@68
|
28 #
|
jpayne@68
|
29 # Permission is hereby granted, free of charge, to any person obtaining a
|
jpayne@68
|
30 # copy of this software and associated documentation files (the "Software"),
|
jpayne@68
|
31 # to deal in the Software without restriction, including without limitation
|
jpayne@68
|
32 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
jpayne@68
|
33 # and/or sell copies of the Software, and to permit persons to whom the
|
jpayne@68
|
34 # Software is furnished to do so, subject to the following conditions:
|
jpayne@68
|
35 #
|
jpayne@68
|
36 # The above copyright notice and this permission notice shall be included in
|
jpayne@68
|
37 # all copies or substantial portions of the Software.
|
jpayne@68
|
38 #
|
jpayne@68
|
39 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
jpayne@68
|
40 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
jpayne@68
|
41 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
jpayne@68
|
42 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
jpayne@68
|
43 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
jpayne@68
|
44 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
jpayne@68
|
45 # DEALINGS IN THE SOFTWARE.
|
jpayne@68
|
46 #
|
jpayne@68
|
47 ###############################################################################
|
jpayne@68
|
48 import sys
|
jpayne@68
|
49 import os
|
jpayne@68
|
50 import re
|
jpayne@68
|
51
|
jpayne@68
|
52
|
jpayne@68
|
53 from libc.errno cimport errno
|
jpayne@68
|
54 from libc.string cimport strerror
|
jpayne@68
|
55
|
jpayne@68
|
56 from cpython cimport array
|
jpayne@68
|
57
|
jpayne@68
|
58 from cpython cimport PyErr_SetString, \
|
jpayne@68
|
59 PyBytes_Check, \
|
jpayne@68
|
60 PyUnicode_Check, \
|
jpayne@68
|
61 PyBytes_FromStringAndSize
|
jpayne@68
|
62
|
jpayne@68
|
63 from pysam.libchtslib cimport \
|
jpayne@68
|
64 faidx_nseq, fai_load, fai_load3, fai_destroy, fai_fetch, \
|
jpayne@68
|
65 faidx_seq_len, faidx_iseq, faidx_seq_len, \
|
jpayne@68
|
66 faidx_fetch_seq, hisremote, \
|
jpayne@68
|
67 bgzf_open, bgzf_close
|
jpayne@68
|
68
|
jpayne@68
|
69 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
|
jpayne@68
|
70 from pysam.libcutils cimport encode_filename, from_string_and_size
|
jpayne@68
|
71 from pysam.libcutils cimport qualitystring_to_array, parse_region
|
jpayne@68
|
72
|
jpayne@68
|
73 cdef class FastqProxy
|
jpayne@68
|
74 cdef makeFastqProxy(kseq_t * src):
|
jpayne@68
|
75 '''enter src into AlignedRead.'''
|
jpayne@68
|
76 cdef FastqProxy dest = FastqProxy.__new__(FastqProxy)
|
jpayne@68
|
77 dest._delegate = src
|
jpayne@68
|
78 return dest
|
jpayne@68
|
79
|
jpayne@68
|
80 ## TODO:
|
jpayne@68
|
81 ## add automatic indexing.
|
jpayne@68
|
82 ## add function to get sequence names.
|
jpayne@68
|
83 cdef class FastaFile:
|
jpayne@68
|
84 """Random access to fasta formatted files that
|
jpayne@68
|
85 have been indexed by :term:`faidx`.
|
jpayne@68
|
86
|
jpayne@68
|
87 The file is automatically opened. The index file of file
|
jpayne@68
|
88 ``<filename>`` is expected to be called ``<filename>.fai``.
|
jpayne@68
|
89
|
jpayne@68
|
90 Parameters
|
jpayne@68
|
91 ----------
|
jpayne@68
|
92
|
jpayne@68
|
93 filename : string
|
jpayne@68
|
94 Filename of fasta file to be opened.
|
jpayne@68
|
95
|
jpayne@68
|
96 filepath_index : string
|
jpayne@68
|
97 Optional, filename of the index. By default this is
|
jpayne@68
|
98 the filename + ".fai".
|
jpayne@68
|
99
|
jpayne@68
|
100 filepath_index_compressed : string
|
jpayne@68
|
101 Optional, filename of the index if fasta file is. By default this is
|
jpayne@68
|
102 the filename + ".gzi".
|
jpayne@68
|
103
|
jpayne@68
|
104 Raises
|
jpayne@68
|
105 ------
|
jpayne@68
|
106
|
jpayne@68
|
107 ValueError
|
jpayne@68
|
108 if index file is missing
|
jpayne@68
|
109
|
jpayne@68
|
110 IOError
|
jpayne@68
|
111 if file could not be opened
|
jpayne@68
|
112
|
jpayne@68
|
113 """
|
jpayne@68
|
114
|
jpayne@68
|
115 def __cinit__(self, *args, **kwargs):
|
jpayne@68
|
116 self.fastafile = NULL
|
jpayne@68
|
117 self._filename = None
|
jpayne@68
|
118 self._references = None
|
jpayne@68
|
119 self._lengths = None
|
jpayne@68
|
120 self.reference2length = None
|
jpayne@68
|
121 self._open(*args, **kwargs)
|
jpayne@68
|
122
|
jpayne@68
|
123 def is_open(self):
|
jpayne@68
|
124 '''return true if samfile has been opened.'''
|
jpayne@68
|
125 return self.fastafile != NULL
|
jpayne@68
|
126
|
jpayne@68
|
127 def __len__(self):
|
jpayne@68
|
128 if self.fastafile == NULL:
|
jpayne@68
|
129 raise ValueError("calling len() on closed file")
|
jpayne@68
|
130
|
jpayne@68
|
131 return faidx_nseq(self.fastafile)
|
jpayne@68
|
132
|
jpayne@68
|
133 def _open(self, filename, filepath_index=None, filepath_index_compressed=None):
|
jpayne@68
|
134 '''open an indexed fasta file.
|
jpayne@68
|
135
|
jpayne@68
|
136 This method expects an indexed fasta file.
|
jpayne@68
|
137 '''
|
jpayne@68
|
138
|
jpayne@68
|
139 # close a previously opened file
|
jpayne@68
|
140 if self.fastafile != NULL:
|
jpayne@68
|
141 self.close()
|
jpayne@68
|
142
|
jpayne@68
|
143 self._filename = encode_filename(filename)
|
jpayne@68
|
144 cdef char *cfilename = self._filename
|
jpayne@68
|
145 cdef char *cindexname = NULL
|
jpayne@68
|
146 cdef char *cindexname_compressed = NULL
|
jpayne@68
|
147 self.is_remote = hisremote(cfilename)
|
jpayne@68
|
148
|
jpayne@68
|
149 # open file for reading
|
jpayne@68
|
150 if (self._filename != b"-"
|
jpayne@68
|
151 and not self.is_remote
|
jpayne@68
|
152 and not os.path.exists(filename)):
|
jpayne@68
|
153 raise IOError("file `%s` not found" % filename)
|
jpayne@68
|
154
|
jpayne@68
|
155 # 3 modes to open:
|
jpayne@68
|
156 # compressed fa: fai_load3 with filename, index_fai and index_gzi
|
jpayne@68
|
157 # uncompressed fa: fai_load3 with filename and index_fai
|
jpayne@68
|
158 # uncompressed fa: fai_load with default index name
|
jpayne@68
|
159 if filepath_index:
|
jpayne@68
|
160 # when opening, set flags to 0 - do not automatically
|
jpayne@68
|
161 # build index if it does not exist.
|
jpayne@68
|
162
|
jpayne@68
|
163 if not os.path.exists(filepath_index):
|
jpayne@68
|
164 raise IOError("filename {} does not exist".format(filepath_index))
|
jpayne@68
|
165 cindexname = bindex_filename = encode_filename(filepath_index)
|
jpayne@68
|
166
|
jpayne@68
|
167 if filepath_index_compressed:
|
jpayne@68
|
168 if not os.path.exists(filepath_index_compressed):
|
jpayne@68
|
169 raise IOError("filename {} does not exist".format(filepath_index_compressed))
|
jpayne@68
|
170 cindexname_compressed = bindex_filename_compressed = encode_filename(filepath_index_compressed)
|
jpayne@68
|
171 with nogil:
|
jpayne@68
|
172 self.fastafile = fai_load3(cfilename, cindexname, cindexname_compressed, 0)
|
jpayne@68
|
173 else:
|
jpayne@68
|
174 with nogil:
|
jpayne@68
|
175 self.fastafile = fai_load3(cfilename, cindexname, NULL, 0)
|
jpayne@68
|
176 else:
|
jpayne@68
|
177 with nogil:
|
jpayne@68
|
178 self.fastafile = fai_load(cfilename)
|
jpayne@68
|
179
|
jpayne@68
|
180 if self.fastafile == NULL:
|
jpayne@68
|
181 raise IOError("error when opening file `%s`" % filename)
|
jpayne@68
|
182
|
jpayne@68
|
183 cdef int nreferences = faidx_nseq(self.fastafile)
|
jpayne@68
|
184 cdef int x
|
jpayne@68
|
185 cdef const char * s
|
jpayne@68
|
186 self._references = []
|
jpayne@68
|
187 self._lengths = []
|
jpayne@68
|
188 for x from 0 <= x < nreferences:
|
jpayne@68
|
189 s = faidx_iseq(self.fastafile, x)
|
jpayne@68
|
190 ss = force_str(s)
|
jpayne@68
|
191 self._references.append(ss)
|
jpayne@68
|
192 self._lengths.append(faidx_seq_len(self.fastafile, s))
|
jpayne@68
|
193 self.reference2length = dict(zip(self._references, self._lengths))
|
jpayne@68
|
194
|
jpayne@68
|
195 def close(self):
|
jpayne@68
|
196 """close the file."""
|
jpayne@68
|
197 if self.fastafile != NULL:
|
jpayne@68
|
198 fai_destroy(self.fastafile)
|
jpayne@68
|
199 self.fastafile = NULL
|
jpayne@68
|
200
|
jpayne@68
|
201 def __dealloc__(self):
|
jpayne@68
|
202 if self.fastafile != NULL:
|
jpayne@68
|
203 fai_destroy(self.fastafile)
|
jpayne@68
|
204 self.fastafile = NULL
|
jpayne@68
|
205
|
jpayne@68
|
206 # context manager interface
|
jpayne@68
|
207 def __enter__(self):
|
jpayne@68
|
208 return self
|
jpayne@68
|
209
|
jpayne@68
|
210 def __exit__(self, exc_type, exc_value, traceback):
|
jpayne@68
|
211 self.close()
|
jpayne@68
|
212 return False
|
jpayne@68
|
213
|
jpayne@68
|
214 property closed:
|
jpayne@68
|
215 """bool indicating the current state of the file object.
|
jpayne@68
|
216 This is a read-only attribute; the close() method changes the value.
|
jpayne@68
|
217 """
|
jpayne@68
|
218 def __get__(self):
|
jpayne@68
|
219 return not self.is_open()
|
jpayne@68
|
220
|
jpayne@68
|
221 property filename:
|
jpayne@68
|
222 """filename associated with this object. This is a read-only attribute."""
|
jpayne@68
|
223 def __get__(self):
|
jpayne@68
|
224 return self._filename
|
jpayne@68
|
225
|
jpayne@68
|
226 property references:
|
jpayne@68
|
227 '''tuple with the names of :term:`reference` sequences.'''
|
jpayne@68
|
228 def __get__(self):
|
jpayne@68
|
229 return self._references
|
jpayne@68
|
230
|
jpayne@68
|
231 property nreferences:
|
jpayne@68
|
232 """int with the number of :term:`reference` sequences in the file.
|
jpayne@68
|
233 This is a read-only attribute."""
|
jpayne@68
|
234 def __get__(self):
|
jpayne@68
|
235 return len(self._references) if self.references else None
|
jpayne@68
|
236
|
jpayne@68
|
237 property lengths:
|
jpayne@68
|
238 """tuple with the lengths of :term:`reference` sequences."""
|
jpayne@68
|
239 def __get__(self):
|
jpayne@68
|
240 return self._lengths
|
jpayne@68
|
241
|
jpayne@68
|
242 def fetch(self,
|
jpayne@68
|
243 reference=None,
|
jpayne@68
|
244 start=None,
|
jpayne@68
|
245 end=None,
|
jpayne@68
|
246 region=None):
|
jpayne@68
|
247 """fetch sequences in a :term:`region`.
|
jpayne@68
|
248
|
jpayne@68
|
249 A region can
|
jpayne@68
|
250 either be specified by :term:`reference`, `start` and
|
jpayne@68
|
251 `end`. `start` and `end` denote 0-based, half-open
|
jpayne@68
|
252 intervals.
|
jpayne@68
|
253
|
jpayne@68
|
254 Alternatively, a samtools :term:`region` string can be
|
jpayne@68
|
255 supplied.
|
jpayne@68
|
256
|
jpayne@68
|
257 If any of the coordinates are missing they will be replaced by the
|
jpayne@68
|
258 minimum (`start`) or maximum (`end`) coordinate.
|
jpayne@68
|
259
|
jpayne@68
|
260 Note that region strings are 1-based, while `start` and `end` denote
|
jpayne@68
|
261 an interval in python coordinates.
|
jpayne@68
|
262 The region is specified by :term:`reference`, `start` and `end`.
|
jpayne@68
|
263
|
jpayne@68
|
264 Returns
|
jpayne@68
|
265 -------
|
jpayne@68
|
266
|
jpayne@68
|
267 string : a string with the sequence specified by the region.
|
jpayne@68
|
268
|
jpayne@68
|
269 Raises
|
jpayne@68
|
270 ------
|
jpayne@68
|
271
|
jpayne@68
|
272 IndexError
|
jpayne@68
|
273 if the coordinates are out of range
|
jpayne@68
|
274
|
jpayne@68
|
275 ValueError
|
jpayne@68
|
276 if the region is invalid
|
jpayne@68
|
277
|
jpayne@68
|
278 """
|
jpayne@68
|
279
|
jpayne@68
|
280 if not self.is_open():
|
jpayne@68
|
281 raise ValueError("I/O operation on closed file" )
|
jpayne@68
|
282
|
jpayne@68
|
283 cdef int length
|
jpayne@68
|
284 cdef char *seq
|
jpayne@68
|
285 cdef char *ref
|
jpayne@68
|
286 cdef int rstart, rend
|
jpayne@68
|
287
|
jpayne@68
|
288 contig, rstart, rend = parse_region(reference, start, end, region)
|
jpayne@68
|
289
|
jpayne@68
|
290 if contig is None:
|
jpayne@68
|
291 raise ValueError("no sequence/region supplied.")
|
jpayne@68
|
292
|
jpayne@68
|
293 if rstart == rend:
|
jpayne@68
|
294 return ""
|
jpayne@68
|
295
|
jpayne@68
|
296 contig_b = force_bytes(contig)
|
jpayne@68
|
297 ref = contig_b
|
jpayne@68
|
298 with nogil:
|
jpayne@68
|
299 length = faidx_seq_len(self.fastafile, ref)
|
jpayne@68
|
300 if length == -1:
|
jpayne@68
|
301 raise KeyError("sequence '%s' not present" % contig)
|
jpayne@68
|
302 if rstart >= length:
|
jpayne@68
|
303 return ""
|
jpayne@68
|
304
|
jpayne@68
|
305 # fai_fetch adds a '\0' at the end
|
jpayne@68
|
306 with nogil:
|
jpayne@68
|
307 seq = faidx_fetch_seq(self.fastafile,
|
jpayne@68
|
308 ref,
|
jpayne@68
|
309 rstart,
|
jpayne@68
|
310 rend-1,
|
jpayne@68
|
311 &length)
|
jpayne@68
|
312
|
jpayne@68
|
313 if not seq:
|
jpayne@68
|
314 if errno:
|
jpayne@68
|
315 raise IOError(errno, strerror(errno))
|
jpayne@68
|
316 else:
|
jpayne@68
|
317 raise ValueError("failure when retrieving sequence on '%s'" % contig)
|
jpayne@68
|
318
|
jpayne@68
|
319 try:
|
jpayne@68
|
320 return charptr_to_str(seq)
|
jpayne@68
|
321 finally:
|
jpayne@68
|
322 free(seq)
|
jpayne@68
|
323
|
jpayne@68
|
324 cdef char *_fetch(self, char *reference, int start, int end, int *length) except? NULL:
|
jpayne@68
|
325 '''fetch sequence for reference, start and end'''
|
jpayne@68
|
326
|
jpayne@68
|
327 cdef char *seq
|
jpayne@68
|
328 with nogil:
|
jpayne@68
|
329 seq = faidx_fetch_seq(self.fastafile,
|
jpayne@68
|
330 reference,
|
jpayne@68
|
331 start,
|
jpayne@68
|
332 end-1,
|
jpayne@68
|
333 length)
|
jpayne@68
|
334
|
jpayne@68
|
335 if not seq:
|
jpayne@68
|
336 if errno:
|
jpayne@68
|
337 raise IOError(errno, strerror(errno))
|
jpayne@68
|
338 else:
|
jpayne@68
|
339 raise ValueError("failure when retrieving sequence on '%s'" % reference)
|
jpayne@68
|
340
|
jpayne@68
|
341 return seq
|
jpayne@68
|
342
|
jpayne@68
|
343 def get_reference_length(self, reference):
|
jpayne@68
|
344 '''return the length of reference.'''
|
jpayne@68
|
345 return self.reference2length[reference]
|
jpayne@68
|
346
|
jpayne@68
|
347 def __getitem__(self, reference):
|
jpayne@68
|
348 return self.fetch(reference)
|
jpayne@68
|
349
|
jpayne@68
|
350 def __contains__(self, reference):
|
jpayne@68
|
351 '''return true if reference in fasta file.'''
|
jpayne@68
|
352 return reference in self.reference2length
|
jpayne@68
|
353
|
jpayne@68
|
354
|
jpayne@68
|
355 cdef class FastqProxy:
|
jpayne@68
|
356 """A single entry in a fastq file."""
|
jpayne@68
|
357 def __init__(self):
|
jpayne@68
|
358 raise ValueError("do not instantiate FastqProxy directly")
|
jpayne@68
|
359
|
jpayne@68
|
360 property name:
|
jpayne@68
|
361 """The name of each entry in the fastq file."""
|
jpayne@68
|
362 def __get__(self):
|
jpayne@68
|
363 return charptr_to_str(self._delegate.name.s)
|
jpayne@68
|
364
|
jpayne@68
|
365 property sequence:
|
jpayne@68
|
366 """The sequence of each entry in the fastq file."""
|
jpayne@68
|
367 def __get__(self):
|
jpayne@68
|
368 return charptr_to_str(self._delegate.seq.s)
|
jpayne@68
|
369
|
jpayne@68
|
370 property comment:
|
jpayne@68
|
371 def __get__(self):
|
jpayne@68
|
372 if self._delegate.comment.l:
|
jpayne@68
|
373 return charptr_to_str(self._delegate.comment.s)
|
jpayne@68
|
374 else:
|
jpayne@68
|
375 return None
|
jpayne@68
|
376
|
jpayne@68
|
377 property quality:
|
jpayne@68
|
378 """The quality score of each entry in the fastq file, represented as a string."""
|
jpayne@68
|
379 def __get__(self):
|
jpayne@68
|
380 if self._delegate.qual.l:
|
jpayne@68
|
381 return charptr_to_str(self._delegate.qual.s)
|
jpayne@68
|
382 else:
|
jpayne@68
|
383 return None
|
jpayne@68
|
384
|
jpayne@68
|
385 cdef cython.str to_string(self):
|
jpayne@68
|
386 if self.comment is None:
|
jpayne@68
|
387 comment = ""
|
jpayne@68
|
388 else:
|
jpayne@68
|
389 comment = " %s" % self.comment
|
jpayne@68
|
390
|
jpayne@68
|
391 if self.quality is None:
|
jpayne@68
|
392 return ">%s%s\n%s" % (self.name, comment, self.sequence)
|
jpayne@68
|
393 else:
|
jpayne@68
|
394 return "@%s%s\n%s\n+\n%s" % (self.name, comment,
|
jpayne@68
|
395 self.sequence, self.quality)
|
jpayne@68
|
396
|
jpayne@68
|
397 cdef cython.str tostring(self):
|
jpayne@68
|
398 """deprecated : use :meth:`to_string`"""
|
jpayne@68
|
399 return self.to_string()
|
jpayne@68
|
400
|
jpayne@68
|
401 def __str__(self):
|
jpayne@68
|
402 return self.to_string()
|
jpayne@68
|
403
|
jpayne@68
|
404 cpdef array.array get_quality_array(self, int offset=33):
|
jpayne@68
|
405 '''return quality values as integer array after subtracting offset.'''
|
jpayne@68
|
406 if self.quality is None:
|
jpayne@68
|
407 return None
|
jpayne@68
|
408 return qualitystring_to_array(force_bytes(self.quality),
|
jpayne@68
|
409 offset=offset)
|
jpayne@68
|
410
|
jpayne@68
|
411 cdef class FastxRecord:
|
jpayne@68
|
412 """A fasta/fastq record.
|
jpayne@68
|
413
|
jpayne@68
|
414 A record must contain a name and a sequence. If either of them are
|
jpayne@68
|
415 None, a ValueError is raised on writing.
|
jpayne@68
|
416
|
jpayne@68
|
417 """
|
jpayne@68
|
418 def __init__(self,
|
jpayne@68
|
419 name=None,
|
jpayne@68
|
420 comment=None,
|
jpayne@68
|
421 sequence=None,
|
jpayne@68
|
422 quality=None,
|
jpayne@68
|
423 FastqProxy proxy=None):
|
jpayne@68
|
424 if proxy is not None:
|
jpayne@68
|
425 self.comment = proxy.comment
|
jpayne@68
|
426 self.quality = proxy.quality
|
jpayne@68
|
427 self.sequence = proxy.sequence
|
jpayne@68
|
428 self.name = proxy.name
|
jpayne@68
|
429 else:
|
jpayne@68
|
430 self.comment = comment
|
jpayne@68
|
431 self.quality = quality
|
jpayne@68
|
432 self.sequence = sequence
|
jpayne@68
|
433 self.name = name
|
jpayne@68
|
434
|
jpayne@68
|
435 def __copy__(self):
|
jpayne@68
|
436 return FastxRecord(self.name, self.comment, self.sequence, self.quality)
|
jpayne@68
|
437
|
jpayne@68
|
438 def __deepcopy__(self, memo):
|
jpayne@68
|
439 return FastxRecord(self.name, self.comment, self.sequence, self.quality)
|
jpayne@68
|
440
|
jpayne@68
|
441 cdef cython.str to_string(self):
|
jpayne@68
|
442 if self.name is None:
|
jpayne@68
|
443 raise ValueError("can not write record without name")
|
jpayne@68
|
444
|
jpayne@68
|
445 if self.sequence is None:
|
jpayne@68
|
446 raise ValueError("can not write record without a sequence")
|
jpayne@68
|
447
|
jpayne@68
|
448 if self.comment is None:
|
jpayne@68
|
449 comment = ""
|
jpayne@68
|
450 else:
|
jpayne@68
|
451 comment = " %s" % self.comment
|
jpayne@68
|
452
|
jpayne@68
|
453 if self.quality is None:
|
jpayne@68
|
454 return ">%s%s\n%s" % (self.name, comment, self.sequence)
|
jpayne@68
|
455 else:
|
jpayne@68
|
456 return "@%s%s\n%s\n+\n%s" % (self.name, comment,
|
jpayne@68
|
457 self.sequence, self.quality)
|
jpayne@68
|
458
|
jpayne@68
|
459 cdef cython.str tostring(self):
|
jpayne@68
|
460 """deprecated : use :meth:`to_string`"""
|
jpayne@68
|
461 return self.to_string()
|
jpayne@68
|
462
|
jpayne@68
|
463 def set_name(self, name):
|
jpayne@68
|
464 if name is None:
|
jpayne@68
|
465 raise ValueError("FastxRecord must have a name and not None")
|
jpayne@68
|
466 self.name = name
|
jpayne@68
|
467
|
jpayne@68
|
468 def set_comment(self, comment):
|
jpayne@68
|
469 self.comment = comment
|
jpayne@68
|
470
|
jpayne@68
|
471 def set_sequence(self, sequence, quality=None):
|
jpayne@68
|
472 """set sequence of this record.
|
jpayne@68
|
473
|
jpayne@68
|
474 """
|
jpayne@68
|
475 self.sequence = sequence
|
jpayne@68
|
476 if quality is not None:
|
jpayne@68
|
477 if len(sequence) != len(quality):
|
jpayne@68
|
478 raise ValueError("sequence and quality length do not match: {} vs {}".format(
|
jpayne@68
|
479 len(sequence), len(quality)))
|
jpayne@68
|
480
|
jpayne@68
|
481 self.quality = quality
|
jpayne@68
|
482 else:
|
jpayne@68
|
483 self.quality = None
|
jpayne@68
|
484
|
jpayne@68
|
485 def __str__(self):
|
jpayne@68
|
486 return self.to_string()
|
jpayne@68
|
487
|
jpayne@68
|
488 cpdef array.array get_quality_array(self, int offset=33):
|
jpayne@68
|
489 '''return quality values as array after subtracting offset.'''
|
jpayne@68
|
490 if self.quality is None:
|
jpayne@68
|
491 return None
|
jpayne@68
|
492 return qualitystring_to_array(force_bytes(self.quality),
|
jpayne@68
|
493 offset=offset)
|
jpayne@68
|
494
|
jpayne@68
|
495
|
jpayne@68
|
496 cdef class FastxFile:
|
jpayne@68
|
497 r"""Stream access to :term:`fasta` or :term:`fastq` formatted files.
|
jpayne@68
|
498
|
jpayne@68
|
499 The file is automatically opened.
|
jpayne@68
|
500
|
jpayne@68
|
501 Entries in the file can be both fastq or fasta formatted or even a
|
jpayne@68
|
502 mixture of the two.
|
jpayne@68
|
503
|
jpayne@68
|
504 This file object permits iterating over all entries in the
|
jpayne@68
|
505 file. Random access is not implemented. The iteration returns
|
jpayne@68
|
506 objects of type :class:`FastqProxy`
|
jpayne@68
|
507
|
jpayne@68
|
508 Parameters
|
jpayne@68
|
509 ----------
|
jpayne@68
|
510
|
jpayne@68
|
511 filename : string
|
jpayne@68
|
512 Filename of fasta/fastq file to be opened.
|
jpayne@68
|
513
|
jpayne@68
|
514 persist : bool
|
jpayne@68
|
515
|
jpayne@68
|
516 If True (default) make a copy of the entry in the file during
|
jpayne@68
|
517 iteration. If set to False, no copy will be made. This will
|
jpayne@68
|
518 permit much faster iteration, but an entry will not persist
|
jpayne@68
|
519 when the iteration continues and an entry is read-only.
|
jpayne@68
|
520
|
jpayne@68
|
521 Notes
|
jpayne@68
|
522 -----
|
jpayne@68
|
523 Prior to version 0.8.2, this class was called FastqFile.
|
jpayne@68
|
524
|
jpayne@68
|
525 Raises
|
jpayne@68
|
526 ------
|
jpayne@68
|
527
|
jpayne@68
|
528 IOError
|
jpayne@68
|
529 if file could not be opened
|
jpayne@68
|
530
|
jpayne@68
|
531
|
jpayne@68
|
532 Examples
|
jpayne@68
|
533 --------
|
jpayne@68
|
534 >>> with pysam.FastxFile(filename) as fh:
|
jpayne@68
|
535 ... for entry in fh:
|
jpayne@68
|
536 ... print(entry.name)
|
jpayne@68
|
537 ... print(entry.sequence)
|
jpayne@68
|
538 ... print(entry.comment)
|
jpayne@68
|
539 ... print(entry.quality)
|
jpayne@68
|
540 >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout:
|
jpayne@68
|
541 ... for entry in fin:
|
jpayne@68
|
542 ... fout.write(str(entry) + '\n')
|
jpayne@68
|
543
|
jpayne@68
|
544 """
|
jpayne@68
|
545 def __cinit__(self, *args, **kwargs):
|
jpayne@68
|
546 # self.fastqfile = <gzFile*>NULL
|
jpayne@68
|
547 self._filename = None
|
jpayne@68
|
548 self.entry = NULL
|
jpayne@68
|
549 self._open(*args, **kwargs)
|
jpayne@68
|
550
|
jpayne@68
|
551 def is_open(self):
|
jpayne@68
|
552 '''return true if samfile has been opened.'''
|
jpayne@68
|
553 return self.entry != NULL
|
jpayne@68
|
554
|
jpayne@68
|
555 def _open(self, filename, persist=True):
|
jpayne@68
|
556 '''open a fastq/fasta file in *filename*
|
jpayne@68
|
557
|
jpayne@68
|
558 Paramentes
|
jpayne@68
|
559 ----------
|
jpayne@68
|
560
|
jpayne@68
|
561 persist : bool
|
jpayne@68
|
562
|
jpayne@68
|
563 if True return a copy of the underlying data (default
|
jpayne@68
|
564 True). The copy will persist even if the iteration
|
jpayne@68
|
565 on the file continues.
|
jpayne@68
|
566
|
jpayne@68
|
567 '''
|
jpayne@68
|
568 if self.fastqfile != NULL:
|
jpayne@68
|
569 self.close()
|
jpayne@68
|
570
|
jpayne@68
|
571 self._filename = encode_filename(filename)
|
jpayne@68
|
572 cdef char *cfilename = self._filename
|
jpayne@68
|
573 self.is_remote = hisremote(cfilename)
|
jpayne@68
|
574
|
jpayne@68
|
575 # open file for reading
|
jpayne@68
|
576 if (self._filename != b"-"
|
jpayne@68
|
577 and not self.is_remote
|
jpayne@68
|
578 and not os.path.exists(filename)):
|
jpayne@68
|
579 raise IOError("file `%s` not found" % filename)
|
jpayne@68
|
580
|
jpayne@68
|
581 self.persist = persist
|
jpayne@68
|
582
|
jpayne@68
|
583 with nogil:
|
jpayne@68
|
584 self.fastqfile = bgzf_open(cfilename, "r")
|
jpayne@68
|
585 self.entry = kseq_init(self.fastqfile)
|
jpayne@68
|
586 self._filename = filename
|
jpayne@68
|
587
|
jpayne@68
|
588 def close(self):
|
jpayne@68
|
589 '''close the file.'''
|
jpayne@68
|
590 if self.fastqfile != NULL:
|
jpayne@68
|
591 bgzf_close(self.fastqfile)
|
jpayne@68
|
592 self.fastqfile = NULL
|
jpayne@68
|
593 if self.entry != NULL:
|
jpayne@68
|
594 kseq_destroy(self.entry)
|
jpayne@68
|
595 self.entry = NULL
|
jpayne@68
|
596
|
jpayne@68
|
597 def __dealloc__(self):
|
jpayne@68
|
598 if self.fastqfile != NULL:
|
jpayne@68
|
599 bgzf_close(self.fastqfile)
|
jpayne@68
|
600 if self.entry:
|
jpayne@68
|
601 kseq_destroy(self.entry)
|
jpayne@68
|
602
|
jpayne@68
|
603 # context manager interface
|
jpayne@68
|
604 def __enter__(self):
|
jpayne@68
|
605 return self
|
jpayne@68
|
606
|
jpayne@68
|
607 def __exit__(self, exc_type, exc_value, traceback):
|
jpayne@68
|
608 self.close()
|
jpayne@68
|
609 return False
|
jpayne@68
|
610
|
jpayne@68
|
611 property closed:
|
jpayne@68
|
612 """bool indicating the current state of the file object.
|
jpayne@68
|
613 This is a read-only attribute; the close() method changes the value.
|
jpayne@68
|
614 """
|
jpayne@68
|
615 def __get__(self):
|
jpayne@68
|
616 return not self.is_open()
|
jpayne@68
|
617
|
jpayne@68
|
618 property filename:
|
jpayne@68
|
619 """string with the filename associated with this object."""
|
jpayne@68
|
620 def __get__(self):
|
jpayne@68
|
621 return self._filename
|
jpayne@68
|
622
|
jpayne@68
|
623 def __iter__(self):
|
jpayne@68
|
624 if not self.is_open():
|
jpayne@68
|
625 raise ValueError("I/O operation on closed file")
|
jpayne@68
|
626 return self
|
jpayne@68
|
627
|
jpayne@68
|
628 cdef kseq_t * getCurrent(self):
|
jpayne@68
|
629 return self.entry
|
jpayne@68
|
630
|
jpayne@68
|
631 cdef int cnext(self):
|
jpayne@68
|
632 '''C version of iterator
|
jpayne@68
|
633 '''
|
jpayne@68
|
634 with nogil:
|
jpayne@68
|
635 return kseq_read(self.entry)
|
jpayne@68
|
636
|
jpayne@68
|
637 def __next__(self):
|
jpayne@68
|
638 """
|
jpayne@68
|
639 python version of next().
|
jpayne@68
|
640 """
|
jpayne@68
|
641 cdef int l
|
jpayne@68
|
642 with nogil:
|
jpayne@68
|
643 l = kseq_read(self.entry)
|
jpayne@68
|
644 if (l >= 0):
|
jpayne@68
|
645 if self.persist:
|
jpayne@68
|
646 return FastxRecord(proxy=makeFastqProxy(self.entry))
|
jpayne@68
|
647 return makeFastqProxy(self.entry)
|
jpayne@68
|
648 elif (l == -1):
|
jpayne@68
|
649 raise StopIteration
|
jpayne@68
|
650 elif (l == -2):
|
jpayne@68
|
651 raise ValueError('truncated quality string in {0}'
|
jpayne@68
|
652 .format(self._filename))
|
jpayne@68
|
653 else:
|
jpayne@68
|
654 raise ValueError('unknown problem parsing {0}'
|
jpayne@68
|
655 .format(self._filename))
|
jpayne@68
|
656
|
jpayne@68
|
657 # Compatibility Layer for pysam 0.8.1
|
jpayne@68
|
658 cdef class FastqFile(FastxFile):
|
jpayne@68
|
659 """FastqFile is deprecated: use FastxFile instead"""
|
jpayne@68
|
660 pass
|
jpayne@68
|
661
|
jpayne@68
|
662 # Compatibility Layer for pysam < 0.8
|
jpayne@68
|
663 cdef class Fastafile(FastaFile):
|
jpayne@68
|
664 """Fastafile is deprecated: use FastaFile instead"""
|
jpayne@68
|
665 pass
|
jpayne@68
|
666
|
jpayne@68
|
667 __all__ = ["FastaFile",
|
jpayne@68
|
668 "FastqFile",
|
jpayne@68
|
669 "FastxFile",
|
jpayne@68
|
670 "Fastafile",
|
jpayne@68
|
671 "FastxRecord",
|
jpayne@68
|
672 "FastqProxy"]
|