comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcfaidx.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 # cython: embedsignature=True
2 # cython: profile=True
3 ###############################################################################
4 ###############################################################################
5 # Cython wrapper for SAM/BAM/CRAM files based on htslib
6 ###############################################################################
7 # The principal classes defined in this module are:
8 #
9 # class FastaFile random read read/write access to faidx indexd files
10 # class FastxFile streamed read/write access to fasta/fastq files
11 #
12 # Additionally this module defines several additional classes that are part
13 # of the internal API. These are:
14 #
15 # class FastqProxy
16 # class FastxRecord
17 #
18 # For backwards compatibility, the following classes are also defined:
19 #
20 # class Fastafile equivalent to FastaFile
21 # class FastqFile equivalent to FastxFile
22 #
23 ###############################################################################
24 #
25 # The MIT License
26 #
27 # Copyright (c) 2015 Andreas Heger
28 #
29 # Permission is hereby granted, free of charge, to any person obtaining a
30 # copy of this software and associated documentation files (the "Software"),
31 # to deal in the Software without restriction, including without limitation
32 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
33 # and/or sell copies of the Software, and to permit persons to whom the
34 # Software is furnished to do so, subject to the following conditions:
35 #
36 # The above copyright notice and this permission notice shall be included in
37 # all copies or substantial portions of the Software.
38 #
39 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
40 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
42 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
43 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
44 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
45 # DEALINGS IN THE SOFTWARE.
46 #
47 ###############################################################################
48 import sys
49 import os
50 import re
51
52
53 from libc.errno cimport errno
54 from libc.string cimport strerror
55
56 from cpython cimport array
57
58 from cpython cimport PyErr_SetString, \
59 PyBytes_Check, \
60 PyUnicode_Check, \
61 PyBytes_FromStringAndSize
62
63 from pysam.libchtslib cimport \
64 faidx_nseq, fai_load, fai_load3, fai_destroy, fai_fetch, \
65 faidx_seq_len, faidx_iseq, faidx_seq_len, \
66 faidx_fetch_seq, hisremote, \
67 bgzf_open, bgzf_close
68
69 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
70 from pysam.libcutils cimport encode_filename, from_string_and_size
71 from pysam.libcutils cimport qualitystring_to_array, parse_region
72
73 cdef class FastqProxy
74 cdef makeFastqProxy(kseq_t * src):
75 '''enter src into AlignedRead.'''
76 cdef FastqProxy dest = FastqProxy.__new__(FastqProxy)
77 dest._delegate = src
78 return dest
79
80 ## TODO:
81 ## add automatic indexing.
82 ## add function to get sequence names.
83 cdef class FastaFile:
84 """Random access to fasta formatted files that
85 have been indexed by :term:`faidx`.
86
87 The file is automatically opened. The index file of file
88 ``<filename>`` is expected to be called ``<filename>.fai``.
89
90 Parameters
91 ----------
92
93 filename : string
94 Filename of fasta file to be opened.
95
96 filepath_index : string
97 Optional, filename of the index. By default this is
98 the filename + ".fai".
99
100 filepath_index_compressed : string
101 Optional, filename of the index if fasta file is. By default this is
102 the filename + ".gzi".
103
104 Raises
105 ------
106
107 ValueError
108 if index file is missing
109
110 IOError
111 if file could not be opened
112
113 """
114
115 def __cinit__(self, *args, **kwargs):
116 self.fastafile = NULL
117 self._filename = None
118 self._references = None
119 self._lengths = None
120 self.reference2length = None
121 self._open(*args, **kwargs)
122
123 def is_open(self):
124 '''return true if samfile has been opened.'''
125 return self.fastafile != NULL
126
127 def __len__(self):
128 if self.fastafile == NULL:
129 raise ValueError("calling len() on closed file")
130
131 return faidx_nseq(self.fastafile)
132
133 def _open(self, filename, filepath_index=None, filepath_index_compressed=None):
134 '''open an indexed fasta file.
135
136 This method expects an indexed fasta file.
137 '''
138
139 # close a previously opened file
140 if self.fastafile != NULL:
141 self.close()
142
143 self._filename = encode_filename(filename)
144 cdef char *cfilename = self._filename
145 cdef char *cindexname = NULL
146 cdef char *cindexname_compressed = NULL
147 self.is_remote = hisremote(cfilename)
148
149 # open file for reading
150 if (self._filename != b"-"
151 and not self.is_remote
152 and not os.path.exists(filename)):
153 raise IOError("file `%s` not found" % filename)
154
155 # 3 modes to open:
156 # compressed fa: fai_load3 with filename, index_fai and index_gzi
157 # uncompressed fa: fai_load3 with filename and index_fai
158 # uncompressed fa: fai_load with default index name
159 if filepath_index:
160 # when opening, set flags to 0 - do not automatically
161 # build index if it does not exist.
162
163 if not os.path.exists(filepath_index):
164 raise IOError("filename {} does not exist".format(filepath_index))
165 cindexname = bindex_filename = encode_filename(filepath_index)
166
167 if filepath_index_compressed:
168 if not os.path.exists(filepath_index_compressed):
169 raise IOError("filename {} does not exist".format(filepath_index_compressed))
170 cindexname_compressed = bindex_filename_compressed = encode_filename(filepath_index_compressed)
171 with nogil:
172 self.fastafile = fai_load3(cfilename, cindexname, cindexname_compressed, 0)
173 else:
174 with nogil:
175 self.fastafile = fai_load3(cfilename, cindexname, NULL, 0)
176 else:
177 with nogil:
178 self.fastafile = fai_load(cfilename)
179
180 if self.fastafile == NULL:
181 raise IOError("error when opening file `%s`" % filename)
182
183 cdef int nreferences = faidx_nseq(self.fastafile)
184 cdef int x
185 cdef const char * s
186 self._references = []
187 self._lengths = []
188 for x from 0 <= x < nreferences:
189 s = faidx_iseq(self.fastafile, x)
190 ss = force_str(s)
191 self._references.append(ss)
192 self._lengths.append(faidx_seq_len(self.fastafile, s))
193 self.reference2length = dict(zip(self._references, self._lengths))
194
195 def close(self):
196 """close the file."""
197 if self.fastafile != NULL:
198 fai_destroy(self.fastafile)
199 self.fastafile = NULL
200
201 def __dealloc__(self):
202 if self.fastafile != NULL:
203 fai_destroy(self.fastafile)
204 self.fastafile = NULL
205
206 # context manager interface
207 def __enter__(self):
208 return self
209
210 def __exit__(self, exc_type, exc_value, traceback):
211 self.close()
212 return False
213
214 property closed:
215 """bool indicating the current state of the file object.
216 This is a read-only attribute; the close() method changes the value.
217 """
218 def __get__(self):
219 return not self.is_open()
220
221 property filename:
222 """filename associated with this object. This is a read-only attribute."""
223 def __get__(self):
224 return self._filename
225
226 property references:
227 '''tuple with the names of :term:`reference` sequences.'''
228 def __get__(self):
229 return self._references
230
231 property nreferences:
232 """int with the number of :term:`reference` sequences in the file.
233 This is a read-only attribute."""
234 def __get__(self):
235 return len(self._references) if self.references else None
236
237 property lengths:
238 """tuple with the lengths of :term:`reference` sequences."""
239 def __get__(self):
240 return self._lengths
241
242 def fetch(self,
243 reference=None,
244 start=None,
245 end=None,
246 region=None):
247 """fetch sequences in a :term:`region`.
248
249 A region can
250 either be specified by :term:`reference`, `start` and
251 `end`. `start` and `end` denote 0-based, half-open
252 intervals.
253
254 Alternatively, a samtools :term:`region` string can be
255 supplied.
256
257 If any of the coordinates are missing they will be replaced by the
258 minimum (`start`) or maximum (`end`) coordinate.
259
260 Note that region strings are 1-based, while `start` and `end` denote
261 an interval in python coordinates.
262 The region is specified by :term:`reference`, `start` and `end`.
263
264 Returns
265 -------
266
267 string : a string with the sequence specified by the region.
268
269 Raises
270 ------
271
272 IndexError
273 if the coordinates are out of range
274
275 ValueError
276 if the region is invalid
277
278 """
279
280 if not self.is_open():
281 raise ValueError("I/O operation on closed file" )
282
283 cdef int length
284 cdef char *seq
285 cdef char *ref
286 cdef int rstart, rend
287
288 contig, rstart, rend = parse_region(reference, start, end, region)
289
290 if contig is None:
291 raise ValueError("no sequence/region supplied.")
292
293 if rstart == rend:
294 return ""
295
296 contig_b = force_bytes(contig)
297 ref = contig_b
298 with nogil:
299 length = faidx_seq_len(self.fastafile, ref)
300 if length == -1:
301 raise KeyError("sequence '%s' not present" % contig)
302 if rstart >= length:
303 return ""
304
305 # fai_fetch adds a '\0' at the end
306 with nogil:
307 seq = faidx_fetch_seq(self.fastafile,
308 ref,
309 rstart,
310 rend-1,
311 &length)
312
313 if not seq:
314 if errno:
315 raise IOError(errno, strerror(errno))
316 else:
317 raise ValueError("failure when retrieving sequence on '%s'" % contig)
318
319 try:
320 return charptr_to_str(seq)
321 finally:
322 free(seq)
323
324 cdef char *_fetch(self, char *reference, int start, int end, int *length) except? NULL:
325 '''fetch sequence for reference, start and end'''
326
327 cdef char *seq
328 with nogil:
329 seq = faidx_fetch_seq(self.fastafile,
330 reference,
331 start,
332 end-1,
333 length)
334
335 if not seq:
336 if errno:
337 raise IOError(errno, strerror(errno))
338 else:
339 raise ValueError("failure when retrieving sequence on '%s'" % reference)
340
341 return seq
342
343 def get_reference_length(self, reference):
344 '''return the length of reference.'''
345 return self.reference2length[reference]
346
347 def __getitem__(self, reference):
348 return self.fetch(reference)
349
350 def __contains__(self, reference):
351 '''return true if reference in fasta file.'''
352 return reference in self.reference2length
353
354
355 cdef class FastqProxy:
356 """A single entry in a fastq file."""
357 def __init__(self):
358 raise ValueError("do not instantiate FastqProxy directly")
359
360 property name:
361 """The name of each entry in the fastq file."""
362 def __get__(self):
363 return charptr_to_str(self._delegate.name.s)
364
365 property sequence:
366 """The sequence of each entry in the fastq file."""
367 def __get__(self):
368 return charptr_to_str(self._delegate.seq.s)
369
370 property comment:
371 def __get__(self):
372 if self._delegate.comment.l:
373 return charptr_to_str(self._delegate.comment.s)
374 else:
375 return None
376
377 property quality:
378 """The quality score of each entry in the fastq file, represented as a string."""
379 def __get__(self):
380 if self._delegate.qual.l:
381 return charptr_to_str(self._delegate.qual.s)
382 else:
383 return None
384
385 cdef cython.str to_string(self):
386 if self.comment is None:
387 comment = ""
388 else:
389 comment = " %s" % self.comment
390
391 if self.quality is None:
392 return ">%s%s\n%s" % (self.name, comment, self.sequence)
393 else:
394 return "@%s%s\n%s\n+\n%s" % (self.name, comment,
395 self.sequence, self.quality)
396
397 cdef cython.str tostring(self):
398 """deprecated : use :meth:`to_string`"""
399 return self.to_string()
400
401 def __str__(self):
402 return self.to_string()
403
404 cpdef array.array get_quality_array(self, int offset=33):
405 '''return quality values as integer array after subtracting offset.'''
406 if self.quality is None:
407 return None
408 return qualitystring_to_array(force_bytes(self.quality),
409 offset=offset)
410
411 cdef class FastxRecord:
412 """A fasta/fastq record.
413
414 A record must contain a name and a sequence. If either of them are
415 None, a ValueError is raised on writing.
416
417 """
418 def __init__(self,
419 name=None,
420 comment=None,
421 sequence=None,
422 quality=None,
423 FastqProxy proxy=None):
424 if proxy is not None:
425 self.comment = proxy.comment
426 self.quality = proxy.quality
427 self.sequence = proxy.sequence
428 self.name = proxy.name
429 else:
430 self.comment = comment
431 self.quality = quality
432 self.sequence = sequence
433 self.name = name
434
435 def __copy__(self):
436 return FastxRecord(self.name, self.comment, self.sequence, self.quality)
437
438 def __deepcopy__(self, memo):
439 return FastxRecord(self.name, self.comment, self.sequence, self.quality)
440
441 cdef cython.str to_string(self):
442 if self.name is None:
443 raise ValueError("can not write record without name")
444
445 if self.sequence is None:
446 raise ValueError("can not write record without a sequence")
447
448 if self.comment is None:
449 comment = ""
450 else:
451 comment = " %s" % self.comment
452
453 if self.quality is None:
454 return ">%s%s\n%s" % (self.name, comment, self.sequence)
455 else:
456 return "@%s%s\n%s\n+\n%s" % (self.name, comment,
457 self.sequence, self.quality)
458
459 cdef cython.str tostring(self):
460 """deprecated : use :meth:`to_string`"""
461 return self.to_string()
462
463 def set_name(self, name):
464 if name is None:
465 raise ValueError("FastxRecord must have a name and not None")
466 self.name = name
467
468 def set_comment(self, comment):
469 self.comment = comment
470
471 def set_sequence(self, sequence, quality=None):
472 """set sequence of this record.
473
474 """
475 self.sequence = sequence
476 if quality is not None:
477 if len(sequence) != len(quality):
478 raise ValueError("sequence and quality length do not match: {} vs {}".format(
479 len(sequence), len(quality)))
480
481 self.quality = quality
482 else:
483 self.quality = None
484
485 def __str__(self):
486 return self.to_string()
487
488 cpdef array.array get_quality_array(self, int offset=33):
489 '''return quality values as array after subtracting offset.'''
490 if self.quality is None:
491 return None
492 return qualitystring_to_array(force_bytes(self.quality),
493 offset=offset)
494
495
496 cdef class FastxFile:
497 r"""Stream access to :term:`fasta` or :term:`fastq` formatted files.
498
499 The file is automatically opened.
500
501 Entries in the file can be both fastq or fasta formatted or even a
502 mixture of the two.
503
504 This file object permits iterating over all entries in the
505 file. Random access is not implemented. The iteration returns
506 objects of type :class:`FastqProxy`
507
508 Parameters
509 ----------
510
511 filename : string
512 Filename of fasta/fastq file to be opened.
513
514 persist : bool
515
516 If True (default) make a copy of the entry in the file during
517 iteration. If set to False, no copy will be made. This will
518 permit much faster iteration, but an entry will not persist
519 when the iteration continues and an entry is read-only.
520
521 Notes
522 -----
523 Prior to version 0.8.2, this class was called FastqFile.
524
525 Raises
526 ------
527
528 IOError
529 if file could not be opened
530
531
532 Examples
533 --------
534 >>> with pysam.FastxFile(filename) as fh:
535 ... for entry in fh:
536 ... print(entry.name)
537 ... print(entry.sequence)
538 ... print(entry.comment)
539 ... print(entry.quality)
540 >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout:
541 ... for entry in fin:
542 ... fout.write(str(entry) + '\n')
543
544 """
545 def __cinit__(self, *args, **kwargs):
546 # self.fastqfile = <gzFile*>NULL
547 self._filename = None
548 self.entry = NULL
549 self._open(*args, **kwargs)
550
551 def is_open(self):
552 '''return true if samfile has been opened.'''
553 return self.entry != NULL
554
555 def _open(self, filename, persist=True):
556 '''open a fastq/fasta file in *filename*
557
558 Paramentes
559 ----------
560
561 persist : bool
562
563 if True return a copy of the underlying data (default
564 True). The copy will persist even if the iteration
565 on the file continues.
566
567 '''
568 if self.fastqfile != NULL:
569 self.close()
570
571 self._filename = encode_filename(filename)
572 cdef char *cfilename = self._filename
573 self.is_remote = hisremote(cfilename)
574
575 # open file for reading
576 if (self._filename != b"-"
577 and not self.is_remote
578 and not os.path.exists(filename)):
579 raise IOError("file `%s` not found" % filename)
580
581 self.persist = persist
582
583 with nogil:
584 self.fastqfile = bgzf_open(cfilename, "r")
585 self.entry = kseq_init(self.fastqfile)
586 self._filename = filename
587
588 def close(self):
589 '''close the file.'''
590 if self.fastqfile != NULL:
591 bgzf_close(self.fastqfile)
592 self.fastqfile = NULL
593 if self.entry != NULL:
594 kseq_destroy(self.entry)
595 self.entry = NULL
596
597 def __dealloc__(self):
598 if self.fastqfile != NULL:
599 bgzf_close(self.fastqfile)
600 if self.entry:
601 kseq_destroy(self.entry)
602
603 # context manager interface
604 def __enter__(self):
605 return self
606
607 def __exit__(self, exc_type, exc_value, traceback):
608 self.close()
609 return False
610
611 property closed:
612 """bool indicating the current state of the file object.
613 This is a read-only attribute; the close() method changes the value.
614 """
615 def __get__(self):
616 return not self.is_open()
617
618 property filename:
619 """string with the filename associated with this object."""
620 def __get__(self):
621 return self._filename
622
623 def __iter__(self):
624 if not self.is_open():
625 raise ValueError("I/O operation on closed file")
626 return self
627
628 cdef kseq_t * getCurrent(self):
629 return self.entry
630
631 cdef int cnext(self):
632 '''C version of iterator
633 '''
634 with nogil:
635 return kseq_read(self.entry)
636
637 def __next__(self):
638 """
639 python version of next().
640 """
641 cdef int l
642 with nogil:
643 l = kseq_read(self.entry)
644 if (l >= 0):
645 if self.persist:
646 return FastxRecord(proxy=makeFastqProxy(self.entry))
647 return makeFastqProxy(self.entry)
648 elif (l == -1):
649 raise StopIteration
650 elif (l == -2):
651 raise ValueError('truncated quality string in {0}'
652 .format(self._filename))
653 else:
654 raise ValueError('unknown problem parsing {0}'
655 .format(self._filename))
656
657 # Compatibility Layer for pysam 0.8.1
658 cdef class FastqFile(FastxFile):
659 """FastqFile is deprecated: use FastxFile instead"""
660 pass
661
662 # Compatibility Layer for pysam < 0.8
663 cdef class Fastafile(FastaFile):
664 """Fastafile is deprecated: use FastaFile instead"""
665 pass
666
667 __all__ = ["FastaFile",
668 "FastqFile",
669 "FastxFile",
670 "Fastafile",
671 "FastxRecord",
672 "FastqProxy"]