Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcfaidx.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 # cython: embedsignature=True | |
2 # cython: profile=True | |
3 ############################################################################### | |
4 ############################################################################### | |
5 # Cython wrapper for SAM/BAM/CRAM files based on htslib | |
6 ############################################################################### | |
7 # The principal classes defined in this module are: | |
8 # | |
9 # class FastaFile random read read/write access to faidx indexd files | |
10 # class FastxFile streamed read/write access to fasta/fastq files | |
11 # | |
12 # Additionally this module defines several additional classes that are part | |
13 # of the internal API. These are: | |
14 # | |
15 # class FastqProxy | |
16 # class FastxRecord | |
17 # | |
18 # For backwards compatibility, the following classes are also defined: | |
19 # | |
20 # class Fastafile equivalent to FastaFile | |
21 # class FastqFile equivalent to FastxFile | |
22 # | |
23 ############################################################################### | |
24 # | |
25 # The MIT License | |
26 # | |
27 # Copyright (c) 2015 Andreas Heger | |
28 # | |
29 # Permission is hereby granted, free of charge, to any person obtaining a | |
30 # copy of this software and associated documentation files (the "Software"), | |
31 # to deal in the Software without restriction, including without limitation | |
32 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
33 # and/or sell copies of the Software, and to permit persons to whom the | |
34 # Software is furnished to do so, subject to the following conditions: | |
35 # | |
36 # The above copyright notice and this permission notice shall be included in | |
37 # all copies or substantial portions of the Software. | |
38 # | |
39 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
40 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
41 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
42 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
43 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
44 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
45 # DEALINGS IN THE SOFTWARE. | |
46 # | |
47 ############################################################################### | |
48 import sys | |
49 import os | |
50 import re | |
51 | |
52 | |
53 from libc.errno cimport errno | |
54 from libc.string cimport strerror | |
55 | |
56 from cpython cimport array | |
57 | |
58 from cpython cimport PyErr_SetString, \ | |
59 PyBytes_Check, \ | |
60 PyUnicode_Check, \ | |
61 PyBytes_FromStringAndSize | |
62 | |
63 from pysam.libchtslib cimport \ | |
64 faidx_nseq, fai_load, fai_load3, fai_destroy, fai_fetch, \ | |
65 faidx_seq_len, faidx_iseq, faidx_seq_len, \ | |
66 faidx_fetch_seq, hisremote, \ | |
67 bgzf_open, bgzf_close | |
68 | |
69 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str | |
70 from pysam.libcutils cimport encode_filename, from_string_and_size | |
71 from pysam.libcutils cimport qualitystring_to_array, parse_region | |
72 | |
73 cdef class FastqProxy | |
74 cdef makeFastqProxy(kseq_t * src): | |
75 '''enter src into AlignedRead.''' | |
76 cdef FastqProxy dest = FastqProxy.__new__(FastqProxy) | |
77 dest._delegate = src | |
78 return dest | |
79 | |
80 ## TODO: | |
81 ## add automatic indexing. | |
82 ## add function to get sequence names. | |
83 cdef class FastaFile: | |
84 """Random access to fasta formatted files that | |
85 have been indexed by :term:`faidx`. | |
86 | |
87 The file is automatically opened. The index file of file | |
88 ``<filename>`` is expected to be called ``<filename>.fai``. | |
89 | |
90 Parameters | |
91 ---------- | |
92 | |
93 filename : string | |
94 Filename of fasta file to be opened. | |
95 | |
96 filepath_index : string | |
97 Optional, filename of the index. By default this is | |
98 the filename + ".fai". | |
99 | |
100 filepath_index_compressed : string | |
101 Optional, filename of the index if fasta file is. By default this is | |
102 the filename + ".gzi". | |
103 | |
104 Raises | |
105 ------ | |
106 | |
107 ValueError | |
108 if index file is missing | |
109 | |
110 IOError | |
111 if file could not be opened | |
112 | |
113 """ | |
114 | |
115 def __cinit__(self, *args, **kwargs): | |
116 self.fastafile = NULL | |
117 self._filename = None | |
118 self._references = None | |
119 self._lengths = None | |
120 self.reference2length = None | |
121 self._open(*args, **kwargs) | |
122 | |
123 def is_open(self): | |
124 '''return true if samfile has been opened.''' | |
125 return self.fastafile != NULL | |
126 | |
127 def __len__(self): | |
128 if self.fastafile == NULL: | |
129 raise ValueError("calling len() on closed file") | |
130 | |
131 return faidx_nseq(self.fastafile) | |
132 | |
133 def _open(self, filename, filepath_index=None, filepath_index_compressed=None): | |
134 '''open an indexed fasta file. | |
135 | |
136 This method expects an indexed fasta file. | |
137 ''' | |
138 | |
139 # close a previously opened file | |
140 if self.fastafile != NULL: | |
141 self.close() | |
142 | |
143 self._filename = encode_filename(filename) | |
144 cdef char *cfilename = self._filename | |
145 cdef char *cindexname = NULL | |
146 cdef char *cindexname_compressed = NULL | |
147 self.is_remote = hisremote(cfilename) | |
148 | |
149 # open file for reading | |
150 if (self._filename != b"-" | |
151 and not self.is_remote | |
152 and not os.path.exists(filename)): | |
153 raise IOError("file `%s` not found" % filename) | |
154 | |
155 # 3 modes to open: | |
156 # compressed fa: fai_load3 with filename, index_fai and index_gzi | |
157 # uncompressed fa: fai_load3 with filename and index_fai | |
158 # uncompressed fa: fai_load with default index name | |
159 if filepath_index: | |
160 # when opening, set flags to 0 - do not automatically | |
161 # build index if it does not exist. | |
162 | |
163 if not os.path.exists(filepath_index): | |
164 raise IOError("filename {} does not exist".format(filepath_index)) | |
165 cindexname = bindex_filename = encode_filename(filepath_index) | |
166 | |
167 if filepath_index_compressed: | |
168 if not os.path.exists(filepath_index_compressed): | |
169 raise IOError("filename {} does not exist".format(filepath_index_compressed)) | |
170 cindexname_compressed = bindex_filename_compressed = encode_filename(filepath_index_compressed) | |
171 with nogil: | |
172 self.fastafile = fai_load3(cfilename, cindexname, cindexname_compressed, 0) | |
173 else: | |
174 with nogil: | |
175 self.fastafile = fai_load3(cfilename, cindexname, NULL, 0) | |
176 else: | |
177 with nogil: | |
178 self.fastafile = fai_load(cfilename) | |
179 | |
180 if self.fastafile == NULL: | |
181 raise IOError("error when opening file `%s`" % filename) | |
182 | |
183 cdef int nreferences = faidx_nseq(self.fastafile) | |
184 cdef int x | |
185 cdef const char * s | |
186 self._references = [] | |
187 self._lengths = [] | |
188 for x from 0 <= x < nreferences: | |
189 s = faidx_iseq(self.fastafile, x) | |
190 ss = force_str(s) | |
191 self._references.append(ss) | |
192 self._lengths.append(faidx_seq_len(self.fastafile, s)) | |
193 self.reference2length = dict(zip(self._references, self._lengths)) | |
194 | |
195 def close(self): | |
196 """close the file.""" | |
197 if self.fastafile != NULL: | |
198 fai_destroy(self.fastafile) | |
199 self.fastafile = NULL | |
200 | |
201 def __dealloc__(self): | |
202 if self.fastafile != NULL: | |
203 fai_destroy(self.fastafile) | |
204 self.fastafile = NULL | |
205 | |
206 # context manager interface | |
207 def __enter__(self): | |
208 return self | |
209 | |
210 def __exit__(self, exc_type, exc_value, traceback): | |
211 self.close() | |
212 return False | |
213 | |
214 property closed: | |
215 """bool indicating the current state of the file object. | |
216 This is a read-only attribute; the close() method changes the value. | |
217 """ | |
218 def __get__(self): | |
219 return not self.is_open() | |
220 | |
221 property filename: | |
222 """filename associated with this object. This is a read-only attribute.""" | |
223 def __get__(self): | |
224 return self._filename | |
225 | |
226 property references: | |
227 '''tuple with the names of :term:`reference` sequences.''' | |
228 def __get__(self): | |
229 return self._references | |
230 | |
231 property nreferences: | |
232 """int with the number of :term:`reference` sequences in the file. | |
233 This is a read-only attribute.""" | |
234 def __get__(self): | |
235 return len(self._references) if self.references else None | |
236 | |
237 property lengths: | |
238 """tuple with the lengths of :term:`reference` sequences.""" | |
239 def __get__(self): | |
240 return self._lengths | |
241 | |
242 def fetch(self, | |
243 reference=None, | |
244 start=None, | |
245 end=None, | |
246 region=None): | |
247 """fetch sequences in a :term:`region`. | |
248 | |
249 A region can | |
250 either be specified by :term:`reference`, `start` and | |
251 `end`. `start` and `end` denote 0-based, half-open | |
252 intervals. | |
253 | |
254 Alternatively, a samtools :term:`region` string can be | |
255 supplied. | |
256 | |
257 If any of the coordinates are missing they will be replaced by the | |
258 minimum (`start`) or maximum (`end`) coordinate. | |
259 | |
260 Note that region strings are 1-based, while `start` and `end` denote | |
261 an interval in python coordinates. | |
262 The region is specified by :term:`reference`, `start` and `end`. | |
263 | |
264 Returns | |
265 ------- | |
266 | |
267 string : a string with the sequence specified by the region. | |
268 | |
269 Raises | |
270 ------ | |
271 | |
272 IndexError | |
273 if the coordinates are out of range | |
274 | |
275 ValueError | |
276 if the region is invalid | |
277 | |
278 """ | |
279 | |
280 if not self.is_open(): | |
281 raise ValueError("I/O operation on closed file" ) | |
282 | |
283 cdef int length | |
284 cdef char *seq | |
285 cdef char *ref | |
286 cdef int rstart, rend | |
287 | |
288 contig, rstart, rend = parse_region(reference, start, end, region) | |
289 | |
290 if contig is None: | |
291 raise ValueError("no sequence/region supplied.") | |
292 | |
293 if rstart == rend: | |
294 return "" | |
295 | |
296 contig_b = force_bytes(contig) | |
297 ref = contig_b | |
298 with nogil: | |
299 length = faidx_seq_len(self.fastafile, ref) | |
300 if length == -1: | |
301 raise KeyError("sequence '%s' not present" % contig) | |
302 if rstart >= length: | |
303 return "" | |
304 | |
305 # fai_fetch adds a '\0' at the end | |
306 with nogil: | |
307 seq = faidx_fetch_seq(self.fastafile, | |
308 ref, | |
309 rstart, | |
310 rend-1, | |
311 &length) | |
312 | |
313 if not seq: | |
314 if errno: | |
315 raise IOError(errno, strerror(errno)) | |
316 else: | |
317 raise ValueError("failure when retrieving sequence on '%s'" % contig) | |
318 | |
319 try: | |
320 return charptr_to_str(seq) | |
321 finally: | |
322 free(seq) | |
323 | |
324 cdef char *_fetch(self, char *reference, int start, int end, int *length) except? NULL: | |
325 '''fetch sequence for reference, start and end''' | |
326 | |
327 cdef char *seq | |
328 with nogil: | |
329 seq = faidx_fetch_seq(self.fastafile, | |
330 reference, | |
331 start, | |
332 end-1, | |
333 length) | |
334 | |
335 if not seq: | |
336 if errno: | |
337 raise IOError(errno, strerror(errno)) | |
338 else: | |
339 raise ValueError("failure when retrieving sequence on '%s'" % reference) | |
340 | |
341 return seq | |
342 | |
343 def get_reference_length(self, reference): | |
344 '''return the length of reference.''' | |
345 return self.reference2length[reference] | |
346 | |
347 def __getitem__(self, reference): | |
348 return self.fetch(reference) | |
349 | |
350 def __contains__(self, reference): | |
351 '''return true if reference in fasta file.''' | |
352 return reference in self.reference2length | |
353 | |
354 | |
355 cdef class FastqProxy: | |
356 """A single entry in a fastq file.""" | |
357 def __init__(self): | |
358 raise ValueError("do not instantiate FastqProxy directly") | |
359 | |
360 property name: | |
361 """The name of each entry in the fastq file.""" | |
362 def __get__(self): | |
363 return charptr_to_str(self._delegate.name.s) | |
364 | |
365 property sequence: | |
366 """The sequence of each entry in the fastq file.""" | |
367 def __get__(self): | |
368 return charptr_to_str(self._delegate.seq.s) | |
369 | |
370 property comment: | |
371 def __get__(self): | |
372 if self._delegate.comment.l: | |
373 return charptr_to_str(self._delegate.comment.s) | |
374 else: | |
375 return None | |
376 | |
377 property quality: | |
378 """The quality score of each entry in the fastq file, represented as a string.""" | |
379 def __get__(self): | |
380 if self._delegate.qual.l: | |
381 return charptr_to_str(self._delegate.qual.s) | |
382 else: | |
383 return None | |
384 | |
385 cdef cython.str to_string(self): | |
386 if self.comment is None: | |
387 comment = "" | |
388 else: | |
389 comment = " %s" % self.comment | |
390 | |
391 if self.quality is None: | |
392 return ">%s%s\n%s" % (self.name, comment, self.sequence) | |
393 else: | |
394 return "@%s%s\n%s\n+\n%s" % (self.name, comment, | |
395 self.sequence, self.quality) | |
396 | |
397 cdef cython.str tostring(self): | |
398 """deprecated : use :meth:`to_string`""" | |
399 return self.to_string() | |
400 | |
401 def __str__(self): | |
402 return self.to_string() | |
403 | |
404 cpdef array.array get_quality_array(self, int offset=33): | |
405 '''return quality values as integer array after subtracting offset.''' | |
406 if self.quality is None: | |
407 return None | |
408 return qualitystring_to_array(force_bytes(self.quality), | |
409 offset=offset) | |
410 | |
411 cdef class FastxRecord: | |
412 """A fasta/fastq record. | |
413 | |
414 A record must contain a name and a sequence. If either of them are | |
415 None, a ValueError is raised on writing. | |
416 | |
417 """ | |
418 def __init__(self, | |
419 name=None, | |
420 comment=None, | |
421 sequence=None, | |
422 quality=None, | |
423 FastqProxy proxy=None): | |
424 if proxy is not None: | |
425 self.comment = proxy.comment | |
426 self.quality = proxy.quality | |
427 self.sequence = proxy.sequence | |
428 self.name = proxy.name | |
429 else: | |
430 self.comment = comment | |
431 self.quality = quality | |
432 self.sequence = sequence | |
433 self.name = name | |
434 | |
435 def __copy__(self): | |
436 return FastxRecord(self.name, self.comment, self.sequence, self.quality) | |
437 | |
438 def __deepcopy__(self, memo): | |
439 return FastxRecord(self.name, self.comment, self.sequence, self.quality) | |
440 | |
441 cdef cython.str to_string(self): | |
442 if self.name is None: | |
443 raise ValueError("can not write record without name") | |
444 | |
445 if self.sequence is None: | |
446 raise ValueError("can not write record without a sequence") | |
447 | |
448 if self.comment is None: | |
449 comment = "" | |
450 else: | |
451 comment = " %s" % self.comment | |
452 | |
453 if self.quality is None: | |
454 return ">%s%s\n%s" % (self.name, comment, self.sequence) | |
455 else: | |
456 return "@%s%s\n%s\n+\n%s" % (self.name, comment, | |
457 self.sequence, self.quality) | |
458 | |
459 cdef cython.str tostring(self): | |
460 """deprecated : use :meth:`to_string`""" | |
461 return self.to_string() | |
462 | |
463 def set_name(self, name): | |
464 if name is None: | |
465 raise ValueError("FastxRecord must have a name and not None") | |
466 self.name = name | |
467 | |
468 def set_comment(self, comment): | |
469 self.comment = comment | |
470 | |
471 def set_sequence(self, sequence, quality=None): | |
472 """set sequence of this record. | |
473 | |
474 """ | |
475 self.sequence = sequence | |
476 if quality is not None: | |
477 if len(sequence) != len(quality): | |
478 raise ValueError("sequence and quality length do not match: {} vs {}".format( | |
479 len(sequence), len(quality))) | |
480 | |
481 self.quality = quality | |
482 else: | |
483 self.quality = None | |
484 | |
485 def __str__(self): | |
486 return self.to_string() | |
487 | |
488 cpdef array.array get_quality_array(self, int offset=33): | |
489 '''return quality values as array after subtracting offset.''' | |
490 if self.quality is None: | |
491 return None | |
492 return qualitystring_to_array(force_bytes(self.quality), | |
493 offset=offset) | |
494 | |
495 | |
496 cdef class FastxFile: | |
497 r"""Stream access to :term:`fasta` or :term:`fastq` formatted files. | |
498 | |
499 The file is automatically opened. | |
500 | |
501 Entries in the file can be both fastq or fasta formatted or even a | |
502 mixture of the two. | |
503 | |
504 This file object permits iterating over all entries in the | |
505 file. Random access is not implemented. The iteration returns | |
506 objects of type :class:`FastqProxy` | |
507 | |
508 Parameters | |
509 ---------- | |
510 | |
511 filename : string | |
512 Filename of fasta/fastq file to be opened. | |
513 | |
514 persist : bool | |
515 | |
516 If True (default) make a copy of the entry in the file during | |
517 iteration. If set to False, no copy will be made. This will | |
518 permit much faster iteration, but an entry will not persist | |
519 when the iteration continues and an entry is read-only. | |
520 | |
521 Notes | |
522 ----- | |
523 Prior to version 0.8.2, this class was called FastqFile. | |
524 | |
525 Raises | |
526 ------ | |
527 | |
528 IOError | |
529 if file could not be opened | |
530 | |
531 | |
532 Examples | |
533 -------- | |
534 >>> with pysam.FastxFile(filename) as fh: | |
535 ... for entry in fh: | |
536 ... print(entry.name) | |
537 ... print(entry.sequence) | |
538 ... print(entry.comment) | |
539 ... print(entry.quality) | |
540 >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout: | |
541 ... for entry in fin: | |
542 ... fout.write(str(entry) + '\n') | |
543 | |
544 """ | |
545 def __cinit__(self, *args, **kwargs): | |
546 # self.fastqfile = <gzFile*>NULL | |
547 self._filename = None | |
548 self.entry = NULL | |
549 self._open(*args, **kwargs) | |
550 | |
551 def is_open(self): | |
552 '''return true if samfile has been opened.''' | |
553 return self.entry != NULL | |
554 | |
555 def _open(self, filename, persist=True): | |
556 '''open a fastq/fasta file in *filename* | |
557 | |
558 Paramentes | |
559 ---------- | |
560 | |
561 persist : bool | |
562 | |
563 if True return a copy of the underlying data (default | |
564 True). The copy will persist even if the iteration | |
565 on the file continues. | |
566 | |
567 ''' | |
568 if self.fastqfile != NULL: | |
569 self.close() | |
570 | |
571 self._filename = encode_filename(filename) | |
572 cdef char *cfilename = self._filename | |
573 self.is_remote = hisremote(cfilename) | |
574 | |
575 # open file for reading | |
576 if (self._filename != b"-" | |
577 and not self.is_remote | |
578 and not os.path.exists(filename)): | |
579 raise IOError("file `%s` not found" % filename) | |
580 | |
581 self.persist = persist | |
582 | |
583 with nogil: | |
584 self.fastqfile = bgzf_open(cfilename, "r") | |
585 self.entry = kseq_init(self.fastqfile) | |
586 self._filename = filename | |
587 | |
588 def close(self): | |
589 '''close the file.''' | |
590 if self.fastqfile != NULL: | |
591 bgzf_close(self.fastqfile) | |
592 self.fastqfile = NULL | |
593 if self.entry != NULL: | |
594 kseq_destroy(self.entry) | |
595 self.entry = NULL | |
596 | |
597 def __dealloc__(self): | |
598 if self.fastqfile != NULL: | |
599 bgzf_close(self.fastqfile) | |
600 if self.entry: | |
601 kseq_destroy(self.entry) | |
602 | |
603 # context manager interface | |
604 def __enter__(self): | |
605 return self | |
606 | |
607 def __exit__(self, exc_type, exc_value, traceback): | |
608 self.close() | |
609 return False | |
610 | |
611 property closed: | |
612 """bool indicating the current state of the file object. | |
613 This is a read-only attribute; the close() method changes the value. | |
614 """ | |
615 def __get__(self): | |
616 return not self.is_open() | |
617 | |
618 property filename: | |
619 """string with the filename associated with this object.""" | |
620 def __get__(self): | |
621 return self._filename | |
622 | |
623 def __iter__(self): | |
624 if not self.is_open(): | |
625 raise ValueError("I/O operation on closed file") | |
626 return self | |
627 | |
628 cdef kseq_t * getCurrent(self): | |
629 return self.entry | |
630 | |
631 cdef int cnext(self): | |
632 '''C version of iterator | |
633 ''' | |
634 with nogil: | |
635 return kseq_read(self.entry) | |
636 | |
637 def __next__(self): | |
638 """ | |
639 python version of next(). | |
640 """ | |
641 cdef int l | |
642 with nogil: | |
643 l = kseq_read(self.entry) | |
644 if (l >= 0): | |
645 if self.persist: | |
646 return FastxRecord(proxy=makeFastqProxy(self.entry)) | |
647 return makeFastqProxy(self.entry) | |
648 elif (l == -1): | |
649 raise StopIteration | |
650 elif (l == -2): | |
651 raise ValueError('truncated quality string in {0}' | |
652 .format(self._filename)) | |
653 else: | |
654 raise ValueError('unknown problem parsing {0}' | |
655 .format(self._filename)) | |
656 | |
657 # Compatibility Layer for pysam 0.8.1 | |
658 cdef class FastqFile(FastxFile): | |
659 """FastqFile is deprecated: use FastxFile instead""" | |
660 pass | |
661 | |
662 # Compatibility Layer for pysam < 0.8 | |
663 cdef class Fastafile(FastaFile): | |
664 """Fastafile is deprecated: use FastaFile instead""" | |
665 pass | |
666 | |
667 __all__ = ["FastaFile", | |
668 "FastqFile", | |
669 "FastxFile", | |
670 "Fastafile", | |
671 "FastxRecord", | |
672 "FastqProxy"] |