Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcfaidx.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcfaidx.pyx Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,672 @@ +# cython: embedsignature=True +# cython: profile=True +############################################################################### +############################################################################### +# Cython wrapper for SAM/BAM/CRAM files based on htslib +############################################################################### +# The principal classes defined in this module are: +# +# class FastaFile random read read/write access to faidx indexd files +# class FastxFile streamed read/write access to fasta/fastq files +# +# Additionally this module defines several additional classes that are part +# of the internal API. These are: +# +# class FastqProxy +# class FastxRecord +# +# For backwards compatibility, the following classes are also defined: +# +# class Fastafile equivalent to FastaFile +# class FastqFile equivalent to FastxFile +# +############################################################################### +# +# The MIT License +# +# Copyright (c) 2015 Andreas Heger +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### +import sys +import os +import re + + +from libc.errno cimport errno +from libc.string cimport strerror + +from cpython cimport array + +from cpython cimport PyErr_SetString, \ + PyBytes_Check, \ + PyUnicode_Check, \ + PyBytes_FromStringAndSize + +from pysam.libchtslib cimport \ + faidx_nseq, fai_load, fai_load3, fai_destroy, fai_fetch, \ + faidx_seq_len, faidx_iseq, faidx_seq_len, \ + faidx_fetch_seq, hisremote, \ + bgzf_open, bgzf_close + +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str +from pysam.libcutils cimport encode_filename, from_string_and_size +from pysam.libcutils cimport qualitystring_to_array, parse_region + +cdef class FastqProxy +cdef makeFastqProxy(kseq_t * src): + '''enter src into AlignedRead.''' + cdef FastqProxy dest = FastqProxy.__new__(FastqProxy) + dest._delegate = src + return dest + +## TODO: +## add automatic indexing. +## add function to get sequence names. +cdef class FastaFile: + """Random access to fasta formatted files that + have been indexed by :term:`faidx`. + + The file is automatically opened. The index file of file + ``<filename>`` is expected to be called ``<filename>.fai``. + + Parameters + ---------- + + filename : string + Filename of fasta file to be opened. + + filepath_index : string + Optional, filename of the index. By default this is + the filename + ".fai". + + filepath_index_compressed : string + Optional, filename of the index if fasta file is. By default this is + the filename + ".gzi". + + Raises + ------ + + ValueError + if index file is missing + + IOError + if file could not be opened + + """ + + def __cinit__(self, *args, **kwargs): + self.fastafile = NULL + self._filename = None + self._references = None + self._lengths = None + self.reference2length = None + self._open(*args, **kwargs) + + def is_open(self): + '''return true if samfile has been opened.''' + return self.fastafile != NULL + + def __len__(self): + if self.fastafile == NULL: + raise ValueError("calling len() on closed file") + + return faidx_nseq(self.fastafile) + + def _open(self, filename, filepath_index=None, filepath_index_compressed=None): + '''open an indexed fasta file. + + This method expects an indexed fasta file. + ''' + + # close a previously opened file + if self.fastafile != NULL: + self.close() + + self._filename = encode_filename(filename) + cdef char *cfilename = self._filename + cdef char *cindexname = NULL + cdef char *cindexname_compressed = NULL + self.is_remote = hisremote(cfilename) + + # open file for reading + if (self._filename != b"-" + and not self.is_remote + and not os.path.exists(filename)): + raise IOError("file `%s` not found" % filename) + + # 3 modes to open: + # compressed fa: fai_load3 with filename, index_fai and index_gzi + # uncompressed fa: fai_load3 with filename and index_fai + # uncompressed fa: fai_load with default index name + if filepath_index: + # when opening, set flags to 0 - do not automatically + # build index if it does not exist. + + if not os.path.exists(filepath_index): + raise IOError("filename {} does not exist".format(filepath_index)) + cindexname = bindex_filename = encode_filename(filepath_index) + + if filepath_index_compressed: + if not os.path.exists(filepath_index_compressed): + raise IOError("filename {} does not exist".format(filepath_index_compressed)) + cindexname_compressed = bindex_filename_compressed = encode_filename(filepath_index_compressed) + with nogil: + self.fastafile = fai_load3(cfilename, cindexname, cindexname_compressed, 0) + else: + with nogil: + self.fastafile = fai_load3(cfilename, cindexname, NULL, 0) + else: + with nogil: + self.fastafile = fai_load(cfilename) + + if self.fastafile == NULL: + raise IOError("error when opening file `%s`" % filename) + + cdef int nreferences = faidx_nseq(self.fastafile) + cdef int x + cdef const char * s + self._references = [] + self._lengths = [] + for x from 0 <= x < nreferences: + s = faidx_iseq(self.fastafile, x) + ss = force_str(s) + self._references.append(ss) + self._lengths.append(faidx_seq_len(self.fastafile, s)) + self.reference2length = dict(zip(self._references, self._lengths)) + + def close(self): + """close the file.""" + if self.fastafile != NULL: + fai_destroy(self.fastafile) + self.fastafile = NULL + + def __dealloc__(self): + if self.fastafile != NULL: + fai_destroy(self.fastafile) + self.fastafile = NULL + + # context manager interface + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + return False + + property closed: + """bool indicating the current state of the file object. + This is a read-only attribute; the close() method changes the value. + """ + def __get__(self): + return not self.is_open() + + property filename: + """filename associated with this object. This is a read-only attribute.""" + def __get__(self): + return self._filename + + property references: + '''tuple with the names of :term:`reference` sequences.''' + def __get__(self): + return self._references + + property nreferences: + """int with the number of :term:`reference` sequences in the file. + This is a read-only attribute.""" + def __get__(self): + return len(self._references) if self.references else None + + property lengths: + """tuple with the lengths of :term:`reference` sequences.""" + def __get__(self): + return self._lengths + + def fetch(self, + reference=None, + start=None, + end=None, + region=None): + """fetch sequences in a :term:`region`. + + A region can + either be specified by :term:`reference`, `start` and + `end`. `start` and `end` denote 0-based, half-open + intervals. + + Alternatively, a samtools :term:`region` string can be + supplied. + + If any of the coordinates are missing they will be replaced by the + minimum (`start`) or maximum (`end`) coordinate. + + Note that region strings are 1-based, while `start` and `end` denote + an interval in python coordinates. + The region is specified by :term:`reference`, `start` and `end`. + + Returns + ------- + + string : a string with the sequence specified by the region. + + Raises + ------ + + IndexError + if the coordinates are out of range + + ValueError + if the region is invalid + + """ + + if not self.is_open(): + raise ValueError("I/O operation on closed file" ) + + cdef int length + cdef char *seq + cdef char *ref + cdef int rstart, rend + + contig, rstart, rend = parse_region(reference, start, end, region) + + if contig is None: + raise ValueError("no sequence/region supplied.") + + if rstart == rend: + return "" + + contig_b = force_bytes(contig) + ref = contig_b + with nogil: + length = faidx_seq_len(self.fastafile, ref) + if length == -1: + raise KeyError("sequence '%s' not present" % contig) + if rstart >= length: + return "" + + # fai_fetch adds a '\0' at the end + with nogil: + seq = faidx_fetch_seq(self.fastafile, + ref, + rstart, + rend-1, + &length) + + if not seq: + if errno: + raise IOError(errno, strerror(errno)) + else: + raise ValueError("failure when retrieving sequence on '%s'" % contig) + + try: + return charptr_to_str(seq) + finally: + free(seq) + + cdef char *_fetch(self, char *reference, int start, int end, int *length) except? NULL: + '''fetch sequence for reference, start and end''' + + cdef char *seq + with nogil: + seq = faidx_fetch_seq(self.fastafile, + reference, + start, + end-1, + length) + + if not seq: + if errno: + raise IOError(errno, strerror(errno)) + else: + raise ValueError("failure when retrieving sequence on '%s'" % reference) + + return seq + + def get_reference_length(self, reference): + '''return the length of reference.''' + return self.reference2length[reference] + + def __getitem__(self, reference): + return self.fetch(reference) + + def __contains__(self, reference): + '''return true if reference in fasta file.''' + return reference in self.reference2length + + +cdef class FastqProxy: + """A single entry in a fastq file.""" + def __init__(self): + raise ValueError("do not instantiate FastqProxy directly") + + property name: + """The name of each entry in the fastq file.""" + def __get__(self): + return charptr_to_str(self._delegate.name.s) + + property sequence: + """The sequence of each entry in the fastq file.""" + def __get__(self): + return charptr_to_str(self._delegate.seq.s) + + property comment: + def __get__(self): + if self._delegate.comment.l: + return charptr_to_str(self._delegate.comment.s) + else: + return None + + property quality: + """The quality score of each entry in the fastq file, represented as a string.""" + def __get__(self): + if self._delegate.qual.l: + return charptr_to_str(self._delegate.qual.s) + else: + return None + + cdef cython.str to_string(self): + if self.comment is None: + comment = "" + else: + comment = " %s" % self.comment + + if self.quality is None: + return ">%s%s\n%s" % (self.name, comment, self.sequence) + else: + return "@%s%s\n%s\n+\n%s" % (self.name, comment, + self.sequence, self.quality) + + cdef cython.str tostring(self): + """deprecated : use :meth:`to_string`""" + return self.to_string() + + def __str__(self): + return self.to_string() + + cpdef array.array get_quality_array(self, int offset=33): + '''return quality values as integer array after subtracting offset.''' + if self.quality is None: + return None + return qualitystring_to_array(force_bytes(self.quality), + offset=offset) + +cdef class FastxRecord: + """A fasta/fastq record. + + A record must contain a name and a sequence. If either of them are + None, a ValueError is raised on writing. + + """ + def __init__(self, + name=None, + comment=None, + sequence=None, + quality=None, + FastqProxy proxy=None): + if proxy is not None: + self.comment = proxy.comment + self.quality = proxy.quality + self.sequence = proxy.sequence + self.name = proxy.name + else: + self.comment = comment + self.quality = quality + self.sequence = sequence + self.name = name + + def __copy__(self): + return FastxRecord(self.name, self.comment, self.sequence, self.quality) + + def __deepcopy__(self, memo): + return FastxRecord(self.name, self.comment, self.sequence, self.quality) + + cdef cython.str to_string(self): + if self.name is None: + raise ValueError("can not write record without name") + + if self.sequence is None: + raise ValueError("can not write record without a sequence") + + if self.comment is None: + comment = "" + else: + comment = " %s" % self.comment + + if self.quality is None: + return ">%s%s\n%s" % (self.name, comment, self.sequence) + else: + return "@%s%s\n%s\n+\n%s" % (self.name, comment, + self.sequence, self.quality) + + cdef cython.str tostring(self): + """deprecated : use :meth:`to_string`""" + return self.to_string() + + def set_name(self, name): + if name is None: + raise ValueError("FastxRecord must have a name and not None") + self.name = name + + def set_comment(self, comment): + self.comment = comment + + def set_sequence(self, sequence, quality=None): + """set sequence of this record. + + """ + self.sequence = sequence + if quality is not None: + if len(sequence) != len(quality): + raise ValueError("sequence and quality length do not match: {} vs {}".format( + len(sequence), len(quality))) + + self.quality = quality + else: + self.quality = None + + def __str__(self): + return self.to_string() + + cpdef array.array get_quality_array(self, int offset=33): + '''return quality values as array after subtracting offset.''' + if self.quality is None: + return None + return qualitystring_to_array(force_bytes(self.quality), + offset=offset) + + +cdef class FastxFile: + r"""Stream access to :term:`fasta` or :term:`fastq` formatted files. + + The file is automatically opened. + + Entries in the file can be both fastq or fasta formatted or even a + mixture of the two. + + This file object permits iterating over all entries in the + file. Random access is not implemented. The iteration returns + objects of type :class:`FastqProxy` + + Parameters + ---------- + + filename : string + Filename of fasta/fastq file to be opened. + + persist : bool + + If True (default) make a copy of the entry in the file during + iteration. If set to False, no copy will be made. This will + permit much faster iteration, but an entry will not persist + when the iteration continues and an entry is read-only. + + Notes + ----- + Prior to version 0.8.2, this class was called FastqFile. + + Raises + ------ + + IOError + if file could not be opened + + + Examples + -------- + >>> with pysam.FastxFile(filename) as fh: + ... for entry in fh: + ... print(entry.name) + ... print(entry.sequence) + ... print(entry.comment) + ... print(entry.quality) + >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout: + ... for entry in fin: + ... fout.write(str(entry) + '\n') + + """ + def __cinit__(self, *args, **kwargs): + # self.fastqfile = <gzFile*>NULL + self._filename = None + self.entry = NULL + self._open(*args, **kwargs) + + def is_open(self): + '''return true if samfile has been opened.''' + return self.entry != NULL + + def _open(self, filename, persist=True): + '''open a fastq/fasta file in *filename* + + Paramentes + ---------- + + persist : bool + + if True return a copy of the underlying data (default + True). The copy will persist even if the iteration + on the file continues. + + ''' + if self.fastqfile != NULL: + self.close() + + self._filename = encode_filename(filename) + cdef char *cfilename = self._filename + self.is_remote = hisremote(cfilename) + + # open file for reading + if (self._filename != b"-" + and not self.is_remote + and not os.path.exists(filename)): + raise IOError("file `%s` not found" % filename) + + self.persist = persist + + with nogil: + self.fastqfile = bgzf_open(cfilename, "r") + self.entry = kseq_init(self.fastqfile) + self._filename = filename + + def close(self): + '''close the file.''' + if self.fastqfile != NULL: + bgzf_close(self.fastqfile) + self.fastqfile = NULL + if self.entry != NULL: + kseq_destroy(self.entry) + self.entry = NULL + + def __dealloc__(self): + if self.fastqfile != NULL: + bgzf_close(self.fastqfile) + if self.entry: + kseq_destroy(self.entry) + + # context manager interface + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + return False + + property closed: + """bool indicating the current state of the file object. + This is a read-only attribute; the close() method changes the value. + """ + def __get__(self): + return not self.is_open() + + property filename: + """string with the filename associated with this object.""" + def __get__(self): + return self._filename + + def __iter__(self): + if not self.is_open(): + raise ValueError("I/O operation on closed file") + return self + + cdef kseq_t * getCurrent(self): + return self.entry + + cdef int cnext(self): + '''C version of iterator + ''' + with nogil: + return kseq_read(self.entry) + + def __next__(self): + """ + python version of next(). + """ + cdef int l + with nogil: + l = kseq_read(self.entry) + if (l >= 0): + if self.persist: + return FastxRecord(proxy=makeFastqProxy(self.entry)) + return makeFastqProxy(self.entry) + elif (l == -1): + raise StopIteration + elif (l == -2): + raise ValueError('truncated quality string in {0}' + .format(self._filename)) + else: + raise ValueError('unknown problem parsing {0}' + .format(self._filename)) + +# Compatibility Layer for pysam 0.8.1 +cdef class FastqFile(FastxFile): + """FastqFile is deprecated: use FastxFile instead""" + pass + +# Compatibility Layer for pysam < 0.8 +cdef class Fastafile(FastaFile): + """Fastafile is deprecated: use FastaFile instead""" + pass + +__all__ = ["FastaFile", + "FastqFile", + "FastxFile", + "Fastafile", + "FastxRecord", + "FastqProxy"]