annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcfaidx.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # cython: embedsignature=True
jpayne@68 2 # cython: profile=True
jpayne@68 3 ###############################################################################
jpayne@68 4 ###############################################################################
jpayne@68 5 # Cython wrapper for SAM/BAM/CRAM files based on htslib
jpayne@68 6 ###############################################################################
jpayne@68 7 # The principal classes defined in this module are:
jpayne@68 8 #
jpayne@68 9 # class FastaFile random read read/write access to faidx indexd files
jpayne@68 10 # class FastxFile streamed read/write access to fasta/fastq files
jpayne@68 11 #
jpayne@68 12 # Additionally this module defines several additional classes that are part
jpayne@68 13 # of the internal API. These are:
jpayne@68 14 #
jpayne@68 15 # class FastqProxy
jpayne@68 16 # class FastxRecord
jpayne@68 17 #
jpayne@68 18 # For backwards compatibility, the following classes are also defined:
jpayne@68 19 #
jpayne@68 20 # class Fastafile equivalent to FastaFile
jpayne@68 21 # class FastqFile equivalent to FastxFile
jpayne@68 22 #
jpayne@68 23 ###############################################################################
jpayne@68 24 #
jpayne@68 25 # The MIT License
jpayne@68 26 #
jpayne@68 27 # Copyright (c) 2015 Andreas Heger
jpayne@68 28 #
jpayne@68 29 # Permission is hereby granted, free of charge, to any person obtaining a
jpayne@68 30 # copy of this software and associated documentation files (the "Software"),
jpayne@68 31 # to deal in the Software without restriction, including without limitation
jpayne@68 32 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jpayne@68 33 # and/or sell copies of the Software, and to permit persons to whom the
jpayne@68 34 # Software is furnished to do so, subject to the following conditions:
jpayne@68 35 #
jpayne@68 36 # The above copyright notice and this permission notice shall be included in
jpayne@68 37 # all copies or substantial portions of the Software.
jpayne@68 38 #
jpayne@68 39 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@68 40 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@68 41 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
jpayne@68 42 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@68 43 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jpayne@68 44 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jpayne@68 45 # DEALINGS IN THE SOFTWARE.
jpayne@68 46 #
jpayne@68 47 ###############################################################################
jpayne@68 48 import sys
jpayne@68 49 import os
jpayne@68 50 import re
jpayne@68 51
jpayne@68 52
jpayne@68 53 from libc.errno cimport errno
jpayne@68 54 from libc.string cimport strerror
jpayne@68 55
jpayne@68 56 from cpython cimport array
jpayne@68 57
jpayne@68 58 from cpython cimport PyErr_SetString, \
jpayne@68 59 PyBytes_Check, \
jpayne@68 60 PyUnicode_Check, \
jpayne@68 61 PyBytes_FromStringAndSize
jpayne@68 62
jpayne@68 63 from pysam.libchtslib cimport \
jpayne@68 64 faidx_nseq, fai_load, fai_load3, fai_destroy, fai_fetch, \
jpayne@68 65 faidx_seq_len, faidx_iseq, faidx_seq_len, \
jpayne@68 66 faidx_fetch_seq, hisremote, \
jpayne@68 67 bgzf_open, bgzf_close
jpayne@68 68
jpayne@68 69 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
jpayne@68 70 from pysam.libcutils cimport encode_filename, from_string_and_size
jpayne@68 71 from pysam.libcutils cimport qualitystring_to_array, parse_region
jpayne@68 72
jpayne@68 73 cdef class FastqProxy
jpayne@68 74 cdef makeFastqProxy(kseq_t * src):
jpayne@68 75 '''enter src into AlignedRead.'''
jpayne@68 76 cdef FastqProxy dest = FastqProxy.__new__(FastqProxy)
jpayne@68 77 dest._delegate = src
jpayne@68 78 return dest
jpayne@68 79
jpayne@68 80 ## TODO:
jpayne@68 81 ## add automatic indexing.
jpayne@68 82 ## add function to get sequence names.
jpayne@68 83 cdef class FastaFile:
jpayne@68 84 """Random access to fasta formatted files that
jpayne@68 85 have been indexed by :term:`faidx`.
jpayne@68 86
jpayne@68 87 The file is automatically opened. The index file of file
jpayne@68 88 ``<filename>`` is expected to be called ``<filename>.fai``.
jpayne@68 89
jpayne@68 90 Parameters
jpayne@68 91 ----------
jpayne@68 92
jpayne@68 93 filename : string
jpayne@68 94 Filename of fasta file to be opened.
jpayne@68 95
jpayne@68 96 filepath_index : string
jpayne@68 97 Optional, filename of the index. By default this is
jpayne@68 98 the filename + ".fai".
jpayne@68 99
jpayne@68 100 filepath_index_compressed : string
jpayne@68 101 Optional, filename of the index if fasta file is. By default this is
jpayne@68 102 the filename + ".gzi".
jpayne@68 103
jpayne@68 104 Raises
jpayne@68 105 ------
jpayne@68 106
jpayne@68 107 ValueError
jpayne@68 108 if index file is missing
jpayne@68 109
jpayne@68 110 IOError
jpayne@68 111 if file could not be opened
jpayne@68 112
jpayne@68 113 """
jpayne@68 114
jpayne@68 115 def __cinit__(self, *args, **kwargs):
jpayne@68 116 self.fastafile = NULL
jpayne@68 117 self._filename = None
jpayne@68 118 self._references = None
jpayne@68 119 self._lengths = None
jpayne@68 120 self.reference2length = None
jpayne@68 121 self._open(*args, **kwargs)
jpayne@68 122
jpayne@68 123 def is_open(self):
jpayne@68 124 '''return true if samfile has been opened.'''
jpayne@68 125 return self.fastafile != NULL
jpayne@68 126
jpayne@68 127 def __len__(self):
jpayne@68 128 if self.fastafile == NULL:
jpayne@68 129 raise ValueError("calling len() on closed file")
jpayne@68 130
jpayne@68 131 return faidx_nseq(self.fastafile)
jpayne@68 132
jpayne@68 133 def _open(self, filename, filepath_index=None, filepath_index_compressed=None):
jpayne@68 134 '''open an indexed fasta file.
jpayne@68 135
jpayne@68 136 This method expects an indexed fasta file.
jpayne@68 137 '''
jpayne@68 138
jpayne@68 139 # close a previously opened file
jpayne@68 140 if self.fastafile != NULL:
jpayne@68 141 self.close()
jpayne@68 142
jpayne@68 143 self._filename = encode_filename(filename)
jpayne@68 144 cdef char *cfilename = self._filename
jpayne@68 145 cdef char *cindexname = NULL
jpayne@68 146 cdef char *cindexname_compressed = NULL
jpayne@68 147 self.is_remote = hisremote(cfilename)
jpayne@68 148
jpayne@68 149 # open file for reading
jpayne@68 150 if (self._filename != b"-"
jpayne@68 151 and not self.is_remote
jpayne@68 152 and not os.path.exists(filename)):
jpayne@68 153 raise IOError("file `%s` not found" % filename)
jpayne@68 154
jpayne@68 155 # 3 modes to open:
jpayne@68 156 # compressed fa: fai_load3 with filename, index_fai and index_gzi
jpayne@68 157 # uncompressed fa: fai_load3 with filename and index_fai
jpayne@68 158 # uncompressed fa: fai_load with default index name
jpayne@68 159 if filepath_index:
jpayne@68 160 # when opening, set flags to 0 - do not automatically
jpayne@68 161 # build index if it does not exist.
jpayne@68 162
jpayne@68 163 if not os.path.exists(filepath_index):
jpayne@68 164 raise IOError("filename {} does not exist".format(filepath_index))
jpayne@68 165 cindexname = bindex_filename = encode_filename(filepath_index)
jpayne@68 166
jpayne@68 167 if filepath_index_compressed:
jpayne@68 168 if not os.path.exists(filepath_index_compressed):
jpayne@68 169 raise IOError("filename {} does not exist".format(filepath_index_compressed))
jpayne@68 170 cindexname_compressed = bindex_filename_compressed = encode_filename(filepath_index_compressed)
jpayne@68 171 with nogil:
jpayne@68 172 self.fastafile = fai_load3(cfilename, cindexname, cindexname_compressed, 0)
jpayne@68 173 else:
jpayne@68 174 with nogil:
jpayne@68 175 self.fastafile = fai_load3(cfilename, cindexname, NULL, 0)
jpayne@68 176 else:
jpayne@68 177 with nogil:
jpayne@68 178 self.fastafile = fai_load(cfilename)
jpayne@68 179
jpayne@68 180 if self.fastafile == NULL:
jpayne@68 181 raise IOError("error when opening file `%s`" % filename)
jpayne@68 182
jpayne@68 183 cdef int nreferences = faidx_nseq(self.fastafile)
jpayne@68 184 cdef int x
jpayne@68 185 cdef const char * s
jpayne@68 186 self._references = []
jpayne@68 187 self._lengths = []
jpayne@68 188 for x from 0 <= x < nreferences:
jpayne@68 189 s = faidx_iseq(self.fastafile, x)
jpayne@68 190 ss = force_str(s)
jpayne@68 191 self._references.append(ss)
jpayne@68 192 self._lengths.append(faidx_seq_len(self.fastafile, s))
jpayne@68 193 self.reference2length = dict(zip(self._references, self._lengths))
jpayne@68 194
jpayne@68 195 def close(self):
jpayne@68 196 """close the file."""
jpayne@68 197 if self.fastafile != NULL:
jpayne@68 198 fai_destroy(self.fastafile)
jpayne@68 199 self.fastafile = NULL
jpayne@68 200
jpayne@68 201 def __dealloc__(self):
jpayne@68 202 if self.fastafile != NULL:
jpayne@68 203 fai_destroy(self.fastafile)
jpayne@68 204 self.fastafile = NULL
jpayne@68 205
jpayne@68 206 # context manager interface
jpayne@68 207 def __enter__(self):
jpayne@68 208 return self
jpayne@68 209
jpayne@68 210 def __exit__(self, exc_type, exc_value, traceback):
jpayne@68 211 self.close()
jpayne@68 212 return False
jpayne@68 213
jpayne@68 214 property closed:
jpayne@68 215 """bool indicating the current state of the file object.
jpayne@68 216 This is a read-only attribute; the close() method changes the value.
jpayne@68 217 """
jpayne@68 218 def __get__(self):
jpayne@68 219 return not self.is_open()
jpayne@68 220
jpayne@68 221 property filename:
jpayne@68 222 """filename associated with this object. This is a read-only attribute."""
jpayne@68 223 def __get__(self):
jpayne@68 224 return self._filename
jpayne@68 225
jpayne@68 226 property references:
jpayne@68 227 '''tuple with the names of :term:`reference` sequences.'''
jpayne@68 228 def __get__(self):
jpayne@68 229 return self._references
jpayne@68 230
jpayne@68 231 property nreferences:
jpayne@68 232 """int with the number of :term:`reference` sequences in the file.
jpayne@68 233 This is a read-only attribute."""
jpayne@68 234 def __get__(self):
jpayne@68 235 return len(self._references) if self.references else None
jpayne@68 236
jpayne@68 237 property lengths:
jpayne@68 238 """tuple with the lengths of :term:`reference` sequences."""
jpayne@68 239 def __get__(self):
jpayne@68 240 return self._lengths
jpayne@68 241
jpayne@68 242 def fetch(self,
jpayne@68 243 reference=None,
jpayne@68 244 start=None,
jpayne@68 245 end=None,
jpayne@68 246 region=None):
jpayne@68 247 """fetch sequences in a :term:`region`.
jpayne@68 248
jpayne@68 249 A region can
jpayne@68 250 either be specified by :term:`reference`, `start` and
jpayne@68 251 `end`. `start` and `end` denote 0-based, half-open
jpayne@68 252 intervals.
jpayne@68 253
jpayne@68 254 Alternatively, a samtools :term:`region` string can be
jpayne@68 255 supplied.
jpayne@68 256
jpayne@68 257 If any of the coordinates are missing they will be replaced by the
jpayne@68 258 minimum (`start`) or maximum (`end`) coordinate.
jpayne@68 259
jpayne@68 260 Note that region strings are 1-based, while `start` and `end` denote
jpayne@68 261 an interval in python coordinates.
jpayne@68 262 The region is specified by :term:`reference`, `start` and `end`.
jpayne@68 263
jpayne@68 264 Returns
jpayne@68 265 -------
jpayne@68 266
jpayne@68 267 string : a string with the sequence specified by the region.
jpayne@68 268
jpayne@68 269 Raises
jpayne@68 270 ------
jpayne@68 271
jpayne@68 272 IndexError
jpayne@68 273 if the coordinates are out of range
jpayne@68 274
jpayne@68 275 ValueError
jpayne@68 276 if the region is invalid
jpayne@68 277
jpayne@68 278 """
jpayne@68 279
jpayne@68 280 if not self.is_open():
jpayne@68 281 raise ValueError("I/O operation on closed file" )
jpayne@68 282
jpayne@68 283 cdef int length
jpayne@68 284 cdef char *seq
jpayne@68 285 cdef char *ref
jpayne@68 286 cdef int rstart, rend
jpayne@68 287
jpayne@68 288 contig, rstart, rend = parse_region(reference, start, end, region)
jpayne@68 289
jpayne@68 290 if contig is None:
jpayne@68 291 raise ValueError("no sequence/region supplied.")
jpayne@68 292
jpayne@68 293 if rstart == rend:
jpayne@68 294 return ""
jpayne@68 295
jpayne@68 296 contig_b = force_bytes(contig)
jpayne@68 297 ref = contig_b
jpayne@68 298 with nogil:
jpayne@68 299 length = faidx_seq_len(self.fastafile, ref)
jpayne@68 300 if length == -1:
jpayne@68 301 raise KeyError("sequence '%s' not present" % contig)
jpayne@68 302 if rstart >= length:
jpayne@68 303 return ""
jpayne@68 304
jpayne@68 305 # fai_fetch adds a '\0' at the end
jpayne@68 306 with nogil:
jpayne@68 307 seq = faidx_fetch_seq(self.fastafile,
jpayne@68 308 ref,
jpayne@68 309 rstart,
jpayne@68 310 rend-1,
jpayne@68 311 &length)
jpayne@68 312
jpayne@68 313 if not seq:
jpayne@68 314 if errno:
jpayne@68 315 raise IOError(errno, strerror(errno))
jpayne@68 316 else:
jpayne@68 317 raise ValueError("failure when retrieving sequence on '%s'" % contig)
jpayne@68 318
jpayne@68 319 try:
jpayne@68 320 return charptr_to_str(seq)
jpayne@68 321 finally:
jpayne@68 322 free(seq)
jpayne@68 323
jpayne@68 324 cdef char *_fetch(self, char *reference, int start, int end, int *length) except? NULL:
jpayne@68 325 '''fetch sequence for reference, start and end'''
jpayne@68 326
jpayne@68 327 cdef char *seq
jpayne@68 328 with nogil:
jpayne@68 329 seq = faidx_fetch_seq(self.fastafile,
jpayne@68 330 reference,
jpayne@68 331 start,
jpayne@68 332 end-1,
jpayne@68 333 length)
jpayne@68 334
jpayne@68 335 if not seq:
jpayne@68 336 if errno:
jpayne@68 337 raise IOError(errno, strerror(errno))
jpayne@68 338 else:
jpayne@68 339 raise ValueError("failure when retrieving sequence on '%s'" % reference)
jpayne@68 340
jpayne@68 341 return seq
jpayne@68 342
jpayne@68 343 def get_reference_length(self, reference):
jpayne@68 344 '''return the length of reference.'''
jpayne@68 345 return self.reference2length[reference]
jpayne@68 346
jpayne@68 347 def __getitem__(self, reference):
jpayne@68 348 return self.fetch(reference)
jpayne@68 349
jpayne@68 350 def __contains__(self, reference):
jpayne@68 351 '''return true if reference in fasta file.'''
jpayne@68 352 return reference in self.reference2length
jpayne@68 353
jpayne@68 354
jpayne@68 355 cdef class FastqProxy:
jpayne@68 356 """A single entry in a fastq file."""
jpayne@68 357 def __init__(self):
jpayne@68 358 raise ValueError("do not instantiate FastqProxy directly")
jpayne@68 359
jpayne@68 360 property name:
jpayne@68 361 """The name of each entry in the fastq file."""
jpayne@68 362 def __get__(self):
jpayne@68 363 return charptr_to_str(self._delegate.name.s)
jpayne@68 364
jpayne@68 365 property sequence:
jpayne@68 366 """The sequence of each entry in the fastq file."""
jpayne@68 367 def __get__(self):
jpayne@68 368 return charptr_to_str(self._delegate.seq.s)
jpayne@68 369
jpayne@68 370 property comment:
jpayne@68 371 def __get__(self):
jpayne@68 372 if self._delegate.comment.l:
jpayne@68 373 return charptr_to_str(self._delegate.comment.s)
jpayne@68 374 else:
jpayne@68 375 return None
jpayne@68 376
jpayne@68 377 property quality:
jpayne@68 378 """The quality score of each entry in the fastq file, represented as a string."""
jpayne@68 379 def __get__(self):
jpayne@68 380 if self._delegate.qual.l:
jpayne@68 381 return charptr_to_str(self._delegate.qual.s)
jpayne@68 382 else:
jpayne@68 383 return None
jpayne@68 384
jpayne@68 385 cdef cython.str to_string(self):
jpayne@68 386 if self.comment is None:
jpayne@68 387 comment = ""
jpayne@68 388 else:
jpayne@68 389 comment = " %s" % self.comment
jpayne@68 390
jpayne@68 391 if self.quality is None:
jpayne@68 392 return ">%s%s\n%s" % (self.name, comment, self.sequence)
jpayne@68 393 else:
jpayne@68 394 return "@%s%s\n%s\n+\n%s" % (self.name, comment,
jpayne@68 395 self.sequence, self.quality)
jpayne@68 396
jpayne@68 397 cdef cython.str tostring(self):
jpayne@68 398 """deprecated : use :meth:`to_string`"""
jpayne@68 399 return self.to_string()
jpayne@68 400
jpayne@68 401 def __str__(self):
jpayne@68 402 return self.to_string()
jpayne@68 403
jpayne@68 404 cpdef array.array get_quality_array(self, int offset=33):
jpayne@68 405 '''return quality values as integer array after subtracting offset.'''
jpayne@68 406 if self.quality is None:
jpayne@68 407 return None
jpayne@68 408 return qualitystring_to_array(force_bytes(self.quality),
jpayne@68 409 offset=offset)
jpayne@68 410
jpayne@68 411 cdef class FastxRecord:
jpayne@68 412 """A fasta/fastq record.
jpayne@68 413
jpayne@68 414 A record must contain a name and a sequence. If either of them are
jpayne@68 415 None, a ValueError is raised on writing.
jpayne@68 416
jpayne@68 417 """
jpayne@68 418 def __init__(self,
jpayne@68 419 name=None,
jpayne@68 420 comment=None,
jpayne@68 421 sequence=None,
jpayne@68 422 quality=None,
jpayne@68 423 FastqProxy proxy=None):
jpayne@68 424 if proxy is not None:
jpayne@68 425 self.comment = proxy.comment
jpayne@68 426 self.quality = proxy.quality
jpayne@68 427 self.sequence = proxy.sequence
jpayne@68 428 self.name = proxy.name
jpayne@68 429 else:
jpayne@68 430 self.comment = comment
jpayne@68 431 self.quality = quality
jpayne@68 432 self.sequence = sequence
jpayne@68 433 self.name = name
jpayne@68 434
jpayne@68 435 def __copy__(self):
jpayne@68 436 return FastxRecord(self.name, self.comment, self.sequence, self.quality)
jpayne@68 437
jpayne@68 438 def __deepcopy__(self, memo):
jpayne@68 439 return FastxRecord(self.name, self.comment, self.sequence, self.quality)
jpayne@68 440
jpayne@68 441 cdef cython.str to_string(self):
jpayne@68 442 if self.name is None:
jpayne@68 443 raise ValueError("can not write record without name")
jpayne@68 444
jpayne@68 445 if self.sequence is None:
jpayne@68 446 raise ValueError("can not write record without a sequence")
jpayne@68 447
jpayne@68 448 if self.comment is None:
jpayne@68 449 comment = ""
jpayne@68 450 else:
jpayne@68 451 comment = " %s" % self.comment
jpayne@68 452
jpayne@68 453 if self.quality is None:
jpayne@68 454 return ">%s%s\n%s" % (self.name, comment, self.sequence)
jpayne@68 455 else:
jpayne@68 456 return "@%s%s\n%s\n+\n%s" % (self.name, comment,
jpayne@68 457 self.sequence, self.quality)
jpayne@68 458
jpayne@68 459 cdef cython.str tostring(self):
jpayne@68 460 """deprecated : use :meth:`to_string`"""
jpayne@68 461 return self.to_string()
jpayne@68 462
jpayne@68 463 def set_name(self, name):
jpayne@68 464 if name is None:
jpayne@68 465 raise ValueError("FastxRecord must have a name and not None")
jpayne@68 466 self.name = name
jpayne@68 467
jpayne@68 468 def set_comment(self, comment):
jpayne@68 469 self.comment = comment
jpayne@68 470
jpayne@68 471 def set_sequence(self, sequence, quality=None):
jpayne@68 472 """set sequence of this record.
jpayne@68 473
jpayne@68 474 """
jpayne@68 475 self.sequence = sequence
jpayne@68 476 if quality is not None:
jpayne@68 477 if len(sequence) != len(quality):
jpayne@68 478 raise ValueError("sequence and quality length do not match: {} vs {}".format(
jpayne@68 479 len(sequence), len(quality)))
jpayne@68 480
jpayne@68 481 self.quality = quality
jpayne@68 482 else:
jpayne@68 483 self.quality = None
jpayne@68 484
jpayne@68 485 def __str__(self):
jpayne@68 486 return self.to_string()
jpayne@68 487
jpayne@68 488 cpdef array.array get_quality_array(self, int offset=33):
jpayne@68 489 '''return quality values as array after subtracting offset.'''
jpayne@68 490 if self.quality is None:
jpayne@68 491 return None
jpayne@68 492 return qualitystring_to_array(force_bytes(self.quality),
jpayne@68 493 offset=offset)
jpayne@68 494
jpayne@68 495
jpayne@68 496 cdef class FastxFile:
jpayne@68 497 r"""Stream access to :term:`fasta` or :term:`fastq` formatted files.
jpayne@68 498
jpayne@68 499 The file is automatically opened.
jpayne@68 500
jpayne@68 501 Entries in the file can be both fastq or fasta formatted or even a
jpayne@68 502 mixture of the two.
jpayne@68 503
jpayne@68 504 This file object permits iterating over all entries in the
jpayne@68 505 file. Random access is not implemented. The iteration returns
jpayne@68 506 objects of type :class:`FastqProxy`
jpayne@68 507
jpayne@68 508 Parameters
jpayne@68 509 ----------
jpayne@68 510
jpayne@68 511 filename : string
jpayne@68 512 Filename of fasta/fastq file to be opened.
jpayne@68 513
jpayne@68 514 persist : bool
jpayne@68 515
jpayne@68 516 If True (default) make a copy of the entry in the file during
jpayne@68 517 iteration. If set to False, no copy will be made. This will
jpayne@68 518 permit much faster iteration, but an entry will not persist
jpayne@68 519 when the iteration continues and an entry is read-only.
jpayne@68 520
jpayne@68 521 Notes
jpayne@68 522 -----
jpayne@68 523 Prior to version 0.8.2, this class was called FastqFile.
jpayne@68 524
jpayne@68 525 Raises
jpayne@68 526 ------
jpayne@68 527
jpayne@68 528 IOError
jpayne@68 529 if file could not be opened
jpayne@68 530
jpayne@68 531
jpayne@68 532 Examples
jpayne@68 533 --------
jpayne@68 534 >>> with pysam.FastxFile(filename) as fh:
jpayne@68 535 ... for entry in fh:
jpayne@68 536 ... print(entry.name)
jpayne@68 537 ... print(entry.sequence)
jpayne@68 538 ... print(entry.comment)
jpayne@68 539 ... print(entry.quality)
jpayne@68 540 >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout:
jpayne@68 541 ... for entry in fin:
jpayne@68 542 ... fout.write(str(entry) + '\n')
jpayne@68 543
jpayne@68 544 """
jpayne@68 545 def __cinit__(self, *args, **kwargs):
jpayne@68 546 # self.fastqfile = <gzFile*>NULL
jpayne@68 547 self._filename = None
jpayne@68 548 self.entry = NULL
jpayne@68 549 self._open(*args, **kwargs)
jpayne@68 550
jpayne@68 551 def is_open(self):
jpayne@68 552 '''return true if samfile has been opened.'''
jpayne@68 553 return self.entry != NULL
jpayne@68 554
jpayne@68 555 def _open(self, filename, persist=True):
jpayne@68 556 '''open a fastq/fasta file in *filename*
jpayne@68 557
jpayne@68 558 Paramentes
jpayne@68 559 ----------
jpayne@68 560
jpayne@68 561 persist : bool
jpayne@68 562
jpayne@68 563 if True return a copy of the underlying data (default
jpayne@68 564 True). The copy will persist even if the iteration
jpayne@68 565 on the file continues.
jpayne@68 566
jpayne@68 567 '''
jpayne@68 568 if self.fastqfile != NULL:
jpayne@68 569 self.close()
jpayne@68 570
jpayne@68 571 self._filename = encode_filename(filename)
jpayne@68 572 cdef char *cfilename = self._filename
jpayne@68 573 self.is_remote = hisremote(cfilename)
jpayne@68 574
jpayne@68 575 # open file for reading
jpayne@68 576 if (self._filename != b"-"
jpayne@68 577 and not self.is_remote
jpayne@68 578 and not os.path.exists(filename)):
jpayne@68 579 raise IOError("file `%s` not found" % filename)
jpayne@68 580
jpayne@68 581 self.persist = persist
jpayne@68 582
jpayne@68 583 with nogil:
jpayne@68 584 self.fastqfile = bgzf_open(cfilename, "r")
jpayne@68 585 self.entry = kseq_init(self.fastqfile)
jpayne@68 586 self._filename = filename
jpayne@68 587
jpayne@68 588 def close(self):
jpayne@68 589 '''close the file.'''
jpayne@68 590 if self.fastqfile != NULL:
jpayne@68 591 bgzf_close(self.fastqfile)
jpayne@68 592 self.fastqfile = NULL
jpayne@68 593 if self.entry != NULL:
jpayne@68 594 kseq_destroy(self.entry)
jpayne@68 595 self.entry = NULL
jpayne@68 596
jpayne@68 597 def __dealloc__(self):
jpayne@68 598 if self.fastqfile != NULL:
jpayne@68 599 bgzf_close(self.fastqfile)
jpayne@68 600 if self.entry:
jpayne@68 601 kseq_destroy(self.entry)
jpayne@68 602
jpayne@68 603 # context manager interface
jpayne@68 604 def __enter__(self):
jpayne@68 605 return self
jpayne@68 606
jpayne@68 607 def __exit__(self, exc_type, exc_value, traceback):
jpayne@68 608 self.close()
jpayne@68 609 return False
jpayne@68 610
jpayne@68 611 property closed:
jpayne@68 612 """bool indicating the current state of the file object.
jpayne@68 613 This is a read-only attribute; the close() method changes the value.
jpayne@68 614 """
jpayne@68 615 def __get__(self):
jpayne@68 616 return not self.is_open()
jpayne@68 617
jpayne@68 618 property filename:
jpayne@68 619 """string with the filename associated with this object."""
jpayne@68 620 def __get__(self):
jpayne@68 621 return self._filename
jpayne@68 622
jpayne@68 623 def __iter__(self):
jpayne@68 624 if not self.is_open():
jpayne@68 625 raise ValueError("I/O operation on closed file")
jpayne@68 626 return self
jpayne@68 627
jpayne@68 628 cdef kseq_t * getCurrent(self):
jpayne@68 629 return self.entry
jpayne@68 630
jpayne@68 631 cdef int cnext(self):
jpayne@68 632 '''C version of iterator
jpayne@68 633 '''
jpayne@68 634 with nogil:
jpayne@68 635 return kseq_read(self.entry)
jpayne@68 636
jpayne@68 637 def __next__(self):
jpayne@68 638 """
jpayne@68 639 python version of next().
jpayne@68 640 """
jpayne@68 641 cdef int l
jpayne@68 642 with nogil:
jpayne@68 643 l = kseq_read(self.entry)
jpayne@68 644 if (l >= 0):
jpayne@68 645 if self.persist:
jpayne@68 646 return FastxRecord(proxy=makeFastqProxy(self.entry))
jpayne@68 647 return makeFastqProxy(self.entry)
jpayne@68 648 elif (l == -1):
jpayne@68 649 raise StopIteration
jpayne@68 650 elif (l == -2):
jpayne@68 651 raise ValueError('truncated quality string in {0}'
jpayne@68 652 .format(self._filename))
jpayne@68 653 else:
jpayne@68 654 raise ValueError('unknown problem parsing {0}'
jpayne@68 655 .format(self._filename))
jpayne@68 656
jpayne@68 657 # Compatibility Layer for pysam 0.8.1
jpayne@68 658 cdef class FastqFile(FastxFile):
jpayne@68 659 """FastqFile is deprecated: use FastxFile instead"""
jpayne@68 660 pass
jpayne@68 661
jpayne@68 662 # Compatibility Layer for pysam < 0.8
jpayne@68 663 cdef class Fastafile(FastaFile):
jpayne@68 664 """Fastafile is deprecated: use FastaFile instead"""
jpayne@68 665 pass
jpayne@68 666
jpayne@68 667 __all__ = ["FastaFile",
jpayne@68 668 "FastqFile",
jpayne@68 669 "FastxFile",
jpayne@68 670 "Fastafile",
jpayne@68 671 "FastxRecord",
jpayne@68 672 "FastqProxy"]