# cython: language_level=3
"""Functions that read and write block gzipped files.

The user of the file doesn't have to worry about the compression
and random access is allowed if an index file is present."""

# based on Python 3.5's gzip module

import errno
import io

from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
from libc.stdio cimport SEEK_SET
from libc.stdlib cimport malloc, calloc, realloc, free

from cpython.object cimport PyObject
from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize

from pysam.libcutils cimport force_bytes, encode_filename
from pysam.libchtslib cimport bgzf_open, bgzf_index_build_init, bgzf_write, bgzf_read, \
                              bgzf_flush, bgzf_index_dump, bgzf_close, bgzf_seek, \
                              bgzf_tell, bgzf_getline, kstring_t, BGZF

__all__ = ["BGZFile"]


BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE


cdef object _read_line(BGZF *fp):
    """Read one newline-delimited line from *fp*.

    Returns the line as bytes (without the delimiter, which
    bgzf_getline() does not store), or None at end of file.  Keeping
    "empty line" (b'') and "end of file" (None) apart lets iteration
    continue past blank lines.

    Raises IOError if bgzf_getline() reports an error.
    """
    cdef kstring_t line
    cdef int ret

    line.l = line.m = 0
    line.s = NULL

    try:
        ret = bgzf_getline(fp, b'\n', &line)
        if ret == -1:
            return None
        if ret < -1:
            # every value below -1 is an error code, not only -2
            raise IOError('Error reading line in BGZFFile object')
        return line.s[:line.l]
    finally:
        # free(NULL) is a no-op, so this is safe even if nothing was allocated
        free(line.s)


cdef class BGZFile(object):
    """The BGZFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.
    """
    def __init__(self, filename, mode=None, index=None):
        """Constructor for the BGZFile class.

        filename is the path of the file to open.

        mode defaults to 'rb'.  A 'b' is appended when it is missing, so
        'r' is equivalent to 'rb', and similarly for 'w' and 'wb', 'a'
        and 'ab', and 'x' and 'xb'.  Text modes ('t' or 'U') are rejected
        with ValueError.  The resulting mode string is handed to
        bgzf_open() unchanged.

        index, if given, is the filename the index is dumped to when the
        file is closed; for files opened for writing, index building is
        initialised here.

        Raises IOError if the file cannot be opened or index building
        cannot be initialised.
        """
        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if not mode:
            mode = 'rb'
        elif 'b' not in mode:
            mode += 'b'

        mode = force_bytes(mode)

        self.name = encode_filename(filename)
        self.index = encode_filename(index) if index is not None else None

        self.bgzf = bgzf_open(self.name, mode)
        if self.bgzf == NULL:
            # without this check the is_write access below dereferences NULL
            raise IOError('Unable to open BGZFile {!r}'.format(filename))

        if self.bgzf.is_write and index is not None and bgzf_index_build_init(self.bgzf) < 0:
            raise IOError('Error building bgzf index')

    def __dealloc__(self):
        self.close()

    def write(self, data):
        """Write data to the file and return the number of bytes accepted.

        data may be bytes or any object supporting the buffer protocol.

        Raises ValueError if the file is closed and IOError if the file
        is not open for writing or the write fails.
        """
        if not self.bgzf:
            raise ValueError("write() on closed BGZFile object")

        if not self.bgzf.is_write:
            raise IOError(errno.EBADF, "write() on read-only BGZFile object")

        if not isinstance(data, bytes):
            # accept any data that supports the buffer protocol; copy it
            # into a bytes object so that a C pointer can be taken from it
            data = memoryview(data).tobytes()
        length = len(data)

        if length > 0 and bgzf_write(self.bgzf, <char *>data, length) < 0:
            raise IOError('BGZFile write failed')

        return length

    def read(self, size=-1):
        """Read and return up to size bytes.

        If size is negative or None, read in BUFFER_SIZE chunks until
        bgzf_read() returns no more data and return everything.  A size
        of 0 returns b''.

        Raises ValueError if the file is closed and IOError if the file
        is open for writing or the read fails.
        """
        cdef ssize_t read_size

        if not self.bgzf:
            raise ValueError("read() on closed BGZFile object")

        if self.bgzf.is_write:
            raise IOError(errno.EBADF, "read() on write-only BGZFile object")

        if size is None:
            size = -1

        if size < 0:
            chunks = []
            while True:
                # uninitialised bytes object that bgzf_read() fills in place
                chunk = PyBytes_FromStringAndSize(NULL, BUFFER_SIZE)
                read_size = bgzf_read(self.bgzf, <char *>chunk, BUFFER_SIZE)
                if read_size < 0:
                    raise IOError('Error reading from BGZFile')
                elif not read_size:
                    break
                elif read_size < BUFFER_SIZE:
                    chunk = chunk[:read_size]
                chunks.append(chunk)
            return b''.join(chunks)

        elif size > 0:
            chunk = PyBytes_FromStringAndSize(NULL, size)
            read_size = bgzf_read(self.bgzf, <char *>chunk, size)
            if read_size < 0:
                raise IOError('Error reading from BGZFile')
            elif read_size < size:
                chunk = chunk[:read_size]
            return chunk
        else:
            return b''

    @property
    def closed(self):
        """True once the underlying BGZF handle has been released."""
        return self.bgzf == NULL

    def close(self):
        """Flush pending output, dump the index if one was requested and
        close the file.  Calling close() on a closed file does nothing.

        Raises IOError if flushing, writing the index or closing fails.
        """
        cdef int ret

        if not self.bgzf:
            return

        if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0:
            raise IOError('Error flushing BGZFile object')

        if self.index and bgzf_index_dump(self.bgzf, self.index, NULL) < 0:
            raise IOError('Cannot write index')

        ret = bgzf_close(self.bgzf)
        # drop the handle even when bgzf_close() reports an error
        self.bgzf = NULL

        if ret < 0:
            raise IOError('Error closing BGZFile object')

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.close()

    def flush(self):
        """Flush pending output; a no-op for closed or read-only files.

        Raises IOError if the flush fails.
        """
        if not self.bgzf:
            return

        if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0:
            raise IOError('Error flushing BGZFile object')

    def fileno(self):
        """Always raise AttributeError: no file descriptor is exposed."""
        raise AttributeError('fileno')

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file.

        Raises ValueError if the file is closed and IOError if the file
        is open for writing or the seek fails.
        '''
        if not self.bgzf:
            raise ValueError("rewind() on closed BGZFile object")
        if self.bgzf.is_write:
            raise IOError("Can't rewind in write mode")
        if bgzf_seek(self.bgzf, 0, SEEK_SET) < 0:
            raise IOError('Error seeking BGZFFile object')

    def readable(self):
        """Return whether the file was opened for reading.

        Raises ValueError if the file is closed.
        """
        if not self.bgzf:
            raise ValueError("readable() on closed BGZFile object")
        return self.bgzf != NULL and not self.bgzf.is_write

    def writable(self):
        """Return whether the file is open and was opened for writing."""
        return self.bgzf != NULL and self.bgzf.is_write

    def seekable(self):
        """Always return True."""
        return True

    def tell(self):
        """Return the current position as reported by bgzf_tell().

        Raises ValueError if the file is closed and IOError if
        bgzf_tell() reports an error.
        """
        cdef int64_t off

        if not self.bgzf:
            raise ValueError("tell() on closed BGZFile object")
        off = bgzf_tell(self.bgzf)
        if off < 0:
            raise IOError('Error in tell on BGZFFile object')

        return off

    def seek(self, offset, whence=io.SEEK_SET):
        """Seek to offset, which must be relative to the start of the file.

        Only whence == io.SEEK_SET is supported.  The value returned is
        the result of bgzf_seek().

        Raises ValueError if the file is closed or whence is not
        io.SEEK_SET, and IOError if the seek fails.
        """
        cdef int64_t off

        if not self.bgzf:
            raise ValueError("seek() on closed BGZFile object")
        # compare by value: identity of int objects is an implementation detail
        if whence != io.SEEK_SET:
            raise ValueError('Seek from end not supported')

        off = bgzf_seek(self.bgzf, offset, SEEK_SET)
        if off < 0:
            raise IOError('Error seeking BGZFFile object')

        return off

    def readline(self, size=-1):
        """Read and return one line, without its trailing newline.

        Returns b'' at end of file.  Because the newline is not part of
        the result, an empty line also yields b''; iterate over the file
        to tell the two apart.  The size argument is accepted for
        compatibility with file objects but is ignored.

        Raises ValueError if the file is closed and IOError on a read
        error.
        """
        if not self.bgzf:
            raise ValueError("readline() on closed BGZFile object")

        line = _read_line(self.bgzf)
        return b'' if line is None else line

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next line; stop only at end of file, not at an
        empty line."""
        if not self.bgzf:
            raise ValueError("readline() on closed BGZFile object")

        line = _read_line(self.bgzf)
        if line is None:
            raise StopIteration()
        return line