# cython: language_level=3
"""Functions that read and write block gzipped (BGZF) files.

The user of the file doesn't have to worry about the compression.
Positions reported by ``tell()`` and accepted by ``seek()`` are the
offsets used by htslib's ``bgzf_tell``/``bgzf_seek``.
"""

# based on Python 3.5's gzip module

import errno
import io

from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
from libc.stdio cimport SEEK_SET
from libc.stdlib cimport malloc, calloc, realloc, free

from cpython.object cimport PyObject
from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize

from pysam.libcutils cimport force_bytes, encode_filename
from pysam.libchtslib cimport bgzf_open, bgzf_index_build_init, bgzf_write, bgzf_read, \
                              bgzf_flush, bgzf_index_dump, bgzf_close, bgzf_seek, \
                              bgzf_tell, bgzf_getline, kstring_t, BGZF

__all__ = ["BGZFile"]


BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE


cdef object _read_line(BGZF *fp):
    """Read one line from *fp*.

    Returns the line as bytes, exactly as left in the kstring by
    ``bgzf_getline`` (no terminator is appended here), or ``None`` at end
    of file.  Returning ``None`` rather than ``b''`` lets callers tell an
    empty line apart from end of file.

    Raises ValueError if *fp* is NULL (file closed) and IOError if the
    handle is write-only or the read fails.
    """
    cdef kstring_t line
    cdef int ret

    if fp == NULL:
        raise ValueError("readline() on closed BGZFile object")
    if fp.is_write:
        raise IOError(errno.EBADF, "readline() on write-only BGZFile object")

    line.l = line.m = 0
    line.s = NULL
    try:
        ret = bgzf_getline(fp, b'\n', &line)
        if ret == -1:
            # end of file
            return None
        if ret < -1:
            raise IOError('Error reading line in BGZFFile object')
        return line.s[:line.l]
    finally:
        # line.s is NULL unless bgzf_getline allocated it; free(NULL) is a no-op
        free(line.s)


cdef class BGZFile(object):
    """The BGZFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode; text modes
    ('t' or 'U') are rejected.
    """
    def __init__(self, filename, mode=None, index=None):
        """Constructor for the BGZFile class.

        filename is the path of the file to open.

        mode is passed to bgzf_open() after a 'b' is appended if missing,
        so 'r' is equivalent to 'rb', 'w' to 'wb' and so on.  The default
        is 'rb'.  Modes containing 't' or 'U' raise ValueError.

        index, if given, is the path of an index file.  When the file is
        opened for writing, index building is enabled and the index is
        written to that path on close().

        Raises IOError if the file cannot be opened or index building
        cannot be initialised.
        """
        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if not mode:
            mode = 'rb'
        elif 'b' not in mode:
            mode += 'b'

        mode = force_bytes(mode)

        self.name = encode_filename(filename)
        self.index = encode_filename(index) if index is not None else None

        self.bgzf = bgzf_open(self.name, mode)
        if self.bgzf == NULL:
            # bgzf_open failed; do not dereference the NULL handle below
            raise IOError('Could not open BGZFile {!r}'.format(filename))

        if self.bgzf.is_write and index is not None and bgzf_index_build_init(self.bgzf) < 0:
            raise IOError('Error building bgzf index')

    def __dealloc__(self):
        self.close()

    def write(self, data):
        """Write *data* to the file and return the number of bytes written.

        data may be bytes or any object supporting the buffer protocol.

        Raises ValueError if the file is closed and IOError if it was not
        opened for writing or the write fails.
        """
        cdef char *buf
        cdef Py_ssize_t length

        if not self.bgzf:
            raise ValueError("write() on closed BGZFile object")

        if not self.bgzf.is_write:
            raise IOError(errno.EBADF, "write() on read-only BGZFile object")

        if isinstance(data, bytes):
            payload = data
        else:
            # accept any data that supports the buffer protocol; copy it into
            # a bytes object so that a C pointer to contiguous memory can be
            # obtained regardless of the exporter's layout
            payload = memoryview(data).tobytes()
        length = len(payload)

        if length > 0:
            buf = payload
            if bgzf_write(self.bgzf, buf, length) < 0:
                raise IOError('BGZFile write failed')

        return length

    def read(self, size=-1):
        """Read and return up to *size* bytes.

        If size is negative or None, read until end of file.  A size of 0
        returns b''.

        Raises ValueError if the file is closed and IOError if it was
        opened for writing or the read fails.
        """
        cdef ssize_t read_size
        cdef Py_ssize_t want
        cdef bytes chunk
        cdef char *buf

        if not self.bgzf:
            raise ValueError("read() on closed BGZFile object")

        if self.bgzf.is_write:
            raise IOError(errno.EBADF, "read() on write-only BGZFile object")

        if size is None or size < 0:
            want = BUFFER_SIZE
            chunks = []
            while True:
                # allocate an uninitialised bytes object and let htslib fill it
                chunk = PyBytes_FromStringAndSize(NULL, want)
                buf = chunk
                read_size = bgzf_read(self.bgzf, buf, want)
                if read_size < 0:
                    raise IOError('Error reading from BGZFile')
                if read_size == 0:
                    break
                chunks.append(chunk if read_size == want else chunk[:read_size])
            return b''.join(chunks)

        if size == 0:
            return b''

        want = size
        chunk = PyBytes_FromStringAndSize(NULL, want)
        buf = chunk
        read_size = bgzf_read(self.bgzf, buf, want)
        if read_size < 0:
            raise IOError('Error reading from BGZFile')
        return chunk if read_size == want else chunk[:read_size]

    @property
    def closed(self):
        """True if the file has been closed."""
        return self.bgzf == NULL

    def close(self):
        """Flush pending data, write the index if requested, and close.

        Calling close() on an already closed file does nothing.  The
        underlying handle is always released, even if flushing or writing
        the index fails; the first failure is then reported as IOError.
        """
        cdef int ret

        if not self.bgzf:
            return

        error = None
        if self.bgzf.is_write:
            # the index is only built for handles opened for writing
            # (see __init__), so only those can dump one
            if bgzf_flush(self.bgzf) < 0:
                error = 'Error flushing BGZFile object'
            elif self.index and bgzf_index_dump(self.bgzf, self.index, NULL) < 0:
                error = 'Cannot write index'

        ret = bgzf_close(self.bgzf)
        self.bgzf = NULL

        if error is not None:
            raise IOError(error)
        if ret < 0:
            raise IOError('Error closing BGZFile object')

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.close()

    def flush(self):
        """Flush buffered data of a file opened for writing.

        Does nothing for closed files and for files opened for reading.
        Raises IOError if flushing fails.
        """
        if not self.bgzf:
            return

        if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0:
            raise IOError('Error flushing BGZFile object')

    def fileno(self):
        """Not supported: always raises AttributeError.

        BGZFile does not expose a file descriptor.
        """
        raise AttributeError('fileno')

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file.

        Raises ValueError if the file is closed and IOError if the file was
        opened for writing or the seek fails.
        '''
        if not self.bgzf:
            raise ValueError("rewind() on closed BGZFile object")
        if self.bgzf.is_write:
            raise IOError("Can't rewind in write mode")
        if bgzf_seek(self.bgzf, 0, SEEK_SET) < 0:
            raise IOError('Error seeking BGZFFile object')

    def readable(self):
        """Return True if the file was opened for reading.

        Raises ValueError if the file is closed.
        """
        if not self.bgzf:
            raise ValueError("readable() on closed BGZFile object")
        return not self.bgzf.is_write

    def writable(self):
        """Return True if the file is open and was opened for writing."""
        return self.bgzf != NULL and self.bgzf.is_write != 0

    def seekable(self):
        """Always returns True."""
        return True

    def tell(self):
        """Return the current position as reported by bgzf_tell().

        Raises ValueError if the file is closed and IOError on failure.
        """
        cdef int64_t off

        if not self.bgzf:
            raise ValueError("tell() on closed BGZFile object")
        off = bgzf_tell(self.bgzf)
        if off < 0:
            raise IOError('Error in tell on BGZFFile object')

        return off

    def seek(self, offset, whence=io.SEEK_SET):
        """Move to *offset*, which is handed unchanged to bgzf_seek().

        Only whence=io.SEEK_SET is supported; any other value raises
        ValueError.  Returns the non-negative value returned by
        bgzf_seek().

        Raises ValueError if the file is closed and IOError if the seek
        fails.
        """
        cdef int64_t off

        if not self.bgzf:
            raise ValueError("seek() on closed BGZFile object")
        if whence != io.SEEK_SET:
            raise ValueError('Only seeking from the start of the file (SEEK_SET) is supported')

        off = bgzf_seek(self.bgzf, offset, SEEK_SET)
        if off < 0:
            raise IOError('Error seeking BGZFFile object')

        return off

    def readline(self, size=-1):
        """Read and return the next line, or b'' at end of file.

        The line is returned as produced by bgzf_getline(); no terminator
        is appended.  An empty line therefore also yields b''; iterate over
        the file to distinguish empty lines from end of file.

        size is accepted for compatibility with the file API and ignored.

        Raises ValueError if the file is closed and IOError if it was
        opened for writing or the read fails.
        """
        line = _read_line(self.bgzf)
        return b'' if line is None else line

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next line; raise StopIteration at end of file.

        Empty lines are returned as b'' and do not end the iteration.
        """
        line = _read_line(self.bgzf)
        if line is None:
            raise StopIteration()
        return line