jpayne@69
|
1 # cython: language_level=3
|
jpayne@69
|
2 """Functions that read and write block gzipped files.
|
jpayne@69
|
3
|
jpayne@69
|
4 The user of the file doesn't have to worry about the compression
|
jpayne@69
|
5 and random access is allowed if an index file is present."""
|
jpayne@69
|
6
|
jpayne@69
|
7 # based on Python 3.5's gzip module
|
jpayne@69
|
8
|
jpayne@69
|
9 import io
|
jpayne@69
|
10
|
jpayne@69
|
11 from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
|
jpayne@69
|
12 from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
|
jpayne@69
|
13 from libc.stdio cimport SEEK_SET
|
jpayne@69
|
14 from libc.stdlib cimport malloc, calloc, realloc, free
|
jpayne@69
|
15
|
jpayne@69
|
16 from cpython.object cimport PyObject
|
jpayne@69
|
17 from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize
|
jpayne@69
|
18
|
jpayne@69
|
19 from pysam.libcutils cimport force_bytes, encode_filename
|
jpayne@69
|
20 from pysam.libchtslib cimport bgzf_open, bgzf_index_build_init, bgzf_write, bgzf_read, \
|
jpayne@69
|
21 bgzf_flush, bgzf_index_dump, bgzf_close, bgzf_seek, \
|
jpayne@69
|
22 bgzf_tell, bgzf_getline, kstring_t, BGZF
|
jpayne@69
|
23
|
jpayne@69
|
# Names exported by ``from <module> import *``.
__all__ = ["BGZFile"]


# Number of bytes requested from htslib per call when BGZFile.read() is asked
# to read until end-of-file.
BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE
|
jpayne@69
|
28
|
jpayne@69
|
29
|
jpayne@69
|
cdef object _read_bgzf_line(BGZF *fp):
    """Read one newline-delimited line from the open handle ``fp``.

    Returns the bytes that ``bgzf_getline`` stored in its kstring buffer, or
    ``None`` when it reports end-of-file (return code -1).  Returning ``None``
    rather than ``b''`` lets callers tell end-of-file apart from an empty
    line.

    Raises IOError for any other negative return code.
    """
    cdef kstring_t line
    cdef int ret

    line.l = line.m = 0
    line.s = NULL

    ret = bgzf_getline(fp, b'\n', &line)
    try:
        if ret == -1:
            return None
        if ret < -1:
            # NOTE(review): htslib documents every code <= -2 as an error,
            # not only -2 -- confirm against the bundled bgzf.h.
            raise IOError('Error reading line in BGZFFile object')
        return line.s[:line.l]
    finally:
        # The buffer may have been allocated even when the call failed.
        if line.m:
            free(line.s)


cdef class BGZFile(object):
    """File-like wrapper around an htslib BGZF (block gzip) handle.

    The class mimics most of the methods of a binary file object, with the
    exception of truncate().  Only binary modes are supported; text modes
    ('t' or 'U') are rejected by the constructor.

    Instances can be used as context managers and iterated line by line.
    """
    def __init__(self, filename, mode=None, index=None):
        """Open ``filename`` through ``bgzf_open``.

        Parameters
        ----------
        filename :
            Path of the file to open; passed through ``encode_filename``.
        mode :
            File mode such as 'r', 'rb', 'w' or 'wb'.  Defaults to 'rb'.  A
            'b' is appended when missing, so 'r' is equivalent to 'rb' and
            'w' to 'wb'.  The resulting string is forwarded to ``bgzf_open``.
        index :
            Optional path of an index file.  When given and the file is
            opened for writing, an index is built while writing and dumped
            to this path by close().

        Raises
        ------
        ValueError
            If ``mode`` requests text mode ('t' or 'U').
        IOError
            If the file cannot be opened or index building cannot be set up.
        """
        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if not mode:
            mode = 'rb'
        elif 'b' not in mode:
            mode += 'b'

        mode = force_bytes(mode)

        self.name = encode_filename(filename)
        self.index = encode_filename(index) if index is not None else None

        self.bgzf = bgzf_open(self.name, mode)
        if self.bgzf == NULL:
            # Without this check the attribute access below would
            # dereference a NULL pointer.
            raise IOError("Could not open BGZFile {!r}".format(filename))

        if (self.bgzf.is_write and index is not None
                and bgzf_index_build_init(self.bgzf) < 0):
            # Release the handle now instead of leaving a half-initialised
            # object for __dealloc__ to clean up.
            bgzf_close(self.bgzf)
            self.bgzf = NULL
            raise IOError('Error building bgzf index')

    def __dealloc__(self):
        # Make sure the handle is flushed and released when the object is
        # collected without an explicit close().
        self.close()

    def write(self, data):
        """Compress and write ``data``; return the number of bytes accepted.

        ``data`` may be a bytes object or any object supporting the buffer
        protocol.

        Raises ValueError if the file is closed and IOError if the file is
        not open for writing or the underlying write fails.
        """
        cdef Py_ssize_t length

        if not self.bgzf:
            raise ValueError("write() on closed BGZFile object")

        if not self.bgzf.is_write:
            import errno
            raise IOError(errno.EBADF, "write() on read-only BGZFile object")

        if not isinstance(data, bytes):
            # A memoryview object cannot be coerced to ``char *``; copy the
            # buffer contents into a bytes object first.
            data = memoryview(data).tobytes()

        length = len(data)
        if length > 0 and bgzf_write(self.bgzf, <char *>data, length) < 0:
            raise IOError('BGZFile write failed')

        return length

    def read(self, size=-1):
        """Read and return up to ``size`` uncompressed bytes.

        A negative ``size`` (the default) or ``None`` reads until
        end-of-file in chunks of ``BUFFER_SIZE`` bytes.  ``size == 0``
        returns ``b''``.  Fewer than ``size`` bytes are returned when the
        underlying read delivers fewer.

        Raises ValueError if the file is closed and IOError if the file is
        open for writing or the underlying read fails.
        """
        cdef ssize_t read_size
        cdef Py_ssize_t wanted

        if not self.bgzf:
            raise ValueError("read() on closed BGZFile object")

        if self.bgzf.is_write:
            import errno
            raise IOError(errno.EBADF, "read() on write-only BGZFile object")

        if size is None:
            # Follow the usual file-object convention: None means "read all".
            size = -1

        if size == 0:
            return b''

        if size > 0:
            wanted = size
            # Allocate an uninitialised bytes object and let htslib fill it.
            chunk = PyBytes_FromStringAndSize(NULL, wanted)
            read_size = bgzf_read(self.bgzf, <char *>chunk, wanted)
            if read_size < 0:
                raise IOError('Error reading from BGZFile')
            if read_size < wanted:
                chunk = chunk[:read_size]
            return chunk

        # size < 0: read fixed-size chunks until nothing more is returned.
        wanted = BUFFER_SIZE
        chunks = []
        while True:
            chunk = PyBytes_FromStringAndSize(NULL, wanted)
            read_size = bgzf_read(self.bgzf, <char *>chunk, wanted)
            if read_size < 0:
                raise IOError('Error reading from BGZFile')
            if read_size == 0:
                break
            if read_size < wanted:
                chunk = chunk[:read_size]
            chunks.append(chunk)
        return b''.join(chunks)

    @property
    def closed(self):
        """True once the underlying handle has been released."""
        return self.bgzf == NULL

    def close(self):
        """Flush pending data, write the index if requested, and close.

        Calling close() on an already closed object is a no-op.  The handle
        is always released, even when flushing or writing the index fails;
        the first failure is then reported as IOError.
        """
        cdef int ret

        if not self.bgzf:
            return

        error = None
        if self.bgzf.is_write:
            if bgzf_flush(self.bgzf) < 0:
                error = 'Error flushing BGZFile object'
            elif self.index and bgzf_index_dump(self.bgzf, self.index, NULL) < 0:
                # The index is only built in write mode (see __init__), so
                # it is only dumped for handles opened for writing.
                error = 'Cannot write index'

        # Release the handle before raising so a failed flush or index dump
        # cannot leak it or make a later close() fail the same way again.
        ret = bgzf_close(self.bgzf)
        self.bgzf = NULL

        if error is not None:
            raise IOError(error)
        if ret < 0:
            raise IOError('Error closing BGZFile object')

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Returns None, so an exception raised in the with-body propagates.
        self.close()

    def flush(self):
        """Flush buffered data to disk when open for writing.

        Does nothing for closed or read-only objects.  Raises IOError if the
        underlying flush fails.
        """
        if not self.bgzf:
            return

        if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0:
            raise IOError('Error flushing BGZFile object')

    def fileno(self):
        """Not supported: always raises AttributeError.

        No file descriptor is exposed for the underlying BGZF handle.
        """
        raise AttributeError('fileno')

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file.

        Raises ValueError if the file is closed and IOError if the file is
        open for writing or the seek fails.'''
        if not self.bgzf:
            raise ValueError("rewind() on closed BGZFile object")
        if self.bgzf.is_write:
            raise IOError("Can't rewind in write mode")
        if bgzf_seek(self.bgzf, 0, SEEK_SET) < 0:
            raise IOError('Error seeking BGZFFile object')

    def readable(self):
        """Return True if the file is open for reading.

        Raises ValueError if the file is closed.
        """
        if not self.bgzf:
            raise ValueError("readable() on closed BGZFile object")
        return not self.bgzf.is_write

    def writable(self):
        """Return True if the file is open for writing, False otherwise
        (including when it is closed)."""
        return self.bgzf != NULL and self.bgzf.is_write

    def seekable(self):
        """Always True; see seek() for the supported ``whence`` value."""
        return True

    def tell(self):
        """Return the current position as reported by ``bgzf_tell``.

        NOTE(review): htslib documents this as a *virtual* file offset, not
        an uncompressed byte count -- confirm before doing arithmetic on it.

        Raises ValueError if the file is closed and IOError on failure.
        """
        cdef int64_t off

        if not self.bgzf:
            raise ValueError("tell() on closed BGZFile object")

        off = bgzf_tell(self.bgzf)
        if off < 0:
            raise IOError('Error in tell on BGZFFile object')

        return off

    def seek(self, offset, whence=io.SEEK_SET):
        """Move to ``offset`` via ``bgzf_seek``.

        Only ``whence == io.SEEK_SET`` is supported.  ``offset`` is
        presumably a value previously obtained from tell() -- verify against
        the htslib documentation.

        Returns the status value reported by ``bgzf_seek``.

        Raises ValueError if the file is closed or ``whence`` is not
        ``io.SEEK_SET``, and IOError if the seek fails.
        """
        cdef int64_t off

        if not self.bgzf:
            raise ValueError("seek() on closed BGZFile object")
        # Compare by value: identity comparison of ints is an implementation
        # detail and must not be relied upon.
        if whence != io.SEEK_SET:
            raise ValueError(
                'Only seeking from the start of the file (SEEK_SET) is supported')

        off = bgzf_seek(self.bgzf, offset, SEEK_SET)
        if off < 0:
            raise IOError('Error seeking BGZFFile object')

        return off

    def readline(self, size=-1):
        """Read and return the next line as bytes.

        ``size`` is accepted for file-API compatibility but is ignored.
        Returns ``b''`` at end-of-file.

        NOTE(review): per the htslib documentation the line is stored
        without its trailing newline, so an empty line also yields ``b''``.
        Iterate over the object to tell the two cases apart.

        Raises ValueError if the file is closed and IOError on read errors.
        """
        if not self.bgzf:
            raise ValueError("readline() on closed BGZFile object")

        line = _read_bgzf_line(self.bgzf)
        return b'' if line is None else line

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next line; raise StopIteration at end-of-file.

        End-of-file is detected from the return code of the underlying read
        rather than from an empty result, so empty lines in the middle of a
        file do not terminate iteration.
        """
        if not self.bgzf:
            raise ValueError("readline() on closed BGZFile object")

        line = _read_bgzf_line(self.bgzf)
        if line is None:
            raise StopIteration()
        return line
|