jpayne@69
|
1 # Copyright 1999 by Jeffrey Chang. All rights reserved.
|
jpayne@69
|
2 # Copyright 2009-2018 by Peter Cock. All rights reserved.
|
jpayne@69
|
3 #
|
jpayne@69
|
4 # This file is part of the Biopython distribution and governed by your
|
jpayne@69
|
5 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
jpayne@69
|
6 # Please see the LICENSE file that should have been included as part of this
|
jpayne@69
|
7 # package.
|
jpayne@69
|
8 """Code for more fancy file handles.
|
jpayne@69
|
9
|
jpayne@69
|
10 Bio.File defines private classes used in Bio.SeqIO and Bio.SearchIO for
|
jpayne@69
|
11 indexing files. These are not intended for direct use.
|
jpayne@69
|
12 """
|
jpayne@69
|
13
|
jpayne@69
|
14 import os
|
jpayne@69
|
15 import contextlib
|
jpayne@69
|
16 import itertools
|
jpayne@69
|
17 import collections.abc
|
jpayne@69
|
18
|
jpayne@69
|
19 from abc import ABC, abstractmethod
|
jpayne@69
|
20
|
jpayne@69
|
21 try:
|
jpayne@69
|
22 import sqlite3
|
jpayne@69
|
23 except ImportError:
|
jpayne@69
|
24 # May be missing if Python was compiled from source without its dependencies
|
jpayne@69
|
25 sqlite3 = None # type: ignore
|
jpayne@69
|
26
|
jpayne@69
|
27
|
jpayne@69
|
@contextlib.contextmanager
def as_handle(handleish, mode="r", **kwargs):
    r"""Context manager to ensure we are using a handle.

    Context manager for arguments that can be passed to SeqIO and AlignIO read, write,
    and parse methods: either file objects or path-like objects (strings, pathlib.Path
    instances, or more generally, anything that can be handled by the builtin 'open'
    function).

    When given a path-like object, returns an open file handle to that path, with provided
    mode, which will be closed when the manager exits.

    All other inputs are returned, and are *not* closed.

    Arguments:
     - handleish - Either a file handle or path-like object (anything which can be
       passed to the builtin 'open' function, such as str, bytes,
       pathlib.Path, and os.DirEntry objects)
     - mode - Mode to open handleish (used only if handleish is a string)
     - kwargs - Further arguments to pass to open(...)

    Examples
    --------
    >>> from Bio import File
    >>> import os
    >>> with File.as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    True

    >>> handle = open('seqs.fasta', 'w')
    >>> with File.as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    False
    >>> fp.close()
    >>> os.remove("seqs.fasta")  # tidy up

    """
    try:
        fp = open(handleish, mode, **kwargs)
    except TypeError:
        # open() rejected handleish, so assume it is already a file-like
        # object and hand it back unchanged (and do not close it on exit).
        yield handleish
    else:
        # We opened the handle ourselves, so we must close it on exit.
        # Note only the open() call is inside the try/except above: the
        # previous implementation wrapped the whole "with open(...)" block,
        # so a TypeError raised by the caller's code inside the
        # "with as_handle(...)" body was caught here, causing a second
        # yield and a confusing RuntimeError from contextlib. Now any
        # exception raised at the yield propagates to the caller intact.
        try:
            yield fp
        finally:
            fp.close()
|
jpayne@69
|
76
|
jpayne@69
|
77
|
jpayne@69
|
78 def _open_for_random_access(filename):
|
jpayne@69
|
79 """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE).
|
jpayne@69
|
80
|
jpayne@69
|
81 This functionality is used by the Bio.SeqIO and Bio.SearchIO index
|
jpayne@69
|
82 and index_db functions.
|
jpayne@69
|
83
|
jpayne@69
|
84 If the file is gzipped but not BGZF, a specific ValueError is raised.
|
jpayne@69
|
85 """
|
jpayne@69
|
86 handle = open(filename, "rb")
|
jpayne@69
|
87 magic = handle.read(2)
|
jpayne@69
|
88 handle.seek(0)
|
jpayne@69
|
89
|
jpayne@69
|
90 if magic == b"\x1f\x8b":
|
jpayne@69
|
91 # This is a gzipped file, but is it BGZF?
|
jpayne@69
|
92 from . import bgzf
|
jpayne@69
|
93
|
jpayne@69
|
94 try:
|
jpayne@69
|
95 # If it is BGZF, we support that
|
jpayne@69
|
96 return bgzf.BgzfReader(mode="rb", fileobj=handle)
|
jpayne@69
|
97 except ValueError as e:
|
jpayne@69
|
98 assert "BGZF" in str(e)
|
jpayne@69
|
99 # Not a BGZF file after all,
|
jpayne@69
|
100 handle.close()
|
jpayne@69
|
101 raise ValueError(
|
jpayne@69
|
102 "Gzipped files are not suitable for indexing, "
|
jpayne@69
|
103 "please use BGZF (blocked gzip format) instead."
|
jpayne@69
|
104 ) from None
|
jpayne@69
|
105
|
jpayne@69
|
106 return handle
|
jpayne@69
|
107
|
jpayne@69
|
108
|
jpayne@69
|
109 # The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO
|
jpayne@69
|
110 # for indexing
|
jpayne@69
|
111
|
jpayne@69
|
112
|
jpayne@69
|
class _IndexedSeqFileProxy(ABC):
    """Abstract base class for file format specific random access (PRIVATE).

    This is subclassed in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Subclasses for each file format should define '__iter__', 'get'
    and optionally 'get_raw' methods.
    """

    # NOTE(review): concrete subclasses are also expected to expose a
    # ``_handle`` attribute (an open binary file handle) - the index
    # dictionaries below close it via ``self._proxy._handle.close()``.

    @abstractmethod
    def __iter__(self):
        """Return (identifier, offset, length in bytes) tuples.

        The length can be zero where it is not implemented or not
        possible for a particular file format.
        """
        raise NotImplementedError

    @abstractmethod
    def get(self, offset):
        """Return parsed object for this entry."""
        # Most file formats with self contained records can be handled by
        # parsing StringIO(self.get_raw(offset).decode())
        raise NotImplementedError

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string (if implemented).

        If the key is not found, a KeyError exception is raised.

        This may not have been implemented for all file formats.
        """
        # Should be done by each sub-class (if possible)
        raise NotImplementedError("Not available for this file format.")
|
jpayne@69
|
148
|
jpayne@69
|
149
|
jpayne@69
|
150 class _IndexedSeqFileDict(collections.abc.Mapping):
|
jpayne@69
|
151 """Read only dictionary interface to a sequential record file.
|
jpayne@69
|
152
|
jpayne@69
|
153 This code is used in both Bio.SeqIO for indexing as SeqRecord
|
jpayne@69
|
154 objects, and in Bio.SearchIO for indexing QueryResult objects.
|
jpayne@69
|
155
|
jpayne@69
|
156 Keeps the keys and associated file offsets in memory, reads the file
|
jpayne@69
|
157 to access entries as objects parsing them on demand. This approach
|
jpayne@69
|
158 is memory limited, but will work even with millions of records.
|
jpayne@69
|
159
|
jpayne@69
|
160 Note duplicate keys are not allowed. If this happens, a ValueError
|
jpayne@69
|
161 exception is raised.
|
jpayne@69
|
162
|
jpayne@69
|
163 As used in Bio.SeqIO, by default the SeqRecord's id string is used
|
jpayne@69
|
164 as the dictionary key. In Bio.SearchIO, the query's id string is
|
jpayne@69
|
165 used. This can be changed by supplying an optional key_function,
|
jpayne@69
|
166 a callback function which will be given the record id and must
|
jpayne@69
|
167 return the desired key. For example, this allows you to parse
|
jpayne@69
|
168 NCBI style FASTA identifiers, and extract the GI number to use
|
jpayne@69
|
169 as the dictionary key.
|
jpayne@69
|
170
|
jpayne@69
|
171 Note that this dictionary is essentially read only. You cannot
|
jpayne@69
|
172 add or change values, pop values, nor clear the dictionary.
|
jpayne@69
|
173 """
|
jpayne@69
|
174
|
jpayne@69
|
175 def __init__(self, random_access_proxy, key_function, repr, obj_repr):
|
jpayne@69
|
176 """Initialize the class."""
|
jpayne@69
|
177 # Use key_function=None for default value
|
jpayne@69
|
178 self._proxy = random_access_proxy
|
jpayne@69
|
179 self._key_function = key_function
|
jpayne@69
|
180 self._repr = repr
|
jpayne@69
|
181 self._obj_repr = obj_repr
|
jpayne@69
|
182 self._cached_prev_record = (None, None) # (key, record)
|
jpayne@69
|
183 if key_function:
|
jpayne@69
|
184 offset_iter = (
|
jpayne@69
|
185 (key_function(key), offset, length)
|
jpayne@69
|
186 for (key, offset, length) in random_access_proxy
|
jpayne@69
|
187 )
|
jpayne@69
|
188 else:
|
jpayne@69
|
189 offset_iter = random_access_proxy
|
jpayne@69
|
190 offsets = {}
|
jpayne@69
|
191 for key, offset, length in offset_iter:
|
jpayne@69
|
192 # Note - we don't store the length because I want to minimise the
|
jpayne@69
|
193 # memory requirements. With the SQLite backend the length is kept
|
jpayne@69
|
194 # and is used to speed up the get_raw method (by about 3 times).
|
jpayne@69
|
195 # The length should be provided by all the current backends except
|
jpayne@69
|
196 # SFF where there is an existing Roche index we can reuse (very fast
|
jpayne@69
|
197 # but lacks the record lengths)
|
jpayne@69
|
198 # assert length or format in ["sff", "sff-trim"], \
|
jpayne@69
|
199 # "%s at offset %i given length %r (%s format %s)" \
|
jpayne@69
|
200 # % (key, offset, length, filename, format)
|
jpayne@69
|
201 if key in offsets:
|
jpayne@69
|
202 self._proxy._handle.close()
|
jpayne@69
|
203 raise ValueError(f"Duplicate key '{key}'")
|
jpayne@69
|
204 else:
|
jpayne@69
|
205 offsets[key] = offset
|
jpayne@69
|
206 self._offsets = offsets
|
jpayne@69
|
207
|
jpayne@69
|
208 def __repr__(self):
|
jpayne@69
|
209 """Return a string representation of the File object."""
|
jpayne@69
|
210 return self._repr
|
jpayne@69
|
211
|
jpayne@69
|
212 def __str__(self):
|
jpayne@69
|
213 """Create a string representation of the File object."""
|
jpayne@69
|
214 # TODO - How best to handle the __str__ for SeqIO and SearchIO?
|
jpayne@69
|
215 if self:
|
jpayne@69
|
216 return f"{{{list(self.keys())[0]!r} : {self._obj_repr}(...), ...}}"
|
jpayne@69
|
217 else:
|
jpayne@69
|
218 return "{}"
|
jpayne@69
|
219
|
jpayne@69
|
220 def __len__(self):
|
jpayne@69
|
221 """Return the number of records."""
|
jpayne@69
|
222 return len(self._offsets)
|
jpayne@69
|
223
|
jpayne@69
|
224 def __iter__(self):
|
jpayne@69
|
225 """Iterate over the keys."""
|
jpayne@69
|
226 return iter(self._offsets)
|
jpayne@69
|
227
|
jpayne@69
|
228 def __getitem__(self, key):
|
jpayne@69
|
229 """Return record for the specified key.
|
jpayne@69
|
230
|
jpayne@69
|
231 As an optimization when repeatedly asked to look up the same record,
|
jpayne@69
|
232 the key and record are cached so that if the *same* record is
|
jpayne@69
|
233 requested next time, it can be returned without going to disk.
|
jpayne@69
|
234 """
|
jpayne@69
|
235 if key == self._cached_prev_record[0]:
|
jpayne@69
|
236 return self._cached_prev_record[1]
|
jpayne@69
|
237 # Pass the offset to the proxy
|
jpayne@69
|
238 record = self._proxy.get(self._offsets[key])
|
jpayne@69
|
239 if self._key_function:
|
jpayne@69
|
240 key2 = self._key_function(record.id)
|
jpayne@69
|
241 else:
|
jpayne@69
|
242 key2 = record.id
|
jpayne@69
|
243 if key != key2:
|
jpayne@69
|
244 raise ValueError(f"Key did not match ({key} vs {key2})")
|
jpayne@69
|
245 self._cached_prev_record = (key, record)
|
jpayne@69
|
246 return record
|
jpayne@69
|
247
|
jpayne@69
|
248 def get_raw(self, key):
|
jpayne@69
|
249 """Return the raw record from the file as a bytes string.
|
jpayne@69
|
250
|
jpayne@69
|
251 If the key is not found, a KeyError exception is raised.
|
jpayne@69
|
252 """
|
jpayne@69
|
253 # Pass the offset to the proxy
|
jpayne@69
|
254 return self._proxy.get_raw(self._offsets[key])
|
jpayne@69
|
255
|
jpayne@69
|
256 def close(self):
|
jpayne@69
|
257 """Close the file handle being used to read the data.
|
jpayne@69
|
258
|
jpayne@69
|
259 Once called, further use of the index won't work. The sole purpose
|
jpayne@69
|
260 of this method is to allow explicit handle closure - for example
|
jpayne@69
|
261 if you wish to delete the file, on Windows you must first close
|
jpayne@69
|
262 all open handles to that file.
|
jpayne@69
|
263 """
|
jpayne@69
|
264 self._proxy._handle.close()
|
jpayne@69
|
265
|
jpayne@69
|
266
|
jpayne@69
|
class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
    """Read only dictionary interface to many sequential record files.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys, file-numbers and offsets in an SQLite database. To access
    a record by key, reads from the offset in the appropriate file and then
    parses the record into an object.

    There are OS limits on the number of files that can be open at once,
    so a pool are kept. If a record is required from a closed file, then
    one of the open handles is closed first.
    """

    def __init__(
        self,
        index_filename,
        filenames,
        proxy_factory,
        fmt,
        key_function,
        repr,
        max_open=10,
    ):
        """Initialize the class.

        Arguments:
         - index_filename - SQLite database file; loaded if it already
           exists, otherwise built from scratch.
         - filenames - iterable of sequence filenames to index (may be
           None when reloading an existing index).
         - proxy_factory - callable returning a format-specific
           _IndexedSeqFileProxy (called with format alone to test support,
           or with format and filename to open a file).
         - fmt - lower case format string, or None to read it from the DB.
         - key_function - optional callback mapping record id to dict key.
         - repr - string returned by __repr__.
         - max_open - maximum number of file handles kept open at once.
        """
        # TODO? - Don't keep filename list in memory (just in DB)?
        # Should save a chunk of memory if dealing with 1000s of files.
        # Furthermore could compare a generator to the DB on reloading
        # (no need to turn it into a list)

        if sqlite3 is None:
            # Python was compiled without sqlite3 support
            from Bio import MissingPythonDependencyError

            raise MissingPythonDependencyError(
                "Python was compiled without the sqlite3 module"
            )
        if filenames is not None:
            filenames = list(filenames)  # In case it was a generator

        # Cache the arguments as private variables
        self._index_filename = index_filename
        self._filenames = filenames
        self._format = fmt
        self._key_function = key_function
        self._proxy_factory = proxy_factory
        self._repr = repr
        self._max_open = max_open
        # Pool of open _IndexedSeqFileProxy objects, keyed by file number
        self._proxies = {}

        # Note if using SQLite :memory: trick index filename, this will
        # give $PWD as the relative path (which is fine).
        self._relative_path = os.path.abspath(os.path.dirname(index_filename))

        if os.path.isfile(index_filename):
            self._load_index()
        else:
            self._build_index()

    def _load_index(self):
        """Call from __init__ to re-use an existing index (PRIVATE)."""
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        fmt = self._format
        proxy_factory = self._proxy_factory

        con = sqlite3.dbapi2.connect(index_filename, check_same_thread=False)
        self._con = con
        # Check the count...
        try:
            (count,) = con.execute(
                "SELECT value FROM meta_data WHERE key=?;", ("count",)
            ).fetchone()
            self._length = int(count)
            if self._length == -1:
                # -1 is the sentinel written by _build_index before indexing
                # starts; it is replaced by the real count only on success.
                con.close()
                raise ValueError("Unfinished/partial database") from None

            # use MAX(_ROWID_) to obtain the number of sequences in the database
            # using COUNT(key) is quite slow in SQLITE
            # (https://stackoverflow.com/questions/8988915/sqlite-count-slow-on-big-tables)
            (count,) = con.execute("SELECT MAX(_ROWID_) FROM offset_data;").fetchone()
            if self._length != int(count):
                con.close()
                raise ValueError(
                    "Corrupt database? %i entries not %i" % (int(count), self._length)
                ) from None
            (self._format,) = con.execute(
                "SELECT value FROM meta_data WHERE key=?;", ("format",)
            ).fetchone()
            if fmt and fmt != self._format:
                con.close()
                raise ValueError(
                    f"Index file says format {self._format}, not {fmt}"
                ) from None
            try:
                (filenames_relative_to_index,) = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("filenames_relative_to_index",),
                ).fetchone()
                filenames_relative_to_index = (
                    filenames_relative_to_index.upper() == "TRUE"
                )
            except TypeError:
                # Original behaviour, assume if meta_data missing
                # (fetchone() returned None so the tuple-unpack raised)
                filenames_relative_to_index = False
            self._filenames = [
                row[0]
                for row in con.execute(
                    "SELECT name FROM file_data ORDER BY file_number;"
                ).fetchall()
            ]
            if filenames_relative_to_index:
                # Not implicitly relative to $PWD, explicitly relative to index file
                relative_path = os.path.abspath(os.path.dirname(index_filename))
                tmp = []
                for f in self._filenames:
                    if os.path.isabs(f):
                        tmp.append(f)
                    else:
                        # Would be stored with Unix / path separator, so convert
                        # it to the local OS path separator here:
                        tmp.append(
                            os.path.join(relative_path, f.replace("/", os.path.sep))
                        )
                self._filenames = tmp
                del tmp
            if filenames and len(filenames) != len(self._filenames):
                con.close()
                raise ValueError(
                    "Index file says %i files, not %i"
                    % (len(self._filenames), len(filenames))
                ) from None
            if filenames and filenames != self._filenames:
                for old, new in zip(self._filenames, filenames):
                    # Want exact match (after making relative to the index above)
                    if os.path.abspath(old) != os.path.abspath(new):
                        con.close()
                        if filenames_relative_to_index:
                            raise ValueError(
                                "Index file has different filenames, e.g. %r != %r"
                                % (os.path.abspath(old), os.path.abspath(new))
                            ) from None
                        else:
                            raise ValueError(
                                "Index file has different filenames "
                                "[This is an old index where any relative paths "
                                "were relative to the original working directory]. "
                                "e.g. %r != %r"
                                % (os.path.abspath(old), os.path.abspath(new))
                            ) from None
                # Filenames are equal (after imposing abspath)
        except sqlite3.OperationalError as err:
            # e.g. the file exists but is not a Biopython index database
            con.close()
            raise ValueError(f"Not a Biopython index database? {err}") from None
        # Now we have the format (from the DB if not given to us),
        if not proxy_factory(self._format):
            con.close()
            raise ValueError(f"Unsupported format '{self._format}'")

    def _build_index(self):
        """Call from __init__ to create a new index (PRIVATE)."""
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        fmt = self._format
        key_function = self._key_function
        proxy_factory = self._proxy_factory
        max_open = self._max_open
        random_access_proxies = self._proxies

        if not fmt or not filenames:
            raise ValueError(
                f"Filenames to index and format required to build {index_filename!r}"
            )
        if not proxy_factory(fmt):
            raise ValueError(f"Unsupported format '{fmt}'")
        # Create the index
        con = sqlite3.dbapi2.connect(index_filename)
        self._con = con
        # print("Creating index")
        # Sqlite PRAGMA settings for speed
        con.execute("PRAGMA synchronous=OFF")
        con.execute("PRAGMA locking_mode=EXCLUSIVE")
        # Don't index the key column until the end (faster)
        # con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
        # "offset INTEGER);")
        con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
        # count=-1 is a sentinel meaning "index under construction"; it is
        # replaced with the real record count at the end (see _load_index).
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("count", -1))
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("format", fmt))
        con.execute(
            "INSERT INTO meta_data (key, value) VALUES (?,?);",
            ("filenames_relative_to_index", "True"),
        )
        # TODO - Record the file size and modified date?
        con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);")
        con.execute(
            "CREATE TABLE offset_data (key TEXT, "
            "file_number INTEGER, offset INTEGER, length INTEGER);"
        )
        count = 0
        for file_index, filename in enumerate(filenames):
            # Default to storing as an absolute path,
            f = os.path.abspath(filename)
            if not os.path.isabs(filename) and not os.path.isabs(index_filename):
                # Since user gave BOTH filename & index as relative paths,
                # we will store this relative to the index file even though
                # if it may now start ../ (meaning up a level)
                # Note for cross platform use (e.g. shared drive over SAMBA),
                # convert any Windows slash into Unix style for rel paths.
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
            elif (os.path.dirname(os.path.abspath(filename)) + os.path.sep).startswith(
                relative_path + os.path.sep
            ):
                # Since sequence file is in same directory or sub directory,
                # might as well make this into a relative path:
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
                assert not f.startswith("../"), f
            # print("DEBUG - storing %r as [%r] %r" % (filename, relative_path, f))
            con.execute(
                "INSERT INTO file_data (file_number, name) VALUES (?,?);",
                (file_index, f),
            )
            random_access_proxy = proxy_factory(fmt, filename)
            if key_function:
                offset_iter = (
                    (key_function(key), file_index, offset, length)
                    for (key, offset, length) in random_access_proxy
                )
            else:
                offset_iter = (
                    (key, file_index, offset, length)
                    for (key, offset, length) in random_access_proxy
                )
            # Insert the offsets in batches of 100 to bound memory use
            while True:
                batch = list(itertools.islice(offset_iter, 100))
                if not batch:
                    break
                # print("Inserting batch of %i offsets, %s ... %s"
                # % (len(batch), batch[0][0], batch[-1][0]))
                con.executemany(
                    "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
                    batch,
                )
                con.commit()
                count += len(batch)
            if len(random_access_proxies) < max_open:
                # Keep this handle in the pool for later lookups
                random_access_proxies[file_index] = random_access_proxy
            else:
                random_access_proxy._handle.close()
        self._length = count
        # print("About to index %i entries" % count)
        try:
            # The UNIQUE index doubles as the duplicate-key check
            con.execute(
                "CREATE UNIQUE INDEX IF NOT EXISTS key_index ON offset_data(key);"
            )
        except sqlite3.IntegrityError as err:
            self._proxies = random_access_proxies
            self.close()
            con.close()
            raise ValueError(f"Duplicate key? {err}") from None
        con.execute("PRAGMA locking_mode=NORMAL")
        con.execute("UPDATE meta_data SET value = ? WHERE key = ?;", (count, "count"))
        con.commit()
        # print("Index created")

    def __repr__(self):
        """Return a string representation of the File object."""
        return self._repr

    def __contains__(self, key):
        """Return True if the key is in the index (without parsing the record)."""
        return bool(
            self._con.execute(
                "SELECT key FROM offset_data WHERE key=?;", (key,)
            ).fetchone()
        )

    def __len__(self):
        """Return the number of records indexed."""
        return self._length
        # return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0]

    def __iter__(self):
        """Iterate over the keys."""
        # Ordered by position on disk, i.e. original record order
        for row in self._con.execute(
            "SELECT key FROM offset_data ORDER BY file_number, offset;"
        ):
            yield str(row[0])

    def __getitem__(self, key):
        """Return record for the specified key.

        Raises KeyError if the key is not in the index, and ValueError if
        the record found at the stored offset does not match the key.
        """
        # Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset FROM offset_data WHERE key=?;", (key,)
        ).fetchone()
        if not row:
            raise KeyError
        file_number, offset = row
        proxies = self._proxies
        if file_number in proxies:
            record = proxies[file_number].get(offset)
        else:
            if len(proxies) >= self._max_open:
                # Close an old handle...
                # (dict.popitem evicts an arbitrary entry, not LRU - cheap
                # and good enough for this pool)
                proxies.popitem()[1]._handle.close()
            # Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            record = proxy.get(offset)
            proxies[file_number] = proxy
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError(f"Key did not match ({key} vs {key2})")
        return record

    def get_raw(self, key):
        """Return the raw record from the file as a bytes string.

        If the key is not found, a KeyError exception is raised.
        """
        # Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset, length FROM offset_data WHERE key=?;", (key,)
        ).fetchone()
        if not row:
            raise KeyError
        file_number, offset, length = row
        proxies = self._proxies
        if file_number in proxies:
            if length:
                # Shortcut if we have the length
                h = proxies[file_number]._handle
                h.seek(offset)
                return h.read(length)
            else:
                # Zero/missing length (e.g. SFF Roche index) - let the
                # proxy work out where the record ends
                return proxies[file_number].get_raw(offset)
        else:
            # This code is duplicated from __getitem__ to avoid a function call
            if len(proxies) >= self._max_open:
                # Close an old handle...
                proxies.popitem()[1]._handle.close()
            # Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            proxies[file_number] = proxy
            if length:
                # Shortcut if we have the length
                h = proxy._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxy.get_raw(offset)

    def close(self):
        """Close any open file handles."""
        # Note this closes the sequence file handles in the proxy pool,
        # not the SQLite connection itself.
        proxies = self._proxies
        while proxies:
            proxies.popitem()[1]._handle.close()
|