# Copyright 1999 by Jeffrey Chang. All rights reserved.
# Copyright 2009-2018 by Peter Cock. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for fancier file handles.

Bio.File defines private classes used in Bio.SeqIO and Bio.SearchIO for
indexing files. These are not intended for direct use.
"""

import os
import contextlib
import itertools
import collections.abc

from abc import ABC, abstractmethod

try:
    import sqlite3
except ImportError:
    # May be missing if Python was compiled from source without its dependencies
    sqlite3 = None  # type: ignore


@contextlib.contextmanager
def as_handle(handleish, mode="r", **kwargs):
    r"""Context manager to ensure we are using a handle.

    Context manager for arguments that can be passed to SeqIO and AlignIO read, write,
    and parse methods: either file objects or path-like objects (strings, pathlib.Path
    instances, or more generally, anything that can be handled by the builtin 'open'
    function).

    When given a path-like object, returns an open file handle to that path, with the
    provided mode, which will be closed when the manager exits.

    All other inputs are returned, and are *not* closed.

    Arguments:
     - handleish - Either a file handle or path-like object (anything which can be
       passed to the builtin 'open' function, such as str, bytes,
       pathlib.Path, and os.DirEntry objects)
     - mode      - Mode to open handleish with (used only if handleish is a
       path-like object)
     - kwargs    - Further arguments to pass to open(...)

    Examples
    --------
    >>> from Bio import File
    >>> import os
    >>> with File.as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    True

    >>> handle = open('seqs.fasta', 'w')
    >>> with File.as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    False
    >>> fp.close()
    >>> os.remove("seqs.fasta")  # tidy up

    """
    try:
        with open(handleish, mode, **kwargs) as fp:
            yield fp
    except TypeError:
        yield handleish


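# A minimal usage sketch (not part of the Biopython API; this helper name is
# hypothetical): as_handle lets calling code accept either a filename or an
# already-open handle, closing only handles that it opened itself.
def _example_count_lines(source):
    """Count the lines in ``source``, a path-like object or an open handle."""
    with as_handle(source) as handle:
        # Works the same whether 'source' was a path or a handle
        return sum(1 for _ in handle)

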
def _open_for_random_access(filename):
    """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE).

    This functionality is used by the Bio.SeqIO and Bio.SearchIO index
    and index_db functions.

    If the file is gzipped but not BGZF, a specific ValueError is raised.
    """
    handle = open(filename, "rb")
    magic = handle.read(2)
    handle.seek(0)

    if magic == b"\x1f\x8b":
        # This is a gzipped file, but is it BGZF?
        from . import bgzf

        try:
            # If it is BGZF, we support that
            return bgzf.BgzfReader(mode="rb", fileobj=handle)
        except ValueError as e:
            assert "BGZF" in str(e)
            # Not a BGZF file after all.
            handle.close()
            raise ValueError(
                "Gzipped files are not suitable for indexing, "
                "please use BGZF (blocked gzip format) instead."
            ) from None

    return handle


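# Hedged aside: BGZF files are ordinary gzip files (magic bytes 0x1f 0x8b)
# whose gzip header carries a "BC" extra subfield, which is what
# bgzf.BgzfReader checks for; that is why a plain gzip file surfaces above
# as a ValueError. The helper below is a hypothetical standalone sketch,
# unused by this module, doing only the cheaper first test ("is this gzip
# at all?").
def _example_is_gzip(filename):
    """Return True if the file starts with the two gzip magic bytes."""
    with open(filename, "rb") as handle:
        return handle.read(2) == b"\x1f\x8b"

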
# The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO
# for indexing


class _IndexedSeqFileProxy(ABC):
    """Abstract base class for file format specific random access (PRIVATE).

    This is subclassed in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Subclasses for each file format should define '__iter__', 'get'
    and optionally 'get_raw' methods.
    """

    @abstractmethod
    def __iter__(self):
        """Return (identifier, offset, length in bytes) tuples.

        The length can be zero where it is not implemented or not
        possible for a particular file format.
        """
        raise NotImplementedError

    @abstractmethod
    def get(self, offset):
        """Return parsed object for this entry."""
        # Most file formats with self contained records can be handled by
        # parsing StringIO(self.get_raw(offset).decode())
        raise NotImplementedError

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string (if implemented).

        If the key is not found, a KeyError exception is raised.

        This may not have been implemented for all file formats.
        """
        # Should be done by each sub-class (if possible)
        raise NotImplementedError("Not available for this file format.")


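# Hedged sketch (illustration only, used nowhere in Biopython): a proxy
# subclass for a hypothetical one-record-per-line "id<TAB>data" format,
# showing the contract the abstract methods above describe.
class _ExampleTabProxy(_IndexedSeqFileProxy):
    """Toy random access proxy for an imaginary tab-separated format."""

    def __init__(self, filename):
        self._handle = _open_for_random_access(filename)

    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        while True:
            offset = handle.tell()
            line = handle.readline()
            if not line:
                break
            # (identifier, offset, length in bytes) as required by the ABC
            yield line.split(b"\t", 1)[0].decode(), offset, len(line)

    def get(self, offset):
        # A real subclass would parse this into e.g. a SeqRecord
        return self.get_raw(offset).decode()

    def get_raw(self, offset):
        self._handle.seek(offset)
        return self._handle.readline()

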
class _IndexedSeqFileDict(collections.abc.Mapping):
    """Read only dictionary interface to a sequential record file.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys and associated file offsets in memory, and reads the
    file to access entries as objects, parsing them on demand. This
    approach is memory limited, but will work even with millions of
    records.

    Note duplicate keys are not allowed. If a duplicate key is found,
    a ValueError exception is raised.

    As used in Bio.SeqIO, by default the SeqRecord's id string is used
    as the dictionary key. In Bio.SearchIO, the query's id string is
    used. This can be changed by supplying an optional key_function,
    a callback function which will be given the record id and must
    return the desired key. For example, this allows you to parse
    NCBI style FASTA identifiers, and extract the GI number to use
    as the dictionary key.

    Note that this dictionary is essentially read only. You cannot
    add or change values, pop values, nor clear the dictionary.
    """

    def __init__(self, random_access_proxy, key_function, repr, obj_repr):
        """Initialize the class."""
        # Use key_function=None for default value
        self._proxy = random_access_proxy
        self._key_function = key_function
        self._repr = repr
        self._obj_repr = obj_repr
        self._cached_prev_record = (None, None)  # (key, record)
        if key_function:
            offset_iter = (
                (key_function(key), offset, length)
                for (key, offset, length) in random_access_proxy
            )
        else:
            offset_iter = random_access_proxy
        offsets = {}
        for key, offset, length in offset_iter:
            # Note - we don't store the length because I want to minimise the
            # memory requirements. With the SQLite backend the length is kept
            # and is used to speed up the get_raw method (by about 3 times).
            # The length should be provided by all the current backends except
            # SFF where there is an existing Roche index we can reuse (very fast
            # but lacks the record lengths)
            # assert length or format in ["sff", "sff-trim"], \
            #     "%s at offset %i given length %r (%s format %s)" \
            #     % (key, offset, length, filename, format)
            if key in offsets:
                self._proxy._handle.close()
                raise ValueError(f"Duplicate key '{key}'")
            else:
                offsets[key] = offset
        self._offsets = offsets

    def __repr__(self):
        """Return a string representation of the File object."""
        return self._repr

    def __str__(self):
        """Create a string representation of the File object."""
        # TODO - How best to handle the __str__ for SeqIO and SearchIO?
        if self:
            return f"{{{list(self.keys())[0]!r} : {self._obj_repr}(...), ...}}"
        else:
            return "{}"

    def __len__(self):
        """Return the number of records."""
        return len(self._offsets)

    def __iter__(self):
        """Iterate over the keys."""
        return iter(self._offsets)

    def __getitem__(self, key):
        """Return record for the specified key.

        As an optimization when repeatedly asked to look up the same record,
        the key and record are cached so that if the *same* record is
        requested next time, it can be returned without going to disk.
        """
        if key == self._cached_prev_record[0]:
            return self._cached_prev_record[1]
        # Pass the offset to the proxy
        record = self._proxy.get(self._offsets[key])
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError(f"Key did not match ({key} vs {key2})")
        self._cached_prev_record = (key, record)
        return record

    def get_raw(self, key):
        """Return the raw record from the file as a bytes string.

        If the key is not found, a KeyError exception is raised.
        """
        # Pass the offset to the proxy
        return self._proxy.get_raw(self._offsets[key])

    def close(self):
        """Close the file handle being used to read the data.

        Once called, further use of the index won't work. The sole purpose
        of this method is to allow explicit handle closure - for example
        if you wish to delete the file, on Windows you must first close
        all open handles to that file.
        """
        self._proxy._handle.close()


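# Hedged usage sketch: _IndexedSeqFileDict is normally constructed for you
# by Bio.SeqIO.index(), which returns an instance behaving as a read-only
# mapping (the FASTA filename below is hypothetical):
#
#     from Bio import SeqIO
#     index = SeqIO.index("example.fasta", "fasta")
#     print(len(index))
#     record = index["some_id"]  # parsed on demand from the file offset
#     index.close()

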
class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
    """Read only dictionary interface to many sequential record files.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys, file-numbers and offsets in an SQLite database. To access
    a record by key, reads from the offset in the appropriate file and then
    parses the record into an object.

    There are OS limits on the number of files that can be open at once,
    so a pool of open handles is kept. If a record is required from a
    closed file, then one of the open handles is closed first.
    """

    def __init__(
        self,
        index_filename,
        filenames,
        proxy_factory,
        fmt,
        key_function,
        repr,
        max_open=10,
    ):
        """Initialize the class."""
        # TODO? - Don't keep filename list in memory (just in DB)?
        # Should save a chunk of memory if dealing with 1000s of files.
        # Furthermore could compare a generator to the DB on reloading
        # (no need to turn it into a list)

        if sqlite3 is None:
            # Python was compiled without sqlite3 support
            from Bio import MissingPythonDependencyError

            raise MissingPythonDependencyError(
                "Python was compiled without the sqlite3 module"
            )
        if filenames is not None:
            filenames = list(filenames)  # In case it was a generator

        # Cache the arguments as private variables
        self._index_filename = index_filename
        self._filenames = filenames
        self._format = fmt
        self._key_function = key_function
        self._proxy_factory = proxy_factory
        self._repr = repr
        self._max_open = max_open
        self._proxies = {}

        # Note if using SQLite :memory: trick index filename, this will
        # give $PWD as the relative path (which is fine).
        self._relative_path = os.path.abspath(os.path.dirname(index_filename))

        if os.path.isfile(index_filename):
            self._load_index()
        else:
            self._build_index()

    def _load_index(self):
        """Call from __init__ to re-use an existing index (PRIVATE)."""
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        fmt = self._format
        proxy_factory = self._proxy_factory

        con = sqlite3.dbapi2.connect(index_filename, check_same_thread=False)
        self._con = con
        # Check the count...
        try:
            (count,) = con.execute(
                "SELECT value FROM meta_data WHERE key=?;", ("count",)
            ).fetchone()
            self._length = int(count)
            if self._length == -1:
                con.close()
                raise ValueError("Unfinished/partial database") from None

            # use MAX(_ROWID_) to obtain the number of sequences in the database
            # using COUNT(key) is quite slow in SQLITE
            # (https://stackoverflow.com/questions/8988915/sqlite-count-slow-on-big-tables)
            (count,) = con.execute("SELECT MAX(_ROWID_) FROM offset_data;").fetchone()
            if self._length != int(count):
                con.close()
                raise ValueError(
                    "Corrupt database? %i entries not %i" % (int(count), self._length)
                ) from None
            (self._format,) = con.execute(
                "SELECT value FROM meta_data WHERE key=?;", ("format",)
            ).fetchone()
            if fmt and fmt != self._format:
                con.close()
                raise ValueError(
                    f"Index file says format {self._format}, not {fmt}"
                ) from None
            try:
                (filenames_relative_to_index,) = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("filenames_relative_to_index",),
                ).fetchone()
                filenames_relative_to_index = (
                    filenames_relative_to_index.upper() == "TRUE"
                )
            except TypeError:
                # Original behaviour: assume False if this meta_data key is missing
                filenames_relative_to_index = False
            self._filenames = [
                row[0]
                for row in con.execute(
                    "SELECT name FROM file_data ORDER BY file_number;"
                ).fetchall()
            ]
            if filenames_relative_to_index:
                # Not implicitly relative to $PWD, explicitly relative to index file
                relative_path = os.path.abspath(os.path.dirname(index_filename))
                tmp = []
                for f in self._filenames:
                    if os.path.isabs(f):
                        tmp.append(f)
                    else:
                        # Would be stored with Unix / path separator, so convert
                        # it to the local OS path separator here:
                        tmp.append(
                            os.path.join(relative_path, f.replace("/", os.path.sep))
                        )
                self._filenames = tmp
                del tmp
            if filenames and len(filenames) != len(self._filenames):
                con.close()
                raise ValueError(
                    "Index file says %i files, not %i"
                    % (len(self._filenames), len(filenames))
                ) from None
            if filenames and filenames != self._filenames:
                for old, new in zip(self._filenames, filenames):
                    # Want exact match (after making relative to the index above)
                    if os.path.abspath(old) != os.path.abspath(new):
                        con.close()
                        if filenames_relative_to_index:
                            raise ValueError(
                                "Index file has different filenames, e.g. %r != %r"
                                % (os.path.abspath(old), os.path.abspath(new))
                            ) from None
                        else:
                            raise ValueError(
                                "Index file has different filenames "
                                "[This is an old index where any relative paths "
                                "were relative to the original working directory]. "
                                "e.g. %r != %r"
                                % (os.path.abspath(old), os.path.abspath(new))
                            ) from None
                # Filenames are equal (after imposing abspath)
        except sqlite3.OperationalError as err:
            con.close()
            raise ValueError(f"Not a Biopython index database? {err}") from None
        # Now we have the format (from the DB if not given to us),
        if not proxy_factory(self._format):
            con.close()
            raise ValueError(f"Unsupported format '{self._format}'")

    def _build_index(self):
        """Call from __init__ to create a new index (PRIVATE)."""
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        fmt = self._format
        key_function = self._key_function
        proxy_factory = self._proxy_factory
        max_open = self._max_open
        random_access_proxies = self._proxies

        if not fmt or not filenames:
            raise ValueError(
                f"Filenames to index and format required to build {index_filename!r}"
            )
        if not proxy_factory(fmt):
            raise ValueError(f"Unsupported format '{fmt}'")
        # Create the index
        con = sqlite3.dbapi2.connect(index_filename)
        self._con = con
        # print("Creating index")
        # Sqlite PRAGMA settings for speed
        con.execute("PRAGMA synchronous=OFF")
        con.execute("PRAGMA locking_mode=EXCLUSIVE")
        # Don't index the key column until the end (faster)
        # con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
        #             "offset INTEGER);")
        con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("count", -1))
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("format", fmt))
        con.execute(
            "INSERT INTO meta_data (key, value) VALUES (?,?);",
            ("filenames_relative_to_index", "True"),
        )
        # TODO - Record the file size and modified date?
        con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);")
        con.execute(
            "CREATE TABLE offset_data (key TEXT, "
            "file_number INTEGER, offset INTEGER, length INTEGER);"
        )
        count = 0
        for file_index, filename in enumerate(filenames):
            # Default to storing as an absolute path,
            f = os.path.abspath(filename)
            if not os.path.isabs(filename) and not os.path.isabs(index_filename):
                # Since user gave BOTH filename & index as relative paths,
                # we will store this relative to the index file, even though
                # it may now start ../ (meaning up a level)
                # Note for cross platform use (e.g. shared drive over SAMBA),
                # convert any Windows slash into Unix style for rel paths.
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
            elif (os.path.dirname(os.path.abspath(filename)) + os.path.sep).startswith(
                relative_path + os.path.sep
            ):
                # Since sequence file is in same directory or sub directory,
                # might as well make this into a relative path:
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
                assert not f.startswith("../"), f
            # print("DEBUG - storing %r as [%r] %r" % (filename, relative_path, f))
            con.execute(
                "INSERT INTO file_data (file_number, name) VALUES (?,?);",
                (file_index, f),
            )
            random_access_proxy = proxy_factory(fmt, filename)
            if key_function:
                offset_iter = (
                    (key_function(key), file_index, offset, length)
                    for (key, offset, length) in random_access_proxy
                )
            else:
                offset_iter = (
                    (key, file_index, offset, length)
                    for (key, offset, length) in random_access_proxy
                )
            while True:
                batch = list(itertools.islice(offset_iter, 100))
                if not batch:
                    break
                # print("Inserting batch of %i offsets, %s ... %s"
                #       % (len(batch), batch[0][0], batch[-1][0]))
                con.executemany(
                    "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
                    batch,
                )
                con.commit()
                count += len(batch)
            if len(random_access_proxies) < max_open:
                random_access_proxies[file_index] = random_access_proxy
            else:
                random_access_proxy._handle.close()
        self._length = count
        # print("About to index %i entries" % count)
        try:
            con.execute(
                "CREATE UNIQUE INDEX IF NOT EXISTS key_index ON offset_data(key);"
            )
        except sqlite3.IntegrityError as err:
            self._proxies = random_access_proxies
            self.close()
            con.close()
            raise ValueError(f"Duplicate key? {err}") from None
        con.execute("PRAGMA locking_mode=NORMAL")
        con.execute("UPDATE meta_data SET value = ? WHERE key = ?;", (count, "count"))
        con.commit()
        # print("Index created")

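    # Hedged aside (illustration only): the tables created above form the
    # whole on-disk schema, and can be inspected directly with the sqlite3
    # module; the index filename "my.idx" is hypothetical.
    #
    #     import sqlite3
    #     con = sqlite3.connect("my.idx")
    #     print(con.execute("SELECT key, value FROM meta_data;").fetchall())
    #     print(con.execute("SELECT file_number, name FROM file_data;").fetchall())
    #     print(con.execute("SELECT COUNT(*) FROM offset_data;").fetchone())
    #     con.close()
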
    def __repr__(self):
        return self._repr

    def __contains__(self, key):
        return bool(
            self._con.execute(
                "SELECT key FROM offset_data WHERE key=?;", (key,)
            ).fetchone()
        )

    def __len__(self):
        """Return the number of records indexed."""
        return self._length
        # return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0]

    def __iter__(self):
        """Iterate over the keys."""
        for row in self._con.execute(
            "SELECT key FROM offset_data ORDER BY file_number, offset;"
        ):
            yield str(row[0])

    def __getitem__(self, key):
        """Return record for the specified key."""
        # Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset FROM offset_data WHERE key=?;", (key,)
        ).fetchone()
        if not row:
            raise KeyError
        file_number, offset = row
        proxies = self._proxies
        if file_number in proxies:
            record = proxies[file_number].get(offset)
        else:
            if len(proxies) >= self._max_open:
                # Close an old handle...
                proxies.popitem()[1]._handle.close()
            # Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            record = proxy.get(offset)
            proxies[file_number] = proxy
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError(f"Key did not match ({key} vs {key2})")
        return record

    def get_raw(self, key):
        """Return the raw record from the file as a bytes string.

        If the key is not found, a KeyError exception is raised.
        """
        # Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset, length FROM offset_data WHERE key=?;", (key,)
        ).fetchone()
        if not row:
            raise KeyError
        file_number, offset, length = row
        proxies = self._proxies
        if file_number in proxies:
            if length:
                # Shortcut if we have the length
                h = proxies[file_number]._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxies[file_number].get_raw(offset)
        else:
            # This code is duplicated from __getitem__ to avoid a function call
            if len(proxies) >= self._max_open:
                # Close an old handle...
                proxies.popitem()[1]._handle.close()
            # Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            proxies[file_number] = proxy
            if length:
                # Shortcut if we have the length
                h = proxy._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxy.get_raw(offset)

    def close(self):
        """Close any open file handles."""
        proxies = self._proxies
        while proxies:
            proxies.popitem()[1]._handle.close()
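

# Hedged usage sketch: _SQLiteManySeqFilesDict is normally constructed via
# Bio.SeqIO.index_db(), which persists the index between runs (all the
# filenames below are hypothetical):
#
#     from Bio import SeqIO
#     d = SeqIO.index_db("seqs.idx", ["one.fasta", "two.fasta"], "fasta")
#     print(len(d))  # total records across both files
#     record = d["some_id"]  # opens (at most max_open) handles on demand
#     d.close()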