jpayne@68: # Copyright 1999 by Jeffrey Chang. All rights reserved. jpayne@68: # Copyright 2009-2018 by Peter Cock. All rights reserved. jpayne@68: # jpayne@68: # This file is part of the Biopython distribution and governed by your jpayne@68: # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". jpayne@68: # Please see the LICENSE file that should have been included as part of this jpayne@68: # package. jpayne@68: """Code for more fancy file handles. jpayne@68: jpayne@68: Bio.File defines private classes used in Bio.SeqIO and Bio.SearchIO for jpayne@68: indexing files. These are not intended for direct use. jpayne@68: """ jpayne@68: jpayne@68: import os jpayne@68: import contextlib jpayne@68: import itertools jpayne@68: import collections.abc jpayne@68: jpayne@68: from abc import ABC, abstractmethod jpayne@68: jpayne@68: try: jpayne@68: import sqlite3 jpayne@68: except ImportError: jpayne@68: # May be missing if Python was compiled from source without its dependencies jpayne@68: sqlite3 = None # type: ignore jpayne@68: jpayne@68: jpayne@68: @contextlib.contextmanager jpayne@68: def as_handle(handleish, mode="r", **kwargs): jpayne@68: r"""Context manager to ensure we are using a handle. jpayne@68: jpayne@68: Context manager for arguments that can be passed to SeqIO and AlignIO read, write, jpayne@68: and parse methods: either file objects or path-like objects (strings, pathlib.Path jpayne@68: instances, or more generally, anything that can be handled by the builtin 'open' jpayne@68: function). jpayne@68: jpayne@68: When given a path-like object, returns an open file handle to that path, with provided jpayne@68: mode, which will be closed when the manager exits. jpayne@68: jpayne@68: All other inputs are returned, and are *not* closed. jpayne@68: jpayne@68: Arguments: jpayne@68: - handleish - Either a file handle or path-like object (anything which can be jpayne@68: passed to the builtin 'open' function, such as str, bytes, jpayne@68: pathlib.Path, and os.DirEntry objects) jpayne@68: - mode - Mode to open handleish (used only if handleish is a string) jpayne@68: - kwargs - Further arguments to pass to open(...) jpayne@68: jpayne@68: Examples jpayne@68: -------- jpayne@68: >>> from Bio import File jpayne@68: >>> import os jpayne@68: >>> with File.as_handle('seqs.fasta', 'w') as fp: jpayne@68: ... fp.write('>test\nACGT') jpayne@68: ... jpayne@68: 10 jpayne@68: >>> fp.closed jpayne@68: True jpayne@68: jpayne@68: >>> handle = open('seqs.fasta', 'w') jpayne@68: >>> with File.as_handle(handle) as fp: jpayne@68: ... fp.write('>test\nACGT') jpayne@68: ... jpayne@68: 10 jpayne@68: >>> fp.closed jpayne@68: False jpayne@68: >>> fp.close() jpayne@68: >>> os.remove("seqs.fasta") # tidy up jpayne@68: jpayne@68: """ jpayne@68: try: jpayne@68: with open(handleish, mode, **kwargs) as fp: jpayne@68: yield fp jpayne@68: except TypeError: jpayne@68: yield handleish jpayne@68: jpayne@68: jpayne@68: def _open_for_random_access(filename): jpayne@68: """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE). jpayne@68: jpayne@68: This functionality is used by the Bio.SeqIO and Bio.SearchIO index jpayne@68: and index_db functions. jpayne@68: jpayne@68: If the file is gzipped but not BGZF, a specific ValueError is raised. jpayne@68: """ jpayne@68: handle = open(filename, "rb") jpayne@68: magic = handle.read(2) jpayne@68: handle.seek(0) jpayne@68: jpayne@68: if magic == b"\x1f\x8b": jpayne@68: # This is a gzipped file, but is it BGZF? jpayne@68: from . import bgzf jpayne@68: jpayne@68: try: jpayne@68: # If it is BGZF, we support that jpayne@68: return bgzf.BgzfReader(mode="rb", fileobj=handle) jpayne@68: except ValueError as e: jpayne@68: assert "BGZF" in str(e) jpayne@68: # Not a BGZF file after all, jpayne@68: handle.close() jpayne@68: raise ValueError( jpayne@68: "Gzipped files are not suitable for indexing, " jpayne@68: "please use BGZF (blocked gzip format) instead." jpayne@68: ) from None jpayne@68: jpayne@68: return handle jpayne@68: jpayne@68: jpayne@68: # The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO jpayne@68: # for indexing jpayne@68: jpayne@68: jpayne@68: class _IndexedSeqFileProxy(ABC): jpayne@68: """Abstract base class for file format specific random access (PRIVATE). jpayne@68: jpayne@68: This is subclasses in both Bio.SeqIO for indexing as SeqRecord jpayne@68: objects, and in Bio.SearchIO for indexing QueryResult objects. jpayne@68: jpayne@68: Subclasses for each file format should define '__iter__', 'get' jpayne@68: and optionally 'get_raw' methods. jpayne@68: """ jpayne@68: jpayne@68: @abstractmethod jpayne@68: def __iter__(self): jpayne@68: """Return (identifier, offset, length in bytes) tuples. jpayne@68: jpayne@68: The length can be zero where it is not implemented or not jpayne@68: possible for a particular file format. jpayne@68: """ jpayne@68: raise NotImplementedError jpayne@68: jpayne@68: @abstractmethod jpayne@68: def get(self, offset): jpayne@68: """Return parsed object for this entry.""" jpayne@68: # Most file formats with self contained records can be handled by jpayne@68: # parsing StringIO(self.get_raw(offset).decode()) jpayne@68: raise NotImplementedError jpayne@68: jpayne@68: def get_raw(self, offset): jpayne@68: """Return the raw record from the file as a bytes string (if implemented). jpayne@68: jpayne@68: If the key is not found, a KeyError exception is raised. jpayne@68: jpayne@68: This may not have been implemented for all file formats. jpayne@68: """ jpayne@68: # Should be done by each sub-class (if possible) jpayne@68: raise NotImplementedError("Not available for this file format.") jpayne@68: jpayne@68: jpayne@68: class _IndexedSeqFileDict(collections.abc.Mapping): jpayne@68: """Read only dictionary interface to a sequential record file. jpayne@68: jpayne@68: This code is used in both Bio.SeqIO for indexing as SeqRecord jpayne@68: objects, and in Bio.SearchIO for indexing QueryResult objects. jpayne@68: jpayne@68: Keeps the keys and associated file offsets in memory, reads the file jpayne@68: to access entries as objects parsing them on demand. This approach jpayne@68: is memory limited, but will work even with millions of records. jpayne@68: jpayne@68: Note duplicate keys are not allowed. If this happens, a ValueError jpayne@68: exception is raised. jpayne@68: jpayne@68: As used in Bio.SeqIO, by default the SeqRecord's id string is used jpayne@68: as the dictionary key. In Bio.SearchIO, the query's id string is jpayne@68: used. This can be changed by supplying an optional key_function, jpayne@68: a callback function which will be given the record id and must jpayne@68: return the desired key. For example, this allows you to parse jpayne@68: NCBI style FASTA identifiers, and extract the GI number to use jpayne@68: as the dictionary key. jpayne@68: jpayne@68: Note that this dictionary is essentially read only. You cannot jpayne@68: add or change values, pop values, nor clear the dictionary. jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, random_access_proxy, key_function, repr, obj_repr): jpayne@68: """Initialize the class.""" jpayne@68: # Use key_function=None for default value jpayne@68: self._proxy = random_access_proxy jpayne@68: self._key_function = key_function jpayne@68: self._repr = repr jpayne@68: self._obj_repr = obj_repr jpayne@68: self._cached_prev_record = (None, None) # (key, record) jpayne@68: if key_function: jpayne@68: offset_iter = ( jpayne@68: (key_function(key), offset, length) jpayne@68: for (key, offset, length) in random_access_proxy jpayne@68: ) jpayne@68: else: jpayne@68: offset_iter = random_access_proxy jpayne@68: offsets = {} jpayne@68: for key, offset, length in offset_iter: jpayne@68: # Note - we don't store the length because I want to minimise the jpayne@68: # memory requirements. With the SQLite backend the length is kept jpayne@68: # and is used to speed up the get_raw method (by about 3 times). jpayne@68: # The length should be provided by all the current backends except jpayne@68: # SFF where there is an existing Roche index we can reuse (very fast jpayne@68: # but lacks the record lengths) jpayne@68: # assert length or format in ["sff", "sff-trim"], \ jpayne@68: # "%s at offset %i given length %r (%s format %s)" \ jpayne@68: # % (key, offset, length, filename, format) jpayne@68: if key in offsets: jpayne@68: self._proxy._handle.close() jpayne@68: raise ValueError(f"Duplicate key '{key}'") jpayne@68: else: jpayne@68: offsets[key] = offset jpayne@68: self._offsets = offsets jpayne@68: jpayne@68: def __repr__(self): jpayne@68: """Return a string representation of the File object.""" jpayne@68: return self._repr jpayne@68: jpayne@68: def __str__(self): jpayne@68: """Create a string representation of the File object.""" jpayne@68: # TODO - How best to handle the __str__ for SeqIO and SearchIO? jpayne@68: if self: jpayne@68: return f"{{{list(self.keys())[0]!r} : {self._obj_repr}(...), ...}}" jpayne@68: else: jpayne@68: return "{}" jpayne@68: jpayne@68: def __len__(self): jpayne@68: """Return the number of records.""" jpayne@68: return len(self._offsets) jpayne@68: jpayne@68: def __iter__(self): jpayne@68: """Iterate over the keys.""" jpayne@68: return iter(self._offsets) jpayne@68: jpayne@68: def __getitem__(self, key): jpayne@68: """Return record for the specified key. jpayne@68: jpayne@68: As an optimization when repeatedly asked to look up the same record, jpayne@68: the key and record are cached so that if the *same* record is jpayne@68: requested next time, it can be returned without going to disk. jpayne@68: """ jpayne@68: if key == self._cached_prev_record[0]: jpayne@68: return self._cached_prev_record[1] jpayne@68: # Pass the offset to the proxy jpayne@68: record = self._proxy.get(self._offsets[key]) jpayne@68: if self._key_function: jpayne@68: key2 = self._key_function(record.id) jpayne@68: else: jpayne@68: key2 = record.id jpayne@68: if key != key2: jpayne@68: raise ValueError(f"Key did not match ({key} vs {key2})") jpayne@68: self._cached_prev_record = (key, record) jpayne@68: return record jpayne@68: jpayne@68: def get_raw(self, key): jpayne@68: """Return the raw record from the file as a bytes string. jpayne@68: jpayne@68: If the key is not found, a KeyError exception is raised. jpayne@68: """ jpayne@68: # Pass the offset to the proxy jpayne@68: return self._proxy.get_raw(self._offsets[key]) jpayne@68: jpayne@68: def close(self): jpayne@68: """Close the file handle being used to read the data. jpayne@68: jpayne@68: Once called, further use of the index won't work. The sole purpose jpayne@68: of this method is to allow explicit handle closure - for example jpayne@68: if you wish to delete the file, on Windows you must first close jpayne@68: all open handles to that file. jpayne@68: """ jpayne@68: self._proxy._handle.close() jpayne@68: jpayne@68: jpayne@68: class _SQLiteManySeqFilesDict(_IndexedSeqFileDict): jpayne@68: """Read only dictionary interface to many sequential record files. jpayne@68: jpayne@68: This code is used in both Bio.SeqIO for indexing as SeqRecord jpayne@68: objects, and in Bio.SearchIO for indexing QueryResult objects. jpayne@68: jpayne@68: Keeps the keys, file-numbers and offsets in an SQLite database. To access jpayne@68: a record by key, reads from the offset in the appropriate file and then jpayne@68: parses the record into an object. jpayne@68: jpayne@68: There are OS limits on the number of files that can be open at once, jpayne@68: so a pool are kept. If a record is required from a closed file, then jpayne@68: one of the open handles is closed first. jpayne@68: """ jpayne@68: jpayne@68: def __init__( jpayne@68: self, jpayne@68: index_filename, jpayne@68: filenames, jpayne@68: proxy_factory, jpayne@68: fmt, jpayne@68: key_function, jpayne@68: repr, jpayne@68: max_open=10, jpayne@68: ): jpayne@68: """Initialize the class.""" jpayne@68: # TODO? - Don't keep filename list in memory (just in DB)? jpayne@68: # Should save a chunk of memory if dealing with 1000s of files. jpayne@68: # Furthermore could compare a generator to the DB on reloading jpayne@68: # (no need to turn it into a list) jpayne@68: jpayne@68: if sqlite3 is None: jpayne@68: # Python was compiled without sqlite3 support jpayne@68: from Bio import MissingPythonDependencyError jpayne@68: jpayne@68: raise MissingPythonDependencyError( jpayne@68: "Python was compiled without the sqlite3 module" jpayne@68: ) jpayne@68: if filenames is not None: jpayne@68: filenames = list(filenames) # In case it was a generator jpayne@68: jpayne@68: # Cache the arguments as private variables jpayne@68: self._index_filename = index_filename jpayne@68: self._filenames = filenames jpayne@68: self._format = fmt jpayne@68: self._key_function = key_function jpayne@68: self._proxy_factory = proxy_factory jpayne@68: self._repr = repr jpayne@68: self._max_open = max_open jpayne@68: self._proxies = {} jpayne@68: jpayne@68: # Note if using SQLite :memory: trick index filename, this will jpayne@68: # give $PWD as the relative path (which is fine). jpayne@68: self._relative_path = os.path.abspath(os.path.dirname(index_filename)) jpayne@68: jpayne@68: if os.path.isfile(index_filename): jpayne@68: self._load_index() jpayne@68: else: jpayne@68: self._build_index() jpayne@68: jpayne@68: def _load_index(self): jpayne@68: """Call from __init__ to re-use an existing index (PRIVATE).""" jpayne@68: index_filename = self._index_filename jpayne@68: relative_path = self._relative_path jpayne@68: filenames = self._filenames jpayne@68: fmt = self._format jpayne@68: proxy_factory = self._proxy_factory jpayne@68: jpayne@68: con = sqlite3.dbapi2.connect(index_filename, check_same_thread=False) jpayne@68: self._con = con jpayne@68: # Check the count... jpayne@68: try: jpayne@68: (count,) = con.execute( jpayne@68: "SELECT value FROM meta_data WHERE key=?;", ("count",) jpayne@68: ).fetchone() jpayne@68: self._length = int(count) jpayne@68: if self._length == -1: jpayne@68: con.close() jpayne@68: raise ValueError("Unfinished/partial database") from None jpayne@68: jpayne@68: # use MAX(_ROWID_) to obtain the number of sequences in the database jpayne@68: # using COUNT(key) is quite slow in SQLITE jpayne@68: # (https://stackoverflow.com/questions/8988915/sqlite-count-slow-on-big-tables) jpayne@68: (count,) = con.execute("SELECT MAX(_ROWID_) FROM offset_data;").fetchone() jpayne@68: if self._length != int(count): jpayne@68: con.close() jpayne@68: raise ValueError( jpayne@68: "Corrupt database? %i entries not %i" % (int(count), self._length) jpayne@68: ) from None jpayne@68: (self._format,) = con.execute( jpayne@68: "SELECT value FROM meta_data WHERE key=?;", ("format",) jpayne@68: ).fetchone() jpayne@68: if fmt and fmt != self._format: jpayne@68: con.close() jpayne@68: raise ValueError( jpayne@68: f"Index file says format {self._format}, not {fmt}" jpayne@68: ) from None jpayne@68: try: jpayne@68: (filenames_relative_to_index,) = con.execute( jpayne@68: "SELECT value FROM meta_data WHERE key=?;", jpayne@68: ("filenames_relative_to_index",), jpayne@68: ).fetchone() jpayne@68: filenames_relative_to_index = ( jpayne@68: filenames_relative_to_index.upper() == "TRUE" jpayne@68: ) jpayne@68: except TypeError: jpayne@68: # Original behaviour, assume if meta_data missing jpayne@68: filenames_relative_to_index = False jpayne@68: self._filenames = [ jpayne@68: row[0] jpayne@68: for row in con.execute( jpayne@68: "SELECT name FROM file_data ORDER BY file_number;" jpayne@68: ).fetchall() jpayne@68: ] jpayne@68: if filenames_relative_to_index: jpayne@68: # Not implicitly relative to $PWD, explicitly relative to index file jpayne@68: relative_path = os.path.abspath(os.path.dirname(index_filename)) jpayne@68: tmp = [] jpayne@68: for f in self._filenames: jpayne@68: if os.path.isabs(f): jpayne@68: tmp.append(f) jpayne@68: else: jpayne@68: # Would be stored with Unix / path separator, so convert jpayne@68: # it to the local OS path separator here: jpayne@68: tmp.append( jpayne@68: os.path.join(relative_path, f.replace("/", os.path.sep)) jpayne@68: ) jpayne@68: self._filenames = tmp jpayne@68: del tmp jpayne@68: if filenames and len(filenames) != len(self._filenames): jpayne@68: con.close() jpayne@68: raise ValueError( jpayne@68: "Index file says %i files, not %i" jpayne@68: % (len(self._filenames), len(filenames)) jpayne@68: ) from None jpayne@68: if filenames and filenames != self._filenames: jpayne@68: for old, new in zip(self._filenames, filenames): jpayne@68: # Want exact match (after making relative to the index above) jpayne@68: if os.path.abspath(old) != os.path.abspath(new): jpayne@68: con.close() jpayne@68: if filenames_relative_to_index: jpayne@68: raise ValueError( jpayne@68: "Index file has different filenames, e.g. %r != %r" jpayne@68: % (os.path.abspath(old), os.path.abspath(new)) jpayne@68: ) from None jpayne@68: else: jpayne@68: raise ValueError( jpayne@68: "Index file has different filenames " jpayne@68: "[This is an old index where any relative paths " jpayne@68: "were relative to the original working directory]. " jpayne@68: "e.g. %r != %r" jpayne@68: % (os.path.abspath(old), os.path.abspath(new)) jpayne@68: ) from None jpayne@68: # Filenames are equal (after imposing abspath) jpayne@68: except sqlite3.OperationalError as err: jpayne@68: con.close() jpayne@68: raise ValueError(f"Not a Biopython index database? {err}") from None jpayne@68: # Now we have the format (from the DB if not given to us), jpayne@68: if not proxy_factory(self._format): jpayne@68: con.close() jpayne@68: raise ValueError(f"Unsupported format '{self._format}'") jpayne@68: jpayne@68: def _build_index(self): jpayne@68: """Call from __init__ to create a new index (PRIVATE).""" jpayne@68: index_filename = self._index_filename jpayne@68: relative_path = self._relative_path jpayne@68: filenames = self._filenames jpayne@68: fmt = self._format jpayne@68: key_function = self._key_function jpayne@68: proxy_factory = self._proxy_factory jpayne@68: max_open = self._max_open jpayne@68: random_access_proxies = self._proxies jpayne@68: jpayne@68: if not fmt or not filenames: jpayne@68: raise ValueError( jpayne@68: f"Filenames to index and format required to build {index_filename!r}" jpayne@68: ) jpayne@68: if not proxy_factory(fmt): jpayne@68: raise ValueError(f"Unsupported format '{fmt}'") jpayne@68: # Create the index jpayne@68: con = sqlite3.dbapi2.connect(index_filename) jpayne@68: self._con = con jpayne@68: # print("Creating index") jpayne@68: # Sqlite PRAGMA settings for speed jpayne@68: con.execute("PRAGMA synchronous=OFF") jpayne@68: con.execute("PRAGMA locking_mode=EXCLUSIVE") jpayne@68: # Don't index the key column until the end (faster) jpayne@68: # con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, " jpayne@68: # "offset INTEGER);") jpayne@68: con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);") jpayne@68: con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("count", -1)) jpayne@68: con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("format", fmt)) jpayne@68: con.execute( jpayne@68: "INSERT INTO meta_data (key, value) VALUES (?,?);", jpayne@68: ("filenames_relative_to_index", "True"), jpayne@68: ) jpayne@68: # TODO - Record the file size and modified date? jpayne@68: con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);") jpayne@68: con.execute( jpayne@68: "CREATE TABLE offset_data (key TEXT, " jpayne@68: "file_number INTEGER, offset INTEGER, length INTEGER);" jpayne@68: ) jpayne@68: count = 0 jpayne@68: for file_index, filename in enumerate(filenames): jpayne@68: # Default to storing as an absolute path, jpayne@68: f = os.path.abspath(filename) jpayne@68: if not os.path.isabs(filename) and not os.path.isabs(index_filename): jpayne@68: # Since user gave BOTH filename & index as relative paths, jpayne@68: # we will store this relative to the index file even though jpayne@68: # if it may now start ../ (meaning up a level) jpayne@68: # Note for cross platform use (e.g. shared drive over SAMBA), jpayne@68: # convert any Windows slash into Unix style for rel paths. jpayne@68: f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/") jpayne@68: elif (os.path.dirname(os.path.abspath(filename)) + os.path.sep).startswith( jpayne@68: relative_path + os.path.sep jpayne@68: ): jpayne@68: # Since sequence file is in same directory or sub directory, jpayne@68: # might as well make this into a relative path: jpayne@68: f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/") jpayne@68: assert not f.startswith("../"), f jpayne@68: # print("DEBUG - storing %r as [%r] %r" % (filename, relative_path, f)) jpayne@68: con.execute( jpayne@68: "INSERT INTO file_data (file_number, name) VALUES (?,?);", jpayne@68: (file_index, f), jpayne@68: ) jpayne@68: random_access_proxy = proxy_factory(fmt, filename) jpayne@68: if key_function: jpayne@68: offset_iter = ( jpayne@68: (key_function(key), file_index, offset, length) jpayne@68: for (key, offset, length) in random_access_proxy jpayne@68: ) jpayne@68: else: jpayne@68: offset_iter = ( jpayne@68: (key, file_index, offset, length) jpayne@68: for (key, offset, length) in random_access_proxy jpayne@68: ) jpayne@68: while True: jpayne@68: batch = list(itertools.islice(offset_iter, 100)) jpayne@68: if not batch: jpayne@68: break jpayne@68: # print("Inserting batch of %i offsets, %s ... %s" jpayne@68: # % (len(batch), batch[0][0], batch[-1][0])) jpayne@68: con.executemany( jpayne@68: "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);", jpayne@68: batch, jpayne@68: ) jpayne@68: con.commit() jpayne@68: count += len(batch) jpayne@68: if len(random_access_proxies) < max_open: jpayne@68: random_access_proxies[file_index] = random_access_proxy jpayne@68: else: jpayne@68: random_access_proxy._handle.close() jpayne@68: self._length = count jpayne@68: # print("About to index %i entries" % count) jpayne@68: try: jpayne@68: con.execute( jpayne@68: "CREATE UNIQUE INDEX IF NOT EXISTS key_index ON offset_data(key);" jpayne@68: ) jpayne@68: except sqlite3.IntegrityError as err: jpayne@68: self._proxies = random_access_proxies jpayne@68: self.close() jpayne@68: con.close() jpayne@68: raise ValueError(f"Duplicate key? {err}") from None jpayne@68: con.execute("PRAGMA locking_mode=NORMAL") jpayne@68: con.execute("UPDATE meta_data SET value = ? WHERE key = ?;", (count, "count")) jpayne@68: con.commit() jpayne@68: # print("Index created") jpayne@68: jpayne@68: def __repr__(self): jpayne@68: return self._repr jpayne@68: jpayne@68: def __contains__(self, key): jpayne@68: return bool( jpayne@68: self._con.execute( jpayne@68: "SELECT key FROM offset_data WHERE key=?;", (key,) jpayne@68: ).fetchone() jpayne@68: ) jpayne@68: jpayne@68: def __len__(self): jpayne@68: """Return the number of records indexed.""" jpayne@68: return self._length jpayne@68: # return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0] jpayne@68: jpayne@68: def __iter__(self): jpayne@68: """Iterate over the keys.""" jpayne@68: for row in self._con.execute( jpayne@68: "SELECT key FROM offset_data ORDER BY file_number, offset;" jpayne@68: ): jpayne@68: yield str(row[0]) jpayne@68: jpayne@68: def __getitem__(self, key): jpayne@68: """Return record for the specified key.""" jpayne@68: # Pass the offset to the proxy jpayne@68: row = self._con.execute( jpayne@68: "SELECT file_number, offset FROM offset_data WHERE key=?;", (key,) jpayne@68: ).fetchone() jpayne@68: if not row: jpayne@68: raise KeyError jpayne@68: file_number, offset = row jpayne@68: proxies = self._proxies jpayne@68: if file_number in proxies: jpayne@68: record = proxies[file_number].get(offset) jpayne@68: else: jpayne@68: if len(proxies) >= self._max_open: jpayne@68: # Close an old handle... jpayne@68: proxies.popitem()[1]._handle.close() jpayne@68: # Open a new handle... jpayne@68: proxy = self._proxy_factory(self._format, self._filenames[file_number]) jpayne@68: record = proxy.get(offset) jpayne@68: proxies[file_number] = proxy jpayne@68: if self._key_function: jpayne@68: key2 = self._key_function(record.id) jpayne@68: else: jpayne@68: key2 = record.id jpayne@68: if key != key2: jpayne@68: raise ValueError(f"Key did not match ({key} vs {key2})") jpayne@68: return record jpayne@68: jpayne@68: def get_raw(self, key): jpayne@68: """Return the raw record from the file as a bytes string. jpayne@68: jpayne@68: If the key is not found, a KeyError exception is raised. jpayne@68: """ jpayne@68: # Pass the offset to the proxy jpayne@68: row = self._con.execute( jpayne@68: "SELECT file_number, offset, length FROM offset_data WHERE key=?;", (key,) jpayne@68: ).fetchone() jpayne@68: if not row: jpayne@68: raise KeyError jpayne@68: file_number, offset, length = row jpayne@68: proxies = self._proxies jpayne@68: if file_number in proxies: jpayne@68: if length: jpayne@68: # Shortcut if we have the length jpayne@68: h = proxies[file_number]._handle jpayne@68: h.seek(offset) jpayne@68: return h.read(length) jpayne@68: else: jpayne@68: return proxies[file_number].get_raw(offset) jpayne@68: else: jpayne@68: # This code is duplicated from __getitem__ to avoid a function call jpayne@68: if len(proxies) >= self._max_open: jpayne@68: # Close an old handle... jpayne@68: proxies.popitem()[1]._handle.close() jpayne@68: # Open a new handle... jpayne@68: proxy = self._proxy_factory(self._format, self._filenames[file_number]) jpayne@68: proxies[file_number] = proxy jpayne@68: if length: jpayne@68: # Shortcut if we have the length jpayne@68: h = proxy._handle jpayne@68: h.seek(offset) jpayne@68: return h.read(length) jpayne@68: else: jpayne@68: return proxy.get_raw(offset) jpayne@68: jpayne@68: def close(self): jpayne@68: """Close any open file handles.""" jpayne@68: proxies = self._proxies jpayne@68: while proxies: jpayne@68: proxies.popitem()[1]._handle.close()