jpayne@68: # Copyright 1999 by Jeffrey Chang.  All rights reserved.
jpayne@68: # Copyright 2009-2018 by Peter Cock. All rights reserved.
jpayne@68: #
jpayne@68: # This file is part of the Biopython distribution and governed by your
jpayne@68: # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@68: # Please see the LICENSE file that should have been included as part of this
jpayne@68: # package.
jpayne@68: """Code for more fancy file handles.
jpayne@68: 
jpayne@68: Bio.File defines private classes used in Bio.SeqIO and Bio.SearchIO for
jpayne@68: indexing files. These are not intended for direct use.
jpayne@68: """
jpayne@68: 
jpayne@68: import os
jpayne@68: import contextlib
jpayne@68: import itertools
jpayne@68: import collections.abc
jpayne@68: 
jpayne@68: from abc import ABC, abstractmethod
jpayne@68: 
jpayne@68: try:
jpayne@68:     import sqlite3
jpayne@68: except ImportError:
jpayne@68:     # May be missing if Python was compiled from source without its dependencies
jpayne@68:     sqlite3 = None  # type: ignore
jpayne@68: 
jpayne@68: 
@contextlib.contextmanager
def as_handle(handleish, mode="r", **kwargs):
    r"""Context manager to ensure we are using a handle.

    Context manager for arguments that can be passed to SeqIO and AlignIO read, write,
    and parse methods: either file objects or path-like objects (strings, pathlib.Path
    instances, or more generally, anything that can be handled by the builtin 'open'
    function).

    When given a path-like object, returns an open file handle to that path, with provided
    mode, which will be closed when the manager exits.

    All other inputs are returned, and are *not* closed.

    Exceptions raised inside the ``with`` block (including TypeError) propagate
    to the caller unchanged.

    Arguments:
     - handleish  - Either a file handle or path-like object (anything which can be
                    passed to the builtin 'open' function, such as str, bytes,
                    pathlib.Path, and os.DirEntry objects)
     - mode       - Mode to open handleish (used only if handleish can be opened)
     - kwargs     - Further arguments to pass to open(...)

    Examples
    --------
    >>> from Bio import File
    >>> import os
    >>> with File.as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    True

    >>> handle = open('seqs.fasta', 'w')
    >>> with File.as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    False
    >>> fp.close()
    >>> os.remove("seqs.fasta")  # tidy up

    """
    # Only the open() call itself is guarded. If the yield were inside the
    # try block, a TypeError raised by the caller's with-body would be caught
    # here and trigger a second yield, which contextlib reports as
    # "generator didn't stop after throw()".
    try:
        fp = open(handleish, mode, **kwargs)
    except TypeError:
        # open() rejected the argument, so assume it is already a handle.
        fp = None

    if fp is None:
        # Caller owns this handle; hand it back and do not close it.
        yield handleish
    else:
        # We opened it, so we are responsible for closing it.
        with fp:
            yield fp
jpayne@68: 
jpayne@68: 
def _open_for_random_access(filename):
    """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE).

    This functionality is used by the Bio.SeqIO and Bio.SearchIO index
    and index_db functions.

    Returns the plain binary handle (positioned at the start of the file),
    or a bgzf.BgzfReader wrapping it when the file starts with the gzip
    magic bytes and is accepted by BgzfReader.

    If the file is gzipped but not BGZF, a specific ValueError is raised.
    On any failure the underlying file handle is closed before the
    exception propagates.
    """
    handle = open(filename, "rb")
    try:
        magic = handle.read(2)
        handle.seek(0)

        if magic != b"\x1f\x8b":
            # Not gzip compressed at all, use the plain handle.
            return handle

        # This is a gzipped file, but is it BGZF?
        from . import bgzf

        try:
            # If it is BGZF, we support that
            return bgzf.BgzfReader(mode="rb", fileobj=handle)
        except ValueError as e:
            if "BGZF" not in str(e):
                # Some unrelated problem - do not disguise it.
                raise
            # Not a BGZF file after all,
            raise ValueError(
                "Gzipped files are not suitable for indexing, "
                "please use BGZF (blocked gzip format) instead."
            ) from None
    except BaseException:
        # Never leak the handle when we are not returning it to the caller.
        handle.close()
        raise
jpayne@68: 
jpayne@68: 
jpayne@68: # The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO
jpayne@68: # for indexing
jpayne@68: 
jpayne@68: 
class _IndexedSeqFileProxy(ABC):
    """Abstract base class for file format specific random access (PRIVATE).

    This is subclassed in both Bio.SeqIO (for indexing files as SeqRecord
    objects) and Bio.SearchIO (for indexing files as QueryResult objects).

    A subclass for a given file format must define '__iter__' and 'get',
    and may optionally override 'get_raw'.
    """

    @abstractmethod
    def __iter__(self):
        """Return (identifier, offset, length in bytes) tuples.

        Where the length is not implemented, or cannot be determined for
        a particular file format, it can be zero.
        """
        raise NotImplementedError()

    @abstractmethod
    def get(self, offset):
        """Return parsed object for the entry starting at this offset."""
        # For most file formats with self contained records this can be done
        # by parsing StringIO(self.get_raw(offset).decode())
        raise NotImplementedError()

    def get_raw(self, offset):
        """Return the raw record at this offset as a bytes string (if implemented).

        This may not have been implemented for all file formats, in which
        case NotImplementedError is raised.
        """
        # Each sub-class should override this (if possible)
        message = "Not available for this file format."
        raise NotImplementedError(message)
jpayne@68: 
jpayne@68: 
class _IndexedSeqFileDict(collections.abc.Mapping):
    """Read only dictionary interface to a sequential record file.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys and associated file offsets in memory, reads the file
    to access entries as objects parsing them on demand. This approach
    is memory limited, but will work even with millions of records.

    Note duplicate keys are not allowed. If this happens, a ValueError
    exception is raised.

    As used in Bio.SeqIO, by default the SeqRecord's id string is used
    as the dictionary key. In Bio.SearchIO, the query's id string is
    used. This can be changed by supplying an optional key_function,
    a callback function which will be given the record id and must
    return the desired key. For example, this allows you to parse
    NCBI style FASTA identifiers, and extract the GI number to use
    as the dictionary key.

    Note that this dictionary is essentially read only. You cannot
    add or change values, pop values, nor clear the dictionary.
    """

    def __init__(self, random_access_proxy, key_function, repr, obj_repr):
        """Initialize the class.

        Arguments:
         - random_access_proxy - Object which iterates as (key, offset,
           length) tuples, offers get(offset) and get_raw(offset), and
           exposes the open file as its _handle attribute.
         - key_function - Optional callback mapping a record id to the
           dictionary key, use None for the default (the id itself).
         - repr - String returned by __repr__.
         - obj_repr - Name of the record class, used by __str__.

        Raises ValueError (after closing the proxy's handle) if two
        records give the same key.
        """
        self._proxy = random_access_proxy
        self._key_function = key_function
        self._repr = repr
        self._obj_repr = obj_repr
        # (key, record) - a record of None means nothing has been cached yet
        self._cached_prev_record = (None, None)
        if key_function:
            offset_iter = (
                (key_function(key), offset, length)
                for (key, offset, length) in random_access_proxy
            )
        else:
            offset_iter = random_access_proxy
        offsets = {}
        for key, offset, _length in offset_iter:
            # Note - we don't store the length because I want to minimise the
            # memory requirements. With the SQLite backend the length is kept
            # and is used to speed up the get_raw method (by about 3 times).
            # The length should be provided by all the current backends except
            # SFF where there is an existing Roche index we can reuse (very fast
            # but lacks the record lengths)
            if key in offsets:
                self._proxy._handle.close()
                raise ValueError(f"Duplicate key '{key}'")
            offsets[key] = offset
        self._offsets = offsets

    def __repr__(self):
        """Return a string representation of the File object."""
        return self._repr

    def __str__(self):
        """Create a string representation of the File object."""
        # TODO - How best to handle the __str__ for SeqIO and SearchIO?
        if not self:
            return "{}"
        # Only the first key is shown, so do not build a list of them all
        # (there could be millions).
        first_key = next(iter(self))
        return f"{{{first_key!r} : {self._obj_repr}(...), ...}}"

    def __len__(self):
        """Return the number of records."""
        return len(self._offsets)

    def __iter__(self):
        """Iterate over the keys."""
        return iter(self._offsets)

    def __getitem__(self, key):
        """Return record for the specified key.

        As an optimization when repeatedly asked to look up the same record,
        the key and record are cached so that if the *same* record is
        requested next time, it can be returned without going to disk.

        Raises KeyError if the key is not in the index, and ValueError if
        the record parsed from the file does not give the requested key.
        """
        cached_key, cached_record = self._cached_prev_record
        # Check a record really was cached: the initial placeholder key is
        # None, which must not be mistaken for a cached entry under key None.
        if cached_record is not None and key == cached_key:
            return cached_record
        # Pass the offset to the proxy
        record = self._proxy.get(self._offsets[key])
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError(f"Key did not match ({key} vs {key2})")
        self._cached_prev_record = (key, record)
        return record

    def get_raw(self, key):
        """Return the raw record from the file as a bytes string.

        If the key is not found, a KeyError exception is raised.
        """
        # Pass the offset to the proxy
        return self._proxy.get_raw(self._offsets[key])

    def close(self):
        """Close the file handle being used to read the data.

        Once called, further use of the index won't work. The sole purpose
        of this method is to allow explicit handle closure - for example
        if you wish to delete the file, on Windows you must first close
        all open handles to that file.
        """
        self._proxy._handle.close()
jpayne@68: 
jpayne@68: 
class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
    """Read only dictionary interface to many sequential record files.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys, file-numbers and offsets in an SQLite database. To access
    a record by key, reads from the offset in the appropriate file and then
    parses the record into an object.

    There are OS limits on the number of files that can be open at once,
    so a pool are kept. If a record is required from a closed file, then
    one of the open handles is closed first.
    """

    def __init__(
        self,
        index_filename,
        filenames,
        proxy_factory,
        fmt,
        key_function,
        repr,
        max_open=10,
    ):
        """Initialize the class.

        Arguments:
         - index_filename - Path of the SQLite index. If this file exists
           it is loaded, otherwise it is built.
         - filenames - Iterable of the files to index, may be None when
           loading an existing index.
         - proxy_factory - Callable used as proxy_factory(fmt) to check
           a format is supported, and proxy_factory(fmt, filename) to
           create a random access proxy for one file.
         - fmt - Format name, may be None when loading an existing index.
         - key_function - Optional callback mapping a record id to the
           dictionary key, or None.
         - repr - String returned by __repr__.
         - max_open - Maximum number of file proxies kept open at once.

        Raises MissingPythonDependencyError if the sqlite3 module is not
        available, and ValueError for unusable arguments or index files.
        """
        # TODO? - Don't keep filename list in memory (just in DB)?
        # Should save a chunk of memory if dealing with 1000s of files.
        # Furthermore could compare a generator to the DB on reloading
        # (no need to turn it into a list)

        if sqlite3 is None:
            # Python was compiled without sqlite3 support
            from Bio import MissingPythonDependencyError

            raise MissingPythonDependencyError(
                "Python was compiled without the sqlite3 module"
            )
        if filenames is not None:
            filenames = list(filenames)  # In case it was a generator

        # Cache the arguments as private variables
        self._index_filename = index_filename
        self._filenames = filenames
        self._format = fmt
        self._key_function = key_function
        self._proxy_factory = proxy_factory
        self._repr = repr
        self._max_open = max_open
        self._proxies = {}

        # Note if using SQLite :memory: trick index filename, this will
        # give $PWD as the relative path (which is fine).
        self._relative_path = os.path.abspath(os.path.dirname(index_filename))

        if os.path.isfile(index_filename):
            self._load_index()
        else:
            self._build_index()

    def _load_index(self):
        """Call from __init__ to re-use an existing index (PRIVATE).

        Checks the stored record count, format and filenames against the
        database contents and any values given to __init__. Raises
        ValueError (after closing the connection) on any mismatch.
        """
        index_filename = self._index_filename
        filenames = self._filenames
        fmt = self._format
        proxy_factory = self._proxy_factory

        con = sqlite3.dbapi2.connect(index_filename, check_same_thread=False)
        self._con = con
        try:
            # Check the count...
            (count,) = con.execute(
                "SELECT value FROM meta_data WHERE key=?;", ("count",)
            ).fetchone()
            self._length = int(count)
            if self._length == -1:
                # The placeholder written at the start of _build_index was
                # never replaced, so indexing did not finish.
                con.close()
                raise ValueError("Unfinished/partial database") from None

            # use MAX(_ROWID_) to obtain the number of sequences in the database
            # using COUNT(key) is quite slow in SQLITE
            # (https://stackoverflow.com/questions/8988915/sqlite-count-slow-on-big-tables)
            (max_rowid,) = con.execute(
                "SELECT MAX(_ROWID_) FROM offset_data;"
            ).fetchone()
            # MAX() gives NULL (None) for an empty table, which is what a
            # valid index of files holding no records looks like.
            stored = 0 if max_rowid is None else int(max_rowid)
            if self._length != stored:
                con.close()
                raise ValueError(
                    "Corrupt database? %i entries not %i" % (stored, self._length)
                ) from None

            (self._format,) = con.execute(
                "SELECT value FROM meta_data WHERE key=?;", ("format",)
            ).fetchone()
            if fmt and fmt != self._format:
                con.close()
                raise ValueError(
                    f"Index file says format {self._format}, not {fmt}"
                ) from None

            flag_row = con.execute(
                "SELECT value FROM meta_data WHERE key=?;",
                ("filenames_relative_to_index",),
            ).fetchone()
            # Original behaviour, assume if meta_data missing
            filenames_relative_to_index = (
                flag_row is not None and flag_row[0].upper() == "TRUE"
            )

            self._filenames = [
                row[0]
                for row in con.execute(
                    "SELECT name FROM file_data ORDER BY file_number;"
                ).fetchall()
            ]
            if filenames_relative_to_index:
                # Not implicitly relative to $PWD, explicitly relative to index file
                relative_path = self._relative_path
                # Relative names are stored with the Unix / path separator,
                # so convert them to the local OS path separator here.
                self._filenames = [
                    f
                    if os.path.isabs(f)
                    else os.path.join(relative_path, f.replace("/", os.path.sep))
                    for f in self._filenames
                ]

            if filenames and len(filenames) != len(self._filenames):
                con.close()
                raise ValueError(
                    "Index file says %i files, not %i"
                    % (len(self._filenames), len(filenames))
                ) from None
            if filenames and filenames != self._filenames:
                for old, new in zip(self._filenames, filenames):
                    # Want exact match (after making relative to the index above)
                    old_abs = os.path.abspath(old)
                    new_abs = os.path.abspath(new)
                    if old_abs == new_abs:
                        continue
                    con.close()
                    if filenames_relative_to_index:
                        raise ValueError(
                            "Index file has different filenames, e.g. %r != %r"
                            % (old_abs, new_abs)
                        ) from None
                    raise ValueError(
                        "Index file has different filenames "
                        "[This is an old index where any relative paths "
                        "were relative to the original working directory]. "
                        "e.g. %r != %r" % (old_abs, new_abs)
                    ) from None
                # Filenames are equal (after imposing abspath)
        except sqlite3.OperationalError as err:
            con.close()
            raise ValueError(f"Not a Biopython index database? {err}") from None
        # Now we have the format (from the DB if not given to us),
        if not proxy_factory(self._format):
            con.close()
            raise ValueError(f"Unsupported format '{self._format}'")

    def _build_index(self):
        """Call from __init__ to create a new index (PRIVATE).

        Creates the tables, records each file name and then the key,
        file number, offset and length of every record. The record count
        is stored as -1 until indexing completes, so an interrupted build
        can be recognised when the index is next loaded.

        Raises ValueError if the format or filenames are missing, the
        format is unsupported, or a duplicate key is found.
        """
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        fmt = self._format
        key_function = self._key_function
        proxy_factory = self._proxy_factory
        max_open = self._max_open
        random_access_proxies = self._proxies

        if not fmt or not filenames:
            raise ValueError(
                f"Filenames to index and format required to build {index_filename!r}"
            )
        if not proxy_factory(fmt):
            raise ValueError(f"Unsupported format '{fmt}'")
        # Create the index
        con = sqlite3.dbapi2.connect(index_filename)
        self._con = con
        # Sqlite PRAGMA settings for speed
        con.execute("PRAGMA synchronous=OFF")
        con.execute("PRAGMA locking_mode=EXCLUSIVE")
        # Don't index the key column until the end (faster)
        con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("count", -1))
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("format", fmt))
        con.execute(
            "INSERT INTO meta_data (key, value) VALUES (?,?);",
            ("filenames_relative_to_index", "True"),
        )
        # TODO - Record the file size and modified date?
        con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);")
        con.execute(
            "CREATE TABLE offset_data (key TEXT, "
            "file_number INTEGER, offset INTEGER, length INTEGER);"
        )
        count = 0
        for file_index, filename in enumerate(filenames):
            # Default to storing as an absolute path,
            f = os.path.abspath(filename)
            if not os.path.isabs(filename) and not os.path.isabs(index_filename):
                # Since user gave BOTH filename & index as relative paths,
                # we will store this relative to the index file even though
                # if it may now start ../ (meaning up a level)
                # Note for cross platform use (e.g. shared drive over SAMBA),
                # convert any Windows slash into Unix style for rel paths.
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
            elif (os.path.dirname(os.path.abspath(filename)) + os.path.sep).startswith(
                relative_path + os.path.sep
            ):
                # Since sequence file is in same directory or sub directory,
                # might as well make this into a relative path:
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
                assert not f.startswith("../"), f
            con.execute(
                "INSERT INTO file_data (file_number, name) VALUES (?,?);",
                (file_index, f),
            )
            random_access_proxy = proxy_factory(fmt, filename)
            if key_function:
                offset_iter = (
                    (key_function(key), file_index, offset, length)
                    for (key, offset, length) in random_access_proxy
                )
            else:
                offset_iter = (
                    (key, file_index, offset, length)
                    for (key, offset, length) in random_access_proxy
                )
            # Insert (and commit) in batches of 100 records
            while True:
                batch = list(itertools.islice(offset_iter, 100))
                if not batch:
                    break
                con.executemany(
                    "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
                    batch,
                )
                con.commit()
                count += len(batch)
            # Keep the proxy open for later lookups if the pool has room
            if len(random_access_proxies) < max_open:
                random_access_proxies[file_index] = random_access_proxy
            else:
                random_access_proxy._handle.close()
        self._length = count
        try:
            # Building the unique index is where duplicate keys are detected
            con.execute(
                "CREATE UNIQUE INDEX IF NOT EXISTS key_index ON offset_data(key);"
            )
        except sqlite3.IntegrityError as err:
            self._proxies = random_access_proxies
            self.close()
            con.close()
            raise ValueError(f"Duplicate key? {err}") from None
        con.execute("PRAGMA locking_mode=NORMAL")
        con.execute("UPDATE meta_data SET value = ? WHERE key = ?;", (count, "count"))
        con.commit()

    def _get_proxy(self, file_number):
        """Return an open proxy for this file number, opening one if needed (PRIVATE)."""
        proxies = self._proxies
        try:
            return proxies[file_number]
        except KeyError:
            pass
        if proxies and len(proxies) >= self._max_open:
            # Close an old handle...
            proxies.popitem()[1]._handle.close()
        # Open a new handle...
        proxy = self._proxy_factory(self._format, self._filenames[file_number])
        # Pool it straight away so close() will find it even if the
        # caller's read from it fails.
        proxies[file_number] = proxy
        return proxy

    def __repr__(self):
        """Return the string given to __init__ as repr."""
        return self._repr

    def __contains__(self, key):
        """Return True if the key is in the index."""
        return bool(
            self._con.execute(
                "SELECT key FROM offset_data WHERE key=?;", (key,)
            ).fetchone()
        )

    def __len__(self):
        """Return the number of records indexed."""
        return self._length

    def __iter__(self):
        """Iterate over the keys."""
        for row in self._con.execute(
            "SELECT key FROM offset_data ORDER BY file_number, offset;"
        ):
            yield str(row[0])

    def __getitem__(self, key):
        """Return record for the specified key.

        Raises KeyError if the key is not in the index, and ValueError if
        the record parsed from the file does not give the requested key.
        """
        row = self._con.execute(
            "SELECT file_number, offset FROM offset_data WHERE key=?;", (key,)
        ).fetchone()
        if not row:
            raise KeyError(key)
        file_number, offset = row
        # Pass the offset to the proxy
        record = self._get_proxy(file_number).get(offset)
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError(f"Key did not match ({key} vs {key2})")
        return record

    def get_raw(self, key):
        """Return the raw record from the file as a bytes string.

        If the key is not found, a KeyError exception is raised.
        """
        row = self._con.execute(
            "SELECT file_number, offset, length FROM offset_data WHERE key=?;", (key,)
        ).fetchone()
        if not row:
            raise KeyError(key)
        file_number, offset, length = row
        proxy = self._get_proxy(file_number)
        if length:
            # Shortcut if we have the length
            h = proxy._handle
            h.seek(offset)
            return h.read(length)
        # Otherwise pass the offset to the proxy
        return proxy.get_raw(offset)

    def close(self):
        """Close any open file handles."""
        proxies = self._proxies
        while proxies:
            proxies.popitem()[1]._handle.close()