diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/File.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author: jpayne
date: Tue, 18 Mar 2025 17:55:14 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/File.py	Tue Mar 18 17:55:14 2025 -0400
@@ -0,0 +1,626 @@
# Copyright 1999 by Jeffrey Chang. All rights reserved.
# Copyright 2009-2018 by Peter Cock. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for more fancy file handles.

Bio.File defines private classes used in Bio.SeqIO and Bio.SearchIO for
indexing files. These are not intended for direct use.
"""

import os
import contextlib
import itertools
import collections.abc

from abc import ABC, abstractmethod

try:
    import sqlite3
except ImportError:
    # May be missing if Python was compiled from source without its dependencies
    sqlite3 = None  # type: ignore


@contextlib.contextmanager
def as_handle(handleish, mode="r", **kwargs):
    r"""Context manager to ensure we are using a handle.

    Context manager for arguments that can be passed to SeqIO and AlignIO read,
    write, and parse methods: either file objects or path-like objects (strings,
    pathlib.Path instances, or more generally, anything that can be handled by
    the builtin 'open' function).

    When given a path-like object, returns an open file handle to that path,
    with provided mode, which will be closed when the manager exits.

    All other inputs are returned, and are *not* closed.

    Arguments:
     - handleish - Either a file handle or path-like object (anything which can
       be passed to the builtin 'open' function, such as str, bytes,
       pathlib.Path, and os.DirEntry objects)
     - mode - Mode to open handleish (used only if handleish is a string)
     - kwargs - Further arguments to pass to open(...)

    Examples
    --------
    >>> from Bio import File
    >>> import os
    >>> with File.as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    True

    >>> handle = open('seqs.fasta', 'w')
    >>> with File.as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    False
    >>> fp.close()
    >>> os.remove("seqs.fasta")  # tidy up

    """
    try:
        with open(handleish, mode, **kwargs) as fp:
            yield fp
    except TypeError:
        yield handleish


def _open_for_random_access(filename):
    """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE).

    This functionality is used by the Bio.SeqIO and Bio.SearchIO index
    and index_db functions.

    If the file is gzipped but not BGZF, a specific ValueError is raised.
    """
    handle = open(filename, "rb")
    magic = handle.read(2)
    handle.seek(0)

    if magic == b"\x1f\x8b":
        # This is a gzipped file, but is it BGZF?
        from . import bgzf

        try:
            # If it is BGZF, we support that
            return bgzf.BgzfReader(mode="rb", fileobj=handle)
        except ValueError as e:
            assert "BGZF" in str(e)
            # Not a BGZF file after all,
            handle.close()
            raise ValueError(
                "Gzipped files are not suitable for indexing, "
                "please use BGZF (blocked gzip format) instead."
            ) from None

    return handle


# The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO
# for indexing
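# Illustrative sketch (not part of the original module): a minimal, hedged
# demonstration of how _open_for_random_access treats its three kinds of
# input, assuming Biopython's Bio.bgzf module is available; the filenames
# are hypothetical:
#
#     import gzip
#     from Bio import bgzf
#
#     with open("plain.fasta", "w") as out:            # uncompressed
#         out.write(">rec1\nACGT\n")
#     with gzip.open("plain.fasta.gz", "wt") as out:   # ordinary gzip
#         out.write(">rec1\nACGT\n")
#     writer = bgzf.BgzfWriter("plain.fasta.bgz")      # blocked gzip (BGZF)
#     writer.write(">rec1\nACGT\n")
#     writer.close()
#
#     _open_for_random_access("plain.fasta")      # plain binary handle
#     _open_for_random_access("plain.fasta.bgz")  # bgzf.BgzfReader
#     _open_for_random_access("plain.fasta.gz")   # ValueError: gzip, not BGZF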
class _IndexedSeqFileProxy(ABC):
    """Abstract base class for file format specific random access (PRIVATE).

    This is subclassed in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Subclasses for each file format should define '__iter__', 'get'
    and optionally 'get_raw' methods.
    """

    @abstractmethod
    def __iter__(self):
        """Return (identifier, offset, length in bytes) tuples.

        The length can be zero where it is not implemented or not
        possible for a particular file format.
        """
        raise NotImplementedError

    @abstractmethod
    def get(self, offset):
        """Return parsed object for this entry."""
        # Most file formats with self contained records can be handled by
        # parsing StringIO(self.get_raw(offset).decode())
        raise NotImplementedError

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string (if implemented).

        If the key is not found, a KeyError exception is raised.

        This may not have been implemented for all file formats.
        """
        # Should be done by each sub-class (if possible)
        raise NotImplementedError("Not available for this file format.")
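# Illustrative sketch (not part of the original module): the smallest useful
# _IndexedSeqFileProxy subclass, for a FASTA-like format where every record
# starts with a ">" header line. Real implementations live in Bio.SeqIO's
# private index code; this hypothetical toy version just shows the contract:
#
#     from io import StringIO
#     from Bio import SeqIO
#
#     class ToyFastaProxy(_IndexedSeqFileProxy):
#         def __init__(self, filename):
#             self._handle = open(filename, "rb")
#
#         def __iter__(self):
#             # Yield (key, offset, length) for each ">" delimited record
#             handle = self._handle
#             handle.seek(0)
#             key, start = None, 0
#             while True:
#                 offset = handle.tell()
#                 line = handle.readline()
#                 if line.startswith(b">") or not line:
#                     if key is not None:
#                         yield key, start, offset - start
#                     if not line:
#                         break
#                     key = line[1:].split(None, 1)[0].decode()
#                     start = offset
#
#         def get(self, offset):
#             # Parse the raw bytes back into a SeqRecord
#             return SeqIO.read(StringIO(self.get_raw(offset).decode()), "fasta")
#
#         def get_raw(self, offset):
#             handle = self._handle
#             handle.seek(offset)
#             lines = [handle.readline()]
#             while True:
#                 line = handle.readline()
#                 if line.startswith(b">") or not line:
#                     return b"".join(lines)
#                 lines.append(line)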
class _IndexedSeqFileDict(collections.abc.Mapping):
    """Read only dictionary interface to a sequential record file.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys and associated file offsets in memory, and reads the
    file to access entries as objects, parsing them on demand. This
    approach is memory limited, but will work even with millions of
    records.

    Note duplicate keys are not allowed. If this happens, a ValueError
    exception is raised.

    As used in Bio.SeqIO, by default the SeqRecord's id string is used
    as the dictionary key. In Bio.SearchIO, the query's id string is
    used. This can be changed by supplying an optional key_function,
    a callback function which will be given the record id and must
    return the desired key. For example, this allows you to parse
    NCBI style FASTA identifiers, and extract the GI number to use
    as the dictionary key.

    Note that this dictionary is essentially read only. You cannot
    add or change values, pop values, nor clear the dictionary.
    """

    def __init__(self, random_access_proxy, key_function, repr, obj_repr):
        """Initialize the class."""
        # Use key_function=None for default value
        self._proxy = random_access_proxy
        self._key_function = key_function
        self._repr = repr
        self._obj_repr = obj_repr
        self._cached_prev_record = (None, None)  # (key, record)
        if key_function:
            offset_iter = (
                (key_function(key), offset, length)
                for (key, offset, length) in random_access_proxy
            )
        else:
            offset_iter = random_access_proxy
        offsets = {}
        for key, offset, length in offset_iter:
            # Note - we don't store the length because I want to minimise the
            # memory requirements. With the SQLite backend the length is kept
            # and is used to speed up the get_raw method (by about 3 times).
            # The length should be provided by all the current backends except
            # SFF where there is an existing Roche index we can reuse (very fast
            # but lacks the record lengths)
            # assert length or format in ["sff", "sff-trim"], \
            #     "%s at offset %i given length %r (%s format %s)" \
            #     % (key, offset, length, filename, format)
            if key in offsets:
                self._proxy._handle.close()
                raise ValueError(f"Duplicate key '{key}'")
            else:
                offsets[key] = offset
        self._offsets = offsets

    def __repr__(self):
        """Return a string representation of the File object."""
        return self._repr

    def __str__(self):
        """Create a string representation of the File object."""
        # TODO - How best to handle the __str__ for SeqIO and SearchIO?
        if self:
            return f"{{{list(self.keys())[0]!r} : {self._obj_repr}(...), ...}}"
        else:
            return "{}"

    def __len__(self):
        """Return the number of records."""
        return len(self._offsets)

    def __iter__(self):
        """Iterate over the keys."""
        return iter(self._offsets)

    def __getitem__(self, key):
        """Return record for the specified key.

        As an optimization when repeatedly asked to look up the same record,
        the key and record are cached so that if the *same* record is
        requested next time, it can be returned without going to disk.
        """
        if key == self._cached_prev_record[0]:
            return self._cached_prev_record[1]
        # Pass the offset to the proxy
        record = self._proxy.get(self._offsets[key])
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError(f"Key did not match ({key} vs {key2})")
        self._cached_prev_record = (key, record)
        return record

    def get_raw(self, key):
        """Return the raw record from the file as a bytes string.

        If the key is not found, a KeyError exception is raised.
        """
        # Pass the offset to the proxy
        return self._proxy.get_raw(self._offsets[key])

    def close(self):
        """Close the file handle being used to read the data.

        Once called, further use of the index won't work. The sole purpose
        of this method is to allow explicit handle closure - for example
        if you wish to delete the file, on Windows you must first close
        all open handles to that file.
        """
        self._proxy._handle.close()
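# Illustrative note (not part of the original module): _IndexedSeqFileDict is
# what the public Bio.SeqIO.index() function returns, so its behaviour is
# easiest to see through that API. A hedged sketch, assuming "example.fasta"
# exists and contains a record with id "rec1":
#
#     from Bio import SeqIO
#     records = SeqIO.index("example.fasta", "fasta")
#     len(records)                   # keys and offsets held in memory
#     record = records["rec1"]       # parsed from disk on demand
#     raw = records.get_raw("rec1")  # bytes, via the proxy's get_raw
#     records.close()                # release the underlying handle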
class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
    """Read only dictionary interface to many sequential record files.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys, file-numbers and offsets in an SQLite database. To access
    a record by key, reads from the offset in the appropriate file and then
    parses the record into an object.

    There are OS limits on the number of files that can be open at once,
    so a pool is kept. If a record is required from a closed file, then
    one of the open handles is closed first.
    """

    def __init__(
        self,
        index_filename,
        filenames,
        proxy_factory,
        fmt,
        key_function,
        repr,
        max_open=10,
    ):
        """Initialize the class."""
        # TODO? - Don't keep filename list in memory (just in DB)?
        # Should save a chunk of memory if dealing with 1000s of files.
        # Furthermore could compare a generator to the DB on reloading
        # (no need to turn it into a list)

        if sqlite3 is None:
            # Python was compiled without sqlite3 support
            from Bio import MissingPythonDependencyError

            raise MissingPythonDependencyError(
                "Python was compiled without the sqlite3 module"
            )
        if filenames is not None:
            filenames = list(filenames)  # In case it was a generator

        # Cache the arguments as private variables
        self._index_filename = index_filename
        self._filenames = filenames
        self._format = fmt
        self._key_function = key_function
        self._proxy_factory = proxy_factory
        self._repr = repr
        self._max_open = max_open
        self._proxies = {}

        # Note if using the SQLite :memory: trick as the index filename, this
        # will give $PWD as the relative path (which is fine).
        self._relative_path = os.path.abspath(os.path.dirname(index_filename))

        if os.path.isfile(index_filename):
            self._load_index()
        else:
            self._build_index()
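    # Illustrative note (not part of the original module): the ":memory:"
    # trick mentioned above is reachable from the public API - passing
    # ":memory:" as the index filename builds the SQLite index in RAM for
    # the current session only:
    #
    #     from Bio import SeqIO
    #     d = SeqIO.index_db(":memory:", ["a.fasta", "b.fasta"], "fasta")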
    def _load_index(self):
        """Call from __init__ to re-use an existing index (PRIVATE)."""
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        fmt = self._format
        proxy_factory = self._proxy_factory

        con = sqlite3.dbapi2.connect(index_filename, check_same_thread=False)
        self._con = con
        # Check the count...
        try:
            (count,) = con.execute(
                "SELECT value FROM meta_data WHERE key=?;", ("count",)
            ).fetchone()
            self._length = int(count)
            if self._length == -1:
                con.close()
                raise ValueError("Unfinished/partial database") from None

            # use MAX(_ROWID_) to obtain the number of sequences in the database
            # using COUNT(key) is quite slow in SQLITE
            # (https://stackoverflow.com/questions/8988915/sqlite-count-slow-on-big-tables)
            (count,) = con.execute("SELECT MAX(_ROWID_) FROM offset_data;").fetchone()
            if self._length != int(count):
                con.close()
                raise ValueError(
                    "Corrupt database? %i entries not %i" % (int(count), self._length)
                ) from None
            (self._format,) = con.execute(
                "SELECT value FROM meta_data WHERE key=?;", ("format",)
            ).fetchone()
            if fmt and fmt != self._format:
                con.close()
                raise ValueError(
                    f"Index file says format {self._format}, not {fmt}"
                ) from None
            try:
                (filenames_relative_to_index,) = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("filenames_relative_to_index",),
                ).fetchone()
                filenames_relative_to_index = (
                    filenames_relative_to_index.upper() == "TRUE"
                )
            except TypeError:
                # Original behaviour, assume if meta_data missing
                filenames_relative_to_index = False
            self._filenames = [
                row[0]
                for row in con.execute(
                    "SELECT name FROM file_data ORDER BY file_number;"
                ).fetchall()
            ]
            if filenames_relative_to_index:
                # Not implicitly relative to $PWD, explicitly relative to index file
                relative_path = os.path.abspath(os.path.dirname(index_filename))
                tmp = []
                for f in self._filenames:
                    if os.path.isabs(f):
                        tmp.append(f)
                    else:
                        # Would be stored with Unix / path separator, so convert
                        # it to the local OS path separator here:
                        tmp.append(
                            os.path.join(relative_path, f.replace("/", os.path.sep))
                        )
                self._filenames = tmp
                del tmp
            if filenames and len(filenames) != len(self._filenames):
                con.close()
                raise ValueError(
                    "Index file says %i files, not %i"
                    % (len(self._filenames), len(filenames))
                ) from None
            if filenames and filenames != self._filenames:
                for old, new in zip(self._filenames, filenames):
                    # Want exact match (after making relative to the index above)
                    if os.path.abspath(old) != os.path.abspath(new):
                        con.close()
                        if filenames_relative_to_index:
                            raise ValueError(
                                "Index file has different filenames, e.g. %r != %r"
                                % (os.path.abspath(old), os.path.abspath(new))
                            ) from None
                        else:
                            raise ValueError(
                                "Index file has different filenames "
                                "[This is an old index where any relative paths "
                                "were relative to the original working directory]. "
                                "e.g. %r != %r"
                                % (os.path.abspath(old), os.path.abspath(new))
                            ) from None
                # Filenames are equal (after imposing abspath)
        except sqlite3.OperationalError as err:
            con.close()
            raise ValueError(f"Not a Biopython index database? {err}") from None
        # Now we have the format (from the DB if not given to us),
        if not proxy_factory(self._format):
            con.close()
            raise ValueError(f"Unsupported format '{self._format}'")
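    # Illustrative note (not part of the original module): since the index is
    # a plain SQLite database, a suspect index file can be inspected directly
    # with the sqlite3 module (index filename hypothetical):
    #
    #     import sqlite3
    #     con = sqlite3.connect("records.idx")
    #     con.execute("SELECT key, value FROM meta_data;").fetchall()
    #     # e.g. [('count', '1234'), ('format', 'fasta'),
    #     #       ('filenames_relative_to_index', 'True')]
    #     con.execute("SELECT file_number, name FROM file_data;").fetchall()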
    def _build_index(self):
        """Call from __init__ to create a new index (PRIVATE)."""
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        fmt = self._format
        key_function = self._key_function
        proxy_factory = self._proxy_factory
        max_open = self._max_open
        random_access_proxies = self._proxies

        if not fmt or not filenames:
            raise ValueError(
                f"Filenames to index and format required to build {index_filename!r}"
            )
        if not proxy_factory(fmt):
            raise ValueError(f"Unsupported format '{fmt}'")
        # Create the index
        con = sqlite3.dbapi2.connect(index_filename)
        self._con = con
        # print("Creating index")
        # Sqlite PRAGMA settings for speed
        con.execute("PRAGMA synchronous=OFF")
        con.execute("PRAGMA locking_mode=EXCLUSIVE")
        # Don't index the key column until the end (faster)
        # con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
        #             "offset INTEGER);")
        con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("count", -1))
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("format", fmt))
        con.execute(
            "INSERT INTO meta_data (key, value) VALUES (?,?);",
            ("filenames_relative_to_index", "True"),
        )
        # TODO - Record the file size and modified date?
        con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);")
        con.execute(
            "CREATE TABLE offset_data (key TEXT, "
            "file_number INTEGER, offset INTEGER, length INTEGER);"
        )
        count = 0
        for file_index, filename in enumerate(filenames):
            # Default to storing as an absolute path,
            f = os.path.abspath(filename)
            if not os.path.isabs(filename) and not os.path.isabs(index_filename):
                # Since user gave BOTH filename & index as relative paths,
                # we will store this relative to the index file, even though
                # it may now start with ../ (meaning up a level).
                # Note for cross platform use (e.g. shared drive over SAMBA),
                # convert any Windows slash into Unix style for rel paths.
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
            elif (os.path.dirname(os.path.abspath(filename)) + os.path.sep).startswith(
                relative_path + os.path.sep
            ):
                # Since sequence file is in same directory or sub directory,
                # might as well make this into a relative path:
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
                assert not f.startswith("../"), f
            # print("DEBUG - storing %r as [%r] %r" % (filename, relative_path, f))
            con.execute(
                "INSERT INTO file_data (file_number, name) VALUES (?,?);",
                (file_index, f),
            )
            random_access_proxy = proxy_factory(fmt, filename)
            if key_function:
                offset_iter = (
                    (key_function(key), file_index, offset, length)
                    for (key, offset, length) in random_access_proxy
                )
            else:
                offset_iter = (
                    (key, file_index, offset, length)
                    for (key, offset, length) in random_access_proxy
                )
            while True:
                batch = list(itertools.islice(offset_iter, 100))
                if not batch:
                    break
                # print("Inserting batch of %i offsets, %s ... %s"
                #       % (len(batch), batch[0][0], batch[-1][0]))
                con.executemany(
                    "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
                    batch,
                )
                con.commit()
                count += len(batch)
            if len(random_access_proxies) < max_open:
                random_access_proxies[file_index] = random_access_proxy
            else:
                random_access_proxy._handle.close()
        self._length = count
        # print("About to index %i entries" % count)
        try:
            con.execute(
                "CREATE UNIQUE INDEX IF NOT EXISTS key_index ON offset_data(key);"
            )
        except sqlite3.IntegrityError as err:
            self._proxies = random_access_proxies
            self.close()
            con.close()
            raise ValueError(f"Duplicate key? {err}") from None
        con.execute("PRAGMA locking_mode=NORMAL")
        con.execute("UPDATE meta_data SET value = ? WHERE key = ?;", (count, "count"))
        con.commit()
        # print("Index created")
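    # Illustrative note (not part of the original module): _build_index uses a
    # common SQLite bulk-load pattern - batched executemany() inserts with the
    # UNIQUE index created only at the end, which is faster and doubles as the
    # duplicate-key check. The same pattern in miniature:
    #
    #     con.executemany("INSERT INTO t (k) VALUES (?);", rows)  # bulk load
    #     con.execute("CREATE UNIQUE INDEX idx ON t(k);")  # fails on duplicates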
    def __repr__(self):
        return self._repr

    def __contains__(self, key):
        return bool(
            self._con.execute(
                "SELECT key FROM offset_data WHERE key=?;", (key,)
            ).fetchone()
        )

    def __len__(self):
        """Return the number of records indexed."""
        return self._length
        # return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0]

    def __iter__(self):
        """Iterate over the keys."""
        for row in self._con.execute(
            "SELECT key FROM offset_data ORDER BY file_number, offset;"
        ):
            yield str(row[0])

    def __getitem__(self, key):
        """Return record for the specified key."""
        # Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset FROM offset_data WHERE key=?;", (key,)
        ).fetchone()
        if not row:
            raise KeyError
        file_number, offset = row
        proxies = self._proxies
        if file_number in proxies:
            record = proxies[file_number].get(offset)
        else:
            if len(proxies) >= self._max_open:
                # Close an old handle...
                proxies.popitem()[1]._handle.close()
            # Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            record = proxy.get(offset)
            proxies[file_number] = proxy
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError(f"Key did not match ({key} vs {key2})")
        return record

    def get_raw(self, key):
        """Return the raw record from the file as a bytes string.

        If the key is not found, a KeyError exception is raised.
        """
        # Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset, length FROM offset_data WHERE key=?;", (key,)
        ).fetchone()
        if not row:
            raise KeyError
        file_number, offset, length = row
        proxies = self._proxies
        if file_number in proxies:
            if length:
                # Shortcut if we have the length
                h = proxies[file_number]._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxies[file_number].get_raw(offset)
        else:
            # This code is duplicated from __getitem__ to avoid a function call
            if len(proxies) >= self._max_open:
                # Close an old handle...
                proxies.popitem()[1]._handle.close()
            # Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            proxies[file_number] = proxy
            if length:
                # Shortcut if we have the length
                h = proxy._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxy.get_raw(offset)

    def close(self):
        """Close any open file handles."""
        proxies = self._proxies
        while proxies:
            proxies.popitem()[1]._handle.close()
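# Illustrative note (not part of the original module): an end-to-end sketch of
# the public entry point that drives _SQLiteManySeqFilesDict, assuming two
# FASTA files in the working directory and a record with id "rec42":
#
#     from Bio import SeqIO
#     d = SeqIO.index_db("records.idx", ["a.fasta", "b.fasta"], "fasta")
#     d["rec42"]           # opens (at most max_open) handles on demand
#     d.get_raw("rec42")   # fast path: seek + read using the stored length
#     d.close()            # close pooled handles; records.idx remains reusable
#     d = SeqIO.index_db("records.idx")  # later: reload without re-indexing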