jpayne@69
|
1 # Copyright 1999 by Jeffrey Chang. All rights reserved.
|
jpayne@69
|
2 # Copyright 2009-2018 by Peter Cock. All rights reserved.
|
jpayne@69
|
3 #
|
jpayne@69
|
4 # This file is part of the Biopython distribution and governed by your
|
jpayne@69
|
5 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
jpayne@69
|
6 # Please see the LICENSE file that should have been included as part of this
|
jpayne@69
|
7 # package.
|
jpayne@69
|
8 """Code for more fancy file handles.
|
jpayne@69
|
9
|
jpayne@69
|
10 Bio.File defines private classes used in Bio.SeqIO and Bio.SearchIO for
|
jpayne@69
|
11 indexing files. These are not intended for direct use.
|
jpayne@69
|
12 """
|
jpayne@69
|
13
|
jpayne@69
|
14 import os
|
jpayne@69
|
15 import contextlib
|
jpayne@69
|
16 import itertools
|
jpayne@69
|
17 import collections.abc
|
jpayne@69
|
18
|
jpayne@69
|
19 from abc import ABC, abstractmethod
|
jpayne@69
|
20
|
jpayne@69
|
21 try:
|
jpayne@69
|
22 import sqlite3
|
jpayne@69
|
23 except ImportError:
|
jpayne@69
|
24 # May be missing if Python was compiled from source without its dependencies
|
jpayne@69
|
25 sqlite3 = None # type: ignore
|
jpayne@69
|
26
|
jpayne@69
|
27
|
jpayne@69
|
@contextlib.contextmanager
def as_handle(handleish, mode="r", **kwargs):
    r"""Context manager to ensure we are using a handle.

    Context manager for arguments that can be passed to SeqIO and AlignIO read, write,
    and parse methods: either file objects or path-like objects (strings, pathlib.Path
    instances, or more generally, anything that can be handled by the builtin 'open'
    function).

    When given a path-like object, returns an open file handle to that path, with provided
    mode, which will be closed when the manager exits.

    All other inputs are returned, and are *not* closed.

    Arguments:
     - handleish - Either a file handle or path-like object (anything which can be
       passed to the builtin 'open' function, such as str, bytes,
       pathlib.Path, and os.DirEntry objects)
     - mode - Mode to open handleish (used only if handleish is a string)
     - kwargs - Further arguments to pass to open(...)

    Examples
    --------
    >>> from Bio import File
    >>> import os
    >>> with File.as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    True

    >>> handle = open('seqs.fasta', 'w')
    >>> with File.as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    False
    >>> fp.close()
    >>> os.remove("seqs.fasta")  # tidy up

    """
    try:
        fp = open(handleish, mode, **kwargs)
    except TypeError:
        # open() rejected handleish, so assume it is already a file-like
        # object and hand it back unchanged (and do not close it on exit).
        yield handleish
    else:
        # We opened the handle ourselves, so we must close it on exit.
        # Note only the open() call is inside the try/except above: the
        # previous implementation wrapped the whole "with open(...)" block,
        # so a TypeError raised by the caller's code inside the
        # "with as_handle(...)" body was caught here, causing a second
        # yield and a confusing RuntimeError from contextlib. Now any
        # exception raised at the yield propagates to the caller intact.
        try:
            yield fp
        finally:
            fp.close()
|
jpayne@69
|
76
|
jpayne@69
|
77
|
jpayne@69
|
78 def _open_for_random_access(filename):
|
jpayne@69
|
79 """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE).
|
jpayne@69
|
80
|
jpayne@69
|
81 This functionality is used by the Bio.SeqIO and Bio.SearchIO index
|
jpayne@69
|
82 and index_db functions.
|
jpayne@69
|
83
|
jpayne@69
|
84 If the file is gzipped but not BGZF, a specific ValueError is raised.
|
jpayne@69
|
85 """
|
jpayne@69
|
86 handle = open(filename, "rb")
|
jpayne@69
|
87 magic = handle.read(2)
|
jpayne@69
|
88 handle.seek(0)
|
jpayne@69
|
89
|
jpayne@69
|
90 if magic == b"\x1f\x8b":
|
jpayne@69
|
91 # This is a gzipped file, but is it BGZF?
|
jpayne@69
|
92 from . import bgzf
|
jpayne@69
|
93
|
jpayne@69
|
94 try:
|
jpayne@69
|
95 # If it is BGZF, we support that
|
jpayne@69
|
96 return bgzf.BgzfReader(mode="rb", fileobj=handle)
|
jpayne@69
|
97 except ValueError as e:
|
jpayne@69
|
98 assert "BGZF" in str(e)
|
jpayne@69
|
99 # Not a BGZF file after all,
|
jpayne@69
|
100 handle.close()
|
jpayne@69
|
101 raise ValueError(
|
jpayne@69
|
102 "Gzipped files are not suitable for indexing, "
|
jpayne@69
|
103 "please use BGZF (blocked gzip format) instead."
|
jpayne@69
|
104 ) from None
|
jpayne@69
|
105
|
jpayne@69
|
106 return handle
|
jpayne@69
|
107
|
jpayne@69
|
108
|
jpayne@69
|
109 # The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO
|
jpayne@69
|
110 # for indexing
|
jpayne@69
|
111
|
jpayne@69
|
112
|
jpayne@69
|
class _IndexedSeqFileProxy(ABC):
    """Abstract base class for file format specific random access (PRIVATE).

    This is subclassed in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Subclasses for each file format should define '__iter__', 'get'
    and optionally 'get_raw' methods.
    """

    # NOTE(review): concrete subclasses are also expected to expose a
    # ``_handle`` attribute (an open binary file handle) - the index
    # dictionaries below close it via ``self._proxy._handle.close()``.

    @abstractmethod
    def __iter__(self):
        """Return (identifier, offset, length in bytes) tuples.

        The length can be zero where it is not implemented or not
        possible for a particular file format.
        """
        raise NotImplementedError

    @abstractmethod
    def get(self, offset):
        """Return parsed object for this entry."""
        # Most file formats with self contained records can be handled by
        # parsing StringIO(self.get_raw(offset).decode())
        raise NotImplementedError

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string (if implemented).

        If the key is not found, a KeyError exception is raised.

        This may not have been implemented for all file formats.
        """
        # Should be done by each sub-class (if possible)
        raise NotImplementedError("Not available for this file format.")
|
jpayne@69
|
148
|
jpayne@69
|
149
|
jpayne@69
|
150 class _IndexedSeqFileDict(collections.abc.Mapping):
|
jpayne@69
|
151 """Read only dictionary interface to a sequential record file.
|
jpayne@69
|
152
|
jpayne@69
|
153 This code is used in both Bio.SeqIO for indexing as SeqRecord
|
jpayne@69
|
154 objects, and in Bio.SearchIO for indexing QueryResult objects.
|
jpayne@69
|
155
|
jpayne@69
|
156 Keeps the keys and associated file offsets in memory, reads the file
|
jpayne@69
|
157 to access entries as objects parsing them on demand. This approach
|
jpayne@69
|
158 is memory limited, but will work even with millions of records.
|
jpayne@69
|
159
|
jpayne@69
|
160 Note duplicate keys are not allowed. If this happens, a ValueError
|
jpayne@69
|
161 exception is raised.
|
jpayne@69
|
162
|
jpayne@69
|
163 As used in Bio.SeqIO, by default the SeqRecord's id string is used
|
jpayne@69
|
164 as the dictionary key. In Bio.SearchIO, the query's id string is
|
jpayne@69
|
165 used. This can be changed by supplying an optional key_function,
|
jpayne@69
|
166 a callback function which will be given the record id and must
|
jpayne@69
|
167 return the desired key. For example, this allows you to parse
|
jpayne@69
|
168 NCBI style FASTA identifiers, and extract the GI number to use
|
jpayne@69
|
169 as the dictionary key.
|
jpayne@69
|
170
|
jpayne@69
|
171 Note that this dictionary is essentially read only. You cannot
|
jpayne@69
|
172 add or change values, pop values, nor clear the dictionary.
|
jpayne@69
|
173 """
|
jpayne@69
|
174
|
jpayne@69
|
175 def __init__(self, random_access_proxy, key_function, repr, obj_repr):
|
jpayne@69
|
176 """Initialize the class."""
|
jpayne@69
|
177 # Use key_function=None for default value
|
jpayne@69
|
178 self._proxy = random_access_proxy
|
jpayne@69
|
179 self._key_function = key_function
|
jpayne@69
|
180 self._repr = repr
|
jpayne@69
|
181 self._obj_repr = obj_repr
|
jpayne@69
|
182 self._cached_prev_record = (None, None) # (key, record)
|
jpayne@69
|
183 if key_function:
|
jpayne@69
|
184 offset_iter = (
|
jpayne@69
|
185 (key_function(key), offset, length)
|
jpayne@69
|
186 for (key, offset, length) in random_access_proxy
|
jpayne@69
|
187 )
|
jpayne@69
|
188 else:
|
jpayne@69
|
189 offset_iter = random_access_proxy
|
jpayne@69
|
190 offsets = {}
|
jpayne@69
|
191 for key, offset, length in offset_iter:
|
jpayne@69
|
192 # Note - we don't store the length because I want to minimise the
|
jpayne@69
|
193 # memory requirements. With the SQLite backend the length is kept
|
jpayne@69
|
194 # and is used to speed up the get_raw method (by about 3 times).
|
jpayne@69
|
195 # The length should be provided by all the current backends except
|
jpayne@69
|
196 # SFF where there is an existing Roche index we can reuse (very fast
|
jpayne@69
|
197 # but lacks the record lengths)
|
jpayne@69
|
198 # assert length or format in ["sff", "sff-trim"], \
|
jpayne@69
|
199 # "%s at offset %i given length %r (%s format %s)" \
|
jpayne@69
|
200 # % (key, offset, length, filename, format)
|
jpayne@69
|
201 if key in offsets:
|
jpayne@69
|
202 self._proxy._handle.close()
|
jpayne@69
|
203 raise ValueError(f"Duplicate key '{key}'")
|
jpayne@69
|
204 else:
|
jpayne@69
|
205 offsets[key] = offset
|
jpayne@69
|
206 self._offsets = offsets
|
jpayne@69
|
207
|
jpayne@69
|
208 def __repr__(self):
|
jpayne@69
|
209 """Return a string representation of the File object."""
|
jpayne@69
|
210 return self._repr
|
jpayne@69
|
211
|
jpayne@69
|
212 def __str__(self):
|
jpayne@69
|
213 """Create a string representation of the File object."""
|
jpayne@69
|
214 # TODO - How best to handle the __str__ for SeqIO and SearchIO?
|
jpayne@69
|
215 if self:
|
jpayne@69
|
216 return f"{{{list(self.keys())[0]!r} : {self._obj_repr}(...), ...}}"
|
jpayne@69
|
217 else:
|
jpayne@69
|
218 return "{}"
|
jpayne@69
|
219
|
jpayne@69
|
220 def __len__(self):
|
jpayne@69
|
221 """Return the number of records."""
|
jpayne@69
|
222 return len(self._offsets)
|
jpayne@69
|
223
|
jpayne@69
|
224 def __iter__(self):
|
jpayne@69
|
225 """Iterate over the keys."""
|
jpayne@69
|
226 return iter(self._offsets)
|
jpayne@69
|
227
|
jpayne@69
|
228 def __getitem__(self, key):
|
jpayne@69
|
229 """Return record for the specified key.
|
jpayne@69
|
230
|
jpayne@69
|
231 As an optimization when repeatedly asked to look up the same record,
|
jpayne@69
|
232 the key and record are cached so that if the *same* record is
|
jpayne@69
|
233 requested next time, it can be returned without going to disk.
|
jpayne@69
|
234 """
|
jpayne@69
|
235 if key == self._cached_prev_record[0]:
|
jpayne@69
|
236 return self._cached_prev_record[1]
|
jpayne@69
|
237 # Pass the offset to the proxy
|
jpayne@69
|
238 record = self._proxy.get(self._offsets[key])
|
jpayne@69
|
239 if self._key_function:
|
jpayne@69
|
240 key2 = self._key_function(record.id)
|
jpayne@69
|
241 else:
|
jpayne@69
|
242 key2 = record.id
|
jpayne@69
|
243 if key != key2:
|
jpayne@69
|
244 raise ValueError(f"Key did not match ({key} vs {key2})")
|
jpayne@69
|
245 self._cached_prev_record = (key, record)
|
jpayne@69
|
246 return record
|
jpayne@69
|
247
|
jpayne@69
|
248 def get_raw(self, key):
|
jpayne@69
|
249 """Return the raw record from the file as a bytes string.
|
jpayne@69
|
250
|
jpayne@69
|
251 If the key is not found, a KeyError exception is raised.
|
jpayne@69
|
252 """
|
jpayne@69
|
253 # Pass the offset to the proxy
|
jpayne@69
|
254 return self._proxy.get_raw(self._offsets[key])
|
jpayne@69
|
255
|
jpayne@69
|
256 def close(self):
|
jpayne@69
|
257 """Close the file handle being used to read the data.
|
jpayne@69
|
258
|
jpayne@69
|
259 Once called, further use of the index won't work. The sole purpose
|
jpayne@69
|
260 of this method is to allow explicit handle closure - for example
|
jpayne@69
|
261 if you wish to delete the file, on Windows you must first close
|
jpayne@69
|
262 all open handles to that file.
|
jpayne@69
|
263 """
|
jpayne@69
|
264 self._proxy._handle.close()
|
jpayne@69
|
265
|
jpayne@69
|
266
|
jpayne@69
|
class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
    """Read only dictionary interface to many sequential record files.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys, file-numbers and offsets in an SQLite database. To access
    a record by key, reads from the offset in the appropriate file and then
    parses the record into an object.

    There are OS limits on the number of files that can be open at once,
    so a pool are kept. If a record is required from a closed file, then
    one of the open handles is closed first.
    """

    def __init__(
        self,
        index_filename,
        filenames,
        proxy_factory,
        fmt,
        key_function,
        repr,
        max_open=10,
    ):
        """Initialize the class.

        Arguments:
         - index_filename - SQLite database file; loaded if it already
           exists, otherwise built from scratch.
         - filenames - iterable of sequence filenames to index (may be
           None when reloading an existing index).
         - proxy_factory - callable returning a format-specific
           _IndexedSeqFileProxy (called with format alone to test support,
           or with format and filename to open a file).
         - fmt - lower case format string, or None to read it from the DB.
         - key_function - optional callback mapping record id to dict key.
         - repr - string returned by __repr__.
         - max_open - maximum number of file handles kept open at once.
        """
        # TODO? - Don't keep filename list in memory (just in DB)?
        # Should save a chunk of memory if dealing with 1000s of files.
        # Furthermore could compare a generator to the DB on reloading
        # (no need to turn it into a list)

        if sqlite3 is None:
            # Python was compiled without sqlite3 support
            from Bio import MissingPythonDependencyError

            raise MissingPythonDependencyError(
                "Python was compiled without the sqlite3 module"
            )
        if filenames is not None:
            filenames = list(filenames)  # In case it was a generator

        # Cache the arguments as private variables
        self._index_filename = index_filename
        self._filenames = filenames
        self._format = fmt
        self._key_function = key_function
        self._proxy_factory = proxy_factory
        self._repr = repr
        self._max_open = max_open
        # Pool of open _IndexedSeqFileProxy objects, keyed by file number
        self._proxies = {}

        # Note if using SQLite :memory: trick index filename, this will
        # give $PWD as the relative path (which is fine).
        self._relative_path = os.path.abspath(os.path.dirname(index_filename))

        if os.path.isfile(index_filename):
            self._load_index()
        else:
            self._build_index()

    def _load_index(self):
        """Call from __init__ to re-use an existing index (PRIVATE)."""
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        fmt = self._format
        proxy_factory = self._proxy_factory

        con = sqlite3.dbapi2.connect(index_filename, check_same_thread=False)
        self._con = con
        # Check the count...
        try:
            (count,) = con.execute(
                "SELECT value FROM meta_data WHERE key=?;", ("count",)
            ).fetchone()
            self._length = int(count)
            if self._length == -1:
                # -1 is the sentinel written by _build_index before indexing
                # starts; it is replaced by the real count only on success.
                con.close()
                raise ValueError("Unfinished/partial database") from None

            # use MAX(_ROWID_) to obtain the number of sequences in the database
            # using COUNT(key) is quite slow in SQLITE
            # (https://stackoverflow.com/questions/8988915/sqlite-count-slow-on-big-tables)
            (count,) = con.execute("SELECT MAX(_ROWID_) FROM offset_data;").fetchone()
            if self._length != int(count):
                con.close()
                raise ValueError(
                    "Corrupt database? %i entries not %i" % (int(count), self._length)
                ) from None
            (self._format,) = con.execute(
                "SELECT value FROM meta_data WHERE key=?;", ("format",)
            ).fetchone()
            if fmt and fmt != self._format:
                con.close()
                raise ValueError(
                    f"Index file says format {self._format}, not {fmt}"
                ) from None
            try:
                (filenames_relative_to_index,) = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("filenames_relative_to_index",),
                ).fetchone()
                filenames_relative_to_index = (
                    filenames_relative_to_index.upper() == "TRUE"
                )
            except TypeError:
                # Original behaviour, assume if meta_data missing
                # (fetchone() returned None so the tuple-unpack raised)
                filenames_relative_to_index = False
            self._filenames = [
                row[0]
                for row in con.execute(
                    "SELECT name FROM file_data ORDER BY file_number;"
                ).fetchall()
            ]
            if filenames_relative_to_index:
                # Not implicitly relative to $PWD, explicitly relative to index file
                relative_path = os.path.abspath(os.path.dirname(index_filename))
                tmp = []
                for f in self._filenames:
                    if os.path.isabs(f):
                        tmp.append(f)
                    else:
                        # Would be stored with Unix / path separator, so convert
                        # it to the local OS path separator here:
                        tmp.append(
                            os.path.join(relative_path, f.replace("/", os.path.sep))
                        )
                self._filenames = tmp
                del tmp
            if filenames and len(filenames) != len(self._filenames):
                con.close()
                raise ValueError(
                    "Index file says %i files, not %i"
                    % (len(self._filenames), len(filenames))
                ) from None
            if filenames and filenames != self._filenames:
                for old, new in zip(self._filenames, filenames):
                    # Want exact match (after making relative to the index above)
                    if os.path.abspath(old) != os.path.abspath(new):
                        con.close()
                        if filenames_relative_to_index:
                            raise ValueError(
                                "Index file has different filenames, e.g. %r != %r"
                                % (os.path.abspath(old), os.path.abspath(new))
                            ) from None
                        else:
                            raise ValueError(
                                "Index file has different filenames "
                                "[This is an old index where any relative paths "
                                "were relative to the original working directory]. "
                                "e.g. %r != %r"
                                % (os.path.abspath(old), os.path.abspath(new))
                            ) from None
                # Filenames are equal (after imposing abspath)
        except sqlite3.OperationalError as err:
            # e.g. the file exists but is not a Biopython index database
            con.close()
            raise ValueError(f"Not a Biopython index database? {err}") from None
        # Now we have the format (from the DB if not given to us),
        if not proxy_factory(self._format):
            con.close()
            raise ValueError(f"Unsupported format '{self._format}'")

    def _build_index(self):
        """Call from __init__ to create a new index (PRIVATE)."""
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        fmt = self._format
        key_function = self._key_function
        proxy_factory = self._proxy_factory
        max_open = self._max_open
        random_access_proxies = self._proxies

        if not fmt or not filenames:
            raise ValueError(
                f"Filenames to index and format required to build {index_filename!r}"
            )
        if not proxy_factory(fmt):
            raise ValueError(f"Unsupported format '{fmt}'")
        # Create the index
        con = sqlite3.dbapi2.connect(index_filename)
        self._con = con
        # print("Creating index")
        # Sqlite PRAGMA settings for speed
        con.execute("PRAGMA synchronous=OFF")
        con.execute("PRAGMA locking_mode=EXCLUSIVE")
        # Don't index the key column until the end (faster)
        # con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
        # "offset INTEGER);")
        con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
        # count=-1 is a sentinel meaning "index under construction"; it is
        # replaced with the real record count at the end (see _load_index).
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("count", -1))
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("format", fmt))
        con.execute(
            "INSERT INTO meta_data (key, value) VALUES (?,?);",
            ("filenames_relative_to_index", "True"),
        )
        # TODO - Record the file size and modified date?
        con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);")
        con.execute(
            "CREATE TABLE offset_data (key TEXT, "
            "file_number INTEGER, offset INTEGER, length INTEGER);"
        )
        count = 0
        for file_index, filename in enumerate(filenames):
            # Default to storing as an absolute path,
            f = os.path.abspath(filename)
            if not os.path.isabs(filename) and not os.path.isabs(index_filename):
                # Since user gave BOTH filename & index as relative paths,
                # we will store this relative to the index file even though
                # if it may now start ../ (meaning up a level)
                # Note for cross platform use (e.g. shared drive over SAMBA),
                # convert any Windows slash into Unix style for rel paths.
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
            elif (os.path.dirname(os.path.abspath(filename)) + os.path.sep).startswith(
                relative_path + os.path.sep
            ):
                # Since sequence file is in same directory or sub directory,
                # might as well make this into a relative path:
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
                assert not f.startswith("../"), f
            # print("DEBUG - storing %r as [%r] %r" % (filename, relative_path, f))
            con.execute(
                "INSERT INTO file_data (file_number, name) VALUES (?,?);",
                (file_index, f),
            )
            random_access_proxy = proxy_factory(fmt, filename)
            if key_function:
                offset_iter = (
                    (key_function(key), file_index, offset, length)
                    for (key, offset, length) in random_access_proxy
                )
            else:
                offset_iter = (
                    (key, file_index, offset, length)
                    for (key, offset, length) in random_access_proxy
                )
            # Insert the offsets in batches of 100 to bound memory use
            while True:
                batch = list(itertools.islice(offset_iter, 100))
                if not batch:
                    break
                # print("Inserting batch of %i offsets, %s ... %s"
                # % (len(batch), batch[0][0], batch[-1][0]))
                con.executemany(
                    "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
                    batch,
                )
                con.commit()
                count += len(batch)
            if len(random_access_proxies) < max_open:
                # Keep this handle in the pool for later lookups
                random_access_proxies[file_index] = random_access_proxy
            else:
                random_access_proxy._handle.close()
        self._length = count
        # print("About to index %i entries" % count)
        try:
            # The UNIQUE index doubles as the duplicate-key check
            con.execute(
                "CREATE UNIQUE INDEX IF NOT EXISTS key_index ON offset_data(key);"
            )
        except sqlite3.IntegrityError as err:
            self._proxies = random_access_proxies
            self.close()
            con.close()
            raise ValueError(f"Duplicate key? {err}") from None
        con.execute("PRAGMA locking_mode=NORMAL")
        con.execute("UPDATE meta_data SET value = ? WHERE key = ?;", (count, "count"))
        con.commit()
        # print("Index created")

    def __repr__(self):
        """Return a string representation of the File object."""
        return self._repr

    def __contains__(self, key):
        """Return True if the key is in the index (without parsing the record)."""
        return bool(
            self._con.execute(
                "SELECT key FROM offset_data WHERE key=?;", (key,)
            ).fetchone()
        )

    def __len__(self):
        """Return the number of records indexed."""
        return self._length
        # return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0]

    def __iter__(self):
        """Iterate over the keys."""
        # Ordered by position on disk, i.e. original record order
        for row in self._con.execute(
            "SELECT key FROM offset_data ORDER BY file_number, offset;"
        ):
            yield str(row[0])

    def __getitem__(self, key):
        """Return record for the specified key.

        Raises KeyError if the key is not in the index, and ValueError if
        the record found at the stored offset does not match the key.
        """
        # Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset FROM offset_data WHERE key=?;", (key,)
        ).fetchone()
        if not row:
            raise KeyError
        file_number, offset = row
        proxies = self._proxies
        if file_number in proxies:
            record = proxies[file_number].get(offset)
        else:
            if len(proxies) >= self._max_open:
                # Close an old handle...
                # (dict.popitem evicts an arbitrary entry, not LRU - cheap
                # and good enough for this pool)
                proxies.popitem()[1]._handle.close()
            # Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            record = proxy.get(offset)
            proxies[file_number] = proxy
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError(f"Key did not match ({key} vs {key2})")
        return record

    def get_raw(self, key):
        """Return the raw record from the file as a bytes string.

        If the key is not found, a KeyError exception is raised.
        """
        # Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset, length FROM offset_data WHERE key=?;", (key,)
        ).fetchone()
        if not row:
            raise KeyError
        file_number, offset, length = row
        proxies = self._proxies
        if file_number in proxies:
            if length:
                # Shortcut if we have the length
                h = proxies[file_number]._handle
                h.seek(offset)
                return h.read(length)
            else:
                # Zero/missing length (e.g. SFF Roche index) - let the
                # proxy work out where the record ends
                return proxies[file_number].get_raw(offset)
        else:
            # This code is duplicated from __getitem__ to avoid a function call
            if len(proxies) >= self._max_open:
                # Close an old handle...
                proxies.popitem()[1]._handle.close()
            # Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            proxies[file_number] = proxy
            if length:
                # Shortcut if we have the length
                h = proxy._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxy.get_raw(offset)

    def close(self):
        """Close any open file handles."""
        # Note this closes the sequence file handles in the proxy pool,
        # not the SQLite connection itself.
        proxies = self._proxies
        while proxies:
            proxies.popitem()[1]._handle.close()
|