"""A dumb and slow but simple dbm clone.

For database spam, spam.dir contains the index (a text file),
spam.bak *may* contain a backup of the index (also a text file),
while spam.dat contains the data (a binary file).

XXX TO DO:

- seems to contain a bug when updating...

- reclaim free space (currently, space once occupied by deleted or expanded
items is never reused)

- support concurrent access (currently, if two processes take turns making
updates, they can mess up the index)

- support efficient access to large databases (currently, the whole index
is read when the database is opened, and some updates rewrite the whole index)

- support opening for read-only (flag = 'm')

"""

import ast as _ast
import io as _io
import os as _os
import collections.abc

__all__ = ["error", "open"]

_BLOCKSIZE = 512

error = OSError


def _key_to_bytes(key):
    """Return *key* in the form used for index lookups.

    str keys are encoded as UTF-8 and bytearray keys are copied into
    (hashable) bytes.  Any other object is returned unchanged so that the
    caller decides how to treat it.
    """
    if isinstance(key, str):
        return key.encode('utf-8')
    if isinstance(key, bytearray):
        return bytes(key)
    return key


class _Database(collections.abc.MutableMapping):
    """Mapping of bytes keys to bytes values stored in three files.

    ``<base>.dat`` holds the values, ``<base>.dir`` holds the index as
    text, and ``<base>.bak`` holds the previous index written by the last
    ``_commit()``.
    """

    # The on-disk directory and data files can remain in mutually
    # inconsistent states for an arbitrarily long time (see comments
    # at the end of __setitem__).  This is only repaired when _commit()
    # gets called.  One place _commit() gets called is from __del__(),
    # and if that occurs at program shutdown time, module globals may
    # already have gotten rebound to None.  Since it's crucial that
    # _commit() finish successfully, we can't ignore shutdown races
    # here, and _commit() must not reference any globals.
    _os = _os       # for _commit()
    _io = _io       # for _commit()

    # Class-level defaults so that close() / __del__() are harmless on an
    # instance whose __init__() raised before these were assigned.
    _index = None
    _modified = False

    def __init__(self, filebasename, mode, flag='c'):
        """Open (and, for flags 'c' and 'n', create) the database files.

        filebasename is the path without extension, mode is the permission
        value passed to os.chmod() for the files this object writes, and
        flag is one of 'r', 'w', 'c' or 'n'.  Only 'r' makes the object
        read-only.
        """
        self._mode = mode
        self._readonly = (flag == 'r')

        # The directory file is a text file.  Each line looks like
        #    "%r, (%d, %d)\n" % (key, pos, siz)
        # where key is the string key, pos is the offset into the dat
        # file of the associated value's first byte, and siz is the number
        # of bytes in the associated value.
        self._dirfile = filebasename + '.dir'

        # The data file is a binary file pointed into by the directory
        # file, and holds the values associated with keys.  Each value
        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
        # binary 8-bit string value.
        self._datfile = filebasename + '.dat'
        self._bakfile = filebasename + '.bak'

        # The index is an in-memory dict, mirroring the directory file.
        self._index = None  # maps keys to (pos, siz) pairs

        # Handle the creation
        self._create(flag)
        self._update(flag)

    def _create(self, flag):
        """Make sure the data file exists, honouring *flag*.

        With 'n' any existing .dat/.bak/.dir files are removed first.  A
        missing data file is created for 'c' and 'n'; for any other flag
        the OSError from the failed open is propagated.
        """
        if flag == 'n':
            for filename in (self._datfile, self._bakfile, self._dirfile):
                try:
                    _os.remove(filename)
                except OSError:
                    pass
        # Mod by Jack: create data file if needed
        try:
            f = _io.open(self._datfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                raise
            with _io.open(self._datfile, 'w', encoding="Latin-1") as f:
                self._chmod(self._datfile)
        else:
            f.close()

    # Read directory file into the in-memory index dict.
    def _update(self, flag):
        """Load the directory file into ``self._index``.

        A missing directory file is an error unless flag is 'c' or 'n', in
        which case the index starts empty and is marked modified so the
        next _commit() writes the file.
        """
        self._modified = False
        self._index = {}
        try:
            f = _io.open(self._dirfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                raise
            self._modified = True
        else:
            with f:
                for line in f:
                    line = line.rstrip()
                    key, pos_and_siz_pair = _ast.literal_eval(line)
                    key = key.encode('Latin-1')
                    self._index[key] = pos_and_siz_pair

    # Write the index dict to the directory file.  The original directory
    # file (if any) is renamed with a .bak extension first.  If a .bak
    # file currently exists, it's deleted.
    def _commit(self):
        """Rewrite the directory file from the in-memory index.

        Does nothing when the database is closed or has no pending
        modifications.
        """
        # CAUTION:  It's vital that _commit() succeed, and _commit() can
        # be called from __del__().  Therefore we must never reference a
        # global in this routine.
        if self._index is None or not self._modified:
            return  # nothing to do

        try:
            self._os.unlink(self._bakfile)
        except OSError:
            pass

        try:
            self._os.rename(self._dirfile, self._bakfile)
        except OSError:
            pass

        with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            for key, pos_and_siz_pair in self._index.items():
                # Use Latin-1 since it has no qualms with any value in any
                # position; UTF-8, though, does care sometimes.
                entry = "%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair)
                f.write(entry)

    sync = _commit

    def _verify_open(self):
        """Raise ``error`` if the database has been closed."""
        if self._index is None:
            raise error('DBM object has already been closed')

    def __getitem__(self, key):
        """Return the bytes value stored for *key*.

        Raises KeyError for an unknown key and ``error`` if the database
        is closed.
        """
        key = _key_to_bytes(key)
        self._verify_open()
        pos, siz = self._index[key]     # may raise KeyError
        with _io.open(self._datfile, 'rb') as f:
            f.seek(pos)
            dat = f.read(siz)
        return dat

    # Append val to the data file, starting at a _BLOCKSIZE-aligned
    # offset.  The data file is first padded with NUL bytes (if needed)
    # to get to an aligned offset.  Return pair
    #     (starting offset of val, len(val))
    def _addval(self, val):
        """Append *val* at the next aligned offset; return (pos, len(val))."""
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(0, 2)
            pos = int(f.tell())
            npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
            f.write(b'\0' * (npos - pos))
            pos = npos
            f.write(val)
        return (pos, len(val))

    # Write val to the data file, starting at offset pos.  The caller
    # is responsible for ensuring that there's enough room starting at
    # pos to hold val, without overwriting some other value.  Return
    # pair (pos, len(val)).
    def _setval(self, pos, val):
        """Overwrite the data file at *pos* with *val*; return (pos, len(val))."""
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(pos)
            f.write(val)
        return (pos, len(val))

    # key is a new key whose associated value starts in the data file
    # at offset pos and with length siz.  Add an index record to
    # the in-memory index dict, and append one to the directory file.
    def _addkey(self, key, pos_and_siz_pair):
        """Record a new key in the index and append it to the directory file."""
        self._index[key] = pos_and_siz_pair
        with _io.open(self._dirfile, 'a', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            f.write("%r, %r\n" % (key.decode("Latin-1"), pos_and_siz_pair))

    def __setitem__(self, key, val):
        """Store *val* under *key*.

        Keys and values may be str (encoded as UTF-8), bytes or bytearray;
        anything else raises TypeError.  Raises ``error`` if the database
        is read-only or closed.
        """
        if self._readonly:
            raise error('The database is opened for reading only')
        # bytearray keys are copied to bytes: the index is a dict and a
        # bytearray is not hashable.
        key = _key_to_bytes(key)
        if not isinstance(key, bytes):
            raise TypeError("keys must be bytes or strings")
        if isinstance(val, str):
            val = val.encode('utf-8')
        elif not isinstance(val, (bytes, bytearray)):
            raise TypeError("values must be bytes or strings")
        self._verify_open()
        self._modified = True
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
            # See whether the new value is small enough to fit in the
            # (padded) space currently occupied by the old value.
            pos, siz = self._index[key]
            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
            if newblocks <= oldblocks:
                self._index[key] = self._setval(pos, val)
            else:
                # The new value doesn't fit in the (padded) space used
                # by the old value.  The blocks used by the old value are
                # forever lost.
                self._index[key] = self._addval(val)

            # Note that _index may be out of synch with the directory
            # file now:  _setval() and _addval() don't update the directory
            # file.  This also means that the on-disk directory and data
            # files are in a mutually inconsistent state, and they'll
            # remain that way until _commit() is called.  Note that this
            # is a disaster (for the database) if the program crashes
            # (so that _commit() never gets called).

    def __delitem__(self, key):
        """Remove *key* from the index and commit the directory file.

        Raises KeyError for an unknown key and ``error`` if the database
        is read-only or closed.
        """
        if self._readonly:
            raise error('The database is opened for reading only')
        key = _key_to_bytes(key)
        self._verify_open()
        self._modified = True
        # The blocks used by the associated value are lost.
        del self._index[key]
        # XXX It's unclear why we do a _commit() here (the code always
        # XXX has, so I'm not changing it).  __setitem__ doesn't try to
        # XXX keep the directory file in synch.  Why should we?  Or
        # XXX why shouldn't __setitem__?
        self._commit()

    def keys(self):
        """Return a list of all keys; raise ``error`` if closed."""
        try:
            return list(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def items(self):
        """Return a list of (key, value) pairs; raise ``error`` if closed."""
        self._verify_open()
        return [(key, self[key]) for key in self._index]

    def __contains__(self, key):
        """Return True if *key* is in the index; raise ``error`` if closed."""
        key = _key_to_bytes(key)
        try:
            return key in self._index
        except TypeError:
            # Either the database is closed (_index is None) or the key is
            # unhashable; only the former is reported as ``error``.
            if self._index is None:
                raise error('DBM object has already been closed') from None
            else:
                raise

    def iterkeys(self):
        """Return an iterator over the keys; raise ``error`` if closed."""
        try:
            return iter(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None
    __iter__ = iterkeys

    def __len__(self):
        """Return the number of keys; raise ``error`` if closed."""
        try:
            return len(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def close(self):
        """Commit pending index changes and mark the database closed.

        The file names and the index are cleared even if the commit fails.
        Calling close() again is a no-op.
        """
        try:
            self._commit()
        finally:
            self._index = self._datfile = self._dirfile = self._bakfile = None

    __del__ = close

    def _chmod(self, file):
        """Apply the mode given at construction time to *file*."""
        self._os.chmod(file, self._mode)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()


def open(file, flag='c', mode=0o666):
    """Open the database file, filename, and return corresponding object.

    The flag argument controls how the database is opened:

    - 'r': open an existing database for reading only; attempts to store
      or delete items raise ``error``.
    - 'w': open an existing database for reading and writing.
    - 'c': open for reading and writing, creating the database if it does
      not exist (the default).
    - 'n': always create a new, empty database, removing any existing
      files first.

    For 'r' and 'w' a missing data or directory file raises OSError.  Any
    other flag value raises ValueError.

    The optional mode argument is the UNIX mode of the file, used only when
    the database has to be created.  It defaults to octal code 0o666 (and
    will be modified by the prevailing umask).

    """

    # Modify mode depending on the umask
    try:
        um = _os.umask(0)
        _os.umask(um)
    except AttributeError:
        pass
    else:
        # Turn off any bits that are set in the umask
        mode = mode & (~um)
    if flag not in ('r', 'w', 'c', 'n'):
        raise ValueError("Flag must be one of 'r', 'w', 'c', or 'n'")
    return _Database(file, mode, flag=flag)