jpayne@68
|
1 """A dumb and slow but simple dbm clone.
|
jpayne@68
|
2
|
jpayne@68
|
3 For database spam, spam.dir contains the index (a text file),
|
jpayne@68
|
4 spam.bak *may* contain a backup of the index (also a text file),
|
jpayne@68
|
5 while spam.dat contains the data (a binary file).
|
jpayne@68
|
6
|
jpayne@68
|
7 XXX TO DO:
|
jpayne@68
|
8
|
jpayne@68
|
9 - seems to contain a bug when updating...
|
jpayne@68
|
10
|
jpayne@68
|
11 - reclaim free space (currently, space once occupied by deleted or expanded
|
jpayne@68
|
12 items is never reused)
|
jpayne@68
|
13
|
jpayne@68
|
14 - support concurrent access (currently, if two processes take turns making
|
jpayne@68
|
15 updates, they can mess up the index)
|
jpayne@68
|
16
|
jpayne@68
|
17 - support efficient access to large databases (currently, the whole index
|
jpayne@68
|
18 is read when the database is opened, and some updates rewrite the whole index)
|
jpayne@68
|
19
|
jpayne@68
|
20 - support opening for read-only (flag = 'm')
|
jpayne@68
|
21
|
jpayne@68
|
22 """
|
jpayne@68
|
23
|
jpayne@68
|
24 import ast as _ast
|
jpayne@68
|
25 import io as _io
|
jpayne@68
|
26 import os as _os
|
jpayne@68
|
27 import collections.abc
|
jpayne@68
|
28
|
jpayne@68
|
29 __all__ = ["error", "open"]
|
jpayne@68
|
30
|
jpayne@68
|
31 _BLOCKSIZE = 512
|
jpayne@68
|
32
|
jpayne@68
|
33 error = OSError
|
jpayne@68
|
34
|
jpayne@68
|
class _Database(collections.abc.MutableMapping):
    """Mutable mapping of bytes keys to bytes values kept in plain files.

    Three files derived from the base name are used: ``.dat`` holds the
    values, ``.dir`` holds the index as text, and ``.bak`` receives the
    previous ``.dir`` file whenever the index is rewritten.  The complete
    index lives in memory (``self._index``) while the database is open;
    close() sets it to None, which is how a closed database is recognised.

    str keys and values are encoded as UTF-8.  bytearray keys are copied
    to bytes so that they can be used as keys of the in-memory index.
    """

    # The on-disk directory and data files can remain in mutually
    # inconsistent states for an arbitrarily long time (see comments
    # at the end of __setitem__).  This is only repaired when _commit()
    # gets called.  One place _commit() gets called is from __del__(),
    # and if that occurs at program shutdown time, module globals may
    # already have gotten rebound to None.  Since it's crucial that
    # _commit() finish successfully, we can't ignore shutdown races
    # here, and _commit() must not reference any globals.
    _os = _os       # for _commit()
    _io = _io       # for _commit()

    def __init__(self, filebasename, mode, flag='c'):
        """Open (and possibly create) the database named *filebasename*.

        *mode* is the permission mask handed to os.chmod() for the files
        this object writes.  *flag* selects the behaviour: 'r' makes the
        mapping read-only, 'n' removes any existing files first, and only
        'c' and 'n' allow missing files to be created; with any other
        flag a missing file raises OSError.
        """
        self._mode = mode
        self._readonly = (flag == 'r')

        # The directory file is a text file.  Each line looks like
        #    "%r, (%d, %d)\n" % (key, pos, siz)
        # where key is the string key, pos is the offset into the dat
        # file of the associated value's first byte, and siz is the number
        # of bytes in the associated value.
        self._dirfile = filebasename + '.dir'

        # The data file is a binary file pointed into by the directory
        # file, and holds the values associated with keys.  Each value
        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
        # binary 8-bit string value.
        self._datfile = filebasename + '.dat'
        self._bakfile = filebasename + '.bak'

        # The index is an in-memory dict, mirroring the directory file.
        self._index = None  # maps keys to (pos, siz) pairs

        # Handle the creation
        self._create(flag)
        self._update(flag)

    @staticmethod
    def _coerce_key(key):
        """Return *key* in the form used by the in-memory index.

        str is encoded as UTF-8 and bytearray is copied to bytes (a
        bytearray is unhashable, so it cannot be a dict key).  Any other
        object is returned unchanged and left for the caller to handle.
        """
        if isinstance(key, str):
            return key.encode('utf-8')
        if isinstance(key, bytearray):
            return bytes(key)
        return key

    def _create(self, flag):
        """Make sure the data file exists, honouring *flag*.

        With flag 'n' the data, backup and directory files are removed
        first.  A missing data file is created only for 'c' and 'n';
        otherwise the OSError from the failed open propagates.
        """
        if flag == 'n':
            for filename in (self._datfile, self._bakfile, self._dirfile):
                try:
                    _os.remove(filename)
                except OSError:
                    pass
        # Mod by Jack: create data file if needed
        try:
            f = _io.open(self._datfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                raise
            with _io.open(self._datfile, 'w', encoding="Latin-1") as f:
                self._chmod(self._datfile)
        else:
            # The open was only an existence probe.
            f.close()

    # Read directory file into the in-memory index dict.
    def _update(self, flag):
        """Load the directory file into ``self._index``.

        A missing directory file is tolerated only for flags 'c' and 'n';
        the database is then marked modified so that the next _commit()
        writes one.  Otherwise the OSError propagates.
        """
        self._modified = False
        self._index = {}
        try:
            f = _io.open(self._dirfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                raise
            self._modified = True
        else:
            with f:
                for line in f:
                    line = line.rstrip()
                    # literal_eval only accepts Python literals, so the
                    # directory file cannot run arbitrary code.
                    key, pos_and_siz_pair = _ast.literal_eval(line)
                    key = key.encode('Latin-1')
                    self._index[key] = pos_and_siz_pair

    # Write the index dict to the directory file.  The original directory
    # file (if any) is renamed with a .bak extension first.  If a .bak
    # file currently exists, it's deleted.
    def _commit(self):
        """Rewrite the directory file from the in-memory index.

        Does nothing when the database is closed or has no uncommitted
        changes.  After a successful write the modified flag is cleared,
        so a repeated sync()/close() neither rewrites the index nor
        replaces the backup file with a copy of the current index.
        """
        # CAUTION:  It's vital that _commit() succeed, and _commit() can
        # be called from __del__().  Therefore we must never reference a
        # global in this routine.
        if self._index is None or not self._modified:
            return  # nothing to do

        try:
            self._os.unlink(self._bakfile)
        except OSError:
            pass

        try:
            self._os.rename(self._dirfile, self._bakfile)
        except OSError:
            pass

        with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            for key, pos_and_siz_pair in self._index.items():
                # Use Latin-1 since it has no qualms with any value in any
                # position; UTF-8, though, does care sometimes.
                entry = "%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair)
                f.write(entry)

        # The directory file now mirrors the index again.
        self._modified = False

    sync = _commit

    def _verify_open(self):
        """Raise ``error`` if the database has already been closed."""
        if self._index is None:
            raise error('DBM object has already been closed')

    def __getitem__(self, key):
        """Return the bytes stored under *key*; raise KeyError if absent."""
        key = self._coerce_key(key)
        self._verify_open()
        pos, siz = self._index[key]     # may raise KeyError
        with _io.open(self._datfile, 'rb') as f:
            f.seek(pos)
            dat = f.read(siz)
        return dat

    # Append val to the data file, starting at a _BLOCKSIZE-aligned
    # offset.  The data file is first padded with NUL bytes (if needed)
    # to get to an aligned offset.  Return pair
    #     (starting offset of val, len(val))
    def _addval(self, val):
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(0, 2)
            pos = int(f.tell())
            # Round the end-of-file offset up to the next block boundary.
            npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
            f.write(b'\0'*(npos-pos))
            pos = npos
            f.write(val)
        return (pos, len(val))

    # Write val to the data file, starting at offset pos.  The caller
    # is responsible for ensuring that there's enough room starting at
    # pos to hold val, without overwriting some other value.  Return
    # pair (pos, len(val)).
    def _setval(self, pos, val):
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(pos)
            f.write(val)
        return (pos, len(val))

    # key is a new key whose associated value starts in the data file
    # at offset pos and with length siz.  Add an index record to
    # the in-memory index dict, and append one to the directory file.
    def _addkey(self, key, pos_and_siz_pair):
        self._index[key] = pos_and_siz_pair
        with _io.open(self._dirfile, 'a', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            f.write("%r, %r\n" % (key.decode("Latin-1"), pos_and_siz_pair))

    def __setitem__(self, key, val):
        """Store *val* under *key*.

        Keys and values may be str, bytes or bytearray; anything else
        raises TypeError.  Raises ``error`` when the database is
        read-only or already closed.
        """
        if self._readonly:
            raise error('The database is opened for reading only')
        if not isinstance(key, (str, bytes, bytearray)):
            raise TypeError("keys must be bytes or strings")
        key = self._coerce_key(key)
        if isinstance(val, str):
            val = val.encode('utf-8')
        elif not isinstance(val, (bytes, bytearray)):
            raise TypeError("values must be bytes or strings")
        self._verify_open()
        self._modified = True
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
            # See whether the new value is small enough to fit in the
            # (padded) space currently occupied by the old value.
            pos, siz = self._index[key]
            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
            if newblocks <= oldblocks:
                self._index[key] = self._setval(pos, val)
            else:
                # The new value doesn't fit in the (padded) space used
                # by the old value.  The blocks used by the old value are
                # forever lost.
                self._index[key] = self._addval(val)

            # Note that _index may be out of synch with the directory
            # file now:  _setval() and _addval() don't update the directory
            # file.  This also means that the on-disk directory and data
            # files are in a mutually inconsistent state, and they'll
            # remain that way until _commit() is called.  Note that this
            # is a disaster (for the database) if the program crashes
            # (so that _commit() never gets called).

    def __delitem__(self, key):
        """Remove *key* and immediately rewrite the directory file.

        Raises KeyError if *key* is absent, and ``error`` when the
        database is read-only or already closed.
        """
        if self._readonly:
            raise error('The database is opened for reading only')
        key = self._coerce_key(key)
        self._verify_open()
        # The blocks used by the associated value are lost.
        del self._index[key]
        # Only mark the database dirty once the delete has succeeded.
        self._modified = True
        # XXX It's unclear why we do a _commit() here (the code always
        # XXX has, so I'm not changing it).  __setitem__ doesn't try to
        # XXX keep the directory file in synch.  Why should we?  Or
        # XXX why shouldn't __setitem__?
        self._commit()

    def keys(self):
        """Return a list of all keys (bytes objects)."""
        try:
            return list(self._index)
        except TypeError:
            # self._index is None once the database has been closed.
            raise error('DBM object has already been closed') from None

    def items(self):
        """Return a list of (key, value) pairs, reading every value."""
        self._verify_open()
        return [(key, self[key]) for key in self._index]

    def __contains__(self, key):
        """Return True if *key* is present in the index."""
        key = self._coerce_key(key)
        try:
            return key in self._index
        except TypeError:
            if self._index is None:
                raise error('DBM object has already been closed') from None
            else:
                # Unhashable key on an open database: not ours to hide.
                raise

    def iterkeys(self):
        """Return an iterator over the keys."""
        try:
            return iter(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None
    __iter__ = iterkeys

    def __len__(self):
        """Return the number of keys."""
        try:
            return len(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def close(self):
        """Commit pending index changes and mark the database closed."""
        try:
            self._commit()
        finally:
            # Cleared even when the commit fails, so the object always
            # ends up in the closed state.
            self._index = self._datfile = self._dirfile = self._bakfile = None

    __del__ = close

    def _chmod(self, file):
        """Apply the permission mask given at construction to *file*."""
        self._os.chmod(file, self._mode)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
|
jpayne@68
|
288
|
jpayne@68
|
289
|
jpayne@68
|
def open(file, flag='c', mode=0o666):
    """Open the database *file* and return the corresponding object.

    *file* is the base name of the database; it is passed unchanged to
    _Database.

    The *flag* argument must be one of:

    'r' -- open an existing database read-only; storing or deleting
           items raises ``error``
    'w' -- open an existing database for reading and writing
    'c' -- open for reading and writing, creating the database if it
           does not exist (the default)
    'n' -- always start from a new, empty database, removing any
           existing files first

    Any other value raises ValueError.

    The optional *mode* argument is the UNIX mode of the file, used only
    when the database has to be created.  It defaults to octal code 0o666
    (and will be modified by the prevailing umask).

    """
    # Reject a bad flag before touching any process-wide state.
    if flag not in ('r', 'w', 'c', 'n'):
        raise ValueError("Flag must be one of 'r', 'w', 'c', or 'n'")

    # Modify mode depending on the umask.  os.umask() can only be read by
    # setting it, so set it to 0 and restore the old value right away.
    try:
        um = _os.umask(0)
        _os.umask(um)
    except AttributeError:
        # No umask on this platform: use mode as given.
        pass
    else:
        # Turn off any bits that are set in the umask
        mode = mode & (~um)
    return _Database(file, mode, flag=flag)
|