1 #
2 # Copyright (c) 2001 Bizar Software Pty Ltd (http://www.bizarsoftware.com.au/)
3 # This module is free software, and you may redistribute it and/or modify
4 # under the same terms as Python, so long as this copyright message and
5 # disclaimer are retained in their original form.
6 #
7 # IN NO EVENT SHALL BIZAR SOFTWARE PTY LTD BE LIABLE TO ANY PARTY FOR
8 # DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING
9 # OUT OF THE USE OF THIS CODE, EVEN IF THE AUTHOR HAS BEEN ADVISED OF THE
10 # POSSIBILITY OF SUCH DAMAGE.
11 #
12 # BIZAR SOFTWARE PTY LTD SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
13 # BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
14 # FOR A PARTICULAR PURPOSE. THE CODE PROVIDED HEREUNDER IS ON AN "AS IS"
15 # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
16 # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
17 #
18 """This module exports file storage for roundup backends.
19 Files are stored into a directory hierarchy.
20 """
21 __docformat__ = 'restructuredtext'
23 import os
def files_in_dir(dir):
    """Return how many regular files live under ``dir``.

    Subdirectories are descended into recursively and their files are
    included in the total.  A path that does not exist counts as zero.
    """
    if not os.path.exists(dir):
        return 0

    # Resolve every directory entry to a full path once, then classify.
    paths = [os.path.join(dir, entry) for entry in os.listdir(dir)]
    direct = sum(1 for path in paths if os.path.isfile(path))
    nested = sum(files_in_dir(path) for path in paths if os.path.isdir(path))
    return direct + nested
class FileStorage:
    """Store files in some directory structure

    Some databases do not permit the storage of arbitrary data (i.e.,
    file content).  And, some database schema explicitly store file
    content in the filesystem.  In particular, if a class defines a
    'filename' property, it is assumed that the data is stored in the
    indicated file, outside of whatever database Roundup is otherwise
    using.

    In these situations, it is difficult to maintain the transactional
    abstractions used elsewhere in Roundup.  In particular, if a
    file's content is edited, but then the containing transaction is
    not committed, we do not want to commit the edit.  Similarly, we
    would like to guarantee that if a transaction is committed to the
    database, then the edit has in fact taken place.

    This class provides an approximation of these transactional
    requirements.

    For classes that do not have a 'filename' property, the file name
    used to store the file's content is a deterministic function of
    the classname and nodeid for the file.  The 'filename' function
    computes this name.  The name will contain directories and
    subdirectories, but, suppose, for the purposes of what follows,
    that the filename is 'file'.

    Edit Protocol
    -------------

    When a file is created or edited, the following protocol is used:

    1. The new content of the file is placed in 'file.tmp'.

    2. A transaction is recorded in 'self.transactions' referencing the
       'doStoreFile' method of this class.

    3. At some subsequent point, the database 'commit' function is
       called.  This function first performs a traditional database
       commit (for example, by issuing a SQL command to commit the
       current transaction), and, then, runs the transactions recorded
       in 'self.transactions'.

    4. The 'doStoreFile' method renames the 'file.tmp' to 'file'.

    If Step 3 never occurs, but, instead, the database 'rollback'
    method is called, then that method, after rolling back the
    database transaction, calls 'rollbackStoreFile', which removes
    'file.tmp'.

    Race Condition
    --------------

    If two Roundup instances (say, the mail gateway and a web client,
    or two web clients running with a multi-process server) attempt
    edits at the same time, both will write to 'file.tmp', and the
    results will be indeterminate.

    Crash Analysis
    --------------

    There are several situations that may occur if a crash (whether
    because the machine crashes, because an unhandled Python exception
    is raised, or because the Python process is killed) occurs.

    Complexity ensues because backing up an RDBMS is generally more
    complex than simply copying a file.  Instead, some command is run
    which stores a snapshot of the database in a file.  So, if you
    back up the database to a file, and then back up the filesystem,
    it is likely that further database transactions have occurred
    between the point of database backup and the point of filesystem
    backup.

    For the purposes of this analysis, we assume that the filesystem
    backup occurred after the database backup.  Furthermore, we assume
    that filesystem backups are atomic; i.e., that the filesystem is
    not being modified during the backup.

    1. Neither the 'commit' nor 'rollback' methods on the database are
       ever called.

       In this case, the '.tmp' file should be ignored as the
       transaction was not committed.

    2. The 'commit' method is called.  Subsequently, the machine
       crashes, and is restored from backups.

       The most recent filesystem backup and the most recent database
       backup are not in general from the same instant in time.

       This problem means that we can never be sure after a crash if
       the contents of a file are what we intend.  It is always
       possible that an edit was made to the file that is not
       reflected in the filesystem.

    3. A crash occurs between the point of the database commit and the
       call to 'doStoreFile'.

       If only one of 'file' and 'file.tmp' exists, then that
       version should be used.  However, if both 'file' and 'file.tmp'
       exist, there is no way to know which version to use.

    Reading the File
    ----------------

    When determining the content of the file, we use the following
    algorithm:

    1. If 'self.transactions' reflects an edit of the file, then use
       'file.tmp'.

       We know that an edit to the file is in process so 'file.tmp' is
       the right choice.  If 'file.tmp' does not exist, raise an
       exception; something has removed the content of the file while
       we are in the process of editing it.

    2. Otherwise, if 'file.tmp' exists, and 'file' does not, use
       'file.tmp'.

       We know that the file is supposed to exist because there is a
       reference to it in the database.  Since 'file' does not exist,
       we assume that Crash 3 occurred during the initial creation of
       the file.

    3. Otherwise, use 'file'.

       If 'file.tmp' is not present, this is obviously the best we can
       do.  This is always the right answer unless Crash 2 occurred,
       in which case the contents of 'file' may be newer than they
       were at the point of database backup.

       If 'file.tmp' is present, we know that we are not actively
       editing the file.  The possibilities are:

       a. Crash 1 has occurred.  In this case, using 'file' is the
          right answer, so we will have chosen correctly.

       b. Crash 3 has occurred.  In this case, 'file.tmp' is the right
          answer, so we will have chosen incorrectly.  However, 'file'
          was at least a previously committed value.

    Future Improvements
    -------------------

    One approach would be to take advantage of databases which do
    allow the storage of arbitrary data.  For example, MySQL provides
    the LONGBLOB datatype for storing up to 4GB of data.

    Another approach would be to store a version ('v') in the actual
    database and name files 'file.v'.  Then, the editing protocol
    would become:

    1. Generate a new version 'v', guaranteed to be different from all
       other versions ever used by the database.  (The version need
       not be in any particular sequence; a UUID would be fine.)

    2. Store the content in 'file.v'.

    3. Update the database to indicate that the version of the node is
       'v'.

    Now, if the transaction is committed, the database will refer to
    'file.v', where the content exists.  If the transaction is rolled
    back, or not committed, 'file.v' will never be referenced.  In the
    event of a crash, under the assumptions above, there may be
    'file.v' files that are not referenced by the database, but the
    database will be consistent, so long as unreferenced 'file.v'
    files are never removed until after the database has been backed
    up.
    """

    tempext = '.tmp'
    """The suffix added to files indicating that they are uncommitted."""

    def __init__(self, umask):
        """Remember the umask to apply before each file write.

        NOTE(review): the methods below also read 'self.dir' and
        'self.transactions', neither of which is set here -- presumably
        they are provided by the database class this is combined with.
        """
        self.umask = umask

    def subdirFilename(self, classname, nodeid, property=None):
        """Determine what the filename and subdir for nodeid + classname is.

        Returns the relative path '<bucket>/<classname><nodeid>' with
        '.<property>' appended when a property is given.  Raises
        ValueError if 'nodeid' cannot be converted to an integer.
        """
        if property:
            name = '%s%s.%s'%(classname, nodeid, property)
        else:
            # roundupdb.FileClass never specified the property name, so don't
            # include it
            name = '%s%s'%(classname, nodeid)

        # have a separate subdir for every thousand messages; floor
        # division keeps the bucket an integer ('1', not '1.234') even
        # where '/' is true division
        subdir = str(int(nodeid) // 1000)
        return os.path.join(subdir, name)

    def _tempfile(self, filename):
        """Return a temporary filename.

        'filename' -- The name of the eventual destination file."""

        return filename + self.tempext

    def _editInProgress(self, classname, nodeid, property):
        """Return true if the file indicated is being edited.

        returns -- True if the current transaction includes an edit to
        the file indicated."""

        for method, args in self.transactions:
            if (method == self.doStoreFile and
                    args == (classname, nodeid, property)):
                return True

        return False

    def filename(self, classname, nodeid, property=None, create=0):
        """Determine what the filename for the given node and optionally
        property is.

        Try a variety of different filenames - the file could be in the
        usual place, or it could be in a temp file pre-commit *or* it
        could be in an old-style, backwards-compatible flat directory.

        With a true 'create' the post-commit name is returned without
        touching the filesystem.  Otherwise IOError is raised when no
        candidate file exists.
        """
        filename = os.path.join(self.dir, 'files', classname,
                                self.subdirFilename(classname, nodeid,
                                                    property))
        # If the caller is going to create the file, return the
        # post-commit filename.  It is the caller's responsibility to
        # add self.tempext when actually creating the file.
        if create:
            return filename

        tempname = self._tempfile(filename)

        # If an edit to this file is in progress, then return the name
        # of the temporary file containing the edited content.
        if self._editInProgress(classname, nodeid, property):
            if not os.path.exists(tempname):
                raise IOError('content file for %s not found'%tempname)
            return tempname

        if os.path.exists(filename):
            return filename

        # Otherwise, if the temporary file exists, then the probable
        # explanation is that a crash occurred between the point that
        # the database entry recording the creation of the file
        # occurred and the point at which the file was renamed from the
        # temporary name to the final name.
        if os.path.exists(tempname):
            try:
                # Clean up, by performing the commit now.
                os.rename(tempname, filename)
            except OSError:
                # If two Roundup clients both try to rename the file
                # at the same time, only one of them will succeed.
                # So, tolerate such an error -- but no other.
                pass
            if not os.path.exists(filename):
                raise IOError('content file for %s not found'%filename)
            return filename

        # ok, try flat (very old-style)
        if property:
            filename = os.path.join(self.dir, 'files', '%s%s.%s'%(classname,
                nodeid, property))
        else:
            filename = os.path.join(self.dir, 'files', '%s%s'%(classname,
                nodeid))
        if os.path.exists(filename):
            return filename

        # file just ain't there
        raise IOError('content file for %s not found'%filename)

    def storefile(self, classname, nodeid, property, content):
        """Store the content of the file in the database. The property may be
        None, in which case the filename does not indicate which property
        is being saved.

        The content is written to the temporary ('.tmp') name and a
        'doStoreFile' entry is appended to 'self.transactions' (once per
        file) so that the rename happens at commit time.
        """
        # determine the name of the file to write to
        name = self.filename(classname, nodeid, property, create=1)

        # make sure the file storage dir exists
        if not os.path.exists(os.path.dirname(name)):
            os.makedirs(os.path.dirname(name))

        # save to a temp file
        name = self._tempfile(name)

        # make sure we don't register the rename action more than once
        if not self._editInProgress(classname, nodeid, property):
            # save off the rename action
            self.transactions.append((self.doStoreFile, (classname, nodeid,
                property)))
        # always set umask before writing to make sure we have the proper one
        # in multi-tracker (i.e. multi-umask) or modpython scenarios
        # the umask may have changed since last we set it.
        os.umask(self.umask)
        f = open(name, 'wb')
        try:
            f.write(content)
        finally:
            # close explicitly so the data is flushed and the handle
            # released before the commit-time rename
            f.close()

    def getfile(self, classname, nodeid, property):
        """Get the content of the file in the database.

        Returns the raw bytes of whichever file 'filename' selects;
        IOError from 'filename' propagates when there is no such file.
        """
        filename = self.filename(classname, nodeid, property)

        f = open(filename, 'rb')
        try:
            # snarf the contents and make sure we close the file
            return f.read()
        finally:
            f.close()

    def numfiles(self):
        """Get number of files in storage, even across subdirectories.
        """
        files_dir = os.path.join(self.dir, 'files')
        return files_in_dir(files_dir)

    def doStoreFile(self, classname, nodeid, property, **databases):
        """Store the file as part of a transaction commit.

        Moves the '.tmp' file onto its final name and returns
        (classname, nodeid).  The extra keyword arguments are accepted
        but not used here.
        """
        # determine the name of the file to write to
        name = self.filename(classname, nodeid, property, 1)

        # the file is currently ".tmp" - move it to its real name to commit
        # NOTE(review): with create=1, filename() returns the final name,
        # so the first branch is only taken when that name itself happens
        # to end in the temp suffix.
        if name.endswith(self.tempext):
            # creation
            dstname = os.path.splitext(name)[0]
        else:
            # edit operation
            dstname = name
            name = self._tempfile(name)

        # content is being updated (and some platforms, eg. win32, won't
        # let us rename over the top of the old file)
        if os.path.exists(dstname):
            os.remove(dstname)

        os.rename(name, dstname)

        # return the classname, nodeid so we reindex this content
        return (classname, nodeid)

    def rollbackStoreFile(self, classname, nodeid, property, **databases):
        """Remove the temp file as a part of a rollback

        The extra keyword arguments are accepted but not used here.
        Errors from 'filename' (IOError) or 'os.remove' propagate.
        """
        # determine the name of the file to delete
        name = self.filename(classname, nodeid, property)
        if not name.endswith(self.tempext):
            name += self.tempext
        os.remove(name)

    def isStoreFile(self, classname, nodeid):
        """See if there is actually any FileStorage for this node.
        Is there a better way than using self.filename?
        """
        try:
            self.filename(classname, nodeid)
        except IOError:
            return False
        return True

    def destroy(self, classname, nodeid):
        """If there is actually FileStorage for this node
        remove it from the filesystem
        """
        try:
            name = self.filename(classname, nodeid)
        except IOError:
            # nothing stored for this node
            return
        os.remove(name)
402 # vim: set filetype=python ts=4 sw=4 et si