Commit | Line | Data |
---|---|---|
c638d827 CR |
1 | # |
2 | # Copyright (c) 2001 Bizar Software Pty Ltd (http://www.bizarsoftware.com.au/) | |
3 | # This module is free software, and you may redistribute it and/or modify | |
4 | # under the same terms as Python, so long as this copyright message and | |
5 | # disclaimer are retained in their original form. | |
6 | # | |
7 | # IN NO EVENT SHALL BIZAR SOFTWARE PTY LTD BE LIABLE TO ANY PARTY FOR | |
8 | # DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING | |
9 | # OUT OF THE USE OF THIS CODE, EVEN IF THE AUTHOR HAS BEEN ADVISED OF THE | |
10 | # POSSIBILITY OF SUCH DAMAGE. | |
11 | # | |
12 | # BIZAR SOFTWARE PTY LTD SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, | |
13 | # BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
14 | # FOR A PARTICULAR PURPOSE. THE CODE PROVIDED HEREUNDER IS ON AN "AS IS" | |
15 | # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, | |
16 | # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. | |
17 | # | |
18 | """This module exports file storage for roundup backends. | |
19 | Files are stored into a directory hierarchy. | |
20 | """ | |
21 | __docformat__ = 'restructuredtext' | |
22 | ||
23 | import os | |
24 | ||
def files_in_dir(dir):
    """Count the regular files found underneath 'dir'.

    'dir' -- The directory to examine.  Subdirectories are descended
    into recursively and their files are included in the total.

    returns -- The number of files found, or 0 if 'dir' does not exist.
    """
    if not os.path.exists(dir):
        return 0

    total = 0
    for entry in os.listdir(dir):
        path = os.path.join(dir, entry)
        if os.path.isdir(path):
            # Descend and add whatever the subtree contains.
            total += files_in_dir(path)
        elif os.path.isfile(path):
            total += 1
    return total
36 | ||
class FileStorage:
    """Store files in some directory structure

    Some databases do not permit the storage of arbitrary data (i.e.,
    file content).  And, some database schema explicitly store file
    content in the filesystem.  In particular, if a class defines a
    'filename' property, it is assumed that the data is stored in the
    indicated file, outside of whatever database Roundup is otherwise
    using.

    In these situations, it is difficult to maintain the transactional
    abstractions used elsewhere in Roundup.  In particular, if a
    file's content is edited, but then the containing transaction is
    not committed, we do not want to commit the edit.  Similarly, we
    would like to guarantee that if a transaction is committed to the
    database, then the edit has in fact taken place.

    This class provides an approximation of these transactional
    requirements.

    For classes that do not have a 'filename' property, the file name
    used to store the file's content is a deterministic function of
    the classname and nodeid for the file.  The 'filename' function
    computes this name.  The name will contain directories and
    subdirectories, but, suppose, for the purposes of what follows,
    that the filename is 'file'.

    This class reads 'self.dir' (the directory under which the 'files'
    hierarchy lives) and 'self.transactions' (a list of
    '(method, args)' pairs), neither of which it sets itself; they
    must be supplied by whatever uses this class.

    Edit Protocol
    -------------

    When a file is created or edited, the following protocol is used:

    1. The new content of the file is placed in 'file.tmp'.

    2. A transaction is recorded in 'self.transactions' referencing the
       'doStoreFile' method of this class.

    3. At some subsequent point, the database 'commit' function is
       called.  This function first performs a traditional database
       commit (for example, by issuing a SQL command to commit the
       current transaction), and, then, runs the transactions recorded
       in 'self.transactions'.

    4. The 'doStoreFile' method renames the 'file.tmp' to 'file'.

    If Step 3 never occurs, but, instead, the database 'rollback'
    method is called, then that method, after rolling back the
    database transaction, calls 'rollbackStoreFile', which removes
    'file.tmp'.

    Race Condition
    --------------

    If two Roundup instances (say, the mail gateway and a web client,
    or two web clients running with a multi-process server) attempt
    edits at the same time, both will write to 'file.tmp', and the
    results will be indeterminate.

    Crash Analysis
    --------------

    There are several situations that may occur if a crash (whether
    because the machine crashes, because an unhandled Python exception
    is raised, or because the Python process is killed) occurs.

    Complexity ensues because backing up an RDBMS is generally more
    complex than simply copying a file.  Instead, some command is run
    which stores a snapshot of the database in a file.  So, if you
    back up the database to a file, and then back up the filesystem,
    it is likely that further database transactions have occurred
    between the point of database backup and the point of filesystem
    backup.

    For the purposes of this analysis, we assume that the filesystem
    backup occurred after the database backup.  Furthermore, we assume
    that filesystem backups are atomic; i.e., that the filesystem is
    not being modified during the backup.

    1. Neither the 'commit' nor 'rollback' methods on the database are
       ever called.

       In this case, the '.tmp' file should be ignored as the
       transaction was not committed.

    2. The 'commit' method is called.  Subsequently, the machine
       crashes, and is restored from backups.

       The most recent filesystem backup and the most recent database
       backup are not in general from the same instant in time.

       This problem means that we can never be sure after a crash if
       the contents of a file are what we intend.  It is always
       possible that an edit was made to the file that is not
       reflected in the filesystem.

    3. A crash occurs between the point of the database commit and the
       call to 'doStoreFile'.

       If only one of 'file' and 'file.tmp' exists, then that
       version should be used.  However, if both 'file' and 'file.tmp'
       exist, there is no way to know which version to use.

    Reading the File
    ----------------

    When determining the content of the file, we use the following
    algorithm:

    1. If 'self.transactions' reflects an edit of the file, then use
       'file.tmp'.

       We know that an edit to the file is in process so 'file.tmp' is
       the right choice.  If 'file.tmp' does not exist, raise an
       exception; something has removed the content of the file while
       we are in the process of editing it.

    2. Otherwise, if 'file.tmp' exists, and 'file' does not, use
       'file.tmp'.  (The 'filename' method does this by completing the
       interrupted commit: it renames 'file.tmp' to 'file'.)

       We know that the file is supposed to exist because there is a
       reference to it in the database.  Since 'file' does not exist,
       we assume that Crash 3 occurred during the initial creation of
       the file.

    3. Otherwise, use 'file'.

       If 'file.tmp' is not present, this is obviously the best we can
       do.  This is always the right answer unless Crash 2 occurred,
       in which case the contents of 'file' may be newer than they
       were at the point of database backup.

       If 'file.tmp' is present, we know that we are not actively
       editing the file.  The possibilities are:

       a. Crash 1 has occurred.  In this case, using 'file' is the
          right answer, so we will have chosen correctly.

       b. Crash 3 has occurred.  In this case, 'file.tmp' is the right
          answer, so we will have chosen incorrectly.  However, 'file'
          was at least a previously committed value.

    Future Improvements
    -------------------

    One approach would be to take advantage of databases which do
    allow the storage of arbitrary data.  For example, MySQL provides
    the LONGBLOB datatype for storing up to 4GB of data.

    Another approach would be to store a version ('v') in the actual
    database and name files 'file.v'.  Then, the editing protocol
    would become:

    1. Generate a new version 'v', guaranteed to be different from all
       other versions ever used by the database.  (The version need
       not be in any particular sequence; a UUID would be fine.)

    2. Store the content in 'file.v'.

    3. Update the database to indicate that the version of the node is
       'v'.

    Now, if the transaction is committed, the database will refer to
    'file.v', where the content exists.  If the transaction is rolled
    back, or not committed, 'file.v' will never be referenced.  In the
    event of a crash, under the assumptions above, there may be
    'file.v' files that are not referenced by the database, but the
    database will be consistent, so long as unreferenced 'file.v'
    files are never removed until after the database has been backed
    up.
    """

    tempext = '.tmp'
    """The suffix added to files indicating that they are uncommitted."""

    def __init__(self, umask):
        """Remember the umask to apply before content files are written.

        'umask' -- The value passed to 'os.umask' by 'storefile'.
        """
        self.umask = umask

    def subdirFilename(self, classname, nodeid, property=None):
        """Determine what the filename and subdir for nodeid + classname is.

        'classname' -- The name of the class the node belongs to.

        'nodeid' -- The id of the node; must be convertible with 'int'.

        'property' -- Optional property name, appended to the file name
        after a '.' when given.

        returns -- A relative path of the form 'subdir/name'.
        """
        if property:
            name = '%s%s.%s'%(classname, nodeid, property)
        else:
            # roundupdb.FileClass never specified the property name, so don't
            # include it
            name = '%s%s'%(classname, nodeid)

        # have a separate subdir for every thousand messages; floor
        # division keeps the bucket an integer ('1', not '1.234') on
        # interpreters where '/' is true division
        subdir = str(int(nodeid) // 1000)
        return os.path.join(subdir, name)

    def _tempfile(self, filename):
        """Return a temporary filename.

        'filename' -- The name of the eventual destination file."""

        return filename + self.tempext

    def _editInProgress(self, classname, nodeid, property):
        """Return true if the file indicated is being edited.

        returns -- True if the current transaction includes an edit to
        the file indicated."""

        for method, args in self.transactions:
            if (method == self.doStoreFile and
                    args == (classname, nodeid, property)):
                return True

        return False

    def filename(self, classname, nodeid, property=None, create=0):
        """Determine what the filename for the given node and optionally
        property is.

        Try a variety of different filenames - the file could be in the
        usual place, or it could be in a temp file pre-commit *or* it
        could be in an old-style, backwards-compatible flat directory.

        'create' -- If true, return the post-commit filename without
        checking whether anything exists on disk.

        returns -- The path of the file holding the content.

        raises -- IOError if no content file can be found.
        """
        filename = os.path.join(self.dir, 'files', classname,
            self.subdirFilename(classname, nodeid, property))
        # If the caller is going to create the file, return the
        # post-commit filename.  It is the caller's responsibility to
        # add self.tempext when actually creating the file.
        if create:
            return filename

        tempfile = self._tempfile(filename)

        # If an edit to this file is in progress, then return the name
        # of the temporary file containing the edited content.
        if self._editInProgress(classname, nodeid, property):
            if not os.path.exists(tempfile):
                raise IOError('content file for %s not found'%tempfile)
            return tempfile

        if os.path.exists(filename):
            return filename

        # Otherwise, if the temporary file exists, then the probable
        # explanation is that a crash occurred between the point that
        # the database entry recording the creation of the file
        # occurred and the point at which the file was renamed from the
        # temporary name to the final name.
        if os.path.exists(tempfile):
            try:
                # Clean up, by performing the commit now.
                os.rename(tempfile, filename)
            except OSError:
                # If two Roundup clients both try to rename the file
                # at the same time, only one of them will succeed.
                # So, tolerate such an error -- but no other.
                pass
            if not os.path.exists(filename):
                raise IOError('content file for %s not found'%filename)
            return filename

        # ok, try flat (very old-style)
        if property:
            filename = os.path.join(self.dir, 'files', '%s%s.%s'%(classname,
                nodeid, property))
        else:
            filename = os.path.join(self.dir, 'files', '%s%s'%(classname,
                nodeid))
        if os.path.exists(filename):
            return filename

        # file just ain't there
        raise IOError('content file for %s not found'%filename)

    def filesize(self, classname, nodeid, property=None, create=0):
        """Return the size in bytes of the file located by 'filename'.

        The arguments are passed straight through to 'filename'.
        """
        filename = self.filename(classname, nodeid, property, create)
        return os.path.getsize(filename)

    def storefile(self, classname, nodeid, property, content):
        """Store the content of the file in the database. The property may be
        None, in which case the filename does not indicate which property
        is being saved.

        The content is written to the temporary ('.tmp') name and a
        'doStoreFile' entry is appended to 'self.transactions' so that
        the rename to the final name can be performed at commit time.
        """
        # determine the name of the file to write to
        name = self.filename(classname, nodeid, property, create=1)

        # make sure the file storage dir exists
        if not os.path.exists(os.path.dirname(name)):
            os.makedirs(os.path.dirname(name))

        # save to a temp file
        name = self._tempfile(name)

        # make sure we don't register the rename action more than once
        if not self._editInProgress(classname, nodeid, property):
            # save off the rename action
            self.transactions.append((self.doStoreFile, (classname, nodeid,
                property)))
        # always set umask before writing to make sure we have the proper one
        # in multi-tracker (i.e. multi-umask) or modpython scenarios
        # the umask may have changed since last we set it.
        os.umask(self.umask)
        f = open(name, 'wb')
        try:
            f.write(content)
        finally:
            # close explicitly so the data is flushed and the handle is
            # released even if the write fails
            f.close()

    def getfile(self, classname, nodeid, property):
        """Get the content of the file in the database.

        returns -- The content of the file located by 'filename', read
        in binary mode.
        """
        filename = self.filename(classname, nodeid, property)

        f = open(filename, 'rb')
        try:
            # snarf the contents and make sure we close the file
            return f.read()
        finally:
            f.close()

    def numfiles(self):
        """Get number of files in storage, even across subdirectories.
        """
        files_dir = os.path.join(self.dir, 'files')
        return files_in_dir(files_dir)

    def doStoreFile(self, classname, nodeid, property, **databases):
        """Store the file as part of a transaction commit.

        Moves the temporary file written by 'storefile' to its final
        name, replacing any existing file of that name.

        returns -- The tuple '(classname, nodeid)'.
        """
        # determine the name of the file to write to
        name = self.filename(classname, nodeid, property, 1)

        # the file is currently ".tmp" - move it to its real name to commit
        if name.endswith(self.tempext):
            # creation
            dstname = os.path.splitext(name)[0]
        else:
            # edit operation
            dstname = name
            name = self._tempfile(name)

        # content is being updated (and some platforms, eg. win32, won't
        # let us rename over the top of the old file)
        if os.path.exists(dstname):
            os.remove(dstname)

        os.rename(name, dstname)

        # return the classname, nodeid so we reindex this content
        return (classname, nodeid)

    def rollbackStoreFile(self, classname, nodeid, property, **databases):
        """Remove the temp file as a part of a rollback
        """
        # determine the name of the file to delete
        name = self.filename(classname, nodeid, property)
        if not name.endswith(self.tempext):
            name += self.tempext
        os.remove(name)

    def isStoreFile(self, classname, nodeid):
        """See if there is actually any FileStorage for this node.
        Is there a better way than using self.filename?

        returns -- True if 'filename' locates a file for the node, False
        if it raises IOError.
        """
        try:
            self.filename(classname, nodeid)
            return True
        except IOError:
            return False

    def destroy(self, classname, nodeid):
        """If there is actually FileStorage for this node
        remove it from the filesystem
        """
        if self.isStoreFile(classname, nodeid):
            os.remove(self.filename(classname, nodeid))
406 | # vim: set filetype=python ts=4 sw=4 et si |