#
# This module is derived from the module described at:
#   http://gnosis.cx/publish/programming/charming_python_15.txt
#
# Author: David Mertz (mertz@gnosis.cx)
# Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
#            Gregory Popovitch (greg@gpy.com)
#
# The original module was released under this license, and remains under
# it:
#
#   This file is released to the public domain. I (dqm) would
#   appreciate it if you choose to keep derived works under terms
#   that promote freedom, but obviously am giving up any rights
#   to compel such.
#
#$Id: indexer_dbm.py,v 1.9 2006-04-27 05:48:26 richard Exp $
'''This module provides an indexer class, Indexer, that stores text
indices in a roundup instance. This class makes searching the content of
messages, string properties and text files possible.
'''
__docformat__ = 'restructuredtext'

import os, shutil, re, mimetypes, marshal, zlib, errno
from roundup.hyperdb import Link, Multilink
from roundup.backends.indexer_common import Indexer as IndexerBase

class Indexer(IndexerBase):
    '''Indexes information from roundup's hyperdb to allow efficient
    searching.

    Three structures are created by the indexer::

        files   {identifier: (fileid, wordcount)}
        words   {word: {fileid: count}}
        fileids {fileid: identifier}

    where identifier is (classname, nodeid, propertyname)
    '''
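    # An illustrative snapshot of the three structures (hypothetical data,
    # not part of the original module): after indexing the text
    # 'Hello world' for message 1's content property, they might hold:
    #
    #   files   = {'_TOP': (-1, None), ('msg', '1', 'content'): (1, 2)}
    #   words   = {'HELLO': {1: 1}, 'WORLD': {1: 1}}
    #   fileids = {1: ('msg', '1', 'content')}
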
    def __init__(self, db):
        IndexerBase.__init__(self, db)
        self.indexdb_path = os.path.join(db.config.DATABASE, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        self.quiet = 9
        self.changed = 0

        # see if we need to reindex because of a change in code
        version = os.path.join(self.indexdb_path, 'version')
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(version)):
            # for now the file itself is a flag
            self.force_reindex()
        elif os.path.exists(version):
            version = open(version).read()
            # check the value and reindex if it's not the latest
            if version.strip() != '1':
                self.force_reindex()

    def force_reindex(self):
        '''Force a reindex condition
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0o775)
        open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Should we reindex?
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Add some text associated with the (classname, nodeid, property)
        identifier.
        '''
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if identifier in self.files:
            self.purge_entry(identifier)

        # split into words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0] - 1, None)
        file_index = abs(self.files['_TOP'][0])
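        # (Illustrative walk-through, not in the original code: _TOP starts
        # at (0, None); three successive add_text calls store -1, -2, -3,
        # so abs() hands out file indices 1, 2, 3. Keeping the counter
        # negative means the '_TOP' bookkeeping entry can never be mistaken
        # for a real, positive file index.)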
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # find the unique words
        filedict = {}
        for word in words:
            if self.is_stopword(word):
                continue
            if word in filedict:
                filedict[word] = filedict[word] + 1
            else:
                filedict[word] = 1

        # now add to the totals
        for word in filedict:
            # each word has a dict of {fileid: count}
            if word in self.words:
                entry = self.words[word]
            else:
                # new word
                entry = {}
                self.words[word] = entry

            # make a reference to the file for this word
            entry[file_index] = filedict[word]

        # save needed
        self.changed = 1

    def splitter(self, text, ftype):
        '''Split the contents of a text string into a list of 'words'
        '''
        if ftype == 'text/plain':
            words = self.text_splitter(text)
        else:
            return []
        return words

    def text_splitter(self, text):
        """Split text/plain string into a list of words
        """
        # case insensitive
        text = str(text).upper()

        # Split the raw text
        return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
                          text)

    # we override this to ignore too short and too long words
    # and also to fix a bug - the (fail) case.
    def find(self, wordlist):
        '''Locate files that match ALL the words in wordlist.

        Returns a list of (classname, nodeid, propertyname) identifiers,
        or an empty list if any word has no matches.
        '''
        if not hasattr(self, 'words'):
            self.load_index()
        self.load_index(wordlist=wordlist)
        entries = {}
        hits = None
        for word in wordlist:
            if not self.minlength <= len(word) <= self.maxlength:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            if self.is_stopword(word):
                continue
            entry = self.words.get(word)    # For each word, get index
            entries[word] = entry           #   of matching files
            if not entry:                   # Nothing for this one word (fail)
                return []
            if hits is None:
                hits = {}
                for k in entry:
                    if k not in self.fileids:
                        raise ValueError('Index is corrupted: re-generate it')
                    hits[k] = self.fileids[k]
            else:
                # Eliminate hits for every non-match
                for fileid in list(hits):
                    if fileid not in entry:
                        del hits[fileid]
        if hits is None:
            return []
        return list(hits.values())
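
    # Example (hypothetical data, continuing the snapshot above): with
    # 'HELLO' and 'WORLD' both indexed for fileid 1,
    #   indexer.find(['hello', 'world']) -> [('msg', '1', 'content')]
    # while any word that matches nothing makes the whole query return [].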

    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
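    # On disk each segment is a separate zlib-compressed marshal file named
    # by appending the segment character to self.indexdb: words beginning
    # with 'A' live in .../indexes/index.dbA, while the special '-' segment
    # holds the files/fileids tables (see save_index below).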
    def load_index(self, reload=0, wordlist=None):
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP': (0, None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-', '#']
            for word in wordlist:
                segments.append(word[0].upper())

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError as error:
                # probably just a nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise
            else:
                pickle_str = zlib.decompress(f.read())
                f.close()
                dbslice = marshal.loads(pickle_str)
                if dbslice.get('WORDS'):
                    # if it has some words, add them
                    for word, entry in dbslice['WORDS'].items():
                        db['WORDS'][word] = entry
                if dbslice.get('FILES'):
                    # if it has some files, add them
                    db['FILES'] = dbslice['FILES']
                if dbslice.get('FILEIDS'):
                    # if it has fileids, add them
                    db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        # only save if the index is loaded and changed
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError as error:
                # probably just a nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS': None, 'FILES': self.files, 'FILEIDS': self.fileids}
        open(self.indexdb + '-', 'wb').write(zlib.compress(marshal.dumps(dbfil)))

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[0].upper()
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS': segdicts[initchar], 'FILES': None, 'FILEIDS': None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            pickle_fh = open(filename, 'wb')
            pickle_fh.write(zlib.compress(pickle_str))
            pickle_fh.close()
            os.chmod(filename, 0o664)

        # save done
        self.changed = 0

    def purge_entry(self, identifier):
        '''Remove a file from file index and word index
        '''
        self.load_index()

        if identifier not in self.files:
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index
        for key, occurs in self.words.items():
            if file_index in occurs:
                del occurs[file_index]

        # save needed
        self.changed = 1

    def index_loaded(self):
        return (hasattr(self, 'fileids') and hasattr(self, 'files') and
                hasattr(self, 'words'))

    def rollback(self):
        '''Load last saved index info.'''
        self.load_index(reload=1)

    def close(self):
        pass


# vim: set filetype=python ts=4 sw=4 et si