# This module is derived from the module described at:
#   http://gnosis.cx/publish/programming/charming_python_15.txt
#
# Author: David Mertz (mertz@gnosis.cx)
# Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
#            Gregory Popovitch (greg@gpy.com)
#
# The original module was released under this license, and remains under it:
#
#     This file is released to the public domain.  I (dqm) would
#     appreciate it if you choose to keep derived works under terms
#     that promote freedom, but obviously am giving up any rights
#     to compel such.
#
#$Id: indexer_dbm.py,v 1.9 2006-04-27 05:48:26 richard Exp $
'''This module provides an indexer class, Indexer, that stores text
indices in a roundup instance. This class makes searching the content of
messages, string properties and text files possible.
'''
__docformat__ = 'restructuredtext'

import os, shutil, re, mimetypes, marshal, zlib, errno

from roundup.hyperdb import Link, Multilink
from roundup.backends.indexer_common import Indexer as IndexerBase
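
# A minimal usage sketch (hypothetical values; `db` is assumed to be an open
# roundup hyperdb database whose config provides DATABASE):
#
#   indexer = Indexer(db)
#   indexer.add_text(('msg', '1', 'content'), 'fix the login bug')
#   indexer.save_index()
#   matches = indexer.find(['login', 'bug'])  # identifiers matching ALL words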

class Indexer(IndexerBase):
    '''Indexes information from roundup's hyperdb to allow efficient
    searching.

    Three structures are created by the indexer::

        files   {identifier: (fileid, wordcount)}
        words   {word: {fileid: count}}
        fileids {fileid: identifier}

    where identifier is (classname, nodeid, propertyname)
    '''
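
    # Illustration only (hypothetical data): after indexing one two-word
    # message, the three structures could look like:
    #   files   = {'_TOP': (-1, None), ('msg', '1', 'content'): (1, 2)}
    #   words   = {'LOGIN': {1: 1}, 'BUG': {1: 1}}
    #   fileids = {1: ('msg', '1', 'content')}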

    def __init__(self, db):
        IndexerBase.__init__(self, db)
        self.indexdb_path = os.path.join(db.config.DATABASE, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        self.changed = 0

        # see if we need to reindex because of a change in code
        version = os.path.join(self.indexdb_path, 'version')
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(version)):
            # for now the file itself is a flag
            self.force_reindex()
        elif os.path.exists(version):
            version = open(version).read()
            # check the value and reindex if it's not the latest
            if version.strip() != '1':
                self.force_reindex()

    def force_reindex(self):
        '''Force a reindex condition
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0775)
        # the version file doubles as the "index exists" flag checked above
        open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Return whether a reindex is required.
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Add some text associated with the (classname, nodeid, property)
        identifier.
        '''
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if identifier in self.files:
            self.purge_entry(identifier)

        # split the text into indexable words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
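        # ('_TOP' stores a negative counter; its abs() value is handed out
        # as the next fileid, so the counter key can never collide with a
        # real positive fileid)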
        self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # find the unique words and count their occurrences
        filedict = {}
        for word in words:
            if self.is_stopword(word):
                continue
            if word in filedict:
                filedict[word] = filedict[word]+1
            else:
                filedict[word] = 1

        # now add to the totals
        for word in filedict:
            # each word has a dict of {identifier: count}
            if word in self.words:
                entry = self.words[word]
            else:
                entry = {}
                self.words[word] = entry

            # make a reference to the file for this word
            entry[file_index] = filedict[word]

        # the in-memory index now differs from the saved one
        self.changed = 1

    def splitter(self, text, ftype):
        '''Split the contents of a text string into a list of 'words'
        '''
        if ftype == 'text/plain':
            words = self.text_splitter(text)
        else:
            return []
        return words

    def text_splitter(self, text):
        """Split text/plain string into a list of words
        """
        # case insensitive
        text = str(text).upper()
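
        # e.g. with minlength=2 and maxlength=25 (the limits come from
        # IndexerBase configuration; these particular values are only an
        # assumption for illustration) the pattern is r'\b\w{2,25}\b', so
        # 'Fix the login bug' yields ['FIX', 'THE', 'LOGIN', 'BUG']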
        return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
                          text)

    # we override this to ignore too-short and too-long words
    # and also to fix a bug - the (fail) case.
    def find(self, wordlist):
        '''Locate files that match ALL the words in wordlist
        '''
        if not hasattr(self, 'words'):
            self.load_index()
        self.load_index(wordlist=wordlist)
        entries = {}
        hits = None
        for word in wordlist:
            if not self.minlength <= len(word) <= self.maxlength:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            if self.is_stopword(word):
                continue
            entry = self.words.get(word)    # For each word, get index
            entries[word] = entry           #   of matching files
            if not entry:                   # Nothing for this one word (fail)
                return {}
            if hits is None:
                hits = {}
                for k in entry:
                    if k not in self.fileids:
                        raise ValueError('Index is corrupted: re-generate it')
                    hits[k] = self.fileids[k]
            else:
                # Eliminate hits for every non-match
                for fileid in list(hits):
                    if fileid not in entry:
                        del hits[fileid]
        if hits is None:
            return {}
        return list(hits.values())

    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
    def load_index(self, reload=0, wordlist=None):
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-', '#']
            for word in wordlist:
                segments.append(word[0].upper())

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError, error:
                # probably just a nonexistent segment index file
                if error.errno != errno.ENOENT: raise
            else:
                pickle_str = zlib.decompress(f.read())
                f.close()
                dbslice = marshal.loads(pickle_str)
                if dbslice.get('WORDS'):
                    # if it has some words, add them
                    for word, entry in dbslice['WORDS'].iteritems():
                        db['WORDS'][word] = entry
                if dbslice.get('FILES'):
                    # if it has some files, add them
                    db['FILES'] = dbslice['FILES']
                if dbslice.get('FILEIDS'):
                    # if it has fileids, add them
                    db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        # only save if the index is loaded and changed
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError, error:
                # probably just a nonexistent segment index file
                if error.errno != errno.ENOENT: raise

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
        open(self.indexdb+'-', 'wb').write(zlib.compress(marshal.dumps(dbfil)))

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.iteritems():  # Split into segment dicts
            initchar = word[0].upper()
            segdicts[initchar][word] = entry

        # now write each segment to its own compressed file
        for initchar in letters:
            db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            pickle_fh = open(filename, 'wb')
            pickle_fh.write(zlib.compress(pickle_str))
            os.chmod(filename, 0664)

        self.changed = 0

    def purge_entry(self, identifier):
        '''Remove a file from file index and word index
        '''
        self.load_index()

        if identifier not in self.files:
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index
        for key, occurs in self.words.iteritems():
            if file_index in occurs:
                del occurs[file_index]

        self.changed = 1

    def index_loaded(self):
        return (hasattr(self,'fileids') and hasattr(self,'files') and
            hasattr(self,'words'))

    def rollback(self):
        ''' load last saved index info. '''
        self.load_index(reload=1)

# vim: set filetype=python ts=4 sw=4 et si