Commit | Line | Data |
---|---|---|
c638d827 CR |
1 | #$Id: indexer_rdbms.py,v 1.18 2008-09-01 00:43:02 richard Exp $ |
2 | """ This implements the full-text indexer over two RDBMS tables. The first | |
3 | is a mapping of words to occurance IDs. The second maps the IDs to (Class, | |
4 | propname, itemid) instances. | |
5 | """ | |
6 | import re | |
7 | # Python 2.3 ... 2.6 compatibility: | |
8 | from roundup.anypy.sets_ import set | |
9 | ||
10 | from roundup.backends.indexer_common import Indexer as IndexerBase | |
11 | ||
12 | class Indexer(IndexerBase): | |
13 | def __init__(self, db): | |
14 | IndexerBase.__init__(self, db) | |
15 | self.db = db | |
16 | self.reindex = 0 | |
17 | ||
18 | def close(self): | |
19 | """close the indexing database""" | |
20 | # just nuke the circular reference | |
21 | self.db = None | |
22 | ||
23 | def save_index(self): | |
24 | """Save the changes to the index.""" | |
25 | # not necessary - the RDBMS connection will handle this for us | |
26 | pass | |
27 | ||
28 | def force_reindex(self): | |
29 | """Force a reindexing of the database. This essentially | |
30 | empties the tables ids and index and sets a flag so | |
31 | that the databases are reindexed""" | |
32 | self.reindex = 1 | |
33 | ||
34 | def should_reindex(self): | |
35 | """returns True if the indexes need to be rebuilt""" | |
36 | return self.reindex | |
37 | ||
38 | def add_text(self, identifier, text, mime_type='text/plain'): | |
39 | """ "identifier" is (classname, itemid, property) """ | |
40 | if mime_type != 'text/plain': | |
41 | return | |
42 | ||
43 | # Ensure all elements of the identifier are strings 'cos the itemid | |
44 | # column is varchar even if item ids may be numbers elsewhere in the | |
45 | # code. ugh. | |
46 | identifier = tuple(map(str, identifier)) | |
47 | ||
48 | # first, find the id of the (classname, itemid, property) | |
49 | a = self.db.arg | |
50 | sql = 'select _textid from __textids where _class=%s and '\ | |
51 | '_itemid=%s and _prop=%s'%(a, a, a) | |
52 | self.db.cursor.execute(sql, identifier) | |
53 | r = self.db.cursor.fetchone() | |
54 | if not r: | |
55 | # not previously indexed | |
56 | id = self.db.newid('__textids') | |
57 | sql = 'insert into __textids (_textid, _class, _itemid, _prop)'\ | |
58 | ' values (%s, %s, %s, %s)'%(a, a, a, a) | |
59 | self.db.cursor.execute(sql, (id, ) + identifier) | |
60 | else: | |
61 | id = int(r[0]) | |
62 | # clear out any existing indexed values | |
63 | sql = 'delete from __words where _textid=%s'%a | |
64 | self.db.cursor.execute(sql, (id, )) | |
65 | ||
66 | # ok, find all the unique words in the text | |
67 | if not isinstance(text, unicode): | |
68 | text = unicode(text, "utf-8", "replace") | |
69 | text = text.upper() | |
70 | wordlist = [w.encode("utf-8") | |
71 | for w in re.findall(r'(?u)\b\w{%d,%d}\b' | |
72 | % (self.minlength, self.maxlength), text)] | |
73 | words = set() | |
74 | for word in wordlist: | |
75 | if self.is_stopword(word): continue | |
76 | words.add(word) | |
77 | ||
78 | # for each word, add an entry in the db | |
79 | sql = 'insert into __words (_word, _textid) values (%s, %s)'%(a, a) | |
80 | words = [(word, id) for word in words] | |
81 | self.db.cursor.executemany(sql, words) | |
82 | ||
83 | def find(self, wordlist): | |
84 | """look up all the words in the wordlist. | |
85 | If none are found return an empty dictionary | |
86 | * more rules here | |
87 | """ | |
88 | if not wordlist: | |
89 | return [] | |
90 | ||
91 | l = [word.upper() for word in wordlist | |
92 | if self.minlength <= len(word) <= self.maxlength] | |
93 | l = [word for word in l if not self.is_stopword(word)] | |
94 | ||
95 | if not l: | |
96 | return [] | |
97 | ||
98 | if self.db.implements_intersect: | |
99 | # simple AND search | |
100 | sql = 'select distinct(_textid) from __words where _word=%s'%self.db.arg | |
101 | sql = '\nINTERSECT\n'.join([sql]*len(l)) | |
102 | self.db.cursor.execute(sql, tuple(l)) | |
103 | r = self.db.cursor.fetchall() | |
104 | if not r: | |
105 | return [] | |
106 | a = ','.join([self.db.arg] * len(r)) | |
107 | sql = 'select _class, _itemid, _prop from __textids '\ | |
108 | 'where _textid in (%s)'%a | |
109 | self.db.cursor.execute(sql, tuple([int(row[0]) for row in r])) | |
110 | ||
111 | else: | |
112 | # A more complex version for MySQL since it doesn't implement INTERSECT | |
113 | ||
114 | # Construct SQL statement to join __words table to itself | |
115 | # multiple times. | |
116 | sql = """select distinct(__words1._textid) | |
117 | from __words as __words1 %s | |
118 | where __words1._word=%s %s""" | |
119 | ||
120 | join_tmpl = ' left join __words as __words%d using (_textid) \n' | |
121 | match_tmpl = ' and __words%d._word=%s \n' | |
122 | ||
123 | join_list = [] | |
124 | match_list = [] | |
125 | for n in xrange(len(l) - 1): | |
126 | join_list.append(join_tmpl % (n + 2)) | |
127 | match_list.append(match_tmpl % (n + 2, self.db.arg)) | |
128 | ||
129 | sql = sql%(' '.join(join_list), self.db.arg, ' '.join(match_list)) | |
130 | self.db.cursor.execute(sql, l) | |
131 | ||
132 | r = [x[0] for x in self.db.cursor.fetchall()] | |
133 | if not r: | |
134 | return [] | |
135 | ||
136 | a = ','.join([self.db.arg] * len(r)) | |
137 | sql = 'select _class, _itemid, _prop from __textids '\ | |
138 | 'where _textid in (%s)'%a | |
139 | ||
140 | self.db.cursor.execute(sql, tuple(map(int, r))) | |
141 | ||
142 | return self.db.cursor.fetchall() | |
143 |