1 #$Id: indexer_rdbms.py,v 1.18 2008-09-01 00:43:02 richard Exp $
2 """ This implements the full-text indexer over two RDBMS tables. The first
3 is a mapping of words to occurance IDs. The second maps the IDs to (Class,
4 propname, itemid) instances.
7 # Python 2.3 ... 2.6 compatibility:
8 from roundup
.anypy
.sets_
import set
10 from roundup
.backends
.indexer_common
import Indexer
as IndexerBase
12 class Indexer(IndexerBase
):
13 def __init__(self
, db
):
14 IndexerBase
.__init__(self
, db
)
19 """close the indexing database"""
20 # just nuke the circular reference
24 """Save the changes to the index."""
25 # not necessary - the RDBMS connection will handle this for us
28 def force_reindex(self
):
29 """Force a reindexing of the database. This essentially
30 empties the tables ids and index and sets a flag so
31 that the databases are reindexed"""
34 def should_reindex(self
):
35 """returns True if the indexes need to be rebuilt"""
38 def add_text(self
, identifier
, text
, mime_type
='text/plain'):
39 """ "identifier" is (classname, itemid, property) """
40 if mime_type
!= 'text/plain':
43 # Ensure all elements of the identifier are strings 'cos the itemid
44 # column is varchar even if item ids may be numbers elsewhere in the
46 identifier
= tuple(map(str, identifier
))
48 # first, find the id of the (classname, itemid, property)
50 sql
= 'select _textid from __textids where _class=%s and '\
51 '_itemid=%s and _prop=%s'%(a
, a
, a
)
52 self
.db
.cursor
.execute(sql
, identifier
)
53 r
= self
.db
.cursor
.fetchone()
55 # not previously indexed
56 id = self
.db
.newid('__textids')
57 sql
= 'insert into __textids (_textid, _class, _itemid, _prop)'\
58 ' values (%s, %s, %s, %s)'%(a
, a
, a
, a
)
59 self
.db
.cursor
.execute(sql
, (id, ) + identifier
)
62 # clear out any existing indexed values
63 sql
= 'delete from __words where _textid=%s'%a
64 self
.db
.cursor
.execute(sql
, (id, ))
66 # ok, find all the unique words in the text
67 if not isinstance(text
, unicode):
68 text
= unicode(text
, "utf-8", "replace")
70 wordlist
= [w
.encode("utf-8")
71 for w
in re
.findall(r
'(?u)\b\w{%d,%d}\b'
72 % (self
.minlength
, self
.maxlength
), text
)]
75 if self
.is_stopword(word
): continue
78 # for each word, add an entry in the db
79 sql
= 'insert into __words (_word, _textid) values (%s, %s)'%(a
, a
)
80 words
= [(word
, id) for word
in words
]
81 self
.db
.cursor
.executemany(sql
, words
)
83 def find(self
, wordlist
):
84 """look up all the words in the wordlist.
85 If none are found return an empty dictionary
91 l
= [word
.upper() for word
in wordlist
92 if self
.minlength
<= len(word
) <= self
.maxlength
]
93 l
= [word
for word
in l
if not self
.is_stopword(word
)]
98 if self
.db
.implements_intersect
:
100 sql
= 'select distinct(_textid) from __words where _word=%s'%self
.db
.arg
101 sql
= '\nINTERSECT\n'.join([sql
]*len(l
))
102 self
.db
.cursor
.execute(sql
, tuple(l
))
103 r
= self
.db
.cursor
.fetchall()
106 a
= ','.join([self
.db
.arg
] * len(r
))
107 sql
= 'select _class, _itemid, _prop from __textids '\
108 'where _textid in (%s)'%a
109 self
.db
.cursor
.execute(sql
, tuple([int(row
[0]) for row
in r
]))
112 # A more complex version for MySQL since it doesn't implement INTERSECT
114 # Construct SQL statement to join __words table to itself
116 sql
= """select distinct(__words1._textid)
117 from __words as __words1 %s
118 where __words1._word=%s %s"""
120 join_tmpl
= ' left join __words as __words%d using (_textid) \n'
121 match_tmpl
= ' and __words%d._word=%s \n'
125 for n
in xrange(len(l
) - 1):
126 join_list
.append(join_tmpl
% (n
+ 2))
127 match_list
.append(match_tmpl
% (n
+ 2, self
.db
.arg
))
129 sql
= sql
%(' '.join(join_list
), self
.db
.arg
, ' '.join(match_list
))
130 self
.db
.cursor
.execute(sql
, l
)
132 r
= [x
[0] for x
in self
.db
.cursor
.fetchall()]
136 a
= ','.join([self
.db
.arg
] * len(r
))
137 sql
= 'select _class, _itemid, _prop from __textids '\
138 'where _textid in (%s)'%a
140 self
.db
.cursor
.execute(sql
, tuple(map(int, r
)))
142 return self
.db
.cursor
.fetchall()