Commit | Line | Data |
---|---|---|
c638d827 CR |
1 | #$Id: indexer_xapian.py,v 1.6 2007-10-25 07:02:42 richard Exp $ |
2 | ''' This implements the full-text indexer using the Xapian indexer. | |
3 | ''' | |
4 | import re, os | |
5 | ||
6 | import xapian | |
7 | ||
8 | from roundup.backends.indexer_common import Indexer as IndexerBase | |
9 | ||
10 | # TODO: we need to delete documents when a property is *reindexed* | |
11 | ||
class Indexer(IndexerBase):
    '''Full-text indexer that keeps its postings in a Xapian database.

    The index is stored in the "text-index" directory below the path
    given by db.config.DATABASE.  Each call to add_text() stores one
    Xapian document whose data is the string "classname:itemid:property";
    find() turns that string back into a tuple.
    '''

    def __init__(self, db):
        '''Record where the index lives and reset the state flags.

        "db" must provide "config.DATABASE", the directory under which
        the "text-index" database is stored.
        '''
        IndexerBase.__init__(self, db)
        self.db_path = db.config.DATABASE
        # set by force_reindex() and reported by should_reindex()
        self.reindex = 0
        # nothing in this class currently sets this to True: the code
        # that would do so in add_text() is commented out
        self.transaction_active = False

    def _get_database(self):
        '''Open the writable Xapian database, creating it if needed.

        A new database handle is opened on every call.
        '''
        index = os.path.join(self.db_path, 'text-index')
        return xapian.WritableDatabase(index, xapian.DB_CREATE_OR_OPEN)

    def save_index(self):
        '''Save the changes to the index.

        Does nothing unless a transaction is flagged as active.
        '''
        if not self.transaction_active:
            return
        # XXX: Xapian databases don't actually implement transactions yet
        database = self._get_database()
        database.commit_transaction()
        self.transaction_active = False

    def close(self):
        '''Close the indexing database (nothing to do here).'''
        pass

    def rollback(self):
        '''Discard the changes made since the transaction was started.

        Does nothing unless a transaction is flagged as active.
        '''
        if not self.transaction_active:
            return
        # XXX: Xapian databases don't actually implement transactions yet
        database = self._get_database()
        database.cancel_transaction()
        self.transaction_active = False

    def force_reindex(self):
        '''Flag the index as needing to be rebuilt.

        Only the flag is set here; no existing index data is removed.
        '''
        self.reindex = 1

    def should_reindex(self):
        '''Return a true value if the indexes need to be rebuilt.'''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Index "text" under "identifier".

        "identifier" is (classname, itemid, property).  Content whose
        mime_type is not 'text/plain' is ignored.  If a document for the
        identifier already exists it is replaced, otherwise a new
        document is added.
        '''
        if mime_type != 'text/plain':
            return
        if not text:
            text = ''

        # open the database and start a transaction if needed
        database = self._get_database()
        # XXX: Xapian databases don't actually implement transactions yet
        #if not self.transaction_active:
            #database.begin_transaction()
            #self.transaction_active = True

        # TODO: allow configuration of other languages
        stemmer = xapian.Stem("english")

        # We use the identifier twice: once in the actual "text" being
        # indexed so we can search on it, and again as the "data" being
        # indexed so we know what we're matching when we get results
        identifier = '%s:%s:%s' % identifier

        # see if the id is in the database; only the first match is used
        enquire = xapian.Enquire(database)
        query = xapian.Query(xapian.Query.OP_AND, [identifier])
        enquire.set_query(query)
        matches = enquire.get_mset(0, 10)
        if matches.size():
            docid = matches.begin().get_docid()
        else:
            docid = None

        # create the new document
        doc = xapian.Document()
        doc.set_data(identifier)
        doc.add_posting(identifier, 0)

        # NOTE(review): words are upper-cased before being stemmed; the
        # stemmer may expect lower-case input, in which case stemming has
        # no effect here -- confirm.  find() builds its terms the same
        # way, so the two must be changed together (and the index
        # rebuilt) if this is ever altered.
        word_re = r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength)
        for match in re.finditer(word_re, text.upper()):
            word = match.group(0)
            if self.is_stopword(word):
                continue
            # the character offset of the word is used as its position
            doc.add_posting(stemmer(word), match.start(0))

        if docid:
            database.replace_document(docid, doc)
        else:
            database.add_document(doc)

    def find(self, wordlist):
        '''Look up all the words in the wordlist.

        Words shorter than minlength or longer than maxlength, and
        stopwords, are ignored.  The remaining words are upper-cased,
        stemmed and combined with AND.

        Returns an empty dictionary if "wordlist" is empty; otherwise a
        list with one tuple per matching document, made by splitting the
        document's "classname:itemid:property" data on ':'.
        '''
        if not wordlist:
            return {}

        database = self._get_database()

        stemmer = xapian.Stem("english")
        terms = []
        for word in wordlist:
            if not (self.minlength <= len(word) <= self.maxlength):
                continue
            term = word.upper()
            if not self.is_stopword(term):
                terms.append(stemmer(term))
        query = xapian.Query(xapian.Query.OP_AND, terms)

        enquire = xapian.Enquire(database)
        enquire.set_query(query)
        # Ask for as many results as there are documents so that no hit
        # is dropped; a fixed limit of 10 silently truncated searches.
        matches = enquire.get_mset(0, database.get_doccount())

        return [tuple(m[xapian.MSET_DOCUMENT].get_data().split(':'))
                for m in matches]
128 |