1 #$Id: indexer_xapian.py,v 1.6 2007-10-25 07:02:42 richard Exp $
2 ''' This implements the full-text indexer using the Xapian indexer.
8 from roundup
.backends
.indexer_common
import Indexer
as IndexerBase
10 # TODO: we need to delete documents when a property is *reindexed*
12 class Indexer(IndexerBase
):
def __init__(self, db):
    '''Prepare the indexer for the database object "db".

    The base indexer is initialised first.  The path found in
    db.config.DATABASE is then remembered (the Xapian index is opened
    relative to it by _get_database()) and the indexer is marked as
    having no transaction in progress.
    '''
    IndexerBase.__init__(self, db)

    # parent location of the on-disk Xapian index
    config = db.config
    self.db_path = config.DATABASE

    # nothing has been started yet, so there is nothing to commit or cancel
    self.transaction_active = False
def _get_database(self):
    '''Open the Xapian index and return it as a writable database.

    The index is the "text-index" entry underneath self.db_path.  It is
    opened with xapian.DB_CREATE_OR_OPEN, and a new WritableDatabase
    object is constructed on every call.
    '''
    location = os.path.join(self.db_path, 'text-index')
    open_mode = xapian.DB_CREATE_OR_OPEN
    return xapian.WritableDatabase(location, open_mode)
24 '''Save the changes to the index.'''
25 if not self
.transaction_active
:
27 # XXX: Xapian databases don't actually implement transactions yet
28 database
= self
._get_database()
29 database
.commit_transaction()
30 self
.transaction_active
= False
33 '''close the indexing database'''
37 if not self
.transaction_active
:
39 # XXX: Xapian databases don't actually implement transactions yet
40 database
= self
._get_database()
41 database
.cancel_transaction()
42 self
.transaction_active
= False
def force_reindex(self):
    '''Force a reindexing of the database.

    Described as emptying the "ids" and "index" tables and setting a
    flag so that the databases are reindexed.

    NOTE(review): presumably the flag is the one reported by
    should_reindex(); verify this description against the method body.
    '''
def should_reindex(self):
    '''Return True if the indexes need to be rebuilt.

    NOTE(review): presumably this reports the flag set by
    force_reindex(); verify against the method body.
    '''
54 def add_text(self
, identifier
, text
, mime_type
='text/plain'):
55 ''' "identifier" is (classname, itemid, property) '''
56 if mime_type
!= 'text/plain':
58 if not text
: text
= ''
60 # open the database and start a transaction if needed
61 database
= self
._get_database()
62 # XXX: Xapian databases don't actually implement transactions yet
63 #if not self.transaction_active:
64 #database.begin_transaction()
65 #self.transaction_active = True
67 # TODO: allow configuration of other languages
68 stemmer
= xapian
.Stem("english")
70 # We use the identifier twice: once in the actual "text" being
71 # indexed so we can search on it, and again as the "data" being
72 # indexed so we know what we're matching when we get results
73 identifier
= '%s:%s:%s'%identifier
75 # see if the id is in the database
76 enquire
= xapian
.Enquire(database
)
77 query
= xapian
.Query(xapian
.Query
.OP_AND
, [identifier
])
78 enquire
.set_query(query
)
79 matches
= enquire
.get_mset(0, 10)
80 if matches
.size(): # would it killya to implement __len__()??
86 # create the new document
87 doc
= xapian
.Document()
88 doc
.set_data(identifier
)
89 doc
.add_posting(identifier
, 0)
91 for match
in re
.finditer(r
'\b\w{%d,%d}\b'
92 % (self
.minlength
, self
.maxlength
),
95 if self
.is_stopword(word
):
98 doc
.add_posting(term
, match
.start(0))
100 database
.replace_document(docid
, doc
)
102 database
.add_document(doc
)
104 def find(self
, wordlist
):
105 '''look up all the words in the wordlist.
106 If none are found return an empty dictionary
112 database
= self
._get_database()
114 enquire
= xapian
.Enquire(database
)
115 stemmer
= xapian
.Stem("english")
117 for term
in [word
.upper() for word
in wordlist
118 if self
.minlength
<= len(word
) <= self
.maxlength
]:
119 if not self
.is_stopword(term
):
120 terms
.append(stemmer(term
))
121 query
= xapian
.Query(xapian
.Query
.OP_AND
, terms
)
123 enquire
.set_query(query
)
124 matches
= enquire
.get_mset(0, 10)
126 return [tuple(m
[xapian
.MSET_DOCUMENT
].get_data().split(':'))