roundup/backends/indexer_xapian.py

   1 #$Id: indexer_xapian.py,v 1.6 2007-10-25 07:02:42 richard Exp $
   2 ''' This implements the full-text indexer using the Xapian indexer.
   3 '''
   4 import re, os
   5
   6 import xapian
   7
   8 from roundup.backends.indexer_common import Indexer as IndexerBase
   9
  10 # TODO: we need to delete documents when a property is *reindexed*
  11
  12 class Indexer(IndexerBase):
  13     def __init__(self, db):
  14         IndexerBase.__init__(self, db)
  15         self.db_path = db.config.DATABASE
  16         self.reindex = 0
  17         self.transaction_active = False
  18
  19     def _get_database(self):
  20         index = os.path.join(self.db_path, 'text-index')
  21         return xapian.WritableDatabase(index, xapian.DB_CREATE_OR_OPEN)
  22
  23     def save_index(self):
  24         '''Save the changes to the index.'''
  25         if not self.transaction_active:
  26             return
  27         database = self._get_database()
  28         database.commit_transaction()
  29         self.transaction_active = False
  30
  31     def close(self):
  32         '''close the indexing database'''
  33         pass
  34
  35     def rollback(self):
  36         if not self.transaction_active:
  37             return
  38         database = self._get_database()
  39         database.cancel_transaction()
  40         self.transaction_active = False
  41
  42     def force_reindex(self):
  43         '''Force a reindexing of the database.  This essentially
  44         empties the tables ids and index and sets a flag so
  45         that the databases are reindexed'''
  46         self.reindex = 1
  47
  48     def should_reindex(self):
  49         '''returns True if the indexes need to be rebuilt'''
  50         return self.reindex
  51
  52     def add_text(self, identifier, text, mime_type='text/plain'):
  53         ''' "identifier" is  (classname, itemid, property) '''
  54         if mime_type != 'text/plain':
  55             return
  56         if not text: text = ''
  57
  58         # open the database and start a transaction if needed
  59         database = self._get_database()
  60
  61         # XXX: Xapian now supports transactions,
  62         #  but there is a call to save_index() missing.
  63         #if not self.transaction_active:
  64             #database.begin_transaction()
  65             #self.transaction_active = True
  66
  67         # TODO: allow configuration of other languages
  68         stemmer = xapian.Stem("english")
  69
  70         # We use the identifier twice: once in the actual "text" being
  71         # indexed so we can search on it, and again as the "data" being
  72         # indexed so we know what we're matching when we get results
  73         identifier = '%s:%s:%s'%identifier
  74
  75         # create the new document
  76         doc = xapian.Document()
  77         doc.set_data(identifier)
  78         doc.add_term(identifier, 0)
  79
  80         for match in re.finditer(r'\b\w{%d,%d}\b'
  81                                  % (self.minlength, self.maxlength),
  82                                  text.upper()):
  83             word = match.group(0)
  84             if self.is_stopword(word):
  85                 continue
  86             term = stemmer(word)
  87             doc.add_posting(term, match.start(0))
  88
  89         database.replace_document(identifier, doc)
  90
  91     def find(self, wordlist):
  92         '''look up all the words in the wordlist.
  93         If none are found return an empty dictionary
  94         * more rules here
  95         '''
  96         if not wordlist:
  97             return {}
  98
  99         database = self._get_database()
 100
 101         enquire = xapian.Enquire(database)
 102         stemmer = xapian.Stem("english")
 103         terms = []
 104         for term in [word.upper() for word in wordlist
 105                           if self.minlength <= len(word) <= self.maxlength]:
 106             if not self.is_stopword(term):
 107                 terms.append(stemmer(term))
 108         query = xapian.Query(xapian.Query.OP_AND, terms)
 109
 110         enquire.set_query(query)
 111         matches = enquire.get_mset(0, 10)
 112
 113         return [tuple(m.document.get_data().split(':'))
 114             for m in matches]
 115