roundup/backends/indexer_rdbms.py

   1 #$Id: indexer_rdbms.py,v 1.18 2008-09-01 00:43:02 richard Exp $
   2 """ This implements the full-text indexer over two RDBMS tables. The first
   3 is a mapping of words to occurance IDs. The second maps the IDs to (Class,
   4 propname, itemid) instances.
   5 """
   6 import re
   7 # Python 2.3 ... 2.6 compatibility:
   8 from roundup.anypy.sets_ import set
   9
  10 from roundup.backends.indexer_common import Indexer as IndexerBase
  11
  12 class Indexer(IndexerBase):
  13     def __init__(self, db):
  14         IndexerBase.__init__(self, db)
  15         self.db = db
  16         self.reindex = 0
  17
  18     def close(self):
  19         """close the indexing database"""
  20         # just nuke the circular reference
  21         self.db = None
  22
  23     def save_index(self):
  24         """Save the changes to the index."""
  25         # not necessary - the RDBMS connection will handle this for us
  26         pass
  27
  28     def force_reindex(self):
  29         """Force a reindexing of the database.  This essentially
  30         empties the tables ids and index and sets a flag so
  31         that the databases are reindexed"""
  32         self.reindex = 1
  33
  34     def should_reindex(self):
  35         """returns True if the indexes need to be rebuilt"""
  36         return self.reindex
  37
  38     def add_text(self, identifier, text, mime_type='text/plain'):
  39         """ "identifier" is  (classname, itemid, property) """
  40         if mime_type != 'text/plain':
  41             return
  42
  43         # Ensure all elements of the identifier are strings 'cos the itemid
  44         # column is varchar even if item ids may be numbers elsewhere in the
  45         # code. ugh.
  46         identifier = tuple(map(str, identifier))
  47
  48         # first, find the id of the (classname, itemid, property)
  49         a = self.db.arg
  50         sql = 'select _textid from __textids where _class=%s and '\
  51             '_itemid=%s and _prop=%s'%(a, a, a)
  52         self.db.cursor.execute(sql, identifier)
  53         r = self.db.cursor.fetchone()
  54         if not r:
  55             # not previously indexed
  56             id = self.db.newid('__textids')
  57             sql = 'insert into __textids (_textid, _class, _itemid, _prop)'\
  58                 ' values (%s, %s, %s, %s)'%(a, a, a, a)
  59             self.db.cursor.execute(sql, (id, ) + identifier)
  60         else:
  61             id = int(r[0])
  62             # clear out any existing indexed values
  63             sql = 'delete from __words where _textid=%s'%a
  64             self.db.cursor.execute(sql, (id, ))
  65
  66         # ok, find all the unique words in the text
  67         if not isinstance(text, unicode):
  68             text = unicode(text, "utf-8", "replace")
  69         text = text.upper()
  70         wordlist = [w.encode("utf-8")
  71                     for w in re.findall(r'(?u)\b\w{%d,%d}\b'
  72                                         % (self.minlength, self.maxlength), text)]
  73         words = set()
  74         for word in wordlist:
  75             if self.is_stopword(word): continue
  76             words.add(word)
  77
  78         # for each word, add an entry in the db
  79         sql = 'insert into __words (_word, _textid) values (%s, %s)'%(a, a)
  80         words = [(word, id) for word in words]
  81         self.db.cursor.executemany(sql, words)
  82
  83     def find(self, wordlist):
  84         """look up all the words in the wordlist.
  85         If none are found return an empty dictionary
  86         * more rules here
  87         """
  88         if not wordlist:
  89             return []
  90
  91         l = [word.upper() for word in wordlist
  92              if self.minlength <= len(word) <= self.maxlength]
  93         l = [word for word in l if not self.is_stopword(word)]
  94
  95         if not l:
  96             return []
  97
  98         if self.db.implements_intersect:
  99             # simple AND search
 100             sql = 'select distinct(_textid) from __words where _word=%s'%self.db.arg
 101             sql = '\nINTERSECT\n'.join([sql]*len(l))
 102             self.db.cursor.execute(sql, tuple(l))
 103             r = self.db.cursor.fetchall()
 104             if not r:
 105                 return []
 106             a = ','.join([self.db.arg] * len(r))
 107             sql = 'select _class, _itemid, _prop from __textids '\
 108                 'where _textid in (%s)'%a
 109             self.db.cursor.execute(sql, tuple([int(row[0]) for row in r]))
 110
 111         else:
 112             # A more complex version for MySQL since it doesn't implement INTERSECT
 113
 114             # Construct SQL statement to join __words table to itself
 115             # multiple times.
 116             sql = """select distinct(__words1._textid)
 117                         from __words as __words1 %s
 118                         where __words1._word=%s %s"""
 119
 120             join_tmpl = ' left join __words as __words%d using (_textid) \n'
 121             match_tmpl = ' and __words%d._word=%s \n'
 122
 123             join_list = []
 124             match_list = []
 125             for n in xrange(len(l) - 1):
 126                 join_list.append(join_tmpl % (n + 2))
 127                 match_list.append(match_tmpl % (n + 2, self.db.arg))
 128
 129             sql = sql%(' '.join(join_list), self.db.arg, ' '.join(match_list))
 130             self.db.cursor.execute(sql, l)
 131
 132             r = [x[0] for x in self.db.cursor.fetchall()]
 133             if not r:
 134                 return []
 135
 136             a = ','.join([self.db.arg] * len(r))
 137             sql = 'select _class, _itemid, _prop from __textids '\
 138                 'where _textid in (%s)'%a
 139
 140             self.db.cursor.execute(sql, tuple(map(int, r)))
 141
 142         return self.db.cursor.fetchall()
 143