roundup/backends/indexer_rdbms.py

   1 #$Id: indexer_rdbms.py,v 1.18 2008-09-01 00:43:02 richard Exp $
   2 ''' This implements the full-text indexer over two RDBMS tables. The first
   3 is a mapping of words to occurance IDs. The second maps the IDs to (Class,
   4 propname, itemid) instances.
   5 '''
   6 import re, sets
   7
   8 from roundup.backends.indexer_common import Indexer as IndexerBase
   9
  10 class Indexer(IndexerBase):
  11     def __init__(self, db):
  12         IndexerBase.__init__(self, db)
  13         self.db = db
  14         self.reindex = 0
  15
  16     def close(self):
  17         '''close the indexing database'''
  18         # just nuke the circular reference
  19         self.db = None
  20
  21     def save_index(self):
  22         '''Save the changes to the index.'''
  23         # not necessary - the RDBMS connection will handle this for us
  24         pass
  25
  26     def force_reindex(self):
  27         '''Force a reindexing of the database.  This essentially
  28         empties the tables ids and index and sets a flag so
  29         that the databases are reindexed'''
  30         self.reindex = 1
  31
  32     def should_reindex(self):
  33         '''returns True if the indexes need to be rebuilt'''
  34         return self.reindex
  35
  36     def add_text(self, identifier, text, mime_type='text/plain'):
  37         ''' "identifier" is  (classname, itemid, property) '''
  38         if mime_type != 'text/plain':
  39             return
  40
  41         # Ensure all elements of the identifier are strings 'cos the itemid
  42         # column is varchar even if item ids may be numbers elsewhere in the
  43         # code. ugh.
  44         identifier = tuple(map(str, identifier))
  45
  46         # first, find the id of the (classname, itemid, property)
  47         a = self.db.arg
  48         sql = 'select _textid from __textids where _class=%s and '\
  49             '_itemid=%s and _prop=%s'%(a, a, a)
  50         self.db.cursor.execute(sql, identifier)
  51         r = self.db.cursor.fetchone()
  52         if not r:
  53             # not previously indexed
  54             id = self.db.newid('__textids')
  55             sql = 'insert into __textids (_textid, _class, _itemid, _prop)'\
  56                 ' values (%s, %s, %s, %s)'%(a, a, a, a)
  57             self.db.cursor.execute(sql, (id, ) + identifier)
  58         else:
  59             id = int(r[0])
  60             # clear out any existing indexed values
  61             sql = 'delete from __words where _textid=%s'%a
  62             self.db.cursor.execute(sql, (id, ))
  63
  64         # ok, find all the unique words in the text
  65         text = unicode(text, "utf-8", "replace").upper()
  66         wordlist = [w.encode("utf-8", "replace")
  67                 for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
  68         words = sets.Set()
  69         for word in wordlist:
  70             if self.is_stopword(word): continue
  71             if len(word) > 25: continue
  72             words.add(word)
  73
  74         # for each word, add an entry in the db
  75         sql = 'insert into __words (_word, _textid) values (%s, %s)'%(a, a)
  76         words = [(word, id) for word in words]
  77         self.db.cursor.executemany(sql, words)
  78
  79     def find(self, wordlist):
  80         '''look up all the words in the wordlist.
  81         If none are found return an empty dictionary
  82         * more rules here
  83         '''
  84         if not wordlist:
  85             return []
  86
  87         l = [word.upper() for word in wordlist if 26 > len(word) > 2]
  88
  89         if not l:
  90             return []
  91
  92         if self.db.implements_intersect:
  93             # simple AND search
  94             sql = 'select distinct(_textid) from __words where _word=%s'%self.db.arg
  95             sql = '\nINTERSECT\n'.join([sql]*len(l))
  96             self.db.cursor.execute(sql, tuple(l))
  97             r = self.db.cursor.fetchall()
  98             if not r:
  99                 return []
 100             a = ','.join([self.db.arg] * len(r))
 101             sql = 'select _class, _itemid, _prop from __textids '\
 102                 'where _textid in (%s)'%a
 103             self.db.cursor.execute(sql, tuple([int(row[0]) for row in r]))
 104
 105         else:
 106             # A more complex version for MySQL since it doesn't implement INTERSECT
 107
 108             # Construct SQL statement to join __words table to itself
 109             # multiple times.
 110             sql = """select distinct(__words1._textid)
 111                         from __words as __words1 %s
 112                         where __words1._word=%s %s"""
 113
 114             join_tmpl = ' left join __words as __words%d using (_textid) \n'
 115             match_tmpl = ' and __words%d._word=%s \n'
 116
 117             join_list = []
 118             match_list = []
 119             for n in xrange(len(l) - 1):
 120                 join_list.append(join_tmpl % (n + 2))
 121                 match_list.append(match_tmpl % (n + 2, self.db.arg))
 122
 123             sql = sql%(' '.join(join_list), self.db.arg, ' '.join(match_list))
 124             self.db.cursor.execute(sql, l)
 125
 126             r = map(lambda x: x[0], self.db.cursor.fetchall())
 127             if not r:
 128                 return []
 129
 130             a = ','.join([self.db.arg] * len(r))
 131             sql = 'select _class, _itemid, _prop from __textids '\
 132                 'where _textid in (%s)'%a
 133
 134             self.db.cursor.execute(sql, tuple(map(int, r)))
 135
 136         return self.db.cursor.fetchall()
 137