2547300f693d22e3bb7ac825a89937ba37da470a
1 #$Id: indexer_rdbms.py,v 1.18 2008-09-01 00:43:02 richard Exp $
2 ''' This implements the full-text indexer over two RDBMS tables. The first
3 is a mapping of words to occurance IDs. The second maps the IDs to (Class,
4 propname, itemid) instances.
5 '''
6 import re, sets
8 from roundup.backends.indexer_common import Indexer as IndexerBase
10 class Indexer(IndexerBase):
11 def __init__(self, db):
12 IndexerBase.__init__(self, db)
13 self.db = db
14 self.reindex = 0
16 def close(self):
17 '''close the indexing database'''
18 # just nuke the circular reference
19 self.db = None
21 def save_index(self):
22 '''Save the changes to the index.'''
23 # not necessary - the RDBMS connection will handle this for us
24 pass
26 def force_reindex(self):
27 '''Force a reindexing of the database. This essentially
28 empties the tables ids and index and sets a flag so
29 that the databases are reindexed'''
30 self.reindex = 1
32 def should_reindex(self):
33 '''returns True if the indexes need to be rebuilt'''
34 return self.reindex
36 def add_text(self, identifier, text, mime_type='text/plain'):
37 ''' "identifier" is (classname, itemid, property) '''
38 if mime_type != 'text/plain':
39 return
41 # Ensure all elements of the identifier are strings 'cos the itemid
42 # column is varchar even if item ids may be numbers elsewhere in the
43 # code. ugh.
44 identifier = tuple(map(str, identifier))
46 # first, find the id of the (classname, itemid, property)
47 a = self.db.arg
48 sql = 'select _textid from __textids where _class=%s and '\
49 '_itemid=%s and _prop=%s'%(a, a, a)
50 self.db.cursor.execute(sql, identifier)
51 r = self.db.cursor.fetchone()
52 if not r:
53 # not previously indexed
54 id = self.db.newid('__textids')
55 sql = 'insert into __textids (_textid, _class, _itemid, _prop)'\
56 ' values (%s, %s, %s, %s)'%(a, a, a, a)
57 self.db.cursor.execute(sql, (id, ) + identifier)
58 else:
59 id = int(r[0])
60 # clear out any existing indexed values
61 sql = 'delete from __words where _textid=%s'%a
62 self.db.cursor.execute(sql, (id, ))
64 # ok, find all the unique words in the text
65 text = unicode(text, "utf-8", "replace").upper()
66 wordlist = [w.encode("utf-8", "replace")
67 for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
68 words = sets.Set()
69 for word in wordlist:
70 if self.is_stopword(word): continue
71 if len(word) > 25: continue
72 words.add(word)
74 # for each word, add an entry in the db
75 sql = 'insert into __words (_word, _textid) values (%s, %s)'%(a, a)
76 words = [(word, id) for word in words]
77 self.db.cursor.executemany(sql, words)
79 def find(self, wordlist):
80 '''look up all the words in the wordlist.
81 If none are found return an empty dictionary
82 * more rules here
83 '''
84 if not wordlist:
85 return []
87 l = [word.upper() for word in wordlist if 26 > len(word) > 2]
89 if not l:
90 return []
92 if self.db.implements_intersect:
93 # simple AND search
94 sql = 'select distinct(_textid) from __words where _word=%s'%self.db.arg
95 sql = '\nINTERSECT\n'.join([sql]*len(l))
96 self.db.cursor.execute(sql, tuple(l))
97 r = self.db.cursor.fetchall()
98 if not r:
99 return []
100 a = ','.join([self.db.arg] * len(r))
101 sql = 'select _class, _itemid, _prop from __textids '\
102 'where _textid in (%s)'%a
103 self.db.cursor.execute(sql, tuple([int(row[0]) for row in r]))
105 else:
106 # A more complex version for MySQL since it doesn't implement INTERSECT
108 # Construct SQL statement to join __words table to itself
109 # multiple times.
110 sql = """select distinct(__words1._textid)
111 from __words as __words1 %s
112 where __words1._word=%s %s"""
114 join_tmpl = ' left join __words as __words%d using (_textid) \n'
115 match_tmpl = ' and __words%d._word=%s \n'
117 join_list = []
118 match_list = []
119 for n in xrange(len(l) - 1):
120 join_list.append(join_tmpl % (n + 2))
121 match_list.append(match_tmpl % (n + 2, self.db.arg))
123 sql = sql%(' '.join(join_list), self.db.arg, ' '.join(match_list))
124 self.db.cursor.execute(sql, l)
126 r = map(lambda x: x[0], self.db.cursor.fetchall())
127 if not r:
128 return []
130 a = ','.join([self.db.arg] * len(r))
131 sql = 'select _class, _itemid, _prop from __textids '\
132 'where _textid in (%s)'%a
134 self.db.cursor.execute(sql, tuple(map(int, r)))
136 return self.db.cursor.fetchall()