1 #$Id: indexer_rdbms.py,v 1.18 2008-09-01 00:43:02 richard Exp $
2 """ This implements the full-text indexer over two RDBMS tables. The first
3 is a mapping of words to occurance IDs. The second maps the IDs to (Class,
4 propname, itemid) instances.
5 """
6 import re
7 # Python 2.3 ... 2.6 compatibility:
8 from roundup.anypy.sets_ import set
10 from roundup.backends.indexer_common import Indexer as IndexerBase
12 class Indexer(IndexerBase):
13 def __init__(self, db):
14 IndexerBase.__init__(self, db)
15 self.db = db
16 self.reindex = 0
18 def close(self):
19 """close the indexing database"""
20 # just nuke the circular reference
21 self.db = None
23 def save_index(self):
24 """Save the changes to the index."""
25 # not necessary - the RDBMS connection will handle this for us
26 pass
28 def force_reindex(self):
29 """Force a reindexing of the database. This essentially
30 empties the tables ids and index and sets a flag so
31 that the databases are reindexed"""
32 self.reindex = 1
34 def should_reindex(self):
35 """returns True if the indexes need to be rebuilt"""
36 return self.reindex
38 def add_text(self, identifier, text, mime_type='text/plain'):
39 """ "identifier" is (classname, itemid, property) """
40 if mime_type != 'text/plain':
41 return
43 # Ensure all elements of the identifier are strings 'cos the itemid
44 # column is varchar even if item ids may be numbers elsewhere in the
45 # code. ugh.
46 identifier = tuple(map(str, identifier))
48 # first, find the id of the (classname, itemid, property)
49 a = self.db.arg
50 sql = 'select _textid from __textids where _class=%s and '\
51 '_itemid=%s and _prop=%s'%(a, a, a)
52 self.db.cursor.execute(sql, identifier)
53 r = self.db.cursor.fetchone()
54 if not r:
55 # not previously indexed
56 id = self.db.newid('__textids')
57 sql = 'insert into __textids (_textid, _class, _itemid, _prop)'\
58 ' values (%s, %s, %s, %s)'%(a, a, a, a)
59 self.db.cursor.execute(sql, (id, ) + identifier)
60 else:
61 id = int(r[0])
62 # clear out any existing indexed values
63 sql = 'delete from __words where _textid=%s'%a
64 self.db.cursor.execute(sql, (id, ))
66 # ok, find all the unique words in the text
67 if not isinstance(text, unicode):
68 text = unicode(text, "utf-8", "replace")
69 text = text.upper()
70 wordlist = [w.encode("utf-8")
71 for w in re.findall(r'(?u)\b\w{%d,%d}\b'
72 % (self.minlength, self.maxlength), text)]
73 words = set()
74 for word in wordlist:
75 if self.is_stopword(word): continue
76 words.add(word)
78 # for each word, add an entry in the db
79 sql = 'insert into __words (_word, _textid) values (%s, %s)'%(a, a)
80 words = [(word, id) for word in words]
81 self.db.cursor.executemany(sql, words)
83 def find(self, wordlist):
84 """look up all the words in the wordlist.
85 If none are found return an empty dictionary
86 * more rules here
87 """
88 if not wordlist:
89 return []
91 l = [word.upper() for word in wordlist
92 if self.minlength <= len(word) <= self.maxlength]
93 l = [word for word in l if not self.is_stopword(word)]
95 if not l:
96 return []
98 if self.db.implements_intersect:
99 # simple AND search
100 sql = 'select distinct(_textid) from __words where _word=%s'%self.db.arg
101 sql = '\nINTERSECT\n'.join([sql]*len(l))
102 self.db.cursor.execute(sql, tuple(l))
103 r = self.db.cursor.fetchall()
104 if not r:
105 return []
106 a = ','.join([self.db.arg] * len(r))
107 sql = 'select _class, _itemid, _prop from __textids '\
108 'where _textid in (%s)'%a
109 self.db.cursor.execute(sql, tuple([int(row[0]) for row in r]))
111 else:
112 # A more complex version for MySQL since it doesn't implement INTERSECT
114 # Construct SQL statement to join __words table to itself
115 # multiple times.
116 sql = """select distinct(__words1._textid)
117 from __words as __words1 %s
118 where __words1._word=%s %s"""
120 join_tmpl = ' left join __words as __words%d using (_textid) \n'
121 match_tmpl = ' and __words%d._word=%s \n'
123 join_list = []
124 match_list = []
125 for n in xrange(len(l) - 1):
126 join_list.append(join_tmpl % (n + 2))
127 match_list.append(match_tmpl % (n + 2, self.db.arg))
129 sql = sql%(' '.join(join_list), self.db.arg, ' '.join(match_list))
130 self.db.cursor.execute(sql, l)
132 r = [x[0] for x in self.db.cursor.fetchall()]
133 if not r:
134 return []
136 a = ','.join([self.db.arg] * len(r))
137 sql = 'select _class, _itemid, _prop from __textids '\
138 'where _textid in (%s)'%a
140 self.db.cursor.execute(sql, tuple(map(int, r)))
142 return self.db.cursor.fetchall()