# NOTE(review): stray VCS object id found at top of file — commented out so
# the module parses: 4c8b913e273eaab43a2d85bbd48a2dbbb9e3a044
1 #
2 # This module is derived from the module described at:
3 # http://gnosis.cx/publish/programming/charming_python_15.txt
4 #
5 # Author: David Mertz (mertz@gnosis.cx)
6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
7 # Gregory Popovitch (greg@gpy.com)
8 #
9 # The original module was released under this license, and remains under
10 # it:
11 #
12 # This file is released to the public domain. I (dqm) would
13 # appreciate it if you choose to keep derived works under terms
14 # that promote freedom, but obviously am giving up any rights
15 # to compel such.
16 #
17 #$Id: indexer.py,v 1.15 2003-01-12 23:53:19 richard Exp $
18 '''
19 This module provides an indexer class, RoundupIndexer, that stores text
20 indices in a roundup instance. This class makes searching the content of
21 messages, string properties and text files possible.
22 '''
23 import os, shutil, re, mimetypes, marshal, zlib, errno
24 from hyperdb import Link, Multilink
class Indexer:
    ''' Indexes information from roundup's hyperdb to allow efficient
        searching.

        Three structures are created by the indexer:
            files   {identifier: (fileid, wordcount)}
            words   {word: {fileid: count}}
            fileids {fileid: identifier}
        where identifier is (classname, nodeid, propertyname)

        On disk the index is stored as zlib-compressed marshal dumps, split
        into one segment file per initial character (plus a '-' file holding
        the files/fileids tables) so a search need only load the segments
        for the words being looked up.
    '''
    def __init__(self, db_path):
        ''' Open (and possibly create) the index stored under
            "<db_path>/indexes".

            A "version" file kept alongside the index acts as a code-version
            marker; if the directory or the file is missing, or the version
            is not the current one ('1'), the index is discarded and a full
            reindex is flagged.
        '''
        self.indexdb_path = os.path.join(db_path, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        # NOTE(review): presumably a verbosity knob used by bulk-reindex
        # code elsewhere (9 == quiet) — confirm against callers
        self.quiet = 9
        self.changed = 0

        # see if we need to reindex because of a change in code
        version = os.path.join(self.indexdb_path, 'version')
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(version)):
            # for now the existence of the file itself is the flag
            self.force_reindex()
        elif os.path.exists(version):
            version = open(version).read()
            # check the value and reindex if it's not the latest
            if version.strip() != '1':
                self.force_reindex()

    def force_reindex(self):
        '''Force a reindex condition
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0o775)
        # write the current index version marker (close the handle — the
        # original leaked it)
        fh = open(os.path.join(self.indexdb_path, 'version'), 'w')
        fh.write('1\n')
        fh.close()
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Should we reindex?
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        ''' Add some text associated with the (classname, nodeid, property)
            identifier.

            Any text previously indexed under the same identifier is purged
            first, so re-adding replaces rather than accumulates.
        '''
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if identifier in self.files:
            self.purge_entry(identifier)

        # split into words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0] - 1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # count occurrences of each unique word in this text
        filedict = {}
        for word in words:
            filedict[word] = filedict.get(word, 0) + 1

        # now add to the totals
        for word, count in filedict.items():
            # each word has a dict of {fileid: count}; make a reference
            # to this file for the word
            self.words.setdefault(word, {})[file_index] = count

        # save needed
        self.changed = 1

    def splitter(self, text, ftype):
        ''' Split the contents of a text string into a list of 'words'

            Only text/plain is handled; any other content type yields no
            words (and is therefore not indexed).
        '''
        if ftype == 'text/plain':
            return self.text_splitter(text)
        return []

    def text_splitter(self, text):
        """Split text/plain string into a list of (uppercased) words
        """
        # case insensitive
        text = text.upper()

        # Split the raw text, losing anything longer than 25 characters
        # since that'll be gibberish (encoded text or somesuch) or shorter
        # than 3 characters since those short words appear all over the
        # place
        # NOTE(review): the pattern actually admits 2-character words,
        # which find() then refuses to search for — confirm intended
        return re.findall(r'\b\w{2,25}\b', text)

    def search(self, search_terms, klass, ignore={},
            dre=re.compile(r'([^\d]+)(\d+)')):
        ''' Display search results looking for [search, terms] associated
            with the hyperdb Class "klass". Ignore hits on {class: property}.

            "dre" is a helper, not an argument (and is unused in this
            version of the method).

            Returns {nodeid: {linkprop: [nodeid, ...], ...}, ...} where the
            inner dicts record which linked nodes produced each hit.
        '''
        # do the index lookup
        hits = self.find(search_terms)
        if not hits:
            return {}

        # map the classname of each Link/Multilink property of klass to
        # its property name, eg. {'msg': 'messages', 'file': 'files'}
        designator_propname = {}
        for nm, propclass in klass.getprops().items():
            if isinstance(propclass, (Link, Multilink)):
                designator_propname[propclass.classname] = nm

        # build a dictionary of nodes and their associated messages
        # and files
        nodeids = {}    # this is the answer
        propspec = {}   # used to do the klass.find
        for propname in designator_propname.values():
            propspec[propname] = {}     # used as a set (value doesn't matter)
        for classname, nodeid, property in hits.values():
            # skip this result if we don't care about this class/property
            if (classname, property) in ignore:
                continue

            # if it's a property on klass, it's easy
            if classname == klass.classname:
                if nodeid not in nodeids:
                    nodeids[nodeid] = {}
                continue

            # make sure the class is a linked one, otherwise ignore
            if classname not in designator_propname:
                continue

            # it's a linked class - set up to do the klass.find
            linkprop = designator_propname[classname]   # eg, msg -> messages
            propspec[linkprop][nodeid] = 1

        # retain only the meaningful entries (iterate over a snapshot
        # since we delete as we go)
        for propname, idset in list(propspec.items()):
            if not idset:
                del propspec[propname]

        # klass.find tells me the klass nodeids the linked nodes relate to
        for resid in klass.find(**propspec):
            resid = str(resid)
            # bug fix: this used to test the builtin "id" instead of resid,
            # so the membership test could never succeed
            if resid not in nodeids:
                nodeids[resid] = {}
            node_dict = nodeids[resid]
            # now figure out where it came from
            for linkprop in propspec.keys():
                for nodeid in klass.get(resid, linkprop):
                    if nodeid in propspec[linkprop]:
                        # OK, this node[propname] has a winner
                        node_dict.setdefault(linkprop, []).append(nodeid)
        return nodeids

    # we override this to ignore not 2 < word < 25 and also to fix a bug -
    # the (fail) case.
    def find(self, wordlist):
        ''' Locate files that match ALL the words in wordlist

            Returns {fileid: (classname, nodeid, propertyname)} for the
            matching files, or {} when any word fails to match.

            Raises ValueError if the on-disk index is corrupted (a word
            entry references an unknown fileid).
        '''
        if not hasattr(self, 'words'):
            self.load_index()
        self.load_index(wordlist=wordlist)
        entries = {}
        hits = None
        for word in wordlist:
            if not 2 < len(word) < 25:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            # bug fix: an unknown word used to raise KeyError here; treat
            # it as the documented "nothing for this one word" fail case
            entry = self.words.get(word, {})    # For each word, get index
            entries[word] = entry               #   of matching files
            if not entry:                       # Nothing for this one word (fail)
                return {}
            if hits is None:
                # first word: seed the hit set with its files
                hits = {}
                for k in entry.keys():
                    if k not in self.fileids:
                        raise ValueError('Index is corrupted: re-generate it')
                    hits[k] = self.fileids[k]
            else:
                # Eliminate hits for every non-match (iterate over a
                # snapshot since we delete as we go)
                for fileid in list(hits.keys()):
                    if fileid not in entry:
                        del hits[fileid]
        if hits is None:
            return {}
        return hits

    # all the segment file suffixes; '-' holds the files/fileids tables
    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"

    def load_index(self, reload=0, wordlist=None):
        ''' Load the index database segments from disk.

            Unless "reload" is true, loading is skipped when an index is
            already in memory. When "wordlist" is given, only the segments
            for those words' initial characters (plus the file tables in
            '-' and the '#' segment) are loaded.
        '''
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP': (0, None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-', '#']
            for word in wordlist:
                segments.append(word[0].upper())

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError as error:
                # probably just a nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise
            else:
                pickle_str = zlib.decompress(f.read())
                f.close()
                dbslice = marshal.loads(pickle_str)
                if dbslice.get('WORDS'):
                    # if it has some words, add them
                    for word, entry in dbslice['WORDS'].items():
                        db['WORDS'][word] = entry
                if dbslice.get('FILES'):
                    # if it has some files, add them
                    db['FILES'] = dbslice['FILES']
                if dbslice.get('FILEIDS'):
                    # if it has fileids, add them
                    db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        ''' Write the in-memory index back to disk: one segment file per
            initial character, plus '-' for the files/fileids tables.
        '''
        # only save if the index is loaded and changed
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError as error:
                # probably just a nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise

        # First write the much simpler filename/fileid dictionaries
        # (close the handle — the original leaked it)
        dbfil = {'WORDS': None, 'FILES': self.files, 'FILEIDS': self.fileids}
        fh = open(self.indexdb + '-', 'wb')
        fh.write(zlib.compress(marshal.dumps(dbfil)))
        fh.close()

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                               # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():      # Split into segment dicts
            initchar = word[0].upper()
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS': segdicts[initchar], 'FILES': None, 'FILEIDS': None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            pickle_fh = open(filename, 'wb')
            pickle_fh.write(zlib.compress(pickle_str))
            pickle_fh.close()
            os.chmod(filename, 0o664)

        # save done
        self.changed = 0

    def purge_entry(self, identifier):
        ''' Remove a file from file index and word index
        '''
        self.load_index()

        if identifier not in self.files:
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index
        # (we only mutate the per-word inner dicts, so iterating the outer
        # dict's items is safe)
        for key, occurs in self.words.items():
            if file_index in occurs:
                del occurs[file_index]

        # save needed
        self.changed = 1

    def index_loaded(self):
        ''' Return true if the in-memory index structures are all present.
        '''
        return (hasattr(self, 'fileids') and hasattr(self, 'files')
                and hasattr(self, 'words'))
342 # vim: set filetype=python ts=4 sw=4 et si