1 #
2 # This module is derived from the module described at:
3 # http://gnosis.cx/publish/programming/charming_python_15.txt
4 #
5 # Author: David Mertz (mertz@gnosis.cx)
6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
7 # Gregory Popovitch (greg@gpy.com)
8 #
9 # The original module was released under this license, and remains under
10 # it:
11 #
12 # This file is released to the public domain. I (dqm) would
13 # appreciate it if you choose to keep derived works under terms
14 # that promote freedom, but obviously am giving up any rights
15 # to compel such.
16 #
17 #$Id: indexer.py,v 1.12 2002-07-19 03:36:33 richard Exp $
18 '''
19 This module provides an indexer class, RoundupIndexer, that stores text
20 indices in a roundup instance. This class makes searching the content of
21 messages, string properties and text files possible.
22 '''
23 import os, shutil, re, mimetypes, marshal, zlib, errno
24 from hyperdb import Link, Multilink
class Indexer:
    '''Indexes information from roundup's hyperdb to allow efficient
    searching.

    Three structures are created by the indexer:
          files   {identifier: (fileid, wordcount)}
          words   {word: {fileid: count}}
          fileids {fileid: identifier}
    where identifier is (classname, nodeid, propertyname)

    The index is persisted as zlib-compressed marshal "segment" files
    under <db_path>/indexes, one per initial word character, plus a '-'
    segment holding the files/fileids maps.
    '''
    def __init__(self, db_path):
        '''Prepare the index storage under db_path.

        A 'version' flag file records the on-disk format; if the index
        directory or the flag file is missing, or it holds an unknown
        version, a full reindex is forced.
        '''
        self.indexdb_path = os.path.join(db_path, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        # verbosity threshold -- presumably higher means quieter; not
        # consulted anywhere in this module (TODO confirm with callers)
        self.quiet = 9
        self.changed = 0

        # see if we need to reindex because of a change in code
        version = os.path.join(self.indexdb_path, 'version')
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(version)):
            # for now the existence of the file itself is the flag
            self.force_reindex()
        elif os.path.exists(version):
            with open(version) as f:
                version = f.read()
            # check the value and reindex if it's not the latest
            if version.strip() != '1':
                self.force_reindex()

    def force_reindex(self):
        '''Throw away any existing index and flag that a full reindex
        is required.
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0o775)
        # record the on-disk format version (checked in __init__)
        with open(os.path.join(self.indexdb_path, 'version'), 'w') as f:
            f.write('1\n')
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Should we reindex?
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Add some text associated with the (classname, nodeid, property)
        identifier.
        '''
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if identifier in self.files:
            self.purge_entry(identifier)

        # split into words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0] - 1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # count the occurrences of each unique word
        filedict = {}
        for word in words:
            filedict[word] = filedict.get(word, 0) + 1

        # now add to the totals: each word maps to a dict of
        # {fileid: count}
        for word, count in filedict.items():
            self.words.setdefault(word, {})[file_index] = count

        # save needed
        self.changed = 1

    def splitter(self, text, ftype):
        '''Split the contents of a text string into a list of 'words'.

        Only text/plain is handled; any other MIME type yields no words
        (and therefore isn't indexed).
        '''
        if ftype == 'text/plain':
            return self.text_splitter(text)
        return []

    def text_splitter(self, text):
        """Split text/plain string into a list of uppercased words.
        """
        # case insensitive
        text = text.upper()

        # Split the raw text, losing anything longer than 25 characters
        # since that'll be gibberish (encoded text or somesuch) or shorter
        # than 3 characters since those short words appear all over the
        # place
        # NOTE(review): the pattern actually admits 2-character words;
        # find() filters those out again with "2 < len(word) < 25"
        return re.findall(r'\b\w{2,25}\b', text)

    def search(self, search_terms, klass, ignore={},
            dre=re.compile(r'([^\d]+)(\d+)')):
        ''' Display search results looking for [search, terms] associated
        with the hyperdb Class "klass". Ignore hits on {class: property}.

        "dre" is a helper, not an argument.  "ignore" is only ever read,
        never mutated, so the mutable default is safe.
        '''
        # do the index lookup
        hits = self.find(search_terms)
        if not hits:
            return {}

        # map the classname of each Link/Multilink property of klass to
        # the property name, eg. msg -> messages
        designator_propname = {}
        for nm, propclass in klass.getprops().items():
            if isinstance(propclass, Link) or isinstance(propclass, Multilink):
                designator_propname[propclass.classname] = nm

        # build a dictionary of nodes and their associated messages
        # and files
        nodeids = {}    # this is the answer
        propspec = {}   # used to do the klass.find
        for propname in designator_propname.values():
            propspec[propname] = {}  # used as a set (value doesn't matter)
        for classname, nodeid, property in hits.values():
            # skip this result if we don't care about this class/property
            if (classname, property) in ignore:
                continue

            # if it's a property on klass, it's easy
            if classname == klass.classname:
                if nodeid not in nodeids:
                    nodeids[nodeid] = {}
                continue

            # it's a linked class - set up to do the klass.find
            linkprop = designator_propname[classname]   # eg, msg -> messages
            propspec[linkprop][nodeid] = 1

        # retain only the meaningful entries (iterate over a copy of the
        # keys since we delete as we go)
        for propname in list(propspec.keys()):
            if not propspec[propname]:
                del propspec[propname]

        # klass.find tells me the klass nodeids the linked nodes relate to
        for resid in klass.find(**propspec):
            resid = str(resid)
            # BUGFIX: this used to test "nodeids.has_key(id)" -- "id" is
            # the builtin function, so the test was always false and any
            # direct hit recorded above was clobbered with a fresh dict
            if resid not in nodeids:
                nodeids[resid] = {}
            node_dict = nodeids[resid]
            # now figure out where it came from
            for linkprop in propspec.keys():
                for nodeid in klass.get(resid, linkprop):
                    if nodeid in propspec[linkprop]:
                        # OK, this node[propname] has a winner
                        node_dict.setdefault(linkprop, []).append(nodeid)
        return nodeids

    # we override this to ignore not 2 < word < 25 and also to fix a bug -
    # the (fail) case.
    def find(self, wordlist):
        ''' Locate files that match ALL the words in wordlist.

        Returns {fileid: (classname, nodeid, propertyname)} or {} when
        any word misses (or no word is indexable).
        '''
        if not self.index_loaded():
            # Load the full index, not just the segments for wordlist: a
            # partial load would later be mistaken for a complete one
            # (index_loaded() would be true) and save_index() would then
            # silently drop the unloaded segments.
            self.load_index()

        hits = None
        for word in wordlist:
            if not 2 < len(word) < 25:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            entry = self.words.get(word)    # For each word, get index
            if not entry:                   # Nothing for this one word (fail)
                return {}
            if hits is None:
                # first indexable word seeds the result set
                hits = {}
                for fileid in entry:
                    hits[fileid] = self.fileids[fileid]
            else:
                # Eliminate hits for every non-match (iterate a copy of
                # the keys since we delete while scanning)
                for fileid in list(hits.keys()):
                    if fileid not in entry:
                        del hits[fileid]
        if hits is None:
            # no word was within the indexable length bounds
            return {}
        return hits

    # every segment file the index may be split over; '-' holds the
    # files/fileids maps
    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"

    def load_index(self, reload=0, wordlist=None):
        '''Load the index from its segment files.

        Unless "reload" is true this is a no-op when the index is
        already in memory.  If "wordlist" is given, only the segments
        relevant to those words are read.
        '''
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP': (0, None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-', '#']
            for word in wordlist:
                segments.append(word[0].upper())

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise
            else:
                try:
                    pickle_str = zlib.decompress(f.read())
                finally:
                    f.close()
                dbslice = marshal.loads(pickle_str)
                if dbslice.get('WORDS'):
                    # if it has some words, add them
                    db['WORDS'].update(dbslice['WORDS'])
                if dbslice.get('FILES'):
                    # if it has some files, add them
                    db['FILES'] = dbslice['FILES']
                if dbslice.get('FILEIDS'):
                    # if it has fileids, add them
                    db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        '''Write the in-memory index back out to the segment files.

        No-op unless the index is both loaded and changed.
        '''
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS': None, 'FILES': self.files, 'FILEIDS': self.fileids}
        with open(self.indexdb + '-', 'wb') as f:
            f.write(zlib.compress(marshal.dumps(dbfil)))

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[0].upper()
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS': segdicts[initchar], 'FILES': None, 'FILEIDS': None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            with open(filename, 'wb') as f:
                f.write(zlib.compress(pickle_str))
            os.chmod(filename, 0o664)

        # save done
        self.changed = 0

    def purge_entry(self, identifier):
        ''' Remove a file from file index and word index
        '''
        self.load_index()

        if identifier not in self.files:
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index.  Safe to iterate
        # directly: we only mutate the per-word dicts, never self.words
        # itself.
        for key, occurs in self.words.items():
            if file_index in occurs:
                del occurs[file_index]

        # save needed
        self.changed = 1

    def index_loaded(self):
        '''Return true if the words/files/fileids maps are in memory.'''
        return (hasattr(self, 'fileids') and hasattr(self, 'files') and
                hasattr(self, 'words'))
336 #
337 #$Log: not supported by cvs2svn $
338 #Revision 1.11 2002/07/18 11:17:30 gmcm
339 #Add Number and Boolean types to hyperdb.
340 #Add conversion cases to web, mail & admin interfaces.
341 #Add storage/serialization cases to back_anydbm & back_metakit.
342 #
343 #Revision 1.10 2002/07/14 23:17:24 richard
344 #oops
345 #
346 #Revision 1.9 2002/07/14 06:11:16 richard
347 #Some TODOs
348 #
349 #Revision 1.8 2002/07/09 21:53:38 gmcm
350 #Optimize Class.find so that the propspec can contain a set of ids to match.
351 #This is used by indexer.search so it can do just one find for all the index matches.
352 #This was already confusing code, but for common terms (lots of index matches),
353 #it is enormously faster.
354 #
355 #Revision 1.7 2002/07/09 21:38:43 richard
356 #Only save the index if the thing is loaded and changed. Also, don't load
357 #the index just for a save.
358 #
359 #Revision 1.6 2002/07/09 04:26:44 richard
360 #We're indexing numbers now, and _underscore words
361 #
362 #Revision 1.5 2002/07/09 04:19:09 richard
363 #Added reindex command to roundup-admin.
364 #Fixed reindex on first access.
365 #Also fixed reindexing of entries that change.
366 #
367 #Revision 1.4 2002/07/09 03:02:52 richard
368 #More indexer work:
369 #- all String properties may now be indexed too. Currently there's a bit of
370 # "issue" specific code in the actual searching which needs to be
371 # addressed. In a nutshell:
372 # + pass 'indexme="yes"' as a String() property initialisation arg, eg:
373 # file = FileClass(db, "file", name=String(), type=String(),
374 # comment=String(indexme="yes"))
375 # + the comment will then be indexed and be searchable, with the results
376 # related back to the issue that the file is linked to
377 #- as a result of this work, the FileClass has a default MIME type that may
378 # be overridden in a subclass, or by the use of a "type" property as is
379 # done in the default templates.
380 #- the regeneration of the indexes (if necessary) is done once the schema is
381 # set up in the dbinit.
382 #
383 #Revision 1.3 2002/07/08 06:58:15 richard
384 #cleaned up the indexer code:
385 # - it splits more words out (much simpler, faster splitter)
386 # - removed code we'll never use (roundup.roundup_indexer has the full
387 # implementation, and replaces roundup.indexer)
388 # - only index text/plain and rfc822/message (ideas for other text formats to
389 # index are welcome)
390 # - added simple unit test for indexer. Needs more tests for regression.
391 #
392 #Revision 1.2 2002/05/25 07:16:24 rochecompaan
393 #Merged search_indexing-branch with HEAD
394 #
395 #Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan
396 #Fixed small bug that prevented indexes from being generated.
397 #
398 #Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan
399 #cgi_client.py
400 # removed search link for the time being
401 # moved rendering of matches to htmltemplate
402 #hyperdb.py
403 # filtering of nodes on full text search incorporated in filter method
404 #roundupdb.py
405 # added paramater to call of filter method
406 #roundup_indexer.py
407 # added search method to RoundupIndexer class
408 #
409 #Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan
410 # . Added feature #526730 - search for messages capability
411 #