1 #
2 # This module is derived from the module described at:
3 # http://gnosis.cx/publish/programming/charming_python_15.txt
4 #
5 # Author: David Mertz (mertz@gnosis.cx)
6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
7 # Gregory Popovitch (greg@gpy.com)
8 #
9 # The original module was released under this license, and remains under
10 # it:
11 #
12 # This file is released to the public domain. I (dqm) would
13 # appreciate it if you choose to keep derived works under terms
14 # that promote freedom, but obviously am giving up any rights
15 # to compel such.
16 #
17 #$Id: indexer.py,v 1.8 2002-07-09 21:53:38 gmcm Exp $
18 '''
19 This module provides an indexer class, Indexer, that stores text
20 indices in a roundup instance. This class makes searching the content of
21 messages, string properties and text files possible.
22 '''
23 import os, shutil, re, mimetypes, marshal, zlib, errno
24 from hyperdb import Link, Multilink
class Indexer:
    ''' Indexes information from roundup's hyperdb to allow efficient
        searching.

        Three structures are created by the indexer:
           files   {identifier: (fileid, wordcount)}
           words   {word: {fileid: count}}
           fileids {fileid: identifier}
        where identifier is (classname, nodeid, propertyname)
    '''
    def __init__(self, db_path):
        '''Set up the index paths under db_path, forcing a full reindex
        when the index directory or its version marker is missing.
        '''
        self.indexdb_path = os.path.join(db_path, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        # NOTE(review): this flag is never read inside this class -
        # presumably a verbosity level used by callers; confirm before
        # relying on it
        self.quiet = 9
        self.changed = 0

        # see if we need to reindex because of a change in code
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(os.path.join(self.indexdb_path,
                                                'version'))):
            # TODO: if the version file exists (in the future) we'll want to
            # check the value in it - for now the file itself is a flag
            self.force_reindex()

    def force_reindex(self):
        '''Force a reindex condition: blow away the index directory and
        re-create it with a fresh version marker.
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0o775)
        # close the marker file explicitly rather than relying on
        # refcounting to flush it
        version = open(os.path.join(self.indexdb_path, 'version'), 'w')
        try:
            version.write('1\n')
        finally:
            version.close()
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Should we reindex?
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        ''' Add some text associated with the (classname, nodeid, property)
            identifier.

            Any existing index entries for the identifier are purged first,
            so re-adding changed text replaces the old words.
        '''
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if identifier in self.files:
            self.purge_entry(identifier)

        # split into words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0] - 1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # count occurrences of each unique word in this text
        filedict = {}
        for word in words:
            filedict[word] = filedict.get(word, 0) + 1

        # now add to the totals
        for word, count in filedict.items():
            # each word has a dict of {fileid: count}; make a reference
            # to the file for this word
            self.words.setdefault(word, {})[file_index] = count

        # save needed
        self.changed = 1

    def splitter(self, text, ftype):
        ''' Split the contents of a text string into a list of 'words'.

            Only text/plain is handled; any other MIME type yields no
            words (and is therefore not indexed).
        '''
        if ftype == 'text/plain':
            return self.text_splitter(text)
        return []

    def text_splitter(self, text):
        """Split text/plain string into a list of uppercased words.
        """
        # case insensitive
        text = text.upper()

        # Split the raw text, losing anything longer than 25 characters
        # since that'll be gibberish (encoded text or somesuch) or shorter
        # than 3 characters since those short words appear all over the
        # place
        return re.findall(r'\b\w{2,25}\b', text)

    def search(self, search_terms, klass, ignore={},
               dre=re.compile(r'([^\d]+)(\d+)')):
        ''' Display search results looking for [search, terms] associated
            with the hyperdb Class "klass". Ignore hits on {class: property}.

            "dre" is a helper, not an argument.
        '''
        # do the index lookup
        hits = self.find(search_terms)
        if not hits:
            return {}

        # map linked classnames to the property on klass that links to
        # them, eg. msg -> messages
        designator_propname = {}
        for nm, propclass in klass.getprops().items():
            if isinstance(propclass, (Link, Multilink)):
                designator_propname[propclass.classname] = nm

        # build a dictionary of nodes and their associated messages
        # and files
        nodeids = {}            # this is the answer
        propspec = {}           # used to do the klass.find
        for propname in designator_propname.values():
            propspec[propname] = {}   # used as a set (value doesn't matter)
        for classname, nodeid, property in hits.values():
            # skip this result if we don't care about this class/property
            if (classname, property) in ignore:
                continue

            # if it's a property on klass, it's easy
            if classname == klass.classname:
                if nodeid not in nodeids:
                    nodeids[nodeid] = {}
                continue

            # it's a linked class - set up to do the klass.find
            linkprop = designator_propname[classname]   # eg, msg -> messages
            propspec[linkprop][nodeid] = 1

        # retain only the meaningful entries (iterate over a copy since
        # we delete while scanning)
        for propname, idset in list(propspec.items()):
            if not idset:
                del propspec[propname]

        # klass.find tells me the klass nodeids the linked nodes relate to
        for resid in klass.find(**propspec):
            resid = str(resid)
            # bugfix: the original tested the builtin "id" here, so the
            # guard was always true and existing entries were reset
            if resid not in nodeids:
                nodeids[resid] = {}
            node_dict = nodeids[resid]
            # now figure out where it came from
            for linkprop in propspec.keys():
                for nodeid in klass.get(resid, linkprop):
                    if nodeid in propspec[linkprop]:
                        # OK, this node[propname] has a winner
                        node_dict.setdefault(linkprop, []).append(nodeid)
        return nodeids

    # we override this to ignore not 2 < word < 25 and also to fix a bug -
    # the (fail) case.
    def find(self, wordlist):
        ''' Locate files that match ALL the words in wordlist.

            Returns {fileid: identifier} for the matching files, or {}
            when any indexable word has no matches.
        '''
        if not hasattr(self, 'words'):
            self.load_index()
        # NOTE(review): when the index was already loaded this second
        # call returns immediately; kept for compatibility
        self.load_index(wordlist=wordlist)
        entries = {}
        hits = None
        for word in wordlist:
            if not 2 < len(word) < 25:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            entry = self.words.get(word)    # For each word, get index
            entries[word] = entry           #   of matching files
            if not entry:                   # Nothing for this one word (fail)
                return {}
            if hits is None:
                # first word - seed the hit set
                hits = {}
                for k in entry.keys():
                    hits[k] = self.fileids[k]
            else:
                # Eliminate hits for every non-match (iterate a copy
                # since we delete while scanning)
                for fileid in list(hits.keys()):
                    if fileid not in entry:
                        del hits[fileid]
        if hits is None:
            return {}
        return hits

    # every possible segment-file suffix; '-' holds the FILES/FILEIDS
    # tables, the rest bucket words by initial character
    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"

    def load_index(self, reload=0, wordlist=None):
        '''Load the index structures from disk into self.words,
        self.files and self.fileids.

        When wordlist is given, only the segments needed to answer a
        query for those words are read.
        '''
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP': (0, None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-', '#']
            for word in wordlist:
                segments.append(word[0].upper())

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError as error:
                # a missing segment file just means nothing was indexed
                # under that initial character
                if error.errno != errno.ENOENT:
                    raise
            else:
                try:
                    pickle_str = zlib.decompress(f.read())
                finally:
                    f.close()
                dbslice = marshal.loads(pickle_str)
                if dbslice.get('WORDS'):
                    # if it has some words, add them
                    for word, entry in dbslice['WORDS'].items():
                        db['WORDS'][word] = entry
                if dbslice.get('FILES'):
                    # if it has some files, add them
                    db['FILES'] = dbslice['FILES']
                if dbslice.get('FILEIDS'):
                    # if it has fileids, add them
                    db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        '''Write the index back to disk as compressed, marshalled
        per-initial-character segment files.
        '''
        # only save if the index is loaded and changed
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError:
                # probably just nonexistent segment index file
                # TODO: make sure it's an ENOENT
                pass

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS': None, 'FILES': self.files, 'FILEIDS': self.fileids}
        f = open(self.indexdb + '-', 'wb')
        try:
            f.write(zlib.compress(marshal.dumps(dbfil)))
        finally:
            f.close()

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[0].upper()
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS': segdicts[initchar], 'FILES': None, 'FILEIDS': None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            pickle_fh = open(filename, 'wb')
            try:
                pickle_fh.write(zlib.compress(pickle_str))
            finally:
                pickle_fh.close()
            os.chmod(filename, 0o664)

        # save done
        self.changed = 0

    def purge_entry(self, identifier):
        ''' Remove a file from file index and word index
        '''
        if identifier not in self.files:
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index
        for occurs in self.words.values():
            if file_index in occurs:
                del occurs[file_index]

        # save needed
        self.changed = 1

    def index_loaded(self):
        '''True when all three in-memory index structures are present.'''
        return (hasattr(self, 'fileids') and hasattr(self, 'files') and
                hasattr(self, 'words'))
330 #
331 #$Log: not supported by cvs2svn $
332 #Revision 1.7 2002/07/09 21:38:43 richard
333 #Only save the index if the thing is loaded and changed. Also, don't load
334 #the index just for a save.
335 #
336 #Revision 1.6 2002/07/09 04:26:44 richard
337 #We're indexing numbers now, and _underscore words
338 #
339 #Revision 1.5 2002/07/09 04:19:09 richard
340 #Added reindex command to roundup-admin.
341 #Fixed reindex on first access.
342 #Also fixed reindexing of entries that change.
343 #
344 #Revision 1.4 2002/07/09 03:02:52 richard
345 #More indexer work:
346 #- all String properties may now be indexed too. Currently there's a bit of
347 # "issue" specific code in the actual searching which needs to be
348 # addressed. In a nutshell:
349 # + pass 'indexme="yes"' as a String() property initialisation arg, eg:
350 # file = FileClass(db, "file", name=String(), type=String(),
351 # comment=String(indexme="yes"))
352 # + the comment will then be indexed and be searchable, with the results
353 # related back to the issue that the file is linked to
354 #- as a result of this work, the FileClass has a default MIME type that may
355 # be overridden in a subclass, or by the use of a "type" property as is
356 # done in the default templates.
357 #- the regeneration of the indexes (if necessary) is done once the schema is
358 # set up in the dbinit.
359 #
360 #Revision 1.3 2002/07/08 06:58:15 richard
361 #cleaned up the indexer code:
362 # - it splits more words out (much simpler, faster splitter)
363 # - removed code we'll never use (roundup.roundup_indexer has the full
364 # implementation, and replaces roundup.indexer)
365 # - only index text/plain and rfc822/message (ideas for other text formats to
366 # index are welcome)
367 # - added simple unit test for indexer. Needs more tests for regression.
368 #
369 #Revision 1.2 2002/05/25 07:16:24 rochecompaan
370 #Merged search_indexing-branch with HEAD
371 #
372 #Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan
373 #Fixed small bug that prevented indexes from being generated.
374 #
375 #Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan
376 #cgi_client.py
377 # removed search link for the time being
378 # moved rendering of matches to htmltemplate
379 #hyperdb.py
380 # filtering of nodes on full text search incorporated in filter method
381 #roundupdb.py
382 # added parameter to call of filter method
383 #roundup_indexer.py
384 # added search method to RoundupIndexer class
385 #
386 #Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan
387 # . Added feature #526730 - search for messages capability
388 #