1 #
2 # This module is derived from the module described at:
3 # http://gnosis.cx/publish/programming/charming_python_15.txt
4 #
5 # Author: David Mertz (mertz@gnosis.cx)
6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
7 # Gregory Popovitch (greg@gpy.com)
8 #
9 # The original module was released under this license, and remains under
10 # it:
11 #
12 # This file is released to the public domain. I (dqm) would
13 # appreciate it if you choose to keep derived works under terms
14 # that promote freedom, but obviously am giving up any rights
15 # to compel such.
16 #
17 #$Id: indexer.py,v 1.17 2004-01-20 03:58:38 richard Exp $
18 '''
19 This module provides an indexer class, RoundupIndexer, that stores text
20 indices in a roundup instance. This class makes searching the content of
21 messages, string properties and text files possible.
22 '''
23 import os, shutil, re, mimetypes, marshal, zlib, errno
24 from hyperdb import Link, Multilink
class Indexer:
    '''Indexes information from roundup's hyperdb to allow efficient
    searching.

    Three structures are created by the indexer:
        files   {identifier: (fileid, wordcount)}
        words   {word: {fileid: count}}
        fileids {fileid: identifier}
    where identifier is (classname, nodeid, propertyname)
    '''
36 def __init__(self, db_path):
37 self.indexdb_path = os.path.join(db_path, 'indexes')
38 self.indexdb = os.path.join(self.indexdb_path, 'index.db')
39 self.reindex = 0
40 self.quiet = 9
41 self.changed = 0
43 # see if we need to reindex because of a change in code
44 version = os.path.join(self.indexdb_path, 'version')
45 if (not os.path.exists(self.indexdb_path) or
46 not os.path.exists(version)):
47 # for now the file itself is a flag
48 self.force_reindex()
49 elif os.path.exists(version):
50 version = open(version).read()
51 # check the value and reindex if it's not the latest
52 if version.strip() != '1':
53 self.force_reindex()
    def force_reindex(self):
        '''Force a reindex condition.

        Wipes any existing index directory, recreates it with a current
        "version" marker, and flags that a full reindex is required.
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0775)
        # the version marker doubles as the "an index exists" flag
        open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
        self.reindex = 1
        self.changed = 1
66 def should_reindex(self):
67 '''Should we reindex?
68 '''
69 return self.reindex
71 def add_text(self, identifier, text, mime_type='text/plain'):
72 ''' Add some text associated with the (classname, nodeid, property)
73 identifier.
74 '''
75 # make sure the index is loaded
76 self.load_index()
78 # remove old entries for this identifier
79 if self.files.has_key(identifier):
80 self.purge_entry(identifier)
82 # split into words
83 words = self.splitter(text, mime_type)
85 # Find new file index, and assign it to identifier
86 # (_TOP uses trick of negative to avoid conflict with file index)
87 self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
88 file_index = abs(self.files['_TOP'][0])
89 self.files[identifier] = (file_index, len(words))
90 self.fileids[file_index] = identifier
92 # find the unique words
93 filedict = {}
94 for word in words:
95 if filedict.has_key(word):
96 filedict[word] = filedict[word]+1
97 else:
98 filedict[word] = 1
100 # now add to the totals
101 for word in filedict.keys():
102 # each word has a dict of {identifier: count}
103 if self.words.has_key(word):
104 entry = self.words[word]
105 else:
106 # new word
107 entry = {}
108 self.words[word] = entry
110 # make a reference to the file for this word
111 entry[file_index] = filedict[word]
113 # save needed
114 self.changed = 1
116 def splitter(self, text, ftype):
117 ''' Split the contents of a text string into a list of 'words'
118 '''
119 if ftype == 'text/plain':
120 words = self.text_splitter(text)
121 else:
122 return []
123 return words
125 def text_splitter(self, text):
126 """Split text/plain string into a list of words
127 """
128 # case insensitive
129 text = text.upper()
131 # Split the raw text, losing anything longer than 25 characters
132 # since that'll be gibberish (encoded text or somesuch) or shorter
133 # than 3 characters since those short words appear all over the
134 # place
135 return re.findall(r'\b\w{2,25}\b', text)
137 def search(self, search_terms, klass, ignore={},
138 dre=re.compile(r'([^\d]+)(\d+)')):
139 ''' Display search results looking for [search, terms] associated
140 with the hyperdb Class "klass". Ignore hits on {class: property}.
142 "dre" is a helper, not an argument.
143 '''
144 # do the index lookup
145 hits = self.find(search_terms)
146 if not hits:
147 return {}
149 designator_propname = {}
150 for nm, propclass in klass.getprops().items():
151 if isinstance(propclass, Link) or isinstance(propclass, Multilink):
152 designator_propname[propclass.classname] = nm
154 # build a dictionary of nodes and their associated messages
155 # and files
156 nodeids = {} # this is the answer
157 propspec = {} # used to do the klass.find
158 for propname in designator_propname.values():
159 propspec[propname] = {} # used as a set (value doesn't matter)
160 for classname, nodeid, property in hits.values():
161 # skip this result if we don't care about this class/property
162 if ignore.has_key((classname, property)):
163 continue
165 # if it's a property on klass, it's easy
166 if classname == klass.classname:
167 if not nodeids.has_key(nodeid):
168 nodeids[nodeid] = {}
169 continue
171 # make sure the class is a linked one, otherwise ignore
172 if not designator_propname.has_key(classname):
173 continue
175 # it's a linked class - set up to do the klass.find
176 linkprop = designator_propname[classname] # eg, msg -> messages
177 propspec[linkprop][nodeid] = 1
179 # retain only the meaningful entries
180 for propname, idset in propspec.items():
181 if not idset:
182 del propspec[propname]
184 # klass.find tells me the klass nodeids the linked nodes relate to
185 for resid in klass.find(**propspec):
186 resid = str(resid)
187 if not nodeids.has_key(id):
188 nodeids[resid] = {}
189 node_dict = nodeids[resid]
190 # now figure out where it came from
191 for linkprop in propspec.keys():
192 for nodeid in klass.get(resid, linkprop):
193 if propspec[linkprop].has_key(nodeid):
194 # OK, this node[propname] has a winner
195 if not node_dict.has_key(linkprop):
196 node_dict[linkprop] = [nodeid]
197 else:
198 node_dict[linkprop].append(nodeid)
199 return nodeids
201 # we override this to ignore not 2 < word < 25 and also to fix a bug -
202 # the (fail) case.
203 def find(self, wordlist):
204 ''' Locate files that match ALL the words in wordlist
205 '''
206 if not hasattr(self, 'words'):
207 self.load_index()
208 self.load_index(wordlist=wordlist)
209 entries = {}
210 hits = None
211 for word in wordlist:
212 if not 2 < len(word) < 25:
213 # word outside the bounds of what we index - ignore
214 continue
215 word = word.upper()
216 entry = self.words.get(word) # For each word, get index
217 entries[word] = entry # of matching files
218 if not entry: # Nothing for this one word (fail)
219 return {}
220 if hits is None:
221 hits = {}
222 for k in entry.keys():
223 if not self.fileids.has_key(k):
224 raise ValueError, 'Index is corrupted: re-generate it'
225 hits[k] = self.fileids[k]
226 else:
227 # Eliminate hits for every non-match
228 for fileid in hits.keys():
229 if not entry.has_key(fileid):
230 del hits[fileid]
231 if hits is None:
232 return {}
233 return hits
    # Names of the on-disk index segment files: each word lives in the
    # file (indexdb + initial character), '-' holds the files/fileids
    # maps.  NOTE(review): '#' and '!' are opened/removed by
    # load_index/save_index but save_index never files words under
    # them - presumably historical segment names; confirm before
    # pruning.
    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
    def load_index(self, reload=0, wordlist=None):
        '''Load the on-disk index segments into self.words / self.files
        / self.fileids.

        If "wordlist" is given, only the segments for those words'
        initial characters (plus the '-' files/fileids segment and '#')
        are loaded; otherwise every segment is.  Returns 0 when the
        index is already loaded and no reload was requested.  Missing
        segment files are silently skipped (ENOENT only).
        '''
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            # '-' holds files/fileids; '#' is always included -
            # NOTE(review): no word segment is ever saved under '#',
            # so this looks historical; confirm before removing
            segments = ['-','#']
            for word in wordlist:
                segments.append(word[0].upper())

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError, error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT: raise
            else:
                # each segment is a zlib-compressed marshal dump holding
                # some subset of the WORDS / FILES / FILEIDS mappings
                pickle_str = zlib.decompress(f.read())
                f.close()
                dbslice = marshal.loads(pickle_str)
                if dbslice.get('WORDS'):
                    # if it has some words, add them
                    for word, entry in dbslice['WORDS'].items():
                        db['WORDS'][word] = entry
                if dbslice.get('FILES'):
                    # if it has some files, add them
                    db['FILES'] = dbslice['FILES']
                if dbslice.get('FILEIDS'):
                    # if it has fileids, add them
                    db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        # freshly loaded: in-memory copy matches disk
        self.changed = 0
    def save_index(self):
        '''Write the in-memory index back out as per-initial-character
        segment files.

        No-op unless the index is both loaded and changed.  Each segment
        is a zlib-compressed marshal dump; the '-' segment carries the
        files/fileids maps, the others carry words by first character.
        '''
        # only save if the index is loaded and changed
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError, error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT: raise

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
        open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))

        # The hard part is splitting the word dictionary up, of course
        # (every \w word starts with one of these characters)
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[0].upper()
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            pickle_fh = open(filename, 'wb')
            pickle_fh.write(zlib.compress(pickle_str))
            os.chmod(filename, 0664)

        # save done
        self.changed = 0
317 def purge_entry(self, identifier):
318 ''' Remove a file from file index and word index
319 '''
320 self.load_index()
322 if not self.files.has_key(identifier):
323 return
325 file_index = self.files[identifier][0]
326 del self.files[identifier]
327 del self.fileids[file_index]
329 # The much harder part, cleanup the word index
330 for key, occurs in self.words.items():
331 if occurs.has_key(file_index):
332 del occurs[file_index]
334 # save needed
335 self.changed = 1
337 def index_loaded(self):
338 return (hasattr(self,'fileids') and hasattr(self,'files') and
339 hasattr(self,'words'))
341 # vim: set filetype=python ts=4 sw=4 et si