#
# This module is derived from the module described at:
#   http://gnosis.cx/publish/programming/charming_python_15.txt
#
# Author: David Mertz (mertz@gnosis.cx)
# Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
#            Gregory Popovitch (greg@gpy.com)
#
# The original module was released under this license, and remains under
# it:
#
#     This file is released to the public domain. I (dqm) would
#     appreciate it if you choose to keep derived works under terms
#     that promote freedom, but obviously am giving up any rights
#     to compel such.
#
#$Id: indexer.py,v 1.18 2004-02-11 23:55:08 richard Exp $
'''This module provides an indexer class, RoundupIndexer, that stores text
indices in a roundup instance. This class makes searching the content of
messages, string properties and text files possible.
'''
__docformat__ = 'restructuredtext'

import os, shutil, re, mimetypes, marshal, zlib, errno

from hyperdb import Link, Multilink

class Indexer:
    '''Indexes information from roundup's hyperdb to allow efficient
    searching.

    Three structures are created by the indexer::

        files   {identifier: (fileid, wordcount)}
        words   {word: {fileid: count}}
        fileids {fileid: identifier}

    where identifier is (classname, nodeid, propertyname)
    '''
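    # For illustration only (hypothetical values, not taken from a real
    # tracker): after add_text(('msg', '1', 'content'), 'the quick brown fox')
    # the three structures would hold roughly:
    #
    #   files   = {'_TOP': (-1, None), ('msg', '1', 'content'): (1, 4)}
    #   fileids = {1: ('msg', '1', 'content')}
    #   words   = {'THE': {1: 1}, 'QUICK': {1: 1}, 'BROWN': {1: 1},
    #              'FOX': {1: 1}}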
    def __init__(self, db_path):
        self.indexdb_path = os.path.join(db_path, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        self.quiet = 9
        self.changed = 0

        # see if we need to reindex because of a change in code
        version = os.path.join(self.indexdb_path, 'version')
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(version)):
            # for now the file itself is a flag
            self.force_reindex()
        elif os.path.exists(version):
            version = open(version).read()
            # check the value and reindex if it's not the latest
            if version.strip() != '1':
                self.force_reindex()

    def force_reindex(self):
        '''Force a reindex condition
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0775)
        open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Should we reindex?
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Add some text associated with the (classname, nodeid, property)
        identifier.
        '''
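        # Hypothetical example of a call, as a hyperdb backend might make it
        # when a message body changes:
        #   indexer.add_text(('msg', '1', 'content'), 'the quick brown fox')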
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if self.files.has_key(identifier):
            self.purge_entry(identifier)

        # split into words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # find the unique words
        filedict = {}
        for word in words:
            if filedict.has_key(word):
                filedict[word] = filedict[word]+1
            else:
                filedict[word] = 1

        # now add to the totals
        for word in filedict.keys():
            # each word has a dict of {identifier: count}
            if self.words.has_key(word):
                entry = self.words[word]
            else:
                # new word
                entry = {}
                self.words[word] = entry

            # make a reference to the file for this word
            entry[file_index] = filedict[word]

        # save needed
        self.changed = 1

    def splitter(self, text, ftype):
        '''Split the contents of a text string into a list of 'words'
        '''
        if ftype == 'text/plain':
            words = self.text_splitter(text)
        else:
            return []
        return words

    def text_splitter(self, text):
        """Split text/plain string into a list of words
        """
        # case insensitive
        text = text.upper()

        # Split the raw text, losing anything longer than 25 characters
        # since that'll be gibberish (encoded text or somesuch) or shorter
        # than 3 characters since those short words appear all over the
        # place
        return re.findall(r'\b\w{2,25}\b', text)
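    # A quick illustration of the splitter on hypothetical input:
    #   text_splitter('Hello, world!') -> ['HELLO', 'WORLD']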
    def search(self, search_terms, klass, ignore={},
            dre=re.compile(r'([^\d]+)(\d+)')):
        '''Display search results looking for [search, terms] associated
        with the hyperdb Class "klass". Ignore hits on {class: property}.

        "dre" is a helper, not an argument.
        '''
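        # The return value maps nodeids of "klass" to the linked nodes that
        # produced each hit; a hypothetical result might look like
        #   {'12': {}, '15': {'messages': ['23']}}
        # i.e. node 12 matched on one of its own properties, while node 15
        # matched because its linked msg 23 contained the search terms.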
        # do the index lookup
        hits = self.find(search_terms)
        if not hits:
            return {}

        designator_propname = {}
        for nm, propclass in klass.getprops().items():
            if isinstance(propclass, Link) or isinstance(propclass, Multilink):
                designator_propname[propclass.classname] = nm

        # build a dictionary of nodes and their associated messages
        # and files
        nodeids = {}    # this is the answer
        propspec = {}   # used to do the klass.find
        for propname in designator_propname.values():
            propspec[propname] = {}   # used as a set (value doesn't matter)
        for classname, nodeid, property in hits.values():
            # skip this result if we don't care about this class/property
            if ignore.has_key((classname, property)):
                continue

            # if it's a property on klass, it's easy
            if classname == klass.classname:
                if not nodeids.has_key(nodeid):
                    nodeids[nodeid] = {}
                continue

            # make sure the class is a linked one, otherwise ignore
            if not designator_propname.has_key(classname):
                continue

            # it's a linked class - set up to do the klass.find
            linkprop = designator_propname[classname]   # eg, msg -> messages
            propspec[linkprop][nodeid] = 1

        # retain only the meaningful entries
        for propname, idset in propspec.items():
            if not idset:
                del propspec[propname]

        # klass.find tells me the klass nodeids the linked nodes relate to
        for resid in klass.find(**propspec):
            resid = str(resid)
            if not nodeids.has_key(resid):
                nodeids[resid] = {}
            node_dict = nodeids[resid]
            # now figure out where it came from
            for linkprop in propspec.keys():
                for nodeid in klass.get(resid, linkprop):
                    if propspec[linkprop].has_key(nodeid):
                        # OK, this node[propname] has a winner
                        if not node_dict.has_key(linkprop):
                            node_dict[linkprop] = [nodeid]
                        else:
                            node_dict[linkprop].append(nodeid)
        return nodeids

    # We override find() from the original indexer module to ignore words
    # outside the 2 < len(word) < 25 bounds, and to fix a bug in the
    # no-match (fail) case.
    def find(self, wordlist):
        '''Locate files that match ALL the words in wordlist
        '''
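        # The result maps file ids to identifiers, e.g. a hypothetical
        # single hit: {1: ('msg', '1', 'content')}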
        if not hasattr(self, 'words'):
            self.load_index()
        self.load_index(wordlist=wordlist)
        entries = {}
        hits = None
        for word in wordlist:
            if not 2 < len(word) < 25:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            entry = self.words.get(word)    # For each word, get index
            entries[word] = entry           #   of matching files
            if not entry:                   # Nothing for this one word (fail)
                return {}
            if hits is None:
                hits = {}
                for k in entry.keys():
                    if not self.fileids.has_key(k):
                        raise ValueError, 'Index is corrupted: re-generate it'
                    hits[k] = self.fileids[k]
            else:
                # Eliminate hits for every non-match
                for fileid in hits.keys():
                    if not entry.has_key(fileid):
                        del hits[fileid]
        if hits is None:
            return {}
        return hits

    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
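    # Each segment character names one file on disk, indexdb + segment
    # (e.g. with a hypothetical tracker home, 'db/indexes/index.dbA' for
    # words starting with 'A'); the '-' segment holds the FILES/FILEIDS
    # dictionaries written by save_index().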
    def load_index(self, reload=0, wordlist=None):
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-','#']
            for word in wordlist:
                segments.append(word[0].upper())

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError, error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT: raise
            else:
                pickle_str = zlib.decompress(f.read())
                f.close()
                dbslice = marshal.loads(pickle_str)
                if dbslice.get('WORDS'):
                    # if it has some words, add them
                    for word, entry in dbslice['WORDS'].items():
                        db['WORDS'][word] = entry
                if dbslice.get('FILES'):
                    # if it has some files, add them
                    db['FILES'] = dbslice['FILES']
                if dbslice.get('FILEIDS'):
                    # if it has fileids, add them
                    db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        # only save if the index is loaded and changed
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError, error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT: raise

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
        open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[0].upper()
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            pickle_fh = open(filename, 'wb')
            pickle_fh.write(zlib.compress(pickle_str))
            os.chmod(filename, 0664)

        # save done
        self.changed = 0

    def purge_entry(self, identifier):
        '''Remove a file from file index and word index
        '''
        self.load_index()

        if not self.files.has_key(identifier):
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index
        for key, occurs in self.words.items():
            if occurs.has_key(file_index):
                del occurs[file_index]

        # save needed
        self.changed = 1

    def index_loaded(self):
        return (hasattr(self,'fileids') and hasattr(self,'files') and
            hasattr(self,'words'))
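
# A minimal usage sketch, not part of the original module; run it from the
# roundup source directory so the hyperdb import above resolves. The temp
# directory used as the tracker db path is purely illustrative.
if __name__ == '__main__':
    import tempfile
    indexer = Indexer(tempfile.mkdtemp())
    indexer.add_text(('msg', '1', 'content'), 'the quick brown fox')
    indexer.save_index()
    # should print {1: ('msg', '1', 'content')}
    print indexer.find(['quick', 'fox'])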
# vim: set filetype=python ts=4 sw=4 et si