1 #
2 # This module is derived from the module described at:
3 # http://gnosis.cx/publish/programming/charming_python_15.txt
4 #
5 # Author: David Mertz (mertz@gnosis.cx)
6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
7 # Gregory Popovitch (greg@gpy.com)
8 #
9 # The original module was released under this license, and remains under
10 # it:
11 #
12 # This file is released to the public domain. I (dqm) would
13 # appreciate it if you choose to keep derived works under terms
14 # that promote freedom, but obviously am giving up any rights
15 # to compel such.
16 #
17 #$Id: indexer.py,v 1.3 2002-07-08 06:58:15 richard Exp $
18 '''
This module provides an indexer class, Indexer, that stores text
indices in a roundup instance. This class makes it possible to search the
content of messages and text files.
22 '''
23 import os, shutil, re, mimetypes, marshal, zlib, errno
class Indexer:
    r''' Indexes messages and files.

    Text is split into words with a ``\w``-based regular expression (see
    text_splitter) and free-standing text can be indexed through the
    add_othertext method.

    While loaded, the index is held in three dictionaries:

      self.files   - {identifier: (file_index, word_count)} plus the
                     bookkeeping entry '_TOP', whose negated first element
                     is the highest file index handed out so far
      self.fileids - {file_index: identifier}
      self.words   - {word: {file_index: occurrence_count}}

    On disk the index is a set of zlib-compressed marshal dumps named
    self.indexdb + <segment>: the '-' segment holds files/fileids and
    there is one word segment per initial letter, with '#' collecting every
    word that does not start with A-Z.
    '''

    # suffixes of all the segment files that may exist on disk
    segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"

    # Words outside these bounds are neither indexed nor searched for:
    # longer ones will be gibberish (encoded text or somesuch) and shorter
    # ones appear all over the place.
    min_word_length = 3
    max_word_length = 25

    def __init__(self, db_path):
        ''' Open the index stored under db_path, creating it if needed.

        The index lives in the 'indexes' sub-directory of db_path. When it
        has to be created, everything below db_path/'files' is indexed and
        the result is saved straight away.
        '''
        indexdb_path = os.path.join(db_path, 'indexes')
        version_path = os.path.join(indexdb_path, 'version')

        # see if we need to reindex because of a change in code: an index
        # directory without a 'version' marker is thrown away
        if os.path.exists(indexdb_path) and not os.path.exists(version_path):
            shutil.rmtree(indexdb_path)

        # see if the index exists
        index_exists = os.path.exists(indexdb_path)
        if not index_exists:
            os.makedirs(indexdb_path)
            os.chmod(indexdb_path, 0o775)
            with open(version_path, 'w') as version_file:
                version_file.write('1\n')

        # save off the path to the indexdb
        self.indexdb = os.path.join(indexdb_path, 'index.db')
        self.reindex = 0
        self.casesensitive = 0
        self.quiet = 9

        if not index_exists:
            # index everything
            self.add_files(dir=os.path.join(db_path, 'files'))
            self.save_index()

    def add_files(self, dir):
        ''' Index every regular file found below the directory "dir".
        '''
        self.load_index()
        # os.walk descends into the sub-directories itself; followlinks
        # keeps symlinked directories included
        for dname, subdirs, names in os.walk(dir, followlinks=True):
            self.walk_add_file(None, dname, names)

        # Rebuild the fileid index
        self.fileids = {}
        for fname, info in self.files.items():
            self.fileids[info[0]] = fname

    def walk_add_file(self, arg, dname, names, ftype=None):
        ''' Directory-walk callback: index the regular files in "names".

        "names" are entries of the directory "dname". "arg" is unused and
        only present so the method fits the walker callback signature.
        "ftype" is handed on to add_file. Sub-directories are not entered
        here - descending into them is the job of whoever drives the walk.
        '''
        for name in names:
            path = os.path.join(dname, name)
            if os.path.isfile(path):
                self.add_file(path, ftype)

    def add_file(self, fname, ftype=None):
        ''' Index the contents of a regular file

        "ftype" is the MIME type of the file; when not given it is guessed
        from the file name. Returns 0 if the file was already indexed and
        reindexing is disabled, None otherwise.
        '''
        self.load_index()

        # Is file eligible for (re)indexing?
        if not self._claim_entry(fname, "Skipping %s"):
            return 0

        # guess the file type - guess_type gives (type, encoding) and only
        # the type is of interest to the splitter
        if ftype is None:
            ftype = mimetypes.guess_type(fname)[0]

        # read in the file
        with open(fname) as source:
            text = source.read()
        if self.quiet < 5:
            print("Indexing %s" % (fname,))

        self._store_words(fname, self.splitter(text, ftype))

    # NOTE: this method signature deviates from the one specified in
    # indexer - I'm not entirely sure where it was expected to get the text
    # from otherwise...
    def add_othertext(self, identifier, text):
        ''' Add some text associated with the identifier

        Returns 0 if the identifier was already indexed and reindexing is
        disabled, None otherwise.
        '''
        # this may be the first thing called on an existing index
        self.load_index()

        # Is the identifier eligible for (re)indexing?
        if not self._claim_entry(identifier, "Not reindexing %s"):
            return 0

        self._store_words(identifier, self.splitter(text, 'text/plain'))

    def _claim_entry(self, identifier, skip_message):
        ''' Say whether identifier may be indexed now (true) or not (false).

        An identifier that is already indexed is purged first when
        self.reindex is set; otherwise it is refused and skip_message
        (a %-format taking the identifier) is printed if not quiet.
        '''
        if identifier not in self.files:
            return 1
        if self.reindex:
            # Reindexing enabled, cleanup dicts
            self.purge_entry(identifier, self.files, self.words)
            return 1
        if self.quiet < 5:
            print(skip_message % (identifier,))
        return 0

    def _store_words(self, identifier, words):
        ''' Record the list of words as the indexed content of identifier.
        '''
        # Find new file index, and assign it to the identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        top = self.files['_TOP'][0] - 1
        self.files['_TOP'] = (top, None)
        file_index = abs(top)
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # count the occurrences of each unique word
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1

        # each word has a dict of {file_index: count}
        for word, count in counts.items():
            self.words.setdefault(word, {})[file_index] = count

    def splitter(self, text, ftype):
        ''' Split the contents of a text string into a list of 'words'

        Only the MIME types 'text/plain' and 'message/rfc822' are split;
        any other "ftype" yields an empty list.
        '''
        if ftype in ('text/plain', 'message/rfc822'):
            return self.text_splitter(text, self.casesensitive)
        return []

    def text_splitter(self, text, casesensitive=0):
        ''' Split text/plain string into a list of words

        The text is upper-cased first unless "casesensitive" is true. Runs
        of word characters shorter than min_word_length or longer than
        max_word_length are dropped.
        '''
        # Let's adjust case if not case-sensitive
        if not casesensitive:
            text = text.upper()

        pattern = r'\b\w{%d,%d}\b' % (self.min_word_length,
            self.max_word_length)
        return re.findall(pattern, text)

    def search(self, search_terms, klass):
        ''' Map the index hits for search_terms onto nodes of klass.

        The last '/'-separated part of every hit is expected to look like
        'msg<nodeid>' or 'file<nodeid>'. klass.find(messages=<nodeid>) or
        klass.find(files=<nodeid>) is called and, if it returns anything,
        the hit is filed under the first id returned.

        Returns {id: {'messages': [nodeid, ...], 'files': [nodeid, ...]}}
        where only the property names that actually had hits are present.
        '''
        designator_propname = {'msg': 'messages', 'file': 'files'}
        nodeids = {}

        # build a dictionary of nodes and their associated messages
        # and files
        for identifier in self.find(search_terms).values():
            filename = identifier.split('/')[-1]
            for designator, propname in designator_propname.items():
                if not filename.startswith(designator):
                    continue
                nodeid = filename[len(designator):]
                result = klass.find(**{propname: nodeid})
                if not result:
                    continue
                node_dict = nodeids.setdefault(str(result[0]), {})
                node_dict.setdefault(propname, []).append(nodeid)

        return nodeids

    def find(self, wordlist):
        ''' Locate files that match ALL the words in wordlist

        Words whose length is outside min_word_length..max_word_length are
        ignored since they are never indexed. Returns a dictionary
        {file_index: identifier}; it is empty when any remaining word is
        unknown or when no word remains.
        '''
        # always work from the complete index, never a partial load
        self.load_index()

        hits = None
        for word in wordlist:
            if not self.min_word_length <= len(word) <= self.max_word_length:
                # word outside the bounds of what we index - ignore
                continue
            if not self.casesensitive:
                word = word.upper()
            entry = self.words.get(word)    # index of matching files
            if not entry:
                # Nothing for this one word (fail)
                return {}
            if hits is None:
                hits = {fileid: self.fileids[fileid] for fileid in entry}
            else:
                # Eliminate hits for every non-match; iterate over a copy
                # of the keys because hits shrinks as we go
                for fileid in list(hits):
                    if fileid not in entry:
                        del hits[fileid]
        return hits or {}

    def load_index(self, reload=0, wordlist=None):
        ''' Read the index from disk into self.words/files/fileids.

        Unless "reload" is true nothing happens (and 0 is returned) when
        an index is already loaded. With a "wordlist" only the segments
        that can hold those words are read, so self.words is incomplete
        afterwards. Segment files that do not exist are skipped.
        '''
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP': (0, None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-', '#']
            for word in wordlist:
                # empty words have no initial; initials without a segment
                # of their own are covered by '#'
                initial = word[:1].upper()
                if initial in self.segments and initial not in segments:
                    segments.append(initial)

        # Load the segments
        for segment in segments:
            try:
                segment_file = open(self.indexdb + segment, 'rb')
            except IOError as error:
                if error.errno != errno.ENOENT:
                    raise
                continue
            with segment_file:
                pickle_str = zlib.decompress(segment_file.read())
            # NOTE: marshal must only ever be fed files this class wrote
            dbslice = marshal.loads(pickle_str)
            if dbslice.get('WORDS'):
                # if it has some words, add them
                db['WORDS'].update(dbslice['WORDS'])
            if dbslice.get('FILES'):
                # if it has some files, add them
                db['FILES'] = dbslice['FILES']
            if dbslice.get('FILEIDS'):
                # if it has fileids, add them
                db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']

    def save_index(self):
        ''' Write the loaded index to disk, replacing all segment files.

        Does nothing when no index is loaded. Raises OSError if an old
        segment file exists but cannot be removed.
        '''
        # with nothing loaded there is nothing to write, and removing the
        # segments below would just destroy the index
        if not self.index_loaded():
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError as error:
                # a nonexistent segment file is fine, anything else isn't
                if error.errno != errno.ENOENT:
                    raise

        # First write the much simpler filename/fileid dictionaries
        self._write_segment('-',
            {'WORDS': None, 'FILES': self.files, 'FILEIDS': self.fileids})

        # The hard part is splitting the word dictionary up, of course
        letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#"
        segdicts = {}
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():
            initchar = word[0].upper()
            if initchar not in segdicts:
                # digits, underscores, non A-Z letters, ...
                initchar = '#'
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            filename = self._write_segment(initchar,
                {'WORDS': segdicts[initchar], 'FILES': None, 'FILEIDS': None})
            os.chmod(filename, 0o664)

    def _write_segment(self, segment, db):
        ''' Write db as one compressed segment file and return its path.
        '''
        filename = self.indexdb + segment
        with open(filename, 'wb') as segment_file:
            segment_file.write(zlib.compress(marshal.dumps(db)))
        return filename

    def purge_entry(self, fname, file_dct, word_dct):
        ''' Remove a file from file index and word index

        "file_dct" and "word_dct" are dictionaries shaped like self.files
        and self.words. An "fname" that is not in file_dct is ignored.
        '''
        # The easy part, cleanup the file index. The entry is an
        # (index, word count) pair and only the index keys the word dicts.
        try:
            file_index = file_dct.pop(fname)[0]
        except KeyError:
            return

        # forget the reverse mapping too, if it is the one for this name
        fileids = getattr(self, 'fileids', None)
        if fileids is not None and fileids.get(file_index) == fname:
            del fileids[file_index]

        # The much harder part, cleanup the word index (walk a copy, words
        # left without any file are dropped on the way)
        for word, occurs in list(word_dct.items()):
            if file_index in occurs:
                del occurs[file_index]
                if not occurs:
                    del word_dct[word]

    def index_loaded(self):
        ''' Say whether all three index dictionaries are present.
        '''
        return (hasattr(self, 'fileids') and hasattr(self, 'files') and
            hasattr(self, 'words'))
355 #
356 #$Log: not supported by cvs2svn $
357 #Revision 1.2 2002/05/25 07:16:24 rochecompaan
358 #Merged search_indexing-branch with HEAD
359 #
360 #Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan
361 #Fixed small bug that prevented indexes from being generated.
362 #
363 #Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan
364 #cgi_client.py
365 # removed search link for the time being
366 # moved rendering of matches to htmltemplate
367 #hyperdb.py
368 # filtering of nodes on full text search incorporated in filter method
369 #roundupdb.py
370 # added paramater to call of filter method
371 #roundup_indexer.py
372 # added search method to RoundupIndexer class
373 #
374 #Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan
375 # . Added feature #526730 - search for messages capability
376 #