5b166b52e88e616e7e61a8e8907b054c85f77a54
1 #
2 # This module is derived from the module described at:
3 # http://gnosis.cx/publish/programming/charming_python_15.txt
4 #
5 # Author: David Mertz (mertz@gnosis.cx)
6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
7 # Gregory Popovitch (greg@gpy.com)
8 #
9 # The original module was released under this license, and remains under
10 # it:
11 #
12 # This file is released to the public domain. I (dqm) would
13 # appreciate it if you choose to keep derived works under terms
14 # that promote freedom, but obviously am giving up any rights
15 # to compel such.
16 #
17 #$Id: indexer_dbm.py,v 1.9 2006-04-27 05:48:26 richard Exp $
'''This module provides an indexer class, Indexer, that stores text
indices in a roundup instance. This class makes searching the content of
messages, string properties and text files possible.
'''
22 __docformat__ = 'restructuredtext'
24 import os, shutil, re, mimetypes, marshal, zlib, errno
25 from roundup.hyperdb import Link, Multilink
26 from roundup.backends.indexer_common import Indexer as IndexerBase
class Indexer(IndexerBase):
    '''Indexes information from roundup's hyperdb to allow efficient
    searching.

    Three structures are created by the indexer::

        files   {identifier: (fileid, wordcount)}
        words   {word: {fileid: count}}
        fileids {fileid: identifier}

    where identifier is (classname, nodeid, propertyname)

    ``files`` also carries the bookkeeping key ``'_TOP'`` whose value is
    ``(-last_fileid, None)``; see add_text().

    On disk the index is a set of zlib-compressed marshal dumps named
    ``index.db<segment>`` inside ``<db.config.DATABASE>/indexes``.  The
    ``-`` segment holds ``files`` and ``fileids``; the other segments
    written by save_index() hold the words whose first character is the
    segment name.

    The helpers ``is_stopword``, ``minlength`` and ``maxlength`` used
    below are not defined in this class (presumably they come from
    IndexerBase -- confirm there).
    '''
    def __init__(self, db):
        '''Set up the index paths and schedule a reindex if the on-disk
        index is missing or was written by a different index version.

        ``db`` must provide ``db.config.DATABASE``, the directory the
        ``indexes`` sub-directory is created in.
        '''
        IndexerBase.__init__(self, db)
        self.indexdb_path = os.path.join(db.config.DATABASE, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        self.quiet = 9
        self.changed = 0

        # see if we need to reindex because of a change in code
        version = os.path.join(self.indexdb_path, 'version')
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(version)):
            # for now the file itself is a flag
            self.force_reindex()
        else:
            # read through a context manager so the handle is closed
            with open(version) as version_file:
                stored_version = version_file.read()
            # check the value and reindex if it's not the latest
            if stored_version.strip() != '1':
                self.force_reindex()

    def force_reindex(self):
        '''Force a reindex condition

        Removes the whole index directory, recreates it empty (mode
        0775) with a fresh ``version`` file and flags the indexer as
        needing a reindex and a save.
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0o775)
        version = os.path.join(self.indexdb_path, 'version')
        with open(version, 'w') as version_file:
            version_file.write('1\n')
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Should we reindex?

        Returns the flag set by force_reindex() (0 or 1).
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Add some text associated with the (classname, nodeid, property)
        identifier.

        Any text previously indexed for ``identifier`` is purged first.
        Only the in-memory index is updated; save_index() writes it out.
        '''
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if identifier in self.files:
            self.purge_entry(identifier)

        # split into words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0] - 1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # count the occurrences of each unique, non-stop word
        filedict = {}
        for word in words:
            if self.is_stopword(word):
                continue
            filedict[word] = filedict.get(word, 0) + 1

        # now add to the totals
        for word, count in filedict.items():
            # each word has a dict of {fileid: count}
            entry = self.words.get(word)
            if entry is None:
                # new word
                entry = {}
                self.words[word] = entry

            # make a reference to the file for this word
            entry[file_index] = count

        # save needed
        self.changed = 1

    def splitter(self, text, ftype):
        '''Split the contents of a text string into a list of 'words'

        Only the ``text/plain`` type is split; any other type yields an
        empty list.
        '''
        if ftype != 'text/plain':
            return []
        return self.text_splitter(text)

    def text_splitter(self, text):
        '''Split text/plain string into a list of words

        The text is upper-cased (the index is case insensitive) and only
        words between ``self.minlength`` and ``self.maxlength``
        characters long are returned.
        '''
        # case insensitive
        text = str(text).upper()

        # Split the raw text
        return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
            text)

    # we override this to ignore too short and too long words
    # and also to fix a bug - the (fail) case.
    def find(self, wordlist):
        '''Locate files that match ALL the words in wordlist

        Words outside the indexed length bounds and stop words are
        ignored.  Returns a list of identifiers on success.  An empty
        dict is returned when some word has no match or when no word was
        usable; that historical return value is kept for callers.

        Raises ValueError if the word index references a fileid that is
        missing from ``fileids``.
        '''
        # The complete index is always used: load_index() ignores its
        # wordlist argument once an index is in memory, so a second call
        # restricted to wordlist would be a no-op.
        self.load_index()

        hits = None
        for word in wordlist:
            if not self.minlength <= len(word) <= self.maxlength:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            if self.is_stopword(word):
                continue
            entry = self.words.get(word)    # For each word, get index
                                            #   of matching files
            if not entry:                   # Nothing for this one word (fail)
                return {}
            if hits is None:
                hits = {}
                for fileid in entry:
                    if fileid not in self.fileids:
                        raise ValueError('Index is corrupted: re-generate it')
                    hits[fileid] = self.fileids[fileid]
            else:
                # Eliminate hits for every non-match; iterate over a
                # snapshot because hits is modified inside the loop
                for fileid in list(hits):
                    if fileid not in entry:
                        del hits[fileid]
        if hits is None:
            return {}
        return list(hits.values())

    # suffix of every segment file load_index()/save_index() know about
    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
    def load_index(self, reload=0, wordlist=None):
        '''Load the index from disk into ``words``, ``files``, ``fileids``.

        reload   -- when true, load even if an index is already in memory
        wordlist -- when given (and non-empty), only the ``-`` and ``#``
                    segments plus the segments for the first character
                    of each word are read, instead of every segment

        Returns 0 without doing anything if the index is already loaded
        and ``reload`` is false; otherwise returns None.  Missing
        segment files are skipped; any other I/O error is re-raised.

        NOTE: the segment files are read with marshal, which is not
        safe against maliciously constructed data -- the index
        directory has to be trusted.
        '''
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP': (0, None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            known = set(self.segments)
            segments = ['-', '#']
            for word in wordlist:
                if not word:
                    # an empty word has no first character
                    continue
                initchar = word[0].upper()
                # only known segments can exist on disk; anything else
                # is stored in '#', which is always loaded
                if initchar in known and initchar not in segments:
                    segments.append(initchar)

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise
                continue
            try:
                compressed = f.read()
            finally:
                f.close()
            dbslice = marshal.loads(zlib.decompress(compressed))
            if dbslice.get('WORDS'):
                # if it has some words, add them
                db['WORDS'].update(dbslice['WORDS'])
            if dbslice.get('FILES'):
                # if it has some files, add them
                db['FILES'] = dbslice['FILES']
            if dbslice.get('FILEIDS'):
                # if it has fileids, add them
                db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        '''Write the in-memory index to disk.

        Does nothing unless the index is loaded and has changed.  All
        existing segment files are removed, then the ``-`` segment
        (files/fileids) and one word segment per character of
        ``letters`` are written and ``changed`` is reset.
        '''
        # only save if the index is loaded and changed
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS': None, 'FILES': self.files, 'FILEIDS': self.fileids}
        with open(self.indexdb + '-', 'wb') as segment_file:
            segment_file.write(zlib.compress(marshal.dumps(dbfil)))

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[0].upper()
            if initchar not in segdicts:
                # no dedicated segment for this character: keep the
                # word in '#', which load_index() always reads
                initchar = '#'
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS': segdicts[initchar], 'FILES': None, 'FILEIDS': None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            with open(filename, 'wb') as pickle_fh:
                pickle_fh.write(zlib.compress(pickle_str))
            os.chmod(filename, 0o664)

        # save done
        self.changed = 0

    def purge_entry(self, identifier):
        '''Remove a file from file index and word index

        Unknown identifiers are ignored.  Only the in-memory index is
        updated; save_index() writes it out.
        '''
        self.load_index()

        if identifier not in self.files:
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index
        for occurs in self.words.values():
            if file_index in occurs:
                del occurs[file_index]

        # save needed
        self.changed = 1

    def index_loaded(self):
        '''Return true once all three index structures are in memory.'''
        return (hasattr(self, 'fileids') and hasattr(self, 'files') and
            hasattr(self, 'words'))

    def rollback(self):
        ''' load last saved index info. '''
        self.load_index(reload=1)

    def close(self):
        '''Nothing to release; the index files are only open while they
        are being read or written.
        '''
        pass
292 # vim: set filetype=python ts=4 sw=4 et si